In [0]:
%matplotlib inline

import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import KFold, train_test_split

warnings.filterwarnings("ignore")
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 40)
random_state = 123456789

In [0]:
URL = 'https://raw.githubusercontent.com/ccny-data/College_Major_Data/master/data/IPEDS_data%20-%20Data.csv'
df = pd.read_csv(URL)

In [3]:
df.shape

(1534, 145)

In [4]:
df.sample(5)

Unnamed: 0,ID number,Name,year,ZIP code,Highest degree offered,County name,Longitude location of institution,Latitude location of institution,Religious affiliation,Offers Less than one year certificate,Offers One but less than two years certificate,Offers Associate's degree,Offers Two but less than 4 years certificate,Offers Bachelor's degree,Offers Postbaccalaureate certificate,Offers Master's degree,Offers Post-master's certificate,Offers Doctor's degree - research/scholarship,Offers Doctor's degree - professional practice,Offers Doctor's degree - other,...,Percent of first-time undergraduates - out-of-state,Number of first-time undergraduates - foreign countries,Percent of first-time undergraduates - foreign countries,Number of first-time undergraduates - residence unknown,Percent of first-time undergraduates - residence unknown,"Graduation rate - Bachelor degree within 4 years, total","Graduation rate - Bachelor degree within 5 years, total","Graduation rate - Bachelor degree within 6 years, total",Percent of freshmen receiving any financial aid,"Percent of freshmen receiving federal, state, local or institutional grant aid",Percent of freshmen receiving federal grant aid,Percent of freshmen receiving Pell grants,Percent of freshmen receiving other federal grant aid,Percent of freshmen receiving state/local grant aid,Percent of freshmen receiving institutional grant aid,Percent of freshmen receiving student loan aid,Percent of freshmen receiving federal student loans,Percent of freshmen receiving other loan aid,Endowment assets (year end) per FTE enrollment (GASB),Endowment assets (year end) per FTE enrollment (FASB)
1401,235316,Gonzaga University,2013,99258-0001,Doctor's degree - research/scholarship and pro...,Spokane County,-117.403062,47.668144,Roman Catholic,Implied no,Implied no,Implied no,Implied no,Yes,Implied no,Yes,Implied no,Yes,Yes,Implied no,...,51.0,12.0,1.0,1.0,0.0,71.0,81.0,82.0,99.0,99.0,15.0,15.0,7.0,6.0,98.0,55.0,54.0,4.0,,22282.0
1275,225548,Howard Payne University,2013,76801-2794,Master's degree,Brown County,-98.984956,31.716345,Baptist,Yes,Implied no,Yes,Implied no,Yes,Implied no,Yes,Implied no,Implied no,Implied no,Implied no,...,1.0,0.0,0.0,0.0,0.0,30.0,42.0,42.0,99.0,99.0,45.0,45.0,22.0,60.0,99.0,74.0,73.0,8.0,,53054.0
1360,231624,College of William and Mary,2013,23187-8795,Doctor's degree - research/scholarship and pro...,Williamsburg city,-76.709395,37.27195,Not applicable,Implied no,Implied no,Implied no,Implied no,Yes,Implied no,Yes,Yes,Yes,Yes,Implied no,...,32.0,62.0,4.0,22.0,1.0,83.0,89.0,90.0,53.0,33.0,12.0,11.0,1.0,12.0,30.0,27.0,27.0,2.0,84573.0,
683,178387,Missouri Western State University,2013,64507,Master's degree,Buchanan County,-94.785643,39.7575,Not applicable,Implied no,Yes,Yes,Implied no,Yes,Yes,Yes,Implied no,Implied no,Implied no,Implied no,...,8.0,9.0,1.0,0.0,0.0,13.0,27.0,33.0,97.0,93.0,51.0,51.0,13.0,54.0,54.0,64.0,64.0,1.0,5765.0,
461,159382,Louisiana State University-Alexandria,2013,71302-9121,Bachelor's degree,Rapides Parish,-92.414409,31.178616,Not applicable,Implied no,Yes,Yes,Implied no,Yes,Yes,Implied no,Implied no,Implied no,Implied no,Implied no,...,,,,,,7.0,17.0,20.0,92.0,92.0,59.0,59.0,3.0,66.0,30.0,35.0,34.0,2.0,8162.0,


In [0]:
df = df.dropna(subset=['Admissions total', 'Applicants total'])

In [0]:
df['is_religious'] = (df['Religious affiliation'] != 'Not applicable')*1
df['offers_associates'] = (df["Offers Associate's degree"] == 'Yes')*1
df['offers_bachelors'] = (df["Offers Bachelor's degree"] == 'Yes')*1
df['offers_masters'] = (df["Offers Master's degree"] == 'Yes')*1
df['is_private'] = (df['Control of institution'] == 'Private not-for-profit')*1
df['is_public'] = (df['Control of institution'] == 'Public')*1
#df['is_urban'] = (df['Degree of urbanization (Urban-centric locale)']=='City: Large')*1
#df['is_suburb'] = ('Suburb' in str(df['Degree of urbanization (Urban-centric locale)']))*1
#df['is_town'] = ('Town' in str(df['Degree of urbanization (Urban-centric locale)']))*1
#df['is_rural'] = ('Rural' in str(df['Degree of urbanization (Urban-centric locale)']))*1

In [7]:
df['Degree of urbanization (Urban-centric locale)'].value_counts()

City: Large        269
Suburb: Large      259
City: Small        199
City: Midsize      167
Town: Distant      148
Town: Remote       116
Town: Fringe        56
Rural: Fringe       47
Suburb: Midsize     47
Suburb: Small       31
Rural: Distant      25
Rural: Remote       13
Name: Degree of urbanization (Urban-centric locale), dtype: int64

In [8]:
df.sample(3)

Unnamed: 0,ID number,Name,year,ZIP code,Highest degree offered,County name,Longitude location of institution,Latitude location of institution,Religious affiliation,Offers Less than one year certificate,Offers One but less than two years certificate,Offers Associate's degree,Offers Two but less than 4 years certificate,Offers Bachelor's degree,Offers Postbaccalaureate certificate,Offers Master's degree,Offers Post-master's certificate,Offers Doctor's degree - research/scholarship,Offers Doctor's degree - professional practice,Offers Doctor's degree - other,...,"Graduation rate - Bachelor degree within 5 years, total","Graduation rate - Bachelor degree within 6 years, total",Percent of freshmen receiving any financial aid,"Percent of freshmen receiving federal, state, local or institutional grant aid",Percent of freshmen receiving federal grant aid,Percent of freshmen receiving Pell grants,Percent of freshmen receiving other federal grant aid,Percent of freshmen receiving state/local grant aid,Percent of freshmen receiving institutional grant aid,Percent of freshmen receiving student loan aid,Percent of freshmen receiving federal student loans,Percent of freshmen receiving other loan aid,Endowment assets (year end) per FTE enrollment (GASB),Endowment assets (year end) per FTE enrollment (FASB),is_religious,offers_associates,offers_bachelors,offers_masters,is_private,is_public
451,157535,University of Pikeville,2013,41501,Doctor's degree - professional practice,Pike County,-82.520265,37.478835,Presbyterian Church (USA),Implied no,Implied no,Yes,Implied no,Yes,Implied no,Yes,Implied no,Implied no,Yes,Implied no,...,27.0,29.0,100.0,100.0,65.0,65.0,7.0,78.0,99.0,95.0,95.0,1.0,,11474.0,1,1,1,1,1,0
661,176053,Mississippi College,2013,39058,Doctor's degree - research/scholarship and pro...,Hinds County,-90.330452,32.334565,Southern Baptist,Implied no,Implied no,Implied no,Implied no,Yes,Yes,Yes,Yes,Yes,Yes,Implied no,...,50.0,53.0,99.0,99.0,32.0,31.0,9.0,42.0,97.0,57.0,57.0,2.0,,14482.0,1,0,1,1,1,0
418,155414,Kansas Wesleyan University,2013,67401-6196,Master's degree,Saline County,-97.609476,38.813923,United Methodist,Implied no,Implied no,Yes,Implied no,Yes,Implied no,Yes,Implied no,Implied no,Implied no,Implied no,...,39.0,43.0,99.0,99.0,33.0,33.0,4.0,21.0,99.0,77.0,77.0,14.0,,34444.0,1,1,1,1,1,0


In [0]:
# Lots of features here, might be worth looking into finding which are most importand and trim the least important
# Note: the Total enrollment column, it has two spaces between "Total" and "enrollment".

#features = ['is_religious', 'offers_associates', 'offers_bachelors', 'offers_masters', 'Percent of freshmen submitting SAT scores', 'Percent of freshmen submitting ACT scores', 'SAT Critical Reading 25th percentile score', 
#            'SAT Critical Reading 75th percentile score', 'SAT Math 25th percentile score', 'SAT Math 75th percentile score', 'SAT Writing 25th percentile score', 'SAT Writing 75th percentile score', 
#            'ACT Composite 25th percentile score', 'ACT Composite 75th percentile score', 'Tuition and fees, 2013-14', 'Total  enrollment', 'Percent of total enrollment that are American Indian or Alaska Native',
#           'Percent of total enrollment that are Asian', 'Percent of total enrollment that are Black or African American', 'Percent of total enrollment that are Hispanic/Latino', 
#            'Percent of total enrollment that are Native Hawaiian or Other Pacific Islander', 'Percent of total enrollment that are White', 'Percent of total enrollment that are two or more races', 'Percent of total enrollment that are women',
#           'Percent of freshmen receiving any financial aid', 'Graduation rate - Bachelor degree within 4 years, total', 'Graduation rate - Bachelor degree within 6 years, total']

features = ['offers_associates', 'offers_masters', 'Percent of freshmen submitting SAT scores', 'Percent of freshmen submitting ACT scores', 'SAT Critical Reading 25th percentile score', 
            'SAT Critical Reading 75th percentile score', 'SAT Math 25th percentile score', 'SAT Math 75th percentile score', 'SAT Writing 25th percentile score', 'SAT Writing 75th percentile score', 
            'ACT Composite 25th percentile score', 'ACT Composite 75th percentile score', 'Tuition and fees, 2013-14', 'Total  enrollment', 'Graduation rate - Bachelor degree within 4 years, total', 'Graduation rate - Bachelor degree within 6 years, total']

In [0]:
model_df = df[(features + ['Percent admitted - total'])].dropna().reset_index()

train_df, holdout_df, y_train, y_holdout = train_test_split(
    model_df[features], 
    model_df['Percent admitted - total'], test_size=0.25,
    random_state=random_state)

train_df['Percent admitted - total'] = y_train
holdout_df['Percent admitted - total'] = y_holdout

train_df.reset_index(inplace=True)
holdout_df.reset_index(inplace=True)


In [0]:
k_fold = KFold(n_splits=5, random_state=random_state)

In [0]:
def get_cv_results(classifier):
    
    results = []
    for train, test in k_fold.split(train_df):
        classifier.fit(train_df.loc[train, features], train_df.loc[train, 'Percent admitted - total'])
        y_predicted = classifier.predict(train_df.loc[test, features])
        
        #Accuracy score doesn't work for regression problems!
        accuracy = accuracy_score(train_df.loc[test, 'Percent admitted - total'], y_predicted)
        results.append(accuracy)
        #print(accuracy)
    
    return np.mean(results), np.std(results)

In [22]:
logreg = LogisticRegression(
    random_state=random_state, 
    solver='lbfgs'
)

get_cv_results(logreg)

(0.013822558726460881, 0.007891999850850526)

In [23]:
dtree = DecisionTreeRegressor(
    random_state=random_state, 
    #max_depth=max_depth
)

get_cv_results(dtree)

(0.019782566491943314, 0.014015889100325227)

In [24]:
rforest = RandomForestRegressor(
    random_state=random_state, 
    #max_depth=max_depth,
    n_estimators=100
)

get_cv_results(rforest)

ValueError: ignored

In [16]:
gbm = GradientBoostingRegressor(
    random_state=random_state, 
    #max_depth=max_depth,
    n_estimators=100
)

get_cv_results(gbm)

ValueError: ignored