In [58]:
import TIdatabase as ti
%matplotlib inline 
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import statsmodels.api as sm
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
from matplotlib import rcParams
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn import svm
from sklearn.preprocessing import Imputer
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

Read the data. 

In [59]:
df = pd.read_csv("collegedata_normalized.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,studentID,classrank,admissionstest,AP,averageAP,SATsubject,GPA,GPA_w,program,...,alumni,outofstate,acceptStatus,acceptProb,name,acceptrate,size,public,finAidPct,instatePct
0,0,PWY05BUB4I,,0.926899,7,1.067339,0.324272,-0.18711,0.059947,Biomedical engineering,...,0,0,1,,Rice,0.151,6621,0,0,0
1,1,3UVDFVI9Z0,,0.293054,7,0.660575,-0.440777,0.493474,0.398944,Classics,...,0,1,1,,Rice,0.151,6621,0,0,0
2,2,BCCBHJUP0M,,0.293054,0,,0.324272,0.396247,-1.035273,Biological Science,...,0,1,0,,Rice,0.151,6621,0,0,0
3,3,WZFPWHSQMS,,1.387878,7,0.863957,1.08932,0.104569,-0.383356,Physics,...,0,1,0,,Rice,0.151,6621,0,0,0
4,4,5W1JNQA7G0,,0.408299,1,-0.356334,-0.440777,0.542087,,,...,0,1,1,,Rice,0.151,6621,0,0,0


We now check for NaNs. The code below shows that 23% of all entries are null. This is way too high. 

In [60]:
x = df.isnull().sum(axis=1).tolist()
y = float(sum(x)) / (df.shape[0]*df.shape[1])
print y

0.225931657684


To decrease the proportion of NaNs, we get rid of columns which are almost all null. From the `df.head()` above, we see that either a column is almost all null, or it is almost all non-null. Therefore the 23% above is probably driven mainly by columns that are basically all null. 

So we remove any column that has 50% or more null values. This takes out any predictors that would have been useless anyway. 

In [77]:
cols_to_drop = []
for i in df.columns:
    if 1.0* df[i].isnull().sum() / len(df[i]) >= 0.5:
        cols_to_drop.append(i)
print cols_to_drop
dfr = df.drop(cols_to_drop,axis=1)

['classrank', 'canAfford', 'firstinfamily', 'artist', 'workexp', 'visited', 'acceptProb']


Now drop all columns where the accept status is NaN. 

In [78]:
dfr = dfr[pd.notnull(df["acceptStatus"])]

In [79]:
x = dfr.isnull().sum(axis=1).tolist()
y = float(sum(x)) / (dfr.shape[0]*dfr.shape[1])
print y

0.0304018831218


Now we have only 3% null values. Good! The next step is to choose which columns we want to use to predict. Obviously columns like `studentID`, while crucial, are not actually predictors. Also, we remove weighted GPA in favour of GPA, as we have already normalised everything. 

In [80]:
print dfr.columns
predictor_cols = ["admissionstest","AP","averageAP","SATsubject","GPA","schooltype","intendedgradyear","female","MinorityRace","international","sports","earlyAppl","alumni","outofstate","acceptrate","size","public","finAidPct","instatePct"]
dfpredict = dfr[predictor_cols]
dfresponse = dfr["acceptStatus"]

Index([u'Unnamed: 0', u'studentID', u'admissionstest', u'AP', u'averageAP',
       u'SATsubject', u'GPA', u'GPA_w', u'program', u'schooltype',
       u'intendedgradyear', u'addInfo', u'female', u'MinorityGender',
       u'MinorityRace', u'international', u'sports', u'collegeID',
       u'earlyAppl', u'alumni', u'outofstate', u'acceptStatus', u'name',
       u'acceptrate', u'size', u'public', u'finAidPct', u'instatePct'],
      dtype='object')


The code below demonstrates that if we remove all the rows with ANY nulls in it, we reduce our dataset from 16k to 13k. This reduces our dataset too much. So we will have to impute the missing values. We initially tried to do this using the `mice` package in R, but there does not seem to be an equivalent in Python. Since the % of nulls is just 4%, it shouldn't matter too much what method we use. Since some of the variables are factor, not numerical, we can't use mean or media. We are looking into KNN imputation, but for the time being, just use median. As stated, it shouldn't matter too much what method we use. 

In [81]:
print dfpredict.dropna(axis=0,how="any").shape
print dfpredict.shape

(10897, 19)
(13291, 19)


In [65]:
imp = Imputer(missing_values="NaN", strategy="median", axis=1)
imp.fit(dfpredict)
X = imp.transform(dfpredict)
y = dfresponse
X.shape, y.shape

((16062, 19), (16062,))

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print X_train.shape, X_test.shape, y_train.shape, y_test.shape, y.shape

(12849, 19) (3213, 19) (12849,) (3213,) (16062,)


In [76]:
y

0         1
1         1
2         0
3         0
4         1
5       NaN
6         0
7         0
8         0
9         1
10        0
11        0
12        1
13        0
14        0
15        0
16        1
17        0
18        0
19        1
20        0
21        0
22        0
23        0
24        0
25        1
26        0
27      NaN
28        0
29        0
         ..
16032   NaN
16033     0
16034     1
16035     0
16036     0
16037   NaN
16038     0
16039     1
16040     0
16041     0
16042     0
16043     0
16044     0
16045     1
16046     0
16047     0
16048     0
16049     0
16050     0
16051     0
16052     0
16053     1
16054     0
16055   NaN
16056     0
16057     0
16058     0
16059     0
16060     1
16061   NaN
Name: acceptStatus, dtype: float64

In [57]:
clf = linear_model.LogisticRegression(C=1000)
clf.fit(X_train,y_train)
# predicted = clf.predict(X_test)
# print metrics.accuracy_score(y_test, predicted)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').