In [None]:
import pandas as pd
from sklearn import feature_selection, tree
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC, LinearSVR
from sklearn.linear_model import BayesianRidge, LinearRegression, LassoCV
from sklearn import metrics
import numpy as np

from itertools import product, combinations
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Load the labelled training outcomes; the first CSV column becomes the index.
# NOTE(review): the frame is called `df_test` although it is read from
# train.csv -- the name is kept because later cells refer to it.

data_dir = '../.data'
fp_train = '/'.join((data_dir, 'train.csv'))
df_test = pd.read_csv(fp_train, index_col=0)

In [None]:
# Load the background feature data from the Stata (.dta) file.

df_path = "./../FFChallenge_v2/background.dta"

df_features = None
# A .dta file is binary, so the handle must be opened in "rb" mode: text mode
# ("r") can corrupt the byte stream on Windows and fails on Python 3.
with open(df_path, "rb") as f:
    df_features = pd.read_stata(f)
    print(df_features.head())
    
# Index by challengeID and turn the literal string 'NA' into NaN so that
# columns can be parsed as numbers.
df_train = df_features.set_index('challengeID')
df_train_na = df_train.replace('NA', np.nan)
df_train_na['cf4fint'] = pd.to_datetime(df_train_na['cf4fint'])

# Try to parse every object-dtype column as numeric; with errors='ignore' a
# column that cannot be parsed is returned unchanged.
df_train_na_cols = df_train_na.columns[df_train_na.dtypes == 'object']
df_train_na[df_train_na_cols] = df_train_na[df_train_na_cols].apply(
    pd.to_numeric, errors='ignore')

# Keep the columns that are not object dtype, then discard any column that
# contains nothing but nulls.
df_train_no_obj = df_train_na.loc[:, df_train_na.dtypes != 'object']
final_cols = df_train_no_obj.columns[df_train_no_obj.notnull().any()]
df_final = pd.DataFrame(df_train_no_obj[final_cols])
print(df_final.shape)

# Count the distinct values in each column and keep only the columns that
# hold more than one, i.e. drop columns that are constant.
n = df_final.apply(lambda column: len(column.unique()))
df_final = pd.DataFrame(df_final.loc[:, n > 1])
print(df_final.shape)

In [None]:
# Inspect a single column of the cleaned feature frame.
df_final['hv5_wj9raw']

In [None]:
# Candidate feature columns, collected into named lists so that a feature set
# can be put together by picking (or concatenating) groups.
cols_interest = [
    't5c13a', 't5c13b', 't5c13c',
    't5b1a', 't5b1b', 't5b1c', 't5b1d', 't5b1e', 't5b1f', 't5b1g',
    't5b1h', 't5b1i', 't5b1j', 't5b1k', 't5b1l', 't5b1m', 't5b1n',
    't5b2c',
    't5d5', 't5d6', 't5d7',
]

cols_child = [
    'k5g1a', 'k5g1b', 'k5g1c', 'k5g1d', 'k5g1e',
    'k5g2a', 'k5g2b', 'k5g2c', 'k5g2d', 'k5g2e', 'k5g2f', 'k5g2h',
]

cols_routine = [
    'k5d1a', 'k5d1b', 'k5d1c', 'k5d1d',
    'k5d1e', 'k5d1f', 'k5d1g', 'k5d1h',
]

cols_income = ['m5j1']

other = ['p5i1i']

# Only 'hv5_ppvtraw' is enabled.  Disabled candidates: hv4l34, hv5_wj9raw,
# hv5_wj10raw, hv5_wj9ss, hv5_wj10ss, hv5_ppvtss.
home_visit = ['hv5_ppvtraw']

# Active feature set.  The other groups (cols_child, cols_routine,
# cols_income, other, cols_interest) can be swapped in or concatenated here.
cols = home_visit

# Restrict the working frame to the selected columns; wrap df_final itself
# instead to work with every feature.
df = pd.DataFrame(df_final.loc[:, cols])

In [None]:
# Attach the GPA outcome to the selected features and keep only the rows in
# which no value is missing.
df['gpa'] = df_test.gpa
# Take an explicit copy: columns of teacher_survey are assigned to further
# down, and writing into the raw dropna() result can raise pandas'
# SettingWithCopyWarning.
teacher_survey = df.dropna().copy()
print('Before cutting: {}'.format(teacher_survey.shape))

try:
    _STRING_TYPES = basestring  # Python 2: matches both str and unicode
except NameError:
    _STRING_TYPES = str  # Python 3: basestring does not exist


def prepare(x):
    """Convert a labelled value to its leading integer code.

    A string value is split on single spaces and the first piece is parsed
    with int(), e.g. '3 Often' -> 3.  Any non-string value is returned
    unchanged.

    Raises:
        ValueError: if the text before the first space is not an integer.
    """
    if isinstance(x, _STRING_TYPES):
        return int(x.split(' ')[0])
    return x

# Reduce labelled answers in the feature columns to their numeric codes.
teacher_survey[cols] = teacher_survey[cols].applymap(prepare)

# Keep rows whose first feature column is above -9 (a stricter `> 0` cut was
# also tried), then multiply every entry by (entry > 0), which zeroes out
# anything that is not positive.
teacher_survey = teacher_survey.loc[teacher_survey[cols[0]] > -9]
teacher_survey = teacher_survey.applymap(lambda value: value * (value > 0))
print('After cutting: {}'.format(teacher_survey.shape))


In [None]:
# Positional split point: the first `b` rows are used for fitting.
b = 400

# Regressors that were tried here: BayesianRidge(),
# tree.DecisionTreeRegressor(), LinearRegression().  LassoCV is the one in use.
reg = LassoCV()

X = teacher_survey.loc[:, cols]
Y = teacher_survey['gpa']
reg.fit(X.iloc[:b], Y.iloc[:b])

In [None]:
# Score the fitted regressor on the rows from position `b` onwards, i.e. the
# rows that were not used for fitting.
out_reg = reg.predict(X.iloc[b:])
# Slice the targets with .iloc as well, so that features and targets are cut
# by position in exactly the same way whatever the index labels are.
metrics.mean_squared_error(Y.iloc[b:], out_reg)

In [None]:
# Histogram of the regressor's predictions stored in out_reg.
plt.hist(x=out_reg)

In [None]:
# Encode each distinct GPA value as an integer class label and keep the
# inverse mapping for decoding predictions.  The mapping is built from
# teacher_survey.gpa rather than from Y itself, so re-running this cell gives
# the same result instead of rebuilding keys_back from already-encoded labels.
gpa_levels = teacher_survey.gpa.unique()
keys = {level: code for code, level in enumerate(gpa_levels)}
keys_back = dict(enumerate(gpa_levels))
Y = teacher_survey.gpa.apply(lambda gpa: keys[gpa])

In [None]:
try:
    _STRING_TYPES = basestring  # Python 2: matches both str and unicode
except NameError:
    _STRING_TYPES = str  # Python 3: basestring does not exist


def checkNegative(x):
    """Return True when x is a string that starts with '-', otherwise False."""
    return isinstance(x, _STRING_TYPES) and x.startswith('-')

In [None]:
# Chi-squared statistic and p-value of every feature against the encoded
# target; features whose statistic is NaN are left out of the ranking table.
chi2, pval = feature_selection.chi2(X, Y)
has_score = ~np.isnan(chi2)
feat_indx = X.columns[has_score]
chi2_nona = chi2[has_score]
pval_nona = pval[has_score]

# Per-feature diagnostics: number of distinct values and number of nulls.
# (A count based on checkNegative was sketched here as well but is disabled.)
n_unique = X.apply(lambda column: len(column.unique()))
n_nan = X.isnull().sum()

feat_rank = pd.DataFrame(
    {
        'chi2': chi2_nona,
        'pval': pval_nona,
        'unqe': n_unique,
        'n_nan': n_nan,
    },
    index=feat_indx,
)

In [None]:
# Rank the features by ascending p-value and show the table.
ordered = feat_rank.sort_values(by='pval')
print(ordered)

In [None]:
# Use the 11 best-ranked features and fit on the first `cutoff` rows.
cols = ordered.index[0:11]
cutoff = 500

# Classifiers that were tried here: MultinomialNB(), LinearSVC().
# random_state is fixed so that the fitted tree is the same on every run;
# scikit-learn permutes the features at each split, so an unseeded tree can
# differ between runs.
clf = tree.DecisionTreeClassifier(random_state=0)

# .iloc on both sides keeps features and targets sliced by position alike.
clf.fit(X[cols].iloc[:cutoff], Y.iloc[:cutoff])

In [None]:
# Number of predictions for the rows after `cutoff`, decoded back to GPA
# values.  The predictions are computed here because `out` is only assigned
# in a later cell, so referring to it fails when the notebook is run top to
# bottom.  A list is built because len() of a map object fails on Python 3.
len([keys_back[label] for label in clf.predict(X[cols].iloc[cutoff:])])

In [None]:
# Predict class labels for the rows after `cutoff` and compare with the truth.
out = clf.predict(X[cols].iloc[cutoff:])
Y_holdout = Y.iloc[cutoff:]  # same positional slice as the features above

plt.hist(out)
Y_holdout.hist(alpha=0.5)
print(metrics.classification_report(Y_holdout, out))
print(metrics.confusion_matrix(Y_holdout, out))

# MSE on the GPA scale.
# NOTE(review): `keys_back[x] * 0 + 2.866` is 2.866 for any finite GPA value,
# so this scores a constant guess of 2.866 rather than the classifier's
# decoded predictions -- confirm that this baseline is what is intended.
# Lists are passed (not map objects) so the call also works on Python 3.
true_gpa = Y_holdout.apply(lambda label: keys_back[label])
constant_guess = [keys_back[label] * 0 + 2.866 for label in out]
print(metrics.mean_squared_error(true_gpa, constant_guess))

In [None]:
# Overlay normalised histograms of one categorical feature's integer codes for
# the rows with GPA above 2.75 and the rows with GPA at or below 2.75.
# NOTE(review): newer Matplotlib releases dropped `normed` in favour of
# `density`; it is left as-is to match the environment this was written for.
analysis = 'cm1bsex'
codes = df_final[analysis].cat.codes
above = df_test.index[df_test.gpa > 2.75]
at_or_below = df_test.index[df_test.gpa <= 2.75]
codes.loc[above].hist(normed=True, alpha=0.5, bins=10)
codes.loc[at_or_below].hist(normed=True, alpha=0.5, bins=10)
#plt.ylim([0,.2])

In [None]:
# List the distinct values found in column 't5a2b'.
df_final.loc[:, 't5a2b'].unique()

### Way forward: make a list of all binary measures (to compute odds ratios, ORs), then supplement with logistic regression.