In [None]:
import pandas as pd
from sklearn import feature_selection, tree
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC, LinearSVR
from sklearn.linear_model import BayesianRidge, LinearRegression, LassoCV, LogisticRegressionCV
from sklearn import metrics
from sklearn.preprocessing import normalize
import numpy as np

from itertools import product, combinations
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Load the training CSV; its first column becomes the row index.

data_dir = '../.data'
fp_train = '/'.join([data_dir, 'train.csv'])

# NOTE(review): despite the `df_test` name, this frame is read from train.csv.
df_test = pd.read_csv(fp_train, index_col=0)

In [None]:
# Load the background feature file and reduce it to usable non-object columns.

df_path = "./../FFChallenge_v2/background.dta"

# Stata .dta files are binary. Give pandas the path so it opens the file in
# binary mode itself; a text-mode handle ("r") cannot be parsed on Python 3
# and corrupts the byte stream on platforms that translate line endings.
df_features = pd.read_stata(df_path)


def _to_numeric_if_possible(column):
    """Return `column` converted by pd.to_numeric, or unchanged if conversion fails."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


df_train = df_features.set_index('challengeID')
# The literal string 'NA' is treated as a missing value.
df_train_na = df_train.replace('NA', np.nan)
df_train_na['cf4fint'] = pd.to_datetime(df_train_na['cf4fint'])

# Object columns that parse cleanly as numbers become numeric; the rest are
# left untouched (and are therefore dropped by the dtype filter below).
df_train_na_cols = df_train_na.columns[df_train_na.dtypes == 'object']
df_train_na[df_train_na_cols] = df_train_na[df_train_na_cols].apply(_to_numeric_if_possible)

# Keep non-object columns only, then discard columns that are entirely null.
df_train_no_obj = df_train_na[df_train_na.columns[df_train_na.dtypes != 'object']]
final_cols = df_train_no_obj.columns[~ df_train_no_obj.isnull().all()]
df_final = pd.DataFrame(df_train_no_obj[final_cols])
print(df_final.shape)

# Find number of unique values in each column. If unique == 1, then remove from final data frame.
n = df_final.apply(lambda x: len(x.unique()))
df_final = pd.DataFrame(df_final[df_final.columns[n > 1]])
print(df_final.shape)

In [None]:
try:
    _STRING_TYPES = basestring  # Python 2: covers both str and unicode
except NameError:
    _STRING_TYPES = str  # Python 3 has no `basestring`


def prepare(x):
    """Reduce a string of the form '<int> <rest>' to its leading integer.

    Parameters
    ----------
    x : object
        A single cell value.

    Returns
    -------
    object
        ``int`` of the text before the first space when ``x`` is a string;
        otherwise ``x`` itself, unchanged.

    Raises
    ------
    ValueError
        If ``x`` is a string whose text before the first space is not an
        integer literal.
    """
    if isinstance(x, _STRING_TYPES):
        # Only the token before the first space carries the numeric code.
        return int(x.split(' ', 1)[0])
    return x

In [None]:
# Column-name prefixes of the variable groups to summarise. Each active prefix
# produces one '<prefix>_avg' column in `df`.
grps = [
    # Disabled candidates, kept for reference:
    #   't5b1', 't5b2', 't5b3', 't5b4', 't5c13', 't5d1',
    #   'k5g1', 'k5g2', 'k5d1',
    'hv4ppvtraw',
    #   'hv4ppvtraw_m',
    'hv5_wj9raw',
    'hv5_wj10raw',
    'hv5_wj9ss',
    'hv5_wj10ss',
    'hv5_ppvtraw',
    'hv5_ppvtss',
    #   'm5j1', 'hv4t', 'cm1bsex',
]

df = pd.DataFrame()
cols = {}
for grp in grps:
    # All df_final columns whose name begins with this prefix.
    prefix_mask = [name.startswith(grp) for name in df_final.columns]
    cols[grp] = df_final.columns[prefix_mask]
    first_col = cols[grp][0]

    # Strings are turned into integers by prepare(); other values pass through.
    df_temp = df_final[cols[grp]].applymap(prepare)
    # Keep only the rows whose value in the group's first column exceeds -9.
    df_temp = df_temp.loc[df_temp[first_col] > -9]
    # Multiplying by the (x >= 0) mask zeroes negative entries and leaves
    # non-negative ones as they are.
    df_temp = df_temp.apply(lambda column: column * (column >= 0))
    # Row-wise mean across the group's columns is the resulting feature.
    df_temp = df_temp.mean(axis=1)

    col_name = '{}_avg'.format(grp)
    df[col_name] = df_temp

In [None]:
# GPA cut-off used later to turn the target into a binary label.
threshold = 2.85

# Attach the target column to the feature frame, then keep only the rows
# that have no missing value in any column.
df['gpa'] = df_test['gpa']
teacher_survey = df.dropna(axis=0, how='any')
print('Before cutting: {}'.format(teacher_survey.shape))


In [None]:
# Positional split point: rows [0, b) are used for fitting.
b = 500

# Every column except the target is a predictor.
is_predictor = teacher_survey.columns != 'gpa'
cols = teacher_survey.columns[is_predictor]
X = teacher_survey[cols]
Y = teacher_survey.gpa

# Binary target: True where the GPA is above the threshold.
X_fit = X.iloc[:b]
y_fit = Y.iloc[:b] > threshold
clf = LogisticRegressionCV()
clf.fit(X_fit, y_fit)

In [None]:
# Score the rows after the first `b` with the fitted classifier.
X_holdout = X[cols].iloc[b:]
Y_holdout = Y[b:]
out = clf.predict(X_holdout)
out_proba = clf.predict_proba(X_holdout)

# Histogram of the true GPA values for those rows.
Y_holdout.hist(alpha=0.5)

# Compare the thresholded truth with the predicted classes.
holdout_labels = Y_holdout > threshold
print(metrics.classification_report(holdout_labels, out))
print(metrics.confusion_matrix(holdout_labels, out))


In [None]:
# Second stage: regress GPA on the classifier's class probabilities for the
# first `b` rows, then predict from the probabilities stored in `out_proba`.
xr = clf.predict_proba(X[cols].iloc[:b])
yr = Y[:b].values

# NOTE(review): recent scikit-learn releases may no longer accept the
# `normalize` argument — confirm against the installed version.
reg = LassoCV(normalize=True)
reg.fit(xr, yr)

out_reg = reg.predict(out_proba)
print(metrics.mean_squared_error(Y[b:], out_reg))

In [None]:
# Histogram of `out_reg`, the regressor's predictions.
plt.hist(out_reg)

In [None]:
# Univariate ranking of every predictor in X against Y.
# f_regression yields F-statistics and p-values, despite the `chi2` names used here.
chi2, pval = feature_selection.f_regression(X, Y)

# Features whose statistic is NaN are left out of the table.
has_stat = ~np.isnan(chi2)
feat_indx = X.columns[has_stat]
chi2_nona = chi2[has_stat]
pval_nona = pval[has_stat]

feat_rank = pd.DataFrame(
    {'chi2': chi2_nona, 'pval': pval_nona},
    index=feat_indx,
)

# Smallest p-value first.
ordered = feat_rank.sort_values(by='pval')
print(ordered)

In [None]:
# Fit the Lasso directly on the feature columns: first `b` rows for fitting,
# the remaining rows for the error estimate.
# (Alternative tried previously: tree.DecisionTreeRegressor().)
lasso_fit_rows = X.iloc[:b]
lasso_eval_rows = X.iloc[b:]

reg = LassoCV(normalize=True)
reg.fit(lasso_fit_rows, Y.iloc[:b])

out_lasso = reg.predict(lasso_eval_rows)
print(metrics.mean_squared_error(Y[b:], out_lasso))