In [None]:
import pandas as pd
from sklearn import feature_selection, tree
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np

from itertools import product, combinations
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Get Data for Training...
# Reads train.csv from `data_dir`, using the first CSV column as the index.
# NOTE: despite its name, `df_test` holds the contents of train.csv.

data_dir = '../.data'
fp_train = '{}/train.csv'.format(data_dir)
df_test = pd.read_csv(fp_train,index_col=0)

In [None]:
# Get Feature Data...
# Loads the Stata background file and reduces it to `df_final`: the non-object
# columns that are not entirely missing and that hold more than one distinct value.

df_path = "./../FFChallenge_v2/background.dta"

# Stata .dta files are binary. Let pandas open the path itself rather than
# handing it a text-mode file handle, which can corrupt or reject the data.
df_features = pd.read_stata(df_path)
print(df_features.head())


def _to_numeric_or_keep(column):
    """Return `column` converted to a numeric dtype, or unchanged if parsing fails.

    Same result as ``pd.to_numeric(column, errors='ignore')`` without relying
    on the deprecated ``errors='ignore'`` option.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


df_train = df_features.set_index('challengeID')
# Treat the literal string 'NA' as a missing value.
df_train_na = df_train.replace('NA', np.nan)
df_train_na['cf4fint'] = pd.to_datetime(df_train_na['cf4fint'])
# Try to turn every object-typed column into numbers; columns that cannot be
# parsed stay as they are (and are dropped just below).
df_train_na_cols = df_train_na.columns[df_train_na.dtypes == 'object']
df_train_na[df_train_na_cols] = df_train_na[df_train_na_cols].apply(_to_numeric_or_keep)

# Keep only non-object columns, then drop the columns that are entirely null.
df_train_no_obj = df_train_na[df_train_na.columns[df_train_na.dtypes != 'object']]
final_cols = df_train_no_obj.columns[~ df_train_no_obj.isnull().all()]
df_final = pd.DataFrame(df_train_no_obj[final_cols])
print(df_final.shape)

# Find number of unique values in each column. If unique == 1, then remove from final data frame.
# (`unique()` reports NaN as a value of its own, so NaN counts towards `n`.)
n = df_final.apply(lambda x: len(x.unique()))
df_final = pd.DataFrame(df_final[df_final.columns[n>1]])
print(df_final.shape)

In [None]:
# Explore Categories
# Pull out the category-typed columns and build a parallel frame holding
# their integer category codes.
is_categorical = df_final.dtypes == 'category'
cat_cols = df_final[df_final.columns[is_categorical]]
cat_cols_codes = cat_cols.apply(lambda column: column.cat.codes)

In [None]:
# Preview the first rows of the category-typed columns.
cat_cols.head()

In [None]:
# List the columns available in the training labels frame.
print(df_test.columns)

In [None]:
# Form dataset
# Y: the non-null values of the chosen outcome column, re-encoded as integer
#    class ids in order of first appearance.
# X: the category codes of the rows that have an outcome.
col = 'grit'
Y = df_test[col][~ df_test[col].isnull()]
keys = dict(zip(Y.unique(),range(len(Y.unique()))))
Y = Y.apply(lambda x: keys[x])

# Have to figure out NaNs...

# Check Indices for equivalence:
# Compare against the full feature index *before* selecting rows. Comparing
# against X.index after `X = cat_cols_codes.loc[Y.index]` can never report a
# difference, because X is built from Y.index.
nDiff = len(set(Y.index) - set(cat_cols_codes.index))
print(nDiff)

X = cat_cols_codes.loc[Y.index]

In [None]:
# Rank the categorical features by a chi-squared test against Y and collect
# per-feature diagnostics in `feat_rank`.

# `checkNegative` is only defined in a later cell, so calling it here raises
# NameError on a fresh top-to-bottom run. Use a local equivalent instead.
try:
    _string_types = basestring  # Python 2: covers both str and unicode
except NameError:
    _string_types = str  # Python 3


def _starts_with_minus(value):
    """Return True when `value` is a string that begins with '-'."""
    return isinstance(value, _string_types) and value.startswith('-')


chi2, pval = feature_selection.chi2(X, Y)

# Keep only the features whose chi2 statistic is not NaN.
chi2_defined = ~np.isnan(chi2)
feat_indx = X.columns[chi2_defined]
chi2_nona = chi2[chi2_defined]
pval_nona = pval[chi2_defined]

# Per-feature diagnostics. n_unique and n_nan are computed on X (the rows
# with an outcome); n_other is computed on every row of cat_cols.
n_unique = X.apply(lambda x: len(x.unique()))
# NOTE(review): X holds `.cat.codes`, where pandas encodes a missing category
# as -1 rather than NaN, so this count may always be zero -- confirm intent.
n_nan = X.apply(lambda x: x.isnull().sum())
# Number of entries per column whose category label starts with '-'.
n_other = cat_cols.applymap(_starts_with_minus).sum()

# The Series are aligned to `feat_indx`; the arrays already have that length.
feat_rank = pd.DataFrame({
    'chi2': chi2_nona,
    'pval': pval_nona,
    'unqe': n_unique,
    'n_nan': n_nan,
    'other': n_other,
}, index = feat_indx)

In [None]:
# Histogram (100 bins) of the chi2 statistics whose p-value is below 0.001.
# `H` keeps the value returned by plt.hist.
H = plt.hist(chi2_nona[pval_nona < .001], bins=100)

In [None]:
# Histogram (100 bins) of the per-feature `n_nan` counts in feat_rank.
feat_rank.n_nan.hist(bins=100)

In [None]:
# Keep features with fewer than 10 distinct codes and fewer than 20
# '-'-prefixed entries, sorted by ascending p-value.
few_levels = feat_rank.unqe < 10
few_flagged = feat_rank.other < 20
ordered = feat_rank[few_levels & few_flagged].sort_values('pval')
print(ordered)

In [None]:
# Overlay, for each class id in Y, a normalised histogram of one feature's
# category codes.
test = 'cm1povca'
x = df_final[test].cat.codes

# NOTE(review): `normed` was replaced by `density` in newer matplotlib
# releases -- confirm against the installed version.
hist_style = dict(normed = True, alpha = .75)
for val in Y.unique():
    # Index labels of the rows that belong to this class.
    vals = Y.index[Y == val]
    x.loc[vals].plot.hist(label=val, **hist_style)
plt.legend()

In [None]:
try:
    _STRING_TYPES = basestring  # Python 2: covers both str and unicode
except NameError:
    _STRING_TYPES = str  # Python 3 has no `basestring`


def checkNegative(x):
    """Return True if `x` is a string that starts with '-', otherwise False.

    Non-string values (numbers, None, NaN, ...) always yield False, so the
    function is safe to apply element-wise to mixed-type data.
    """
    return isinstance(x, _STRING_TYPES) and x.startswith('-')

In [None]:
# Fit a decision tree on the first feature of `ordered` (lowest p-value),
# using the first `cutoff` rows (by position) as the training split.
cols = ordered.index[0:1]
cutoff = 500
#clf = MultinomialNB()
# Fix the seed so the fitted tree is reproducible across runs.
clf = tree.DecisionTreeClassifier(random_state=0)
# Slice X and Y the same (positional) way so the rows stay paired.
clf.fit(X[cols].iloc[:cutoff], Y.iloc[:cutoff])

In [None]:
# Predict class ids for the rows from position `cutoff` onwards, using the
# same feature columns the classifier was fitted on.
out = clf.predict(X[cols].iloc[cutoff:])

In [None]:
# Compare the predictions with the true class ids of the rows after `cutoff`.
y_holdout = Y[cutoff:]
plt.hist(out)
print(metrics.classification_report(y_holdout, out))
print(metrics.confusion_matrix(y_holdout, out))

In [None]:
# Histogram of the integer-encoded outcome values in Y.
Y.hist()

In [None]:
# Pair the t5b1m feature with gpa for the rows where gpa is present.
has_gpa = df_test.gpa.notnull()
x = df_final.t5b1m.loc[df_test.index[has_gpa]]
y = df_test.gpa[has_gpa]

In [None]:
# Dot plot of the category codes of `x` against `y`.
plt.plot(x.cat.codes, y, '.')

In [None]:
# Select every column of df_final whose name starts with 't5'.
t5_columns = [name for name in df_final.columns if name.startswith('t5')]
teacher_test = df_final[t5_columns]

In [None]:
# Display the 't5*' columns selected from df_final.
teacher_test

In [None]:
# For the row with index label 1, count how often each value occurs across
# the 't5*' columns.
teacher_test.loc[1].value_counts()

In [None]:
# Count the entries of t5a4 whose value starts with '-'.
# NOTE(review): assumes every value the lambda receives is a string; a
# non-string value would raise AttributeError -- confirm, or use checkNegative.
teacher_test.t5a4.apply(lambda x: x.startswith('-')).sum()

In [None]:
# Display teacher_test (same frame that is displayed in an earlier cell).
teacher_test