In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.cross_validation import train_test_split
from sklearn.metrics import precision_score, recall_score
from time import time

In [None]:
%matplotlib inline

In [None]:
# Read the training data and the quiz data from the shared data directory;
# adjust the relative paths to match your own layout.
df = pd.read_csv("../data/data.csv")
df_quiz = pd.read_csv("../data/quiz.csv")

In [None]:
# Show the (rows, columns) shape of both frames to confirm the files loaded as expected
print df.shape, df_quiz.shape

In [None]:
# Pull the target column out of the training frame so that `df` holds only the
# input vectors; `pop` returns the column and removes it from `df` in one step.
df_y = df.pop('label')

In [None]:
# Summary statistics of the feature columns, as a quick look at their ranges
df.describe()

In [None]:
# Drop the zero-variance columns (all 0's or all 1's) from both the training
# frame and the quiz frame so the two keep the same set of columns.
cols_to_delete = ['18','25','29', '31', '32', '35', '23', '26', '58']
df = df.drop(cols_to_delete, axis=1)
df_quiz = df_quiz.drop(cols_to_delete, axis=1)

In [None]:
# Get list of remaining columns
df_cols = list(df.columns.values)
print len(df_cols)

In [None]:
# Maps each remaining column name to the distinct values it takes in the
# training data; used later on when sparsifying (one-hot expanding) the quiz data.
df_cols_dict = {}

In [None]:
# Print the number of values each column takes and populate df_cols_dict
for col in df_cols:
    print "*****column: ", col
    print pd.value_counts(df[col].values)
    df_cols_dict[col] = pd.value_counts(df[col].values).to_dict().keys()

In [None]:
# Normalize the continuous columns '59' and '60' (mean-centred, divided by the range) -- currently disabled
##df['59'] = (df['59'] - df['59'].mean()) /  (df['59'].max() - df['59'].min())
##df['60'] = (df['60'] - df['60'].mean()) /  (df['60'].max() - df['60'].min())

##df_quiz['59'] = (df_quiz['59'] - df_quiz['59'].mean()) /  (df_quiz['59'].max() - df_quiz['59'].min())
##df_quiz['60'] = (df_quiz['60'] - df_quiz['60'].mean()) /  (df_quiz['60'].max() - df_quiz['60'].min())


In [None]:
# Columns that get one-hot encoded: the categorical columns plus the numerical
# columns '38'..'51', which (per the original note) only take the values 0, 1, 2.
categorical_cols = ['56', '20', '14', '17', '16', '57', '0', '5', '7', '9', '8']
num_to_categorical_cols = list(map(str, range(38, 52)))
categorical_cols_enhanced = categorical_cols + num_to_categorical_cols

In [None]:
# One-hot encode the training frame: each column listed in
# categorical_cols_enhanced is replaced by one indicator column per distinct
# value (named "<column>_<value>"); the remaining columns are left as they are.
df_one_hot = pd.get_dummies(df, columns=categorical_cols_enhanced)

In [None]:
# Shape of the training frame after one-hot encoding
df_one_hot.shape

## Inflating quiz data

In [None]:
# list of dicts with inflated data, will be used to construct quiz dataframe
quiz_raw_data = []

# different column types, will be inflated differently 
cols_set = set(df_cols)
cols_categ_set = set(categorical_cols_enhanced)
cols_num_set = cols_set - cols_categ_set
cols_num_to_categ = set(num_to_categorical_cols)

# Sanity check
if len(set.union(cols_num_set,cols_categ_set)) != len(cols_set):
    raise RuntimeError

for i in range(len(df_quiz)):
    if i%1000 == 0:
        print "inflating row", i
    x = df_quiz.iloc[i].to_dict()
    x_inflated = {}
    for k,v in x.items():
        if k in cols_num_set:
            x_inflated[k] = v
        elif k in cols_num_to_categ:
            for k2 in ['0.0','1.0','2.0']:
                inflated_col = k + "_" + k2
                x_inflated[inflated_col] = 0
            x_inflated[k + "_" + str(v)] = 1
        else:
            for k2 in df_cols_dict[k]:
                inflated_col = k + "_" + k2
                if v == k2:
                    x_inflated[inflated_col] = 1.0 
                else:
                    x_inflated[inflated_col] = 0.0
                    
    quiz_raw_data.append(x_inflated)
            
print len(quiz_raw_data)    

In [None]:
# Column names of the one-hot training frame, in order, as a plain list
df_cols_inflated = df_one_hot.columns.tolist()

In [None]:
# Show the inflated column names so they can be compared with the quiz keys
print df_cols_inflated

In [None]:
# Build the quiz frame from the inflated row dicts, restricted to the columns of
# the one-hot training frame and in the same order. A column that is missing
# from a row dict comes out as NaN.
# NOTE(review): confirm the frame has no NaNs before feeding it to a model.
df_one_hot_quiz = pd.DataFrame(data=quiz_raw_data, columns=df_cols_inflated)

In [None]:
#df_one_hot_quiz.describe()

In [None]:
#df_one_hot_quiz.to_csv('/Users/tonatiuh/fmcode/dsi/ml/kaggle/data/quiz_inflated.csv.gz', compression='gzip')

In [None]:
def get_predictions(cls, test_features):
    """Run a fitted classifier over `test_features`.

    Returns a pair:
      * the predicted labels as a pandas Series named 'predicted', carrying
        the same index as `test_features`;
      * the second column of `cls.predict_proba(test_features)`, i.e. the
        probabilities of the class at position 1.
    """
    labels = pd.Series(
        cls.predict(test_features),
        index=test_features.index,
        name='predicted',
    )
    second_class_probs = cls.predict_proba(test_features).T[1]
    return labels, second_class_probs

In [None]:
def run_rfc(rfc):
    """Fit `rfc` on the training split and print train/test metrics.

    Reads X_train, y_train, X_test and y_test from the notebook's global
    namespace instead of taking them as arguments, so they must be defined
    before this is called. The estimator is fitted in place; nothing is returned.
    """
    rfc.fit(X_train, y_train)
    print (rfc)
    test_preds = get_predictions(rfc, X_test)[0]
    train_preds = get_predictions(rfc, X_train)[0]

    # Same three metrics for each split, train first
    splits = (
        ('train', X_train, y_train, train_preds),
        ('test', X_test, y_test, test_preds),
    )
    for split_name, features, labels, preds in splits:
        print (split_name + ' accuracy:', rfc.score(features, labels))
        print (split_name + ' precision:', precision_score(labels, preds))
        print (split_name + ' recall:', recall_score(labels, preds))

In [None]:
# Hold out a quarter of the training data so run_rfc can report both train and
# test metrics. run_rfc reads these four names as globals, so they must be
# defined before it is called. random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_one_hot, df_y, test_size=0.25, random_state=1)

# Random Forest

In [None]:
# Random forest with 40 trees; the seed is fixed so repeated runs give the same model
rfc = RandomForestClassifier(n_estimators=40, random_state=1)

In [None]:
# Fit on the training split and print accuracy, precision and recall for train and test
run_rfc(rfc)

In [None]:
# Importance score of each input column, in the column order the forest was fitted on
print rfc.feature_importances_

In [None]:
# One importance per input column, so this is the number of features the forest saw
print len(rfc.feature_importances_)

In [None]:
# Refit the same forest on the complete one-hot training frame (all rows, all labels)
rfc.fit(df_one_hot, df_y,)

In [None]:
# Predict the quiz rows with the fitted forest and write them out as
# "Id,Prediction" lines with 1-based ids.
submission_ver = "1000"
# NOTE(review): absolute, machine-specific output directory, while the inputs are
# read from the relative ../data/ directory -- consider pointing this there too.
path = '/Users/tonatiuh/fmcode/dsi/ml/kaggle/data/'
submission_file = 'submission_' + submission_ver + '.csv'

# Predictions come from the model this notebook actually fits; the cell used to
# read a `y_svm_pred` variable that is not defined anywhere in the notebook.
y_quiz_pred = rfc.predict(df_one_hot_quiz)

# `with` closes the file even if a write fails part-way through
with open(path + submission_file, 'w') as f_out:
    f_out.write("Id,Prediction\n")
    for row_id, label in enumerate(y_quiz_pred, 1):
        f_out.write(str(row_id) + ',' + str(label) + '\n')