In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import precision_score, recall_score

In [2]:
df = pd.read_csv("./data/data.csv")
df_quiz = pd.read_csv("./data/quiz.csv")

In [3]:
df_y = df['label'].copy()
del df['label']

In [4]:
cols_to_delete = ['18','25','29', '31', '32', '35', '23', '26', '58']
for col in cols_to_delete:
    del df[col]
    del df_quiz[col]

In [5]:
# Normalize the continuous columns
df['59'] = (df['59'] - df['59'].mean()) /  (df['59'].max() - df['59'].min())
df['60'] = (df['60'] - df['60'].mean()) /  (df['60'].max() - df['60'].min())

df_quiz['59'] = (df_quiz['59'] - df_quiz['59'].mean()) /  (df_quiz['59'].max() - df_quiz['59'].min())
df_quiz['60'] = (df_quiz['60'] - df_quiz['60'].mean()) /  (df_quiz['60'].max() - df_quiz['60'].min())

In [6]:
# Define categorical columns
categorical_cols = ['56', '20', '14', '17', '16', '57', '0', '5', '7', '9', '8']
num_to_categorical_cols = [str(i) for i in range(38,52)] # numerical columns with [0,1,2] vals, also converting
categorical_cols_enhanced = categorical_cols + num_to_categorical_cols  

In [7]:
# Convert categorical to one-hot sparse column
df_one_hot = pd.get_dummies(df, columns=categorical_cols_enhanced)

In [8]:
df_one_hot_quiz = pd.get_dummies(df_quiz, columns=categorical_cols_enhanced)

col_to_add = np.setdiff1d(df_one_hot.columns, df_one_hot_quiz.columns)
for c in col_to_add:
    df_one_hot_quiz[c] = 0

df_one_hot_quiz = df_one_hot_quiz[df_one_hot.columns]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df_one_hot, df_y, random_state=3, train_size=0.8)

In [10]:
def get_predictions(cls, test_features):
    predicted = cls.predict(test_features)
    predicted_probs = cls.predict_proba(test_features).transpose()[1]
    predicted = pd.Series(predicted, index=test_features.index)
    predicted.name = 'predicted'
    return predicted, predicted_probs

In [11]:
def run_rfc(rfc):
    rfc.fit(X_train, y_train)
    print (rfc)
    test_preds, _ = get_predictions(rfc, X_test)
    train_preds, _ = get_predictions(rfc, X_train)
    
    print ('train accuracy:', rfc.score(X_train, y_train))
    print ('train precision:', precision_score(y_train, train_preds))
    print ('train recall:', recall_score(y_train, train_preds))
    
    print ('test accuracy:', rfc.score(X_test, y_test))
    print ('test precision:', precision_score(y_test, test_preds))
    print ('test recall:', recall_score(y_test, test_preds))

In [12]:
rfc = RandomForestClassifier(n_estimators=9, max_features='log2', random_state=3, min_samples_leaf=2, 
                             min_samples_split=3, max_depth=20)

In [13]:
rfc_default = RandomForestClassifier()

In [14]:
run_rfc(rfc)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='log2', max_leaf_nodes=None,
            min_samples_leaf=2, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=9, n_jobs=1,
            oob_score=False, random_state=3, verbose=0, warm_start=False)
train accuracy: 0.89055770728
train precision: 0.913536429064
train recall: 0.829896330131
test accuracy: 0.887496058026
test precision: 0.903394513519
test recall: 0.830009066183


In [15]:
run_rfc(rfc_default)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
train accuracy: 0.994402231223
train precision: 0.9986203777
train recall: 0.988647813529
test accuracy: 0.942959634185
test precision: 0.957422434368
test recall: 0.9092475068
