In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20. Prefer `model_selection` and fall back to the old module only on
# installations that predate it, so this cell runs on both.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [2]:
# Read the labelled training data and the quiz set (the rows to predict).
df, df_quiz = [
    pd.read_csv(csv_path)
    for csv_path in ("./data/data.csv", "./data/quiz.csv")
]

In [3]:
# Split the target off the feature frame: remove the 'label' column from df
# and keep an independent copy of it as df_y.
df_y = df.pop('label').copy()

In [4]:
# Column labels dropped from both the training and the quiz frame before
# encoding. NOTE(review): the reason these columns were chosen is not recorded
# here -- confirm with the author.
cols_to_delete = ['18', '25', '29', '31', '32', '35', '23', '26', '58']
for frame in (df, df_quiz):
    frame.drop(cols_to_delete, axis=1, inplace=True)

In [5]:
# Start from every remaining column and take out '59' and '60', which are
# purely numeric; whatever is left is treated as categorical.
categorical_cols_enhanced = df.columns.tolist()
for numeric_col in ('59', '60'):
    categorical_cols_enhanced.remove(numeric_col)

In [6]:
# One-hot encode the categorical columns of the training frame; columns not
# listed in categorical_cols_enhanced are passed through unchanged.
df_one_hot = pd.get_dummies(
    data=df,
    columns=categorical_cols_enhanced,
)

In [7]:
# One-hot encode the quiz set with the same categorical columns as the
# training frame.
df_one_hot_quiz = pd.get_dummies(df_quiz, columns=categorical_cols_enhanced)

# Dummy columns that exist for the training data but not for the quiz data
# (kept as a variable so the mismatch can be inspected).
col_to_add = np.setdiff1d(df_one_hot.columns, df_one_hot_quiz.columns)

# Align the quiz matrix to the training matrix in a single step: reindex adds
# the missing dummy columns filled with 0, drops dummy columns that only occur
# in the quiz data, and puts the columns in the training order. This replaces
# inserting the missing columns one at a time, which is slow on wide frames.
df_one_hot_quiz = df_one_hot_quiz.reindex(columns=df_one_hot.columns, fill_value=0)

In [8]:
# Split the encoded features and labels into a training part (80% share) and
# an evaluation part; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_one_hot,
    df_y,
    train_size=0.8,
    random_state=1,
)
# Alternative: train on every labelled row (no hold-out split).
# X_train, y_train = df_one_hot, df_y

In [9]:
def get_predictions(cls, test_features):
    """Return hard predictions and second-class probabilities for a fitted model.

    Args:
        cls: fitted estimator exposing ``predict`` and ``predict_proba``.
        test_features: DataFrame of features; its index is reused for the
            returned Series.

    Returns:
        A tuple ``(predicted, predicted_probs)`` where ``predicted`` is a
        Series named 'predicted' indexed like ``test_features`` and
        ``predicted_probs`` is column 1 of the ``predict_proba`` output.
    """
    labels = pd.Series(
        cls.predict(test_features),
        index=test_features.index,
        name='predicted',
    )
    # Column 1 of predict_proba: the probability of the second listed class.
    second_class_probs = cls.predict_proba(test_features)[:, 1]
    return labels, second_class_probs

In [10]:
def run_rfc(rfc):
    """Fit ``rfc`` on the training split and print train/test metrics.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. The estimator is fitted in place, its repr is printed, and
    then accuracy, precision and recall are printed for the training split
    followed by the test split.

    Args:
        rfc: estimator with ``fit``, ``predict``, ``predict_proba`` and
            ``score``.
    """
    rfc.fit(X_train, y_train)
    print(rfc)

    # Only the hard predictions are needed; the probabilities are discarded.
    preds_test = get_predictions(rfc, X_test)[0]
    preds_train = get_predictions(rfc, X_train)[0]

    # Each metric is wrapped in a callable so it is computed right before its
    # line is printed, one after another.
    report = (
        ('train accuracy:', lambda: rfc.score(X_train, y_train)),
        ('train precision:', lambda: precision_score(y_train, preds_train)),
        ('train recall:', lambda: recall_score(y_train, preds_train)),
        ('test accuracy:', lambda: rfc.score(X_test, y_test)),
        ('test precision:', lambda: precision_score(y_test, preds_test)),
        ('test recall:', lambda: recall_score(y_test, preds_test)),
    )
    for caption, compute in report:
        print(caption, compute())

In [11]:
# Forest configuration under evaluation; the fixed seed keeps runs repeatable.
rfc = RandomForestClassifier(
    n_estimators=20,
    min_samples_leaf=3,
    random_state=1,
)

In [12]:
# Fit the forest on the training split and print accuracy, precision and
# recall for both the training and the test split.
run_rfc(rfc)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
train accuracy: 0.943746365885
train precision: 0.961155378486
train recall: 0.908744394619
test accuracy: 0.931173131504
test precision: 0.945976901785
test recall: 0.893607429447


In [13]:
# Predict labels for the aligned quiz matrix with the current state of rfc.
quiz_preds = rfc.predict(df_one_hot_quiz)

In [14]:
def submission(preds, ver):
    """Write predictions to ``./submissions/submission_<ver>.csv``.

    The file starts with an ``Id,Prediction`` header, followed by one row per
    prediction. Ids are numbered from 1 in iteration order.

    Args:
        preds: iterable of predictions, written with ``str.format``.
        ver: version tag inserted into the file name.
    """
    import os

    path = "./submissions/submission_{}.csv".format(ver)
    # Create the output directory on first use; without this, open() raises
    # when ./submissions does not exist yet.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w') as f:
        f.write("Id,Prediction\n")
        f.writelines(
            "{},{}\n".format(row_id, pred)
            for row_id, pred in enumerate(preds, start=1)
        )

In [15]:
# Uncomment to write the quiz predictions to ./submissions/submission_08.csv.
#submission(quiz_preds, "08")

# RandomForestClassifier Results

### n_estimators = 20
### min_samples_leaf = 1
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
train accuracy: 0.997585469454
train precision: 0.999009967824
train recall: 0.995493273543
test accuracy: 0.9456401766
test precision: 0.960895975702
test recall: 0.912812190064


### n_estimators = 20
### min_samples_leaf = 20
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=20, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
train accuracy: 0.908967270792
train precision: 0.926049974699
train recall: 0.861704035874
test accuracy: 0.904367707348
test precision: 0.919044395009
test recall: 0.856730682535


### n_estimators = 20
### min_samples_leaf = 10
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
train accuracy: 0.920763977175
train precision: 0.938138152533
train recall: 0.877600896861
test accuracy: 0.914853358562
test precision: 0.928838951311
test recall: 0.872058425751


### n_estimators = 20
### min_samples_leaf = 5
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
train accuracy: 0.932008790862
train precision: 0.950311746017
train recall: 0.891950672646
test accuracy: 0.924471775465
test precision: 0.938785270206
test recall: 0.88495176269


### n_estimators = 20
### min_samples_leaf = 3
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
train accuracy: 0.943746365885
train precision: 0.961155378486
train recall: 0.908744394619
test accuracy: 0.931173131504
test precision: 0.945976901785
test recall: 0.893607429447