In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import precision_score, recall_score, confusion_matrix

In [2]:
df = pd.read_csv("./data/data.csv")
df_quiz = pd.read_csv("./data/quiz.csv")

In [3]:
df_y = df['label'].copy()
del df['label']

In [4]:
cols_to_delete = ['18','23','25','26', '29', '31', '32', '35',  '58']
for col in cols_to_delete:
    del df[col]
    del df_quiz[col]

In [5]:
# Define categorical columns
categorical_cols_enhanced = list(df.columns)

# Remove purely numeric columns
categorical_cols_enhanced.remove('59')
categorical_cols_enhanced.remove('60')

In [6]:
# Convert categorical to one-hot sparse column
df_one_hot = pd.get_dummies(df, columns=categorical_cols_enhanced)

In [7]:
df_one_hot_quiz = pd.get_dummies(df_quiz, columns=categorical_cols_enhanced)

col_to_add = np.setdiff1d(df_one_hot.columns, df_one_hot_quiz.columns)
for c in col_to_add:
    # df_one_hot_quiz[c] = 0
    # alternatively, remove any features we don't have in either
    if c in df_one_hot.columns: del df_one_hot[c]
    if c in df_one_hot_quiz.columns: del df_one_hot_quiz[c]

df_one_hot_quiz = df_one_hot_quiz[df_one_hot.columns]

In [8]:
# for c in df_one_hot.columns:
#     if df_one_hot[c].std() < 0.5:
#         del df_one_hot[c]
#         del df_one_hot_quiz[c]

In [9]:
print ('columns:', len(df_one_hot.columns))

columns: 528


In [10]:
X_train, X_test, y_train, y_test = train_test_split(df_one_hot, df_y, random_state=1, train_size=0.8)
# X_train, y_train = df_one_hot, df_y

In [11]:
def get_predictions(cls, test_features):
    predicted = cls.predict(test_features)
    predicted_probs = cls.predict_proba(test_features).transpose()[1]
    predicted = pd.Series(predicted, index=test_features.index)
    predicted.name = 'predicted'
    return predicted, predicted_probs

In [12]:
def run_rfc(rfc):
    rfc.fit(X_train, y_train)
    print (rfc)
    test_preds, _ = get_predictions(rfc, X_test)
    train_preds, _ = get_predictions(rfc, X_train)
    
    print ('train accuracy:', rfc.score(X_train, y_train))
    print ('train precision:', precision_score(y_train, train_preds))
    print ('train recall:', recall_score(y_train, train_preds))
    
    print ('test accuracy:', rfc.score(X_test, y_test))
    print ('test precision:', precision_score(y_test, test_preds))
    print ('test recall:', recall_score(y_test, test_preds))

In [13]:
rfc = RandomForestClassifier(random_state=1, n_estimators=20, min_samples_leaf=1)

In [14]:
run_rfc(rfc)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
train accuracy: 0.997881126255
train precision: 0.999077967931
train recall: 0.996098654709
test accuracy: 0.946073793756
test precision: 0.960064351282
test recall: 0.914705617167


In [15]:
quiz_preds = rfc.predict(df_one_hot_quiz)
new_quiz_preds = [int(b) for b in rfc.predict_proba(df_one_hot_quiz).transpose()[1] > 0.4]



In [16]:
len(quiz_preds != new_quiz_preds)

31709

In [17]:
test_predicted = rfc.predict(X_test)
# predicted = pd.Series(predicted, index=test_features.index)
confusion_matrix(y_test, test_predicted)

array([[13855,   422],
       [  946, 10145]])

In [33]:
test_predicted_probs = rfc.predict_proba(X_test).transpose()[1]
new_test_preds = [int(b) for b in test_predicted_probs > 0.01]
# new_test_preds = pd.Series(new_test_preds, index=X_test.index)
print ('test accuracy:', rfc.score(X_test, new_test_preds))
print ('test precision:', precision_score(y_test, new_test_preds))
print ('test recall:', recall_score(y_test, new_test_preds))

test accuracy: 0.416548407442
test precision: 0.603589799715
test recall: 0.994500045082


[1, 1, 0, 1, 0, 1, 1, 0, 0, 1]

In [19]:
def submission(preds, ver):
    with open("./submissions/submission_{}.csv".format(ver), 'w') as f:
        f.write("Id,Prediction\n")
        for i, pred in enumerate(preds):
            f.write("{},{}\n".format(i+1,pred))

In [20]:
submission(quiz_preds, "09")

# RandomForestClassifier Results

### n_estimators = 20
### min_samples_leaf = 1
### train_size = 0.8
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
            
train accuracy: 0.997585469454

train precision: 0.999009967824

train recall: 0.995493273543

test accuracy: 0.9456401766

test precision: 0.960895975702

test recall: 0.912812190064


### n_estimators = 20
### min_samples_leaf = 20
### train_size = 0.8
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=20, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
            
train accuracy: 0.908967270792

train precision: 0.926049974699

train recall: 0.861704035874

test accuracy: 0.904367707348

test precision: 0.919044395009

test recall: 0.856730682535


### n_estimators = 20
### min_samples_leaf = 10
### train_size = 0.8
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
            
train accuracy: 0.920763977175

train precision: 0.938138152533

train recall: 0.877600896861

test accuracy: 0.914853358562

test precision: 0.928838951311

test recall: 0.872058425751


### n_estimators = 20
### min_samples_leaf = 5
### train_size = 0.8
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
            
train accuracy: 0.932008790862

train precision: 0.950311746017

train recall: 0.891950672646

test accuracy: 0.924471775465

test precision: 0.938785270206

test recall: 0.88495176269


### n_estimators = 20
### min_samples_leaf = 3
### train_size = 0.8
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
            
train accuracy: 0.943746365885

train precision: 0.961155378486

train recall: 0.908744394619

test accuracy: 0.931173131504

test precision: 0.945976901785

test recall: 0.893607429447


### n_estimators = 20
### min_samples_leaf = 1
### train_size = 0.6
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
            
train accuracy: 0.997871278022

train precision: 0.999309661734

train recall: 0.995842431131

test accuracy: 0.943865181827

test precision: 0.959212643134

test recall: 0.910773654416


### n_estimators = 20
### min_samples_leaf = 1
### train_size = 0.4
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
train accuracy: 0.997772696811

train precision: 0.999059982095

train recall: 0.995894873053

test accuracy: 0.936415121612

test precision: 0.950600462626

test recall: 0.901442307692


### n_estimators = 20
### min_samples_leaf = 1
### train_size = 0.6
### std > 0.2
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
            
train accuracy: 0.99563743397

train precision: 0.996281747579

train recall: 0.99377860198

test accuracy: 0.936710357741

test precision: 0.942069349673

test recall: 0.911806990745


### n_estimators = 20
### min_samples_leaf = 1
### train_size = 0.8
### std > 0.2
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
            
train accuracy: 0.995289201628

train precision: 0.996087249831

train recall: 0.993183856502

test accuracy: 0.941934720908

test precision: 0.948182665424

test recall: 0.917320349833


### n_estimators = 50
### min_samples_leaf = 1
### train_size = 0.8
### std > 0.2
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
            
train accuracy: 0.996649222915

train precision: 0.996722930508

train recall: 0.995650224215

test accuracy: 0.94311731315

test precision: 0.948327137546

test recall: 0.920025245695

### n_estimators = 100
### min_samples_leaf = 1
### train_size = 0.8
### std > 0.2
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
            
train accuracy: 0.99690545881

train precision: 0.996591163938

train recall: 0.996367713004

test accuracy: 0.943353831599

test precision: 0.94744160178

test recall: 0.921558020016

### n_estimators = 100
### min_samples_leaf = 1
### train_size = 0.8
### std > 0.3
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
            
train accuracy: 0.993475839912

train precision: 0.992843682699

train recall: 0.99230941704

test accuracy: 0.93637653737

test precision: 0.932937414344

test recall: 0.920656388062


### n_estimators = 200
### min_samples_leaf = 1
### train_size = 0.8
### std > 0.2
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
            
train accuracy: 0.996915314037

train precision: 0.99654669806

train recall: 0.996434977578

test accuracy: 0.943511510564

test precision: 0.947378173059

test recall: 0.922008835993
