# Predict

In [6]:
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import ensemble

from sklearn import model_selection

In [2]:
random_state = 42

In [3]:
X = pd.read_csv('data/train-cleaned.csv')
y = X.pop('y')

In [4]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=random_state)

In [21]:
classifiers = [ensemble.RandomForestClassifier(n_estimators=5),
               naive_bayes.GaussianNB(), 
               linear_model.LogisticRegression(solver='sag', max_iter=100000)]
clf_names = ['RandomForest', 'GaussianNB', 'Logistic Regression']
metrics = ['f1', 'roc_auc', 'average_precision', 'accuracy', 'precision', 'recall']

In [22]:
scv = model_selection.StratifiedKFold(n_splits=3)

scores_df = pd.DataFrame(index=metrics, columns=clf_names)
clf_scores = []
for clf, name in zip(classifiers, clf_names):
    print('clf: ', clf)
    for metric in metrics:
        score = model_selection.cross_val_score(clf, X, y, scoring=metric, cv=scv).mean()
        clf_scores.append(score)
        print('{} score: {}'.format(metric, score))
    scores_df[name] = clf_scores
    clf_scores = []

clf:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
f1 score: 0.5160562534501788
roc_auc score: 0.8118602143957748
average_precision score: 0.531339120005106
accuracy score: 0.7720328751254494
precision score: 0.6317616596786072
recall score: 0.4350970688998858
clf:  GaussianNB(priors=None, var_smoothing=1e-09)
f1 score: 0.6035064018091153
roc_auc score: 0.8888502635640161
average_precision score: 0.719803024155512
accuracy score: 0.8680069564821351
precision score: 0.7613209525232875
recall score: 0.5008247684304022
clf:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          interce

In [24]:
scores_df

Unnamed: 0,RandomForest,GaussianNB,Logistic Regression
f1,0.516056,0.603506,0.623528
roc_auc,0.81186,0.88885,0.890687
average_precision,0.531339,0.719803,0.7533
accuracy,0.772033,0.868007,0.864742
precision,0.631762,0.761321,0.746577
recall,0.435097,0.500825,0.582921


## Clean Test Dataset

In [None]:
test = pd.read_csv('data/exercise_01_test.csv')