# Model 1

For the first model, let's pick a scikit-learn machine learning algorithm to model the data.

In [69]:
import numpy as np
import pandas as pd

import sklearn
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import ensemble

from sklearn import metrics
from sklearn import model_selection
from sklearn.externals import joblib

random_state = 42

# plots
import plotly
import plotly.io as pio
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

init_notebook_mode(connected=True)

plt_filepath = 'plots/Model-1/'

In [70]:
print('numpy version: {}'.format(np.__version__))
print('pandas version: {}'.format(pd.__version__))
print('sklearn version: {}'.format(sklearn.__version__))
print('plotly version: {}'.format(plotly.__version__))

numpy version: 1.16.2
pandas version: 0.24.1
sklearn version: 0.20.0
plotly version: 3.6.1


In [5]:
X = np.loadtxt("data/train.csv", delimiter=",", skiprows=1) # skip first row which is the column labels 
y = np.loadtxt("data/train_y.csv", delimiter=",", skiprows=0)

In [6]:
X.shape

(39232, 133)

In [7]:
y.shape

(39232,)

In [12]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=42)

## Baseline Models

In [8]:
classifiers = [ensemble.RandomForestClassifier(n_estimators=5),
               naive_bayes.GaussianNB(), 
               linear_model.LogisticRegression(solver='liblinear')]
clf_names = ['RandomForest', 'GaussianNB', 'Logistic Regression']
metric_names = ['roc_auc', 'f1', 'accuracy', 'precision', 'recall']

In [9]:
scv = model_selection.StratifiedKFold(n_splits=3)

scores_df = pd.DataFrame(index=metric_names, columns=clf_names)
clf_scores = []
for clf, name in zip(classifiers, clf_names):
    print('clf: ', clf)
    for metric in metric_names:
        score = model_selection.cross_val_score(clf, X, y, scoring=metric, cv=scv).mean()
        clf_scores.append(score)
        print('{} score: {}'.format(metric, score))
    scores_df[name] = clf_scores
    clf_scores = []

clf:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
roc_auc score: 0.826777453222995
f1 score: 0.5272594229907263
accuracy score: 0.8533085812862077
precision score: 0.7472880335986997
recall score: 0.409892953338818
clf:  GaussianNB(priors=None, var_smoothing=1e-09)
roc_auc score: 0.8326694053407918
f1 score: 0.35836153909972596
accuracy score: 0.8010554563104372
precision score: 0.5096717631102043
recall score: 0.27660134328902597
clf:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', r

In [10]:
scores_df

Unnamed: 0,RandomForest,GaussianNB,Logistic Regression
roc_auc,0.826777,0.832669,0.907312
f1,0.527259,0.358362,0.694764
accuracy,0.853309,0.801055,0.89167
precision,0.747288,0.509672,0.801161
recall,0.409893,0.276601,0.613442


Great, so without tuning, we see that both the Gaussian Naive Bayes and Logistic Regression models have a strong performance.  Maybe we can increase the performance of the Random Forest model with parameter tuning.

**Random Forest**



**Gaussian Naive Bayes**

**Logistic Regression**

## Paramater Tuning & Model Building

Let's perform grid search on the two models - Random Forest and Logistic Regression. Gaussian Naive Bayes is a simple model and there are not hyperparameters to tune

### Random Forest

In [14]:
rf = ensemble.RandomForestClassifier(n_estimators=5)
clf_rf = rf.fit(X_train, y_train)

In [15]:
y_pred_labels = clf_rf.predict(X_test)

In [16]:
y_pred_labels

array([0., 0., 0., ..., 0., 0., 0.])

In [17]:
y_pred = clf_rf.predict_proba(X_test)

In [18]:
y_pred

array([[1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       ...,
       [1. , 0. ],
       [1. , 0. ],
       [0.8, 0.2]])

In [19]:
X_test.shape[0]

9808

In [20]:
y_test

array([1., 0., 0., ..., 0., 0., 0.])

In [21]:
print("AUC")
auc_score_rf = metrics.roc_auc_score(y_test, y_pred_labels)
auc_score_rf

AUC


0.6828122491447818

In [22]:
print(metrics.classification_report(y_test, y_pred_labels))

              precision    recall  f1-score   support

         0.0       0.87      0.97      0.92      7901
         1.0       0.75      0.40      0.52      1907

   micro avg       0.86      0.86      0.86      9808
   macro avg       0.81      0.68      0.72      9808
weighted avg       0.85      0.86      0.84      9808



In [23]:
print("Accuracy")
accuracy = metrics.accuracy_score(y_test, y_pred_labels)
accuracy

Accuracy


0.8565456769983687

In [24]:
fpr_rf, tpr_rf, thresholds_rf = metrics.roc_curve(y_test, y_pred_labels)

In [25]:
# predicted y probabilties
trace0 = go.Scatter(
    x = fpr_rf,
    y = tpr_rf,
    mode = 'markers+lines',
    line=dict(color='darkorange', width=2),
    name = 'ROC Curve, (area = {})'.format(auc_score_rf)
)

trace1 = go.Scatter(
    x=[0, 1], y=[0, 1], 
    mode='lines', 
    line=dict(color='navy', width=2, dash='dash'),
    showlegend=False)

data = [trace0, trace1]

title = 'ROC Curve: Random Forest'
fig = go.Figure(data=data, layout = {'title':title})

iplot(fig)
pio.write_image(fig, plt_filepath+title+'.png')

### Naive Bayes

In [26]:
nb = naive_bayes.GaussianNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

In [27]:
y_train

array([0., 0., 0., ..., 0., 1., 0.])

In [28]:
print("AUC")
auc_score_nb = metrics.roc_auc_score(y_test, y_pred)
auc_score_nb

AUC


0.5881804106096107

In [29]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.84      0.93      0.88      7901
         1.0       0.46      0.25      0.32      1907

   micro avg       0.80      0.80      0.80      9808
   macro avg       0.65      0.59      0.60      9808
weighted avg       0.76      0.80      0.77      9808



In [30]:
print("Accuracy")
accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy

Accuracy


0.7966965742251223

In [31]:
fpr_nb, tpr_nb, thresholds_nb = metrics.roc_curve(y_test, y_pred)

In [32]:
# predicted y probabilties
trace0 = go.Scatter(
    x = fpr_nb,
    y = tpr_nb,
    mode = 'markers+lines',
    line=dict(color='darkorange', width=2),
    name = 'ROC Curve, (area = {})'.format(auc_score_nb)
)

trace1 = go.Scatter(
    x=[0, 1], y=[0, 1], 
    mode='lines', 
    line=dict(color='navy', width=2, dash='dash'),
    showlegend=False)

data = [trace0, trace1]

title = 'ROC Curve: Naive Bayes'
fig = go.Figure(data=data, layout = {'title':title})

iplot(fig)
pio.write_image(fig, plt_filepath+title+'.png')

### Logistic Regression

In [33]:
parameters = {
              'penalty':['l1', 'l2'],
              'C': np.logspace(0, 4, 10),
             }

lr = linear_model.LogisticRegression(solver='liblinear')

clf = model_selection.GridSearchCV(lr, parameters, cv=5)
clf_lr = clf.fit(X_train, y_train)


Liblinear failed to converge, increase the number of iterations.


Liblinear failed to converge, increase the number of iterations.



In [34]:
# Best hyperparameters
print('Best Penalty:', clf_lr.best_estimator_.get_params()['penalty'])
print('Best C:', clf_lr.best_estimator_.get_params()['C'])

Best Penalty: l2
Best C: 7.742636826811269


In [35]:
y_pred = clf_lr.predict(X_test)

In [36]:
print("AUC")
auc_score_lr = metrics.roc_auc_score(y_test, y_pred)
auc_score_lr

AUC


0.7908537063305761

In [37]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.91      0.96      0.94      7901
         1.0       0.78      0.62      0.69      1907

   micro avg       0.89      0.89      0.89      9808
   macro avg       0.85      0.79      0.82      9808
weighted avg       0.89      0.89      0.89      9808



In [38]:
print("Accuracy")
accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy

Accuracy


0.8934543230016313

In [39]:
fpr_lr, tpr_lr, thresholds_lr = metrics.roc_curve(y_test, y_pred)

In [40]:
y_test

array([1., 0., 0., ..., 0., 0., 0.])

In [41]:
y_pred

array([1., 0., 0., ..., 0., 0., 0.])

In [42]:
# predicted y probabilties
trace0 = go.Scatter(
    x = fpr_lr,
    y = tpr_lr,
    mode = 'markers+lines',
    line=dict(color='darkorange', width=2),
    name = 'ROC Curve, (area = {})'.format(auc_score_lr)
)

trace1 = go.Scatter(
    x=[0, 1], y=[0, 1], 
    mode='lines', 
    line=dict(color='navy', width=2, dash='dash'),
    showlegend=False)

data = [trace0, trace1]

title = 'ROC Curve: Naive Bayes'
fig = go.Figure(data=data, layout = {'title':title})

iplot(fig)
pio.write_image(fig, plt_filepath+title+'.png')

In [43]:
# predicted y probabilties
trace_lr = go.Scatter(
    x = fpr_lr,
    y = tpr_lr,
    mode = 'markers+lines',
    #line=dict(color='darkorange', width=2),
    name = 'Logistic Regression ROC Curve, (area = {})'.format(auc_score_lr)
)

trace_nb = go.Scatter(
    x = fpr_nb,
    y = tpr_nb,
    mode = 'markers+lines',
    #line=dict(color='darkorange', width=2),
    name = 'Naive Bayes ROC Curve, (area = {})'.format(auc_score_nb)
)

trace_rf = go.Scatter(
    x = fpr_rf,
    y = tpr_rf,
    mode = 'markers+lines',
    #line=dict(color='darkorange', width=2),
    name = 'Random Forest ROC Curve, (area = {})'.format(auc_score_rf)
)

trace0 = go.Scatter(
    x=[0, 1], y=[0, 1], 
    mode='lines', 
    line=dict(color='navy', width=2, dash='dash'),
    showlegend=False)

data = [trace0, trace_lr, trace_nb, trace_rf]

title = 'ROC Curves'
fig = go.Figure(data=data, layout = {'title':title})

iplot(fig)
pio.write_image(fig, plt_filepath+title+'.png')

### Save & Load Model

In [45]:
clf_lr

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([1.00000e+00, 2.78256e+00, 7.74264e+00, 2.15443e+01, 5.99484e+01,
       1.66810e+02, 4.64159e+02, 1.29155e+03, 3.59381e+03, 1.00000e+04])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [53]:
result = clf_lr.score(X_test, y_test)
print(result)

0.8934543230016313


In [46]:
clf_lr.predict(X_test)

array([1., 0., 0., ..., 0., 0., 0.])

In [49]:
# save model
filename = 'models/model-1.sav'
joblib.dump(clf_lr, filename)

['models/model-1.sav']

In [50]:
# load the model from disk
loaded_model = joblib.load(filename)
loaded_model

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([1.00000e+00, 2.78256e+00, 7.74264e+00, 2.15443e+01, 5.99484e+01,
       1.66810e+02, 4.64159e+02, 1.29155e+03, 3.59381e+03, 1.00000e+04])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [52]:
# re-run test validation
result = loaded_model.score(X_test, y_test)
print(result)

0.8934543230016313


## Predict Values on Test Dataset

Since Logistic Regression has the best AUC (.79), let's use that for the first model predictions.

In [54]:
test = pd.read_csv('data/test.csv')

In [61]:
y = loaded_model.predict_proba(test)
y_classes = loaded_model.predict(test)

In [62]:
y_pred = y[:,1]
y_pred

array([1.97008736e-01, 1.73090431e-04, 4.41005730e-02, ...,
       1.93809574e-02, 1.64909771e-01, 1.84067332e-02])

In [65]:
y_pred_labels = y_classes
y_pred_labels

array([0., 0., 0., ..., 0., 0., 0.])

In [66]:
# Create a plot to show the predicted y probabilties for the real test set
num = 500

# predicted y probabilties
trace0 = go.Scatter(
    x = [x for x in range(num)],
    y = y_pred[:num].flatten(),
    mode = 'markers',
    name = 'predicted y probabilties'
)

# predicted y labels
trace1 = go.Scatter(
    x = [x for x in range(num)],
    y = y_pred_labels[:num].flatten(),
    mode = 'markers',
    name = 'predicted y labels'
)

data = [trace0, trace1]

title = 'Y: Predicted Probabilties, Predicted Labels for the Test Dataset'
fig = go.Figure(data=data, layout = {'title':title})

iplot(fig)
pio.write_image(fig, plt_filepath+title+'.png')

In [68]:
np.savetxt('data/results1.csv', y_pred.flatten(), delimiter=',')