# Model 1

For the first model, let's pick a scikit-learn machine learning algorithm to model the data.

In [4]:
import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import ensemble

from sklearn import metrics
from sklearn import model_selection

random_state = 42

# plots
import plotly.io as pio
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

init_notebook_mode(connected=True)

plt_filepath = 'plots/Model-1/'

In [5]:
X = np.loadtxt("data/train.csv", delimiter=",", skiprows=1) # skip first row which is the column labels 
y = np.loadtxt("data/train_y.csv", delimiter=",", skiprows=0)

In [6]:
X.shape

(39232, 133)

In [7]:
y.shape

(39232,)

In [12]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=42)

## Baseline Models

In [8]:
classifiers = [ensemble.RandomForestClassifier(n_estimators=5),
               naive_bayes.GaussianNB(), 
               linear_model.LogisticRegression(solver='liblinear')]
clf_names = ['RandomForest', 'GaussianNB', 'Logistic Regression']
metric_names = ['roc_auc', 'f1', 'accuracy', 'precision', 'recall']

In [9]:
scv = model_selection.StratifiedKFold(n_splits=3)

scores_df = pd.DataFrame(index=metric_names, columns=clf_names)
clf_scores = []
for clf, name in zip(classifiers, clf_names):
    print('clf: ', clf)
    for metric in metric_names:
        score = model_selection.cross_val_score(clf, X, y, scoring=metric, cv=scv).mean()
        clf_scores.append(score)
        print('{} score: {}'.format(metric, score))
    scores_df[name] = clf_scores
    clf_scores = []

clf:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
roc_auc score: 0.826777453222995
f1 score: 0.5272594229907263
accuracy score: 0.8533085812862077
precision score: 0.7472880335986997
recall score: 0.409892953338818
clf:  GaussianNB(priors=None, var_smoothing=1e-09)
roc_auc score: 0.8326694053407918
f1 score: 0.35836153909972596
accuracy score: 0.8010554563104372
precision score: 0.5096717631102043
recall score: 0.27660134328902597
clf:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', r

In [10]:
scores_df

Unnamed: 0,RandomForest,GaussianNB,Logistic Regression
roc_auc,0.826777,0.832669,0.907312
f1,0.527259,0.358362,0.694764
accuracy,0.853309,0.801055,0.89167
precision,0.747288,0.509672,0.801161
recall,0.409893,0.276601,0.613442


Great, so without tuning, we see that both the Gaussian Naive Bayes and Logistic Regression models have a strong performance.  Maybe we can increase the performance of the Random Forest model with parameter tuning.

**Random Forest**



**Gaussian Naive Bayes**

**Logistic Regression**

## Paramater Tuning & Model Building

Let's perform grid search on the two models - Random Forest and Logistic Regression. Gaussian Naive Bayes is a simple model and there are not hyperparameters to tune

### Random Forest

In [14]:
rf = ensemble.RandomForestClassifier(n_estimators=5)
clf_rf = rf.fit(X_train, y_train)

In [15]:
y_pred_labels = clf_rf.predict(X_test)

In [16]:
y_pred_labels

array([0., 0., 0., ..., 0., 0., 0.])

In [17]:
y_pred = clf_rf.predict_proba(X_test)

In [18]:
y_pred

array([[1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       ...,
       [1. , 0. ],
       [1. , 0. ],
       [0.8, 0.2]])

In [19]:
X_test.shape[0]

9808

In [20]:
y_test

array([1., 0., 0., ..., 0., 0., 0.])

In [21]:
print("AUC")
auc_score_rf = metrics.roc_auc_score(y_test, y_pred_labels)
auc_score_rf

AUC


0.6828122491447818

In [22]:
print(metrics.classification_report(y_test, y_pred_labels))

              precision    recall  f1-score   support

         0.0       0.87      0.97      0.92      7901
         1.0       0.75      0.40      0.52      1907

   micro avg       0.86      0.86      0.86      9808
   macro avg       0.81      0.68      0.72      9808
weighted avg       0.85      0.86      0.84      9808



In [23]:
print("Accuracy")
accuracy = metrics.accuracy_score(y_test, y_pred_labels)
accuracy

Accuracy


0.8565456769983687

In [24]:
fpr_rf, tpr_rf, thresholds_rf = metrics.roc_curve(y_test, y_pred_labels)

In [25]:
# predicted y probabilties
trace0 = go.Scatter(
    x = fpr_rf,
    y = tpr_rf,
    mode = 'markers+lines',
    line=dict(color='darkorange', width=2),
    name = 'ROC Curve, (area = {})'.format(auc_score_rf)
)

trace1 = go.Scatter(
    x=[0, 1], y=[0, 1], 
    mode='lines', 
    line=dict(color='navy', width=2, dash='dash'),
    showlegend=False)

data = [trace0, trace1]

title = 'ROC Curve: Random Forest'
fig = go.Figure(data=data, layout = {'title':title})

iplot(fig)
pio.write_image(fig, plt_filepath+title+'.png')

### Naive Bayes

In [26]:
nb = naive_bayes.GaussianNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

In [27]:
y_train

array([0., 0., 0., ..., 0., 1., 0.])

In [28]:
print("AUC")
auc_score_nb = metrics.roc_auc_score(y_test, y_pred)
auc_score_nb

AUC


0.5881804106096107

In [29]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.84      0.93      0.88      7901
         1.0       0.46      0.25      0.32      1907

   micro avg       0.80      0.80      0.80      9808
   macro avg       0.65      0.59      0.60      9808
weighted avg       0.76      0.80      0.77      9808



In [30]:
print("Accuracy")
accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy

Accuracy


0.7966965742251223

In [31]:
fpr_nb, tpr_nb, thresholds_nb = metrics.roc_curve(y_test, y_pred)

In [32]:
# predicted y probabilties
trace0 = go.Scatter(
    x = fpr_nb,
    y = tpr_nb,
    mode = 'markers+lines',
    line=dict(color='darkorange', width=2),
    name = 'ROC Curve, (area = {})'.format(auc_score_nb)
)

trace1 = go.Scatter(
    x=[0, 1], y=[0, 1], 
    mode='lines', 
    line=dict(color='navy', width=2, dash='dash'),
    showlegend=False)

data = [trace0, trace1]

title = 'ROC Curve: Naive Bayes'
fig = go.Figure(data=data, layout = {'title':title})

iplot(fig)
pio.write_image(fig, plt_filepath+title+'.png')

### Logistic Regression

In [None]:
parameters = {
              'penalty':['l1', 'l2'],
              'C': np.logspace(0, 4, 10),
             }

lr = linear_model.LogisticRegression(solver='liblinear')

clf = model_selection.GridSearchCV(lr, parameters, cv=5)
clf_lr = clf.fit(X_train, y_train)

In [31]:
# Best hyperparameters
print('Best Penalty:', clf_lr.best_estimator_.get_params()['penalty'])
print('Best C:', clf_lr.best_estimator_.get_params()['C'])

Best Penalty: l2
Best C: 7.742636826811269


In [32]:
y_pred = clf_lr.predict(X_test)

In [33]:
print("AUC")
auc_score_lr = metrics.roc_auc_score(y_test, y_pred)
auc_score_lr

AUC


0.7908537063305761

In [34]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.91      0.96      0.94      7901
         1.0       0.78      0.62      0.69      1907

   micro avg       0.89      0.89      0.89      9808
   macro avg       0.85      0.79      0.82      9808
weighted avg       0.89      0.89      0.89      9808



In [35]:
print("Accuracy")
accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy

Accuracy


0.8934543230016313

In [36]:
fpr_lr, tpr_lr, thresholds_lr = metrics.roc_curve(y_test, y_pred)

In [37]:
y_test

array([1., 0., 0., ..., 0., 0., 0.])

In [38]:
y_pred

array([1., 0., 0., ..., 0., 0., 0.])

In [39]:
# predicted y probabilties
trace0 = go.Scatter(
    x = fpr_lr,
    y = tpr_lr,
    mode = 'markers+lines',
    line=dict(color='darkorange', width=2),
    name = 'ROC Curve, (area = {})'.format(auc_score_lr)
)

trace1 = go.Scatter(
    x=[0, 1], y=[0, 1], 
    mode='lines', 
    line=dict(color='navy', width=2, dash='dash'),
    showlegend=False)

data = [trace0, trace1]

fig = go.Figure(data=data, layout = {'title':'ROC Curve: Naive Bayes'})
iplot(fig)

In [40]:
# predicted y probabilties
trace_lr = go.Scatter(
    x = fpr_lr,
    y = tpr_lr,
    mode = 'markers+lines',
    #line=dict(color='darkorange', width=2),
    name = 'Logistic Regression ROC Curve, (area = {})'.format(auc_score_lr)
)

trace_nb = go.Scatter(
    x = fpr_nb,
    y = tpr_nb,
    mode = 'markers+lines',
    #line=dict(color='darkorange', width=2),
    name = 'Naive Bayes ROC Curve, (area = {})'.format(auc_score_nb)
)

trace_rf = go.Scatter(
    x = fpr_rf,
    y = tpr_rf,
    mode = 'markers+lines',
    #line=dict(color='darkorange', width=2),
    name = 'Random Forest ROC Curve, (area = {})'.format(auc_score_rf)
)

trace0 = go.Scatter(
    x=[0, 1], y=[0, 1], 
    mode='lines', 
    line=dict(color='navy', width=2, dash='dash'),
    showlegend=False)

data = [trace0, trace_lr, trace_nb, trace_rf]

fig = go.Figure(data=data, layout = {'title':'ROC Curves'})
iplot(fig)

## Predict Values on Test Dataset

Since Logistic Regression has the best AUC (.79), let's use that for the first model predictions.

In [179]:
test = pd.read_csv('data/test.csv')

In [None]:
y = clf.predict_proba(test)

In [None]:
y

In [None]:
y.to_csv('data/results1.csv')

Logistic Regression

In [7]:
logreg = linear_model.LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, y_train) * 100, 2)
acc_log



88.98

Let's check the correlation between the labels and the inputs...

In [8]:
coeff_df = pd.DataFrame(X.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False).head()

Unnamed: 0,Feature,Correlation
62,x64,0.243347
75,x78,0.103738
9,x9,0.09118
50,x52,0.084196
94,x98,0.075135


In [14]:
logreg.classes_

array([0, 1])

array([[0.99747941, 0.00252059],
       [0.98871857, 0.01128143],
       [0.99402003, 0.00597997],
       ...,
       [0.92738135, 0.07261865],
       [0.90761477, 0.09238523],
       [0.87770741, 0.12229259]])

In [16]:
pred_class_probabilties = logreg.predict_proba(X_test)
X_test_class_predictions = pd.DataFrame(pred_class_probabilties, columns=logreg.classes_)
X_test_class_predictions.head()

Unnamed: 0,0,1
0,0.997479,0.002521
1,0.988719,0.011281
2,0.99402,0.00598
3,0.873889,0.126111
4,0.25721,0.74279


In [17]:
X_test_class_predictions.describe()

Unnamed: 0,0,1
count,9800.0,9800.0
mean,0.793023,0.206977
std,0.272688,0.272688
min,0.000689,8.4e-05
25%,0.720789,0.022119
50%,0.922271,0.077729
75%,0.977881,0.279211
max,0.999916,0.999311


In [53]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
cv = KFold(n_splits=3)
scv = StratifiedKFold(n_splits=3)
# Examine the positive label in train and test folds
print('KFold not Stratified')
for train_idx, test_idx in cv.split(y):
    print(y[train_idx].sum(), y[test_idx].sum())

print('StratifiedKFold')
for train_idx, test_idx in scv.split(X, y):
    print(y[train_idx].sum(), y[test_idx].sum())

KFold not Stratified
5314 2567
5243 2638
5205 2676
StratifiedKFold
5254 2627
5254 2627
5254 2627


In [9]:
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score 
from sklearn.naive_bayes import GaussianNB

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import linear_model
# clf = linear_model.LogisticRegression()
# clf = RandomForestClassifier(n_estimators=5, random_state=RNG)
# clf = DecisionTreeClassifier(random_state=RNG)
# clf = SVC(probability=True, random_state=RNG)

In [10]:
classifiers = [DecisionTreeClassifier(random_state=random_state), 
               RandomForestClassifier(n_estimators=5, random_state=random_state), 
               GaussianNB(), 
               #SVC(probability=True, random_state=random_state), 
               linear_model.LogisticRegression()
              ]
clf_names = ['DecisionTree', 'RandomForest', 'GaussianNB', 'Logistic Regression']
metrics = ['f1', 'roc_auc', 'average_precision', 'accuracy', 'precision', 'recall']

In [14]:
clf_scores

[0.48012482283943386,
 0.667580827909605,
 0.3344507575794761,
 0.7263172877774337,
 0.44935844157314025,
 0.5693439918792031]

In [None]:
#cv = KFold(n_splits=3)
scv = StratifiedKFold(n_splits=3)

scores_df = pd.DataFrame(index=metrics, columns=clf_names)
clf_scores = []
for clf, name in zip(classifiers, clf_names):
    print('clf: ', clf)
    for metric in metrics:
        score = cross_val_score(clf, X, y, scoring=metric, cv=scv).mean()
        clf_scores.append(score)
        print('{} score: {}'.format(metric, score))
    scores_df[name] = clf_scores
    clf_scores = []

clf:  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
f1 score: 0.48012482283943386
roc_auc score: 0.667580827909605
average_precision score: 0.3344507575794761
accuracy score: 0.7263172877774337
precision score: 0.44935844157314025
recall score: 0.5693439918792031
clf:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
f1 score: 0.5017906

  'precision', 'predicted', average, warn_for)


In [None]:
scores_df

Let's use the Stratified K fold technique to evenly distribute the label samples, ensuring that the training and validation sets have a similiar ratio of labels.

In [49]:
metric_names = ['f1', 'roc_auc', 'average_precision', 'accuracy', 'precision', 'recall']
scores_df = pd.DataFrame(index=metric_names, columns=['Random-CV', 'Stratified-CV']) # to store the scores
cv = KFold(n_splits=3)
scv = StratifiedKFold(n_splits=3)
clf = GaussianNB()
for metric in metric_names:
    score1 = cross_val_score(clf, X, y, scoring=metric, cv=cv).mean()
    score2 = cross_val_score(clf, X, y, scoring=metric, cv=scv).mean()
    scores_df.loc[metric] = [score1, score2]

In [50]:
scores_df

Unnamed: 0,Random-CV,Stratified-CV
f1,0.605448,0.603506
roc_auc,0.888859,0.88885
average_precision,0.719777,0.719803
accuracy,0.868287,0.868007
precision,0.760866,0.761321
recall,0.502999,0.500825


In [62]:
next(scv.split(X, y))

(array([12991, 12992, 12993, ..., 39196, 39197, 39198]),
 array([    0,     1,     2, ..., 13310, 13327, 13328]))

In [66]:
from sklearn import metrics
train_idx, test_idx = next( scv.split(X, y) )

clf = RandomForestClassifier(n_estimators=10, random_state=42)
clf.fit(X[train_idx], y[train_idx])

print(X[train_idx].shape, X[test_idx].shape)
y_probas = clf.predict_proba(X[test_idx])[:,1]
y_preds = clf.predict(X[test_idx])

## confusion matrix
print('--confusion matrix--')
cm = metrics.confusion_matrix(y[test_idx], y_preds, labels=[0,1])
print(cm)
print('--classification report --')
print(metrics.classification_report(y[test_idx], y_preds, labels=[0,1]))

KeyError: '[12991 12992 12993 ... 39196 39197 39198] not in index'

In [74]:
X_numpy = X.values
y_numpy = y.values

In [69]:
# Run classifier with cross-validation and plot ROC curves
cv = StratifiedKFold(n_splits=6)
classifier = svm.SVC(kernel='linear', probability=True,
                     random_state=random_state)

In [70]:
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

i = 0

In [75]:
for train, test in cv.split(X, y):
    probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
    # Compute ROC curve and area the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs.append(interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)

KeyError: '[ 6493  6494  6495 ... 39196 39197 39198] not in index'

In [None]:
clf_names = ['']

In [44]:
from sklearn.feature_selection import chi2
X_new = feature_selection.SelectKBest(chi2, k=2).fit_transform(X, y)

ValueError: Input X must be non-negative.

In [37]:
X_new.h

NameError: name 'X_new' is not defined

In [19]:

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=42)

## Baseline Model
Let's get the performance of a generic machine learning model

In [20]:
sgd_clf = SGDClassifier(max_iter=5, tol=-np.infty, random_state=42)
sgd_clf.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=42, shuffle=True, tol=-inf,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [21]:
y_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)

In [22]:
"AUC Score: {}".format( roc_auc_score(y_train, y_pred) )

'AUC Score: 0.5091373522260393'

Since AUC scores range from .5 to 1, and .5 is the lower end, this is a pretty bad classifier to start with :/

In [23]:
confusion_matrix(y_train, y_pred)

array([[23233,   236],
       [ 5762,   168]])

In [24]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.99      0.89     23469
           1       0.42      0.03      0.05      5930

   micro avg       0.80      0.80      0.80     29399
   macro avg       0.61      0.51      0.47     29399
weighted avg       0.72      0.80      0.72     29399



In [25]:
svm_clf = svm.SVC(gamma="auto")
svm_clf.fit(X_train, y_train)
# y_pred = svm_clf.predict(X_test)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [28]:
y_pred = cross_val_predict(svm_clf, X_train, y_train, cv=3)

In [29]:
"AUC Score: {}".format( roc_auc_score(y_train, y_pred) )

'AUC Score: 0.5'

In [30]:
confusion_matrix(y_train, y_pred)

array([[23469,     0],
       [ 5930,     0]])

In [None]:


forest_clf = ensemble.RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = model_selection.cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()