# Modeling

In [1]:
import pickle
import seaborn as sns
from sklearn.cross_validation import train_test_split
import numpy as np
import pandas as pd
import os
import seaborn as sns

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."

def image_path(fig_id):
    return os.path.join(PROJECT_ROOT_DIR, "images", fig_id)

def save_fig(fig_id, tight_layout=True):
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(image_path(fig_id) + ".png", format='png', dpi=300)




In [2]:
df_training = pickle.load(open('my_df_training_onehot.pickle', 'rb'))
y = pickle.load(open('my_y.pickle', 'rb'))
X_train, X_test, y_train, y_test = train_test_split(df_training, y)

In [3]:
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')


def tuned_clf(estimator,k,Xtrain,ytrain,pram_dist,ncv,njobs=1,scoring_method='None',**kwargs):
    """K: the number of features in SelectKBest method
       Xtrain: training features
       ytrain: traning labels 
       param dist: distribution parametes that are used in RandomizedSearchCV
       ncv : number of cross-validation folds
       This function will return the trained estimator"""
    if kwargs:
        clf = estimator(kwargs)
    else:
        clf = estimator
    
    if k == Xtrain.shape[1]:
        pipe = make_pipeline(clf)
    else:
        pipe = make_pipeline(SelectKBest(k=k),clf)
    
    grid_clf = RandomizedSearchCV(pipe,param_distributions= param_dist,cv=ncv,n_jobs=njobs,scoring=scoring_method)
    grid_clf.fit(Xtrain,ytrain) 
                                  
    return  grid_clf

def tuned_estimators(estimator,Xtrain,ytrain,Xtest,ytest,param_dist,n_features_list,ncv=5,njobs=1,scoring_method='None',
                   verbose=False,**kwargs):
    '''A kbest and a randomforestclassifier are embeded in a pipeline and a randomizedsearchCV tunes the
     hyperparameters'''
    models = defaultdict(str)
    accuracy_scores = []
    f1_scores = []
    roc_auc_scores = []
    nfeatures = []
    for k in n_features_list:
        nfeatures.append(k)
        model_name = 'clf_k'+ str(k)
        clf = tuned_clf(estimator,k,Xtrain,ytrain,param_dist,ncv,njobs,scoring_method,**kwargs)
        models[model_name] = clf.best_estimator_
        ypred = models[model_name].predict(Xtest)
        accuracy_scores.append (accuracy_score(ypred,ytest))
        f1_scores.append(f1_score(ypred,ytest))
        roc_auc_scores.append(roc_auc_score(ypred,ytest))
        if verbose:
            print('%s best features: accuracy=%.4f, f1=%.4f, roc_auc=%.4f' % 
                  (k,accuracy_scores[-1],f1_scores[-1],roc_auc_scores[-1]))
    plt.title('Effect of feature elimination on accuracy, f1, roc_auc scores')
    plt.xlabel("K best features")
    plt.ylabel("Score")
    plt.xticks(nfeatures)
    plt.grid(b=True)
    plt.plot(nfeatures,accuracy_scores,'o-', color="r",label="accuracy score")
    plt.plot(nfeatures,f1_scores,'o-', color="b",label="f1 score")
    plt.plot(nfeatures,roc_auc_scores,'o-', color="g",label="roc_auc score")
    plt.legend(loc="best")
    return models



In [4]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

def Summary_Results(estimator,X_test,y_test):
    ypred = estimator.predict(X_test)
    print('The accuracy is: %.2f3 \n' % accuracy_score(ypred,y_test))
    print('Confusion_matrix:')
    cm = confusion_matrix(y_test, ypred)
    print('\t\t pridicted values')
    print('\t\t 0 \t 1')
    print('actual 0: ','\t',cm[0,0],'\t',cm[0,1])
    print('values 1: ','\t',cm[1,0],'\t',cm[1,1])
    print('-------------------------------------------------------')
    print('Classification_report: \n')
    print(classification_report(y_test,ypred,target_names=["class 0","class 1"]))

In [5]:
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve

def plot_roc_curve(estimator,Xtest,ytest,figsize=(8,5)):
    plt.figure(figsize=figsize)
    fpr_rf, tpr_rf, threshold = roc_curve(ytest,estimator.predict_proba(Xtest)[:,1])
    plt.plot(fpr_rf, tpr_rf)
    plt.xlabel('False positive rate',fontsize=16)
    plt.ylabel('True positive rate',fontsize=16)
    plt.title('ROC Curve',fontsize=18)


In [6]:
from sklearn.learning_curve import learning_curve
sns.set_context('notebook',font_scale=1)

def plot_learning_curve(estimator, X, y, ylim=(0, 1.1), cv=5,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5),
                        scoring=None):
    plt.title("Learning curves for %s" % type(estimator).__name__)
    plt.ylim(*ylim); plt.grid()
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, validation_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        scoring=scoring)
    train_scores_mean = np.mean(train_scores, axis=1)
    validation_scores_mean = np.mean(validation_scores, axis=1)
    
    plt.grid(b=True)
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, validation_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    print("Best validation score: {:.4f}".format(validation_scores_mean[-1]))

In [7]:
# models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from imblearn.metrics import classification_report_imbalanced
from collections import defaultdict

estimators = {'RandomForest':RandomForestClassifier(),'AdaBoost': AdaBoostClassifier(), 'Extra Tree': ExtraTreesClassifier()}
clfs = defaultdict(str)

for name,clf in estimators.items():
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(name + '_clf')
    print(classification_report_imbalanced(y_test, y_pred))
    clfs[name+'_clf'] = clf

Extra Tree_clf
                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      0.99      0.05      0.93      0.54      0.31     16015
          1       0.33      0.05      0.99      0.09      0.54      0.28      2231

avg / total       0.81      0.87      0.16      0.83      0.54      0.31     18246

RandomForest_clf
                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      0.99      0.03      0.93      0.56      0.34     16015
          1       0.36      0.03      0.99      0.06      0.56      0.30      2231

avg / total       0.82      0.87      0.15      0.83      0.56      0.33     18246

AdaBoost_clf
                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      1.00      0.01      0.94      0.74      0.56     16015
          1       0.62      0.01      1.00      0.03      0.74      0.53      2231

avg / total       0.85      0.88  

In [14]:
# oversample
from imblearn.over_sampling import SMOTE
sme = SMOTE()
X_resampled, y_resampled = sme.fit_sample(X_train, y_train)
for name,clf in estimators.items():
    clf.fit(X_resampled,y_resampled)
    y_pred = clf.predict(X_test)
    print(name + '_clf')
    print(classification_report_imbalanced(y_test, y_pred))
    clfs[name+'_SMOTEENN_clf'] = clf

Extra Tree_clf
                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      0.97      0.09      0.93      0.52      0.28     16015
          1       0.30      0.09      0.97      0.14      0.52      0.25      2231

avg / total       0.81      0.86      0.20      0.83      0.52      0.28     18246

RandomForest_clf
                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.97      0.11      0.93      0.54      0.31     16015
          1       0.33      0.11      0.97      0.16      0.54      0.27      2231

avg / total       0.82      0.86      0.21      0.83      0.54      0.30     18246

AdaBoost_clf
                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.91      0.21      0.90      0.47      0.23     16015
          1       0.24      0.21      0.91      0.23      0.47      0.20      2231

avg / total       0.81      0.82  

In [8]:
# combine
from imblearn.combine import SMOTEENN
sme = SMOTEENN()
X_resampled, y_resampled = sme.fit_sample(X_train, y_train)
for name,clf in estimators.items():
    clf.fit(X_resampled,y_resampled)
    y_pred = clf.predict(X_test)
    print(name + '_clf')
    print(classification_report_imbalanced(y_test, y_pred))
    clfs[name+'_SMOTEENN_clf'] = clf

Extra Tree_clf
                   pre       rec       spe        f1       geo       iba       sup

          0       0.91      0.80      0.43      0.85      0.46      0.22     16015
          1       0.23      0.43      0.80      0.30      0.46      0.20      2231

avg / total       0.83      0.75      0.48      0.78      0.46      0.22     18246

RandomForest_clf
                   pre       rec       spe        f1       geo       iba       sup

          0       0.91      0.79      0.44      0.85      0.46      0.22     16015
          1       0.23      0.44      0.79      0.30      0.46      0.19      2231

avg / total       0.83      0.75      0.48      0.78      0.46      0.22     18246

AdaBoost_clf
                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.62      0.61      0.74      0.41      0.18     16015
          1       0.18      0.61      0.62      0.28      0.41      0.16      2231

avg / total       0.83      0.62  

In [10]:
# combine
from imblearn.combine import SMOTETomek
sme = SMOTETomek()
X_resampled, y_resampled = sme.fit_sample(X_train, y_train)
for name,clf in estimators.items():
    clf.fit(X_resampled,y_resampled)
    y_pred = clf.predict(X_test)
    print(name + '_clf')
    print(classification_report_imbalanced(y_test, y_pred))
    clfs[name+'_SMOTETomek_clf'] = clf

Extra Tree_clf
                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      0.97      0.08      0.93      0.51      0.27     16015
          1       0.29      0.08      0.97      0.13      0.51      0.24      2231

avg / total       0.81      0.86      0.19      0.83      0.51      0.27     18246

RandomForest_clf
                   pre       rec       spe        f1       geo       iba       sup

          0       0.88      0.97      0.08      0.93      0.50      0.26     16015
          1       0.28      0.08      0.97      0.12      0.50      0.23      2231

avg / total       0.81      0.86      0.19      0.83      0.50      0.26     18246

AdaBoost_clf
                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.91      0.23      0.90      0.48      0.24     16015
          1       0.25      0.23      0.91      0.24      0.48      0.21      2231

avg / total       0.82      0.82  

In [11]:
for name,clf in clfs.items():
    print(name+':')
    Summary_Results(clfs[name],X_test.as_matrix(),y_test.as_matrix())
    print('================================================================')

RandomForest_SMOTEENN_clf:
The accuracy is: 0.863 

Confusion_matrix:
		 pridicted values
		 0 	 1
actual 0:  	 15560 	 455
values 1:  	 2056 	 175
-------------------------------------------------------
Classification_report: 

             precision    recall  f1-score   support

    class 0       0.88      0.97      0.93     16015
    class 1       0.28      0.08      0.12      2231

avg / total       0.81      0.86      0.83     18246

Extra Tree_clf:
The accuracy is: 0.863 

Confusion_matrix:
		 pridicted values
		 0 	 1
actual 0:  	 15555 	 460
values 1:  	 2043 	 188
-------------------------------------------------------
Classification_report: 

             precision    recall  f1-score   support

    class 0       0.88      0.97      0.93     16015
    class 1       0.29      0.08      0.13      2231

avg / total       0.81      0.86      0.83     18246

AdaBoost_SMOTETomek_clf:
The accuracy is: 0.823 

Confusion_matrix:
		 pridicted values
		 0 	 1
actual 0:  	 14533 	 1482


## SVM

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from imblearn.combine import SMOTEENN
sme = SMOTEENN()
X_resampled, y_resampled = sme.fit_sample(X_train, y_train)

clf = Pipeline([
        ("kpca", KernelPCA(n_components=2)),
        ("log_reg", LogisticRegression())
    ])

param_grid = [{
        "kpca__gamma": np.linspace(0.03, 0.05, 10),
        "kpca__kernel": ["rbf", "sigmoid"]
    }]

grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X_resampled, y_resampled)

In [None]:
print(grid_search.best_params_)

In [None]:
rbf_pca = KernelPCA(n_components = 2, kernel="rbf", gamma=0.0433,
                    fit_inverse_transform=True)
y_pred = rbf_pca.predict(X_test)
print(name + 'svm_clf')
print(classification_report_imbalanced(y_test, y_pred))