In [1]:
# Custom Python Modules
import data_cleaning as dc
from custom_metrics import scania_score
from model_abstraction import cross_val_models, cross_val_xgb

import sys

# Data Containers
import pandas as pd
import numpy as np
from collections import defaultdict
import pickle

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing: standardizing, undersampling, and oversampling, gridsearch
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.model_selection import GridSearchCV, train_test_split,\
cross_val_score, StratifiedKFold, StratifiedShuffleSplit, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV
import imblearn

#Pipeline Experimentation
from sklearn.pipeline import make_pipeline

# Models: dummy, kNN, logistic regression, Naive Bayes, SVM, Gradient Boosting
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb

# Model evaluation metrics
# ROC curve
# Proprietary cost function
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
auc, log_loss, confusion_matrix, f1_score, make_scorer, roc_auc_score, roc_curve

Using TensorFlow backend.


In [2]:
X_train, X_test, y_train, y_test = dc.ready_aps_data()

In [4]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Columns: 147 entries, aa_000 to n_missing
dtypes: float64(147)
memory usage: 67.3 MB


In [3]:
y_train.value_counts()

0    59000
1     1000
Name: class, dtype: int64

In [5]:
y_test.value_counts()

0    15625
1      375
Name: class, dtype: int64

In [11]:
# Experiment with stratification in train-test-split
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=0.25)

In [12]:
np.sum(y_val==1)/len(y_val)

0.016666666666666666

In [13]:
np.sum(y_tr==1)/len(y_tr)

0.016666666666666666

### Naive Performance of Classifiers
Grab small subsection of data while setting up the pipeline so as not to make the runtime too large.  

'''   EXPERIMENT WITH MAKING A PIPELINE OBJECT .  '''

In [3]:
# Create stratified folds for cross-validation
skf = StratifiedKFold(n_splits=10, random_state=42)

In [9]:
# SkLearn Dummy classifier baseline
np.mean(cross_val_score(DummyClassifier(),X_train,y_train,
                        cv=skf, scoring = 'roc_auc'))

0.4980084745762713

In [10]:
# Absolute baseline: Predicting everything as negative
all_neg = np.zeros(len(y_test))
print(roc_auc_score(y_test, all_neg))
print(scania_score(y_test,all_neg))

0.5
187500


In [11]:
classifiers = {
    'knn': KNeighborsClassifier,
    'lgr': LogisticRegression,
    'gnb': GaussianNB,
    #'mnb': MultinomialNB,
    'bnb': BernoulliNB,
    'dtc': DecisionTreeClassifier,
    'rfc': RandomForestClassifier,
    'gbc': GradientBoostingClassifier,
    'lsvc': LinearSVC,
    # 'svc': SVC   ''' really, really slow'''
}

default_parameters = {
    'knn': {},
    'lgr': {'solver':'liblinear'},
    'gnb': {},
    #'mnb': {},
    'bnb': {},
    'dtc': {},
    'rfc': {'n_estimators':100},
    'gbc': {},
    'lsvc': {},
    # 'svc': {}
}

In [12]:
vanilla_unscaled_auc = cross_val_models(classifiers, X_train,y_train, use_cv=skf,
                                        metric='roc_auc', params=default_parameters)
print(vanilla_unscaled_auc)



defaultdict(<class 'str'>, {'knn': 0.9092805084745763, 'lgr': 0.8963062711864407, 'gnb': 0.9672299152542372, 'bnb': 0.9355677966101694, 'dtc': 0.8498983050847458, 'rfc': 0.9872572881355934, 'gbc': 0.98686906779661, 'lsvc': 0.7310842372881357})




In [14]:
vanilla_unscaled_prcsn = cross_val_models(classifiers, X_train,y_train, use_cv=skf, metric='precision',
                                         params=default_parameters)
print(vanilla_unscaled_prcsn)



defaultdict(<class 'str'>, {'knn': 0.675370369371087, 'lgr': 0.7664283216382832, 'gnb': 0.3353019626081763, 'bnb': 0.14451239676743705, 'dtc': 0.6967446081039919, 'rfc': 0.903895386727101, 'gbc': 0.8392126195160824, 'lsvc': 0.4201092371338735})




In [4]:
# XGBoost isn't part of sklearn, manually cross-validating using skf splits
xgb_cv_score = cross_val_xgb(X_train, y_train, skf, roc_auc_score,
                            pred_threshold = 0.03)

In [6]:
print(xgb_cv_score)

0.9619322033898305


## Naive Classifiers using Scaled Data

Looking at the effects of scaled data on model performance. I would only expect changes from models operating in a linear space, such as KNeighbors, Logistic Regression, and SVC's.

In [9]:
ssx = StandardScaler()
scaled_X_train = pd.DataFrame(ssx.fit_transform(X_train), columns = X_train.columns,
                              index=X_train.index)
scaled_X_test = pd.DataFrame(ssx.transform(X_test), columns = X_test.columns,
                             index=X_test.index)

In [10]:
print(type(scaled_X_train))

<class 'pandas.core.frame.DataFrame'>


In [21]:
vanilla_scaled = cross_val_models(classifiers, scaled_X_train,y_train, use_cv=skf, metric='roc_auc')
print(vanilla_scaled)



defaultdict(<class 'str'>, {'knn': 0.9150966949152544, 'lgr': 0.973212372881356, 'gnb': 0.9644616949152542, 'bnb': 0.9316935593220339, 'dtc': 0.8553389830508475, 'rfc': 0.9642127966101695, 'gbc': 0.9868824576271186, 'lsvc': 0.9455874576271185})




In [11]:
# Attempting XGB with scaled data
xgb_cv_score = cross_val_xgb(scaled_X_train, y_train, skf, roc_auc_score,
                            pred_threshold = 0.03)

In [12]:
print(xgb_cv_score)

0.961822033898305


## Models Worth Pursuing:

In [13]:
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
y_pred = rfc.predict(X_test)
test_err = scania_score(y_test,y_pred)




In [61]:
y_pred_threshold = [1 if x > 0.01 else 0 for x in rfc.predict_proba(X_test)[:,1]]

In [40]:
confusion_matrix(y_test,y_pred)

array([[15611,    14],
       [  130,   245]])

In [62]:
confusion_matrix(y_test,y_pred_threshold)

array([[15172,   453],
       [   19,   356]])

In [63]:
scania_score(y_test, y_pred_threshold)

14030

In [64]:
grb = GradientBoostingClassifier()
grb.fit(X_train,y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [105]:
y_pred = grb.predict(X_test)
y_pred_threshold = [1 if x > 0.0053 else 0 for x in grb.predict_proba(X_test)[:,1]]

In [106]:
print(scania_score(y_test,y_pred))
print(scania_score(y_test,y_pred_threshold))

59790
9730


In [107]:
confusion_matrix(y_test,y_pred_threshold)

array([[14952,   673],
       [    6,   369]])

In [None]:
model = GradientBoostingClassifier
const_params = {'random_state':42}
tuning_params = {'min_samples_split':[2,10,30,100],
                'max_depth':[3,5,10],
                'max_features':['auto', 'log2', None],
                'n_estimators':[5,10,50,100],
                 'subsample':[1,0.9,0.8, 0.7],
                }
metric = 'roc_auc'
use_cv = skf

grid_gb_scaled = RandomizedSearchCV(model(**const_params), tuning_params, scoring = metric,
                             iid=False, cv=use_cv, n_iter = 20)
grid_gb_scaled.fit(scaled_X_train, y_train)

## Score to beat: 9920. Type I: 542. Type 2: 9.
Current best: Vanilla Sklearn Gradient Boost with decision threshold at 0.0053