In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score, make_scorer

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix, log_loss
import scipy.stats as st

np.random.seed(42)

In [2]:
def get_cm(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    cm = cm / cm.astype(float).sum(axis=1)
    np.round_(cm, decimals=4, out=cm)
    
    return cm

In [3]:
def deskew_df(df):
    box_cox_df = pd.DataFrame()
    for col in df.columns:
        box_cox_col = st.boxcox(df[col])[0]
        box_cox_df[col] = pd.Series(box_cox_col)
        
    return box_cox_df

In [4]:
def test_model(my_model, X_train, y_train, X_test, y_test, my_rand_state=42, pca_flag=False, gs_params=None):
    if pca_flag:
        pipe = Pipeline([('scaler', StandardScaler()),
                         ('pca', PCA(n_components=5, random_state=my_rand_state)),
                         ('clf', my_model)
                        ])
    else:
        pipe = Pipeline([('scaler', StandardScaler()),
                         ('clf', my_model)
                        ])
    
    if gs_params != None:
        roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True)
        pipe_gs = GridSearchCV(pipe, param_grid=gs_params, cv=10, n_jobs=-1, verbose=1, scoring=roc_auc_scorer)
        pipe = pipe_gs
    
    pipe.fit(X_train, y_train)    
    train_score = pipe.score(X_train, y_train)
    test_score = pipe.score(X_test, y_test)
    
    y_pred = pipe.predict(X_test)
    cm = get_cm(y_test, y_pred)
    ll = log_loss(y_test, y_pred)
    
    print(my_model)
    print('train score:', train_score)
    print('test score:', test_score)
    print('log loss:', ll)
    print('confusion matrix:\n', cm)
    print()
    
    if gs_params != None:
        return pipe.best_params_

In [5]:
m_features = pd.read_pickle('../pickled_data-UCI/madelon_important_features.p')
imp_feats = list(m_features['feature'].head(20))

In [6]:
X_train = pd.read_pickle('../pickled_data-UCI/sample_1/df_sample_data.p')
X_train = X_train[imp_feats]

y_train = pd.read_pickle('../pickled_data-UCI/sample_1/df_sample_labels.p')
y_train = y_train.values.ravel()

X_test = pd.read_pickle('../pickled_data-UCI/madelon_valid_data.p')
X_test = X_test[imp_feats]

y_test = pd.read_pickle('../pickled_data-UCI/madelon_valid_labels.p')
y_test = y_test.values.ravel()

In [7]:
mm_scaler = MinMaxScaler(feature_range=(0.00001, 1))

X_train_mmsc = mm_scaler.fit_transform(X_train)
X_test_mmsc = mm_scaler.fit_transform(X_test)

In [8]:
X_train_ds = deskew_df(pd.DataFrame(X_train_mmsc))
X_test_ds = deskew_df(pd.DataFrame(X_test_mmsc))

## Model Testing Pipelines with Feature Selection

In [9]:
lr = LogisticRegression(C=1e-4, random_state=42)
test_model(lr, X_train_ds, y_train, X_test_ds, y_test)

LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
train score: 0.595
test score: 0.61
log loss: 13.470245399
confusion matrix:
 [[ 0.6933  0.3067]
 [ 0.4733  0.5267]]



In [10]:
dtc = DecisionTreeClassifier(random_state=42)
test_model(dtc, X_train_ds, y_train, X_test_ds, y_test)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=42, splitter='best')
train score: 1.0
test score: 0.696666666667
log loss: 10.476904768
confusion matrix:
 [[ 0.6433  0.3567]
 [ 0.25    0.75  ]]



In [11]:
knc = KNeighborsClassifier(n_neighbors=10)
test_model(knc, X_train_ds, y_train, X_test_ds, y_test)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')
train score: 0.805
test score: 0.706666666667
log loss: 10.1314437076
confusion matrix:
 [[ 0.8267  0.1733]
 [ 0.4133  0.5867]]



In [12]:
my_svc = SVC(C=1e-4, kernel='linear', random_state=42)
test_model(my_svc, X_train_ds, y_train, X_test_ds, y_test)

SVC(C=0.0001, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False)
train score: 0.503
test score: 0.5
log loss: 17.2697879962
confusion matrix:
 [[ 0.  1.]
 [ 0.  1.]]



## Model Testing Pipelines with Feature Selection and Dimensionality Reduction

In [13]:
test_model(lr, X_train_ds, y_train, X_test_ds, y_test, pca_flag=True)

LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
train score: 0.579
test score: 0.575
log loss: 14.6791598773
confusion matrix:
 [[ 0.55  0.45]
 [ 0.4   0.6 ]]



In [14]:
test_model(dtc, X_train_ds, y_train, X_test_ds, y_test, pca_flag=True)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=42, splitter='best')
train score: 1.0
test score: 0.65
log loss: 12.0886903452
confusion matrix:
 [[ 0.7033  0.2967]
 [ 0.4033  0.5967]]



In [15]:
test_model(knc, X_train_ds, y_train, X_test_ds, y_test, pca_flag=True)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')
train score: 0.825
test score: 0.708333333333
log loss: 10.0739137295
confusion matrix:
 [[ 0.74    0.26  ]
 [ 0.3233  0.6767]]



In [16]:
test_model(my_svc, X_train_ds, y_train, X_test_ds, y_test, pca_flag=True)

SVC(C=0.0001, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False)
train score: 0.503
test score: 0.5
log loss: 17.2697879962
confusion matrix:
 [[ 0.  1.]
 [ 0.  1.]]

