In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score, make_scorer

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix, log_loss
import scipy.stats as st

np.random.seed(42)

In [3]:
def get_cm(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    cm = cm / cm.astype(float).sum(axis=1)
    np.round_(cm, decimals=4, out=cm)
    
    return cm

In [4]:
def deskew_df(df):
    box_cox_df = pd.DataFrame()
    for col in df.columns:
        box_cox_col = st.boxcox(df[col])[0]
        box_cox_df[col] = pd.Series(box_cox_col)
        
    return box_cox_df

In [5]:
def test_model(my_model, X_train, y_train, X_test, y_test, my_rand_state=42, pca_flag=False, gs_params=None):
    if pca_flag:
        pipe = Pipeline([('scaler', StandardScaler()),
                         ('pca', PCA(n_components=5, random_state=my_rand_state)),
                         ('clf', my_model)
                        ])
    else:
        pipe = Pipeline([('scaler', StandardScaler()),
                         ('clf', my_model)
                        ])
    
    if gs_params != None:
        roc_auc_scorer = make_scorer(roc_auc_score, greater_is_better=True)
        pipe_gs = GridSearchCV(pipe, param_grid=gs_params, cv=10, n_jobs=-1, verbose=1, scoring=roc_auc_scorer)
        pipe = pipe_gs
    
    pipe.fit(X_train, y_train)    
    train_score = pipe.score(X_train, y_train)
    test_score = pipe.score(X_test, y_test)
    
    y_pred = pipe.predict(X_test)
    cm = get_cm(y_test, y_pred)
    ll = log_loss(y_test, y_pred)
    
    print(my_model)
    print('train score:', train_score)
    print('test score:', test_score)
    print('log loss:', ll)
    print('confusion matrix:\n', cm)
    print()
    
    if gs_params != None:
        return pipe.best_params_

In [6]:
def sample_df(X, y, sample_pct=.1):
    sample_size = int(sample_pct * X.shape[0])
    
    X_sample = X.sample(sample_size)
    y_sample = y.iloc[X_sample.index.values]
    
    return X_sample, y_sample

In [7]:
m_features = pd.read_pickle('../pickled_data-UCI/madelon_important_features.p')
imp_feats = list(m_features['feature'].head(20))

In [8]:
X_train = pd.read_pickle('../pickled_data-UCI/madelon_train_data.p')
X_train = X_train[imp_feats]

y_train = pd.read_pickle('../pickled_data-UCI/madelon_train_labels.p')
y_train = y_train.values.ravel()

X_test = pd.read_pickle('../pickled_data-UCI/madelon_valid_data.p')
X_test = X_test[imp_feats]

y_test = pd.read_pickle('../pickled_data-UCI/madelon_valid_labels.p')
y_test = y_test.values.ravel()

In [9]:
mm_scaler = MinMaxScaler(feature_range=(0.00001, 1))

X_train_mmsc = mm_scaler.fit_transform(X_train)
X_test_mmsc = mm_scaler.fit_transform(X_test)

In [10]:
X_train_ds = deskew_df(pd.DataFrame(X_train_mmsc))
X_test_ds = deskew_df(pd.DataFrame(X_test_mmsc))

## Run KNC with Feature Selection - Full Data

In [11]:
knc = KNeighborsClassifier(n_neighbors=10, weights='distance', p=1, algorithm='auto', n_jobs=-1)
test_model(knc, X_train_ds, y_train, X_test_ds, y_test)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=1,
           weights='distance')
train score: 1.0
test score: 0.77
log loss: 7.94400919187
confusion matrix:
 [[ 0.7733  0.2267]
 [ 0.2333  0.7667]]



## Run KNC with Feature Selection - Sample 1 (10%) 

In [15]:
X_train_sample, y_train_sample = sample_df(X_train_ds, pd.DataFrame(y_train))
y_train_sample = y_train_sample.values.ravel()
test_model(knc, X_train_sample, y_train_sample, X_test_ds, y_test)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=1,
           weights='distance')
train score: 1.0
test score: 0.696666666667
log loss: 10.4769034353
confusion matrix:
 [[ 0.6467  0.3533]
 [ 0.2533  0.7467]]



## Run KNC with Feature Selection - Sample 2 (10%) 

In [16]:
X_train_sample, y_train_sample = sample_df(X_train_ds, pd.DataFrame(y_train))
y_train_sample = y_train_sample.values.ravel()
test_model(knc, X_train_sample, y_train_sample, X_test_ds, y_test)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=1,
           weights='distance')
train score: 1.0
test score: 0.653333333333
log loss: 11.9735757498
confusion matrix:
 [[ 0.6667  0.3333]
 [ 0.36    0.64  ]]



## Run KNC with Feature Selection - Sample 3 (10%) 

In [17]:
X_train_sample, y_train_sample = sample_df(X_train_ds, pd.DataFrame(y_train))
y_train_sample = y_train_sample.values.ravel()
test_model(knc, X_train_sample, y_train_sample, X_test_ds, y_test)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=1,
           weights='distance')
train score: 1.0
test score: 0.676666666667
log loss: 11.1675950055
confusion matrix:
 [[ 0.8567  0.1433]
 [ 0.5033  0.4967]]



## Run KNC with Feature Selection - Sample 1 (25%) 

In [12]:
X_train_sample, y_train_sample = sample_df(X_train_ds, pd.DataFrame(y_train), .25)
y_train_sample = y_train_sample.values.ravel()
test_model(knc, X_train_sample, y_train_sample, X_test_ds, y_test)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=1,
           weights='distance')
train score: 1.0
test score: 0.698333333333
log loss: 10.4193214834
confusion matrix:
 [[ 0.69    0.31  ]
 [ 0.2933  0.7067]]



## Run KNC with Feature Selection - Sample 2 (25%) 

In [13]:
X_train_sample, y_train_sample = sample_df(X_train_ds, pd.DataFrame(y_train), .25)
y_train_sample = y_train_sample.values.ravel()
test_model(knc, X_train_sample, y_train_sample, X_test_ds, y_test)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=1,
           weights='distance')
train score: 1.0
test score: 0.721666666667
log loss: 9.61339671092
confusion matrix:
 [[ 0.74    0.26  ]
 [ 0.2967  0.7033]]



## Run KNC with Feature Selection - Sample 3 (25%) 

In [14]:
X_train_sample, y_train_sample = sample_df(X_train_ds, pd.DataFrame(y_train), .25)
y_train_sample = y_train_sample.values.ravel()
test_model(knc, X_train_sample, y_train_sample, X_test_ds, y_test)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=1,
           weights='distance')
train score: 1.0
test score: 0.716666666667
log loss: 9.78609992153
confusion matrix:
 [[ 0.7167  0.2833]
 [ 0.2833  0.7167]]

