In [1]:
% matplotlib inline
import pandas as pd
import numpy as np
import datetime

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, SelectFpr
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, Ridge, SGDRegressor, LinearRegression


In [3]:
data1 = pd.read_csv("madelon_pt_1.csv")
data2 = pd.read_csv("madelon_pt_2.csv")
data3 = pd.read_csv("madelon_pt_3.csv")
data = pd.concat([data1, data2, data3])

In [4]:
y_data = data['label']

In [5]:
X_data = data.drop('label', axis=1)

In [6]:
X_data.shape

(2000, 501)

In [7]:
a = train_test_split(X_data, y_data, random_state=10)

In [8]:
a[2]

450   -1
165   -1
325    1
577   -1
495   -1
98     1
15     1
260    1
286   -1
626    1
564    1
195    1
259   -1
529    1
382   -1
667    1
336    1
383   -1
239    1
658    1
109    1
555    1
385    1
35     1
543   -1
243   -1
592   -1
458    1
638   -1
458   -1
      ..
524    1
169   -1
574   -1
446   -1
687    1
668    1
54    -1
395    1
239   -1
356    1
152   -1
371   -1
44     1
502   -1
40     1
490   -1
256   -1
397    1
332   -1
505    1
549    1
120   -1
33    -1
480    1
447    1
693   -1
644   -1
527   -1
449   -1
589   -1
Name: label, dtype: int64

In [9]:
def split_data(X_df, y_df, random_state=None):
    X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=random_state)
    return {
            'X_train' : X_train,
            'X_test' : X_test,
            'y_train' : y_train,
            'y_test' : y_test,
            'random_state' : random_state
        }

In [10]:
def general_transformer(transformer, dd, random_state=None):
     
    local_dd = dict(dd)
    
    if transformer == 'StandardScaler':
        transformer.fit(local_dd['X_train'])
        dd['Scaler'] = transformer
    else:
        transformer.fit(local_dd['X_train'], local_dd['y_train'])
        local_dd['transformer'] = transformer
        
    
    local_dd['X_train'] = transformer.transform(local_dd['X_train'])
    local_dd['X_test'] = transformer.transform(local_dd['X_test'])
    
    return local_dd    
        
            

In [11]:
def general_model(model, dd, random_state=None):

    local_dd = dict(dd)
    
    model.fit(local_dd['X_train'], local_dd['y_train'])
    
    local_dd['train_score'] = model.score(local_dd['X_train'], local_dd['y_train'])
    local_dd['test_score'] = model.score(local_dd['X_test'], local_dd['y_test'])
    local_dd['model'] = model
    
    return local_dd    


In [12]:
def full_stack(X_df, y_v, scaler, transformer, model, fs_params, gs_params, random_state=None):
    dd1 = split_data(X_df, y_v, random_state=random_state)
    dd2 = general_transformer(scaler, dd1, random_state=random_state)
    
    l_dd4 = []
    for ks in fs_params:
        dd3 = general_transformer(transformer(k=ks), dd2, random_state=random_state)
        gs = GridSearchCV(model, param_grid=gs_params)
        l_dd4.append(general_model(gs, dd3))
    
    skb_gr = skb_gridsearch_results(fs_params, l_dd4)
  
    return {
        "train_score" : skb_gr["train_score"],
        "test_score" : skb_gr["test_score"],
        "best_k" : skb_gr["best_k"],
        "fdd" : skb_gr["fdd"]
        }
    

In [13]:
def skb_gridsearch_results(k_range, l_dd):
    k_range_min = min(k_range)
    max_test_score = 0
    for i in k_range:
        tst_s = l_dd[i-k_range_min]["test_score"]
        if max_test_score < tst_s:
            max_test_score = tst_s
            max_k = i
    l_dd[max_k-k_range_min]["train_score"], max_test_score, max_k
    
    return {
        "train_score" : l_dd[max_k-k_range_min]["test_score"],
        "test_score" : max_test_score,
        "best_k" : max_k,
        "fdd" : l_dd[max_k-k_range_min]
    }

In [14]:
#X_data, y_data

In [15]:
split_dd = split_data(X_data, y_data, random_state=10)

In [16]:
type(split_dd["X_train"])

pandas.core.frame.DataFrame

In [17]:
scaled_dd = general_transformer(StandardScaler(), split_dd)

In [18]:
feated_dd = general_transformer(SelectKBest(k=10), scaled_dd)

In [19]:
feated_dd["transformer"]

SelectKBest(k=10, score_func=<function f_classif at 0x000000000B9833C8>)

In [20]:
out1 = general_model(LogisticRegression(), feated_dd)
out1['train_score'], out1['test_score']

(0.61399999999999999, 0.59399999999999997)

In [21]:
out1

{'X_test': array([[ 0.87868003, -1.35299894,  2.34549604, ..., -1.84011875,
         -1.92843073,  1.8127922 ],
        [-1.01127223, -1.19551653,  0.39648044, ..., -0.18672098,
         -0.19069912,  0.44534192],
        [-1.54792534,  1.5379283 , -1.25152888, ..., -1.17259791,
         -1.20625655, -0.74782549],
        ..., 
        [ 0.13203222,  2.24659918, -1.13865153, ..., -0.01213861,
         -0.03272352, -1.32429964],
        [-0.3579554 , -0.71182053,  0.9608672 , ..., -0.37157291,
         -0.37124266,  0.08336978],
        [ 1.64866057,  0.70552123,  1.32959988, ..., -0.61804214,
         -0.61949003, -0.73441912]]),
 'X_train': array([[-0.00796424,  1.74040569, -1.04834965, ..., -1.46014535,
         -1.34166421, -0.98914025],
        [-1.61792357, -0.25062202, -1.19132763, ..., -0.01213861,
          0.01241237,  0.51237379],
        [ 0.92534551, -0.94804416,  0.82541438, ..., -0.65912034,
         -0.64205798,  1.06203518],
        ..., 
        [-2.01458022, -0.599333

In [22]:
out2 = general_model(LogisticRegression(penalty='l1'), feated_dd)
out2['train_score'], out2['test_score']

(0.61599999999999999, 0.60599999999999998)

In [23]:
out20 = general_model(KNeighborsClassifier(n_neighbors=7), feated_dd)
out20['train_score'], out20['test_score']

(0.89933333333333332, 0.878)

In [24]:
fs_kn_params = range(4, 25)    
gs_kn_params = {
    'n_neighbors' : range(3, 20, 2)
}
out21 = []
for ks in fs_kn_params:
    feated_dd = general_transformer(SelectKBest(k=ks), scaled_dd)
    gs_kn = GridSearchCV(KNeighborsClassifier(), param_grid=gs_kn_params)
    out21.append(general_model(gs_kn, feated_dd))


In [25]:
skb_gridsearch_results(fs_kn_params, out21)

{'best_k': 12,
 'fdd': {'X_test': array([[ 0.87868003, -1.35299894,  2.34549604, ..., -1.92843073,
           1.8127922 ,  1.00253944],
         [-1.01127223, -1.19551653,  0.39648044, ..., -0.19069912,
           0.44534192, -0.53355138],
         [-1.54792534,  1.5379283 , -1.25152888, ..., -1.20625655,
          -0.74782549, -0.51771539],
         ..., 
         [ 0.13203222,  2.24659918, -1.13865153, ..., -0.03272352,
          -1.32429964,  0.23449403],
         [-0.3579554 , -0.71182053,  0.9608672 , ..., -0.37124266,
           0.08336978,  0.96294947],
         [ 1.64866057,  0.70552123,  1.32959988, ..., -0.61949003,
          -0.73441912,  1.55679901]]),
  'X_train': array([[-0.00796424,  1.74040569, -1.04834965, ..., -1.34166421,
          -0.98914025, -1.09572895],
         [-1.61792357, -0.25062202, -1.19132763, ...,  0.01241237,
           0.51237379, -0.46228943],
         [ 0.92534551, -0.94804416,  0.82541438, ..., -0.64205798,
           1.06203518, -1.14323691],
    

In [26]:
fs_lr_params = range(4, 25)    
gs_lr_params = {
    'penalty' : ["l1", "l2"],
    'C' : [10**(i-3) for i in range(7)]    
}
out22 = []
for ks in fs_lr_params:
    feated_dd = general_transformer(SelectKBest(k=ks), scaled_dd)
    gs_lr = GridSearchCV(LogisticRegression(), param_grid=gs_lr_params)
    out22.append(general_model(gs_lr, feated_dd))

In [27]:
skb_gridsearch_results(fs_lr_params, out22)

{'best_k': 7,
 'fdd': {'X_test': array([[-1.35299894,  2.34549604,  2.7156381 , ..., -1.36531309,
          -2.89251894,  1.8127922 ],
         [-1.19551653,  0.39648044,  0.59868586, ..., -1.18234602,
          -0.18149756,  0.44534192],
         [ 1.5379283 , -1.25152888, -1.16544102, ...,  1.09992738,
          -0.14026529, -0.74782549],
         ..., 
         [ 2.24659918, -1.13865153, -1.25364736, ...,  2.46736546,
           0.88023325, -1.32429964],
         [-0.71182053,  0.9608672 ,  1.39254295, ..., -0.68159405,
          -1.21230417,  0.08336978],
         [ 0.70552123,  1.32959988,  1.03971757, ...,  0.7436231 ,
          -0.91337025, -0.73441912]]),
  'X_train': array([[ 1.74040569, -1.04834965, -0.90082198, ...,  1.99550302,
           0.58129934, -0.98914025],
         [-0.25062202, -1.19132763, -1.07723467, ..., -0.23862115,
          -0.0474927 ,  0.51237379],
         [-0.94804416,  0.82541438,  0.86330489, ..., -1.10530726,
          -0.12995723,  1.06203518],
     

In [28]:
fs_svc_params = range(4, 25)
gs_svc_params = {
    'C' : [10**(i-3) for i in range(7)]    
}
out23 = []
for ks in fs_svc_params:
    feated_dd = general_transformer(SelectKBest(k=ks), scaled_dd)
    gs_svc = GridSearchCV(SVC(), param_grid=gs_svc_params)
    out23.append(general_model(gs_svc, feated_dd))

In [29]:
skb_gridsearch_results(fs_svc_params, out23)

{'best_k': 12,
 'fdd': {'X_test': array([[ 0.87868003, -1.35299894,  2.34549604, ..., -1.92843073,
           1.8127922 ,  1.00253944],
         [-1.01127223, -1.19551653,  0.39648044, ..., -0.19069912,
           0.44534192, -0.53355138],
         [-1.54792534,  1.5379283 , -1.25152888, ..., -1.20625655,
          -0.74782549, -0.51771539],
         ..., 
         [ 0.13203222,  2.24659918, -1.13865153, ..., -0.03272352,
          -1.32429964,  0.23449403],
         [-0.3579554 , -0.71182053,  0.9608672 , ..., -0.37124266,
           0.08336978,  0.96294947],
         [ 1.64866057,  0.70552123,  1.32959988, ..., -0.61949003,
          -0.73441912,  1.55679901]]),
  'X_train': array([[-0.00796424,  1.74040569, -1.04834965, ..., -1.34166421,
          -0.98914025, -1.09572895],
         [-1.61792357, -0.25062202, -1.19132763, ...,  0.01241237,
           0.51237379, -0.46228943],
         [ 0.92534551, -0.94804416,  0.82541438, ..., -0.64205798,
           1.06203518, -1.14323691],
    

In [30]:
fs_lsvc_params = range(4, 25)
gs_lsvc_params = {
    'C' : [10**(i-3) for i in range(7)]    
}
out24 = []
for ks in fs_lsvc_params:
    feated_dd = general_transformer(SelectKBest(k=ks), scaled_dd)
    gs_lsvc = GridSearchCV(LinearSVC(), param_grid=gs_lsvc_params)
    out24.append(general_model(gs_lsvc, feated_dd))

In [31]:
skb_gridsearch_results(fs_lsvc_params, out24)

{'best_k': 4,
 'fdd': {'X_test': array([[-1.35299894,  1.66638542, -1.36531309,  1.8127922 ],
         [-1.19551653,  0.32618893, -1.18234602,  0.44534192],
         [ 1.5379283 , -0.64173075,  1.09992738, -0.74782549],
         ..., 
         [ 2.24659918, -1.70271964,  2.46736546, -1.32429964],
         [-0.71182053,  0.15866437, -0.68159405,  0.08336978],
         [ 0.70552123, -0.66034459,  0.7436231 , -0.73441912]]),
  'X_train': array([[ 1.74040569, -1.01400755,  1.99550302, -0.98914025],
         [-0.25062202,  0.45648582, -0.23862115,  0.51237379],
         [-0.94804416,  0.88460414, -1.10530726,  1.06203518],
         ..., 
         [-0.59933309, -0.04608787, -0.60455529, -0.06410035],
         [ 2.25784792, -1.4048982 ,  1.91846426, -1.31089326],
         [ 0.671775  , -1.34905668,  0.87844094, -1.49858252]]),
  'model': GridSearchCV(cv=None, error_score='raise',
         estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
       intercept_scaling=1, 

In [32]:
fs_lr_params = range(4, 25)    
gs_lr_params = {
    'penalty' : ["l1", "l2"],
    'C' : [10**(i-3) for i in range(7)]    
}    
    
out50 = full_stack(X_data, y_data, StandardScaler(), SelectKBest, LogisticRegression(), fs_lr_params, gs_lr_params, random_state=None)


In [33]:
out50

{'best_k': 6,
 'fdd': {'X_test': array([[ 1.29968251, -1.1617938 , -1.43603922, -0.16406109, -0.15605647,
           1.19076187],
         [ 1.35572746, -0.04946048, -0.54013809, -0.59855951, -0.53924763,
           1.47208429],
         [ 0.70186965, -0.70151794, -0.14427481,  0.58079335,  0.70048845,
           0.84245793],
         ..., 
         [-0.83002579,  1.04369469, -0.06093517, -0.3813103 , -0.38146303,
          -1.00623223],
         [-0.60584596, -0.05904956,  0.07449175, -0.13302549, -0.13351582,
          -0.57755045],
         [ 1.31836416, -0.28918749, -0.62347773, -0.84684432, -0.76465418,
           0.97642098]]),
  'X_train': array([[ 0.62714305, -0.71110702, -0.50888573, -0.86753472, -0.78719484,
           0.61472074],
         [-1.33443038,  0.61218607,  2.27257474,  1.81187222,  1.80498059,
          -1.34113987],
         [ 0.53373479, -0.20288576, -1.09226321, -1.68480557, -1.30562993,
           0.41377615],
         ..., 
         [-1.12893221,  1.1875309 ,