In [108]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, SelectFromModel, RFE
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, make_scorer

# Seed NumPy's global random number generator so re-runs of the notebook are repeatable.
np.random.seed(seed=42)

In [100]:
# Load the pickled training sample, its labels, and the test data.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
X_train = pd.read_pickle('pickled_data/df_sample_data_10_1.p')
# Flatten the labels into a 1-D NumPy array
# (presumably stored as a single-column DataFrame -- confirm).
y_train = pd.read_pickle('pickled_data/df_sample_labels_10_1.p').values.ravel()
X_test = pd.read_pickle('pickled_data/madelon_test_data.p')

In [101]:

# sfm_pipe = Pipeline()
# rfe_pipe = Pipeline()
# sp_pipe = Pipeline()

## Feature Selection - SelectKBest

In [102]:
# Two-stage pipeline: standardise every feature, then score the features
# with SelectKBest using its default settings.
skb_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('skb', SelectKBest()),
])

In [103]:
# Fit the scaler and the univariate selector on the training sample.
skb_pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('skb', SelectKBest(k=10, score_func=<function f_classif at 0x7f257e055d08>))])

In [104]:
# Column positions whose univariate p-value (from the fitted 'skb' step) is below 0.05.
# The selector's own `k` is not consulted here; the p-value cut-off decides instead.
skb_feats = np.flatnonzero(skb_pipe.named_steps['skb'].pvalues_ < 0.05)
skb_feats

array([  4,  26,  30,  36,  64,  88, 121, 132, 157, 163, 175, 198, 207,
       226, 230, 241, 245, 253, 269, 282, 289, 336, 338, 352, 383, 442,
       443, 446, 452, 453, 456, 472, 475, 484, 486, 489, 494])

## Feature Selection - SelectFromModel

In [126]:
# Standardise the features, then let SelectFromModel pick features using a
# logistic regression. C=1e9 sets a very weak regularisation strength, and
# threshold='mean' makes the cut-off the mean of the per-feature importances.
sfm_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    (
        'select from logistic regression',
        SelectFromModel(
            estimator=LogisticRegression(C=1e9),
            threshold='mean',
        ),
    ),
])

In [127]:
# Fit the scaler and the model-based selector on the training sample.
sfm_pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('select from logistic regression', SelectFromModel(estimator=LogisticRegression(C=1000000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
        prefit=False, threshold='mean'))])

In [130]:
# Integer positions of the columns retained by the fitted SelectFromModel step.
sfm_feats = sfm_pipe.named_steps['select from logistic regression'].get_support(indices=True)
sfm_feats

array([  3,   4,   5,   9,  10,  12,  16,  17,  18,  26,  27,  29,  30,
        33,  34,  35,  36,  38,  40,  42,  45,  46,  52,  54,  58,  60,
        61,  64,  67,  69,  71,  73,  78,  80,  82,  85,  88,  89,  90,
        93,  96, 100, 102, 111, 113, 115, 116, 117, 118, 119, 120, 121,
       124, 130, 132, 136, 137, 139, 140, 143, 145, 149, 154, 157, 158,
       163, 167, 174, 175, 177, 178, 181, 183, 189, 192, 198, 201, 203,
       205, 207, 210, 213, 214, 217, 219, 220, 221, 222, 223, 226, 227,
       228, 230, 231, 236, 238, 239, 240, 241, 242, 243, 245, 250, 253,
       257, 259, 260, 262, 263, 264, 266, 267, 269, 270, 273, 274, 277,
       278, 279, 280, 285, 286, 288, 289, 290, 295, 297, 299, 300, 302,
       303, 310, 313, 317, 318, 319, 320, 327, 328, 330, 331, 334, 336,
       338, 340, 347, 348, 349, 350, 352, 355, 356, 357, 359, 360, 362,
       369, 370, 372, 373, 374, 377, 378, 379, 382, 383, 384, 387, 389,
       392, 395, 398, 399, 400, 402, 412, 421, 424, 425, 427, 42