In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, SelectFromModel, RFE
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

np.random.seed(42)

In [2]:
X_train = pd.read_pickle('../pickled_data-UCI/sample_1/df_sample_data.p')
y_train = pd.read_pickle('../pickled_data-UCI/sample_1/df_sample_labels.p')
y_train = y_train.values.ravel()
X_test = pd.read_pickle('../pickled_data-UCI/madelon_valid_data.p')
y_test = pd.read_pickle('../pickled_data-UCI/madelon_valid_labels.p')
y_test = y_test.values.ravel()

## Feature Selection - Linear Regression

In [3]:
X = pd.read_pickle('../pickled_data-UCI/madelon_test_data.p')

In [4]:
linreg_pipe = Pipeline([('scaler', StandardScaler()), 
                        ('linreg', LinearRegression())
                       ])

rsq_scores = []
for col in X.columns:
    temp_X = X.drop(col, axis=1)
    temp_y = X[col]
    
    linreg_pipe.fit(temp_X, temp_y)
    train_score = linreg_pipe.score(temp_X, temp_y)
    
    rsq_scores.append(train_score)

In [5]:
linreg_feats = np.argsort(rsq_scores)
linreg_feats

array([387, 145, 346, 225, 343, 130, 287,  92, 496, 154, 485, 314, 298,
       113, 232,  76, 296, 126, 266, 394, 319, 440,  34,  17, 277,  98,
        93, 488, 210, 303, 246, 197, 221, 120, 361, 164,  87,  39, 190,
       240,  46,  83, 272, 434, 395, 422, 405, 477,  59,   2,  47, 329,
       494, 412,  52, 371, 461, 122,  99, 129, 110, 417, 384,  70,   7,
       101, 307,  23,  37, 377, 473, 392, 415, 487, 328, 201, 253, 364,
       291, 165, 386, 207, 282, 212,  60,  41, 124, 310,  69, 481, 432,
       247, 183, 140, 257, 168,  97, 112, 324, 260, 340, 466, 385, 439,
       420, 222, 407, 116, 151, 123, 419,  25, 149,  38,  84, 464,  71,
       332, 254, 156, 321, 483, 220, 401,  40, 304, 342, 229, 219, 396,
       390,  53,  81,   0, 223, 255, 150, 469, 218, 169,  61, 436, 299,
       300, 211, 382, 275,  19, 199, 195, 313, 317, 141, 117, 350, 262,
       148, 242,  50, 186, 160, 161, 458,  90, 391, 204, 217, 227, 264,
       270, 108, 435, 238,  44, 146, 428, 203, 308, 333,  75, 45

## Feature Selection - SelectKBest

In [6]:
skb_pipe = Pipeline([('scaler', StandardScaler()), 
                     ('skb', SelectKBest(k=20))
                    ])

In [7]:
skb_pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('skb', SelectKBest(k=20, score_func=<function f_classif at 0x7f6fb756d1e0>))])

In [8]:
skb_feats = np.where(skb_pipe.named_steps['skb'].get_support())[0]
skb_feats

array([ 48,  64,  85, 105, 128, 221, 241, 246, 285, 309, 323, 336, 338,
       414, 431, 442, 453, 472, 475, 493])

## Feature Selection - SelectFromModel - LogisticRegression

In [9]:
sfm_lr_pipe = Pipeline([('scaler', StandardScaler()),
                     ('sfm', SelectFromModel(LogisticRegression(C=1e-4, random_state=42)))
                    ])

In [10]:
sfm_lr_pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('sfm', SelectFromModel(estimator=LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
        prefit=False, threshold=None))])

In [11]:
sfm_lr_feats = np.where(sfm_lr_pipe.named_steps['sfm'].get_support())[0]
sfm_lr_feats

array([  1,   4,  10,  13,  20,  21,  24,  26,  31,  36,  39,  42,  43,
        44,  45,  48,  49,  50,  55,  61,  64,  66,  70,  71,  73,  77,
        78,  79,  80,  84,  85,  88,  89,  94,  95,  97,  98, 100, 102,
       103, 105, 106, 107, 115, 119, 120, 124, 126, 128, 129, 134, 137,
       139, 140, 146, 147, 149, 152, 154, 156, 159, 161, 162, 163, 164,
       166, 167, 168, 177, 180, 181, 185, 186, 187, 188, 193, 195, 201,
       204, 205, 207, 210, 211, 214, 217, 221, 223, 224, 226, 227, 236,
       238, 241, 245, 246, 248, 259, 262, 264, 266, 272, 277, 278, 281,
       282, 283, 284, 285, 286, 289, 290, 291, 298, 299, 301, 304, 306,
       307, 308, 309, 313, 314, 321, 323, 326, 332, 333, 336, 338, 342,
       343, 346, 347, 348, 349, 352, 355, 358, 359, 362, 365, 367, 368,
       372, 377, 378, 380, 382, 384, 393, 399, 401, 403, 404, 409, 410,
       411, 413, 414, 415, 417, 422, 424, 425, 426, 427, 430, 431, 434,
       442, 450, 452, 453, 454, 456, 458, 461, 462, 465, 467, 46

## Feature Selection - SelectFromModel - Decision Trees

In [12]:
sfm_dtc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('sfm', SelectFromModel(DecisionTreeClassifier(random_state=42)))
                    ])

In [13]:
sfm_dtc_pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('sfm', SelectFromModel(estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=42, splitter='best'),
        prefit=False, threshold=None))])

In [14]:
sfm_dtc_feats = np.where(sfm_dtc_pipe.named_steps['sfm'].get_support())[0]
sfm_dtc_feats

array([  6,  13,  25,  28,  29,  43,  44,  48,  62,  64,  71,  80,  81,
        86,  99, 100, 105, 113, 119, 120, 124, 128, 129, 140, 151, 153,
       158, 162, 169, 170, 174, 196, 197, 212, 215, 238, 241, 261, 270,
       273, 281, 282, 283, 291, 298, 300, 303, 313, 320, 326, 334, 336,
       338, 346, 351, 361, 368, 375, 378, 385, 388, 391, 394, 400, 420,
       431, 433, 436, 442, 451, 453, 462, 472, 493, 496])

## Feature Selection - SelectFromModel - SVC

In [15]:
sfm_svc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('sfm', SelectFromModel(SVC(C=1e-4, kernel='linear', random_state=42)))
                    ])

In [16]:
sfm_svc_pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('sfm', SelectFromModel(estimator=SVC(C=0.0001, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False),
        prefit=False, threshold=None))])

In [17]:
sfm_svc_feats = np.where(sfm_svc_pipe.named_steps['sfm'].get_support())[0]
sfm_svc_feats

array([  1,   2,   4,  10,  13,  17,  19,  20,  21,  24,  26,  31,  36,
        39,  42,  43,  44,  45,  48,  49,  50,  55,  61,  64,  66,  70,
        71,  73,  77,  78,  79,  80,  84,  85,  88,  89,  94,  95,  98,
       100, 102, 103, 105, 106, 107, 115, 119, 120, 121, 124, 126, 128,
       129, 134, 137, 139, 140, 149, 152, 153, 154, 156, 159, 161, 162,
       163, 164, 166, 167, 168, 177, 180, 181, 185, 186, 187, 188, 195,
       201, 204, 205, 207, 210, 214, 217, 221, 223, 224, 227, 236, 238,
       241, 245, 246, 248, 259, 262, 264, 266, 272, 278, 281, 282, 283,
       284, 285, 286, 289, 290, 291, 298, 299, 301, 303, 304, 306, 307,
       308, 309, 313, 321, 323, 326, 329, 332, 333, 336, 338, 342, 343,
       346, 347, 348, 349, 352, 355, 358, 359, 362, 365, 367, 368, 372,
       377, 378, 380, 382, 384, 393, 399, 401, 404, 409, 410, 411, 413,
       414, 415, 417, 422, 424, 425, 426, 427, 430, 431, 433, 434, 438,
       442, 450, 452, 453, 454, 455, 456, 458, 461, 462, 465, 46

## Recursive Feature Elimination - Logistic Regression

In [18]:
rfe_logreg_pipe = Pipeline([('scaler', StandardScaler()), 
                            ('rfe', RFE(LogisticRegression(C=1e-4, random_state=42), n_features_to_select=20, verbose=1))
                           ])

In [19]:
rfe_logreg_pipe.fit(X_train, y_train)

Fitting estimator with 500 features.
Fitting estimator with 499 features.
Fitting estimator with 498 features.
Fitting estimator with 497 features.
Fitting estimator with 496 features.
Fitting estimator with 495 features.
Fitting estimator with 494 features.
Fitting estimator with 493 features.
Fitting estimator with 492 features.
Fitting estimator with 491 features.
Fitting estimator with 490 features.
Fitting estimator with 489 features.
Fitting estimator with 488 features.
Fitting estimator with 487 features.
Fitting estimator with 486 features.
Fitting estimator with 485 features.
Fitting estimator with 484 features.
Fitting estimator with 483 features.
Fitting estimator with 482 features.
Fitting estimator with 481 features.
Fitting estimator with 480 features.
Fitting estimator with 479 features.
Fitting estimator with 478 features.
Fitting estimator with 477 features.
Fitting estimator with 476 features.
Fitting estimator with 475 features.
Fitting estimator with 474 features.
F

Fitting estimator with 275 features.
Fitting estimator with 274 features.
Fitting estimator with 273 features.
Fitting estimator with 272 features.
Fitting estimator with 271 features.
Fitting estimator with 270 features.
Fitting estimator with 269 features.
Fitting estimator with 268 features.
Fitting estimator with 267 features.
Fitting estimator with 266 features.
Fitting estimator with 265 features.
Fitting estimator with 264 features.
Fitting estimator with 263 features.
Fitting estimator with 262 features.
Fitting estimator with 261 features.
Fitting estimator with 260 features.
Fitting estimator with 259 features.
Fitting estimator with 258 features.
Fitting estimator with 257 features.
Fitting estimator with 256 features.
Fitting estimator with 255 features.
Fitting estimator with 254 features.
Fitting estimator with 253 features.
Fitting estimator with 252 features.
Fitting estimator with 251 features.
Fitting estimator with 250 features.
Fitting estimator with 249 features.
F

Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.


Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('rfe', RFE(estimator=LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
  n_features_to_select=20, step=1, verbose=1))])

In [20]:
rfe_logreg_feats = np.where(rfe_logreg_pipe.named_steps['rfe'].get_support())[0]
rfe_logreg_feats

array([ 48,  64,  85, 105, 128, 221, 241, 246, 285, 309, 323, 336, 338,
       414, 431, 442, 453, 472, 475, 493])

## Recursive Feature Elimination - Decision Tree Classifier

In [21]:
rfe_dtc_pipe = Pipeline([('scaler', StandardScaler()), 
                            ('rfe', RFE(DecisionTreeClassifier(random_state=42), n_features_to_select=20, verbose=1))
                           ])

In [22]:
rfe_dtc_pipe.fit(X_train, y_train)

Fitting estimator with 500 features.
Fitting estimator with 499 features.
Fitting estimator with 498 features.
Fitting estimator with 497 features.
Fitting estimator with 496 features.
Fitting estimator with 495 features.
Fitting estimator with 494 features.
Fitting estimator with 493 features.
Fitting estimator with 492 features.
Fitting estimator with 491 features.
Fitting estimator with 490 features.
Fitting estimator with 489 features.
Fitting estimator with 488 features.
Fitting estimator with 487 features.
Fitting estimator with 486 features.
Fitting estimator with 485 features.
Fitting estimator with 484 features.
Fitting estimator with 483 features.
Fitting estimator with 482 features.
Fitting estimator with 481 features.
Fitting estimator with 480 features.
Fitting estimator with 479 features.
Fitting estimator with 478 features.
Fitting estimator with 477 features.
Fitting estimator with 476 features.
Fitting estimator with 475 features.
Fitting estimator with 474 features.
F

Fitting estimator with 278 features.
Fitting estimator with 277 features.
Fitting estimator with 276 features.
Fitting estimator with 275 features.
Fitting estimator with 274 features.
Fitting estimator with 273 features.
Fitting estimator with 272 features.
Fitting estimator with 271 features.
Fitting estimator with 270 features.
Fitting estimator with 269 features.
Fitting estimator with 268 features.
Fitting estimator with 267 features.
Fitting estimator with 266 features.
Fitting estimator with 265 features.
Fitting estimator with 264 features.
Fitting estimator with 263 features.
Fitting estimator with 262 features.
Fitting estimator with 261 features.
Fitting estimator with 260 features.
Fitting estimator with 259 features.
Fitting estimator with 258 features.
Fitting estimator with 257 features.
Fitting estimator with 256 features.
Fitting estimator with 255 features.
Fitting estimator with 254 features.
Fitting estimator with 253 features.
Fitting estimator with 252 features.
F

Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 fe

Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('rfe', RFE(estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=42, splitter='best'),
  n_features_to_select=20, step=1, verbose=1))])

In [23]:
rfe_dtc_feats = np.where(rfe_dtc_pipe.named_steps['rfe'].get_support())[0]
rfe_dtc_feats

array([ 43,  62,  64, 105, 113, 128, 140, 153, 241, 281, 291, 318, 334,
       338, 368, 378, 436, 442, 451, 472])

## Feature Frequency Table

In [24]:
m_features = np.hstack((linreg_feats, skb_feats, sfm_lr_feats, sfm_dtc_feats, sfm_svc_feats, rfe_logreg_feats, rfe_dtc_feats))
m_features_df = pd.value_counts(m_features).to_frame().reset_index()
m_features_df.columns = ['feature', 'feature_count']
m_features_df.head(20)

Unnamed: 0,feature,feature_count
0,64,7
1,338,7
2,241,7
3,128,7
4,472,7
5,105,7
6,442,7
7,431,6
8,336,6
9,493,6


In [25]:
m_features_df.to_pickle('../pickled_data-UCI/madelon_important_features.p')