In [1]:
import matplotlib.pyplot as plt
import numpy as np
import scipy
import pandas as pd
import sklearn as skl

In [2]:
mdata= pd.read_csv('train_imperson_without4n7_balanced_data.csv')
Xtrain, Ytrain = mdata.loc[:, mdata.columns != '155'], mdata['155']

mdata_test= pd.read_csv('test_imperson_without4n7_balanced_data.csv')
Xtest, Ytest = mdata_test.loc[:, mdata_test.columns != '155'], mdata_test['155']

In [3]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVC
def column_keeper(col):
    """ Create a function that discards all columns except those in the 'col' parameter"""
    def keep_columns(X):
        return X[:, col]
    return keep_columns

selected_features = sorted([25,16,22,13,77,15,2,3,23,34])

pipeline = make_pipeline(
    VarianceThreshold(),
    FunctionTransformer(column_keeper(selected_features)),
    StandardScaler(),
    SVC(),
)


In [5]:
# Train the model on the training data
pipeline.fit(Xtrain, Ytrain)



Pipeline(memory=None,
     steps=[('variancethreshold', VarianceThreshold(threshold=0.0)), ('functiontransformer', FunctionTransformer(accept_sparse=False, check_inverse=True,
          func=<function column_keeper.<locals>.keep_columns at 0x00000264B14E9730>,
          inv_kw_args=None, inverse_func=None, kw_args=None,
     ...f', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [6]:
pipeline.score(Xtest, Ytest)



0.5260222122615668

In [7]:
from sklearn.ensemble import RandomForestClassifier

rf_pipeline = make_pipeline(
    VarianceThreshold(),
    FunctionTransformer(column_keeper(selected_features)),
    RandomForestClassifier(),
)

In [8]:
rf_pipeline.fit(Xtrain, Ytrain)



Pipeline(memory=None,
     steps=[('variancethreshold', VarianceThreshold(threshold=0.0)), ('functiontransformer', FunctionTransformer(accept_sparse=False, check_inverse=True,
          func=<function column_keeper.<locals>.keep_columns at 0x00000264CF368E18>,
          inv_kw_args=None, inverse_func=None, kw_args=None,
     ...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [9]:
rf_pipeline.score(Xtest, Ytest)



0.5361322775038597

In [10]:
%%time
from catboost import CatBoostClassifier
cb_top_6 = sorted(['8', '38', '67', '119', '76', '78'])
cb_pipeline = make_pipeline(
    FunctionTransformer(column_keeper(cb_top_6)),
    CatBoostClassifier(),
)

cb_pipeline.fit(Xtrain, Ytrain)
cb_pipeline.score(Xtest, Ytest)

ModuleNotFoundError: No module named 'catboost'

In [12]:
pipeline.named_steps

{'variancethreshold': VarianceThreshold(threshold=0.0),
 'functiontransformer': FunctionTransformer(accept_sparse=False, check_inverse=True,
           func=<function column_keeper.<locals>.keep_columns at 0x00000264B14E9730>,
           inv_kw_args=None, inverse_func=None, kw_args=None,
           pass_y='deprecated', validate=None),
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'svc': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
   kernel='rbf', max_iter=-1, probability=False, random_state=None,
   shrinking=True, tol=0.001, verbose=False)}

In [14]:
vat = pipeline.named_steps['variancethreshold']

In [15]:
vat.variances_

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.41517214e-04,
       2.41517214e-04, 1.25629093e-01, 1.25629093e-01, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.16349859e-04,
       2.16349859e-04, 2.16349859e-04, 0.00000000e+00, 2.16349859e-04,
       0.00000000e+00, 2.16349859e-04, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.16349859e-04,
       0.00000000e+00, 0.00000000e+00, 2.16349859e-04, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.20578048e-02,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       2.16349859e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.64292310e-01, 2.08288948e-04, 0.00000000e+00, 2.09370196e-01,
       2.09282912e-01, 2.16349859e-04, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      