# Automated feature selection

**Reasons to have:**

* Some automatically created features are garbage
* Reduces complexity
* Trains faster
* Improves accuracy
* Reduces overfitting

**Methods:**
1. Filter methods
2. Wrapper Methods
3. Embedded Methods

In [42]:
import numpy as np
import pandas as pd

from IPython.display import Image

## Load data

In [5]:
# Sampled training set that already contains the generated features.
input_file = '../data/train_feature_sample.csv'

# Explicit dtypes for the raw columns, passed to pd.read_csv when loading.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)

In [36]:
# Read the sample with the explicit dtypes declared in the previous cell.
df_train = pd.read_csv(input_file, dtype=dtypes)

# Target vector, and the feature matrix without the raw timestamp and target.
y = df_train['is_attributed']
X = df_train.drop(columns=['click_time', 'is_attributed'])

X.head()

Unnamed: 0,ip,app,device,os,channel,id,"app.isin([1, 2, 3, 4, 5])",COUNT(clicks),PERCENT_TRUE(clicks.is_attributed),NUM_UNIQUE(clicks.ip),...,MODE(clicks.channel),COUNT(clicks WHERE is_attributed = True),NUM_UNIQUE(clicks.DAY(click_time)),NUM_UNIQUE(clicks.YEAR(click_time)),NUM_UNIQUE(clicks.MONTH(click_time)),NUM_UNIQUE(clicks.WEEKDAY(click_time)),MODE(clicks.DAY(click_time)),MODE(clicks.YEAR(click_time)),MODE(clicks.MONTH(click_time)),MODE(clicks.WEEKDAY(click_time))
0,87540,12,1,13,497,0,False,13198,7.6e-05,9190,...,178,1.0,4,1,1,4,8,2017,11,2
1,105560,25,1,17,259,1,False,804,0.0,740,...,259,0.0,4,1,1,4,7,2017,11,1
2,101424,12,1,19,212,2,False,13198,7.6e-05,9190,...,178,1.0,4,1,1,4,8,2017,11,2
3,94584,13,1,13,477,3,False,2422,0.0,2179,...,477,0.0,4,1,1,4,7,2017,11,1
4,68413,12,1,1,178,4,False,13198,7.6e-05,9190,...,178,1.0,4,1,1,4,8,2017,11,2


# Filter methods

* statistical tests
* independent of machine learning algorithms
* used as a preprocessing step before machine learning 

In [44]:
# Embed the filter-methods diagram from the local img directory.
Image(url= "../img/filter-methods.png", width=600, height=600)

## 1. Common sense

In [25]:
# Number of candidate columns in X before any selection is applied.
len(X.columns)

27

In [11]:
# Show the column labels so obviously uninformative features can be spotted by hand.
X.columns

Index(['ip', 'app', 'device', 'os', 'channel', 'is_attributed', 'id',
       'app.isin([1, 2, 3, 4, 5])', 'COUNT(clicks)',
       'PERCENT_TRUE(clicks.is_attributed)', 'NUM_UNIQUE(clicks.ip)',
       'NUM_UNIQUE(clicks.device)', 'NUM_UNIQUE(clicks.os)',
       'NUM_UNIQUE(clicks.channel)', 'MODE(clicks.ip)', 'MODE(clicks.device)',
       'MODE(clicks.os)', 'MODE(clicks.channel)',
       'COUNT(clicks WHERE is_attributed = True)',
       'NUM_UNIQUE(clicks.DAY(click_time))',
       'NUM_UNIQUE(clicks.YEAR(click_time))',
       'NUM_UNIQUE(clicks.MONTH(click_time))',
       'NUM_UNIQUE(clicks.WEEKDAY(click_time))',
       'MODE(clicks.DAY(click_time))', 'MODE(clicks.YEAR(click_time))',
       'MODE(clicks.MONTH(click_time))', 'MODE(clicks.WEEKDAY(click_time))'],
      dtype='object')

In [14]:
# Manual pruning: every preview row above shows a single distinct year, so this
# column looks constant. drop() returns a new frame; X itself is not modified.
X_tmp = X.drop(columns='NUM_UNIQUE(clicks.YEAR(click_time))')

## 2. Variance

* Removes features with low variance
* Docs: http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html#sklearn.feature_selection.VarianceThreshold

In [24]:
from sklearn.feature_selection import VarianceThreshold

# p * (1 - p) is the variance of a Bernoulli variable; with p = 0.8 this is the
# cut-off the scikit-learn docs use to drop boolean features that take the same
# value in more than 80% of the samples.
min_variance = .8 * (1 - .8)
sel = VarianceThreshold(threshold=min_variance)
X_tmp = sel.fit_transform(X)

# Number of columns that survived the filter.
X_tmp.shape[1]

19

In [20]:
# Per-column variances computed by the selector while fitting on X.
sel.variances_ 

array([4.87695579e+09, 2.23246188e+02, 6.74266752e+04, 3.12960316e+03,
       1.68282123e+04, 2.26484710e-03, 8.33333333e+08, 2.22434039e-01,
       3.38776416e+07, 4.59798623e-04, 1.37597005e+07, 2.61801414e+01,
       1.60288232e+02, 1.08544473e+02, 5.72870826e+08, 2.19034011e+04,
       1.22545909e+03, 1.73382946e+04, 3.08243438e+01, 2.81597500e-02,
       0.00000000e+00, 0.00000000e+00, 2.81597500e-02, 3.60955211e-01,
       0.00000000e+00, 0.00000000e+00, 3.60955211e-01])

# Wrapper methods

* "wraps" machine learning algorithms
* adds or removes features from a subset

**Examples:**

* Forward Selection - starts from no features, adds one by one
* Backward Elimination - starts from all features, removes one by one
* Recursive Feature Elimination - greedy optimization algorithm that repeatedly fits a model and prunes the least important features

## 1. Recursive feature elimination
* Recursively considers smaller and smaller sets of features
* Docs: http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE

In [37]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [38]:
# Rank the features with recursive feature elimination wrapped around a
# logistic regression, keeping the three best ones.
model = LogisticRegression()
# n_features_to_select has to be passed by keyword: recent scikit-learn
# releases made it keyword-only and reject the positional form RFE(model, 3).
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, y)

In [39]:
# Summarize the RFE result: how many features were kept, the boolean mask of
# kept columns, and the elimination rank of every column (1 = selected).
print(f"Num Features: {fit.n_features_:d}")
print(f"Selected Features: {fit.support_}")
print(f"Feature Ranking: {fit.ranking_}")

Num Features: 3
Selected Features: [False False False False False False False False  True False False False
 False False False False False False  True False False  True False False
 False False]
Feature Ranking: [19 12 15 14 10 21  6 17  1 16  4  7  8 22  9 18 11  3  1 23 24  1  5 13
 20  2]


In [41]:
# Pair every column label with its entry in the RFE mask and print the kept ones.
for column, selected in zip(X.columns, fit.support_):
    if not selected:
        continue
    print(f"{column}: {selected}")

PERCENT_TRUE(clicks.is_attributed): True
NUM_UNIQUE(clicks.DAY(click_time)): True
NUM_UNIQUE(clicks.WEEKDAY(click_time)): True


# Embedded Methods
* L1 regularization - Lasso Regression
* Vowpal Wabbit supports regularization via --l1 and --l2 CLI arguments

## 1. L1 regularization - L1-penalized (Lasso-style) Logistic Regression

In [81]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# The L1 penalty needs a solver that supports it. The current default solver
# ('lbfgs') only handles L2 / no penalty and raises ValueError for 'l1', so
# 'liblinear' is selected explicitly.
model = LogisticRegression(penalty='l1', solver='liblinear')
# Keep only the features whose absolute coefficient reaches the threshold.
sfm = SelectFromModel(model, threshold=0.05)
fit = sfm.fit(X, y)
X_tmp = sfm.transform(X)
n_features = X_tmp.shape[1]

In [82]:
# Number of columns left after the SelectFromModel transform.
n_features

8

In [83]:
# Map the selector's boolean support mask back to column labels.
X.columns[fit.get_support()]

Index(['app.isin([1, 2, 3, 4, 5])', 'PERCENT_TRUE(clicks.is_attributed)',
       'NUM_UNIQUE(clicks.device)', 'NUM_UNIQUE(clicks.os)',
       'COUNT(clicks WHERE is_attributed = True)',
       'NUM_UNIQUE(clicks.DAY(click_time))',
       'NUM_UNIQUE(clicks.WEEKDAY(click_time))',
       'MODE(clicks.WEEKDAY(click_time))'],
      dtype='object')

## Tree-based feature selection

### 1. ExtraTreesClassifier

In [88]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees are randomized; pin the seed so the importances printed in the
# next cell can be reproduced on a fresh kernel.
model = ExtraTreesClassifier(random_state=42)
model = model.fit(X, y)

In [89]:
# Print every column label next to the importance the fitted model assigned to it.
for column, importance in zip(X.columns, model.feature_importances_):
    print(f"{column}: {importance}")

ip: 0.2592652184984376
app: 0.003253600013990679
device: 0.03946934184906167
os: 0.15059304629280784
channel: 0.11440145993224052
id: 0.21848299585344466
app.isin([1, 2, 3, 4, 5]): 0.0006180732882415673
COUNT(clicks): 0.005268426818902718
PERCENT_TRUE(clicks.is_attributed): 0.11234806628569807
NUM_UNIQUE(clicks.ip): 0.0065896650674301255
NUM_UNIQUE(clicks.device): 0.008148857456277017
NUM_UNIQUE(clicks.os): 0.01912024812887446
NUM_UNIQUE(clicks.channel): 0.002707400680935038
MODE(clicks.ip): 0.003958664527773973
MODE(clicks.device): 0.0012040621637600364
MODE(clicks.os): 0.0015037517285559493
MODE(clicks.channel): 0.005502121715533106
COUNT(clicks WHERE is_attributed = True): 0.03543788279187996
NUM_UNIQUE(clicks.DAY(click_time)): 0.0005099438002270518
NUM_UNIQUE(clicks.YEAR(click_time)): 0.0
NUM_UNIQUE(clicks.MONTH(click_time)): 0.0
NUM_UNIQUE(clicks.WEEKDAY(click_time)): 0.0064868212434843735
MODE(clicks.DAY(click_time)): 0.002824747205215367
MODE(clicks.YEAR(click_time)): 0.0
MODE(c

### 2. RandomForestClassifier

In [94]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Fixed seed so the cross-validated AUC and the later feature importances are
# reproducible across runs; the forest is otherwise re-randomized every time.
clf = RandomForestClassifier(n_estimators=400, n_jobs=-1, random_state=42)
# 3-fold cross-validation scored with ROC AUC.
scores = cross_val_score(estimator=clf, X=X, y=y, cv=3, scoring="roc_auc", verbose=True)

"AUC %.2f +/- %.2f" % (scores.mean(), scores.std())

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   12.3s finished


'AUC 0.90 +/- 0.06'

In [95]:
# cross_val_score fits clones of the estimator, so clf itself is still
# unfitted here; train it on the full data before reading its importances.
clf = clf.fit(X, y)

In [96]:
def feature_importances(model, features, n=10):
    """Print and return the ``n`` most important feature names.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        features: Iterable of feature names, in the same order as the
            importances. Iterating a DataFrame yields its column labels.
        n: Maximum number of features to report.

    Returns:
        List of feature names ordered by decreasing importance.
    """
    # Negating the score sorts in descending order while keeping ties in
    # their original relative order.
    ranked = sorted(
        zip(features, model.feature_importances_),
        key=lambda pair: -pair[1],
    )
    top = ranked[:n]

    for rank, (name, score) in enumerate(top, start=1):
        print("%d: Feature: %s, %.3f" % (rank, name, score))

    return [name for name, _ in top]

# Passing X works because iterating a DataFrame yields its column labels,
# which feature_importances zips with the model's importances.
top_features = feature_importances(clf, X, n=20)

1: Feature: ip, 0.295
2: Feature: id, 0.226
3: Feature: os, 0.116
4: Feature: PERCENT_TRUE(clicks.is_attributed), 0.086
5: Feature: channel, 0.082
6: Feature: COUNT(clicks WHERE is_attributed = True), 0.038
7: Feature: device, 0.035
8: Feature: NUM_UNIQUE(clicks.os), 0.017
9: Feature: COUNT(clicks), 0.015
10: Feature: MODE(clicks.ip), 0.014
11: Feature: NUM_UNIQUE(clicks.ip), 0.014
12: Feature: NUM_UNIQUE(clicks.device), 0.014
13: Feature: MODE(clicks.channel), 0.012
14: Feature: app, 0.009
15: Feature: NUM_UNIQUE(clicks.channel), 0.007
16: Feature: MODE(clicks.device), 0.005
17: Feature: MODE(clicks.os), 0.005
18: Feature: MODE(clicks.DAY(click_time)), 0.003
19: Feature: MODE(clicks.WEEKDAY(click_time)), 0.003
20: Feature: NUM_UNIQUE(clicks.WEEKDAY(click_time)), 0.003


### 3. XGBoost

In [97]:
import xgboost as xgb

# Hyper-parameters for the gradient-boosted classifier, gathered in one place.
xgb_params = {
    'max_depth': 4,
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'colsample_bylevel': 0.7,
    'scale_pos_weight': 9,
    'min_child_weight': 0,
    'reg_alpha': 4,
    'n_jobs': 4,
    'objective': 'binary:logistic',
}
clf_xgBoost = xgb.XGBClassifier(**xgb_params)

# Train on the full feature matrix.
clf_xgBoost.fit(X, y)

ModuleNotFoundError: No module named 'xgboost'

In [None]:
from sklearn import preprocessing

# Collect the booster's per-feature scores under each importance definition.
booster = clf_xgBoost.get_booster()
importance_dict = {
    'xgBoost-' + import_type: booster.get_score(importance_type=import_type)
    for import_type in ['weight', 'gain', 'cover']
}

# One row per feature, one column per importance type; missing entries become 0.
raw_importances = pd.DataFrame(importance_dict).fillna(0)

# Min-max scale each importance column so the three types are comparable.
scaled_values = preprocessing.MinMaxScaler().fit_transform(raw_importances)
importance_df = pd.DataFrame(
    scaled_values,
    columns=raw_importances.columns,
    index=raw_importances.index,
)

# Row-wise average of the scaled importances.
importance_df['mean'] = importance_df.mean(axis=1)

In [None]:
# Bar chart of the scaled XGBoost importances, ordered by their mean.
ranked_importances = importance_df.sort_values('mean')
ranked_importances.plot.bar(figsize=(20, 7))