# Feature selection

In [1]:
import os
import time
from math import log

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn import metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from IPython.display import display

# Wall-clock start time; the last cell subtracts it from time.time() to report the total run time.
t = time.time()

In [2]:
# Load the preprocessed training set and shuffle its rows.
# The shuffle is seeded so that a Restart & Run All reproduces the same row order
# (and therefore the same order-dependent results downstream).
df = pd.read_csv('data/df_train_prepro.csv').sample(frac=1, random_state=0)
df.shape

(1610494, 46)

In [3]:
# Names of the numeric predictors.
columns_quant = [
    'contextid',
    'campaignctrlast24h',
    'dayssincelastvisitdouble',
    'ltf_nbglobaldisplay_4w',
    'ltf_nbpartnerdisplayssincelastclick',
    'ltf_nbpartnerdisplay_90d',
    'ltf_nbpartnerclick_90d',
    'ltf_nbpartnersales_90d',
    'nbdayssincelastclick',
    'nbdisplay_1hour',
    'nbdisplayglobalapprox_1d_sum_xdevice',
    'display_size',
    'zonecostineuro',
]

# Names of the categorical predictors (one-hot encoded below).
columns_cat = [
    'display_env',
    'target_env',
    'campaignscenario',
    'campaignvertical',
    'is_interstitial',
    'device_type',
]


def _report_columns(frame):
    """Print how many columns `frame` has, then display its column index."""
    print(len(frame.columns))
    display(frame.columns)


# Quantitative features: raw frame plus a standardised copy.
X_quant = df[columns_quant]
X_quant_scaled = StandardScaler().fit_transform(X_quant)
_report_columns(X_quant)

# Categorical features: dummy-encoded (first level of each variable dropped), then standardised.
X_cat = pd.get_dummies(df[columns_cat], columns=columns_cat, drop_first=True)
X_cat_scaled = StandardScaler().fit_transform(X_cat)
_report_columns(X_cat)

# Quantitative + categorical features together, encoded and standardised the same way.
X_quant_cat = pd.get_dummies(
    df[columns_quant + columns_cat],
    columns=columns_cat,
    drop_first=True,
)
X_quant_cat_scaled = StandardScaler().fit_transform(X_quant_cat)
_report_columns(X_quant_cat)

# Target variable.
y = df['is_display_clicked']

13


Index(['contextid', 'campaignctrlast24h', 'dayssincelastvisitdouble',
       'ltf_nbglobaldisplay_4w', 'ltf_nbpartnerdisplayssincelastclick',
       'ltf_nbpartnerdisplay_90d', 'ltf_nbpartnerclick_90d',
       'ltf_nbpartnersales_90d', 'nbdayssincelastclick', 'nbdisplay_1hour',
       'nbdisplayglobalapprox_1d_sum_xdevice', 'display_size',
       'zonecostineuro'],
      dtype='object')

21


Index(['display_env_app_ios', 'display_env_other', 'display_env_web',
       'target_env_2', 'campaignscenario_5', 'campaignscenario_6',
       'campaignscenario_11', 'campaignscenario_12', 'campaignscenario_13',
       'campaignscenario_17', 'campaignvertical_20.0', 'campaignvertical_23.0',
       'campaignvertical_129.0', 'campaignvertical_200002.0',
       'is_interstitial_True', 'device_type_Desktop',
       'device_type_Desktop - Safari', 'device_type_Mobile - Other',
       'device_type_Unknown', 'device_type_iPad', 'device_type_iPhone'],
      dtype='object')

34


Index(['contextid', 'campaignctrlast24h', 'dayssincelastvisitdouble',
       'ltf_nbglobaldisplay_4w', 'ltf_nbpartnerdisplayssincelastclick',
       'ltf_nbpartnerdisplay_90d', 'ltf_nbpartnerclick_90d',
       'ltf_nbpartnersales_90d', 'nbdayssincelastclick', 'nbdisplay_1hour',
       'nbdisplayglobalapprox_1d_sum_xdevice', 'display_size',
       'zonecostineuro', 'display_env_app_ios', 'display_env_other',
       'display_env_web', 'target_env_2', 'campaignscenario_5',
       'campaignscenario_6', 'campaignscenario_11', 'campaignscenario_12',
       'campaignscenario_13', 'campaignscenario_17', 'campaignvertical_20.0',
       'campaignvertical_23.0', 'campaignvertical_129.0',
       'campaignvertical_200002.0', 'is_interstitial_True',
       'device_type_Desktop', 'device_type_Desktop - Safari',
       'device_type_Mobile - Other', 'device_type_Unknown', 'device_type_iPad',
       'device_type_iPhone'],
      dtype='object')

## Méthode LASSO

On choisit `fit_intercept=False` et on ne normalise pas les données (`normalize=False`, la valeur par défaut) car les variables explicatives ont déjà été centrées et réduites avec `StandardScaler`.

In [4]:
def run_LassoCV(alphas_range, X, y, X_columns=False):
    """Pick the Lasso penalty by cross-validation and report the coefficients.

    Fits a ``LassoCV`` over ``alphas_range``, prints the selected alpha, refits a
    plain ``Lasso`` with that alpha on the same data, then displays the absolute
    coefficients sorted in decreasing order and prints how many are non-zero.

    Parameters
    ----------
    alphas_range : array-like
        Candidate values of the penalty ``alpha``.
    X : array-like or DataFrame
        Feature matrix. No intercept is fitted.
    y : array-like
        Target values.
    X_columns : sequence of labels, optional
        Labels used as the index of the coefficient table. When omitted (or left
        at the legacy ``False`` default), ``X.columns`` is used if ``X`` has that
        attribute, otherwise the coefficients are labelled by position.

    Returns
    -------
    None
        Results are only printed / displayed.
    """
    # `normalize` is not passed: it was removed from scikit-learn's Lasso / LassoCV
    # and its former default (False) is the behaviour wanted here.
    lcv = LassoCV(alphas=alphas_range, fit_intercept=False, random_state=0, n_jobs=-1).fit(X, y)

    print(f"Best alpha : {lcv.alpha_}")

    # NOTE(review): LassoCV presumably already holds a model refit with the selected
    # alpha (`lcv.coef_`); the explicit refit is kept so reported numbers do not change.
    lasso = Lasso(fit_intercept=False, alpha=lcv.alpha_)
    lasso.fit(X, y)

    # No labels supplied: use the frame's own column names, or positions for plain arrays.
    if X_columns is None or isinstance(X_columns, bool):
        X_columns = getattr(X, 'columns', None)
        if X_columns is None:
            X_columns = range(lasso.coef_.shape[0])

    coeff_df = pd.DataFrame({'Coefficient abs': np.abs(lasso.coef_)}, index=X_columns)

    display(coeff_df.sort_values(by='Coefficient abs', ascending=False))

    nonzero_count = int((coeff_df['Coefficient abs'] != 0).sum())
    print(f"\nNombre de coeffs non nuls : {nonzero_count}")

### Variables quantitatives

In [5]:
%%time
# First pass on the quantitative features: 100 evenly spaced alphas between 0.001 and 1.
alphas_range = np.linspace(0.001, 1, num=100)
print(alphas_range[:5])
# Column labels are passed explicitly, taken from the unscaled frame X_quant.
run_LassoCV(alphas_range, X_quant_scaled, y, X_columns=X_quant.columns)

[0.001      0.01109091 0.02118182 0.03127273 0.04136364]
Best alpha : 0.001


Unnamed: 0,Coefficient abs
zonecostineuro,0.043662
display_size,0.023909
campaignctrlast24h,0.023156
nbdisplay_1hour,0.007687
nbdayssincelastclick,0.003271
contextid,0.003043
ltf_nbpartnerclick_90d,0.002896
nbdisplayglobalapprox_1d_sum_xdevice,0.00228
ltf_nbpartnerdisplayssincelastclick,3.1e-05
dayssincelastvisitdouble,0.0



Nombre de coeffs non nuls : 9
Wall time: 4.14 s


Le meilleur alpha correspond à la borne inférieure de l'intervalle testé ; nous poursuivons donc la recherche sur un intervalle de valeurs plus petites.

In [6]:
%%time
# Second pass: the previous grid selected its lower bound (0.001), so search
# 100 evenly spaced alphas between 1e-6 and 0.002.
alphas_range = np.linspace(0.000001, 0.002, num=100)
print(alphas_range[:5])
run_LassoCV(alphas_range, X_quant_scaled, y, X_columns=X_quant.columns)

[1.00000000e-06 2.11919192e-05 4.13838384e-05 6.15757576e-05
 8.17676768e-05]
Best alpha : 4.138383838383838e-05


Unnamed: 0,Coefficient abs
zonecostineuro,0.04453
display_size,0.024733
campaignctrlast24h,0.024137
nbdisplay_1hour,0.008331
ltf_nbpartnerclick_90d,0.004725
nbdayssincelastclick,0.004068
contextid,0.004039
nbdisplayglobalapprox_1d_sum_xdevice,0.003056
ltf_nbpartnerdisplay_90d,0.001402
ltf_nbpartnersales_90d,0.000738



Nombre de coeffs non nuls : 13
Wall time: 4.93 s


In [7]:
%%time
# Third pass: finer grid of 100 alphas between 1e-5 and 1e-4, which brackets the
# value selected by the previous pass.
alphas_range = np.linspace(0.00001, 0.0001, num=100)
print(alphas_range[:5])
run_LassoCV(alphas_range, X_quant_scaled, y, X_columns=X_quant.columns)

[1.00000000e-05 1.09090909e-05 1.18181818e-05 1.27272727e-05
 1.36363636e-05]
Best alpha : 3.7272727272727276e-05


Unnamed: 0,Coefficient abs
zonecostineuro,0.044534
display_size,0.024736
campaignctrlast24h,0.024141
nbdisplay_1hour,0.008334
ltf_nbpartnerclick_90d,0.004735
nbdayssincelastclick,0.004072
contextid,0.004043
nbdisplayglobalapprox_1d_sum_xdevice,0.003059
ltf_nbpartnerdisplay_90d,0.001415
ltf_nbpartnersales_90d,0.000743



Nombre de coeffs non nuls : 13
Wall time: 5.02 s


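Avec `alpha = 0.001`, le coefficient de `dayssincelastvisitdouble` est nul ; en revanche, au meilleur alpha trouvé (≈ 3,7e-05), les 13 coefficients sont non nuls. Nous pouvons en conclure que la variable `dayssincelastvisitdouble` est peu utile, sans que le LASSO l'élimine pour autant au meilleur alpha.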

### Variables quantitatives + catégorielles
On ajoute à présent les variables catégorielles (encodées en variables indicatrices).

In [8]:
%%time
# First pass on the quantitative + dummy-encoded features: 100 evenly spaced
# alphas between 0.001 and 1 (same grid as the first quantitative-only pass).
alphas_range = np.linspace(0.001, 1, num=100)
print(alphas_range[:5])
# Column labels are passed explicitly, taken from the unscaled frame X_quant_cat.
run_LassoCV(alphas_range, X_quant_cat_scaled, y, X_columns=X_quant_cat.columns)

[0.001      0.01109091 0.02118182 0.03127273 0.04136364]
Best alpha : 0.001


Unnamed: 0,Coefficient abs
is_interstitial_True,0.039362
zonecostineuro,0.03624
campaignctrlast24h,0.01676
device_type_Desktop,0.007656
display_size,0.007503
nbdisplay_1hour,0.004086
nbdayssincelastclick,0.003941
campaignscenario_13,0.003517
device_type_iPhone,0.002767
ltf_nbpartnerclick_90d,0.002765



Nombre de coeffs non nuls : 25
Wall time: 11 s


In [9]:
%%time
# Second pass: the previous grid selected its lower bound (0.001), so search
# 100 evenly spaced alphas between 1e-6 and 0.002.
alphas_range = np.linspace(0.000001, 0.002, num=100)
print(alphas_range[:5])
run_LassoCV(alphas_range, X_quant_cat_scaled, y, X_columns=X_quant_cat.columns)

[1.00000000e-06 2.11919192e-05 4.13838384e-05 6.15757576e-05
 8.17676768e-05]
Best alpha : 1e-06


Unnamed: 0,Coefficient abs
is_interstitial_True,0.039164
zonecostineuro,0.03714
campaignctrlast24h,0.017124
device_type_Desktop,0.009555
display_size,0.008221
device_type_iPhone,0.006927
display_env_app_ios,0.006209
target_env_2,0.005827
campaignscenario_13,0.005017
nbdayssincelastclick,0.004896



Nombre de coeffs non nuls : 34
Wall time: 17.3 s


In [10]:
%%time
# Third pass: the lower bound (1e-6) was selected again, so search 100 evenly
# spaced alphas between 1e-9 and 2e-6.
# NOTE(review): in the recorded output this grid also selects its lower bound (1e-9),
# so the search has not bracketed an interior optimum; cross-validation appears to
# favour almost no penalty on this feature set.
alphas_range = np.linspace(0.000000001, 0.000002, num=100)
print(alphas_range[:5])
run_LassoCV(alphas_range, X_quant_cat_scaled, y, X_columns=X_quant_cat.columns)

[1.00000000e-09 2.11919192e-08 4.13838384e-08 6.15757576e-08
 8.17676768e-08]
Best alpha : 1e-09


Unnamed: 0,Coefficient abs
is_interstitial_True,0.039163
zonecostineuro,0.037141
campaignctrlast24h,0.017123
device_type_Desktop,0.009557
display_size,0.008222
device_type_iPhone,0.006931
display_env_app_ios,0.006214
target_env_2,0.005831
campaignscenario_13,0.005019
nbdayssincelastclick,0.004897



Nombre de coeffs non nuls : 34
Wall time: 24.1 s


## Variance feature selection

In [11]:
def low_variance_feature_selection(X, threshold):
    """Filter the columns of ``X`` with a ``VarianceThreshold`` and report the variances.

    Prints the positional indices of the columns kept by the selector, displays a
    table of every column's variance (largest first), and returns the transformed
    data containing only the kept columns.

    Parameters
    ----------
    X : DataFrame
        Feature table; its column names label the variance table.
    threshold : float
        Threshold passed to ``VarianceThreshold``.

    Returns
    -------
    The result of ``VarianceThreshold.transform(X)``.
    """
    selector = VarianceThreshold(threshold)
    selector.fit(X)

    kept_positions = selector.get_support(indices=True)
    print(kept_positions)

    variance_table = pd.DataFrame({'Variance': selector.variances_}, index=X.columns)
    variance_table = variance_table.sort_values(by='Variance', ascending=False)
    display(variance_table)

    return selector.transform(X)

In [12]:
# Variance filter on the raw quantitative features with a 0.01 threshold.
# NOTE(review): variances are computed on the unscaled columns, so the threshold depends
# on each feature's units; in the recorded output only index 1 (`campaignctrlast24h`) is dropped.
# The returned array is not assigned, so the notebook shows it as the cell result.
low_variance_feature_selection(X_quant, 0.01)

[ 0  2  3  4  5  6  7  8  9 10 11 12]


Unnamed: 0,Variance
display_size,9444482000.0
ltf_nbglobaldisplay_4w,26070.98
ltf_nbpartnerdisplay_90d,10619.08
nbdisplayglobalapprox_1d_sum_xdevice,4449.996
dayssincelastvisitdouble,2642.414
nbdayssincelastclick,2066.988
ltf_nbpartnerdisplayssincelastclick,759.0539
nbdisplay_1hour,71.76346
zonecostineuro,8.801236
ltf_nbpartnerclick_90d,6.238911


array([[ 6.00000000e+00,  5.25260000e+01,  0.00000000e+00, ...,
         1.52291667e+01,  7.50000000e+04,  1.00199997e+00],
       [ 6.00000000e+00,  9.55000000e-01,  0.00000000e+00, ...,
         1.61322222e+01,  4.00000000e+00,  7.16162348e+00],
       [ 6.00000000e+00,  1.02140000e+01,  0.00000000e+00, ...,
         8.00000000e+00,  0.00000000e+00,  2.54258811e-02],
       ...,
       [ 6.00000000e+00,  1.22880000e+01,  0.00000000e+00, ...,
         5.86554630e+01,  0.00000000e+00,  1.18774921e-01],
       [ 6.00000000e+00,  6.19200000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  1.53600000e+05,  1.57291842e+00],
       [ 1.00000000e+00, -1.00000000e+00,  0.00000000e+00, ...,
         4.00000000e+00,  0.00000000e+00,  1.51292443e+00]])

In [13]:
# Variance filter on the dummy-encoded categorical features with a 0.01 threshold.
# In the recorded output, indices 1 and 17 (`display_env_other`, `device_type_Mobile - Other`)
# are the columns dropped.
# The returned array is not assigned, so the notebook shows it as the cell result.
low_variance_feature_selection(X_cat, 0.01)

[ 0  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 18 19 20]


Unnamed: 0,Variance
campaignscenario_13,0.232464
device_type_Desktop,0.216539
campaignvertical_20.0,0.211358
display_env_web,0.20743
campaignscenario_12,0.113986
device_type_iPhone,0.103931
campaignvertical_129.0,0.102221
target_env_2,0.093956
display_env_app_ios,0.082996
campaignscenario_6,0.060119


array([[0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

## Univariate feature selection

In [14]:
from scipy.stats import pearsonr


def univariate_selection(X, y, p_threshold=0.01):
    """Rank the columns of ``X`` by their Pearson correlation with ``y``.

    Parameters
    ----------
    X : DataFrame
        One column per candidate feature.
    y : Series or array-like
        Target values, one per row of ``X``.
    p_threshold : float, default 0.01
        A feature is flagged as significant when its p-value is strictly below
        this value.

    Returns
    -------
    DataFrame
        One row per feature with the columns ``feature``, ``correlation``,
        ``abs_correlation``, ``significance`` (the p-value returned by
        ``pearsonr``) and ``significant``, sorted by decreasing
        ``abs_correlation``. The row labels are the original column positions.
    """
    target = np.asarray(y)
    features = list(X)

    # pearsonr returns (correlation, p-value) for each feature/target pair.
    results = [pearsonr(X[feature].values, target) for feature in features]
    correlation = [result[0] for result in results]
    significance = [result[1] for result in results]

    ranking = pd.DataFrame({
        'feature': features,
        'correlation': correlation,
        'abs_correlation': np.abs(correlation),
        'significance': significance,
    })
    ranking['significant'] = ranking['significance'] < p_threshold

    return ranking.sort_values(by='abs_correlation', ascending=False)

In [15]:
# Pearson correlation of each raw quantitative feature with the target y, strongest first.
univariate_selection(X_quant, y)

Unnamed: 0,feature,correlation,abs_correlation,significance,significant
12,zonecostineuro,0.214615,0.214615,0.0,True
11,display_size,0.145901,0.145901,0.0,True
1,campaignctrlast24h,0.104729,0.104729,0.0,True
9,nbdisplay_1hour,-0.045473,0.045473,0.0,True
10,nbdisplayglobalapprox_1d_sum_xdevice,-0.032693,0.032693,0.0,True
8,nbdayssincelastclick,-0.024608,0.024608,3.742619e-214,True
6,ltf_nbpartnerclick_90d,0.023766,0.023766,6.851296e-200,True
2,dayssincelastvisitdouble,-0.022727,0.022727,5.873928e-183,True
0,contextid,0.021707,0.021707,4.346806e-167,True
5,ltf_nbpartnerdisplay_90d,0.00492,0.00492,4.269942e-10,True


In [16]:
# Pearson correlation of each dummy-encoded categorical feature with the target y, strongest first.
univariate_selection(X_cat, y)

Unnamed: 0,feature,correlation,abs_correlation,significance,significant
14,is_interstitial_True,0.246777,0.246777,0.0,True
2,display_env_web,-0.087985,0.087985,0.0,True
3,target_env_2,0.067383,0.067383,0.0,True
15,device_type_Desktop,-0.05981,0.05981,0.0,True
0,display_env_app_ios,0.056112,0.056112,0.0,True
8,campaignscenario_13,-0.048137,0.048137,0.0,True
9,campaignscenario_17,0.040227,0.040227,0.0,True
19,device_type_iPad,0.034953,0.034953,0.0,True
20,device_type_iPhone,0.023572,0.023572,1.145756e-196,True
7,campaignscenario_12,-0.018978,0.018978,3.478621e-128,True


In [17]:
# Elapsed wall-clock time since `t` was set in the first cell, formatted as HH:MM:SS
# (the hour field wraps around for runs longer than 24 hours).
print(f"Temps d'exécution total : {time.strftime('%H:%M:%S', time.gmtime(time.time()-t))}")

Temps d'exécution total : 00:01:28
