# Feature selection

In [1]:
import os
import time
from math import log

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from IPython.display import display

# Notebook-wide start timestamp; a later cell subtracts it from time.time() to report the elapsed time.
t = time.time()

In [2]:
# Load the preprocessed training set and shuffle its rows.
# The shuffle is seeded so that "Restart Kernel -> Run All" yields the same row
# order every time; order-dependent steps further down (e.g. the contiguous
# cross-validation folds built by LassoCV) then become reproducible as well.
df = pd.read_csv('data/df_train_prepro.csv').sample(frac=1, random_state=0)
df.shape

(1610494, 48)

In [3]:
# Numeric candidate features (X) and the click target (y), both taken from df.
X = df.loc[:, [
    'contextid',
    'dayssincelastvisitdouble',
    'campaignctrlast24h',
    'nbdisplay_1hour',
    'nbdayssincelastclick',
    'display_size',
    'ltf_nbpartnerdisplayssincelastclick',
    'ltf_nbglobaldisplay_4w',
    'ltf_nbpartnerclick_90d',
    'ltf_nbpartnerdisplay_90d',
    'ltf_nbpartnersales_90d',
    'nbdisplayglobalapprox_1d_sum_xdevice',
    'zonecostineuro',
]]
y = df.loc[:, 'is_display_clicked']

In [4]:
# Labels of the numeric columns of X; run_LassoCV uses them to label the coefficients.
columns_scaled = X.select_dtypes(include=np.number).columns

# Standardise the columns of X; fit_transform returns a NumPy array, so the
# column labels are kept separately in `columns_scaled`.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Méthode LASSO

On choisit `fit_intercept=False` et aucune normalisation supplémentaire (`normalize=False`) car les variables explicatives ont déjà été centrées et réduites par `StandardScaler`.

In [5]:
def run_LassoCV(alphas_range, X_scaled, y, feature_names=None):
    """Pick the Lasso penalty by cross-validation and show the fitted coefficients.

    Parameters
    ----------
    alphas_range : array-like
        Candidate ``alpha`` values handed to ``LassoCV``.
    X_scaled : array-like of shape (n_samples, n_features)
        Feature matrix, passed to the estimators as is. No intercept is
        fitted, so the columns are expected to be centred already.
    y : array-like of shape (n_samples,)
        Target values.
    feature_names : sequence of str, optional
        Row labels of the coefficient table, one per column of ``X_scaled``.
        When omitted, the notebook-level ``columns_scaled`` is used.

    Returns
    -------
    None
        The selected alpha and the number of non-zero coefficients are
        printed; the coefficient table is rendered with ``display``.
    """
    if feature_names is None:
        # Backward-compatible default: labels computed alongside X_scaled.
        feature_names = columns_scaled

    # `normalize` is deliberately not passed: False was already its default,
    # and the argument was removed in scikit-learn 1.2, where passing it raises
    # a TypeError.
    lcv = LassoCV(
        alphas=alphas_range,
        fit_intercept=False,
        random_state=0,
        n_jobs=-1,
    ).fit(X_scaled, y)

    print(f"Best alpha : {lcv.alpha_}")

    # Refit a plain Lasso on the full data with the selected penalty.
    lasso = Lasso(fit_intercept=False, alpha=lcv.alpha_)
    lasso.fit(X_scaled, y)
    coeff_df = pd.DataFrame(lasso.coef_, index=feature_names, columns=['Coefficient'])

    display(coeff_df.sort_values(by='Coefficient', ascending=False))

    # Signed zeros (-0.0) compare equal to 0 and are therefore not counted.
    print(f"\nNombre de coeffs non nuls : {np.count_nonzero(lasso.coef_)}")

In [6]:
%%time
# Candidate regularisation strengths: 100 evenly spaced values from 0.001 to 1.
# NOTE(review): the recorded run picks alpha = 0.001, the lower bound of this
# grid, so the optimum may lie below the explored range - confirm with a finer grid.
alphas_range = np.linspace(0.001, 1, num=100)
print(alphas_range[:5])  # preview of the first five candidates
run_LassoCV(alphas_range, X_scaled, y)

[0.001      0.01109091 0.02118182 0.03127273 0.04136364]
Best alpha : 0.001


Unnamed: 0,Coefficient
zonecostineuro,0.043662
display_size,0.023909
campaignctrlast24h,0.023155
ltf_nbpartnerclick_90d,0.002896
dayssincelastvisitdouble,-0.0
ltf_nbglobaldisplay_4w,-0.0
ltf_nbpartnerdisplay_90d,-0.0
ltf_nbpartnersales_90d,-0.0
ltf_nbpartnerdisplayssincelastclick,-3.1e-05
nbdisplayglobalapprox_1d_sum_xdevice,-0.00228



Nombre de coeffs non nuls : 9
Wall time: 16.8 s


## Variance feature selection

In [7]:
from sklearn.feature_selection import VarianceThreshold

In [8]:
def low_variance_feature_selection(X, threshold):
    """Drop the columns of ``X`` whose variance does not exceed ``threshold``.

    Prints the positional indices of the retained columns, displays every
    column's variance (largest first), and returns the reduced matrix.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its column labels index the variance report.
    threshold : float
        Variance cut-off forwarded to ``VarianceThreshold``.

    Returns
    -------
    numpy.ndarray
        ``X`` restricted to the retained columns.
    """
    selector = VarianceThreshold(threshold).fit(X)
    print(selector.get_support(indices=True))

    variance_report = pd.DataFrame({'Variance': selector.variances_}, index=X.columns)
    display(variance_report.sort_values(by='Variance', ascending=False))

    return selector.transform(X)

In [9]:
# Variance screen on the unscaled numeric features with a 0.01 cut-off;
# the returned matrix is not stored, only shown as the cell output.
low_variance_feature_selection(X, 0.01)

[ 0  1  3  4  5  6  7  8  9 10 11 12]


Unnamed: 0,Variance
display_size,9444482000.0
ltf_nbglobaldisplay_4w,26070.98
ltf_nbpartnerdisplay_90d,10619.08
nbdisplayglobalapprox_1d_sum_xdevice,4449.996
dayssincelastvisitdouble,2642.414
nbdayssincelastclick,2066.988
ltf_nbpartnerdisplayssincelastclick,759.0539
nbdisplay_1hour,71.76346
zonecostineuro,8.801236
ltf_nbpartnerclick_90d,6.238911


array([[4.00000000e+00, 2.08950000e+01, 4.00000000e+00, ...,
        0.00000000e+00, 8.00000000e+00, 4.50431794e-01],
       [5.00000000e+00, 2.77200000e+00, 8.00000000e+00, ...,
        0.00000000e+00, 9.90000000e+01, 4.35971946e-01],
       [8.00000000e+00, 3.13000000e-01, 5.60000000e+01, ...,
        0.00000000e+00, 6.39877778e+01, 1.50000000e+00],
       ...,
       [7.00000000e+00, 1.17910000e+01, 5.00000000e+00, ...,
        0.00000000e+00, 9.72637500e+01, 9.12121758e-02],
       [9.00000000e+00, 5.13460000e+01, 0.00000000e+00, ...,
        0.00000000e+00, 2.00000000e+00, 6.21276617e-01],
       [8.00000000e+00, 2.10000000e-02, 3.50000000e+01, ...,
        0.00000000e+00, 4.00000000e+01, 1.53559005e+00]])

In [10]:
# Categorical columns of df that get one-hot encoded further down.
columns_cat = [
    'display_env',
    'target_env',
    'campaignscenario',
    'campaignvertical',
    'is_interstitial',
    'device_type',
]

In [11]:
# One-hot encode the categorical columns, dropping the first level of each,
# then show how many dummy columns were produced.
df_cat = pd.get_dummies(
    df.loc[:, columns_cat],
    columns=columns_cat,
    drop_first=True,
)
df_cat.shape[1]

21

In [12]:
# Y is a second name for df_cat (same object, not a copy): the dummy-encoded
# categorical features. It is not a target variable.
Y = df_cat
# Same 0.01 variance screen as for the numeric features.
low_variance_feature_selection(Y, 0.01)

[ 0  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 18 19 20]


Unnamed: 0,Variance
campaignscenario_13,0.232464
device_type_Desktop,0.216539
campaignvertical_20.0,0.211358
display_env_web,0.20743
campaignscenario_12,0.113986
device_type_iPhone,0.103931
campaignvertical_129.0,0.102221
target_env_2,0.093956
display_env_app_ios,0.082996
campaignscenario_6,0.060119


array([[0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 1, ..., 0, 0, 1],
       [1, 0, 1, ..., 0, 0, 1],
       [0, 1, 0, ..., 0, 0, 0]], dtype=uint8)

In [13]:
# Wall-clock time elapsed since `t` was recorded, formatted as HH:MM:SS.
print(f"Temps d'exécution total : {time.strftime('%H:%M:%S', time.gmtime(time.time()-t))}")

Temps d'exécution total : 00:00:56


## Univariate feature selection

In [14]:
# Public import path: `scipy.stats.stats` is a deprecated private namespace.
from scipy.stats import pearsonr


def univariate_selection(X, y):
    """Rank the columns of ``X`` by absolute Pearson correlation with ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        One candidate feature per column. Values are cast to float, so
        boolean or small-integer dummy columns are accepted.
    y : array-like
        Target values, one per row of ``X``.

    Returns
    -------
    pandas.DataFrame
        One row per feature with the columns ``feature``, ``correlation``,
        ``abs_correlation``, ``significance`` (the p-value returned by
        ``pearsonr``) and ``significant`` (True when the p-value is below
        0.01). Rows are sorted by ``abs_correlation`` in descending order and
        keep the positional index of the feature in ``X`` as their label.
    """
    # Convert the target once instead of on every loop iteration.
    target = np.asarray(y, dtype=float)

    features = list(X)
    correlation = []
    significance = []
    for feature in features:
        r_value, p_value = pearsonr(np.asarray(X[feature], dtype=float), target)
        correlation.append(r_value)
        significance.append(p_value)

    result = pd.DataFrame({
        'feature': features,
        'correlation': correlation,
        'abs_correlation': np.abs(correlation),
        'significance': significance,
    })
    result['significant'] = result['significance'] < 0.01  # flag p < 0.01

    # Return a sorted copy rather than sorting in place.
    return result.sort_values(by='abs_correlation', ascending=False)

In [15]:
# Pearson correlation of each numeric feature in X with the click target y.
univariate_selection(X, y)

Unnamed: 0,feature,correlation,abs_correlation,significance,significant
12,zonecostineuro,0.214615,0.214615,0.0,True
5,display_size,0.145901,0.145901,0.0,True
2,campaignctrlast24h,0.104729,0.104729,0.0,True
3,nbdisplay_1hour,-0.045473,0.045473,0.0,True
11,nbdisplayglobalapprox_1d_sum_xdevice,-0.032693,0.032693,0.0,True
4,nbdayssincelastclick,-0.024608,0.024608,3.742619e-214,True
8,ltf_nbpartnerclick_90d,0.023766,0.023766,6.851296e-200,True
1,dayssincelastvisitdouble,-0.022727,0.022727,5.873928e-183,True
0,contextid,0.021707,0.021707,4.346806e-167,True
9,ltf_nbpartnerdisplay_90d,0.00492,0.00492,4.269942e-10,True


In [16]:
# Same ranking for the dummy-encoded categorical features (Y is df_cat).
univariate_selection(Y, y)

Unnamed: 0,feature,correlation,abs_correlation,significance,significant
14,is_interstitial_True,0.246777,0.246777,0.0,True
2,display_env_web,-0.087985,0.087985,0.0,True
3,target_env_2,0.067383,0.067383,0.0,True
15,device_type_Desktop,-0.05981,0.05981,0.0,True
0,display_env_app_ios,0.056112,0.056112,0.0,True
8,campaignscenario_13,-0.048137,0.048137,0.0,True
9,campaignscenario_17,0.040227,0.040227,0.0,True
19,device_type_iPad,0.034953,0.034953,0.0,True
20,device_type_iPhone,0.023572,0.023572,1.145756e-196,True
7,campaignscenario_12,-0.018978,0.018978,3.478621e-128,True
