# Feature selection

In [1]:
import os
import time
from math import log

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from IPython.display import display

# Wall-clock start time; the last cell subtracts it from time.time() to report the total run time.
t = time.time()

In [2]:
# Load the preprocessed training set and shuffle the rows.
# The shuffle is seeded so that the row order is identical on every
# "Restart & Run All"; an unseeded shuffle reorders the rows on each run.
df = pd.read_csv('data/df_train_prepro.csv').sample(frac=1, random_state=0)
df.shape

(1610494, 50)

In [3]:
# Candidate explanatory variables submitted to the selection procedure.
feature_columns = [
    'contextid',
    'dayssincelastvisitdouble',
    'campaignctrlast24h',
    'nbdisplay_1hour',
    'nbdayssincelastclick',
    'display_size',
    'ltf_nbpartnerdisplayssincelastclick',
    'ltf_nbglobaldisplay_4w',
    'ltf_nbpartnerclick_90d',
    'ltf_nbpartnerdisplay_90d',
    'ltf_nbpartnersales_90d',
    'nbdisplayglobalapprox_1d_sum_xdevice',
]
# Column used as the target of the regression.
target_column = 'is_display_clicked'

X = df[feature_columns]
y = df[target_column]

In [4]:
# Labels of the numeric feature columns; run_LassoCV uses them to index the coefficient table.
columns_scaled = X.select_dtypes(include=np.number).columns

# Standardise the features: learn the scaling parameters on X, then apply them to X.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

## Méthode LASSO

On choisit `fit_intercept=False` et on n'active pas la normalisation interne du modèle (`normalize=False`) car les variables explicatives ont déjà été centrées et réduites avec `StandardScaler`.

In [5]:
def run_LassoCV(alphas_range, X_scaled, y, feature_names=None):
    """Pick the Lasso penalty by cross-validation and show the resulting coefficients.

    Runs ``LassoCV`` over ``alphas_range``, prints the selected alpha, refits a
    plain ``Lasso`` with that alpha on the same data, displays the coefficients
    sorted from largest to smallest and prints how many of them are non-zero.

    Parameters
    ----------
    alphas_range : array-like
        Candidate alpha values handed to ``LassoCV``.
    X_scaled : array-like of shape (n_samples, n_features)
        Feature matrix. Both models are fitted with ``fit_intercept=False``.
    y : array-like
        Target values.
    feature_names : sequence, optional
        Labels used as the index of the coefficient table, one per column of
        ``X_scaled``. When omitted, the notebook-level ``columns_scaled`` is
        used, which keeps existing three-argument calls working unchanged.

    Returns
    -------
    None
        Results are printed / displayed only.

    Raises
    ------
    ValueError
        If the number of labels does not match the number of columns of
        ``X_scaled``.
    """
    if feature_names is None:
        # Backward-compatible default: labels computed in the scaling cell.
        feature_names = columns_scaled

    # Fail fast: without this check a label/shape mismatch would only surface
    # when building the coefficient table, i.e. after both fits have run.
    n_features = np.shape(X_scaled)[1]
    if len(feature_names) != n_features:
        raise ValueError(
            f"feature_names has {len(feature_names)} entries but X_scaled has {n_features} columns"
        )

    # `normalize` is deliberately not passed: the argument was removed from
    # scikit-learn's linear models in version 1.2 (passing it raises TypeError)
    # and its former default already was False.
    lcv = LassoCV(alphas=alphas_range, fit_intercept=False, random_state=0, n_jobs=-1)
    lcv.fit(X_scaled, y)

    print(f"Best alpha : {lcv.alpha_}")

    # NOTE(review): LassoCV also exposes refit coefficients as `lcv.coef_`;
    # the explicit refit is kept so the reported numbers come from the same
    # code path as before.
    lasso = Lasso(fit_intercept=False, alpha=lcv.alpha_)
    lasso.fit(X_scaled, y)

    coeff_df = pd.DataFrame({'Coefficient': lasso.coef_}, index=feature_names)

    display(coeff_df.sort_values(by='Coefficient', ascending=False))

    n_nonzero = int((coeff_df['Coefficient'] != 0).sum())
    print(f"\nNombre de coeffs non nuls : {n_nonzero}")

In [6]:
%%time
# Coarse grid: 100 evenly spaced alpha candidates from 0.001 to 1.
alphas_range = np.linspace(start=0.001, stop=1, num=100)
print(alphas_range[0:5])  # preview of the first grid values
run_LassoCV(alphas_range, X_scaled=X_scaled, y=y)

[0.001      0.01109091 0.02118182 0.03127273 0.04136364]
Best alpha : 0.001


Unnamed: 0,Coefficient
display_size,0.033157
campaignctrlast24h,0.022455
ltf_nbpartnerclick_90d,0.003694
contextid,0.00262
ltf_nbglobaldisplay_4w,-0.0
ltf_nbpartnerdisplay_90d,-0.0
ltf_nbpartnersales_90d,-0.0
ltf_nbpartnerdisplayssincelastclick,-0.000266
dayssincelastvisitdouble,-0.002458
nbdisplayglobalapprox_1d_sum_xdevice,-0.004043



Nombre de coeffs non nuls : 9
Wall time: 5.66 s


In [7]:
%%time
# Fine grid: 1000 evenly spaced alpha candidates from 1e-5 to 2e-3.
alphas_range = np.linspace(start=1e-5, stop=2e-3, num=1000)
print(alphas_range[0:5])  # preview of the first grid values
run_LassoCV(alphas_range, X_scaled=X_scaled, y=y)

[1.0000000e-05 1.1991992e-05 1.3983984e-05 1.5975976e-05 1.7967968e-05]
Best alpha : 4.386386386386386e-05


Unnamed: 0,Coefficient
display_size,0.0341
campaignctrlast24h,0.023158
ltf_nbpartnerclick_90d,0.005561
contextid,0.003797
ltf_nbglobaldisplay_4w,8.3e-05
ltf_nbpartnerdisplayssincelastclick,-0.000525
ltf_nbpartnersales_90d,-0.000726
ltf_nbpartnerdisplay_90d,-0.001532
dayssincelastvisitdouble,-0.003324
nbdisplayglobalapprox_1d_sum_xdevice,-0.004769



Nombre de coeffs non nuls : 12
Wall time: 2min 42s


In [8]:
# Seconds elapsed since `t` was recorded, rendered as HH:MM:SS.
elapsed_seconds = time.time() - t
elapsed_hms = time.strftime('%H:%M:%S', time.gmtime(elapsed_seconds))
print(f"Temps d'exécution total : {elapsed_hms}")

Temps d'exécution total : 00:03:13
