In [None]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTENC, RandomOverSampler
from imblearn.pipeline import Pipeline as Pipeline_imb
from IPython.display import display
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import (GridSearchCV, ParameterGrid,
                                     RandomizedSearchCV)
from statsmodels.discrete.discrete_model import Logit
from statsmodels.tools import add_constant

from tools import (COLUMNS_QUANT, COLUMNS_CAT,
                   Modelisation, datasets, SearchCV, restauration_CV,
                   graph_2scores_CV, graph_3scores_CV, graph_param_CV,
                   best_score_CV)

## Matplotlib backend options (uncomment the one that is needed):
# Interactive display (notably 3D) inside the notebook
# %matplotlib widget

# Interactive display (notably 3D) in a Qt window outside the notebook
# %matplotlib qt

# Plain static display inside the notebook
# %config InlineBackend.figure_format = 'png'

# Wall-clock start time; the final cell subtracts it to report the total run time.
t = time.time()

In [None]:
# Load the preprocessed training set and shuffle all rows (frac=1).
# The shuffle is seeded so that Restart & Run All reproduces the same row
# order; 1234 is the seed already used for the over-samplers in this notebook.
df = pd.read_csv('data/df_train_prepro.csv').sample(frac=1, random_state=1234)
df.shape

**Définition des variables quantitatives, des variables catégorielles et des datasets :**

In [None]:
# Build the modelling datasets from the shuffled frame with the project helper.
# The returned mapping is indexed below by 'X_quant', 'X_cat', 'X' and 'y'.
datasets_df = datasets(df)
X_quant = datasets_df['X_quant']  # quantitative features (used in the "Variables quantitatives" section)
X_cat = datasets_df['X_cat']  # categorical features (used in the "Variables catégorielles" section)
X = datasets_df['X']  # presumably quantitative + categorical features together — confirm in tools.datasets
y = datasets_df['y']  # target passed to every Modelisation call below

Afin de comparer nos modèles en termes de performances brutes et de temps d'exécution, il est fondamental de fixer quelques métriques de référence. Ici, il faut surtout éviter les faux négatifs, c'est-à-dire prédire un non-clic (0) alors qu'il s'agit en réalité d'un clic (1), quitte à prédire trop de 1 (faux positifs). Autrement dit, il faut maximiser le recall, $\frac{TP}{TP + FN}$, et la NPV (*Negative Predictive Value*), $\frac{TN}{TN + FN}$, afin de limiter les erreurs de type II. Le F1-score, moyenne harmonique du recall et de la précision, est également pertinent, de même que les scores $F_\beta$ avec $\beta > 1$ (F3 et F5 utilisés plus bas), qui accordent davantage de poids au recall.

# Régression logistique
## Variables quantitatives

In [None]:
# Baseline: default logistic regression on the quantitative features only.
# scaling=True is forwarded to the project Modelisation helper
# (presumably it standardises the features before fitting — confirm in tools).
m = Modelisation(X_quant, y, LogisticRegression(), scaling=True)
# Display the confusion matrix, then the metric scores computed by the helper.
m.show_conf_matrix()
m.show_metrics_score()

In [None]:
# ROC curve of the quantitative-only baseline fitted in the previous cell.
m.show_ROC()

In [None]:
# Tabulate the fitted coefficients of the current model `m`, one row per
# quantitative feature (coef_ is transposed so features run down the index).
coeff_df = pd.DataFrame(
    m.model.coef_.T,
    index=X_quant.columns,
    columns=['Coefficient'],
)
coeff_df

**Problème :** pas de scaling dans la cellule suivante

In [None]:
# Fit a statsmodels Logit on the quantitative features to get the full
# inference summary (standard errors, p-values) that scikit-learn does not print.
# Logit and add_constant are imported in the first cell with the other imports.
# NOTE(review): the markdown note just above says this cell has no scaling, yet
# it reads the 'X_quant_scaled' entry — confirm which of the two is up to date.
X_quant_scaled = datasets_df['X_quant_scaled']
# statsmodels does not add an intercept by itself; add_constant inserts one.
X_ = add_constant(X_quant_scaled)
print(Logit(y, X_).fit().summary())

Avec les variables sélectionnées par LASSO (la sélection univariée n'écarte en plus qu'une variable, déjà exclue par le LASSO) :

In [None]:
# Feature subset kept for the LASSO-based model (see the markdown note above),
# taken straight from the raw frame `df` rather than from the datasets() output.
X1 = df[['contextid',
         'zonecostineuro',
         'campaignctrlast24h',
         'ltf_nbpartnerclick_90d',
         'nbdisplay_1hour',
         'nbdayssincelastclick',
         'display_size',
         'nbdisplayglobalapprox_1d_sum_xdevice']]
# Target column read from the same frame so that rows stay aligned with X1.
y1 = df['is_display_clicked']

In [None]:
# Default logistic regression restricted to the LASSO feature subset (X1, y1).
m1 = Modelisation(X1, y1, LogisticRegression(), scaling=True)
m1.show_conf_matrix()
m1.show_metrics_score()

## Variables catégorielles

In [None]:
# Number of distinct values per categorical column (COLUMNS_CAT is already
# imported in the first cell, so it is not re-imported here).
# nunique(dropna=False) counts missing values as a single category, whereas
# len(set(...)) can count every float NaN separately because NaN != NaN.
for column in COLUMNS_CAT:
    print(f"{column} : {df[column].nunique(dropna=False)}")

In [None]:
# Default logistic regression on the categorical features only.
# NOTE(review): scaling=True is also applied here; confirm in tools that
# scaling the categorical encoding is intended.
m = Modelisation(X_cat, y, LogisticRegression(), scaling=True)
m.show_conf_matrix()
m.show_metrics_score()

## Variables quantitatives + catégorielles

In [None]:
# Default logistic regression on the full feature matrix X.
m = Modelisation(X, y, LogisticRegression(), scaling=True)
m.show_conf_matrix()
m.show_metrics_score()

In [None]:
# Same model with class_weight='balanced': scikit-learn then weights each class
# inversely to its frequency, which penalises errors on the rarer class more.
m = Modelisation(X, y, LogisticRegression(class_weight='balanced'), scaling=True)
m.show_conf_matrix()
m.show_metrics_score()

Avec les variables sélectionnées par RFECV (*Recursive Feature Elimination with Cross-Validation*) :

In [None]:
# Feature subset kept for the RFECV-based model (see the markdown note above).
# Unlike X1, it is taken from X and includes columns such as
# 'device_type_Desktop' that look like indicator columns of encoded categories
# (presumably created by tools.datasets — confirm).
X2 = X[['zonecostineuro',
         'campaignctrlast24h',
         'nbdisplay_1hour',
         'nbdayssincelastclick',
         'display_size',
         'is_interstitial_True',
         'device_type_Desktop',
         'device_type_iPhone',
         'display_env_app_ios',
         'target_env_2',
         'campaignscenario_13']]

In [None]:
# Default logistic regression restricted to the RFECV feature subset X2.
m2 = Modelisation(X2, y, LogisticRegression(), scaling=True)
m2.show_conf_matrix()
m2.show_metrics_score()

## Comparaison par rapport aux valeurs de C 

In [None]:
# Single run with C=0.2: in scikit-learn C is the inverse regularisation
# strength, so this is a stronger penalty than the default C=1.0.
m = Modelisation(X, y, LogisticRegression(C=0.2), scaling=True)
m.show_conf_matrix()
m.show_metrics_score()

In [None]:
%%time
C, list_recall = [], []
for c in np.logspace(-5.5, 5.5, num=11): 
    m = Modelisation(X, y, LogisticRegression(C=c), scaling=True)
    C.append(c)
    list_recall.append(m.recall)
plt.plot(C, list_recall)
plt.xscale('log')
plt.title("Recall de la régression logistique en fonction des valeurs de C")
plt.show()

## Tuning des hyperparamètres avec GridSearchCV

In [None]:
# Estimator whose hyperparameters are searched.
model = LogisticRegression()

# Search grid: 50 log-spaced values of C, crossed with no weighting, 'balanced'
# weighting, and a range of explicit weights on the positive class (label 1).
cv_params = {
    "C": np.logspace(-5, 4, 50),
    "class_weight": [None, 'balanced']
                    + [{0: 1, 1: w} for w in (2, 4, 8, 10, 12, 14, 16.5)],
}

# Scorers evaluated for every candidate: three built-in metric names, plus
# F-beta scorers with beta=3 and beta=5, which weight recall above precision.
scoring = {name: name for name in ('recall', 'precision', 'f1')}
scoring.update({f'f{beta}': make_scorer(fbeta_score, beta=beta) for beta in (3, 5)})

# Number of hyperparameter combinations in the grid.
print(len(ParameterGrid(cv_params)))

In [None]:
# Launch the project search helper on the plain logistic regression with the
# grid and scorers defined in the previous cell (random=False presumably means
# an exhaustive grid rather than a randomised search — confirm in tools.SearchCV).
# NOTE(review): n_jobs=56 is tied to the machine this was run on; adjust it to
# the number of cores available.
SearchCV(model, cv_params, data_frac=1, scaling=True, scoring=scoring, random=False, n_jobs=56)

In [None]:
# Reload stored search results by name; the 450 in the name equals the grid
# size printed above (50 values of C x 9 class_weight settings).
# NOTE(review): presumably the output saved by the SearchCV call above — confirm in tools.
dico, results = restauration_CV('LR_CV_Grid_450_1')

In [None]:
# Plot the 'recall' and 'f1' scores of the restored search results against each
# other with the project helper (s is presumably the marker size — confirm).
graph_2scores_CV(dico, results, 'recall', 'f1', s=20)

In [None]:
# Same two-score plot for 'recall' and 'precision' (the trade-off of interest).
graph_2scores_CV(dico, results, 'recall', 'precision', s=20)

In [None]:
# Three-score plot of 'recall', 'precision' and 'f1' for the restored results.
graph_3scores_CV(dico, results, 'recall', 'precision', 'f1', s=15)

In [None]:
# Three-score plot comparing the F-beta family: 'f1', 'f3' and 'f5'.
graph_3scores_CV(dico, results, 'f1', 'f3', 'f5', s=15)

In [None]:
# Per-hyperparameter plots of the search results; the xscale mapping requests a
# log axis for 'C', which matches the log-spaced grid used for that parameter.
graph_param_CV(dico, results, xscale={'C': 'log'}, ncols=2, height=5, width=6)

In [None]:
# Pick the hyperparameters of the best candidate according to the 'f5' scorer
# (selection rule presumably "highest mean score" — confirm in tools.best_score_CV).
# The result is unpacked into LogisticRegression in the next cell.
best_params = best_score_CV(dico, results, 'f5')

In [None]:
# Refit the logistic regression with the hyperparameters selected on F5 and
# report its confusion matrix and metric scores.
m = Modelisation(X, y, LogisticRegression(**best_params), scaling=True)
m.show_conf_matrix()
m.show_metrics_score()

## Régression logistique avec sur-échantillonnage

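**⚠ Attention : seule la base d'entraînement est sur-échantillonnée** (un sampler placé dans un `Pipeline` d'imblearn n'est appliqué que lors du `fit`, jamais lors de la prédiction).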

In [None]:
# Reload the data and keep a random 5% sample for the over-sampling experiments.
# NOTE: this rebinds `df`, so every later cell works on the 5% sample.
# The sample is seeded (same 1234 seed as the over-samplers below) so that the
# results of this section are reproducible across runs.
df = pd.read_csv('data/df_train_prepro.csv').sample(frac=0.05, random_state=1234)
df.shape

In [None]:
# Rebuild the datasets from the 5% sample; verbose=False is passed to the
# helper (presumably to silence its printed summary — confirm in tools.datasets).
# NOTE: X and y are rebound here and now refer to the sampled data.
datasets_df = datasets(df, verbose=False)
X = datasets_df['X']
y = datasets_df['y']

In [None]:
# Positional indices of the categorical columns handed to SMOTENC: every column
# located after the first len(COLUMNS_QUANT) ones.
# NOTE(review): this assumes X lists the quantitative columns first — confirm in tools.datasets.
categorical_features = list(range(len(COLUMNS_QUANT), X.shape[1]))
print(categorical_features)

# SMOTENC over-sampler; sampling_strategy=1 requests as many minority samples
# as majority samples after resampling.
over = SMOTENC(
    categorical_features=categorical_features,
    sampling_strategy=1,
    k_neighbors=5,
    random_state=1234,
)

# imblearn pipeline: over-sample first, then fit the logistic regression.
pipeline = Pipeline_imb(steps=[
    ('over', over),
    ('classifier', LogisticRegression()),
])

In [None]:
# Evaluate the SMOTENC + logistic regression pipeline on the sampled data.
m = Modelisation(X, y, pipeline, scaling=True)
m.show_conf_matrix()
m.show_metrics_score()

In [None]:
# Random over-sampler; sampling_strategy=1 requests as many minority samples as
# majority samples after resampling.
over = RandomOverSampler(sampling_strategy=1, random_state=1234)

# imblearn pipeline: over-sample first, then fit the logistic regression.
pipeline = Pipeline_imb(steps=[
    ('over', over),
    ('classifier', LogisticRegression()),
])

In [None]:
# Evaluate the RandomOverSampler + logistic regression pipeline on the sampled data.
m = Modelisation(X, y, pipeline, scaling=True)
m.show_conf_matrix()
m.show_metrics_score()

### GridSearchCV

In [None]:
# Over-sampler whose sampling_strategy is left unset here because it is tuned
# by the grid below.
over = RandomOverSampler(random_state=1234)

# The classifier step is named 'model' so that the grid keys use the
# 'model__<param>' prefix.
pipeline = Pipeline_imb(steps=[
    ('over', over),
    ('model', LogisticRegression()),
])

# Search grid: 5 over-sampling ratios x 50 log-spaced values of C x 9 class
# weightings (none, 'balanced', and explicit weights on the positive class).
cv_params = {
    "over__sampling_strategy": [0.2, 0.4, 0.6, 0.8, 1],
    "model__C": np.logspace(-5, 4, 50),
    "model__class_weight": [None, 'balanced']
                           + [{0: 1, 1: w} for w in (2, 4, 8, 10, 12, 14, 16.5)],
}

# Number of hyperparameter combinations in the grid.
print(len(ParameterGrid(cv_params)))

In [None]:
# Launch the project search helper on the over-sampling pipeline, reusing the
# `scoring` mapping defined for the first grid search.
# NOTE(review): n_jobs=56 is tied to the machine this was run on; adjust it to
# the number of cores available.
SearchCV(pipeline, cv_params, data_frac=1, scaling=True, random=False, scoring=scoring, n_jobs=56)

In [None]:
# Reload stored search results by name; the 2250 in the name equals the grid
# size printed above (5 sampling ratios x 50 values of C x 9 class_weight settings).
# NOTE(review): presumably the output saved by the SearchCV call above — confirm in tools.
dico, results = restauration_CV('RandomOver_LR_CV_Grid_2250_1')

In [None]:
# Plot the 'recall' and 'f1' scores of the over-sampling search results against
# each other (s is presumably the marker size — confirm in tools).
graph_2scores_CV(dico, results, 'recall', 'f1', s=20)

In [None]:
# Same two-score plot for 'recall' and 'precision' on the over-sampling results.
graph_2scores_CV(dico, results, 'recall', 'precision', s=20)

In [None]:
# Three-score plot of 'recall', 'precision' and 'f1' for the over-sampling results.
graph_3scores_CV(dico, results, 'recall', 'precision', 'f1', s=15)

In [None]:
# Three-score plot comparing 'f1', 'f3' and 'f5' for the over-sampling results.
graph_3scores_CV(dico, results, 'f1', 'f3', 'f5', s=15)

In [None]:
# Per-hyperparameter plots of the over-sampling search results.
# NOTE(review): the grid for this search names the parameter 'model__C', while
# xscale is keyed on 'C'; confirm in tools.graph_param_CV that the step prefix
# is stripped, otherwise the log scale may not be applied.
graph_param_CV(dico, results, xscale={'C': 'log'}, ncols=2, height=5, width=6)

In [None]:
# Pick the hyperparameters of the best candidate according to the 'f5' scorer
# (selection rule presumably "highest mean score" — confirm in tools.best_score_CV).
# The result is applied to the pipeline with set_params in the next cell.
best_params = best_score_CV(dico, results, 'f5')

In [None]:
# Refit the over-sampling pipeline with the hyperparameters selected on F5.
# set_params updates `pipeline` in place and returns it, so the pipeline object
# defined for the grid search is itself reconfigured by this cell.
# NOTE(review): X and y were last rebound to the 5% sample, while the search
# was launched with data_frac=1 (presumably the full data) — confirm this is intended.
m = Modelisation(X, y, pipeline.set_params(**best_params), scaling=True)
m.show_conf_matrix()
m.show_metrics_score()

In [None]:
# Report the total wall-clock time elapsed since `t` was set in the first cell.
# Hours are derived with divmod instead of time.gmtime/strftime, whose %H field
# wraps back to 00 once a run (e.g. one including the grid searches) exceeds 24 hours.
elapsed = int(time.time() - t)
hours, remainder = divmod(elapsed, 3600)
minutes, seconds = divmod(remainder, 60)
print(f"Temps d'exécution total : {hours:02d}:{minutes:02d}:{seconds:02d}")