# Régression logistique

In [None]:
import json
import time

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTENC, RandomOverSampler
from imblearn.pipeline import Pipeline as Pipeline_imb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ParameterGrid
from statsmodels.discrete.discrete_model import Logit
from statsmodels.tools import add_constant

from tools import (COLUMNS_CAT, COLUMNS_QUANT, Modelisation, SearchCV,
                   best_score_CV, datasets, graph_2scores_CV,
                   graph_2scores_CV_comp, graph_3scores_CV, graph_param_CV,
                   restauration_CV)

## Matplotlib backend selection (uncomment one of the magics below):
# Interactive display (notably 3D) inside the notebook
# %matplotlib widget

# Interactive display (notably 3D) in a Qt window outside the notebook
# %matplotlib qt

# Plain static display inside the notebook
# %config InlineBackend.figure_format = 'png'

# Wall-clock start time; the last cell subtracts it from time.time() to
# print the total execution time.
t = time.time()

In [None]:
# Load the preprocessed training set and shuffle its rows. The shuffle is
# seeded (same 1234 seed as the samplers used further down) so that the row
# order is identical on every run.
df = pd.read_csv('data/df_train_prepro.csv').sample(frac=1, random_state=1234)
df.shape

**Définition des variables quantitatives, des variables catégorielles et des datasets :**

In [None]:
# Build the dataset variants once with the project's `datasets` helper, then
# bind the four entries used in the rest of the notebook.
# (What each key contains is defined in tools.datasets.)
datasets_df = datasets(df, drop='first')
X_quant, X_cat, X, y = (
    datasets_df[key]
    for key in ('X_quant_scaled', 'X_cat', 'X_only_quant_scaled', 'y')
)

Afin de comparer nos modèles en termes de performances brutes et de temps d'exécution, il est fondamental de définir quelques métriques de référence. Ici, il faut surtout éviter de prédire des non-clics qui seraient en réalité des clics (c.-à-d. prédire trop de 0), quitte à prédire trop de 1. Autrement dit, il faut maximiser le recall et la NPV (Negative Predictive Value) afin de limiter les erreurs de type II (faux négatifs). Le F1 Score, moyenne harmonique du recall et de la précision, est également pertinent.

## 1. Modèle brut
### 1.1. Variables quantitatives

In [None]:
# Baseline: default LogisticRegression on the 'X_quant_scaled' dataset,
# wrapped in the project's Modelisation helper (imported from tools), then
# its confusion-matrix and metric-score display methods are called.
m = Modelisation(X_quant, y, LogisticRegression())
m.show_conf_matrix()
m.show_metrics_score()

In [None]:
# ROC display for the same baseline model.
m.show_ROC()

In [None]:
# One row per quantitative column, holding the matching coefficient of the
# logistic regression stored on the Modelisation object.
coeff_df = pd.DataFrame(
    m.model.coef_.T,
    index=X_quant.columns,
    columns=['Coefficient'],
)
coeff_df

In [None]:
# Same regression fitted with statsmodels in order to print its summary table.
# add_constant prepends the constant column used as the intercept term.
X_ = add_constant(X_quant)
print(
    Logit(endog=y, exog=X_)
    .fit()
    .summary()
)

### 1.2. Variables catégorielles

In [None]:
# Nombre de valeurs distinctes
for column in COLUMNS_CAT:
    print(f"{column} : {len(set(df[column]))}")

In [None]:
# Default LogisticRegression on the 'X_cat' dataset only.
m = Modelisation(X_cat, y, LogisticRegression())
m.show_conf_matrix()
m.show_metrics_score()

### 1.3. Variables quantitatives + catégorielles

In [None]:
# Default LogisticRegression on X, i.e. the 'X_only_quant_scaled' dataset.
m = Modelisation(X, y, LogisticRegression())
m.show_conf_matrix()
m.show_metrics_score()

In [None]:
# Same data with class_weight='balanced': scikit-learn then weights each
# class inversely to its frequency in y.
m = Modelisation(X, y, LogisticRegression(class_weight='balanced'))
m.show_conf_matrix()
m.show_metrics_score()

## 2. Tuning des hyperparamètres avec GridSearchCV

In [None]:
model = LogisticRegression()

# Search space: 30 log-spaced values of C between 1e-6 and 1e2, crossed with
# 18 class-weight settings (None, 'balanced', then a weight of 2, 4, ..., 32
# on class 1 while class 0 stays at 1).
cv_params = {
    "C": np.logspace(-6, 2, 30),
    "class_weight": [
        None,
        'balanced',
        *({0: 1, 1: weight} for weight in range(2, 33, 2)),
    ],
}

# Number of parameter combinations in the grid
print(len(ParameterGrid(cv_params)))

In [None]:
# Run the hyper-parameter search through the project's SearchCV helper.
# NOTE(review): n_jobs=28 is hard-coded — confirm it suits the machine used.
SearchCV(model, cv_params, data_frac=1, drop='first', scaling=True, n_jobs=28)

In [None]:
# Reload a stored search by name (presumably the backup written by the
# SearchCV call above — verify in tools.restauration_CV).
dico, results = restauration_CV('LR_CV_Grid_540_1')

In [None]:
# Two-score plots of the search results: recall against f1, f3, f5, precision.
graph_2scores_CV(dico, results, 'recall', 'f1', s=20)

In [None]:
graph_2scores_CV(dico, results, 'recall', 'f3', s=20)

In [None]:
graph_2scores_CV(dico, results, 'recall', 'f5', s=20)

In [None]:
graph_2scores_CV(dico, results, 'recall', 'precision', s=20)

In [None]:
# Three-score plots of the same results.
graph_3scores_CV(dico, results, 'recall', 'precision', 'f1', s=15)

In [None]:
graph_3scores_CV(dico, results, 'f1', 'f3', 'f5', s=15)

In [None]:
# Per-parameter plots; C is drawn on a log axis because it is log-spaced.
graph_param_CV(dico, results, xscale={'C': 'log'}, ncols=2, height=5, width=6)

In [None]:
# Parameters selected on the 'f3' score (selection logic lives in tools).
best_params = best_score_CV(dico, results, 'f3')

In [None]:
# Registry of the selected parameters of each search, keyed by model name.
PARAMS = {dico['model_name']: best_params}

In [None]:
# Re-run the evaluation on X with the parameters selected by the search.
m = Modelisation(X, y, LogisticRegression(**best_params))
m.show_conf_matrix()
m.show_metrics_score()

## 3. Régression logistique avec oversampling

In [None]:
# Class ratio: number of samples labelled 1 per sample labelled 0.
np.count_nonzero(y == 1) / np.count_nonzero(y == 0)

In [None]:
# 5% sample of the training set, used for the oversampling trials below.
# Seeded (same 1234 seed as the samplers) so the subset is the same on
# every run.
df_small = pd.read_csv('data/df_train_prepro.csv').sample(frac=0.05, random_state=1234)
df_small.shape

In [None]:
# Dataset variants for the 5% sample; only the 'X' and 'y' entries are used.
datasets_df_small = datasets(df_small, verbose=False, drop='first')
X_small, y_small = (datasets_df_small[key] for key in ('X', 'y'))

In [None]:
# SMOTENC takes the positional indices of the categorical columns. They are
# computed from X_small — the matrix this pipeline is fitted on in the next
# cell — rather than from the full-data X, whose number of columns may differ
# from that of a 5% sample (e.g. if a category is absent from the sample).
# Assumes the first len(COLUMNS_QUANT) columns are the quantitative ones —
# TODO confirm against tools.datasets.
categorical_features = list(range(len(COLUMNS_QUANT), len(X_small.columns)))
print(categorical_features)

# sampling_strategy=1 requests a 1:1 minority/majority ratio after resampling.
over = SMOTENC(categorical_features=categorical_features,
               sampling_strategy=1,
               k_neighbors=5,
               random_state=1234,
               n_jobs=-1)

# Oversampling step followed by the classifier.
pipeline = Pipeline_imb([('over', over),
                         ('model', LogisticRegression())
                         ])

In [None]:
# Evaluate the SMOTENC + LogisticRegression pipeline on the 5% sample.
m = Modelisation(X_small, y_small, pipeline)
m.show_conf_matrix()
m.show_metrics_score()

In [None]:
# Same experiment with plain random oversampling; sampling_strategy=1
# requests a 1:1 minority/majority ratio after resampling.
over = RandomOverSampler(sampling_strategy=1, random_state=1234)

pipeline = Pipeline_imb([('over', over),
                         ('model', LogisticRegression())
                         ])

In [None]:
# Evaluate the RandomOverSampler pipeline on the same 5% sample.
m = Modelisation(X_small, y_small, pipeline)
m.show_conf_matrix()
m.show_metrics_score()

### 3.1. GridSearchCV avec RandomOverSampler

In [None]:
# Random oversampling followed by the classifier. The sampling ratio is not
# fixed here because it is one of the searched parameters.
over = RandomOverSampler(random_state=1234)

pipeline = Pipeline_imb([
    ('over', over),
    ('model', LogisticRegression()),
])

# Search space: 6 sampling ratios x 30 log-spaced C values x 18 class-weight
# settings (None, 'balanced', then weights 2, 4, ..., 32 on class 1).
cv_params = {
    "over__sampling_strategy": [0.1, 0.2, 0.4, 0.6, 0.8, 1],
    "model__C": np.logspace(-6, 2, 30),
    "model__class_weight": [
        None,
        'balanced',
        *({0: 1, 1: weight} for weight in range(2, 33, 2)),
    ],
}

# Number of parameter combinations in the grid
print(len(ParameterGrid(cv_params)))

In [None]:
# Run the search over the RandomOverSampler pipeline with the project's
# SearchCV helper.
# NOTE(review): n_jobs=28 is hard-coded — confirm it suits the machine used.
SearchCV(pipeline, cv_params, data_frac=1, drop='first', scaling=True, n_jobs=28)

In [None]:
# Reload a stored search by name (presumably the backup written by the
# SearchCV call above — verify in tools.restauration_CV).
dico, results = restauration_CV('RandomOver_LR_CV_Grid_3240_1')

In [None]:
# Two-score plots of the search results: recall against f1, f3, f5, precision.
graph_2scores_CV(dico, results, 'recall', 'f1', s=10)

In [None]:
graph_2scores_CV(dico, results, 'recall', 'f3', s=10)

In [None]:
graph_2scores_CV(dico, results, 'recall', 'f5', s=10)

In [None]:
graph_2scores_CV(dico, results, 'recall', 'precision', s=10)

In [None]:
# Three-score plots of the same results.
graph_3scores_CV(dico, results, 'recall', 'precision', 'f1', s=10)

In [None]:
graph_3scores_CV(dico, results, 'f1', 'f3', 'f5', s=10)

In [None]:
# Per-parameter plots; model__C is drawn on a log axis.
graph_param_CV(dico, results, xscale={'model__C': 'log'}, ncols=2, height=5, width=6)

In [None]:
# Parameters selected on the 'f3' score (selection logic lives in tools).
best_params = best_score_CV(dico, results, 'f3')

In [None]:
# Register the selection under this search's model name.
PARAMS[dico['model_name']] = best_params

In [None]:
# set_params updates the pipeline object in place and returns it, so the
# pipeline variable keeps these parameters after this cell.
m = Modelisation(X, y, pipeline.set_params(**best_params))
m.show_conf_matrix()
m.show_metrics_score()

### 3.2. RandomizedSearchCV avec SMOTENC

In [None]:
# Positional indices of the categorical columns, as required by SMOTENC:
# every column index from len(COLUMNS_QUANT) up to the last column of X.
categorical_features = [*range(len(COLUMNS_QUANT), len(X.columns))]
print(categorical_features)

over = SMOTENC(categorical_features=categorical_features, random_state=1234)

pipeline = Pipeline_imb([
    ('over', over),
    ('model', LogisticRegression()),
])

# Search space: 6 sampling ratios x 3 neighbour counts x 30 log-spaced C
# values x 18 class-weight settings (None, 'balanced', then weights
# 2, 4, ..., 32 on class 1).
cv_params = {
    "over__sampling_strategy": [0.1, 0.2, 0.4, 0.6, 0.8, 1],
    "over__k_neighbors": [3, 4, 5],
    "model__C": np.logspace(-6, 2, 30),
    "model__class_weight": [
        None,
        'balanced',
        *({0: 1, 1: weight} for weight in range(2, 33, 2)),
    ],
}

# Number of parameter combinations in the grid
print(len(ParameterGrid(cv_params)))

In [None]:
# Search over the SMOTENC pipeline, this time with random=True and
# n_iter=2500 on data_frac=0.1 (the full grid has 9720 combinations).
# NOTE(review): n_jobs=28 is hard-coded — confirm it suits the machine used.
SearchCV(pipeline, cv_params, data_frac=0.1, drop='first', scaling=True, random=True, n_iter=2500, random_state=1234, n_jobs=28)

In [None]:
# Reload a stored search by name (presumably the backup written by the
# SearchCV call above — verify in tools.restauration_CV).
dico, results = restauration_CV('SMOTENC_LR_CV_Randomized2500_9720_0.1')

In [None]:
# Two-score plots of the search results: recall against f1, f3, f5, precision.
graph_2scores_CV(dico, results, 'recall', 'f1', s=5)

In [None]:
graph_2scores_CV(dico, results, 'recall', 'f3', s=5)

In [None]:
graph_2scores_CV(dico, results, 'recall', 'f5', s=5)

In [None]:
graph_2scores_CV(dico, results, 'recall', 'precision', s=5)

In [None]:
# Three-score plots of the same results.
graph_3scores_CV(dico, results, 'recall', 'precision', 'f1', s=2)

In [None]:
graph_3scores_CV(dico, results, 'f1', 'f3', 'f5', s=2)

In [None]:
# Per-parameter plots; model__C is drawn on a log axis.
graph_param_CV(dico, results, xscale={'model__C': 'log'}, ncols=2, height=5, width=6)

In [None]:
# Parameters selected on the 'f3' score (selection logic lives in tools).
best_params = best_score_CV(dico, results, 'f3')

In [None]:
# Add n_jobs=-1 for the SMOTENC step to the selected parameters. The entry is
# added before registering, so it is also part of what PARAMS stores.
best_params.update(over__n_jobs=-1)
PARAMS[dico['model_name']] = best_params

In [None]:
# set_params updates the SMOTENC pipeline in place and returns it; the
# evaluation is then run on X.
m = Modelisation(X, y, pipeline.set_params(**best_params))
m.show_conf_matrix()
m.show_metrics_score()

### 3.3. Comparaison

In [None]:
# Reload the three stored searches: plain LR, RandomOverSampler + LR and
# SMOTENC + LR.
dr1 = restauration_CV('LR_CV_Grid_540_1', verbose=False)
dr2 = restauration_CV('RandomOver_LR_CV_Grid_3240_1', verbose=False)
dr3 = restauration_CV('SMOTENC_LR_CV_Randomized2500_9720_0.1', verbose=False)

In [None]:
# Comparison plots of the three searches; s holds one value per search, in
# the same order as the list.
graph_2scores_CV_comp([dr3, dr2, dr1], 'recall', 'f3', s=[1, 5, 15], alpha=0.3)

In [None]:
graph_2scores_CV_comp([dr3, dr2, dr1], 'recall', 'precision', s=[1, 5, 15], alpha=0.3)

## 4. Régression logistique avec RFECV

### 4.1. Sans oversampling

In [None]:
# Reload the quantitative and categorical column lists stored in
# backups/RFECV_LR.json.
with open("backups/RFECV_LR.json", 'r') as f:
    export = json.load(f)

columns_quant_RFECV, columns_cat_RFECV = (
    export['columns_quant'],
    export['columns_cat'],
)

In [None]:
model = LogisticRegression()

# Search space: 30 log-spaced values of C between 1e-6 and 1e2, crossed with
# 18 class-weight settings (None, 'balanced', then a weight of 2, 4, ..., 32
# on class 1 while class 0 stays at 1).
cv_params = {
    "C": np.logspace(-6, 2, 30),
    "class_weight": [
        None,
        'balanced',
        *({0: 1, 1: weight} for weight in range(2, 33, 2)),
    ],
}

# Number of parameter combinations in the grid
print(len(ParameterGrid(cv_params)))

In [None]:
# Search restricted to the columns loaded from backups/RFECV_LR.json.
# NOTE(review): scaling=False here, unlike the earlier searches — confirm.
# NOTE(review): n_jobs=28 is hard-coded — confirm it suits the machine used.
SearchCV(model, cv_params, columns_quant=columns_quant_RFECV, columns_cat=columns_cat_RFECV, data_frac=1, drop='first', scaling=False, n_jobs=28, name='RFECV')

In [None]:
# Reload a stored search by name (presumably the backup written by the
# SearchCV call above — verify in tools.restauration_CV).
dico, results = restauration_CV('LR_RFECV_CV_Grid_540_1')

In [None]:
# Parameters selected on the 'f3' score, with display_table=False.
best_params = best_score_CV(dico, results, 'f3', display_table=False)

In [None]:
# Register the selection under this search's model name.
PARAMS[dico['model_name']] = best_params

In [None]:
# Rebuild the datasets from df, restricted to the RFECV column lists.
# NOTE(review): the RFECV search was launched with scaling=False while this
# evaluation uses the 'X_only_quant_scaled' entry — confirm this is intended.
datasets_df_RFECV = datasets(df, columns_quant=columns_quant_RFECV, columns_cat=columns_cat_RFECV, drop='first')
X_RFECV = datasets_df_RFECV['X_only_quant_scaled']

In [None]:
# Evaluate with the selected parameters; y is the target bound earlier from
# datasets(df, ...), which was built from the same df.
m = Modelisation(X_RFECV, y, LogisticRegression(**best_params))
m.show_conf_matrix()
m.show_metrics_score()

### 4.2. Avec oversampling

In [None]:
# Random oversampling followed by the classifier. The sampling ratio is not
# fixed here because it is one of the searched parameters.
over = RandomOverSampler(random_state=1234)

pipeline = Pipeline_imb([
    ('over', over),
    ('model', LogisticRegression()),
])

# Search space: 6 sampling ratios x 30 log-spaced C values x 18 class-weight
# settings (None, 'balanced', then weights 2, 4, ..., 32 on class 1).
cv_params = {
    "over__sampling_strategy": [0.1, 0.2, 0.4, 0.6, 0.8, 1],
    "model__C": np.logspace(-6, 2, 30),
    "model__class_weight": [
        None,
        'balanced',
        *({0: 1, 1: weight} for weight in range(2, 33, 2)),
    ],
}

# Number of parameter combinations in the grid
print(len(ParameterGrid(cv_params)))

In [None]:
# Search over the RandomOverSampler pipeline, restricted to the columns
# loaded from backups/RFECV_LR.json.
# NOTE(review): n_jobs=28 is hard-coded — confirm it suits the machine used.
SearchCV(pipeline, cv_params, columns_quant=columns_quant_RFECV, columns_cat=columns_cat_RFECV, data_frac=1, drop='first', scaling=False, n_jobs=28, name='RFECV')

In [None]:
# Reload a stored search by name (presumably the backup written by the
# SearchCV call above — verify in tools.restauration_CV).
dico, results = restauration_CV('RandomOver_LR_RFECV_CV_Grid_3240_1')

In [None]:
# Parameters selected on the 'f3' score, with display_table=False.
best_params = best_score_CV(dico, results, 'f3', display_table=False)

In [None]:
# Register the last selection, then write the whole registry to disk.
# NOTE(review): JSON object keys are always strings, so a class_weight such as
# {0: 1, 1: 2} is written as {"0": 1, "1": 2} — confirm that whatever reads
# this file converts the keys back before building an estimator.
PARAMS.update({dico['model_name']: best_params})
with open("backups/PARAMS_LR.json", mode='w') as f:
    json.dump(PARAMS, fp=f, indent=2)

In [None]:
# set_params updates the pipeline in place and returns it; the evaluation is
# run on the RFECV-restricted dataset.
m = Modelisation(X_RFECV, y, pipeline.set_params(**best_params))
m.show_conf_matrix()
m.show_metrics_score()

### 4.3. Comparaison

In [None]:
# Reload the four stored searches: LR and RandomOverSampler + LR, each with
# and without the RFECV column restriction.
dr1 = restauration_CV('LR_CV_Grid_540_1', verbose=False)
dr2 = restauration_CV('LR_RFECV_CV_Grid_540_1', verbose=False)
dr3 = restauration_CV('RandomOver_LR_CV_Grid_3240_1', verbose=False)
dr4 = restauration_CV('RandomOver_LR_RFECV_CV_Grid_3240_1', verbose=False)

In [None]:
# Plain LR: all columns (dr1) against RFECV columns (dr2).
graph_2scores_CV_comp([dr1, dr2], 'recall', 'f3', alpha=0.3)

In [None]:
graph_2scores_CV_comp([dr1, dr2], 'recall', 'precision', alpha=0.3)

In [None]:
# RandomOverSampler + LR: all columns (dr3) against RFECV columns (dr4).
graph_2scores_CV_comp([dr3, dr4], 'recall', 'f3', s=[5, 5], alpha=0.3)

In [None]:
graph_2scores_CV_comp([dr3, dr4], 'recall', 'precision', s=[5, 5], alpha=0.3)

In [None]:
# Total wall-clock time since t was set in the first cell. The duration is
# split with divmod instead of time.strftime/time.gmtime, which wraps around
# after 24 hours and would under-report a longer run. Output is unchanged
# (HH:MM:SS) for runs shorter than a day.
elapsed = int(time.time() - t)
hours, remainder = divmod(elapsed, 3600)
minutes, seconds = divmod(remainder, 60)
print(f"Temps d'exécution total : {hours:02d}:{minutes:02d}:{seconds:02d}")