In [None]:
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTENC, RandomOverSampler
from imblearn.pipeline import Pipeline as Pipeline_imb
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (GridSearchCV, ParameterGrid,
                                     RandomizedSearchCV)
from sklearn.tree import DecisionTreeClassifier

from tools import (Modelisation, SearchCV, best_score_CV, datasets,
                   graph_2scores_CV, graph_2scores_CV_comp, graph_3scores_CV,
                   graph_param_CV, restauration_CV)

# Wall-clock start of the run; the last cell subtracts it from time.time()
# to print the total execution time.
t = time.time()

In [None]:
# Load the preprocessed training set and shuffle its rows (frac=1 keeps every
# row). The shuffle is seeded so that "Restart & Run All" sees the same row
# order on every run; without a seed each run produced a different order.
df = pd.read_csv('data/df_train_prepro.csv').sample(frac=1, random_state=1234)
df.shape

**Définition des variables quantitatives, des variables catégorielles et des datasets :**

In [None]:
# Build the dataset bundle once, then pull out the four entries used by the
# rest of the notebook, in the same order as they are read from the bundle.
datasets_df = datasets(df)
X_quant, X_cat, X, y = (
    datasets_df[key] for key in ('X_quant', 'X_cat', 'X', 'y')
)

# Decision tree
## 1. Modèle brut
### 1.1. Variables quantitatives

In [None]:
# Baseline: default DecisionTreeClassifier on X_quant only, wrapped in the
# project's Modelisation helper, followed by its confusion-matrix and
# metric-score displays.
m = Modelisation(X_quant, y, DecisionTreeClassifier())
m.show_conf_matrix()
m.show_metrics_score()

In [None]:
# ROC display for the Modelisation object built in the previous cell.
m.show_ROC()

In [None]:
# Attribute display for the current `m`; what exactly is listed is defined in
# tools.Modelisation (TODO confirm).
m.show_attributes()

In [None]:
# Same X_quant features, but with the tree capped at max_depth=2.
# NOTE(review): presumably kept shallow so the tree plot below stays readable — confirm.
m = Modelisation(X_quant, y, DecisionTreeClassifier(max_depth=2))
m.show_conf_matrix()
m.show_metrics_score()

In [None]:
# Attribute display for the depth-2 tree built in the previous cell.
m.show_attributes()

In [None]:
# Tree plot for the current `m` (the max_depth=2 model when cells run in order).
m.plot_tree()

### 1.2. Variables catégorielles

In [None]:
# Default decision tree on X_cat only, with the same two displays as the
# X_quant baseline.
m = Modelisation(X_cat, y, DecisionTreeClassifier())
m.show_conf_matrix()
m.show_metrics_score()

In [None]:
# Attribute display for the X_cat tree built in the previous cell.
m.show_attributes()

### 1.3. Variables quantitatives + catégorielles

In [None]:
# Default decision tree on the full feature set X.
m = Modelisation(X, y, DecisionTreeClassifier())
m.show_conf_matrix()
m.show_metrics_score()

In [None]:
# Attribute display for the full-feature tree built in the previous cell.
m.show_attributes()

# Random Forest

## 1. Modèle brut

In [None]:
# Untuned random forest with 10 trees; n_jobs=-1 lets scikit-learn use all cores.
# NOTE(review): no random_state is set on the forest, so scores can vary between runs.
m = Modelisation(X, y, RandomForestClassifier(n_estimators=10, n_jobs=-1))
m.show_conf_matrix()
m.show_metrics_score()

In [None]:
# Same untuned forest with 100 trees instead of 10.
m = Modelisation(X, y, RandomForestClassifier(n_estimators=100, n_jobs=-1))
m.show_conf_matrix()
m.show_metrics_score()

In [None]:
# 100-tree forest with min_samples_leaf=15, i.e. every leaf must hold at
# least 15 training samples.
m = Modelisation(X, y, RandomForestClassifier(min_samples_leaf=15, n_estimators=100, n_jobs=-1))
m.show_conf_matrix()
m.show_metrics_score()

## 2. Tuning des hyperparamètres par recherche aléatoire sur la grille (SearchCV, random=True)

In [None]:
# Estimator handed to the search; its hyper-parameters come from the grid below.
model = RandomForestClassifier()

# Candidate values per hyper-parameter. The class_weight options are None,
# 'balanced', and a series of explicit {0: 1, 1: weight} weightings.
cv_params = {
    'bootstrap': [False, True],
    'max_depth': [*range(10, 101, 10), None],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [50, 100, 200, 400, 600, 800, 1000, 1500],
    'class_weight': [
        None,
        'balanced',
        *({0: 1, 1: weight} for weight in (2, 4, 8, 10, 12, 14, 16.5, 32)),
    ],
}

# Number of combinations in the full grid.
print(len(ParameterGrid(cv_params)))

In [None]:
# Run the project's SearchCV helper on the grid above: random=True with
# n_iter=5000, data_frac=0.02, random_state=1234.
# NOTE(review): n_jobs=28 is hard-coded — confirm it suits the host machine.
# NOTE(review): this reruns the whole search on every "Run All"; the next cell
# presumably restores its saved results from disk — confirm.
SearchCV(model, cv_params, data_frac=0.02, random=True, n_iter=5000, random_state=1234, n_jobs=28)

In [None]:
# Restore a saved search by name. The name appears to encode the settings
# (5000 randomized iterations, 31680 grid combinations, data_frac 0.02) —
# TODO confirm against tools.
dico, results = restauration_CV('Forest_CV_Randomized5000_31680_0.02')

In [None]:
# Two-score plot of the restored results: 'recall' and 'f1'.
graph_2scores_CV(dico, results, 'recall', 'f1', s=1)

In [None]:
# Two-score plot of the restored results: 'recall' and 'f3'.
graph_2scores_CV(dico, results, 'recall', 'f3', s=1)

In [None]:
# Two-score plot of the restored results: 'recall' and 'f5'.
graph_2scores_CV(dico, results, 'recall', 'f5', s=1)

In [None]:
# Two-score plot of the restored results: 'recall' and 'precision'.
graph_2scores_CV(dico, results, 'recall', 'precision', s=1)

In [None]:
# Three-score plot of the restored results: 'recall', 'precision' and 'f1'.
graph_3scores_CV(dico, results, 'recall', 'precision', 'f1', s=0.5)

In [None]:
# Three-score plot of the restored results: 'f1', 'f3' and 'f5'.
graph_3scores_CV(dico, results, 'f1', 'f3', 'f5', s=0.5)

In [None]:
# Per-parameter plots of the restored results, laid out on 2 columns
# (height/width are forwarded to the helper; units are defined in tools).
graph_param_CV(dico, results, ncols=2, height=3.5, width=6)

In [None]:
# Pick the parameter set selected by best_score_CV for the 'f3' score; the
# result is later unpacked as keyword arguments for RandomForestClassifier.
best_params = best_score_CV(dico, results, 'f3')

In [None]:
# Add n_jobs=-1 to the selected parameters (same dict object, updated in place).
# An override of n_estimators to 400 was tried here and is left disabled.
best_params.update(n_jobs=-1)

# Start the registry of selected parameter sets, keyed by the restored model name.
PARAMS = {dico['model_name']: best_params}

In [None]:
# Evaluate a forest configured with the selected parameters on the full
# feature set X.
m = Modelisation(X, y, RandomForestClassifier(**best_params))
m.show_conf_matrix()
m.show_metrics_score()

## 3. RandomForest avec oversampling

### 3.1. Recherche aléatoire sur la grille avec RandomOverSampler

In [None]:
# Seeded random over-sampler used as the first pipeline step.
over = RandomOverSampler(random_state=1234)

# imblearn pipeline: over-sample, then fit the forest.
pipeline = Pipeline_imb([('over', over), ('model', RandomForestClassifier())])

# Candidate values, using the '<step>__<param>' naming of pipeline parameters.
# The class_weight options are None, 'balanced', and explicit {0: 1, 1: weight}.
cv_params = {
    'over__sampling_strategy': [0.1, 0.2, 0.4, 0.6, 0.8, 1],
    'model__bootstrap': [False, True],
    'model__max_depth': [*range(10, 61, 10), 80, 100, None],
    'model__max_features': ['sqrt', 'log2'],
    'model__min_samples_leaf': [1, 2, 4],
    'model__min_samples_split': [2, 5, 10],
    'model__n_estimators': [50, 100, 200, 400, 600, 800, 1000, 1500],
    'model__class_weight': [
        None,
        'balanced',
        *({0: 1, 1: weight} for weight in (2, 4, 8, 10, 12, 14, 16.5, 32)),
    ],
}

# Number of combinations in the full grid.
print(len(ParameterGrid(cv_params)))

In [None]:
# Same SearchCV call as for the plain forest, but on the over-sampling
# pipeline and its grid.
# NOTE(review): n_jobs=28 is hard-coded — confirm it suits the host machine.
SearchCV(pipeline, cv_params, data_frac=0.02, random=True, n_iter=5000, random_state=1234, n_jobs=28)

In [None]:
# Restore the saved over-sampling search; this rebinds `dico` and `results`,
# so the plots below refer to this search.
dico, results = restauration_CV('RandomOver_Forest_CV_Randomized5000_155520_0.02')

In [None]:
# Two-score plot for the over-sampling search: 'recall' and 'f1'.
graph_2scores_CV(dico, results, 'recall', 'f1', s=1)

In [None]:
# Two-score plot for the over-sampling search: 'recall' and 'f3'.
graph_2scores_CV(dico, results, 'recall', 'f3', s=1)

In [None]:
# Two-score plot for the over-sampling search: 'recall' and 'f5'.
graph_2scores_CV(dico, results, 'recall', 'f5', s=1)

In [None]:
# Two-score plot for the over-sampling search: 'recall' and 'precision'.
graph_2scores_CV(dico, results, 'recall', 'precision', s=1)

In [None]:
# Three-score plot for the over-sampling search: 'recall', 'precision' and 'f1'.
graph_3scores_CV(dico, results, 'recall', 'precision', 'f1', s=0.5)

In [None]:
# Three-score plot for the over-sampling search: 'f1', 'f3' and 'f5'.
graph_3scores_CV(dico, results, 'f1', 'f3', 'f5', s=0.5)

In [None]:
# Per-parameter plots for the over-sampling search, laid out on 2 columns.
graph_param_CV(dico, results, ncols=2, height=3.5, width=6)

In [None]:
# Parameter set selected by best_score_CV for 'f3'; the keys are later passed
# to pipeline.set_params, so they are expected in '<step>__<param>' form.
best_params = best_score_CV(dico, results, 'f3')

In [None]:
# Add n_jobs=-1 for the pipeline's 'model' step (in place), then record the
# parameter set in PARAMS under the restored model name.
best_params.update(model__n_jobs=-1)
PARAMS.update({dico['model_name']: best_params})

In [None]:
# Evaluate the over-sampling pipeline with the selected parameters on X.
# set_params configures `pipeline` itself, so the object keeps these settings afterwards.
m = Modelisation(X, y, pipeline.set_params(**best_params))
m.show_conf_matrix()
m.show_metrics_score()

### 3.2. GridSearchCV avec SMOTE

### 3.3. Comparaison

In [None]:
# Reload both saved searches for a side-by-side comparison. Each drN keeps
# the whole return value of restauration_CV (unpacked as `dico, results`
# elsewhere in this notebook).
dr1 = restauration_CV('Forest_CV_Randomized5000_31680_0.02', verbose=False)
dr2 = restauration_CV('RandomOver_Forest_CV_Randomized5000_155520_0.02', verbose=False)

In [None]:
# Comparison plot on 'recall' and 'f3': over-sampling search (dr2) listed
# first, plain forest (dr1) second.
graph_2scores_CV_comp([dr2, dr1], 'recall', 'f3', s=[1, 1], alpha=0.3)

In [None]:
# Same comparison on 'recall' and 'precision'.
graph_2scores_CV_comp([dr2, dr1], 'recall', 'precision', s=[1, 1], alpha=0.3)

## 4. RandomForest avec RFECV
### 4.1. Sans oversampling

In [None]:
# Read the quantitative and categorical column lists stored in
# backups/RFECV_Forest.json.
with open("backups/RFECV_Forest.json", 'r') as f:
    export = json.load(f)

columns_quant_RFECV, columns_cat_RFECV = export['columns_quant'], export['columns_cat']

In [None]:
# Estimator handed to the search on the RFECV column selection.
model = RandomForestClassifier()

# Candidate values per hyper-parameter. The class_weight options are None,
# 'balanced', and a series of explicit {0: 1, 1: weight} weightings.
cv_params = {
    'bootstrap': [False, True],
    'max_depth': [*range(10, 101, 10), None],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [50, 100, 200, 400, 600, 800, 1000, 1500],
    'class_weight': [
        None,
        'balanced',
        *({0: 1, 1: weight} for weight in (2, 4, 8, 10, 12, 14, 16.5, 32)),
    ],
}

# Number of combinations in the full grid.
print(len(ParameterGrid(cv_params)))

In [None]:
# Same search settings as before, with the columns passed explicitly from
# the RFECV export.
# NOTE(review): n_jobs=28 is hard-coded — confirm it suits the host machine.
SearchCV(model, cv_params, columns_quant=columns_quant_RFECV, columns_cat=columns_cat_RFECV, data_frac=0.02, random=True, n_iter=5000, random_state=1234, n_jobs=28)

In [None]:
# Restore the saved RFECV forest search (rebinds `dico` and `results`).
dico, results = restauration_CV('Forest_RFECV_CV_Randomized5000_31680_0.02')

In [None]:
# Parameter set selected for 'f3', called with display_table=False.
best_params = best_score_CV(dico, results, 'f3', display_table=False)

In [None]:
# Add n_jobs=-1 to the selected parameters (in place), then record them in
# PARAMS under the restored model name.
best_params.update(n_jobs=-1)
PARAMS.update({dico['model_name']: best_params})

In [None]:
# Rebuild the datasets with the column lists read from the RFECV export and
# keep the resulting 'X' entry as X_RFECV.
datasets_df_RFECV = datasets(df, columns_quant=columns_quant_RFECV, columns_cat=columns_cat_RFECV)
X_RFECV = datasets_df_RFECV['X']

In [None]:
# Evaluate the forest tuned on the RFECV column selection.
# It must be fitted on X_RFECV (built in the previous cell): the parameters
# were searched on those columns, and using the full X would simply repeat
# the non-RFECV evaluation with different hyper-parameters.
m = Modelisation(X_RFECV, y, RandomForestClassifier(**best_params))
m.show_conf_matrix()
m.show_metrics_score()

### 4.2. Avec oversampling

In [None]:
# Seeded random over-sampler used as the first pipeline step.
over = RandomOverSampler(random_state=1234)

# imblearn pipeline: over-sample, then fit the forest.
pipeline = Pipeline_imb([('over', over), ('model', RandomForestClassifier())])

# Candidate values, using the '<step>__<param>' naming of pipeline parameters.
# The class_weight options are None, 'balanced', and explicit {0: 1, 1: weight}.
cv_params = {
    'over__sampling_strategy': [0.1, 0.2, 0.4, 0.6, 0.8, 1],
    'model__bootstrap': [False, True],
    'model__max_depth': [*range(10, 61, 10), 80, 100, None],
    'model__max_features': ['sqrt', 'log2'],
    'model__min_samples_leaf': [1, 2, 4],
    'model__min_samples_split': [2, 5, 10],
    'model__n_estimators': [50, 100, 200, 400, 600, 800, 1000, 1500],
    'model__class_weight': [
        None,
        'balanced',
        *({0: 1, 1: weight} for weight in (2, 4, 8, 10, 12, 14, 16.5, 32)),
    ],
}

# Number of combinations in the full grid.
print(len(ParameterGrid(cv_params)))

In [None]:
# Search on the over-sampling pipeline, with the columns passed explicitly
# from the RFECV export.
# NOTE(review): n_jobs=28 is hard-coded — confirm it suits the host machine.
SearchCV(pipeline, cv_params, columns_quant=columns_quant_RFECV, columns_cat=columns_cat_RFECV, data_frac=0.02, random=True, n_iter=5000, random_state=1234, n_jobs=28)

In [None]:
# Restore the saved RFECV + over-sampling search (rebinds `dico` and `results`).
dico, results = restauration_CV('RandomOver_Forest_RFECV_CV_Randomized5000_155520_0.02')

In [None]:
# Parameter set selected for 'f3', called with display_table=False; the keys
# are later passed to pipeline.set_params.
best_params = best_score_CV(dico, results, 'f3', display_table=False)

In [None]:
# Add n_jobs=-1 for the pipeline's 'model' step and record the parameter set.
best_params['model__n_jobs'] = -1
PARAMS[dico['model_name']] = best_params

# Persist every parameter set collected in this notebook.
# The target used to be "backups/PARAMS_XGBoost.json", which does not match
# the random-forest content of PARAMS and would overwrite another model's
# file; the name now follows backups/RFECV_Forest.json.
# NOTE(review): if anything downstream reads the old file name, update it too.
# NOTE: json.dump writes dictionary keys as strings, so a dict-valued
# class_weight such as {0: 1, 1: 8} comes back as {"0": 1, "1": 8} on reload.
with open("backups/PARAMS_Forest.json", 'w') as f:
    json.dump(PARAMS, f, indent=2)

In [None]:
# Evaluate the over-sampling pipeline tuned on the RFECV column selection.
# It must be fitted on X_RFECV (built in section 4.1), the feature set the
# parameters were searched on, not on the full X.
# set_params configures `pipeline` itself, so the object keeps these settings afterwards.
m = Modelisation(X_RFECV, y, pipeline.set_params(**best_params))
m.show_conf_matrix()
m.show_metrics_score()

### 4.3. Comparaison

In [None]:
# Reload the four saved searches for comparison:
#   dr1 = forest, dr2 = forest on the RFECV columns,
#   dr3 = over-sampling + forest, dr4 = over-sampling + forest on the RFECV columns.
dr1 = restauration_CV('Forest_CV_Randomized5000_31680_0.02', verbose=False)
dr2 = restauration_CV('Forest_RFECV_CV_Randomized5000_31680_0.02', verbose=False)
dr3 = restauration_CV('RandomOver_Forest_CV_Randomized5000_155520_0.02', verbose=False)
dr4 = restauration_CV('RandomOver_Forest_RFECV_CV_Randomized5000_155520_0.02', verbose=False)

In [None]:
# Forest with and without the RFECV column selection, on 'recall' and 'f3'.
graph_2scores_CV_comp([dr1, dr2], 'recall', 'f3', s=[1, 1], alpha=0.3)

In [None]:
# Same pair of searches on 'recall' and 'precision'.
graph_2scores_CV_comp([dr1, dr2], 'recall', 'precision', s=[1, 1], alpha=0.3)

In [None]:
# Over-sampling pipeline with and without the RFECV column selection, on
# 'recall' and 'f3'.
graph_2scores_CV_comp([dr3, dr4], 'recall', 'f3', s=[1, 1], alpha=0.3)

In [None]:
# Same pair of searches on 'recall' and 'precision'.
graph_2scores_CV_comp([dr3, dr4], 'recall', 'precision', s=[1, 1], alpha=0.3)

In [None]:
# Report the total wall-clock time since `t` was set in the first cell.
# The H:M:S split uses divmod rather than time.gmtime(), whose hour field
# wraps after 24 h and would under-report a run longer than a day. Output is
# unchanged for shorter runs.
elapsed = int(time.time() - t)
hours, remainder = divmod(elapsed, 3600)
minutes, seconds = divmod(remainder, 60)
print(f"Temps d'exécution total : {hours:02d}:{minutes:02d}:{seconds:02d}")