<a href="https://colab.research.google.com/github/amadousysada/AI-For-Beginners/blob/main/04_explication_resultats.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings

import numpy as np
import pandas as pd
import seaborn as sns

# Matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

# Scikit Learn
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, roc_curve, auc, make_scorer, confusion_matrix, fbeta_score, ConfusionMatrixDisplay
from sklearn.impute import SimpleImputer
from sklearn.calibration import CalibrationDisplay

# Boosting models
from lightgbm import LGBMClassifier
from lightgbm import plot_tree

# Imblearn, for class rebalancing
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

In [None]:
# Directory holding the CSV files; adjust it to wherever your data lives.
# Keep the trailing slash: file names are appended to this string as-is.
folder_path = "/content/drive/MyDrive/data/"

In [None]:
# Read the cleaned train / test application tables from `folder_path`.
X_train = pd.read_csv(f"{folder_path}final_cleaned_app_train.csv")
X_test = pd.read_csv(f"{folder_path}final_cleaned_app_test.csv")

# Extract the label; `pop` returns the column and removes it from X_train.
Y = X_train.pop('TARGET')

# SK_ID_CURR is removed from both frames so it is not used as a feature.
X_train = X_train.drop(columns=['SK_ID_CURR'])
X_test = X_test.drop(columns=['SK_ID_CURR'])

In [None]:
# LightGBM hyper-parameters. Several keys (`min_data_in_leaf`, `lambda_l1`,
# `lambda_l2`, `feature_fraction`, `bagging_fraction`) are LightGBM's native
# alias names; LGBMClassifier forwards them to the booster through **kwargs.
# NOTE(review): per the LightGBM parameter docs, `bagging_fraction` only takes
# effect when `bagging_freq` is non-zero, and it is not set here — confirm.
params = {
    'num_leaves': 34,
    'max_depth': 5,
    'learning_rate': 0.12151829295886919,  # comma was missing -> SyntaxError
    'n_estimators': 118,
    'min_data_in_leaf': 89,
    'lambda_l1': 0.04008372779214459,
    'lambda_l2': 0.9407387582460319,
    'feature_fraction': 0.5071854627075131,
    'bagging_fraction': 0.9146284425997193,
}

# No trailing comma after the call: a trailing comma would turn `model` into a
# 1-tuple, which the pipeline cannot use as its final estimator.
model = LGBMClassifier(**params, random_state=42, verbose=-1, n_jobs=-1)

# Preprocessing + resampling + classifier, fitted as a single unit.
# - StandardScaler disregards NaNs when fitting and keeps them on transform,
#   so imputing after scaling is valid.
# - SMOTE with sampling_strategy=0.1 oversamples the minority class up to a
#   minority/majority ratio of 0.1; it is only applied during fit.
pipeline = ImbPipeline(steps=[
    ("scaler", StandardScaler()),
    ("imputer", SimpleImputer(strategy="median")),
    ("smote", SMOTE(random_state=42, k_neighbors=2, sampling_strategy=0.1)),
    ("model", model),
])

# `fit` returns the pipeline itself, so `lgb` is the fitted pipeline
# (same object as `pipeline`), not the bare classifier.
lgb = pipeline.fit(X_train, Y)

In [None]:
# Predict

In [None]:
# Score the test set with the fitted pipeline `lgb`.
# The pipeline applies the scaler and imputer that were fitted on the training
# data, so X_test is passed raw. Refitting a new imputer/scaler on the test set
# would use different statistics than the model was trained with, and feeding
# the result to the pipeline would preprocess the data twice.
# imblearn pipelines skip sampler steps (SMOTE) at prediction time.
y_pred = lgb.predict(X_test)

# Column 1 of predict_proba is the second class (named `proba_classe_1`
# elsewhere in this notebook).
y_pred_proba = lgb.predict_proba(X_test)[:, 1]

In [None]:
# importance des variables
coef_imp_feature = pd.DataFrame({'variable': X_train.columns,
                                 'coef': lgm.feature_importances_})
print(coef_imp_feature.sort_values(by='coef', ascending=False))

In [None]:
sns.set_style("ticks")

# Pair each importance with its feature name, sorted ascending by importance.
# The importances are read from the LightGBM classifier inside the fitted
# pipeline (`lgb`), i.e. its "model" step.
feature_imp = pd.DataFrame(
    sorted(zip(lgb.named_steps["model"].feature_importances_, X_train.columns)),
    columns=['Value', 'Feature'],
)

# Horizontal bar chart, most important feature at the top.
plt.figure(figsize=(10, 10))
sns.barplot(x="Value", y="Feature",
            data=feature_imp.sort_values(by="Value", ascending=False),
            palette="BrBG_r")
plt.title('LightGBM Features (avg over folds)', fontsize=20, fontstyle='italic')
plt.tight_layout()
plt.show()

In [None]:
# Draw one tree of the fitted LightGBM classifier (plot_tree's default is the
# first tree). `lgb` is the fitted pipeline, not the lightgbm module, so the
# plotting function is imported by name and given the pipeline's "model" step.
# Rendering requires the graphviz package to be installed.
plot_tree(lgb.named_steps["model"], figsize=(30, 40))
plt.show()

In [None]:
# Distribution of the predicted classes on the test set.
# `y_pred` holds the predicted labels; wrap it in a Series so value_counts()
# is available (the name `y_pred_lgbm_df` was previously never defined).
y_pred_lgbm_df = pd.Series(y_pred, name="prediction")

print("Prédictions sur l'ensemble de test")
print(pd.DataFrame({
    "COUNT": y_pred_lgbm_df.value_counts(),
    # share of each predicted class, in percent
    "RATIO": y_pred_lgbm_df.value_counts() / len(y_pred_lgbm_df) * 100}))

In [None]:
# Class probabilities for the test set, one column per class.
# Use the fitted pipeline `lgb` so the preprocessing learned on the training
# data is applied before the classifier scores the rows.
y_pred_lgbm_proba = lgb.predict_proba(X_test)

y_pred_lgbm_proba

In [None]:
# Label the two probability columns so they can be filtered by name.
y_pred_lgbm_proba_df = pd.DataFrame(
    y_pred_lgbm_proba,
    columns=['proba_classe_0', 'proba_classe_1'],
)

# Rows whose class-1 probability exceeds 0.9, highest first.
(
    y_pred_lgbm_proba_df
    .query("proba_classe_1 > 0.9")
    .sort_values(by='proba_classe_1', ascending=False)
)
