<a href="https://colab.research.google.com/github/adamxkadd/Projet_7/blob/main/mlflow_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**PROJET 7 : Implémentez un modèle de scoring**  
Notebook 1 Dashboard et API

In [None]:
# pip install pydantic==2.0a1
# pip install mlflow==2.2.0
# pip install pyngrok
# ! pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [1]:
import pandas as pd
import numpy as np
import joblib as jl
import mlflow


from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyRegressor, DummyClassifier
from sklearn import model_selection
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.metrics import mean_squared_error, r2_score
from pandas_profiling import ProfileReport
import lightgbm as lgb

from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from pyngrok import ngrok
import mlflow.sklearn

import warnings
warnings.filterwarnings("ignore")

  from pandas_profiling import ProfileReport


In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Étape 1 : Chargement des données
def load_data():
    print('Chargement data...')
    df_application_train = pd.read_csv('/content/drive/My Drive/DataScientist/Projet_7/application_train.csv')
    df_application_test = pd.read_csv('/content/drive/My Drive/DataScientist/Projet_7/application_test.csv')
    df_bureau = pd.read_csv('/content/drive/My Drive/DataScientist/Projet_7/bureau.csv')
    df_bureau_balance = pd.read_csv('/content/drive/My Drive/DataScientist/Projet_7/bureau_balance.csv')
    return df_application_train, df_application_test, df_bureau, df_bureau_balance


# Étape 2 : Préparation des DataFrames
def prepare_data(df_application_train, df_application_test):
    print('Préparer les df...')
    df_data = df_application_train.head(100).copy()
    df_prod = df_application_test.head(100).copy() # Data pour production (nouveaux clients)
    return df_data, df_prod


# Étape 3 : Nettoyage des données
def clean_data(df_data):
    print('Nettoyage...')
    df_data = df_data[df_data.CODE_GENDER != 'XNA']
    df_data = df_data[df_data.NAME_INCOME_TYPE != 'Maternity leave']
    df_data = df_data[df_data.NAME_FAMILY_STATUS != 'Unknown']
    return df_data


# Étape 4 : Gestion des outliers
def gerer_outliers(df_data):
    print('Outliers...')
    outliers = df_data.loc[df_data.DAYS_EMPLOYED > 366, 'DAYS_EMPLOYED'].unique()
    df_data.DAYS_EMPLOYED.replace(outliers, np.nan, inplace=True)
    return df_data


# Étape 5 : Feature Engineering
def feature_engineering(df_data, df_bureau, df_bureau_balance):
    print('Feature Engineering...')
    df_data['PROPORTION_LIFE_EMPLOYED'] = df_data['DAYS_EMPLOYED'] / df_data['DAYS_BIRTH'] # Calcul du rapport de jours employés par rapport à l'âge
    df_data['INCOME_TO_CREDIT_RATIO'] = df_data['AMT_INCOME_TOTAL'] / df_data['AMT_CREDIT'] # Calcul du rapport revenu/crédit
    df_data['INCOME_TO_ANNUITY_RATIO'] = df_data['AMT_INCOME_TOTAL'] / df_data['AMT_ANNUITY'] # Calcul du rapport revenu/annuité
    df_data['INCOME_TO_ANNUITY_RATIO_BY_AGE'] = df_data['INCOME_TO_ANNUITY_RATIO'] * df_data['DAYS_BIRTH'] # Calcul du rapport revenu/annuité par âge
    df_data['CREDIT_TO_ANNUITY_RATIO'] = df_data['AMT_CREDIT'] / df_data['AMT_ANNUITY'] # Calcul du rapport crédit/annuité
    df_data['CREDIT_TO_ANNUITY_RATIO_BY_AGE'] = df_data['CREDIT_TO_ANNUITY_RATIO'] * df_data['DAYS_BIRTH'] # Calcul du rapport crédit/annuité par âge
    df_data['INCOME_TO_FAMILYSIZE_RATIO'] = df_data['AMT_INCOME_TOTAL'] / df_data['CNT_FAM_MEMBERS'] # Calcul du rapport revenu/taille de la familledf_data.DAYS_EMPLOYED.replace(outliers, np.nan, inplace=True)
    nb_pre_prets = df_bureau.groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU'].count().rename(columns = {'SK_ID_BUREAU': 'NB_PRE_LOANS'})
    df_data = df_data.merge(nb_pre_prets, on='SK_ID_CURR', how='left')
    mt_pre_prets = df_bureau_balance.groupby('SK_ID_BUREAU', as_index=False).mean().rename(columns = {'MONTHS_BALANCE': 'MONTHS_BALANCE_MEAN'})
    bureau_tmp = df_bureau.merge(mt_pre_prets, on='SK_ID_BUREAU', how='left').drop('SK_ID_BUREAU', axis=1)
    bureau_tmp = bureau_tmp.groupby('SK_ID_CURR', as_index=False).mean().add_prefix('PREV_BUR_MEAN_').rename(columns = {'PREV_BUR_MEAN_SK_ID_CURR' : 'SK_ID_CURR'})
    df_data = df_data.merge(bureau_tmp, on='SK_ID_CURR', how='left')
    return df_data


# Étape 6 : Définition de l'index
def set_index(df_data):
    print('Set index...')
    df_data['SK_ID_CURR'] = df_data['SK_ID_CURR'].astype(int)
    df_data.set_index('SK_ID_CURR', inplace=True)
    return df_data


# Étape 7 : Encodage des variables catégorielles
def encode_categorical(df_data):
    print('Encodage categoriel...')
    days_week = {'MONDAY':0,'TUESDAY':1,'WEDNESDAY':2,'THURSDAY':3,'FRIDAY':4,'SATURDAY':5,'SUNDAY':6,}
    df_data['WEEKDAY_APPR_PROCESS_START'] = df_data['WEEKDAY_APPR_PROCESS_START'].map(days_week)
    le = LabelEncoder()
    for col in df_data.select_dtypes(include=['object']).columns:
        df_data[col] = le.fit_transform(df_data[col])
    df_data = pd.get_dummies(df_data)
    return df_data


# Étape 8 : Imputation des valeurs manquantes
def impute_missing(df_data):
    print('Imputation des NaN...')
    columns_to_impute = df_data.columns[df_data.columns != 'TARGET']
    imputer = SimpleImputer(strategy='median')
    df_data[columns_to_impute] = imputer.fit_transform(df_data[columns_to_impute])
    return df_data


# Étape 9 : Équilibrage de classes
def equilibre_classes(df_data):
    print('Équilibrage de classes...')
    sm = SMOTE(random_state=42)
    features, targets = sm.fit_resample(df_data.drop(columns=['TARGET']), df_data['TARGET'].ravel())
    df_data_balanced = pd.concat([pd.DataFrame(features), pd.DataFrame(targets, columns=['TARGET'])], axis=1)
    return df_data_balanced


# Étape 10 : Séparation des données en ensembles d'entraînement et de test
def split_data(df_data):
    print('Split train/test...')
    df_train = df_data[~df_data['TARGET'].isna()]
    y_train = df_train['TARGET']
    X_train = df_train.drop(columns=['TARGET'])
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X_train, y_train, train_size=0.8)
    return X_train, X_test, y_train, y_test


# Étape 11 : Standardisation des données
def standardize_data(X_train, X_test):
    print('Standardisation...')
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test




In [18]:
print('Debut Pre-process')
df_application_train, df_application_test, df_bureau, df_bureau_balance = load_data()
df_data, df_prod = prepare_data(df_application_train, df_application_test)
df_data = clean_data(df_data)
df_data = feature_engineering(df_data, df_bureau, df_bureau_balance)
df_data = gerer_outliers(df_data)
df_data = encode_categorical(df_data)
df_data = impute_missing(df_data)
df_data = set_index(df_data)
df_data = equilibre_classes(df_data)
X_train, X_test, y_train, y_test = split_data(df_data)
X_train, X_test = standardize_data(X_train, X_test)
print('X_train',X_train.shape, 'y_train',y_train.shape)
print('X_test',X_train.shape, 'y_test',y_train.shape)
print('Fin Pre-process')

Debut Pre-process
Chargement data...
Préparer les df...
Nettoyage...
Feature Engineering...


  mt_pre_prets = df_bureau_balance.groupby('SK_ID_BUREAU', as_index=False).mean().rename(columns = {'MONTHS_BALANCE': 'MONTHS_BALANCE_MEAN'})
  bureau_tmp = bureau_tmp.groupby('SK_ID_CURR', as_index=False).mean().add_prefix('PREV_BUR_MEAN_').rename(columns = {'PREV_BUR_MEAN_SK_ID_CURR' : 'SK_ID_CURR'})


Outliers...
Encodage categoriel...
Imputation des NaN...
Set index...
Équilibrage de classes...
Split train/test...
Standardisation...
X_train (150, 141) y_train (150,)
X_test (150, 141) y_test (150,)
Fin Pre-process


**MLFLOW**

**Models simples**

In [20]:
mlflow.set_experiment("Dummy_models")
mlflow.sklearn.autolog()

2023/08/23 18:26:03 INFO mlflow.tracking.fluent: Experiment with name 'Dummy_models' does not exist. Creating a new experiment.


**DummyRegressor**

In [21]:
with mlflow.start_run(run_name = 'regressor_model' ):
    dr = DummyRegressor()
    dr.fit(X_train,y_train)



**DummyClassifier**

In [147]:
# mlflow.sklearn.autolog(disable=True)
with mlflow.start_run(run_name = 'classifier_model'):
  dc = DummyClassifier()
  dc.fit(X_train, y_train)
  # mlflow.sklearn.log_model(dc, "dummy_classifier_model")



# **Lancer mlflow en mode interface**

In [148]:
ngrok. kill()
ngrok.set_auth_token("2UDkEnEAel94wMucTKrRzP54czT_3NFaKK2AAxFEfT1GSZbo")
print(ngrok.connect(addr="5000", proto="http", bind_tls=True).public_url)



https://b5f8-35-199-63-192.ngrok-free.app


In [149]:
!mlflow ui

[2023-08-23 15:46:05 +0000] [51630] [INFO] Starting gunicorn 21.2.0
[2023-08-23 15:46:05 +0000] [51630] [INFO] Listening at: http://127.0.0.1:5000 (51630)
[2023-08-23 15:46:05 +0000] [51630] [INFO] Using worker: sync
[2023-08-23 15:46:05 +0000] [51631] [INFO] Booting worker with pid: 51631
[2023-08-23 15:49:01 +0000] [51630] [INFO] Handling signal: int

Aborted!
[2023-08-23 15:49:01 +0000] [51631] [INFO] Worker exiting (pid: 51631)
[2023-08-23 15:49:02 +0000] [51630] [INFO] Shutting down: Master


# **Predictions**

In [129]:
# df_prod_save = df_prod.copy()
df_prod = df_prod_save

In [130]:
print('Debut Pre-process')
df_prod = clean_data(df_prod)
df_prod = feature_engineering(df_prod, df_bureau, df_bureau_balance)
df_prod = gerer_outliers(df_prod)
df_prod = encode_categorical(df_prod)
df_prod = impute_missing(df_prod)
df_prod = set_index(df_prod)
print('Fin Pre-process')

Debut Pre-process
Nettoyage...
Feature Engineering...
Outliers...
Encodage categoriel...
Imputation des NaN...
Set index...
Fin Pre-process


In [None]:
df_prod['TARGET_DR'] = mlflow.pyfunc.load_model('runs:/fd8149c9af0346a780e23725104a13c8/model').predict(df_prod.head(100))
df_prod['TARGET_DC'] = mlflow.pyfunc.load_model('runs:/0002a800d94f4d178e7dd817594f9926/dummy_classifier_model').predict(df_prod.head(100))
# df_prod['TARGET_DC'] = dc.predict(df_prod.head(100))

In [140]:
print(df_prod.TARGET_DR.value_counts())
print(df_prod.TARGET_DC.value_counts())

0.513333    100
Name: TARGET_DR, dtype: int64
1    100
Name: TARGET_DC, dtype: int64


In [None]:
profile = ProfileReport(
    df_application_train,
    title="Analyse exploratoire : Scoring Bank",
    html={'style':{'full_width':True}}
)

profile.to_file(output_file = "data-Scoring-Bank.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

In [39]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from sklearn.model_selection import GridSearchCV

In [24]:
print('Debut Pre-process')
# df_application_train, df_application_test, df_bureau, df_bureau_balance = load_data()
df_data, df_prod = prepare_data(df_application_train, df_application_test)
df_data = clean_data(df_data)
num_vars = df_data.drop(['SK_ID_CURR', 'TARGET'], axis=1).select_dtypes(exclude=['object']).columns
cat_vars = df_data.drop(['SK_ID_CURR', 'TARGET'], axis=1).drop(num_vars, axis=1).columns
df_data = feature_engineering(df_data, df_bureau, df_bureau_balance)
df_data = gerer_outliers(df_data)
df_data = encode_categorical(df_data)
df_data = impute_missing(df_data)
df_data = set_index(df_data)
df_data = equilibre_classes(df_data)
X_train, X_test, y_train, y_test = split_data(df_data)
# X_train, X_test = standardize_data(X_train, X_test)
print('X_train',X_train.shape, 'y_train',y_train.shape)
print('X_test',X_train.shape, 'y_test',y_train.shape)
print('Fin Pre-process')

Debut Pre-process
Préparer les df...
Nettoyage...
Feature Engineering...


2023/08/23 23:19:07 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '2e58ae51b5a6464d8414d4035b734dd5', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Outliers...
Encodage categoriel...
Imputation des NaN...
Set index...
Équilibrage de classes...
Split train/test...
X_train (150, 141) y_train (150,)
X_test (150, 141) y_test (150,)
Fin Pre-process


In [43]:
def eval_metrics(actual, pred):
  mape = mean_absolute_percentage_error(actual, pred) * 100
  rmse = np.sqrt(mean_squared_error(actual, pred))
  return mape, rmse


def build_pipeline(algo_ml,
                   impute_num = SimpleImputer(strategy="median"),
                   impute_var=SimpleImputer(strategy="constant", fill_value="Unknown"),
                   scaler = StandardScaler()):

  # Pipeline de transformation des variables numériques / catégorielles
  numeric_transformer = make_pipeline(impute_num,scaler)
  categorical_transformer = make_pipeline(impute_var,OneHotEncoder(handle_unknown="ignore") )

  # Combinaison des 2 étapes en un seul objet
  preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, num_vars),('cat', categorical_transformer, cat_vars)])

  # Pipeline final de Machine Learning
  model = Pipeline(steps=[('preprocessing', preprocessor), ('regressor', algo_ml)])

  return model

In [28]:
mlflow.sklearn.autolog() # Tracking automatique des modèles Sklearn

with mlflow.start_run(run_name = 'DummyRegressor'):

  # Générer le pipeline modèle
  model = build_pipeline(
                            algo_ml = DummyRegressor(),
                            impute_num=SimpleImputer(strategy="mean"),
                            impute_var=SimpleImputer(strategy="most_frequent"),
                            scaler=RobustScaler()
                        )

  # Entraîner le pipeline modèle
  model.fit(X_train,y_train)

  # Prédictions sur les données d'entraînement
  train_preds = model.predict(X_train)

  # Métriques de performances sur les données d'entraînement
  train_mape, train_rmse = eval_metrics(y_train, train_preds)

  # Prédictions sur les données de validation
#   val_preds = model.predict(X_val)

  # Métriques de performances sur les données de validation
#   val_mape, val_rmse = eval_metrics(y_val, val_preds)

  mlflow.log_metric("train_mape_experience", train_mape)
  mlflow.log_metric("train_rmse_experience", train_rmse)
#   mlflow.log_metric("val_mape_experience", val_mape)
#   mlflow.log_metric("val_rmse_experience", val_rmse)

  # Sauvegarde du modèle
  mlflow.sklearn.log_model(model, "rf_run2")

  print("Mean Absolute Percentage Error on Train Data:", train_mape) # optionnel : pour ne pas forcément avoir besoin de regarder l'interface graphique
#   print("Mean Absolute Percentage Error on Validation Data:", val_mape) # optionnel : pour ne pas forcément avoir besoin de regarder l'interface graphique
  print("Root Mean Squared Error on Train Data:", train_rmse) # optionnel : pour ne pas forcément avoir besoin de regarder l'interface graphique
#   print("Root Mean Squared Error on Validation Data:", val_rmse) # optionnel : pour ne pas forcément avoir besoin de regarder l'interface graphique
  print("Model run ID: ", mlflow.active_run().info.run_uuid) # optionnel : pour ne pas forcément avoir besoin de regarder l'interface graphique

  mlflow.end_run()

                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer()),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
       'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_B...`
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer()),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
       'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH',
       'DAYS...`
                ('robustscaler', RobustScaler())]), Index(['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 

Mean Absolute Percentage Error on Train Data: 1.1240984669916757e+17
Root Mean Squared Error on Train Data: 0.49959983987187184
Model run ID:  741400aafce540b5994421bf011f3e5a


In [44]:
# Définir la grille des hyperparamètres que vous souhaitez rechercher
param_grid = {
    'regressor__strategy': ['mean', 'median'],  # Choix de stratégie pour le DummyRegressor
    'preprocessing__num__simpleimputer__strategy': ['mean', 'median'],  # Stratégie pour la transformation numérique
    'preprocessing__num__robustscaler__quantile_range': [(25.0, 75.0), (10.0, 90.0)],  # Plage de quantile pour RobustScaler
}

# Créer le modèle de pipeline
model = build_pipeline(
    algo_ml=DummyRegressor(),
    impute_num=SimpleImputer(strategy="median"),
    impute_var=SimpleImputer(strategy="most_frequent"),
    scaler=RobustScaler()
)

# Créer l'objet GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')

# Lancer la recherche des meilleurs hyperparamètres
mlflow.set_experiment("Dummy_models_CV")
mlflow.sklearn.autolog()
with mlflow.start_run(run_name='DummyRegressor_GridSearchCV'):
    grid_search.fit(X_train, y_train)

# Obtenir les meilleurs hyperparamètres et le meilleur score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Meilleurs hyperparamètres:", best_params)
print("Meilleur score (RMSE):", np.sqrt(-best_score))

mlflow.end_run()

2023/08/24 01:34:24 INFO mlflow.tracking.fluent: Experiment with name 'Dummy_models_CV' does not exist. Creating a new experiment.
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('robustscaler',
                                                                   RobustScaler())]),
                                                  I...`
2023/08/24 01:34:33 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                  

Meilleurs hyperparamètres: {'preprocessing__num__robustscaler__quantile_range': (25.0, 75.0), 'preprocessing__num__simpleimputer__strategy': 'mean', 'regressor__strategy': 'mean'}
Meilleur score (RMSE): 0.5032395056034452


In [48]:
# Définir la grille des hyperparamètres que vous souhaitez rechercher pour LightGBM
param_grid = {
    'regressor__boosting_type': ['gbdt', 'dart'],  # Type de boosting
    'regressor__num_leaves': [10, 20, 30],  # Nombre maximal de feuilles dans un arbre
    'regressor__learning_rate': [0.01, 0.1, 0.2],  # Taux d'apprentissage
}

# Créer le modèle de pipeline avec LightGBM
model = build_pipeline(
    algo_ml=lgb.LGBMRegressor(),
    impute_num=SimpleImputer(strategy="median"),
    impute_var=SimpleImputer(strategy="most_frequent"),
    scaler=RobustScaler()
)

# Créer l'objet GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')

# Lancer la recherche des meilleurs hyperparamètres avec MLflow
mlflow.set_experiment("LightGBM_models_CV")
mlflow.sklearn.autolog()
with mlflow.start_run(run_name='LightGBM_GridSearchCV'):
    grid_search.fit(X_train, y_train)

# Obtenir les meilleurs hyperparamètres et le meilleur score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Meilleurs hyperparamètres:", best_params)
print("Meilleur score (RMSE):", np.sqrt(-best_score))

# Mettre fin à la run MLflow
mlflow.end_run()


2023/08/24 01:40:50 INFO mlflow.tracking.fluent: Experiment with name 'LightGBM_models_CV' does not exist. Creating a new experiment.
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('robustscaler',
                                                                   RobustScaler())]),
                                                  I...`


[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1629
[LightGBM] [Info] Number of data points in the train set: 120, number of used features: 94
[LightGBM] [Info] Start training from score 0.516667
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1714
[LightGBM] [Info] Number of data points in the train set: 120, number of used features: 93
[LightGBM] [Info] Start training from score 0.550000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1558
[LightGBM] [Info] Number of data points in the train set: 120, number of used features: 94
[LightGBM] [Info] Start training from score 0.491667
You can 

2023/08/24 01:41:18 INFO mlflow.sklearn.utils: Logging the 5 best runs, 13 runs will be omitted.
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
       'AMT_GOODS_PRICE', 'REGION_POPULATION_R...`
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('robustscaler',
                                                  RobustScaler())]),
                                 Index(['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
       'AMT_GOODS_PRICE', 'REGION_POPULATION_RELATIVE', 'DAYS_BIR...

Meilleurs hyperparamètres: {'regressor__boosting_type': 'dart', 'regressor__learning_rate': 0.2, 'regressor__num_leaves': 10}
Meilleur score (RMSE): 0.15892309900772336


In [53]:
y_pred

array([ 0.93205267,  0.20326037,  0.96178655,  0.0727333 ,  0.00226983,
       -0.0073993 ,  1.04994111,  0.04160431,  0.9399758 ,  0.95631673,
       -0.01662205,  0.15638765,  0.03482562,  0.01885679,  0.96691291,
        1.02353664,  0.95546018,  0.49007005,  0.57911545,  0.10097807,
       -0.08609076,  0.04746153,  0.02683697,  0.23390572,  1.02877643,
        0.239884  ,  0.91020784,  1.02959086,  0.97925003,  0.39243655,
        0.059576  ,  0.01149623, -0.12332876,  0.37619071,  0.00310805,
        0.96775603, -0.09387094, -0.12404547])

In [54]:

# Définir la grille des hyperparamètres que vous souhaitez rechercher pour LightGBM
param_grid = {
    'regressor__boosting_type': ['gbdt', 'dart'],  # Type de boosting
    'regressor__num_leaves': [10, 20, 30],  # Nombre maximal de feuilles dans un arbre
    'regressor__learning_rate': [0.01, 0.1, 0.2],  # Taux d'apprentissage
}

# Créer le modèle de pipeline avec LightGBM
model = build_pipeline(
    algo_ml=lgb.LGBMRegressor(),
    impute_num=SimpleImputer(strategy="median"),
    impute_var=SimpleImputer(strategy="most_frequent"),
    scaler=RobustScaler()
)

# Créer l'objet GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')

# Lancer la recherche des meilleurs hyperparamètres avec MLflow
mlflow.set_experiment("LightGBM_models_CV")
mlflow.sklearn.autolog()
with mlflow.start_run(run_name='LightGBM_GridSearchCV'):
    grid_search.fit(X_train, y_train)

    # Prédire les étiquettes sur l'ensemble de test
    y_pred = grid_search.predict(X_test)

    # Calculer le score F1
    f1 = f1_score(y_test, y_pred)

    # Calculer le score AUC
    auc = roc_auc_score(y_test, y_pred)

    # Obtenir les meilleurs hyperparamètres et le meilleur score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print("Meilleurs hyperparamètres:", best_params)
    print("Meilleur score (RMSE):", np.sqrt(-best_score))
    print("Score F1:", f1)
    print("Score AUC:", auc)

# Mettre fin à la run MLflow
mlflow.end_run()

                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('robustscaler',
                                                                   RobustScaler())]),
                                                  I...`


ValueError: ignored

In [49]:
ngrok. kill()
ngrok.set_auth_token("2UDkEnEAel94wMucTKrRzP54czT_3NFaKK2AAxFEfT1GSZbo")
print(ngrok.connect(addr="5000", proto="http", bind_tls=True).public_url)



https://3ccd-34-73-111-90.ngrok-free.app


In [50]:
!mlflow ui

[2023-08-24 01:42:32 +0000] [114551] [INFO] Starting gunicorn 20.1.0
[2023-08-24 01:42:32 +0000] [114551] [INFO] Listening at: http://127.0.0.1:5000 (114551)
[2023-08-24 01:42:32 +0000] [114551] [INFO] Using worker: sync
[2023-08-24 01:42:32 +0000] [114556] [INFO] Booting worker with pid: 114556
[2023-08-24 01:42:32 +0000] [114557] [INFO] Booting worker with pid: 114557
[2023-08-24 01:42:32 +0000] [114558] [INFO] Booting worker with pid: 114558
[2023-08-24 01:42:32 +0000] [114559] [INFO] Booting worker with pid: 114559
[2023-08-24 01:46:27 +0000] [114551] [INFO] Handling signal: int
[2023-08-24 01:46:27 +0000] [114556] [INFO] Worker exiting (pid: 114556)
[2023-08-24 01:46:27 +0000] [114558] [INFO] Worker exiting (pid: 114558)
[2023-08-24 01:46:27 +0000] [114557] [INFO] Worker exiting (pid: 114557)
[2023-08-24 01:46:27 +0000] [114559] [INFO] Worker exiting (pid: 114559)

Aborted!
[2023-08-24 01:46:28 +0000] [114551] [INFO] Shutting down: Master


In [31]:
df_prod_save = df_prod.copy()
# df_prod = df_prod_save

In [35]:
df_prod = clean_data(df_prod)
num_vars = df_prod.drop(['SK_ID_CURR'], axis=1).select_dtypes(exclude=['object']).columns
cat_vars = df_prod.drop(['SK_ID_CURR'], axis=1).drop(num_vars, axis=1).columns
df_prod = feature_engineering(df_prod, df_bureau, df_bureau_balance)
df_prod = gerer_outliers(df_prod)
df_prod = encode_categorical(df_prod)
df_prod = impute_missing(df_prod)
df_prod = set_index(df_prod)
# df_prod = equilibre_classes(df_prod)
# X_train, X_test, y_train, y_test = split_data(df_prod)

In [36]:
df_prod['TARGET_DR'] = mlflow.pyfunc.load_model('runs:/741400aafce540b5994421bf011f3e5a/rf_run2').predict(df_prod.head(100))
# df_prod['TARGET_DC'] = dc.predict(df_prod.head(100))

0.52    100
Name: TARGET_DR, dtype: int64


In [37]:
print(df_prod.TARGET_DR.value_counts())

0.52    100
Name: TARGET_DR, dtype: int64


tests git

In [1]:
print("OK git")

OK git


In [None]:
print("FIN")

FIN
