<a href="https://colab.research.google.com/github/adamxkadd/Projet_7/blob/main/pretraitement_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**PROJET 7 : Implémentez un modèle de scoring**  
Notebook 1 Dashboard et API

In [None]:
# pip install pydantic==2.0a1

In [2]:
# pip install mlflow==1.24.0

In [4]:
# pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-6.0.0.tar.gz (681 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m681.2/681.2 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyngrok
  Building wheel for pyngrok (setup.py) ... [?25l[?25hdone
  Created wheel for pyngrok: filename=pyngrok-6.0.0-py3-none-any.whl size=19867 sha256=218494d6974d899e2f312900d89dbbe027adabea57e7de92d9a61bd6575fdc03
  Stored in directory: /root/.cache/pip/wheels/5c/42/78/0c3d438d7f5730451a25f7ac6cbf4391759d22a67576ed7c2c
Successfully built pyngrok
Installing collected packages: pyngrok
Successfully installed pyngrok-6.0.0


In [13]:
import pandas as pd
import numpy as np
import joblib as jl
import mlflow


from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.dummy import DummyRegressor, DummyClassifier
from sklearn import model_selection
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.metrics import mean_squared_error, r2_score


from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from pyngrok import ngrok
import mlflow.sklearn

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [121]:
# Étape 1 : Chargement des données
def load_data():
    print('Chargement data...')
    df_application_train = pd.read_csv('/content/drive/My Drive/DataScientist/Projet_7/application_train.csv')
    df_application_test = pd.read_csv('/content/drive/My Drive/DataScientist/Projet_7/application_test.csv')
    df_bureau = pd.read_csv('/content/drive/My Drive/DataScientist/Projet_7/bureau.csv')
    df_bureau_balance = pd.read_csv('/content/drive/My Drive/DataScientist/Projet_7/bureau_balance.csv')
    return df_application_train, df_application_test, df_bureau, df_bureau_balance


# Étape 2 : Préparation des DataFrames
def prepare_data(df_application_train, df_application_test):
    print('Préparer les df...')
    df_data = df_application_train.head(100).copy()
    df_prod = df_application_test.head(100).copy() # Data pour production (nouveaux clients)
    return df_data, df_prod


# Étape 3 : Nettoyage des données
def clean_data(df_data):
    print('Nettoyage...')
    df_data = df_data[df_data.CODE_GENDER != 'XNA']
    df_data = df_data[df_data.NAME_INCOME_TYPE != 'Maternity leave']
    df_data = df_data[df_data.NAME_FAMILY_STATUS != 'Unknown']
    return df_data


# Étape 4 : Gestion des outliers
def gerer_outliers(df_data):
    print('Outliers...')
    outliers = df_data.loc[df_data.DAYS_EMPLOYED > 366, 'DAYS_EMPLOYED'].unique()
    df_data.DAYS_EMPLOYED.replace(outliers, np.nan, inplace=True)
    return df_data


# Étape 5 : Feature Engineering
def feature_engineering(df_data, df_bureau, df_bureau_balance):
    print('Feature Engineering...')
    df_data['PROPORTION_LIFE_EMPLOYED'] = df_data['DAYS_EMPLOYED'] / df_data['DAYS_BIRTH'] # Calcul du rapport de jours employés par rapport à l'âge
    df_data['INCOME_TO_CREDIT_RATIO'] = df_data['AMT_INCOME_TOTAL'] / df_data['AMT_CREDIT'] # Calcul du rapport revenu/crédit
    df_data['INCOME_TO_ANNUITY_RATIO'] = df_data['AMT_INCOME_TOTAL'] / df_data['AMT_ANNUITY'] # Calcul du rapport revenu/annuité
    df_data['INCOME_TO_ANNUITY_RATIO_BY_AGE'] = df_data['INCOME_TO_ANNUITY_RATIO'] * df_data['DAYS_BIRTH'] # Calcul du rapport revenu/annuité par âge
    df_data['CREDIT_TO_ANNUITY_RATIO'] = df_data['AMT_CREDIT'] / df_data['AMT_ANNUITY'] # Calcul du rapport crédit/annuité
    df_data['CREDIT_TO_ANNUITY_RATIO_BY_AGE'] = df_data['CREDIT_TO_ANNUITY_RATIO'] * df_data['DAYS_BIRTH'] # Calcul du rapport crédit/annuité par âge
    df_data['INCOME_TO_FAMILYSIZE_RATIO'] = df_data['AMT_INCOME_TOTAL'] / df_data['CNT_FAM_MEMBERS'] # Calcul du rapport revenu/taille de la familledf_data.DAYS_EMPLOYED.replace(outliers, np.nan, inplace=True)
    nb_pre_prets = df_bureau.groupby('SK_ID_CURR', as_index=False)['SK_ID_BUREAU'].count().rename(columns = {'SK_ID_BUREAU': 'NB_PRE_LOANS'})
    df_data = df_data.merge(nb_pre_prets, on='SK_ID_CURR', how='left')
    mt_pre_prets = df_bureau_balance.groupby('SK_ID_BUREAU', as_index=False).mean().rename(columns = {'MONTHS_BALANCE': 'MONTHS_BALANCE_MEAN'})
    bureau_tmp = df_bureau.merge(mt_pre_prets, on='SK_ID_BUREAU', how='left').drop('SK_ID_BUREAU', axis=1)
    bureau_tmp = bureau_tmp.groupby('SK_ID_CURR', as_index=False).mean().add_prefix('PREV_BUR_MEAN_').rename(columns = {'PREV_BUR_MEAN_SK_ID_CURR' : 'SK_ID_CURR'})
    df_data = df_data.merge(bureau_tmp, on='SK_ID_CURR', how='left')
    return df_data


# Étape 6 : Définition de l'index
def set_index(df_data):
    print('Set index...')
    df_data['SK_ID_CURR'] = df_data['SK_ID_CURR'].astype(int)
    df_data.set_index('SK_ID_CURR', inplace=True)
    return df_data


# Étape 7 : Encodage des variables catégorielles
def encode_categorical(df_data):
    print('Encodage categoriel...')
    days_week = {'MONDAY':0,'TUESDAY':1,'WEDNESDAY':2,'THURSDAY':3,'FRIDAY':4,'SATURDAY':5,'SUNDAY':6,}
    df_data['WEEKDAY_APPR_PROCESS_START'] = df_data['WEEKDAY_APPR_PROCESS_START'].map(days_week)
    le = LabelEncoder()
    for col in df_data.select_dtypes(include=['object']).columns:
        df_data[col] = le.fit_transform(df_data[col])
    df_data = pd.get_dummies(df_data)
    return df_data


# Étape 8 : Imputation des valeurs manquantes
def impute_missing(df_data):
    print('Imputation des NaN...')
    columns_to_impute = df_data.columns[df_data.columns != 'TARGET']
    imputer = SimpleImputer(strategy='median')
    df_data[columns_to_impute] = imputer.fit_transform(df_data[columns_to_impute])
    return df_data


# Étape 9 : Équilibrage de classes
def equilibre_classes(df_data):
    print('Équilibrage de classes...')
    sm = SMOTE(random_state=42)
    features, targets = sm.fit_resample(df_data.drop(columns=['TARGET']), df_data['TARGET'].ravel())
    df_data_balanced = pd.concat([pd.DataFrame(features), pd.DataFrame(targets, columns=['TARGET'])], axis=1)
    return df_data_balanced


# Étape 10 : Séparation des données en ensembles d'entraînement et de test
def split_data(df_data):
    print('Split train/test...')
    df_train = df_data[~df_data['TARGET'].isna()]
    y_train = df_train['TARGET']
    X_train = df_train.drop(columns=['TARGET'])
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X_train, y_train, train_size=0.8)
    return X_train, X_test, y_train, y_test


# Étape 11 : Standardisation des données
def standardize_data(X_train, X_test):
    print('Standardisation...')
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test

In [122]:
print('Debut Pre-process')
df_application_train, df_application_test, df_bureau, df_bureau_balance = load_data()
df_data, df_prod = prepare_data(df_application_train, df_application_test)
df_data = clean_data(df_data)
df_data = feature_engineering(df_data, df_bureau, df_bureau_balance)
df_data = gerer_outliers(df_data)
df_data = encode_categorical(df_data)
df_data = impute_missing(df_data)
df_data = set_index(df_data)
df_data = equilibre_classes(df_data)
X_train, X_test, y_train, y_test = split_data(df_data)
X_train, X_test = standardize_data(X_train, X_test)
print('X_train',X_train.shape, 'y_train',y_train.shape)
print('X_test',X_train.shape, 'y_test',y_train.shape)
print('Fin Pre-process')

Debut Pre-process
Chargement data...
Préparer les df...
Nettoyage...
Feature Engineering...
Outliers...
Encodage categoriel...
Imputation des NaN...
Set index...
Équilibrage de classes...
Split train/test...
Standardisation...
X_train (150, 141) y_train (150,)
X_test (150, 141) y_test (150,)
Fin Pre-process


**MLFLOW**

**Models simples**

In [123]:
mlflow.set_experiment("Dummy_models")
mlflow.sklearn.autolog()



**DummyRegressor**

In [124]:
with mlflow.start_run(run_name = 'regressor_model' ):
    dr = DummyRegressor()
    dr.fit(X_train,y_train)

**DummyClassifier**

In [125]:
mlflow.sklearn.autolog(disable=True)
with mlflow.start_run(run_name = 'classifier_model'):
  dc = DummyClassifier()
  dc.fit(X_train, y_train)
  mlflow.sklearn.log_model(dc, "dummy_classifier_model")

# **Lancer mlflow en mode interface**

In [134]:
ngrok. kill()
ngrok.set_auth_token("2UDkEnEAel94wMucTKrRzP54czT_3NFaKK2AAxFEfT1GSZbo")
print(ngrok.connect(addr="5000", proto="http", bind_tls=True).public_url)

!mlflow ui



https://7eee-35-199-63-192.ngrok-free.app
[2023-08-23 15:00:04 +0000] [40453] [INFO] Starting gunicorn 21.2.0
[2023-08-23 15:00:04 +0000] [40453] [INFO] Listening at: http://127.0.0.1:5000 (40453)
[2023-08-23 15:00:04 +0000] [40453] [INFO] Using worker: sync
[2023-08-23 15:00:04 +0000] [40454] [INFO] Booting worker with pid: 40454
[2023-08-23 15:01:23 +0000] [40453] [INFO] Handling signal: int

Aborted!
[2023-08-23 15:01:23 +0000] [40454] [INFO] Worker exiting (pid: 40454)
[2023-08-23 15:01:23 +0000] [40453] [INFO] Shutting down: Master


# **Predictions**

In [129]:
# df_prod_save = df_prod.copy()
df_prod = df_prod_save

In [130]:
print('Debut Pre-process')
df_prod = clean_data(df_prod)
df_prod = feature_engineering(df_prod, df_bureau, df_bureau_balance)
df_prod = gerer_outliers(df_prod)
df_prod = encode_categorical(df_prod)
df_prod = impute_missing(df_prod)
df_prod = set_index(df_prod)
print('Fin Pre-process')

Debut Pre-process
Nettoyage...
Feature Engineering...
Outliers...
Encodage categoriel...
Imputation des NaN...
Set index...
Fin Pre-process


In [None]:
df_prod['TARGET_DR'] = mlflow.pyfunc.load_model('runs:/fd8149c9af0346a780e23725104a13c8/model').predict(df_prod.head(100))
df_prod['TARGET_DC'] = mlflow.pyfunc.load_model('runs:/0002a800d94f4d178e7dd817594f9926/dummy_classifier_model').predict(df_prod.head(100))

In [140]:
print(df_prod.TARGET_DR.value_counts())
print(df_prod.TARGET_DC.value_counts())

0.513333    100
Name: TARGET_DR, dtype: int64
1    100
Name: TARGET_DC, dtype: int64


In [136]:
df_prod[['TARGET_DR','TARGET_DC']]

Unnamed: 0_level_0,TARGET_DR,TARGET_DC
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1
100001,0.513333,1
100005,0.513333,1
100013,0.513333,1
100028,0.513333,1
100038,0.513333,1
...,...,...
100752,0.513333,1
100753,0.513333,1
100754,0.513333,1
100760,0.513333,1


In [132]:
df_prod.shape

(100, 141)

In [133]:
df_prod_save.shape

(100, 121)

In [None]:
print("FIN")

FIN
