# Import

In [None]:
import warnings
warnings.simplefilter(action='ignore')

import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import model_selection

from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn import metrics

import pickle

import mlflow.sklearn
from mlflow.models.signature import infer_signature

mlflow.set_tracking_uri("http://127.0.0.1:5000")
import pickle
import plotly.offline as pyo
import plotly.express as px
# Set notebook mode to work in offline
pyo.init_notebook_mode()

# Fonction d'évaluation

In [2]:
def F_alpha_score(y_true, y_pred):
    alpha = 0.1
    
    
    tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred).ravel()    
        
    f_alpha_score = ( 2 * tp) / (2 * tp + alpha *fp +  fn)
    f_alpha_score = float("{:.5f}".format(f_alpha_score)) 
    
    return f_alpha_score

# pour LGBM
def F_alpha_score_lbgm(y_true, y_pred, printScore = False):
    f_alpha_score = F_alpha_score(y_true, y_pred)
    is_higher_better = True
    return "f10score", f_alpha_score, is_higher_better 

def F_alpha_score_mlflow(eval_df, _builtin_metrics):
    return F_alpha_score(eval_df["target"], eval_df["prediction"])
        
def F_alpha_score_bayes_search(estimator, X, y):
    y_pred = estimator.predict(X)
    f_alpha_score = F_alpha_score(y, y_pred)
    return f_alpha_score
    

def eval_metrics(y_test, y_pred):
    recall = metrics.recall_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred)
    f1_score = metrics.f1_score(y_test, y_pred)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    f10_score = F_alpha_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, y_pred)
    
    return recall, precision, f1_score, accuracy, f10_score, auc

# def conf_mat_transform(y_true, y_pred):
#     conf_mat = metrics.confusion_matrix(y_true, y_pred)

#     labels = pd.Series(y_true, name="y_true").to_frame()
#     labels['y_pred'] = y_pred
#     labels['y_pred_transform'] = labels['y_pred'].apply(lambda x: corresp[x])

#     return labels['y_pred_transform']


# def DisplayConfMatrix(conf_mat, display_labels):
#     cm_display = metrics.ConfusionMatrixDisplay(
#         confusion_matrix=conf_mat, display_labels=display_labels)
#     fig, ax = plt.subplots(figsize=(15, 10))
#     cm_display.plot(ax=ax)
#     ax.tick_params(axis='x', labelrotation=45)
#     plt.show()
    
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

In [3]:
def MlFlow_log_score(modele, nom_du_modele, run_name, X_test, y_test):

    with mlflow.start_run(run_name= run_name):

        signature = infer_signature(X_test, y_test)

        mlflow.sklearn.log_model(
            modele, "model", registered_model_name=nom_du_modele, signature=signature)

        model_uri = mlflow.get_artifact_uri("model")

        f_alpha_score_metric = mlflow.models.make_metric(
            eval_fn=F_alpha_score_mlflow, greater_is_better=True)
        eval_data = valid_x.copy()
        eval_data["TARGET"] = valid_y

        # Evaluate the logged model
        result = mlflow.evaluate(model_uri, eval_data, targets="TARGET", model_type="classifier", evaluators=[
                                 "default"], custom_metrics=[f_alpha_score_metric])

# Fonction de traitement  

In [4]:
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

In [5]:
# Preprocess application_train.csv and application_test.csv
def application_train_test(num_rows = None, nan_as_category = False):
    # Read data and merge
    df = pd.read_csv('input/application_train.csv', nrows= num_rows)
#     test_df = pd.read_csv('input/application_test.csv', nrows= num_rows)
#     print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))
#     df = pd.concat([df, test_df], ignore_index=True)
    # Optional: Remove 4 applications with XNA CODE_GENDER (train set)
    df = df[df['CODE_GENDER'] != 'XNA']
    
    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], uniques = pd.factorize(df[bin_feature])
    # Categorical features with One-Hot encode
    df, cat_cols = one_hot_encoder(df, nan_as_category)
    
    # NaN values for DAYS_EMPLOYED: 365.243 -> nan
    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
    # Some simple new features (percentages)
    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
#     del test_df
    gc.collect()
    return df

In [6]:
# Preprocess bureau.csv and bureau_balance.csv
def bureau_and_balance(num_rows = None, nan_as_category = True):
    bureau = pd.read_csv('input/bureau.csv', nrows = num_rows)
    bb = pd.read_csv('input/bureau_balance.csv', nrows = num_rows)
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)
    
    # Bureau balance: Perform aggregations and merge with bureau.csv
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    for col in bb_cat:
        bb_aggregations[col] = ['mean']
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau.drop(['SK_ID_BUREAU'], axis=1, inplace= True)
    del bb, bb_agg
    gc.collect()
    
    # Bureau and bureau_balance numeric features
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    # Bureau and bureau_balance categorical features
    cat_aggregations = {}
    for cat in bureau_cat: cat_aggregations[cat] = ['mean']
    for cat in bb_cat: cat_aggregations[cat + "_MEAN"] = ['mean']
    
    bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])
    # Bureau: Active credits - using only numerical aggregations
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
    active_agg.columns = pd.Index(['ACTIVE_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')
    del active, active_agg
    gc.collect()
    # Bureau: Closed credits - using only numerical aggregations
    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
    closed_agg.columns = pd.Index(['CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
    del closed, closed_agg, bureau
    gc.collect()
    return bureau_agg

In [7]:
# Preprocess previous_applications.csv
def previous_applications(num_rows = None, nan_as_category = True):
    prev = pd.read_csv('input/previous_application.csv', nrows = num_rows)
    prev, cat_cols = one_hot_encoder(prev, nan_as_category= True)
    # Days 365.243 values -> nan
    prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
    prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
    prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
    prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
    prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
    # Add feature: value ask / value received percentage
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'],
        'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
    # Previous applications categorical features
    cat_aggregations = {}
    for cat in cat_cols:
        cat_aggregations[cat] = ['mean']
    
    prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    approved_agg.columns = pd.Index(['APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')
    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index(['REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
    del refused, refused_agg, approved, approved_agg, prev
    gc.collect()
    return prev_agg

In [8]:
# Preprocess POS_CASH_balance.csv
def pos_cash(num_rows = None, nan_as_category = True):
    pos = pd.read_csv('input/POS_CASH_balance.csv', nrows = num_rows)
    pos, cat_cols = one_hot_encoder(pos, nan_as_category= True)
    # Features
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    
    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Count pos cash accounts
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
    del pos
    gc.collect()
    return pos_agg

In [9]:
# Preprocess installments_payments.csv
def installments_payments(num_rows = None, nan_as_category = True):
    ins = pd.read_csv('input/installments_payments.csv', nrows = num_rows)
    ins, cat_cols = one_hot_encoder(ins, nan_as_category= True)
    # Percentage and difference paid in each installment (amount paid and installment value)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
    # Days past due and days before due (no negative values)
    ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = ins['DPD'].apply(lambda x: x if x > 0 else 0)
    ins['DBD'] = ins['DBD'].apply(lambda x: x if x > 0 else 0)
    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum'],
        'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Count installments accounts
    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()
    del ins
    gc.collect()
    return ins_agg

In [10]:
# Preprocess credit_card_balance.csv
def credit_card_balance(nan_as_category = True):
    cc = pd.read_csv('input/credit_card_balance.csv')
    cc, cat_cols = one_hot_encoder(cc, nan_as_category= True)
    # General aggregations
    cc.drop(['SK_ID_PREV'], axis= 1, inplace = True)
    cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    # Count credit card lines
    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
    del cc
    gc.collect()
    return cc_agg

In [11]:
a = credit_card_balance() 
a.info()

<class 'pandas.core.frame.DataFrame'>
Index: 103558 entries, 100006 to 456250
Columns: 141 entries, CC_MONTHS_BALANCE_MIN to CC_COUNT
dtypes: bool(16), float64(101), int64(24)
memory usage: 101.1 MB


# Classification

In [12]:
def train_models(df, log_reg = True):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train_df[feats], train_df['TARGET'], train_size=0.8) 
    
    print("Starting LightGBM. Train shape: {}, valid shape: {}".format(train_x.shape, train_y.shape))
    
     
    train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train_df[feats], train_df['TARGET'], train_size=0.8) 

    # Dummy classifier
    dum_clf = DummyClassifier(strategy="most_frequent")
    dum_clf.fit(train_x, train_y)
    
    # Logistic Regression
    log_clf = LogisticRegression(max_iter=1000, random_state=42)
    if(log_reg):
        log_clf.fit(train_x, train_y)
    
    
    # LightGBM
    lgbm_clf = LGBMClassifier(random_state=42)

    lgbm_clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
        eval_metric= 'F_alpha_score_lbgm', verbose= 200, early_stopping_rounds= 200)
    
    
    return dum_clf, log_clf, lgbm_clf, valid_x, valid_y

In [13]:
def train_lgbm(df):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train_df[feats], train_df['TARGET'], train_size=0.8) 

    # LightGBM
    lgbm_clf = LGBMClassifier(random_state=42)

    lgbm_clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
        eval_metric= 'F_alpha_score_lbgm', verbose= 200, early_stopping_rounds= 200)
    
    
    return lgbm_clf, valid_x, valid_y

In [14]:
# LightGBM GBDT with KFold or Stratified KFold
# Parameters from Tilii kernel: https://www.kaggle.com/tilii7/olivier-lightgbm-parameters-by-bayesian-opt/code
def kfold_lightgbm(df, num_folds, stratified = False, debug= False):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    for  n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        # LightGBM parameters found by Bayesian optimization
        clf = LGBMClassifier(
            nthread=4,
            n_estimators=5000,
            learning_rate=0.02,
            num_leaves=34,
            colsample_bytree=0.9497036,
            subsample=0.8715623,
            max_depth=8,
            reg_alpha=0.041545473,
            reg_lambda=0.0735294,
            min_split_gain=0.0222415,
            min_child_weight=39.3259775,
            silent=-1,
            verbose=-1, )

        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric= 'F_alpha_score_lbgm', verbose= 200, early_stopping_rounds= 200)
       
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    return clf

In [15]:
with timer("Process application train and test"):
    df = application_train_test()    
    
    # Suppression des valeurs null, afin de pouvoir lancer la log reg
        # Colonnes
    null_values = (df.isnull().sum() / len(df)) * 100
    columns_to_delete = null_values[null_values > 30].index
    df_clean = df.drop(columns=columns_to_delete)
        # Lignes
    df_clean.dropna(inplace=True)
    
    dum_clf, log_clf, lgbm_clf, valid_x, valid_y = train_models(df_clean)
    
    MlFlow_log_score(dum_clf, "Dummy classifier", "dummy", valid_x, valid_y)
    
    MlFlow_log_score(log_clf, "Log reg classifier", "logistic regression", valid_x, valid_y)
    
    MlFlow_log_score(lgbm_clf, "LGBM classifier", "lgbm: application_train", valid_x, valid_y)

Starting LightGBM. Train shape: (160870, 200), valid shape: (160870,)
Training until validation scores don't improve for 200 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's binary_logloss: 0.233317	valid_1's binary_logloss: 0.249975


Registered model 'Dummy classifier' already exists. Creating a new version of this model...
2023/09/27 20:19:26 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: Dummy classifier, version 15
Created version '15' of model 'Dummy classifier'.


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

2023/09/27 20:19:27 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/09/27 20:19:27 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Registered model 'Log reg classifier' already exists. Creating a new version of this model...
2023/09/27 20:19:32 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: Log reg classifier, version 15
Created version '15' of model 'Log reg classifier'.


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

2023/09/27 20:19:32 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/09/27 20:19:32 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.
Registered model 'LGBM classifier' already exists. Creating a new version of this model...
2023/09/27 20:19:39 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: LGBM classifier, version 84
Created version '84' of model 'LGBM classifier'.


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

2023/09/27 20:19:39 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/09/27 20:19:39 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.


Process application train and test - done in 60s


<Figure size 1050x700 with 0 Axes>

In [16]:
with timer("Process bureau and bureau_balance"):
    bureau = bureau_and_balance()
               
    df = df.join(bureau, how='left', on='SK_ID_CURR')
    
    del bureau
    gc.collect()    
        
    lgbm_clf, valid_x, valid_y = train_lgbm(df)    
    
    MlFlow_log_score(lgbm_clf, "LGBM classifier", "lgbm: add bureau and balance", valid_x, valid_y)

Training until validation scores don't improve for 200 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's binary_logloss: 0.22553	valid_1's binary_logloss: 0.248972


Registered model 'LGBM classifier' already exists. Creating a new version of this model...
2023/09/27 20:20:27 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: LGBM classifier, version 85
Created version '85' of model 'LGBM classifier'.


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

2023/09/27 20:20:28 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/09/27 20:20:29 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.


Process bureau and bureau_balance - done in 52s


In [17]:
with timer("Process previous_applications"):
    prev = previous_applications()

    df = df.join(prev, how='left', on='SK_ID_CURR')
    del prev
    gc.collect()
    
    lgbm_clf, valid_x, valid_y = train_lgbm(df)    
    
    MlFlow_log_score(lgbm_clf, "LGBM classifier", "lgbm: add previous applications",  valid_x, valid_y)

Training until validation scores don't improve for 200 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's binary_logloss: 0.222144	valid_1's binary_logloss: 0.244543


Registered model 'LGBM classifier' already exists. Creating a new version of this model...
2023/09/27 20:21:56 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: LGBM classifier, version 86
Created version '86' of model 'LGBM classifier'.


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

2023/09/27 20:21:57 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/09/27 20:22:00 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.


Process previous_applications - done in 94s


In [18]:
 with timer("Process POS-CASH balance"):
        pos = pos_cash()
        print("Pos-cash balance df shape:", pos.shape)
        df = df.join(pos, how='left', on='SK_ID_CURR')
        del pos
        gc.collect()
    
        lgbm_clf, valid_x, valid_y = train_lgbm(df)    

        MlFlow_log_score(lgbm_clf, "LGBM classifier", "lgbm: add POS-CASH balance", valid_x, valid_y)

Pos-cash balance df shape: (337252, 18)
Training until validation scores don't improve for 200 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's binary_logloss: 0.222404	valid_1's binary_logloss: 0.239369


Registered model 'LGBM classifier' already exists. Creating a new version of this model...
2023/09/27 20:23:16 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: LGBM classifier, version 87
Created version '87' of model 'LGBM classifier'.


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

2023/09/27 20:23:17 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/09/27 20:23:20 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.


Process POS-CASH balance - done in 80s


In [19]:
with timer("Process installments payments"):
        ins = installments_payments()
        
        df = df.join(ins, how='left', on='SK_ID_CURR')
        del ins
        gc.collect()
    
        lgbm_clf, valid_x, valid_y = train_lgbm(df)    

        MlFlow_log_score(lgbm_clf, "LGBM classifier", "lgbm: add installments payments", valid_x, valid_y)

Training until validation scores don't improve for 200 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's binary_logloss: 0.221953	valid_1's binary_logloss: 0.234052


Registered model 'LGBM classifier' already exists. Creating a new version of this model...
2023/09/27 20:24:45 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: LGBM classifier, version 88
Created version '88' of model 'LGBM classifier'.


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

2023/09/27 20:24:46 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/09/27 20:24:49 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.


Process installments payments - done in 90s


In [20]:
with timer("Process credit card balance"):
        cc = credit_card_balance()
                
        df = df.join(cc, how='left', on='SK_ID_CURR')
        
        # Nos colonnes bools sont transformées en object suite au merge. On rétabli la vérité
        boolean_columns = [col for col in df.columns if pd.api.types.is_object_dtype(df[col])]
        df[boolean_columns] = df[boolean_columns].astype(bool)
        
        del cc
        gc.collect()
    
        lgbm_clf, valid_x, valid_y = train_lgbm(df)    

        MlFlow_log_score(lgbm_clf, "LGBM classifier", "lgbm: add credit card balance", valid_x, valid_y)

Training until validation scores don't improve for 200 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's binary_logloss: 0.220345	valid_1's binary_logloss: 0.237554


Registered model 'LGBM classifier' already exists. Creating a new version of this model...
2023/09/27 20:26:30 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: LGBM classifier, version 89
Created version '89' of model 'LGBM classifier'.


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

2023/09/27 20:26:31 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/09/27 20:26:35 INFO mlflow.models.evaluation.default_evaluator: The evaluation dataset is inferred as binary dataset, positive label is 1, negative label is 0.


Process credit card balance - done in 106s


In [25]:
pickle.dump(lgbm_clf, open('lgbm_client_scoring.pkl', 'wb'))
df.to_csv('home_credit_data.csv',index=False)