In [1]:
%%capture
# Run the preparation notebook with its output captured (silenced) by %%capture.
# NOTE(review): later cells use `X`, `y` and `np` without defining them in this
# notebook — presumably they are provided by part03_preparation.ipynb; confirm.
%run part03_preparation.ipynb

In [33]:
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

import sys
import os
from datetime import datetime

import pickle

from mlflow.models.signature import infer_signature

#Visualization Tools
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns


# Machine Learning Models
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Model Selection Tools
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, cross_validate, StratifiedKFold
from sklearn.pipeline import Pipeline, make_pipeline
#import optuna

# Model Evaluation Tools
from sklearn.metrics import accuracy_score, precision_score, recall_score, log_loss, roc_auc_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, classification_report


# numpy is imported explicitly here: this notebook never imports it itself, so the
# seeding below would otherwise only work if another notebook leaked `np` into the
# kernel namespace.
import numpy as np

# Single seed shared by every stochastic step (data split, models).
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [3]:
# Project root. Can be overridden through the HOTEL_BOOKING_ROOT_DIR environment
# variable so the notebook is not tied to one machine; the default is the original
# hard-coded location.
ROOT_DIR = os.environ.get(
    'HOTEL_BOOKING_ROOT_DIR',
    '/home/alysson/projects/Hotel-Booking-Cancelations',
)

# Everything MLflow-related is derived from ROOT_DIR instead of repeating the
# absolute path, so changing the root moves all of these consistently.
MLFLOW_DIR = os.path.join(ROOT_DIR, 'mlflow')
MLFLOW_PLOTS_FOLDER = os.path.join(MLFLOW_DIR, 'plots')
MLFLOW_CONFUSION_MATRIX_PATH = os.path.join(MLFLOW_PLOTS_FOLDER, 'confusion_matrix.png')

In [4]:
# Ensure the MLflow folders exist; already-existing folders are left untouched.
for _mlflow_folder in (MLFLOW_DIR, MLFLOW_PLOTS_FOLDER):
    os.makedirs(_mlflow_folder, exist_ok=True)

In [5]:
# Register the project root on sys.path once (re-running the cell adds nothing).
if sys.path.count(ROOT_DIR) == 0:
    sys.path.append(ROOT_DIR)

# Work from the project root so relative paths used later resolve against it.
os.chdir(ROOT_DIR)

In [15]:
def get_metrics(y_true, y_pred, y_score=None):
    """Compute binary-classification metrics, each rounded to 3 decimals.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted hard class labels.
    y_score : array-like, optional
        Continuous scores for the positive class (for example
        ``model.predict_proba(X)[:, 1]``). When provided, ROC AUC is computed
        from these scores. When omitted, ``y_pred`` is used instead, which
        keeps the previous behavior but evaluates the ROC curve at a single
        threshold only.

    Returns
    -------
    dict
        Mapping with the keys 'Accuracy', 'Precision', 'Recall', 'F1' and 'Auc'.
    """
    # ROC AUC is a ranking metric: prefer real scores over thresholded labels.
    auc_input = y_pred if y_score is None else y_score

    # The AUC value is stored under a dict key rather than a local named `auc`,
    # so the `auc` function imported from sklearn.metrics is not shadowed.
    raw_scores = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1': f1_score(y_true, y_pred),
        'Auc': roc_auc_score(y_true, auc_input),
    }
    return {name: round(value, 3) for name, value in raw_scores.items()}

In [8]:
# Shuffled 80/20 train/validation split, stratified on `y` and seeded with
# RANDOM_SEED so the same rows land in each split on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
    shuffle=True,
    stratify=y,
)

## Model training and MLflow experiment tracking

In [6]:
# Runs are tracked in a SQLite store addressed by a relative path, so it is
# resolved against the current working directory.
_tracking_uri = "sqlite:///mlflow/mlflow.db"
_experiment_name = "Hotel Cancelations Booking"

mlflow.set_tracking_uri(_tracking_uri)
# Kept as the last expression so the cell displays the active Experiment.
mlflow.set_experiment(_experiment_name)

2023/06/12 13:01:45 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2023/06/12 13:01:45 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2023/06/12 13:01:45 INFO mlflow.tracking.fluent: Experiment with name 'Hotel Cancelations Booking' does not exist. Creating a new experiment.


<Experiment: artifact_location='/home/alysson/projects/Hotel-Booking-Cancelations/mlruns/1', creation_time=1686585705056, experiment_id='1', last_update_time=1686585705056, lifecycle_stage='active', name='Hotel Cancelations Booking', tags={}>

In [32]:
# Params, metrics and the model are logged explicitly below, so sklearn
# autologging is switched off.
mlflow.sklearn.autolog(disable=True)

run_name = 'lightgbm'

lgbm_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'num_leaves': 220,
    'max_depth': 7,
    'learning_rate': 0.075,
    'reg_alpha': 0.002,
    'reg_lambda': 0.025,
    'subsample_freq': 2,
    'min_child_samples': 1,
    'scale_pos_weight': 2.7,
}

with mlflow.start_run(run_name=run_name) as run:
    # NOTE(review): "Tag1"/"Tag2" looks like a placeholder tag — consider
    # replacing it with a meaningful key/value.
    mlflow.set_tag("Tag1", "Tag2")
    mlflow.log_params(lgbm_params)

    lgbm = LGBMClassifier(**lgbm_params)
    lgbm.fit(X_train, y_train)

    lgbm_preds = lgbm.predict(X_val)
    lgbm_scores = get_metrics(y_val, lgbm_preds)

    # ROC AUC is a ranking metric: compute it from the positive-class
    # probabilities rather than from thresholded labels, which would only
    # evaluate the ROC curve at a single operating point.
    lgbm_proba = lgbm.predict_proba(X_val)[:, 1]
    lgbm_scores['Auc'] = round(roc_auc_score(y_val, lgbm_proba), 3)

    # Log all metrics in a single call instead of one call per metric.
    mlflow.log_metrics(lgbm_scores)

    mlflow.lightgbm.log_model(lgbm, "lgbm")

    # The context object already carries the run id.
    run_id = run.info.run_id

    print(f'Run Name: {run_name}\nRun id: {run_id}\n')

Run Name: lightgbm
Run id: 401a11aa36104fe49d61248b3da678cc



In [39]:
# Params, metrics and the model are logged explicitly below, so sklearn
# autologging is switched off.
mlflow.sklearn.autolog(disable=True)

run_name = 'xgboost'

xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'booster': 'gbtree',
    'max_depth': 12,
    'learning_rate': 0.1,
    'n_estimators': 180,
    'alpha': 2.95e-08,
    'lambda': 4.00e-07,
    'min_child_weight': 5,
    'colsample_bytree': 0.5,
    'scale_pos_weight': 2.63,
    'seed': 42,
}

with mlflow.start_run(run_name=run_name) as run:
    mlflow.set_tag("XGBoost", "XGboost Tag2")
    mlflow.log_params(xgb_params)

    xgb = XGBClassifier(**xgb_params)
    xgb.fit(X_train, y_train)

    xgb_preds = xgb.predict(X_val)
    xgb_scores = get_metrics(y_val, xgb_preds)

    # ROC AUC is a ranking metric: compute it from the positive-class
    # probabilities rather than from thresholded labels, which would only
    # evaluate the ROC curve at a single operating point.
    xgb_proba = xgb.predict_proba(X_val)[:, 1]
    xgb_scores['Auc'] = round(roc_auc_score(y_val, xgb_proba), 3)

    # Log all metrics in a single call instead of one call per metric.
    mlflow.log_metrics(xgb_scores)

    mlflow.xgboost.log_model(xgb, "xgboost")

    # The context object already carries the run id.
    run_id = run.info.run_id

    print(f'Run Name: {run_name}\nRun id: {run_id}\n')

Run Name: xgboost
Run id: 60d96ab7386f4b34aa5029a246ff7557



In [40]:
# Look up the most recent finished run named 'lightgbm' in the active experiment
# instead of pinning a run id copied by hand: a pasted id goes stale as soon as
# the training cell is re-run or the notebook is used with another tracking store.
_lgbm_runs = mlflow.search_runs(
    filter_string="tags.mlflow.runName = 'lightgbm' and attributes.status = 'FINISHED'",
    order_by=["attributes.start_time DESC"],
    max_results=1,
)
if _lgbm_runs.empty:
    raise RuntimeError(
        "No finished MLflow run named 'lightgbm' was found; "
        "run the LightGBM training cell first."
    )

# "lgbm" is the artifact path the model was logged under via mlflow.lightgbm.log_model.
model_id = f"runs:/{_lgbm_runs.iloc[0]['run_id']}/lgbm"
loaded_ft = mlflow.lightgbm.load_model(model_id)