Dataset: https://www.kaggle.com/uciml/default-of-credit-card-clients-dataset

Variáveis disponíveis:

    ID: ID of each client
    LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit)
    SEX: Gender (1=male, 2=female)
    EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
    MARRIAGE: Marital status (1=married, 2=single, 3=others)
    AGE: Age in years
    PAY_0: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, ... 8=payment delay for eight months, 9=payment delay for nine months and above)
    PAY_2: Repayment status in August, 2005 (scale same as above)
    PAY_3: Repayment status in July, 2005 (scale same as above)
    PAY_4: Repayment status in June, 2005 (scale same as above)
    PAY_5: Repayment status in May, 2005 (scale same as above)
    PAY_6: Repayment status in April, 2005 (scale same as above)
    BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)
    BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)
    BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)
    BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)
    BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)
    BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)
    PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)
    PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)
    PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)
    PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)
    PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)
    PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)
    default.payment.next.month: Default payment (1=yes, 0=no)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import  precision_recall_curve, roc_auc_score, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score,auc, roc_curve
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier


import mlflow
from mlflow.models import infer_signature

In [2]:
def total_cost(y_test, y_preds, threshold = 0.5, fn_cost = 3000, fp_cost = 1000):
    """Return the total misclassification cost of a set of predictions.

    A sample counts as an actual positive when its label equals 1, and as a
    predicted positive when its prediction is strictly greater than
    ``threshold``. Each false negative costs ``fn_cost`` and each false
    positive costs ``fp_cost``.

    Parameters
    ----------
    y_test : array-like
        True labels (1 marks the positive class).
    y_preds : array-like
        Predicted labels or scores, one per entry of ``y_test``.
    threshold : float, default 0.5
        Cut-off above which a prediction is treated as positive.
    fn_cost : number, default 3000
        Cost charged for every false negative.
    fp_cost : number, default 1000
        Cost charged for every false positive.

    Returns
    -------
    number
        ``fn * fn_cost + fp * fp_cost``.

    Raises
    ------
    ValueError
        If ``y_test`` and ``y_preds`` do not have the same number of entries.
    """
    # Work on plain arrays so a pandas index on one input cannot trigger
    # label-based alignment against the other.
    actual = np.asarray(y_test).ravel() == 1
    predicted = np.asarray(y_preds).ravel() > threshold

    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_test and y_preds must have the same length, "
            f"got {actual.shape[0]} and {predicted.shape[0]}"
        )

    # Counting the two error cells directly (instead of unpacking a confusion
    # matrix) also works when only one class is present in the data.
    fn = int(np.count_nonzero(actual & ~predicted))
    fp = int(np.count_nonzero(~actual & predicted))

    return fn * fn_cost + fp * fp_cost

In [3]:
# Input data location (relative to the notebook's working directory).
ROOT_PATH = '../data/'
PATH = f'{ROOT_PATH}lending_data.csv'

# Name of the label column in the dataset.
TARGET_COL = 'default.payment.next.month'

# Single seed reused for the train/test split and the model.
SEED = 42

In [4]:
# Load the raw dataset from the CSV file configured in PATH.
df = pd.read_csv(filepath_or_buffer=PATH)

In [5]:
# Remove the client identifier so it is not used as a feature.
# errors='ignore' keeps this cell idempotent: `df` is overwritten in place, so
# a second execution would otherwise fail with KeyError once 'ID' is gone.
df = df.drop(columns=['ID'], errors='ignore')

In [6]:
from pathlib import Path

# Alternative kept for reference: use a local file store instead of a server,
#   uri = "../../mlruns"
#   Path(uri).mkdir(parents=True, exist_ok=True)

# Point MLflow at the tracking server; `uri` is reused later when building
# an MlflowClient.
uri = "http://0.0.0.0:5001"
mlflow.set_tracking_uri(uri)

In [7]:
# Registered model to load and the "@..." part of its models:/ URI
# (presumably a registry alias rather than a numeric version - confirm).
model_name, model_version = "random_forest", "champion"

# Show the resulting model URI as the cell output.
"models:/{}@{}".format(model_name, model_version)

'models:/random_forest@champion'

In [8]:
# Fetch the registered model through the generic pyfunc flavor and display it.
model = mlflow.pyfunc.load_model(
    model_uri=f"models:/{model_name}@{model_version}",
)
model

OSError: No such file or directory: '/Users/bernardomatos/Documents/Pos_Graduacao_Data_Science/Project OML/OML_BM/notebooks/../../mlruns/471014617806567252/23502ff28afe4ff49f2e0fd550d42c40/artifacts/random_forest/.'

In [None]:
# Client bound to the same tracking server configured in `uri`.
client = mlflow.tracking.MlflowClient(tracking_uri=uri)

# Pull the metrics, params and tags recorded for a specific earlier run.
model_data = (
    client
    .get_run("900f5d3e2245408cac33c6410eaf4c5c")
    .data
    .to_dictionary()
)
model_data

{'metrics': {'accuracy': 0.814,
  'recall': 0.3373952779893374,
  'roc_auc': 0.6424548397627505,
  'total_cost': 2856000.0,
  'precision': 0.6429608127721336,
  'f1': 0.44255744255744256},
 'params': {'seed': '42',
  'bootstrap': 'True',
  'max_depth': 'None',
  'max_samples': 'None',
  'min_weight_fraction_leaf': '0.0',
  'max_leaf_nodes': 'None',
  'class_weight': 'balanced',
  'min_samples_leaf': '1',
  'random_state': '42',
  'min_impurity_decrease': '0.0',
  'verbose': '0',
  'n_estimators': '300',
  'criterion': 'gini',
  'oob_score': 'False',
  'ccp_alpha': '0.0',
  'warm_start': 'False',
  'max_features': 'sqrt',
  'monotonic_cst': 'None',
  'n_jobs': 'None',
  'min_samples_split': '2'},
 'tags': {'mlflow.user': 'bernardomatos',
  'mlflow.runName': 'Random Forest Run',
  'mlflow.source.name': '/opt/anaconda3/envs/OML_Latest/lib/python3.12/site-packages/ipykernel_launcher.py',
  'mlflow.log-model.history': '[{"run_id": "900f5d3e2245408cac33c6410eaf4c5c", "artifact_path": "random

In [None]:
# Copy the logged params without the 'seed' entry.
# Building a new dict (rather than `del` on an alias of model_data['params'])
# leaves `model_data` untouched and lets this cell be re-run without KeyError.
random_forest_params = {
    name: value
    for name, value in model_data['params'].items()
    if name != 'seed'
}
random_forest_params


{'bootstrap': 'True',
 'max_depth': 'None',
 'max_samples': 'None',
 'min_weight_fraction_leaf': '0.0',
 'max_leaf_nodes': 'None',
 'class_weight': 'balanced',
 'min_samples_leaf': '1',
 'random_state': '42',
 'min_impurity_decrease': '0.0',
 'verbose': '0',
 'n_estimators': '300',
 'criterion': 'gini',
 'oob_score': 'False',
 'ccp_alpha': '0.0',
 'warm_start': 'False',
 'max_features': 'sqrt',
 'monotonic_cst': 'None',
 'n_jobs': 'None',
 'min_samples_split': '2'}

In [9]:
# Hyper-parameters passed to RandomForestClassifier, written with native
# Python types; `random_state` is tied to the notebook-wide SEED.
random_forest_params = dict(
    bootstrap=True,
    max_depth=None,
    max_samples=None,
    min_weight_fraction_leaf=0,
    max_leaf_nodes=None,
    class_weight='balanced',
    min_samples_leaf=1,
    random_state=SEED,
    min_impurity_decrease=0,
    verbose=0,
    n_estimators=300,
    criterion='gini',
    oob_score=False,
    ccp_alpha=0,
    warm_start=False,
    max_features='sqrt',
    monotonic_cst=None,
    n_jobs=None,
    min_samples_split=2,
)

In [10]:
# Select the experiment that subsequent runs are recorded under.
mlflow.set_experiment(experiment_name="Good Clients Prediction Experiment")

<Experiment: artifact_location=('/Users/bernardomatos/Documents/Pos_Graduacao_Data_Science/Project '
 'OML/OML_BM/notebooks/../../mlruns/471014617806567252'), creation_time=1743545831221, experiment_id='471014617806567252', last_update_time=1743545831221, lifecycle_stage='active', name='Good Clients Prediction Experiment', tags={}>

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
train_set, test_set = train_test_split(
    df,
    test_size=0.2,
    random_state=SEED,
)

In [12]:
# Separate features from the label for both splits.
# The label column comes from TARGET_COL so it is defined in exactly one place,
# and both splits use the same `columns=` spelling of the drop.
X_train = train_set.drop(columns=[TARGET_COL])
y_train = train_set[TARGET_COL]

X_test = test_set.drop(columns=[TARGET_COL])
y_test = test_set[TARGET_COL]

In [13]:
# Open the run that the following cells log into; it stays active until
# mlflow.end_run() is called at the end of the notebook.
run = mlflow.start_run(run_name="Random Forest - pipeline")

# `run_uuid` is a deprecated alias in MLflow; `run_id` is the supported
# attribute and holds the same identifier.
RUN_ID = run.info.run_id
RUN_ID

'225f403e726842c4b7066c967ff29c8c'

In [14]:
# Store the training and test datasets associated with the run.
train_dataset = mlflow.data.from_pandas(
    train_set,
    source=PATH,
    targets=TARGET_COL,
    name="Lending Dataset",
)
mlflow.log_input(train_dataset, context="train")

test_dataset = mlflow.data.from_pandas(
    test_set,
    source=PATH,
    targets=TARGET_COL,
    name="Lending Dataset",
)
mlflow.log_input(test_dataset, context="test")

# Record the seed that was used as a run parameter.
mlflow.log_param("seed", SEED)

# Model signature inferred from the training features and training labels.
signature = infer_signature(model_input=X_train, model_output=y_train)

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(


In [15]:
# Min-max scaling followed by the random forest, trained as a single estimator
# so the scaler is stored together with the model.
rf_pipeline = Pipeline(
    steps=[
        ('scaler', MinMaxScaler()),
        ('rf', RandomForestClassifier(**random_forest_params)),
    ]
)
rf_pipeline.fit(X_train, y_train)

# Log the fitted pipeline to the active run and register it as a new version
# of the "random_forest" model.
mlflow.sklearn.log_model(
    sk_model=rf_pipeline,
    artifact_path="rf_pipeline",
    signature=signature,
    registered_model_name="random_forest",
)

Registered model 'random_forest' already exists. Creating a new version of this model...
2025/04/04 18:01:40 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random_forest, version 5
Created version '5' of model 'random_forest'.


<mlflow.models.model.ModelInfo at 0x177399430>

In [16]:
# Log every pipeline parameter, with the "rf__" step prefix removed from the
# keys so the forest's parameters appear under their bare names.
# NOTE(review): a stripped key can coincide with an unprefixed pipeline-level
# key; in that case the later entry wins - confirm this is intended.
params = rf_pipeline.get_params()
modified_params = {
    name.replace("rf__", ''): value
    for name, value in params.items()
}

mlflow.log_params(modified_params)

In [17]:
# Class predictions for the held-out test features.
y_preds = rf_pipeline.predict(X=X_test)

In [18]:
# ROC AUC ranks samples by a continuous score; feeding it hard 0/1 predictions
# collapses the curve to a single operating point and misreports the metric.
# Use the predicted probability of the positive class instead (column 1,
# assuming the labels are 0/1 as described in the data dictionary).
# NOTE: roc_auc values logged by earlier runs were computed from hard labels
# and are therefore not comparable with this one.
y_scores = rf_pipeline.predict_proba(X_test)[:, 1]

# Threshold-based metrics are computed on the hard class predictions.
mlflow.log_metric("accuracy", accuracy_score(y_test, y_preds))
mlflow.log_metric("recall", recall_score(y_test, y_preds))
mlflow.log_metric("precision", precision_score(y_test, y_preds))
mlflow.log_metric("f1", f1_score(y_test, y_preds))
mlflow.log_metric("roc_auc", roc_auc_score(y_test, y_scores))
# Business cost: false negatives and false positives weighted by total_cost.
mlflow.log_metric("total_cost", total_cost(y_test, y_preds, threshold = 0.5))

In [19]:
# Close the active run, marking it as successfully finished.
mlflow.end_run(status="FINISHED")

🏃 View run Random Forest - pipeline at: http://0.0.0.0:5001/#/experiments/471014617806567252/runs/225f403e726842c4b7066c967ff29c8c
🧪 View experiment at: http://0.0.0.0:5001/#/experiments/471014617806567252
