# Random Forest Without Scaling
Uma vez que o modelo Random Forest foi o que obteve os melhores resultados, irei criar uma nova Run, treinando o modelo com os dados sem normalização. Deste modo, os dados não precisam de ser normalizados antes de serem utilizados no modelo.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import  precision_recall_curve, roc_auc_score, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score,auc, roc_curve
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

import mlflow
from mlflow.models import infer_signature

In [2]:
def total_cost(y_test, y_preds, threshold=0.5):
    """Return the total misclassification cost of a set of predictions.

    A false negative costs 3000 and a false positive costs 1000; correct
    predictions cost nothing.

    Args:
        y_test: Array-like of true labels. Entries equal to 1 are treated as
            the positive class, everything else as negative.
        y_preds: Array-like of predicted labels or scores. Entries strictly
            greater than ``threshold`` are treated as positive predictions.
        threshold: Cut-off applied to ``y_preds``.

    Returns:
        The summed cost of all false negatives and false positives, as an int.

    Raises:
        ValueError: If ``y_test`` and ``y_preds`` do not have the same shape.
    """
    actual = np.asarray(y_test) == 1
    predicted = np.asarray(y_preds) > threshold

    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_test and y_preds must have the same shape, "
            f"got {actual.shape} and {predicted.shape}"
        )

    # Count the two error cells directly rather than unpacking a full
    # confusion matrix: when only one class is present the matrix is 1x1 and
    # a four-way unpack fails.
    false_negatives = int(np.count_nonzero(actual & ~predicted))
    false_positives = int(np.count_nonzero(~actual & predicted))

    return false_negatives * 3000 + false_positives * 1000

In [3]:
# Location of the input CSV and the name of its label column.
ROOT_PATH = '../data/'
PATH = f'{ROOT_PATH}lending_data.csv'
TARGET_COL = 'default.payment.next.month'

# Single seed reused by the train/test split and the model.
SEED = 42

In [4]:
# Load the raw dataset from the CSV file at PATH.
df = pd.read_csv(filepath_or_buffer=PATH)

In [5]:
# Remove the 'ID' column so it is not passed to the model as a feature.
df = df.drop(columns=['ID'])

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
train_set, test_set = train_test_split(
    df,
    test_size=0.2,
    random_state=SEED,
)

In [7]:
# Split each partition into features (every column except the target) and
# labels. The target name comes from TARGET_COL so it is defined in one place.
X_train = train_set.drop(columns=TARGET_COL)
y_train = train_set[TARGET_COL]

X_test = test_set.drop(columns=TARGET_COL)
y_test = test_set[TARGET_COL]

# Random Forest Without Pipeline Run

In [8]:
from pathlib import Path

# Local directory used as the MLflow tracking location; create it (and any
# missing parents) before pointing MLflow at it.
uri = "../../mlruns"
tracking_dir = Path(uri)
tracking_dir.mkdir(parents=True, exist_ok=True)

mlflow.set_tracking_uri(uri)

In [9]:
# Select the experiment that the following run is recorded under.
mlflow.set_experiment(experiment_name="Good Clients Prediction Experiment")

<Experiment: artifact_location='/Users/bernardomatos/Documents/Pos_Graduacao_Data_Science/Project_OML/OML_BM/notebooks/../../mlruns/151652261966181122', creation_time=1744968126687, experiment_id='151652261966181122', last_update_time=1744968126687, lifecycle_stage='active', name='Good Clients Prediction Experiment', tags={}>

In [10]:
# Start the MLflow run; it stays active until mlflow.end_run() is called.
run = mlflow.start_run(run_name="Random Forest - without scaling")
# `run_uuid` is deprecated in MLflow; `run_id` exposes the same identifier.
RUN_ID = run.info.run_id
RUN_ID

'e3d8ee14a1cb4c84bbd9654e253854d1'

In [11]:
# Attach the training and test datasets to the active run.
dataset_options = dict(source=PATH, targets=TARGET_COL, name="Lending Dataset")
train_dataset = mlflow.data.from_pandas(train_set, **dataset_options)
test_dataset = mlflow.data.from_pandas(test_set, **dataset_options)
mlflow.log_input(train_dataset, context="train")
mlflow.log_input(test_dataset, context="test")

# Record the seed that was used as a run parameter.
mlflow.log_param("seed", SEED)

# Infer the model signature from the training features and labels.
signature = infer_signature(X_train, y_train)

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(


In [12]:
# Base estimator for the search. It is intentionally left unfitted:
# GridSearchCV clones the estimator for every candidate and refits the best
# one, so fitting it here first would only repeat work.
rf = RandomForestClassifier(random_state=SEED, class_weight='balanced')

# Candidate forest sizes evaluated by the grid search.
parameters = {'n_estimators': [10, 100, 300, 1000]}

# 5-fold cross-validated search; the best configuration is then available
# as clf_rf.best_estimator_.
clf_rf = GridSearchCV(rf, parameters, cv=5).fit(X_train, y_train)

In [13]:
# Log the best estimator found by the grid search and register it under the
# name "random_forest".
best_rf = clf_rf.best_estimator_
mlflow.sklearn.log_model(
    best_rf,
    artifact_path="random_forest",
    registered_model_name="random_forest",
    signature=signature,
)

# Log every hyperparameter of the selected estimator with the run.
params = best_rf.get_params()
mlflow.log_params(params)
params

Registered model 'random_forest' already exists. Creating a new version of this model...
Created version '2' of model 'random_forest'.


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 300,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [14]:
# Hard class predictions of the selected estimator on the held-out test set.
best_model = clf_rf.best_estimator_
y_preds = best_model.predict(X_test)

In [15]:
# Threshold-based metrics are computed from the hard class predictions.
mlflow.log_metric("accuracy", accuracy_score(y_test, y_preds))
mlflow.log_metric("recall", recall_score(y_test, y_preds))
mlflow.log_metric("precision", precision_score(y_test, y_preds))
mlflow.log_metric("f1", f1_score(y_test, y_preds))

# ROC AUC measures ranking quality, so it needs scores rather than 0/1 labels;
# with hard labels the curve has a single operating point.
# NOTE(review): column 1 is assumed to be the positive class (label 1) --
# confirm the target is binary 0/1.
y_scores = clf_rf.best_estimator_.predict_proba(X_test)[:, 1]
mlflow.log_metric("roc_auc", roc_auc_score(y_test, y_scores))

mlflow.log_metric("total_cost", total_cost(y_test, y_preds, threshold=0.5))

In [16]:
# Close the active MLflow run and mark it as finished.
mlflow.end_run(status="FINISHED")