# Pipeline com Logistic Regression

Uma vez que o github tem uma limitação de tamanho de ficheiro e o modelo Random Forest ultrapassa o limite. Criei uma run com logistic regression. Esta run terá como base um pipeline em que o primeiro passo será normalizar os dados e o segundo aplicar o modelo. Com isto é possivel usar os dados sem os transformar antes de usar o modelo.

In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import  precision_recall_curve, roc_auc_score, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score,auc, roc_curve
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier


import mlflow
from mlflow.models import infer_signature

In [97]:
def total_cost(y_test, y_preds, threshold = 0.5):
    
    tn, fp, fn, tp = confusion_matrix(y_test == 1, y_preds > threshold).ravel()
    
    cost_fn = fn*3000
    cost_fp = fp*1000
    
    return cost_fn + cost_fp

In [98]:
ROOT_PATH = '../data/'
PATH = ROOT_PATH + 'lending_data.csv'
TARGET_COL = 'default.payment.next.month'

SEED = 42

In [99]:
df = pd.read_csv(PATH)

In [100]:
df = df.drop('ID', axis = 1)

### Correr localmente

In [101]:
from pathlib import Path

uri = "../mlruns"

Path(uri).mkdir(parents=True, exist_ok=True)

mlflow.set_tracking_uri(uri)



### Correr com Tracking Server

In [102]:
# from pathlib import Path

# uri = "http://0.0.0.0:5001"

# mlflow.set_tracking_uri(uri)

In [103]:
mlflow.set_experiment("Good Clients Prediction Experiment")

<Experiment: artifact_location='/Users/bernardomatos/Documents/Pos_Graduacao_Data_Science/Project_OML/OML_BM/notebooks/../mlruns/533740516389590842', creation_time=1745088104831, experiment_id='533740516389590842', last_update_time=1745088104831, lifecycle_stage='active', name='Good Clients Prediction Experiment', tags={}>

# Logistic Regression com Pipeline

In [104]:
train_set, test_set = train_test_split(df, test_size = 0.2, random_state = SEED)

In [105]:
X_train = train_set.drop(['default.payment.next.month'], axis = 'columns')
y_train = train_set['default.payment.next.month']

X_test = test_set.drop(['default.payment.next.month'], axis = 1)
y_test = test_set['default.payment.next.month']

In [106]:
scaler = MinMaxScaler()

features_names = X_train.columns

X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled = pd.DataFrame(X_train, columns = features_names)

X_test_scaled = scaler.transform(X_test)
X_test_scaled = pd.DataFrame(X_test, columns = features_names)

In [107]:
lr = LogisticRegression(max_iter = 500, solver = 'lbfgs', random_state = SEED, class_weight = 'balanced' )

parameters = {'C':[0.001, 0.01, 0.1, 1, 10, 100]}

clf_lr = GridSearchCV(lr, parameters, cv = 5).fit(X_train_scaled, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [108]:
y_preds = clf_lr.best_estimator_.predict(X_test)

## Pipeline

In [109]:
run = mlflow.start_run(run_name="Logistic Regression - Pipeline")
RUN_ID = run.info.run_uuid
RUN_ID

'e004c71400184833a039482d83692503'

In [110]:
# guardarmos o dataset de treino e de teste associado à run
train_dataset = mlflow.data.from_pandas(train_set, source=PATH, targets=TARGET_COL, name="Lending Dataset")
test_dataset = mlflow.data.from_pandas(test_set, source=PATH, targets=TARGET_COL, name="Lending Dataset")
mlflow.log_input(train_dataset, context="train")
mlflow.log_input(test_dataset, context="test")

signature = infer_signature(X_train, y_train)

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(


In [111]:
lr_pipeline = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        ("logistic_regression", clf_lr.best_estimator_)
])
lr_pipeline.fit(X_train, y_train)
mlflow.sklearn.log_model(lr_pipeline, artifact_path="lr_pipeline", registered_model_name="logistic_reg", signature=signature)
lr_pipeline

Registered model 'logistic_reg' already exists. Creating a new version of this model...
Created version '5' of model 'logistic_reg'.


In [112]:
params=lr_pipeline.get_params()

modified_params = {}
for k, v in params.items():
    new_key = k.replace("logistic_regression__", '')
    modified_params[new_key] = v

mlflow.log_params(modified_params)
modified_params

{'memory': None,
 'steps': [('scaler', MinMaxScaler()),
  ('logistic_regression',
   LogisticRegression(C=0.1, class_weight='balanced', max_iter=500,
                      random_state=42))],
 'transform_input': None,
 'verbose': 0,
 'scaler': MinMaxScaler(),
 'logistic_regression': LogisticRegression(C=0.1, class_weight='balanced', max_iter=500,
                    random_state=42),
 'scaler__clip': False,
 'scaler__copy': True,
 'scaler__feature_range': (0, 1),
 'C': 0.1,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 500,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 42,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'warm_start': False}

In [113]:
y_preds = lr_pipeline.predict(X_test)

In [114]:
mlflow.log_metric("accuracy", accuracy_score(y_test, y_preds))
mlflow.log_metric("recall", recall_score(y_test, y_preds))
mlflow.log_metric("precision", precision_score(y_test, y_preds))
mlflow.log_metric("f1", f1_score(y_test, y_preds))
mlflow.log_metric("roc_auc", roc_auc_score(y_test, y_preds))
mlflow.log_metric("total_cost", total_cost(y_test, y_preds, threshold = 0.5))

In [115]:
mlflow.end_run()

# Neural Network with Pipeline

In [116]:
mlp = MLPClassifier(solver = 'lbfgs',  random_state = SEED, max_iter = 1000 )

parameters = {'hidden_layer_sizes': [(20,), (20,10), (20, 10, 2)], 'learning_rate_init':[0.0001, 0.001, 0.01, 0.1]}

clf_mlp = GridSearchCV(mlp, parameters, cv = 5).fit(X_train_scaled, y_train)

ABNORMAL: .

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL: .

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL: .

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL: .

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
ABNORMAL: .

Increase the number of iterations (max_iter) or scale the data as shown in:
    https:/

In [117]:
run = mlflow.start_run(run_name="Neural Network - Pipeline")
RUN_ID = run.info.run_uuid
RUN_ID

'3ec1b31625cd4a90b2e72319b40a3b6d'

In [118]:
# guardarmos o dataset de treino e de teste associado à run
train_dataset = mlflow.data.from_pandas(train_set, source=PATH, targets=TARGET_COL, name="Lending Dataset")
test_dataset = mlflow.data.from_pandas(test_set, source=PATH, targets=TARGET_COL, name="Lending Dataset")
mlflow.log_input(train_dataset, context="train")
mlflow.log_input(test_dataset, context="test")

signature = infer_signature(X_train, y_train)

  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(


In [119]:
mlp_pipeline = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        ("neural_networks", clf_mlp.best_estimator_)
])
mlp_pipeline.fit(X_train, y_train)
mlflow.sklearn.log_model(mlp_pipeline, artifact_path="mlp_pipeline", registered_model_name="neural_networks", signature=signature)
mlp_pipeline

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
Registered model 'neural_networks' already exists. Creating a new version of this model...
Created version '3' of model 'neural_networks'.


In [120]:
params=mlp_pipeline.get_params()

modified_params = {}
for k, v in params.items():
    new_key = k.replace("neural_networks__", '')
    modified_params[new_key] = v

mlflow.log_params(modified_params)
modified_params

{'memory': None,
 'steps': [('scaler', MinMaxScaler()),
  ('neural_networks',
   MLPClassifier(hidden_layer_sizes=(20, 10), learning_rate_init=0.0001,
                 max_iter=1000, random_state=42, solver='lbfgs'))],
 'transform_input': None,
 'verbose': False,
 'scaler': MinMaxScaler(),
 'neural_networks': MLPClassifier(hidden_layer_sizes=(20, 10), learning_rate_init=0.0001,
               max_iter=1000, random_state=42, solver='lbfgs'),
 'scaler__clip': False,
 'scaler__copy': True,
 'scaler__feature_range': (0, 1),
 'activation': 'relu',
 'alpha': 0.0001,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (20, 10),
 'learning_rate': 'constant',
 'learning_rate_init': 0.0001,
 'max_fun': 15000,
 'max_iter': 1000,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': 42,
 'shuffle': True,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'warm

In [121]:
y_preds = mlp_pipeline.predict(X_test)

In [122]:
mlflow.log_metric("accuracy", accuracy_score(y_test, y_preds))
mlflow.log_metric("recall", recall_score(y_test, y_preds))
mlflow.log_metric("precision", precision_score(y_test, y_preds))
mlflow.log_metric("f1", f1_score(y_test, y_preds))
mlflow.log_metric("roc_auc", roc_auc_score(y_test, y_preds))
mlflow.log_metric("total_cost", total_cost(y_test, y_preds, threshold = 0.5))

In [123]:
mlflow.end_run()