In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import pickle

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [3]:
df = pd .read_csv(r"./data/dataset_credito_transacional.csv", delimiter=",")

In [4]:
list_target =["personal_loan", 
              "mortgage",
              "auto_loan",
              "credit_card",
              "overdraft", 
              "payroll_loan",
              "student_loan",
              "working_capital_loan"]

df_targets = df[list_target]

list_primary_keys = ["person_id", "transaction_date", "name"]

df_primary_keys = df[list_primary_keys]

df_features = df.drop(columns=list_target + list_primary_keys)

In [5]:
dict_education_cat = {
    'Primary': 1,
    'Secondary': 2,
    'Bachelor': 3,
    'Postgraduate': 4
}

dict_salary_cat = {
    "<30k": 1,
    "30k-50k": 2,
    "50k-80k": 3,
    "80k-120k": 4,
    ">120k": 5
}

dict_investment_cat = {
    "<1k": 1,
    "1k-10k": 2,
    "10k-50k": 3,
    ">50k": 4
}

df_features['education'] = df_features['education'].map(dict_education_cat)
df_features['annual_salary'] = df_features['annual_salary'].map(dict_salary_cat)
df_features['invested_amount'] = df_features['invested_amount'].map(dict_investment_cat)




le = LabelEncoder()
df_features['sex'] = le.fit_transform(df_features['sex'])

for col in df_features.columns:
    if "has_" in col or "ever_loan" in col or "loan_paid" in col:
        df_features[col] = le.fit_transform(df_features[col])


df_final = pd.concat([df_primary_keys, df_features, df_targets], axis=1)


In [6]:
X = df_final.drop(columns=list_primary_keys + list_target)
y = df_final[list_target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# 1. Instanciar o Scaler
scaler = StandardScaler()

# 2. Ajustar nos dados de treino e transformar ambos
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
lr = LogisticRegression(C=10000,
                        solver= 'lbfgs',
                        max_iter=1000, 
                        class_weight='balanced')

model = MultiOutputClassifier(lr)

model.fit(X_train, y_train)

In [9]:
y_proba = model.predict_proba(X_test)

y_proba

[array([[0.60173512, 0.39826488],
        [0.56243365, 0.43756635],
        [0.56926825, 0.43073175],
        ...,
        [0.37611048, 0.62388952],
        [0.59857719, 0.40142281],
        [0.68425862, 0.31574138]]),
 array([[0.19062813, 0.80937187],
        [0.35327506, 0.64672494],
        [0.35956595, 0.64043405],
        ...,
        [0.69316831, 0.30683169],
        [0.44713458, 0.55286542],
        [0.86292104, 0.13707896]]),
 array([[0.56611139, 0.43388861],
        [0.31369745, 0.68630255],
        [0.31429314, 0.68570686],
        ...,
        [0.46913284, 0.53086716],
        [0.56428198, 0.43571802],
        [0.64130362, 0.35869638]]),
 array([[0.49665347, 0.50334653],
        [0.44888164, 0.55111836],
        [0.44774786, 0.55225214],
        ...,
        [0.57671115, 0.42328885],
        [0.50585637, 0.49414363],
        [0.6710303 , 0.3289697 ]]),
 array([[0.58359431, 0.41640569],
        [0.47611884, 0.52388116],
        [0.47380047, 0.52619953],
        ...,
        [

In [10]:
# 1. Obter as predições e probabilidades
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)

# 2. Lista para armazenar os resultados de cada target
performance_metrics = []

# 3. Iterar sobre cada um dos 8 targets
for i, target_name in enumerate(y_test.columns):
    # Extrair as predições para este target específico (coluna i)
    y_true_target = y_test[target_name]
    y_pred_target = y_pred[:, i]
    
    # Extrair a probabilidade da classe 1 (coluna 1 da matriz i da lista proba)
    y_proba_target = y_proba[i][:, 1]
    
    # Calcular métricas individuais
    metrics = {
        "Target": target_name,
        "Accuracy": accuracy_score(y_true_target, y_pred_target),
        "Precision": precision_score(y_true_target, y_pred_target, zero_division=0),
        "Recall": recall_score(y_true_target, y_pred_target, zero_division=0),
        "AUC": roc_auc_score(y_true_target, y_proba_target)
    }
    performance_metrics.append(metrics)

# 4. Criar o DataFrame final e exibir como tabela
df_metrics = pd.DataFrame(performance_metrics)

# Formatação para facilitar a leitura
print(df_metrics.to_string(index=False, formatters={
    'Accuracy': '{:,.4f}'.format,
    'Precision': '{:,.4f}'.format,
    'Recall': '{:,.4f}'.format,
    'AUC': '{:,.4f}'.format
}))

              Target Accuracy Precision Recall    AUC
       personal_loan   0.6572    0.7599 0.6442 0.7159
            mortgage   0.7391    0.8328 0.7542 0.8061
           auto_loan   0.6613    0.6387 0.6679 0.7151
         credit_card   0.5767    0.6429 0.5864 0.6096
           overdraft   0.5685    0.3143 0.5728 0.5957
        payroll_loan   0.6809    0.4369 0.6611 0.6981
        student_loan   0.5743    0.1050 0.5884 0.6064
working_capital_loan   0.6981    0.5589 0.7223 0.7730


In [None]:
# Salvar e baixar.
with open (r"./models/trained_model-0.1.0.pkl", "wb") as f:
    pickle.dump(model, f)

In [12]:
list_target =["personal_loan", 
            "mortgage",
            "auto_loan",
            "credit_card",
            "overdraft", 
            "payroll_loan",
            "student_loan",
            "working_capital_loan"]

dict_probas = dict()

for target, proba in zip(list_target, y_proba):
    dict_probas[target] = proba[0][1]

print(dict_probas)

{'personal_loan': np.float64(0.3982648778778678), 'mortgage': np.float64(0.8093718677506146), 'auto_loan': np.float64(0.43388861464760176), 'credit_card': np.float64(0.5033465263914724), 'overdraft': np.float64(0.4164056941722648), 'payroll_loan': np.float64(0.29028873605597794), 'student_loan': np.float64(0.5281230089712263), 'working_capital_loan': np.float64(0.5408988301448563)}
