# Scikit-learn

Scikit-learn es una biblioteca esencial para el aprendizaje automático en Python, construida sobre NumPy. Ofrece una interfaz simple y eficiente para realizar tareas como preprocesamiento de datos, clasificación, regresión, agrupamiento y evaluación de modelos. Es ampliamente utilizada tanto en investigación como en producción y se integra bien con otras herramientas de Python.

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import pathlib
import kaggle
import numpy as np
import pandas as pd
import joblib

from databank_mlops.preprocessors.field_number import (
    mb_simple_imputer,
    mb_standard_scaler,
)
from databank_mlops.preprocessors.field_text_to_number import mb_clean_text_number
from databank_mlops.preprocessors.field_text import mb_clean_text, mb_woe_encoder

## Descarga del dataset

In [None]:
# Kaggle dataset identifier passed to the Kaggle API calls in the next cell.
URL_DATASET = r"parisrohan/credit-score-classification"
# Directory (relative to the working directory) where the dataset files are
# downloaded and later read from.
PATH_DATA = pathlib.Path("../data/")

In [None]:
# Download the dataset into PATH_DATA, asking the Kaggle client to unzip it.
kaggle.api.dataset_download_files(URL_DATASET, path=PATH_DATA, unzip=True)

# Collect the file names Kaggle reports for this dataset; the loading cell
# picks the CSV to read by its position in this list.
filenames = [
    remote_file.name
    for remote_file in kaggle.api.dataset_list_files(URL_DATASET).files
]
print(filenames)

## Carga de datos

In [None]:
# Read the second file of the Kaggle listing.
# NOTE(review): the index relies on the order in which Kaggle lists the files;
# confirm it still points at the CSV that contains the CREDIT_SCORE column.
csv_path = PATH_DATA / filenames[1]
data = pd.read_csv(csv_path)

# Normalise the headers: trim surrounding whitespace, then upper-case them, so
# every later cell can refer to columns by their upper-case names.
data.columns = data.columns.str.strip().str.upper()

# Preview the first two rows as the cell output.
data.head(2)

In [None]:
# Print the distinct raw values of OCCUPATION and of the columns that the
# preprocessing step later sends through the text-to-number cleaning branch.
columns_to_inspect = (
    "OCCUPATION",
    "AGE",
    "ANNUAL_INCOME",
    "NUM_OF_LOAN",
    "NUM_OF_DELAYED_PAYMENT",
    "CHANGED_CREDIT_LIMIT",
    "OUTSTANDING_DEBT",
    "AMOUNT_INVESTED_MONTHLY",
    "MONTHLY_BALANCE",
)
for column_name in columns_to_inspect:
    print(data[column_name].unique())

In [None]:
# Summary statistics for the columns that the preprocessing step treats as
# plain numeric features (imputation + scaling only).
described_columns = [
    "MONTHLY_INHAND_SALARY",
    "NUM_BANK_ACCOUNTS",
    "NUM_CREDIT_CARD",
    "INTEREST_RATE",
    "DELAY_FROM_DUE_DATE",
    "NUM_CREDIT_INQUIRIES",
    "CREDIT_UTILIZATION_RATIO",
    "TOTAL_EMI_PER_MONTH",
]
data[described_columns].describe()

In [None]:
# Relative frequency of each CREDIT_SCORE value; missing values are counted too.
data[["CREDIT_SCORE"]].value_counts(normalize=True, dropna=False)

In [None]:
# Model inputs, in the column order used for the rest of the notebook.
feature_columns = [
    "OCCUPATION",
    "AGE",
    "ANNUAL_INCOME",
    "NUM_OF_LOAN",
    "NUM_OF_DELAYED_PAYMENT",
    "CHANGED_CREDIT_LIMIT",
    "OUTSTANDING_DEBT",
    "AMOUNT_INVESTED_MONTHLY",
    "MONTHLY_BALANCE",
    "MONTHLY_INHAND_SALARY",
    "NUM_BANK_ACCOUNTS",
    "NUM_CREDIT_CARD",
    "INTEREST_RATE",
    "DELAY_FROM_DUE_DATE",
    "NUM_CREDIT_INQUIRIES",
    "CREDIT_UTILIZATION_RATIO",
    "TOTAL_EMI_PER_MONTH",
]

# Keep only the features plus the target (last column); everything else in the
# raw file is dropped.
data = data.loc[:, feature_columns + ["CREDIT_SCORE"]]

## Preparación de train y test

In [None]:
# Fixed seed: applied to NumPy's global random state here and reused as
# `random_state` for the train/test split.
SEED = 2025
np.random.seed(SEED)

In [None]:
# Encode the CREDIT_SCORE labels as integers and print both the decoded labels
# and the class list so the label -> integer mapping can be checked by eye.
encoder_target = LabelEncoder()
y = encoder_target.fit_transform(data["CREDIT_SCORE"])
print(encoder_target.inverse_transform(y))
print(encoder_target.classes_)

# Collapse to a binary target: 1 where the encoded label equals 1, else 0.
# NOTE(review): which class sits at index 1 is given by encoder_target.classes_
# (printed above) — confirm it is the intended positive class.
y = pd.DataFrame({"CREDIT_SCORE": np.where(y == 1, 1, 0)})

# 80/20 split, shuffled and stratified on the binary target so both partitions
# keep the same class proportions; SEED makes the split reproducible.
features = data.drop(columns=["CREDIT_SCORE"])
target = y.values.ravel()
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    shuffle=True,
    stratify=target,
    random_state=SEED,
)

## Generación del Pipeline

In [None]:
# Column groups, one per preprocessing branch.
cols_cat = ["OCCUPATION"]
cols_cat_num = [
    "AGE",
    "ANNUAL_INCOME",
    "NUM_OF_LOAN",
    "NUM_OF_DELAYED_PAYMENT",
    "CHANGED_CREDIT_LIMIT",
    "OUTSTANDING_DEBT",
    "AMOUNT_INVESTED_MONTHLY",
    "MONTHLY_BALANCE",
]
cols_num = [
    "MONTHLY_INHAND_SALARY",
    "NUM_BANK_ACCOUNTS",
    "NUM_CREDIT_CARD",
    "INTEREST_RATE",
    "DELAY_FROM_DUE_DATE",
    "NUM_CREDIT_INQUIRIES",
    "CREDIT_UTILIZATION_RATIO",
    "TOTAL_EMI_PER_MONTH",
]

# Text branch: text cleaning followed by the project's WoE encoder.
# NOTE(review): "MISSING" is presumably the placeholder used for empty values —
# confirm against mb_clean_text.
pl_cat = Pipeline(
    steps=[
        ("clean", mb_clean_text("MISSING")),
        ("woe", mb_woe_encoder()),
    ]
)

# Numbers-stored-as-text branch: clean to numeric, impute with the median,
# then scale.
pl_cat_num = Pipeline(
    steps=[
        ("clean", mb_clean_text_number()),
        ("mb_simple_imputer", mb_simple_imputer(strategy="median")),
        ("scaler", mb_standard_scaler()),
    ]
)

# Numeric branch: median imputation followed by scaling.
pl_num = Pipeline(
    steps=[
        ("mb_simple_imputer", mb_simple_imputer(strategy="median")),
        ("scaler", mb_standard_scaler()),
    ]
)

# Route each column group to its branch; the branch outputs are concatenated
# in the order listed here.
pl_preprocess = ColumnTransformer(
    transformers=[
        ("prep_cat", pl_cat, cols_cat),
        ("prep_cat_num", pl_cat_num, cols_cat_num),
        ("prep_num", pl_num, cols_num),
    ],
    force_int_remainder_cols=False,
)

In [None]:
# Show the assembled preprocessing transformer as the cell output.
pl_preprocess

In [None]:
# Fit the preprocessing on the training split only (the target is passed
# because the pipeline includes a WoE encoding step) and keep the transformed
# training data for inspection.
data_clean = pl_preprocess.fit_transform(x_train, y_train)
data_clean

In [None]:
# Output column names produced by the fitted preprocessing transformer.
pl_preprocess.get_feature_names_out()

In [None]:
# Inspect the `dict_woe` attribute of the fitted "woe" step in the "prep_cat"
# branch (looked up by name rather than by position).
# NOTE(review): presumably the learned category -> WoE mapping — confirm
# against mb_woe_encoder.
pl_preprocess.named_transformers_["prep_cat"].named_steps["woe"].dict_woe

## Pipeline con preprocesamiento y modelo

In [None]:
# End-to-end model: the preprocessing transformer followed by a logistic
# regression with scikit-learn's default settings.
pipeline_model = Pipeline(
    steps=[
        ("pl_prep", pl_preprocess),
        ("lineal_model", LogisticRegression()),
    ]
)

In [None]:
# Fit preprocessing and classifier together on the training split.
pipeline_model.fit(x_train, y_train)

In [None]:
# Persist the fitted pipeline (preprocessing + classifier) to disk.
model_path = r"../models/logistic_regresion_v1.joblib"
# joblib.dump does not create missing directories, so make sure the target
# folder exists; otherwise a fresh checkout fails with FileNotFoundError.
pathlib.Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline_model, model_path)

In [None]:
# Reload the pipeline from the file written by the previous cell, so the
# results below are computed with the persisted artifact.
# NOTE(review): joblib files are pickle-based — only load artifacts from a
# trusted source.
pipeline_model = joblib.load(r"../models/logistic_regresion_v1.joblib")

In [None]:
# Input columns of the training split, i.e. the columns the pipeline was fitted on.
x_train.columns

## Resultados

In [None]:
# Report the Gini coefficient (2 * ROC AUC - 1) on both partitions, scoring
# with the predicted probability of class 1 (second column of predict_proba).
# The label names the model actually fitted: the estimator is
# LogisticRegression, not a linear regression.
for split_name, x_split, y_split in (
    ("Train", x_train, y_train),
    ("Test", x_test, y_test),
):
    proba_positive = pipeline_model.predict_proba(x_split)[:, 1]
    gini = 2 * roc_auc_score(y_split, proba_positive) - 1
    print(f"Gini con Regresion Logistica en {split_name}: {gini: .2%}")