# PyTorch & TensorFlow

**TensorFlow** es una biblioteca de código abierto para aprendizaje automático aplicable a un amplio rango de tareas, desarrollada por Google para satisfacer sus necesidades de sistemas capaces de construir y entrenar redes neuronales que detectan y descifran patrones y correlaciones, de forma análoga al aprendizaje y razonamiento humanos.

**PyTorch** es un marco de deep learning de código abierto basado en software que se utiliza para crear redes neuronales, combinando la biblioteca de machine learning (ML) de Torch con una API de alto nivel basada en Python. Su flexibilidad y facilidad de uso, entre otros beneficios, lo han convertido en el marco de ML líder para las comunidades académicas y de investigación.

In [None]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.utils.validation import check_is_fitted
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pathlib
import kaggle
import numpy as np
import random

## Transformación personalizada

In [None]:
class mb_woe_encoder(BaseEstimator, TransformerMixin):
    """
    Weight-of-evidence (WoE) encoder for categorical columns, usable as a
    step of a scikit-learn pipeline.

    For every column seen in ``fit`` each category is replaced by
    ``log((events + 1) / (non_events + 1))`` computed from raw counts of a
    binary target. Categories with fewer than ``pc_min_other * len(y)``
    records are pooled into a single "OTHERS" value, which is also used at
    transform time for categories never seen during ``fit``.

    Parameters
    ----------
    pc_min_other : float, default=0.05
        Minimum share of the training records a category needs in order to
        get its own WoE value.
    keep_na : bool, default=True
        When True missing values stay ``np.nan`` after ``transform``;
        otherwise they receive the "OTHERS" value.

    Attributes
    ----------
    dict_woe : dict
        ``{column: {"WOES": {category: woe}, "OTHERS": woe}}``.
    feature_names_in_ : ndarray
        Column names seen during ``fit``.
    """

    def __init__(self, pc_min_other=0.05, keep_na=True):
        """Store the hyper-parameters; nothing is learned here."""
        self.pc_min_other = pc_min_other
        self.keep_na = keep_na
        # Kept for backward compatibility with code reading it before fit.
        # ``feature_names_in_`` is deliberately NOT created here so that
        # ``check_is_fitted`` can tell fitted and unfitted encoders apart.
        self.dict_woe = {}

    def fit(self, X, y=None):
        """
        Learn the per-category WoE values of every column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Categorical columns to encode.
        y : array-like of shape (n_samples,)
            Binary target where 1 marks an event.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` is missing or its length differs from ``X``.
        """
        if y is None:
            raise ValueError("mb_woe_encoder.fit requires a binary target `y`.")
        if isinstance(y, (pd.DataFrame, pd.Series)):
            y = y.to_numpy()
        y = np.asarray(y).ravel()
        if len(y) != len(X):
            raise ValueError(
                f"X and y have inconsistent lengths: {len(X)} != {len(y)}."
            )

        self.feature_names_in_ = X.columns.to_numpy()
        # Start from scratch so a refit does not keep columns of a past fit.
        self.dict_woe = {}

        # Minimum number of records for a category not to be a minority one.
        lim_value_other = self.pc_min_other * len(y)

        for c in self.feature_names_in_:
            # Records with a missing category take no part in the counts.
            known = X[c].notna().to_numpy()
            df_aux = pd.DataFrame(
                {"category": X.loc[known, c].to_numpy(), "target": y[known]}
            )

            stats = df_aux.groupby("category")["target"].agg(["sum", "count"])
            event_x_cat = stats["sum"]
            non_event_x_cat = stats["count"] - event_x_cat

            # Sign is reversed with respect to the textbook WoE; the +1 terms
            # avoid log(0) and division by zero.
            woe = np.log((event_x_cat + 1) / (non_event_x_cat + 1))

            is_minority = stats["count"] < lim_value_other

            # The pooled value is computed over the minority categories only.
            event_total_other = event_x_cat[is_minority].sum()
            non_event_other = non_event_x_cat[is_minority].sum()

            self.dict_woe[c] = {
                "WOES": woe[~is_minority].to_dict(),
                "OTHERS": np.log(
                    (event_total_other + 1) / (non_event_other + 1)
                ),
            }

        return self

    def transform(self, X):
        """
        Replace every category by its learned WoE value.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose columns were all seen during ``fit``.

        Returns
        -------
        pandas.DataFrame
            A new float64 frame; the input is left untouched.
        """
        check_is_fitted(self)
        X = X.copy()
        for c in X.columns:
            woes = self.dict_woe[c]["WOES"]
            other = self.dict_woe[c]["OTHERS"]
            X[c] = (
                X[c]
                .map(lambda k: self._get_value_from_dict(k, woes, other))
                .astype(np.float64)
            )

        return X

    def _get_value_from_dict(self, key_dict, dict_values, value_other):
        """
        Map one category to its WoE value.

        Missing keys (None / NaN) become ``np.nan`` when ``keep_na`` is set.
        Any other key absent from ``dict_values`` gets ``value_other``.

        Parameters
        ----------
        key_dict : hashable
            Category to translate.
        dict_values : dict
            Category -> WoE mapping.
        value_other : float
            Fallback for keys not present in ``dict_values``.
        """
        if self.keep_na and pd.isna(key_dict):
            return np.nan
        return dict_values.get(key_dict, value_other)

    def get_feature_names_out(self, input_features=None):
        """Return the output column names (same as the input ones)."""
        check_is_fitted(self)
        if input_features is None:
            input_features = self.feature_names_in_
        return np.array(input_features, dtype=object)


class mb_clean_text(BaseEstimator, TransformerMixin):
    """
    Clean free-text categorical columns.

    Underscores are removed and surrounding whitespace is stripped. Values
    that end up empty are replaced by ``replace_value``; when
    ``replace_value`` is not None, missing values are replaced by it too.

    Parameters
    ----------
    replace_value : object, default=None
        Substitute for empty (and, if not None, missing) values.
    """

    def __init__(self, replace_value=None):
        """Store the replacement value."""
        self.replace_value = replace_value

    def fit(self, X, y=None):
        """Record the column names of ``X``; nothing else is learned."""
        self.feature_names_in_ = X.columns.to_numpy()

        return self

    def transform(self, X):
        """
        Return a cleaned copy of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Columns must support the ``.str`` accessor.

        Returns
        -------
        pandas.DataFrame
            A new frame; the input is left untouched.
        """
        # Work on a copy: transformers must not modify the caller's data.
        X = X.copy()
        for c in X.columns:
            # regex=False keeps the replacement literal whatever the pandas
            # version's default for ``regex`` is.
            X[c] = X[c].str.replace("_", "", regex=False).str.strip()
            X.loc[X[c] == "", c] = self.replace_value

            if self.replace_value is not None:
                X.loc[X[c].isna(), c] = self.replace_value

        return X

    def get_feature_names_out(self, input_features=None):
        """Return the output column names (same as the input ones)."""
        check_is_fitted(self)
        if input_features is None:
            input_features = self.feature_names_in_
        return np.array(input_features, dtype=object)

In [None]:
class mb_clean_text_number(BaseEstimator, TransformerMixin):
    """
    Convert numeric columns stored as dirty text into real numbers.

    Leading and trailing underscores, spaces, commas and double quotes are
    stripped and the remainder is parsed with ``pd.to_numeric``; anything
    that cannot be parsed becomes NaN. The transformer has no
    hyper-parameters, so no ``__init__`` is defined.
    """

    def fit(self, X, y=None):
        """Record the column names of ``X``; nothing else is learned."""
        self.feature_names_in_ = X.columns.to_numpy()

        return self

    def transform(self, X):
        """
        Return a numeric copy of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame

        Returns
        -------
        pandas.DataFrame
            A new frame; the input is left untouched.
        """
        # Work on a copy: transformers must not modify the caller's data.
        X = X.copy()
        for c in X.columns:
            stripped = X[c].astype(str).str.strip('_ ,"')
            X[c] = pd.to_numeric(stripped, errors="coerce")
        return X

    def get_feature_names_out(self, input_features=None):
        """Return the output column names (same as the input ones)."""
        check_is_fitted(self)
        if input_features is None:
            input_features = self.feature_names_in_
        return np.array(input_features, dtype=object)

In [None]:
class mb_standard_scaler(StandardScaler):
    """
    ``StandardScaler`` whose ``get_feature_names_out`` echoes the names it
    is given.

    The constructor is inherited on purpose. A ``__init__(self, **kwargs)``
    override hides every hyper-parameter from ``get_params``, so ``clone``
    (used by ``ColumnTransformer``) would silently rebuild the scaler with
    default settings. ``fit`` is inherited too: the base class already
    records ``feature_names_in_`` when fitted on a DataFrame.
    """

    def get_feature_names_out(self, input_features=None):
        """
        Return the output feature names.

        Parameters
        ----------
        input_features : array-like of str, default=None
            Names to echo back. When None, the names recorded during
            ``fit`` are used; if none were recorded (array input) the base
            class generates them.
        """
        check_is_fitted(self)
        if input_features is None:
            input_features = getattr(self, "feature_names_in_", None)
        if input_features is None:
            return super().get_feature_names_out()
        return np.array(input_features, dtype=object)


class mb_simple_imputer(SimpleImputer):
    """
    ``SimpleImputer`` whose ``get_feature_names_out`` echoes the names it
    is given.

    The constructor is inherited on purpose. A ``__init__(self, **kwargs)``
    override hides every hyper-parameter from ``get_params``, so ``clone``
    (used by ``ColumnTransformer``) would silently drop settings such as
    ``strategy="median"`` and fit with the defaults. ``fit`` is inherited
    too: the base class already records ``feature_names_in_`` when fitted
    on a DataFrame.
    """

    def get_feature_names_out(self, input_features=None):
        """
        Return the output feature names.

        Parameters
        ----------
        input_features : array-like of str, default=None
            Names to echo back. When None, the names recorded during
            ``fit`` are used; if none were recorded (array input) the base
            class generates them.

        Notes
        -----
        NOTE(review): names are echoed unchanged, so this presumably assumes
        the imputer neither drops columns nor adds indicator columns —
        confirm against the configuration in use.
        """
        check_is_fitted(self)
        if input_features is None:
            input_features = getattr(self, "feature_names_in_", None)
        if input_features is None:
            return super().get_feature_names_out()
        return np.array(input_features, dtype=object)

## Descarga del dataframe

In [None]:
# Kaggle dataset identifier and the local folder the files are stored in.
URL_DATASET = "parisrohan/credit-score-classification"
PATH_DATA = pathlib.Path("..") / "data"

In [None]:
# Download and unzip the dataset into PATH_DATA, then collect the names of
# the files the dataset publishes.
kaggle.api.dataset_download_files(URL_DATASET, path=PATH_DATA, unzip=True)
dataset_listing = kaggle.api.dataset_list_files(URL_DATASET)
filenames = [entry.name for entry in dataset_listing.files]
print(filenames)

## Carga de datos

In [None]:
# Read the second file of the listing.
# NOTE(review): relies on the order of `filenames`; presumably index 1 is the
# labelled training CSV — confirm against the printed listing.
csv_path = PATH_DATA / filenames[1]
data = pd.read_csv(csv_path, low_memory=False)
# Normalise the header: trim surrounding whitespace and upper-case each name.
data.columns = [name.strip().upper() for name in data.columns]
data.head(2)

In [None]:
# Print the distinct raw values of the columns under inspection.
columns_to_inspect = (
    "OCCUPATION",
    "AGE",
    "ANNUAL_INCOME",
    "NUM_OF_LOAN",
    "NUM_OF_DELAYED_PAYMENT",
    "CHANGED_CREDIT_LIMIT",
    "OUTSTANDING_DEBT",
    "AMOUNT_INVESTED_MONTHLY",
    "MONTHLY_BALANCE",
)
for column in columns_to_inspect:
    print(data[column].unique())

In [None]:
# Summary statistics of the selected columns.
columns_to_describe = [
    "MONTHLY_INHAND_SALARY",
    "NUM_BANK_ACCOUNTS",
    "NUM_CREDIT_CARD",
    "INTEREST_RATE",
    "DELAY_FROM_DUE_DATE",
    "NUM_CREDIT_INQUIRIES",
    "CREDIT_UTILIZATION_RATIO",
    "TOTAL_EMI_PER_MONTH",
]
data[columns_to_describe].describe()

In [None]:
# Relative frequency of each target value, missing values included.
data[["CREDIT_SCORE"]].value_counts(normalize=True, dropna=False)

In [None]:
# Keep only the model features plus the target column.
selected_columns = [
    "OCCUPATION",
    "AGE",
    "ANNUAL_INCOME",
    "NUM_OF_LOAN",
    "NUM_OF_DELAYED_PAYMENT",
    "CHANGED_CREDIT_LIMIT",
    "OUTSTANDING_DEBT",
    "AMOUNT_INVESTED_MONTHLY",
    "MONTHLY_BALANCE",
    "MONTHLY_INHAND_SALARY",
    "NUM_BANK_ACCOUNTS",
    "NUM_CREDIT_CARD",
    "INTEREST_RATE",
    "DELAY_FROM_DUE_DATE",
    "NUM_CREDIT_INQUIRIES",
    "CREDIT_UTILIZATION_RATIO",
    "TOTAL_EMI_PER_MONTH",
    "CREDIT_SCORE",
]
data = data[selected_columns]

## Preparación train y test

In [None]:
# Seed shared by the random number generators used in this notebook.
SEED = 2025
np.random.seed(seed=SEED)

In [None]:
# Encode the target labels as integer codes and show the mapping.
encoder_target = LabelEncoder()
y = encoder_target.fit_transform(data["CREDIT_SCORE"])
print(y)
print(encoder_target.inverse_transform(y))
print(encoder_target.classes_)
# Binary target: 1 for the class encoded as 1, 0 for every other class.
# NOTE(review): which label gets code 1 depends on the encoder's class order
# (see the `classes_` printed above) — confirm it is the intended positive class.
y = pd.DataFrame(np.where((y == 1), 1, 0), columns=["CREDIT_SCORE"])

# Flatten once; the same vector is both the split target and the
# stratification key.
y_flat = y.values.ravel()

# Stratified 80/20 split, seeded with the notebook-wide SEED instead of a
# duplicated literal.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns=["CREDIT_SCORE"]),
    y_flat,
    test_size=0.2,
    random_state=SEED,
    stratify=y_flat,
)

## Generación del Pipeline

In [None]:
# Column groups routed to each preprocessing branch.
categorical_columns = ["OCCUPATION"]
text_numeric_columns = [
    "AGE",
    "ANNUAL_INCOME",
    "NUM_OF_LOAN",
    "NUM_OF_DELAYED_PAYMENT",
    "CHANGED_CREDIT_LIMIT",
    "OUTSTANDING_DEBT",
    "AMOUNT_INVESTED_MONTHLY",
    "MONTHLY_BALANCE",
]
plain_numeric_columns = [
    "MONTHLY_INHAND_SALARY",
    "NUM_BANK_ACCOUNTS",
    "NUM_CREDIT_CARD",
    "INTEREST_RATE",
    "DELAY_FROM_DUE_DATE",
    "NUM_CREDIT_INQUIRIES",
    "CREDIT_UTILIZATION_RATIO",
    "TOTAL_EMI_PER_MONTH",
]

# Categorical branch: text clean-up followed by WoE encoding.
pl_cat = Pipeline(
    steps=[
        ("clean", mb_clean_text(replace_value="MISSING")),
        ("woe", mb_woe_encoder()),
    ]
)

# Numbers stored as text: parse, impute with the median, standardise.
pl_cat_num = Pipeline(
    steps=[
        ("clean", mb_clean_text_number()),
        ("mb_simple_imputer", mb_simple_imputer(strategy="median")),
        ("scaler", mb_standard_scaler()),
    ]
)

# Already numeric columns: impute with the median, standardise.
pl_num = Pipeline(
    steps=[
        ("mb_simple_imputer", mb_simple_imputer(strategy="median")),
        ("scaler", mb_standard_scaler()),
    ]
)

# Full preprocessing: each branch is applied to its own group of columns.
pl_preprocess = ColumnTransformer(
    transformers=[
        ("prep_cat", pl_cat, categorical_columns),
        ("prep_cat_num", pl_cat_num, text_numeric_columns),
        ("prep_num", pl_num, plain_numeric_columns),
    ],
    force_int_remainder_cols=False,
)

In [None]:
# Display the preprocessing pipeline as the cell output.
pl_preprocess

In [None]:
# Fit the preprocessing on the training split and wrap the result in a
# DataFrame labelled with the transformer's output feature names.
train_matrix = pl_preprocess.fit_transform(x_train, y_train)
train_feature_names = pl_preprocess.get_feature_names_out()
x_train_clean = pd.DataFrame(train_matrix, columns=train_feature_names)

In [None]:
# Apply the already fitted preprocessing to the test split.
test_matrix = pl_preprocess.transform(x_test)
test_feature_names = pl_preprocess.get_feature_names_out()
x_test_clean = pd.DataFrame(test_matrix, columns=test_feature_names)

## Tensorflow

In [None]:
# Se establece semilla
random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Crear un modelo secuencial en Keras (TensorFlow)
model = tf.keras.Sequential(
    [
        tf.keras.layers.InputLayer(shape=(x_train_clean.shape[1],)),  # Capa de entrada
        tf.keras.layers.Dense(64, activation="relu"),  # Primera capa oculta
        tf.keras.layers.Dense(32, activation="relu"),  # Segunda capa oculta
        tf.keras.layers.Dense(
            1, activation="sigmoid"
        ),  # Capa de salida (sigmoide para clasificación binaria)
    ]
)

# Compilar el modelo
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["auc"])

# Crear el callback EarlyStopping
early_stopping = EarlyStopping(
    monitor="val_loss",  # Monitorea la pérdida en el conjunto de validación
    patience=3,  # Espera 3 épocas sin mejora
    restore_best_weights=True,
)  # Restaura los mejores pesos

# Entrenar el modelo
model.fit(
    x_train_clean.values,
    y_train[:, np.newaxis],
    epochs=50,
    batch_size=128,
    validation_data=(x_test_clean, y_test),
)

# Evaluar el modelo
loss, auc = model.evaluate(x_train_clean.values, y_train)
print(f"Train Gini: {2 * auc - 1}")
loss, auc = model.evaluate(x_test_clean.values, y_test)
print(f"Test Gini: {2 * auc - 1}")

## Pytorch

In [None]:
# Se establece semilla
random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Convertir los datos a tensores de PyTorch
x_train_tensor = torch.tensor(x_train_clean.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)

# Convertir los datos a tensores de PyTorch
x_test_tensor = torch.tensor(x_test_clean.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

In [None]:
# Crear un DataLoader para el entrenamiento
train_data = TensorDataset(x_train_tensor, y_train_tensor)
generator = torch.Generator()
generator.manual_seed(SEED)
train_loader = DataLoader(train_data, batch_size=128, shuffle=True, generator=generator)

In [None]:
# Definir la red neuronal en PyTorch
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(
            x_train_tensor.shape[1], 64
        )  # Capa de entrada (3 características)
        self.layer2 = nn.Linear(64, 32)  # Primera capa oculta
        self.layer3 = nn.Linear(32, 1)  # Capa de salida
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = torch.relu(self.layer1(x))  # ReLU en la primera capa oculta
        x = torch.relu(self.layer2(x))  # ReLU en la segunda capa oculta
        x = self.sigmoid(self.layer3(x))  # Sigmoide en la capa de salida
        return x


# Crear el modelo
model = MLP()

# Definir la función de pérdida y el optimizador
criterion = nn.BCELoss()  # Binary Cross Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Entrenamiento del modelo
epochs = 50
for epoch in range(epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        # Forward pass
        y_pred = model(X_batch)

        # Calcular la pérdida
        loss = criterion(y_pred, y_batch)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()

        # Actualizar los pesos
        optimizer.step()

    # Imprimir el progreso cada 10 épocas
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")

# Evaluación del modelo
model.eval()
with torch.no_grad():
    print(
        rf"Gini Train: {2 * roc_auc_score(y_train_tensor, model(x_train_tensor)) - 1: .2%}"
    )
    print(
        rf"Gini Test: {2 * roc_auc_score(y_test_tensor, model(x_test_tensor)) - 1: .2%}"
    )