### Prework

Realizamos la limpieza y adaptación de las variables.

In [2]:
import pandas as pd

train_df = pd.read_csv("application_train.csv")


# Detectar columnas con solo "y"/"n" y mapear a 1/0
bool_cols = [c for c in train_df.columns if set(train_df[c].dropna().unique()) <= {"y", "n"}]
if bool_cols:
    train_df[bool_cols] = train_df[bool_cols].apply(lambda s: s.map({"y": 1, "n": 0}).astype("int8"))

### Prueba de referencia con Regresión lineal

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# X numéricas (incluye las mapeadas), y target
X_num = train_df.select_dtypes(include=["number"]).drop(columns=["TARGET"], errors="ignore")
y = train_df["TARGET"]


# a) Drop NaN
X_drop = X_num.dropna()
y_drop = y.loc[X_drop.index]
print("Linear Regression, Drop NaN rows")
lr = LinearRegression()
lr.fit(X_drop, y_drop)
rmse_drop = np.sqrt(mean_squared_error(y_drop, lr.predict(X_drop)))
print("RMSE (dropna):", rmse_drop)

# b) Imputar con media
X_mean = X_num.fillna(X_num.mean(numeric_only=True))
lr2 = LinearRegression()
lr2.fit(X_mean, y)
rmse_mean = np.sqrt(mean_squared_error(y, lr2.predict(X_mean)))
print("RMSE (mean):", rmse_mean)

Linear Regression, Drop NaN rows
RMSE (dropna): 0.23050061380855455
RMSE (mean): 0.2644287502163005


### Hago One Hot Encoding con las variables categóricas

In [6]:
# columnas categóricas restantes
cat_cols = train_df.select_dtypes(include=["object"]).columns.tolist()

# opcional: quitar IDs antes de generar features
id_cols = ["SK_ID_CURR"]  # agregá más si hace falta

# X con one-hot, y como target
X_ohe = pd.get_dummies(
    train_df.drop(columns=["TARGET"] + id_cols, errors="ignore"),
    columns=cat_cols,
    dummy_na=True,   # conserva la categoría "NaN" como dummy
    dtype="int8"     # compacto en memoria
)
y = train_df["TARGET"]

print("X_ohe shape:", X_ohe.shape)
print("y shape:", y.shape)

X_ohe shape: (307511, 260)
y shape: (307511,)


Genaramos algunas varibles que entendemos representativas, como el CREDIT_TERM ≈ AMT_CREDIT / AMT_ANNUITY; PAYMENT_RATE = AMT_ANNUITY / AMT_CREDIT; CAR_AGE_TO_AGE = OWN_CAR_AGE / AGE_YEARS

In [7]:
import numpy as np

# CREDIT_TERM ≈ meses del crédito
train_df["CREDIT_TERM"] = train_df["AMT_CREDIT"].div(
    train_df["AMT_ANNUITY"].replace(0, np.nan)
)

# PAYMENT_RATE = cuota / crédito
train_df["PAYMENT_RATE"] = train_df["AMT_ANNUITY"].div(
    train_df["AMT_CREDIT"].replace(0, np.nan)
)

# ANNUITY_INCOME_PCT = cuota / ingreso total
train_df["ANNUITY_INCOME_PCT"] = train_df["AMT_ANNUITY"].div(
    train_df["AMT_INCOME_TOTAL"].replace(0, np.nan))

    # Asegurar columnas base
if "AGE_YEARS" not in train_df and "DAYS_BIRTH" in train_df:
    train_df["AGE_YEARS"] = (-train_df["DAYS_BIRTH"] / 365).clip(lower=0)

if "EMP_YEARS" not in train_df and "DAYS_EMPLOYED" in train_df:
    train_df["EMP_YEARS"] = (-train_df["DAYS_EMPLOYED"] / 365).clip(lower=0, upper=60)

# Relación entre años de empleo y edad
train_df["EMP_TO_AGE"] = train_df["EMP_YEARS"].div(
    train_df["AGE_YEARS"].replace(0, np.nan)
)

# Relación entre edad del auto y edad del cliente
train_df["CAR_AGE_TO_AGE"] = (
    train_df["OWN_CAR_AGE"].div(train_df["AGE_YEARS"].replace(0, np.nan))
    if "OWN_CAR_AGE" in train_df else np.nan
)