# Prework

## Functions and libraries

In [67]:
import os
# Base directory that holds the project's data files. The value is
# machine-specific: edit it (or switch to the alternative below) before running.
# PATH = "/Users/luanagiusto/TP-1-ML"  # Change this if your path is different
PATH = "C:/Users/julia/OneDrive/Escritorio/Archivos/Capacitación/Maestría/03. Machine Learning/TP"

In [68]:
import pandas as pd
import numpy as np
# from ydata_profiling import ProfileReport
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)
import gc
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Nota: Antes de ejecutar este notebook, instala los requisitos con:
# !pip install -r requirements.txt

In [69]:
def data_profiling(df, output_file):
    """Write a lightweight profiling report for ``df`` to ``output_file``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to profile. Frames with more than 20000 rows are profiled on a
        reproducible random sample of 20000 rows (``random_state=42``).
    output_file : str
        Destination path of the report; it is also used as the report title.
        Open the resulting HTML file in a browser.

    Raises
    ------
    ImportError
        If the optional ``ydata-profiling`` package is not installed.
    """
    # Imported here because the module-level import is commented out; without
    # this the name ProfileReport is undefined and the call fails with NameError.
    try:
        from ydata_profiling import ProfileReport
    except ImportError as err:
        raise ImportError(
            "data_profiling requires the optional 'ydata-profiling' package; "
            "install it with `pip install ydata-profiling`."
        ) from err

    max_rows = 20000
    sample = df.sample(max_rows, random_state=42) if len(df) > max_rows else df

    # Options that keep the report light
    profile = ProfileReport(
        sample,
        title=output_file,
        minimal=True,         # turns off the expensive analyses
        explorative=True      # adds useful sections
    )

    profile.to_file(output_file)

In [70]:
# Print a per-column overview of a DataFrame
def df_info_summary(df: pd.DataFrame):
    """Print one row per column: non-null count, null count, null percentage and dtype.

    The function only prints the table; it returns ``None``.
    """
    n_rows = len(df)
    null_counts = df.isna().sum()
    # Share of missing values per column, as a percentage with two decimals
    pct_null = (null_counts / n_rows * 100).round(2)

    overview = pd.DataFrame({
        "Non-Null Count": df.notna().sum(),
        "Null Count": null_counts,
        "% Null": pct_null,
        "Dtype": df.dtypes,
    })
    print(overview)

In [71]:
def resumir_por_id(df, id_col='ID', excluir_cols=None, verbose=False, nombre_conteo='n_registros'):
    """
    Summarise a DataFrame by grouping on an ID column.

    Computes mean, min, max, median and sum for every numeric column (except
    the excluded ones and the ID itself) plus the number of rows per ID.
    The input DataFrame is left untouched.

    Parameters:
    - df: input DataFrame with several rows per ID.
    - id_col: name of the column identifying each entity.
    - excluir_cols: optional list of columns to leave out of the summary.
      If it contains ``id_col``, that entry is ignored.
    - verbose: if True, print the excluded and the summarised columns.
    - nombre_conteo: base name of the row-count column; the output column is
      called ``f"{nombre_conteo}_count"``. A numeric input column with this
      same name is not summarised, so the output never has clashing names.

    Returns:
    - DataFrame with one row per ID, columns named ``<column>_<stat>`` and the
      row count as the last column.
    """
    if excluir_cols is None:
        excluir_cols = []

    excluir_set = set(excluir_cols)
    excluir_set.discard(id_col)

    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
    cols_a_resumir = [
        col for col in numeric_cols
        if col not in excluir_set and col not in (id_col, nombre_conteo)
    ]

    if verbose:
        print(f"Columnas excluidas: {sorted(excluir_set)}")
        print(f"Columnas resumidas: {sorted(cols_a_resumir)}")

    agg_funcs = ['mean', 'min', 'max', 'median', 'sum']
    grouped = df.groupby(id_col)

    # Rows per ID taken from the group sizes, so no helper column has to be
    # written into the caller's DataFrame.
    conteo = grouped.size().rename(f"{nombre_conteo}_count")

    if cols_a_resumir:
        resumen = grouped[cols_a_resumir].agg(agg_funcs)
        # Flatten the (column, stat) MultiIndex into "<column>_<stat>"
        resumen.columns = [f"{col}_{stat}" for col, stat in resumen.columns]
        resumen[conteo.name] = conteo
    else:
        # Nothing numeric to summarise: the result is just the count per ID
        resumen = conteo.to_frame()

    return resumen.reset_index()

In [72]:
# Normalise the column names of a DataFrame
def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column names are lower-case and underscore-separated.

    Names are trimmed and lower-cased, every run of characters outside
    ``a-z``, ``0-9`` and ``_`` becomes a single underscore, repeated
    underscores are collapsed and leading/trailing underscores are removed.
    The input DataFrame is not modified.
    """
    renamed = df.copy()

    names = renamed.columns.str.strip().str.lower()
    # Anything that is not a lower-case letter, digit or underscore -> "_"
    names = names.str.replace(r'[^a-z0-9_]+', '_', regex=True)
    # Collapse runs of underscores left by the previous step
    names = names.str.replace(r'__+', '_', regex=True)
    names = names.str.strip('_')

    renamed.columns = names
    return renamed



In [73]:
# Helper metric
def root_mean_squared_error(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Model evaluation
def fit_transform_model(train_df, target_col="target", sample_frac=None):
    """Fit the baseline model(s) on an 80/20 split of ``train_df`` and print each RMSE.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Feature columns plus the target column. Every column other than the
        target is used as a feature.
    target_col : str, default "target"
        Name of the target column.
    sample_frac : float or None, default None
        If given, only this fraction of the rows (drawn with
        ``random_state=42``) is used, which speeds up quick experiments.
        ``None`` uses every row. The default keeps the full data set so that
        results stay comparable with earlier runs, which also trained on all
        rows.

    Returns
    -------
    None
        One line per model is printed: ``"<ModelClass> RMSE: <value>"``.
    """
    if sample_frac is not None:
        train_df = train_df.sample(frac=sample_frac, random_state=42)

    # NOTE(review): identifier-like columns (if any) are fed to the model as
    # ordinary features here - confirm that this is intended.
    X = train_df.drop(columns=target_col)
    y = train_df[target_col]

    # Train / test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    models = [
        LinearRegression(n_jobs=-1)
    ]

    for model in models:
        model_name = model.__class__.__name__
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        rmse = root_mean_squared_error(y_test, predictions)
        print(f"{model_name} RMSE: {rmse:.4f}")

## Data import and overview

In [74]:
# Base data directory (machine-specific; same setting as at the top of the notebook).
# PATH = "/Users/luanagiusto/TP-1-ML"  # Change this if your path is different
PATH = "C:/Users/julia/OneDrive/Escritorio/Archivos/Capacitación/Maestría/03. Machine Learning/TP"
# Load the training table from <PATH>/train.parquet with the fastparquet engine.
train_df = pd.read_parquet(os.path.join(PATH, "train.parquet"), engine='fastparquet')

In [75]:
# Null counts and dtypes per column of the freshly loaded data.
df_info_summary(train_df)

                                                    Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                                                  307511           0    0.00    int64
TARGET                                                      307511           0    0.00    int64
NAME_CONTRACT_TYPE                                          307511           0    0.00   object
CODE_GENDER                                                 307511           0    0.00   object
FLAG_OWN_CAR                                                307511           0    0.00   object
FLAG_OWN_REALTY                                             307511           0    0.00   object
CNT_CHILDREN                                                307511           0    0.00    int64
AMT_INCOME_TOTAL                                            307511           0    0.00  float64
AMT_CREDIT                                                  307511           0    0.00  float64
AMT_ANNUITY                             

In [76]:
# For now every NaN is replaced with 0 (in all columns, whatever their dtype);
# a better imputation strategy is still to be worked out.
train_df = train_df.fillna(0)
# Sanity check: list the columns that still contain NaN (an empty list is expected).
print("Columnas con valores NaN despues de rellenar:")
print(train_df.columns[train_df.isna().any()].tolist())

Columnas con valores NaN despues de rellenar:
[]


In [77]:
# Size, dtype counts and memory usage after the NaN fill.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 662 entries, SK_ID_CURR to CCB_credit_card_balance_records_count
dtypes: float64(605), int64(41), object(16)
memory usage: 1.5+ GB


In [78]:
# One-hot encoding of the categorical columns
# Categorical columns = every column with object dtype
cat_cols = train_df.select_dtypes(include=["object"]).columns.tolist()
print("Columnas categóricas:", cat_cols)

# Apply one-hot encoding; dummy_na=True also asks for a NaN indicator column
# for each encoded column.
# NOTE(review): NaNs were already replaced with 0 earlier in the notebook, so
# these indicator columns are presumably constant and a missing category shows
# up as its own "0" level instead - confirm, and consider encoding before imputing.
train_df = pd.get_dummies(train_df, columns=cat_cols, dummy_na=True)

train_df.shape

Columnas categóricas: ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']


(307511, 808)

In [79]:
# Clean the column names (lower-case, underscores only) and print the summary again
train_df = clean_column_names(train_df)
df_info_summary(train_df)

                                                    Non-Null Count  Null Count  % Null    Dtype
sk_id_curr                                                  307511           0     0.0    int64
target                                                      307511           0     0.0    int64
cnt_children                                                307511           0     0.0    int64
amt_income_total                                            307511           0     0.0  float64
amt_credit                                                  307511           0     0.0  float64
amt_annuity                                                 307511           0     0.0  float64
amt_goods_price                                             307511           0     0.0  float64
region_population_relative                                  307511           0     0.0  float64
days_birth                                                  307511           0     0.0    int64
days_employed                           

In [80]:
# Test fit: baseline RMSE before any engineered feature is added
fit_transform_model(train_df)

LinearRegression RMSE: 0.2618


## Feature engineering

In [86]:
# Credit / income ratios

def _safe_ratio(numerator, denominator, zero_as=np.nan):
    """Divide two Series element-wise.

    Zeros in the denominator are first replaced by ``zero_as``; infinite or
    missing results are returned as 0.
    """
    quotient = numerator.div(denominator.replace(0, zero_as))
    return quotient.replace([np.inf, -np.inf], np.nan).fillna(0)


# credit_term ≈ months of the credit
train_df["credit_term"] = _safe_ratio(train_df["amt_credit"], train_df["amt_annuity"])

# credit_to_income = credit / total income
train_df["credit_to_income"] = _safe_ratio(train_df["amt_credit"], train_df["amt_income_total"])

# annuity_income_pct = annuity / total income
train_df["annuity_income_pct"] = _safe_ratio(train_df["amt_annuity"], train_df["amt_income_total"])

# goods_price_to_credit = goods price / credit
train_df["goods_price_to_credit"] = _safe_ratio(train_df["amt_goods_price"], train_df["amt_credit"])

# income_per_child = income / number of children
# (a count of 0 is treated as 1, so the income itself is kept)
train_df["income_per_child"] = _safe_ratio(
    train_df["amt_income_total"], train_df["cnt_children"], zero_as=1
)

# income_per_family_member = income / number of family members
# (a count of 0 is treated as 1, as above)
train_df["income_per_family_member"] = _safe_ratio(
    train_df["amt_income_total"], train_df["cnt_fam_members"], zero_as=1
)

In [87]:
# Re-evaluate the model with the credit/income ratio features in place.
fit_transform_model(train_df)

LinearRegression RMSE: 0.2619


In [88]:
# Age and time in the current job

# age_years = age in years
# Make sure the base columns exist: each one is created only when it is missing
# and its source column is present, so re-running the cell does not recompute it.
# The sign of days_birth is flipped before converting to years
# (presumably it is stored as a negative day count - confirm).
if "age_years" not in train_df and "days_birth" in train_df:
    train_df["age_years"] = (
        (-train_df["days_birth"] / 365)
          .replace([np.inf, -np.inf], np.nan)
          .clip(lower=0)
          .fillna(0)
    )

# emp_years = years employed, derived the same way and clipped to [0, 60]
if "emp_years" not in train_df and "days_employed" in train_df:
    train_df["emp_years"] = (
        (-train_df["days_employed"] / 365)
          .replace([np.inf, -np.inf], np.nan)
          .clip(lower=0, upper=60)
          .fillna(0)
    )

# Ratio between years employed and age (0 when the age is 0)
# NOTE(review): this runs unconditionally and raises KeyError if either base
# column could not be created above.
train_df["emp_to_age"] = (
    train_df["emp_years"]
      .div(train_df["age_years"].replace(0, np.nan))
      .replace([np.inf, -np.inf], np.nan)
      .fillna(0)
)

In [89]:
# Re-evaluate the model with the age and tenure features in place.
fit_transform_model(train_df)

LinearRegression RMSE: 0.2619


In [92]:
# Flag totals
# Sum every column whose name contains 'flag_'. The derived 'flag_sum' column
# matches that pattern as well, so it is excluded explicitly: otherwise
# re-running this cell would add the previous total into the new one.
flag_cols = [col for col in train_df.columns if 'flag_' in str(col) and col != 'flag_sum']
train_df["flag_sum"] = train_df[flag_cols].sum(axis=1)

# Row-wise sum of the six address-mismatch indicator columns
train_df['mismatch_address_flags'] = train_df[[
    'reg_region_not_live_region',
    'reg_region_not_work_region',
    'live_region_not_work_region',
    'reg_city_not_live_city',
    'reg_city_not_work_city',
    'live_city_not_work_city'
]].sum(axis=1)


In [93]:
# Re-evaluate the model with the flag totals in place.
fit_transform_model(train_df)

LinearRegression RMSE: 0.2619


In [108]:
# external_sources_mean = row-wise mean of the three ext_source columns
# NOTE(review): NaNs are zero-filled earlier in the notebook, so a missing
# score presumably enters the mean as 0 instead of being skipped - confirm
# that this is intended.
train_df["external_sources_mean"] = train_df[[
    'ext_source_1',
    'ext_source_2',
    'ext_source_3'
]].mean(axis=1)

# Same mean restricted to ext_source_2 and ext_source_3
train_df["external_sources_mean_2_3"] = train_df[[
    'ext_source_2',
    'ext_source_3'
]].mean(axis=1)

In [109]:
# Re-evaluate the model with the external-source means in place.
fit_transform_model(train_df)

LinearRegression RMSE: 0.2615


In [110]:
# ext_source_prod = product of the three ext_source columns
# (the trailing fillna(0) turns any NaN product into 0)
train_df["ext_source_prod"] = (
    train_df["ext_source_1"] *
    train_df["ext_source_2"] *
    train_df["ext_source_3"]
).fillna(0)

# Pairwise products: 2x3, 1x3 and 1x2
train_df["ext_source_prod_2x3"] = (
    train_df["ext_source_2"] *
    train_df["ext_source_3"]
).fillna(0)

train_df["ext_source_prod_1x3"] = (
    train_df["ext_source_1"] *
    train_df["ext_source_3"]
).fillna(0)

train_df["ext_source_prod_1x2"] = (
    train_df["ext_source_1"] *
    train_df["ext_source_2"]
).fillna(0)

In [111]:
# Re-evaluate the model with the external-source products in place.
fit_transform_model(train_df)

LinearRegression RMSE: 0.2615


In [112]:
# Housing and building characteristics

def _housing_ratio(numerator_col, denominator_col):
    """Ratio of two train_df columns; zero denominators and non-finite results give 0."""
    denominator = train_df[denominator_col].replace(0, np.nan)
    ratio = train_df[numerator_col].div(denominator)
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0)


# The same five features are built for each of the three column suffixes
for suf in ['avg', 'mode', 'medi']:
    # living area / apartments
    train_df[f'living_area_ratio_{suf}'] = _housing_ratio(
        f'livingarea_{suf}', f'apartments_{suf}'
    )
    # non-living area / non-living apartments
    train_df[f'nonliving_area_ratio_{suf}'] = _housing_ratio(
        f'nonlivingarea_{suf}', f'nonlivingapartments_{suf}'
    )
    # floor difference (max - min)
    train_df[f'floor_diff_{suf}'] = (
        train_df[f'floorsmax_{suf}'] - train_df[f'floorsmin_{suf}']
    )
    # common area / entrances
    train_df[f'area_per_entrance_{suf}'] = _housing_ratio(
        f'commonarea_{suf}', f'entrances_{suf}'
    )
    # elevators / entrances
    train_df[f'elevators_per_entrance_{suf}'] = _housing_ratio(
        f'elevators_{suf}', f'entrances_{suf}'
    )

In [113]:
# Re-evaluate the model with the housing/building features in place.
fit_transform_model(train_df)

LinearRegression RMSE: 0.2615


In [114]:
# Social-circle variables
# ——————————————————————————————————————————————
# def30_rate = def_30 count / obs_30 count (0 when the obs count is 0)
train_df['def30_rate'] = (
    train_df['def_30_cnt_social_circle']
    .div(train_df['obs_30_cnt_social_circle'].replace(0, np.nan))
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# def60_rate = def_60 count / obs_60 count (0 when the obs count is 0)
train_df['def60_rate'] = (
    train_df['def_60_cnt_social_circle']
    .div(train_df['obs_60_cnt_social_circle'].replace(0, np.nan))
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# Totals: the 30 and 60 columns are simply added together
train_df['social_obs_total'] = (
    train_df['obs_30_cnt_social_circle'] + train_df['obs_60_cnt_social_circle']
)

train_df['social_def_total'] = (
    train_df['def_30_cnt_social_circle'] + train_df['def_60_cnt_social_circle']
)


In [115]:
# Re-evaluate the model with the social-circle features in place.
fit_transform_model(train_df)

LinearRegression RMSE: 0.2615


In [116]:
# Credit bureau requests
# The six amt_req_credit_bureau_* columns (hour, day, week, mon, qrt, year)
bureau_cols = [
    'amt_req_credit_bureau_hour',
    'amt_req_credit_bureau_day',
    'amt_req_credit_bureau_week',
    'amt_req_credit_bureau_mon',
    'amt_req_credit_bureau_qrt',
    'amt_req_credit_bureau_year'
]

# bureau_req_total = row-wise sum of the six columns
train_df['bureau_req_total'] = train_df[bureau_cols].sum(axis=1)

# bureau_hour_day_ratio = hour column / day column (0 when the day column is 0)
train_df['bureau_hour_day_ratio'] = (
    train_df['amt_req_credit_bureau_hour']
    .div(train_df['amt_req_credit_bureau_day'].replace(0, np.nan))
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# bureau_mon_year_ratio = mon column / year column (0 when the year column is 0)
train_df['bureau_mon_year_ratio'] = (
    train_df['amt_req_credit_bureau_mon']
    .div(train_df['amt_req_credit_bureau_year'].replace(0, np.nan))
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)


In [117]:
# Re-evaluate the model with the credit-bureau features in place.
fit_transform_model(train_df)

LinearRegression RMSE: 0.2615


## Save final DataFrame

In [118]:
# Save the final DataFrame as parquet at <PATH>/train_df.parquet.
# No engine is passed here, whereas the input was read with engine='fastparquet'.
train_df.to_parquet(os.path.join(PATH, "train_df.parquet"))
