# Functions and libraries

In [18]:
import pandas as pd
from ydata_profiling import ProfileReport
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)
import gc

In [19]:
def data_profiling(df, output_file):
    # Opciones para que sea liviano
    profile = ProfileReport(
        df.sample(20000, random_state=42) if len(df) > 20000 else df,
        title=output_file,
        minimal=True,         # desactiva análisis costosos
        explorative=True      # agrega secciones útiles
    )

    profile.to_file(output_file)  # <-- abre este HTML en el navegador

In [20]:
# Funcion para mostrar un resumen del dataframe
def df_info_summary(df: pd.DataFrame):
    total = len(df)
    non_null = df.notnull().sum()
    nulls = df.isnull().sum()
    dtypes = df.dtypes
    
    resumen = pd.DataFrame({
        "Non-Null Count": non_null,
        "Null Count": nulls,
        "% Null": (nulls / total * 100).round(2),
        "Dtype": dtypes
    })
    print(resumen)

## Bureau data

### Data import and overview

In [21]:
# Create data profiles for bureau and bureau_balance datasets

bureau_df = pd.read_csv("bureau.csv")
bureau_balance_df = pd.read_csv("bureau_balance.csv")

data_profiling(bureau_df, "bureau_df_profile.html")
data_profiling(bureau_balance_df, "bureau_df_balance_profile.html")


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 17/17 [00:00<00:00, 61.83it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00, 249.80it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [22]:
# Mostrar las primeras filas de bureau dataset
bureau_df.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


In [23]:
len(bureau_df)

1716428

In [24]:
# Display the first few rows of the bureau balance data set
bureau_balance_df.head()

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [25]:
len(bureau_balance_df)

27299925

In [26]:
# Info sobre las columnas del bureau_df
"""
1. **SK_ID_CURR**
   * ID del cliente (llave para unir con `application_{train|test}.csv`).
2. **SK_ID_BUREAU**
   * ID único del préstamo en el Buró de Crédito (llave para unir con `bureau_balance.csv`).
3. **CREDIT_ACTIVE**
   * Estado actual del crédito reportado.
   * Valores: *Active, Closed, Sold, Bad debt*.
4. **CREDIT_CURRENCY**
   * Moneda en que está registrado el crédito en el Buró.
5. **DAYS_CREDIT**
   * Días relativos a la fecha de aplicación en Home Credit en que se otorgó este préstamo externo.
   * Ej: `-1000` → el préstamo fue otorgado 1000 días antes de la aplicación.
6. **CREDIT_DAY_OVERDUE**
   * Número de días de atraso en pagos en el momento de la aplicación (si aplica).
7. **DAYS_CREDIT_ENDDATE**
   * Duración **restante** del crédito (en días) al momento de la aplicación.
   * Positivo = le quedan días para terminar.
   * Negativo = ya debería haber finalizado.
8. **DAYS_ENDDATE_FACT**
   * Días desde la **finalización real** del crédito, al momento de la aplicación (solo si está cerrado).
   * Negativo = terminó antes de la aplicación.
9. **AMT_CREDIT_MAX_OVERDUE**
   * Monto máximo de deuda vencida registrado durante la vida de ese crédito.
10. **CNT_CREDIT_PROLONG**
    * Cantidad de veces que se extendió/prorrogó este crédito.
11. **AMT_CREDIT_SUM**
    * Monto actual del crédito según Buró.
12. **AMT_CREDIT_SUM_DEBT**
    * Monto actual de deuda pendiente de ese crédito.
13. **AMT_CREDIT_SUM_LIMIT**
    * Límite actual de crédito (si es aplicable, ej. tarjeta).
14. **AMT_CREDIT_SUM_OVERDUE**
    * Monto actual vencido en ese crédito.
15. **CREDIT_TYPE**
    * Tipo de crédito según Buró (ej: *Car loan, Consumer credit, Mortgage, Credit card*).
16. **DAYS_CREDIT_UPDATE**
    * Días relativos a la aplicación en que se actualizó por última vez la info del crédito en el Buró.
17. **AMT_ANNUITY**
    * Monto de la cuota periódica (anualidad) reportado en el Buró para este crédito.
"""
df_info_summary(bureau_df)

                        Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                     1716428           0    0.00    int64
SK_ID_BUREAU                   1716428           0    0.00    int64
CREDIT_ACTIVE                  1716428           0    0.00   object
CREDIT_CURRENCY                1716428           0    0.00   object
DAYS_CREDIT                    1716428           0    0.00    int64
CREDIT_DAY_OVERDUE             1716428           0    0.00    int64
DAYS_CREDIT_ENDDATE            1610875      105553    6.15  float64
DAYS_ENDDATE_FACT              1082775      633653   36.92  float64
AMT_CREDIT_MAX_OVERDUE          591940     1124488   65.51  float64
CNT_CREDIT_PROLONG             1716428           0    0.00    int64
AMT_CREDIT_SUM                 1716415          13    0.00  float64
AMT_CREDIT_SUM_DEBT            1458759      257669   15.01  float64
AMT_CREDIT_SUM_LIMIT           1124648      591780   34.48  float64
AMT_CREDIT_SUM_OVERDUE         1716428          

In [27]:
# Info sobre las columnas del bureau_balance_df
"""
SK_ID_BUREAU → vincula con bureau.
MONTHS_BALANCE → mes relativo a la aplicación actual (ej. -1 = mes anterior, -6 = seis meses antes).
STATUS → estado en ese mes:
0 = al día (DPD 0)
1 = atraso 1–30 días
2 = atraso 31–60
3 = atraso 61–90
4 = atraso 91–120
5 = atraso 120+ o vendido/castigado
C = cerrado
X = desconocido

**DPD = Days Past Due
"""

df_info_summary(bureau_balance_df)

                Non-Null Count  Null Count  % Null   Dtype
SK_ID_BUREAU          27299925           0     0.0   int64
MONTHS_BALANCE        27299925           0     0.0   int64
STATUS                27299925           0     0.0  object


### Data prep - bureau.csv 

In [28]:
# bureau_df.loc[bureau_df["DAYS_CREDIT_ENDDATE"].isna(), ["CREDIT_TYPE", "CREDIT_ACTIVE"]].value_counts()
# bureau_df.loc[bureau_df["DAYS_ENDDATE_FACT"].isna(), ["CREDIT_TYPE", "CREDIT_ACTIVE"]].value_counts()
# bureau_df.loc[bureau_df["AMT_CREDIT_SUM"].isna(), ["CREDIT_TYPE", "CREDIT_ACTIVE"]].value_counts()
# bureau_df.loc[bureau_df["AMT_CREDIT_SUM_DEBT"].isna(), ["CREDIT_TYPE", "CREDIT_ACTIVE"]].value_counts()
# bureau_df.loc[bureau_df["AMT_CREDIT_SUM_LIMIT"].isna(), ["CREDIT_TYPE", "CREDIT_ACTIVE"]].value_counts()
bureau_df.loc[bureau_df["AMT_ANNUITY"].isna(), ["CREDIT_TYPE", "CREDIT_ACTIVE"]].value_counts()


CREDIT_TYPE                                   CREDIT_ACTIVE
Consumer credit                               Closed           683665
                                              Active           208321
Credit card                                   Active           206898
                                              Closed            84338
Car loan                                      Closed            12090
Mortgage                                      Active             8047
Car loan                                      Active             6267
Microloan                                     Closed             4773
Mortgage                                      Closed             3457
Consumer credit                               Sold               2257
Credit card                                   Sold               1470
Microloan                                     Active             1406
Loan for business development                 Closed             1140
Another type of loan          

In [29]:
# Por ahora reemplazo nan con ceros, pero habría que ver si se puede mejorar
bureau_df = bureau_df.fillna(0)
print("Columnas con valores NaN despues de rellenar:")
print(bureau_df.columns[bureau_df.isna().any()].tolist())

Columnas con valores NaN despues de rellenar:
[]


In [30]:
df_info_summary(bureau_df)

                        Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                     1716428           0     0.0    int64
SK_ID_BUREAU                   1716428           0     0.0    int64
CREDIT_ACTIVE                  1716428           0     0.0   object
CREDIT_CURRENCY                1716428           0     0.0   object
DAYS_CREDIT                    1716428           0     0.0    int64
CREDIT_DAY_OVERDUE             1716428           0     0.0    int64
DAYS_CREDIT_ENDDATE            1716428           0     0.0  float64
DAYS_ENDDATE_FACT              1716428           0     0.0  float64
AMT_CREDIT_MAX_OVERDUE         1716428           0     0.0  float64
CNT_CREDIT_PROLONG             1716428           0     0.0    int64
AMT_CREDIT_SUM                 1716428           0     0.0  float64
AMT_CREDIT_SUM_DEBT            1716428           0     0.0  float64
AMT_CREDIT_SUM_LIMIT           1716428           0     0.0  float64
AMT_CREDIT_SUM_OVERDUE         1716428          

### Data prep - bureau_balance.csv

In [31]:
len(bureau_balance_df)

27299925

In [32]:
# Crear tabla de conteo de status por SK_ID_BUREAU
# OHE con get_dummies
status_dummies = pd.get_dummies(bureau_balance_df["STATUS"], prefix="SUM_STATUS", dtype="int32")

# Concatenar con SK_ID_BUREAU
tmp = pd.concat([bureau_balance_df[["SK_ID_BUREAU"]], status_dummies], axis=1)

# Agrupar por SK_ID_BUREAU y sumar
status_counts = tmp.groupby("SK_ID_BUREAU", as_index=False).sum()

In [33]:
status_counts.head()

Unnamed: 0,SK_ID_BUREAU,SUM_STATUS_0,SUM_STATUS_1,SUM_STATUS_2,SUM_STATUS_3,SUM_STATUS_4,SUM_STATUS_5,SUM_STATUS_C,SUM_STATUS_X
0,5001709,0,0,0,0,0,0,86,11
1,5001710,5,0,0,0,0,0,48,30
2,5001711,3,0,0,0,0,0,0,1
3,5001712,10,0,0,0,0,0,9,0
4,5001713,0,0,0,0,0,0,0,22


In [34]:
df_info_summary(status_counts)

              Non-Null Count  Null Count  % Null  Dtype
SK_ID_BUREAU          817395           0     0.0  int64
SUM_STATUS_0          817395           0     0.0  int32
SUM_STATUS_1          817395           0     0.0  int32
SUM_STATUS_2          817395           0     0.0  int32
SUM_STATUS_3          817395           0     0.0  int32
SUM_STATUS_4          817395           0     0.0  int32
SUM_STATUS_5          817395           0     0.0  int32
SUM_STATUS_C          817395           0     0.0  int32
SUM_STATUS_X          817395           0     0.0  int32


In [35]:
len(status_counts)

817395

### Bureau datasets join

In [36]:
print("rows before join:", len(bureau_df))

rows before join: 1716428


In [37]:
# Join final de tablas bureau y bureau_balance agregando los conteos de status
bureau_df_join = bureau_df.merge(status_counts, on="SK_ID_BUREAU", how="left")

# Rellenar NaN con 0 y casteo a int32
status_cols = [col for col in bureau_df_join.columns if col.startswith("SUM_STATUS_")]
bureau_df_join[status_cols] = bureau_df_join[status_cols].fillna(0).astype("int32")

In [38]:
print("rows after join:", len(bureau_df_join))

rows after join: 1716428


In [39]:
bureau_df_join.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY,SUM_STATUS_0,SUM_STATUS_1,SUM_STATUS_2,SUM_STATUS_3,SUM_STATUS_4,SUM_STATUS_5,SUM_STATUS_C,SUM_STATUS_X
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,0.0,0,91323.0,0.0,0.0,0.0,Consumer credit,-131,0.0,0,0,0,0,0,0,0,0
1,215354,5714463,Active,currency 1,-208,0,1075.0,0.0,0.0,0,225000.0,171342.0,0.0,0.0,Credit card,-20,0.0,0,0,0,0,0,0,0,0
2,215354,5714464,Active,currency 1,-203,0,528.0,0.0,0.0,0,464323.5,0.0,0.0,0.0,Consumer credit,-16,0.0,0,0,0,0,0,0,0,0
3,215354,5714465,Active,currency 1,-203,0,0.0,0.0,0.0,0,90000.0,0.0,0.0,0.0,Credit card,-16,0.0,0,0,0,0,0,0,0,0
4,215354,5714466,Active,currency 1,-629,0,1197.0,0.0,77674.5,0,2700000.0,0.0,0.0,0.0,Consumer credit,-21,0.0,0,0,0,0,0,0,0,0


In [40]:
df_info_summary(bureau_df_join)

                        Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                     1716428           0     0.0    int64
SK_ID_BUREAU                   1716428           0     0.0    int64
CREDIT_ACTIVE                  1716428           0     0.0   object
CREDIT_CURRENCY                1716428           0     0.0   object
DAYS_CREDIT                    1716428           0     0.0    int64
CREDIT_DAY_OVERDUE             1716428           0     0.0    int64
DAYS_CREDIT_ENDDATE            1716428           0     0.0  float64
DAYS_ENDDATE_FACT              1716428           0     0.0  float64
AMT_CREDIT_MAX_OVERDUE         1716428           0     0.0  float64
CNT_CREDIT_PROLONG             1716428           0     0.0    int64
AMT_CREDIT_SUM                 1716428           0     0.0  float64
AMT_CREDIT_SUM_DEBT            1716428           0     0.0  float64
AMT_CREDIT_SUM_LIMIT           1716428           0     0.0  float64
AMT_CREDIT_SUM_OVERDUE         1716428          

In [41]:
del bureau_df
del bureau_balance_df
del tmp
del status_dummies
del status_counts
gc.collect()

54856

### Joined data prep

In [42]:
# OHE con get_dummies
to_keep_numeric_and_boolean = bureau_df_join.select_dtypes(include=["number", "bool"]).columns.tolist()
to_do_ohe = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']
bureau_df_join = bureau_df_join[to_keep_numeric_and_boolean + to_do_ohe].copy()
gc.collect()
bureau_df_categorical = pd.get_dummies(bureau_df_join[to_do_ohe], dtype="int32")
bureau_df_join = pd.concat([bureau_df_join[to_keep_numeric_and_boolean], bureau_df_categorical], axis=1)

In [43]:
# bureau_df_join_ohe.columns
df_info_summary(bureau_df_join)

                                                    Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                                                 1716428           0     0.0    int64
SK_ID_BUREAU                                               1716428           0     0.0    int64
DAYS_CREDIT                                                1716428           0     0.0    int64
CREDIT_DAY_OVERDUE                                         1716428           0     0.0    int64
DAYS_CREDIT_ENDDATE                                        1716428           0     0.0  float64
DAYS_ENDDATE_FACT                                          1716428           0     0.0  float64
AMT_CREDIT_MAX_OVERDUE                                     1716428           0     0.0  float64
CNT_CREDIT_PROLONG                                         1716428           0     0.0    int64
AMT_CREDIT_SUM                                             1716428           0     0.0  float64
AMT_CREDIT_SUM_DEBT                     

In [44]:
len(bureau_df_join) 

1716428

In [45]:
bureau_df_join.columns
bureau_df_join.shape

(1716428, 45)

In [46]:
# Definir el diccionario de agregación con renombrado
agg_dict = {
    # Columnas con media y renombrado
    'DAYS_CREDIT': [('DAYS_CREDIT_MEAN', 'mean')],
    'DAYS_CREDIT_ENDDATE': [('DAYS_CREDIT_ENDDATE_MEAN', 'mean')],
    'DAYS_ENDDATE_FACT': [('DAYS_ENDDATE_FACT_MEAN', 'mean')],
    'AMT_CREDIT_MAX_OVERDUE': [('AMT_CREDIT_MAX_OVERDUE_MEAN', 'mean')],
    'AMT_CREDIT_SUM': [('AMT_CREDIT_SUM_MEAN', 'mean')],
    'AMT_CREDIT_SUM_DEBT': [('AMT_CREDIT_SUM_DEBT_MEAN', 'mean')],
    'AMT_CREDIT_SUM_LIMIT': [('AMT_CREDIT_SUM_LIMIT_MEAN', 'mean')],
    'AMT_CREDIT_SUM_OVERDUE': [('AMT_CREDIT_SUM_OVERDUE_MEAN', 'mean')],
    'DAYS_CREDIT_UPDATE': [('DAYS_CREDIT_UPDATE_MEAN', 'mean')],
    'AMT_ANNUITY': [('AMT_ANNUITY_MEAN', 'mean')],
    
    # Columnas con suma y renombrado
    'CREDIT_DAY_OVERDUE': [('CREDIT_DAY_OVERDUE_SUM', 'sum')],
    'CNT_CREDIT_PROLONG': [('CNT_CREDIT_PROLONG_SUM', 'sum')],
    'SUM_STATUS_0': [('MONTHS_WITH_STATUS_DPD_0_SUM', 'sum')],
    'SUM_STATUS_1': [('MONTHS_WITH_STATUS_DPD_1to30_SUM', 'sum')],
    'SUM_STATUS_2': [('MONTHS_WITH_STATUS_DPD_31to60_SUM', 'sum')],
    'SUM_STATUS_3': [('MONTHS_WITH_STATUS_DPD_61to90_SUM', 'sum')],
    'SUM_STATUS_4': [('MONTHS_WITH_STATUS_DPD_91to120_SUM', 'sum')],
    'SUM_STATUS_5': [('MONTHS_WITH_STATUS_DPD_over120_SUM', 'sum')],
    'SUM_STATUS_C': [('MONTHS_WITH_STATUS_CLOSED_SUM', 'sum')],
    'SUM_STATUS_X': [('MONTHS_WITH_STATUS_UNKNOWN_SUM', 'sum')],
    'CREDIT_ACTIVE_Active': [('CREDIT_ACTIVE_Active_SUM', 'sum')],
    'CREDIT_ACTIVE_Bad debt': [('CREDIT_ACTIVE_Bad debt_SUM', 'sum')],
    'CREDIT_ACTIVE_Closed': [('CREDIT_ACTIVE_Closed_SUM', 'sum')],
    'CREDIT_ACTIVE_Sold': [('CREDIT_ACTIVE_Sold_SUM', 'sum')],
    'CREDIT_CURRENCY_currency 1': [('CREDIT_CURRENCY_currency 1_SUM', 'sum')],
    'CREDIT_CURRENCY_currency 2': [('CREDIT_CURRENCY_currency 2_SUM', 'sum')],
    'CREDIT_CURRENCY_currency 3': [('CREDIT_CURRENCY_currency 3_SUM', 'sum')],
    'CREDIT_CURRENCY_currency 4': [('CREDIT_CURRENCY_currency 4_SUM', 'sum')],
    'CREDIT_TYPE_Another type of loan': [('CREDIT_TYPE_Another type of loan_SUM', 'sum')],
    'CREDIT_TYPE_Car loan': [('CREDIT_TYPE_Car loan_SUM', 'sum')],
    'CREDIT_TYPE_Cash loan (non-earmarked)': [('CREDIT_TYPE_Cash loan (non-earmarked)_SUM', 'sum')],
    'CREDIT_TYPE_Consumer credit': [('CREDIT_TYPE_Consumer credit_SUM', 'sum')],
    'CREDIT_TYPE_Credit card': [('CREDIT_TYPE_Credit card_SUM', 'sum')],
    'CREDIT_TYPE_Interbank credit': [('CREDIT_TYPE_Interbank credit_SUM', 'sum')],
    'CREDIT_TYPE_Loan for business development': [('CREDIT_TYPE_Loan for business development_SUM', 'sum')],
    'CREDIT_TYPE_Loan for purchase of shares (margin lending)': [('CREDIT_TYPE_Loan for purchase of shares (margin lending)_SUM', 'sum')],
    'CREDIT_TYPE_Loan for the purchase of equipment': [('CREDIT_TYPE_Loan for the purchase of equipment_SUM', 'sum')],
    'CREDIT_TYPE_Loan for working capital replenishment': [('CREDIT_TYPE_Loan for working capital replenishment_SUM', 'sum')],
    'CREDIT_TYPE_Microloan': [('CREDIT_TYPE_Microloan_SUM', 'sum')],
    'CREDIT_TYPE_Mobile operator loan': [('CREDIT_TYPE_Mobile operator loan_SUM', 'sum')],
    'CREDIT_TYPE_Mortgage': [('CREDIT_TYPE_Mortgage_SUM', 'sum')],
    'CREDIT_TYPE_Real estate loan': [('CREDIT_TYPE_Real estate loan_SUM', 'sum')],
    'CREDIT_TYPE_Unknown type of loan': [('CREDIT_TYPE_Unknown type of loan_SUM', 'sum')]
}

# Aplanar el diccionario para la agregación
flat_agg_dict = {}
for col, operations in agg_dict.items():
    for new_name, func in operations:
        flat_agg_dict[new_name] = (col, func)

# Aplicar la agregación con renombrado
bureau_df_agg = bureau_df_join.groupby(['SK_ID_CURR']).agg(**flat_agg_dict).reset_index()

# Mostrar las primeras filas
bureau_df_agg.head()

Unnamed: 0,SK_ID_CURR,DAYS_CREDIT_MEAN,DAYS_CREDIT_ENDDATE_MEAN,DAYS_ENDDATE_FACT_MEAN,AMT_CREDIT_MAX_OVERDUE_MEAN,AMT_CREDIT_SUM_MEAN,AMT_CREDIT_SUM_DEBT_MEAN,AMT_CREDIT_SUM_LIMIT_MEAN,AMT_CREDIT_SUM_OVERDUE_MEAN,DAYS_CREDIT_UPDATE_MEAN,AMT_ANNUITY_MEAN,CREDIT_DAY_OVERDUE_SUM,CNT_CREDIT_PROLONG_SUM,MONTHS_WITH_STATUS_DPD_0_SUM,MONTHS_WITH_STATUS_DPD_1to30_SUM,MONTHS_WITH_STATUS_DPD_31to60_SUM,MONTHS_WITH_STATUS_DPD_61to90_SUM,MONTHS_WITH_STATUS_DPD_91to120_SUM,MONTHS_WITH_STATUS_DPD_over120_SUM,MONTHS_WITH_STATUS_CLOSED_SUM,MONTHS_WITH_STATUS_UNKNOWN_SUM,CREDIT_ACTIVE_Active_SUM,CREDIT_ACTIVE_Bad debt_SUM,CREDIT_ACTIVE_Closed_SUM,CREDIT_ACTIVE_Sold_SUM,CREDIT_CURRENCY_currency 1_SUM,CREDIT_CURRENCY_currency 2_SUM,CREDIT_CURRENCY_currency 3_SUM,CREDIT_CURRENCY_currency 4_SUM,CREDIT_TYPE_Another type of loan_SUM,CREDIT_TYPE_Car loan_SUM,CREDIT_TYPE_Cash loan (non-earmarked)_SUM,CREDIT_TYPE_Consumer credit_SUM,CREDIT_TYPE_Credit card_SUM,CREDIT_TYPE_Interbank credit_SUM,CREDIT_TYPE_Loan for business development_SUM,CREDIT_TYPE_Loan for purchase of shares (margin lending)_SUM,CREDIT_TYPE_Loan for the purchase of equipment_SUM,CREDIT_TYPE_Loan for working capital replenishment_SUM,CREDIT_TYPE_Microloan_SUM,CREDIT_TYPE_Mobile operator loan_SUM,CREDIT_TYPE_Mortgage_SUM,CREDIT_TYPE_Real estate loan_SUM,CREDIT_TYPE_Unknown type of loan_SUM
0,100001,-735.0,82.428571,-471.714286,0.0,207623.571429,85240.928571,0.0,0.0,-93.142857,3545.357143,0,0,31,1,0,0,0,0,110,30,3,0,4,0,7,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0
1,100002,-874.0,-261.75,-523.125,1050.643125,108131.945625,30722.625,3998.570625,0.0,-499.875,0.0,0,0,45,27,0,0,0,0,23,15,2,0,6,0,8,0,0,0,0,0,0,4,4,0,0,0,0,0,0,0,0,0,0
2,100003,-1400.75,-544.5,-823.0,0.0,254350.125,0.0,202500.0,0.0,-816.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,3,0,4,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0
3,100004,-867.0,-488.5,-532.5,0.0,94518.9,0.0,0.0,0.0,-532.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,2,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0
4,100005,-190.666667,439.333333,-41.0,0.0,219042.0,189469.5,0.0,0.0,-54.333333,1420.5,0,0,14,0,0,0,0,0,5,2,2,0,1,0,3,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0


In [47]:
bureau_df_agg.shape

(305811, 44)

In [48]:
df_info_summary(bureau_df_agg)

                                                    Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                                                  305811           0     0.0    int64
DAYS_CREDIT_MEAN                                            305811           0     0.0  float64
DAYS_CREDIT_ENDDATE_MEAN                                    305811           0     0.0  float64
DAYS_ENDDATE_FACT_MEAN                                      305811           0     0.0  float64
AMT_CREDIT_MAX_OVERDUE_MEAN                                 305811           0     0.0  float64
AMT_CREDIT_SUM_MEAN                                         305811           0     0.0  float64
AMT_CREDIT_SUM_DEBT_MEAN                                    305811           0     0.0  float64
AMT_CREDIT_SUM_LIMIT_MEAN                                   305811           0     0.0  float64
AMT_CREDIT_SUM_OVERDUE_MEAN                                 305811           0     0.0  float64
DAYS_CREDIT_UPDATE_MEAN                 

In [49]:
bureau_df_agg['SK_ID_CURR'].nunique()

305811

## Previous application data

### Data import and overview

In [50]:
# Create data profiles for bureau and bureau_balance datasets

previous_application_df = pd.read_csv("previous_application.csv")
pos_cash_balance_df = pd.read_csv("POS_CASH_balance.csv")
installments_payments_df = pd.read_csv("installments_payments.csv")
credit_card_balance_df = pd.read_csv("credit_card_balance.csv")

# data_profiling(previous_application_df, "previous_application_df.html")
# data_profiling(pos_cash_balance_df, "pos_cash_balance_df.html")
# data_profiling(installments_payments_df, "installments_payments_df.html")
# data_profiling(credit_card_balance_df, "credit_card_balance_df.html")

In [51]:
# Mostrar las primeras filas de previous_application_df
previous_application_df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,Y,1,0.0,0.182832,0.867336,XAP,Approved,-73,Cash through the bank,XAP,,Repeater,Mobile,POS,XNA,Country-wide,35,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,Y,1,,,,XNA,Approved,-164,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Contact center,-1,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,Y,1,,,,XNA,Approved,-301,Cash through the bank,XAP,"Spouse, partner",Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,Y,1,,,,XNA,Approved,-512,Cash through the bank,XAP,,Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,Y,1,,,,Repairs,Refused,-781,Cash through the bank,HC,,Repeater,XNA,Cash,walk-in,Credit and cash offices,-1,XNA,24.0,high,Cash Street: high,,,,,,


In [52]:
len(previous_application_df)

1670214

In [53]:
# Mostrar las primeras filas de pos_cash_balance_df
pos_cash_balance_df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [54]:
len(pos_cash_balance_df)

10001358

In [55]:
# Mostrar las primeras filas de installments_payments_df
installments_payments_df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


In [56]:
len(installments_payments_df)

13605401

In [57]:
# Mostrar las primeras filas de credit_card_balance_df
credit_card_balance_df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,1800.0,1800.0,0.0,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,2250.0,2250.0,60175.08,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,2250.0,2250.0,26926.425,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,11925.0,11925.0,224949.285,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,27000.0,27000.0,443044.395,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0


In [58]:
len(credit_card_balance_df)

3840312

### Data prep - previous_application_df 

In [59]:
# Info sobre las columnas del previous_application_df

"""
•	SK_ID_PREV → ID de la aplicación previa.
•	SK_ID_CURR → ID del préstamo actual en nuestro dataset.
•	NAME_CONTRACT_TYPE → Tipo de producto solicitado (Cash loan, POS loan, etc.).
•	AMT_ANNUITY → Cuota periódica (anualidad) de esa solicitud.
•	AMT_APPLICATION → Monto que el cliente pidió originalmente.
•	AMT_CREDIT → Monto finalmente aprobado (puede diferir de lo solicitado).
•	AMT_DOWN_PAYMENT → Pago inicial hecho por el cliente.
•	AMT_GOODS_PRICE → Valor de los bienes financiados (si aplica).
•	WEEKDAY_APPR_PROCESS_START → Día de la semana en que se inició la aplicación.
•	HOUR_APPR_PROCESS_START → Hora del día de inicio (aprox., redondeada).
•	FLAG_LAST_APPL_PER_CONTRACT → Marca si fue la última solicitud para ese contrato.
•	NFLAG_LAST_APPL_IN_DAY → Marca si fue la última aplicación del cliente en ese día.
•	NFLAG_MICRO_CASH → Flag si era un microcrédito.
•	RATE_DOWN_PAYMENT → Porcentaje de pago inicial (normalizado).
•	RATE_INTEREST_PRIMARY / PRIVILEGED → Tasas de interés aplicables (normalizadas).
•	NAME_CASH_LOAN_PURPOSE → Propósito del préstamo en efectivo (educación, auto, etc.).
•	NAME_CONTRACT_STATUS → Estado de la aplicación (Approved, Refused, Canceled, etc.).
•	DAYS_DECISION → Días relativos a la aplicación actual en que se tomó la decisión.
•	NAME_PAYMENT_TYPE → Método de pago (Cash, Bank transfer, etc.).
•	CODE_REJECT_REASON → Razón de rechazo (CLIENT, HC, SCO, etc.).
•	NAME_TYPE_SUITE → Con quién estaba el cliente (Family, Alone, etc.).
•	NAME_CLIENT_TYPE → Si era cliente nuevo o recurrente.
•	NAME_GOODS_CATEGORY → Categoría del bien solicitado (Electronics, Furniture, etc.).
•	NAME_PORTFOLIO → Cartera (POS, Cash, Car, etc.).
•	NAME_PRODUCT_TYPE → Tipo de producto (X-Sell, Walk-in, etc.).
•	CHANNEL_TYPE → Canal de aplicación (Credit agent, Online, etc.).
•	SELLERPLACE_AREA → Tamaño del área de ventas del vendedor.
•	NAME_SELLER_INDUSTRY → Industria del vendedor.
•	CNT_PAYMENT → Número de pagos previstos (plazo).
•	NAME_YIELD_GROUP → Clasificación de la tasa de interés (baja, media, alta).
•	PRODUCT_COMBINATION → Detalle de la combinación de productos.
•	DAYS_FIRST_DRAWING → Días hasta la primera disposición de fondos.
•	DAYS_FIRST_DUE → Días hasta el primer pago esperado.
•	DAYS_LAST_DUE_1ST_VERSION → Último vencimiento esperado (versión inicial).
•	DAYS_LAST_DUE → Último vencimiento esperado (versión final).
•	DAYS_TERMINATION → Días hasta la finalización esperada del contrato.
•	NFLAG_INSURED_ON_APPROVAL → Si el cliente solicitó seguro.

"""
df_info_summary(previous_application_df)

                             Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                          1670214           0    0.00    int64
SK_ID_CURR                          1670214           0    0.00    int64
NAME_CONTRACT_TYPE                  1670214           0    0.00   object
AMT_ANNUITY                         1297979      372235   22.29  float64
AMT_APPLICATION                     1670214           0    0.00  float64
AMT_CREDIT                          1670213           1    0.00  float64
AMT_DOWN_PAYMENT                     774370      895844   53.64  float64
AMT_GOODS_PRICE                     1284699      385515   23.08  float64
WEEKDAY_APPR_PROCESS_START          1670214           0    0.00   object
HOUR_APPR_PROCESS_START             1670214           0    0.00    int64
FLAG_LAST_APPL_PER_CONTRACT         1670214           0    0.00   object
NFLAG_LAST_APPL_IN_DAY              1670214           0    0.00    int64
RATE_DOWN_PAYMENT                    774370      89

In [60]:
# previous_application_df.loc[previous_application_df["AMT_CREDIT"].isna(), ["NAME_CONTRACT_STATUS"]].value_counts()
previous_application_df.loc[previous_application_df["AMT_CREDIT"].isna()]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
1127152,2204450,438387,Revolving loans,0.0,0.0,,,,FRIDAY,10,Y,1,,,,XAP,Approved,-608,XNA,XAP,,Repeater,XNA,Cards,walk-in,Country-wide,20,Connectivity,0.0,XNA,Card Street,,,,,,


In [61]:
# Por ahora reemplazo nan con ceros, pero habría que ver si se puede mejorar
previous_application_df = previous_application_df.fillna(0)
print("Columnas con valores NaN despues de rellenar:")
print(previous_application_df.columns[previous_application_df.isna().any()].tolist())

Columnas con valores NaN despues de rellenar:
[]


In [62]:
df_info_summary(previous_application_df)

                             Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                          1670214           0     0.0    int64
SK_ID_CURR                          1670214           0     0.0    int64
NAME_CONTRACT_TYPE                  1670214           0     0.0   object
AMT_ANNUITY                         1670214           0     0.0  float64
AMT_APPLICATION                     1670214           0     0.0  float64
AMT_CREDIT                          1670214           0     0.0  float64
AMT_DOWN_PAYMENT                    1670214           0     0.0  float64
AMT_GOODS_PRICE                     1670214           0     0.0  float64
WEEKDAY_APPR_PROCESS_START          1670214           0     0.0   object
HOUR_APPR_PROCESS_START             1670214           0     0.0    int64
FLAG_LAST_APPL_PER_CONTRACT         1670214           0     0.0   object
NFLAG_LAST_APPL_IN_DAY              1670214           0     0.0    int64
RATE_DOWN_PAYMENT                   1670214        

In [63]:
print("Count distinct of SK_ID_PREV: ", previous_application_df["SK_ID_PREV"].nunique())
print("Count distinct of SK_ID_CURR: ", previous_application_df["SK_ID_CURR"].nunique())

Count distinct of SK_ID_PREV:  1670214
Count distinct of SK_ID_CURR:  338857


In [64]:
# OHE de columnas categóricas
# Identificar columnas categóricas
cat_cols = previous_application_df.select_dtypes(include=["object"]).columns.tolist()
print("Columnas categóricas:", cat_cols)

# Aplicar One Hot Encoding
previous_application_ohe = pd.get_dummies(previous_application_df, columns=cat_cols, dummy_na=True)

previous_application_ohe.shape

Columnas categóricas: ['NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'FLAG_LAST_APPL_PER_CONTRACT', 'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE', 'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE', 'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION']


(1670214, 182)

In [65]:
df_info_summary(previous_application_ohe)

                                                    Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                                                 1670214           0     0.0    int64
SK_ID_CURR                                                 1670214           0     0.0    int64
AMT_ANNUITY                                                1670214           0     0.0  float64
AMT_APPLICATION                                            1670214           0     0.0  float64
AMT_CREDIT                                                 1670214           0     0.0  float64
AMT_DOWN_PAYMENT                                           1670214           0     0.0  float64
AMT_GOODS_PRICE                                            1670214           0     0.0  float64
HOUR_APPR_PROCESS_START                                    1670214           0     0.0    int64
NFLAG_LAST_APPL_IN_DAY                                     1670214           0     0.0    int64
RATE_DOWN_PAYMENT                       

In [66]:
# Sumarizar previous_application_ohe por SK_ID_CURR para tener una fila por cada préstamo corriente
# Separar columnas según tipo
bool_cols = previous_application_ohe.select_dtypes(include=["bool"]).columns.tolist()
num_cols  = previous_application_ohe.select_dtypes(exclude=["bool"]).columns.tolist()

# Definir reglas de agregación
agg_dict = {col: "sum" for col in bool_cols}
agg_dict.update({col: "mean" for col in num_cols if col not in ["SK_ID_PREV", "SK_ID_CURR"]})

# Aplicar groupby
previous_application_summary = (
    previous_application_ohe
    .groupby("SK_ID_CURR")
    .agg(agg_dict)
    .reset_index()
)

previous_application_summary.shape


(338857, 181)

In [67]:
previous_application_summary.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Consumer loans,NAME_CONTRACT_TYPE_Revolving loans,NAME_CONTRACT_TYPE_XNA,NAME_CONTRACT_TYPE_nan,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY,WEEKDAY_APPR_PROCESS_START_nan,FLAG_LAST_APPL_PER_CONTRACT_N,FLAG_LAST_APPL_PER_CONTRACT_Y,FLAG_LAST_APPL_PER_CONTRACT_nan,NAME_CASH_LOAN_PURPOSE_Building a house or an annex,NAME_CASH_LOAN_PURPOSE_Business development,NAME_CASH_LOAN_PURPOSE_Buying a garage,NAME_CASH_LOAN_PURPOSE_Buying a holiday home / land,NAME_CASH_LOAN_PURPOSE_Buying a home,NAME_CASH_LOAN_PURPOSE_Buying a new car,NAME_CASH_LOAN_PURPOSE_Buying a used car,NAME_CASH_LOAN_PURPOSE_Car repairs,NAME_CASH_LOAN_PURPOSE_Education,NAME_CASH_LOAN_PURPOSE_Everyday expenses,NAME_CASH_LOAN_PURPOSE_Furniture,NAME_CASH_LOAN_PURPOSE_Gasification / water supply,NAME_CASH_LOAN_PURPOSE_Hobby,NAME_CASH_LOAN_PURPOSE_Journey,NAME_CASH_LOAN_PURPOSE_Medicine,NAME_CASH_LOAN_PURPOSE_Money for a third person,NAME_CASH_LOAN_PURPOSE_Other,NAME_CASH_LOAN_PURPOSE_Payments on other loans,NAME_CASH_LOAN_PURPOSE_Purchase of electronic equipment,NAME_CASH_LOAN_PURPOSE_Refusal to name the goal,NAME_CASH_LOAN_PURPOSE_Repairs,NAME_CASH_LOAN_PURPOSE_Urgent needs,NAME_CASH_LOAN_PURPOSE_Wedding / gift / holiday,NAME_CASH_LOAN_PURPOSE_XAP,NAME_CASH_LOAN_PURPOSE_XNA,NAME_CASH_LOAN_PURPOSE_nan,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Canceled,NAME_CONTRACT_STATUS_Refused,NAME_CONTRACT_STATUS_Unused offer,NAME_CONTRACT_STATUS_nan,NAME_PAYMENT_TYPE_Cash through the bank,NAME_PAYMENT_TYPE_Cashless from the account of the employer,NAME_PAYMENT_TYPE_Non-cash from your account,NAME_PAYMENT_TYPE_XNA,NAME_PAYMENT_TYPE_nan,CODE_REJECT_REASON_CLIENT,CODE_REJECT_REASON_HC,CODE_REJECT_REASON_LIMIT,CODE_REJECT_REASON_SCO,CODE_REJECT_REASON_SCOFR,CODE_REJECT_REASON_SYSTEM,CODE_REJECT_REASON_VERIF,CODE_REJECT_REASON_XAP,CODE_REJECT_REASON_XNA,CODE_REJECT_REASON_nan,NAME_TYPE_SUITE_0,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,NAME_TYPE_SUITE_Group of people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,NAME_TYPE_SUITE_nan,NAME_CLIENT_TYPE_New,NAME_CLIENT_TYPE_Refreshed,NAME_CLIENT_TYPE_Repeater,NAME_CLIENT_TYPE_XNA,NAME_CLIENT_TYPE_nan,NAME_GOODS_CATEGORY_Additional Service,NAME_GOODS_CATEGORY_Animals,NAME_GOODS_CATEGORY_Audio/Video,NAME_GOODS_CATEGORY_Auto Accessories,NAME_GOODS_CATEGORY_Clothing and Accessories,NAME_GOODS_CATEGORY_Computers,NAME_GOODS_CATEGORY_Construction Materials,NAME_GOODS_CATEGORY_Consumer Electronics,NAME_GOODS_CATEGORY_Direct Sales,NAME_GOODS_CATEGORY_Education,NAME_GOODS_CATEGORY_Fitness,NAME_GOODS_CATEGORY_Furniture,NAME_GOODS_CATEGORY_Gardening,NAME_GOODS_CATEGORY_Homewares,NAME_GOODS_CATEGORY_House Construction,NAME_GOODS_CATEGORY_Insurance,NAME_GOODS_CATEGORY_Jewelry,NAME_GOODS_CATEGORY_Medical Supplies,NAME_GOODS_CATEGORY_Medicine,NAME_GOODS_CATEGORY_Mobile,NAME_GOODS_CATEGORY_Office Appliances,NAME_GOODS_CATEGORY_Other,NAME_GOODS_CATEGORY_Photo / Cinema Equipment,NAME_GOODS_CATEGORY_Sport and Leisure,NAME_GOODS_CATEGORY_Tourism,NAME_GOODS_CATEGORY_Vehicles,NAME_GOODS_CATEGORY_Weapon,NAME_GOODS_CATEGORY_XNA,NAME_GOODS_CATEGORY_nan,NAME_PORTFOLIO_Cards,NAME_PORTFOLIO_Cars,NAME_PORTFOLIO_Cash,NAME_PORTFOLIO_POS,NAME_PORTFOLIO_XNA,NAME_PORTFOLIO_nan,NAME_PRODUCT_TYPE_XNA,NAME_PRODUCT_TYPE_walk-in,NAME_PRODUCT_TYPE_x-sell,NAME_PRODUCT_TYPE_nan,CHANNEL_TYPE_AP+ (Cash loan),CHANNEL_TYPE_Car dealer,CHANNEL_TYPE_Channel of corporate sales,CHANNEL_TYPE_Contact center,CHANNEL_TYPE_Country-wide,CHANNEL_TYPE_Credit and cash offices,CHANNEL_TYPE_Regional / Local,CHANNEL_TYPE_Stone,CHANNEL_TYPE_nan,NAME_SELLER_INDUSTRY_Auto technology,NAME_SELLER_INDUSTRY_Clothing,NAME_SELLER_INDUSTRY_Connectivity,NAME_SELLER_INDUSTRY_Construction,NAME_SELLER_INDUSTRY_Consumer electronics,NAME_SELLER_INDUSTRY_Furniture,NAME_SELLER_INDUSTRY_Industry,NAME_SELLER_INDUSTRY_Jewelry,NAME_SELLER_INDUSTRY_MLM partners,NAME_SELLER_INDUSTRY_Tourism,NAME_SELLER_INDUSTRY_XNA,NAME_SELLER_INDUSTRY_nan,NAME_YIELD_GROUP_XNA,NAME_YIELD_GROUP_high,NAME_YIELD_GROUP_low_action,NAME_YIELD_GROUP_low_normal,NAME_YIELD_GROUP_middle,NAME_YIELD_GROUP_nan,PRODUCT_COMBINATION_0,PRODUCT_COMBINATION_Card Street,PRODUCT_COMBINATION_Card X-Sell,PRODUCT_COMBINATION_Cash,PRODUCT_COMBINATION_Cash Street: high,PRODUCT_COMBINATION_Cash Street: low,PRODUCT_COMBINATION_Cash Street: middle,PRODUCT_COMBINATION_Cash X-Sell: high,PRODUCT_COMBINATION_Cash X-Sell: low,PRODUCT_COMBINATION_Cash X-Sell: middle,PRODUCT_COMBINATION_POS household with interest,PRODUCT_COMBINATION_POS household without interest,PRODUCT_COMBINATION_POS industry with interest,PRODUCT_COMBINATION_POS industry without interest,PRODUCT_COMBINATION_POS mobile with interest,PRODUCT_COMBINATION_POS mobile without interest,PRODUCT_COMBINATION_POS other with interest,PRODUCT_COMBINATION_POS others without interest,PRODUCT_COMBINATION_nan,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,HOUR_APPR_PROCESS_START,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,DAYS_DECISION,SELLERPLACE_AREA,CNT_PAYMENT,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,100001,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3951.0,24835.5,23787.0,2520.0,24835.5,13.0,1.0,0.104326,0.0,0.0,-1740.0,23.0,8.0,365243.0,-1709.0,-1499.0,-1619.0,-1612.0,0.0
1,100002,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,9251.775,179055.0,179055.0,0.0,179055.0,9.0,1.0,0.0,0.0,0.0,-606.0,500.0,24.0,365243.0,-565.0,125.0,-25.0,-17.0,0.0
2,100003,1,2,0,0,0,1,0,1,1,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,3,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,3,0,0,0,0,2,0,0,0,0,1,0,0,2,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,2,0,0,2,0,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,56553.99,435436.5,484191.0,2295.0,435436.5,14.666667,1.0,0.033354,0.0,0.0,-1305.0,533.0,10.0,365243.0,-1274.333333,-1004.333333,-1054.333333,-1047.333333,0.666667
3,100004,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,5357.25,24282.0,20106.0,4860.0,24282.0,5.0,1.0,0.212008,0.0,0.0,-815.0,30.0,4.0,365243.0,-784.0,-694.0,-724.0,-714.0,0.0
4,100005,1,1,0,0,0,1,0,0,0,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,2,0,0,2,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,2,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2406.6,22308.75,20076.75,2232.0,22308.75,10.5,1.0,0.054482,0.0,0.0,-536.0,18.0,6.0,182621.5,-353.0,-188.0,-233.0,-230.0,0.0


In [68]:
del previous_application_ohe
del previous_application_df
gc.collect()

0

### Data prep - pos_cash_balance

In [69]:
len(pos_cash_balance_df)

10001358

In [70]:
# Info sobre las columnas del pos_cash_balance_df
"""
•	SK_ID_PREV → ID del crédito previo en Home Credit (un préstamo en application puede tener 0, 1 o varios).
•	SK_ID_CURR → ID del préstamo en nuestro dataset principal (application).
•	MONTHS_BALANCE → Mes del reporte relativo a la fecha de aplicación del préstamo actual:
    0 = mes de la aplicación
    -1 = un mes antes
•	CNT_INSTALMENT → Número total de cuotas originalmente previstas (puede variar si se reestructura).
•	CNT_INSTALMENT_FUTURE → Número de cuotas pendientes en ese momento.
•	NAME_CONTRACT_STATUS → Estado del contrato ese mes (ej. Active, Completed, Signed, etc.).
•	SK_DPD → Days Past Due = cantidad de días de atraso en ese mes.
•	SK_DPD_DEF → Days Past Due con tolerancia (se ignoran deudas pequeñas).
"""
df_info_summary(pos_cash_balance_df)

                       Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                   10001358           0    0.00    int64
SK_ID_CURR                   10001358           0    0.00    int64
MONTHS_BALANCE               10001358           0    0.00    int64
CNT_INSTALMENT                9975287       26071    0.26  float64
CNT_INSTALMENT_FUTURE         9975271       26087    0.26  float64
NAME_CONTRACT_STATUS         10001358           0    0.00   object
SK_DPD                       10001358           0    0.00    int64
SK_DPD_DEF                   10001358           0    0.00    int64


In [71]:
print("Count distinct of SK_ID_PREV: ", pos_cash_balance_df["SK_ID_PREV"].nunique())
print("Count distinct of SK_ID_CURR: ", pos_cash_balance_df["SK_ID_CURR"].nunique())

Count distinct of SK_ID_PREV:  936325
Count distinct of SK_ID_CURR:  337252


In [72]:
# Por ahora reemplazo nan con ceros, pero habría que ver si se puede mejorar
pos_cash_balance_df = pos_cash_balance_df.fillna(0)
print("Columnas con valores NaN despues de rellenar:")
print(pos_cash_balance_df.columns[pos_cash_balance_df.isna().any()].tolist())

Columnas con valores NaN despues de rellenar:
[]


In [73]:
df_info_summary(pos_cash_balance_df)

                       Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                   10001358           0     0.0    int64
SK_ID_CURR                   10001358           0     0.0    int64
MONTHS_BALANCE               10001358           0     0.0    int64
CNT_INSTALMENT               10001358           0     0.0  float64
CNT_INSTALMENT_FUTURE        10001358           0     0.0  float64
NAME_CONTRACT_STATUS         10001358           0     0.0   object
SK_DPD                       10001358           0     0.0    int64
SK_DPD_DEF                   10001358           0     0.0    int64


In [74]:
# Crear tabla de conteo de status por SK_ID_PREV
# OHE con get_dummies
status_dummies = pd.get_dummies(pos_cash_balance_df["NAME_CONTRACT_STATUS"], prefix="STATUS_", dtype="int32")

# Concatenar pos_cash_balance_df
pos_cash_balance_df = pd.concat([pos_cash_balance_df, status_dummies], axis=1)

print("Filas de pos_cash_balance_df despues de agregar dummies:", len(pos_cash_balance_df))
pos_cash_balance_df.head()  

Filas de pos_cash_balance_df despues de agregar dummies: 10001358


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF,STATUS__Active,STATUS__Amortized debt,STATUS__Approved,STATUS__Canceled,STATUS__Completed,STATUS__Demand,STATUS__Returned to the store,STATUS__Signed,STATUS__XNA
0,1803195,182943,-31,48.0,45.0,Active,0,0,1,0,0,0,0,0,0,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0,1,0,0,0,0,0,0,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0,1,0,0,0,0,0,0,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0,1,0,0,0,0,0,0,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0,1,0,0,0,0,0,0,0,0


In [75]:
df_info_summary(pos_cash_balance_df)

                               Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                           10001358           0     0.0    int64
SK_ID_CURR                           10001358           0     0.0    int64
MONTHS_BALANCE                       10001358           0     0.0    int64
CNT_INSTALMENT                       10001358           0     0.0  float64
CNT_INSTALMENT_FUTURE                10001358           0     0.0  float64
NAME_CONTRACT_STATUS                 10001358           0     0.0   object
SK_DPD                               10001358           0     0.0    int64
SK_DPD_DEF                           10001358           0     0.0    int64
STATUS__Active                       10001358           0     0.0    int32
STATUS__Amortized debt               10001358           0     0.0    int32
STATUS__Approved                     10001358           0     0.0    int32
STATUS__Canceled                     10001358           0     0.0    int32
STATUS__Completed        

In [76]:
del status_dummies
gc.collect()

0

In [77]:
# Sumarizar pos_cash_balance_df por SK_ID_CURR para tener una fila por cada préstamo corriente
# Definir el diccionario de agregación según la columna
agg_dict = {
    # Columnas con media y renombrado
    'CNT_INSTALMENT': [('PCB_CNT_INSTALMENT_MEAN', 'mean')], # PCB_ para identificar que es de pos cash balance
    'CNT_INSTALMENT_FUTURE': [('PCB_CNT_INSTALMENT_FUTURE_MEAN', 'mean')],
    'SK_DPD': [('PCB_SK_DPD_MEAN', 'mean')],
    'SK_DPD_DEF': [('PCB_SK_DPD_DEF_MEAN', 'mean')],

    # Columnas con suma y renombrado
    'STATUS__Active': [('PCB_MONTHS_WITH_STATUS_Active_SUM', 'sum')],
    'STATUS__Amortized debt': [('PCB_MONTHS_WITH_STATUS_Amortized_debt_SUM', 'sum')],
    'STATUS__Approved': [('PCB_MONTHS_WITH_STATUS_Approved_SUM', 'sum')],
    'STATUS__Canceled': [('PCB_MONTHS_WITH_STATUS_Canceled_SUM', 'sum')],
    'STATUS__Completed': [('PCB_MONTHS_WITH_STATUS_Completed_SUM', 'sum')],
    'STATUS__Demand': [('PCB_MONTHS_WITH_STATUS_Demand_SUM', 'sum')],
    'STATUS__Returned to the store': [('PCB_MONTHS_WITH_STATUS_Returned_to_the_store_SUM', 'sum')],
    'STATUS__Signed': [('PCB_MONTHS_WITH_STATUS_Signed_SUM', 'sum')],
    'STATUS__XNA': [('PCB_MONTHS_WITH_STATUS_XNA_SUM', 'sum')]
}

# Aplanar el diccionario para la agregación
flat_agg_dict = {}
for col, operations in agg_dict.items():
    for new_name, func in operations:
        flat_agg_dict[new_name] = (col, func)

# Aplicar la agregación con renombrado
pos_cash_balance_df_agg = pos_cash_balance_df.groupby(['SK_ID_CURR']).agg(**flat_agg_dict).reset_index()

# Mostrar las primeras filas
pos_cash_balance_df_agg.head()

Unnamed: 0,SK_ID_CURR,PCB_CNT_INSTALMENT_MEAN,PCB_CNT_INSTALMENT_FUTURE_MEAN,PCB_SK_DPD_MEAN,PCB_SK_DPD_DEF_MEAN,PCB_MONTHS_WITH_STATUS_Active_SUM,PCB_MONTHS_WITH_STATUS_Amortized_debt_SUM,PCB_MONTHS_WITH_STATUS_Approved_SUM,PCB_MONTHS_WITH_STATUS_Canceled_SUM,PCB_MONTHS_WITH_STATUS_Completed_SUM,PCB_MONTHS_WITH_STATUS_Demand_SUM,PCB_MONTHS_WITH_STATUS_Returned_to_the_store_SUM,PCB_MONTHS_WITH_STATUS_Signed_SUM,PCB_MONTHS_WITH_STATUS_XNA_SUM
0,100001,4.0,1.444444,0.777778,0.777778,7,0,0,0,2,0,0,0,0
1,100002,24.0,15.0,0.0,0.0,19,0,0,0,0,0,0,0,0
2,100003,10.107143,5.785714,0.0,0.0,26,0,0,0,2,0,0,0,0
3,100004,3.75,2.25,0.0,0.0,3,0,0,0,1,0,0,0,0
4,100005,10.636364,6.545455,0.0,0.0,9,0,0,0,1,0,0,1,0


In [78]:
len(pos_cash_balance_df_agg)

337252

In [79]:
df_info_summary(pos_cash_balance_df_agg)

                                                  Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                                                337252           0     0.0    int64
PCB_CNT_INSTALMENT_MEAN                                   337252           0     0.0  float64
PCB_CNT_INSTALMENT_FUTURE_MEAN                            337252           0     0.0  float64
PCB_SK_DPD_MEAN                                           337252           0     0.0  float64
PCB_SK_DPD_DEF_MEAN                                       337252           0     0.0  float64
PCB_MONTHS_WITH_STATUS_Active_SUM                         337252           0     0.0    int32
PCB_MONTHS_WITH_STATUS_Amortized_debt_SUM                 337252           0     0.0    int32
PCB_MONTHS_WITH_STATUS_Approved_SUM                       337252           0     0.0    int32
PCB_MONTHS_WITH_STATUS_Canceled_SUM                       337252           0     0.0    int32
PCB_MONTHS_WITH_STATUS_Completed_SUM                      33

In [80]:
print("Count distinct of SK_ID_PREV: ", pos_cash_balance_df_agg["SK_ID_CURR"].nunique())

Count distinct of SK_ID_PREV:  337252


In [81]:
del pos_cash_balance_df
gc.collect()

0

### Data prep - installments_payments

In [82]:
len(installments_payments_df)

13605401

In [83]:
# Info sobre las columnas del installments_payments_df
"""
Histórico de pagos de cuotas de créditos anteriores.
•	SK_ID_PREV → ID del crédito previo.
•	SK_ID_CURR → ID del préstamo actual.
•	NUM_INSTALMENT_VERSION → Versión del calendario de pagos (0 si es tarjeta de crédito). Un cambio significa renegociación.
•	NUM_INSTALMENT_NUMBER → Número de cuota (1, 2, 3, …).
•	DAYS_INSTALMENT → Día en que debía pagarse la cuota (relativo a aplicación actual).
•	DAYS_ENTRY_PAYMENT → Día en que efectivamente se pagó (NaN si no se pagó).
•	AMT_INSTALMENT → Monto esperado de la cuota.
•	AMT_PAYMENT → Monto realmente pagado.
"""
df_info_summary(installments_payments_df)

                        Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                    13605401           0    0.00    int64
SK_ID_CURR                    13605401           0    0.00    int64
NUM_INSTALMENT_VERSION        13605401           0    0.00  float64
NUM_INSTALMENT_NUMBER         13605401           0    0.00    int64
DAYS_INSTALMENT               13605401           0    0.00  float64
DAYS_ENTRY_PAYMENT            13602496        2905    0.02  float64
AMT_INSTALMENT                13605401           0    0.00  float64
AMT_PAYMENT                   13602496        2905    0.02  float64


In [84]:
# Verificamos un ejemplo de filas con DAYS_ENTRY_PAYMENT NaN. Aparenteemente no se abonaron esas cuotas
installments_payments_df.loc[installments_payments_df["DAYS_ENTRY_PAYMENT"].isna()].head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
3764207,1531600,103793,1.0,7,-668.0,,49741.02,
3764208,1947105,159974,1.0,24,-36.0,,22849.515,
3764209,1843773,167270,1.0,22,-20.0,,48092.355,
3764210,1691592,192536,1.0,5,-2561.0,,7675.425,
3764211,1531299,157088,0.0,11,-1847.0,,67.5,


In [85]:
installments_payments_df.loc[installments_payments_df["SK_ID_PREV"] == 1531600].sort_values(by=["DAYS_INSTALMENT"], ascending=False) 

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
3794812,1531600,103793,1.0,27,-68.0,,49741.02,
3764294,1531600,103793,1.0,26,-98.0,,49741.02,
3764214,1531600,103793,1.0,25,-128.0,,49741.02,
3799630,1531600,103793,1.0,24,-158.0,,49741.02,
3774325,1531600,103793,1.0,23,-188.0,,49741.02,
3784883,1531600,103793,1.0,22,-218.0,,49741.02,
3764227,1531600,103793,1.0,21,-248.0,,49741.02,
3764224,1531600,103793,1.0,20,-278.0,,49741.02,
3779550,1531600,103793,1.0,19,-308.0,,49741.02,
3779605,1531600,103793,1.0,18,-338.0,,49741.02,


In [86]:
"""
Por lo visto anteriormente, los NaN en DAYS_ENTRY_PAYMENT y AMT_PAYMENT indican cuotas no pagadas. 
Vamos a crear las siguientes columnas:
- instalments_unpaid: cantidad de cuotas no abonadas (DAYS_ENTRY_PAYMENT no NaN)
- instalments_partially_paid: cantidad de cuotas abonadas parcialmente (DAYS_ENTRY_PAYMENT no NaN y AMT_PAYMENT < AMT_INSTALMENT)
- instalments_overdue: cantidad de cuotas vencidas y no pagadas (DAYS_ENTRY_PAYMENT NaN y DAYS_INSTALMENT < 0)
- amount debt: monto adeudado (AMT_INSTALMENT - AMT_PAYMENT) si no se pagó y si se pagó menos de lo debido
- dpd: días de atraso en el pago:
    - Si se pagó (DAYS_ENTRY_PAYMENT no es NaN): max(0, DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT)
    - Si no se pagó y está vencido: abs(DAYS_INSTALMENT) si DAYS_INSTALMENT < 0
"""

installments_payments_df["instalments_unpaid"] = installments_payments_df["DAYS_ENTRY_PAYMENT"].isna().astype(int)

installments_payments_df["instalments_partially_paid"] = (
    (~installments_payments_df["DAYS_ENTRY_PAYMENT"].isna()) & 
    (installments_payments_df["AMT_PAYMENT"] < installments_payments_df["AMT_INSTALMENT"]) &
    (installments_payments_df["AMT_PAYMENT"] > 0)
).astype(int)

installments_payments_df["instalments_overdue"] = ((installments_payments_df["DAYS_ENTRY_PAYMENT"].isna()) & (installments_payments_df["DAYS_INSTALMENT"] < 0)).astype(int)

# Calcular monto adeudado
installments_payments_df["amount_debt"] = installments_payments_df.apply(
    # lambda row: (row["AMT_INSTALMENT"] - row["AMT_PAYMENT"]) if (pd.notna(row["AMT_PAYMENT"]) or row["AMT_PAYMENT"] < row["AMT_INSTALMENT"]) else 0,
    # axis=1
    lambda row: max(0, row["AMT_INSTALMENT"] - row["AMT_PAYMENT"]) if pd.notna(row["AMT_PAYMENT"]) 
    else abs(row["AMT_INSTALMENT"]), 
    axis=1
).round(2)

# Calcular dpd considerando ambos casos
installments_payments_df["dpd"] = installments_payments_df.apply(
    lambda row: max(0, row["DAYS_ENTRY_PAYMENT"] - row["DAYS_INSTALMENT"]) if pd.notna(row["DAYS_ENTRY_PAYMENT"]) 
    else abs(row["DAYS_INSTALMENT"]) if row["DAYS_INSTALMENT"] < 0 else 0, 
    axis=1
).astype(int)

installments_payments_df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,instalments_unpaid,instalments_partially_paid,instalments_overdue,amount_debt,dpd
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36,0,0,0,0.0,0
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525,0,0,0,0.0,0
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0,0,0,0,0.0,0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13,0,0,0,0.0,0
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585,0,1,0,4.45,17


In [87]:
installments_payments_df.loc[installments_payments_df["SK_ID_PREV"] == 1531600].sort_values(by=["DAYS_INSTALMENT"], ascending=False) 

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,instalments_unpaid,instalments_partially_paid,instalments_overdue,amount_debt,dpd
3794812,1531600,103793,1.0,27,-68.0,,49741.02,,1,0,1,49741.02,68
3764294,1531600,103793,1.0,26,-98.0,,49741.02,,1,0,1,49741.02,98
3764214,1531600,103793,1.0,25,-128.0,,49741.02,,1,0,1,49741.02,128
3799630,1531600,103793,1.0,24,-158.0,,49741.02,,1,0,1,49741.02,158
3774325,1531600,103793,1.0,23,-188.0,,49741.02,,1,0,1,49741.02,188
3784883,1531600,103793,1.0,22,-218.0,,49741.02,,1,0,1,49741.02,218
3764227,1531600,103793,1.0,21,-248.0,,49741.02,,1,0,1,49741.02,248
3764224,1531600,103793,1.0,20,-278.0,,49741.02,,1,0,1,49741.02,278
3779550,1531600,103793,1.0,19,-308.0,,49741.02,,1,0,1,49741.02,308
3779605,1531600,103793,1.0,18,-338.0,,49741.02,,1,0,1,49741.02,338


In [88]:
# Removemos columnas con NaN ya reemplazadas con las nuevas columnas
installments_payments_df = installments_payments_df.drop(columns=['DAYS_ENTRY_PAYMENT', 'AMT_PAYMENT'])

# Pasamos NUM_INSTALMENT_VERSION a int32
installments_payments_df['NUM_INSTALMENT_VERSION'] = installments_payments_df['NUM_INSTALMENT_VERSION'].astype('int32')

In [89]:
print("Count distinct of SK_ID_PREV: ", installments_payments_df["SK_ID_CURR"].nunique())

Count distinct of SK_ID_PREV:  339587


In [90]:
df_info_summary(installments_payments_df)

                            Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                        13605401           0     0.0    int64
SK_ID_CURR                        13605401           0     0.0    int64
NUM_INSTALMENT_VERSION            13605401           0     0.0    int32
NUM_INSTALMENT_NUMBER             13605401           0     0.0    int64
DAYS_INSTALMENT                   13605401           0     0.0  float64
AMT_INSTALMENT                    13605401           0     0.0  float64
instalments_unpaid                13605401           0     0.0    int32
instalments_partially_paid        13605401           0     0.0    int32
instalments_overdue               13605401           0     0.0    int32
amount_debt                       13605401           0     0.0  float64
dpd                               13605401           0     0.0    int32


In [91]:
# Sumarizar installments_payments_df por SK_ID_CURR para tener una fila por cada préstamo corriente
# Definir el diccionario de agregación según la columna
agg_dict = {
    # Columnas con maximo y renombrado
    'NUM_INSTALMENT_VERSION': [('IP_NUM_INSTALMENT_VERSION_MAX', 'max')],  # Para ver cual fue el maximo renegociación al que llego / IP_ para identificar que es de installments payments

    # Columnas con media y renombrado
    'dpd': [('IP_DPD_MEAN', 'mean')],
    'amount_debt': [('IP_AMOUNT_DEBT_MEAN', 'mean')],

    # Columnas con suma y renombrado
    'instalments_unpaid': [('IP_INSTALMENTS_UNPAID_SUM', 'sum')],
    'instalments_partially_paid': [('IP_INSTALMENTS_PARTIALLY_PAID_SUM', 'sum')],
    'instalments_overdue': [('IP_INSTALMENTS_OVERDUE_SUM', 'sum')]
}

# Aplanar el diccionario para la agregación
flat_agg_dict = {}
for col, operations in agg_dict.items():
    for new_name, func in operations:
        flat_agg_dict[new_name] = (col, func)

# Aplicar la agregación con renombrado
installments_payments_df_agg = installments_payments_df.groupby(['SK_ID_CURR']).agg(**flat_agg_dict).reset_index()

# Mostrar las primeras filas
installments_payments_df_agg.head()

Unnamed: 0,SK_ID_CURR,IP_NUM_INSTALMENT_VERSION_MAX,IP_DPD_MEAN,IP_AMOUNT_DEBT_MEAN,IP_INSTALMENTS_UNPAID_SUM,IP_INSTALMENTS_PARTIALLY_PAID_SUM,IP_INSTALMENTS_OVERDUE_SUM
0,100001,2,1.571429,0.0,0,0,0
1,100002,2,0.0,0.0,0,0,0
2,100003,2,0.0,0.0,0,0,0
3,100004,2,0.0,0.0,0,0,0
4,100005,2,0.111111,0.0,0,0,0


In [92]:
len(installments_payments_df_agg)

339587

In [93]:
df_info_summary(installments_payments_df_agg)

                                   Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                                 339587           0     0.0    int64
IP_NUM_INSTALMENT_VERSION_MAX              339587           0     0.0    int32
IP_DPD_MEAN                                339587           0     0.0  float64
IP_AMOUNT_DEBT_MEAN                        339587           0     0.0  float64
IP_INSTALMENTS_UNPAID_SUM                  339587           0     0.0    int32
IP_INSTALMENTS_PARTIALLY_PAID_SUM          339587           0     0.0    int32
IP_INSTALMENTS_OVERDUE_SUM                 339587           0     0.0    int32


In [94]:
print("Count distinct of SK_ID_CURR: ", installments_payments_df_agg["SK_ID_CURR"].nunique())

Count distinct of SK_ID_CURR:  339587


In [95]:
del installments_payments_df
gc.collect()

0

### Data prep - credit_card_balance

In [96]:
len(credit_card_balance_df)

3840312

In [97]:
# Info sobre las columnas del credit_card_balance_df
"""
Histórico mensual del comportamiento de las tarjetas de crédito.
•	SK_ID_PREV → ID del crédito previo (relacionado a tarjeta).
•	SK_ID_CURR → ID del préstamo en nuestro dataset principal.
•	MONTHS_BALANCE → Mes relativo a la aplicación actual (0 = aplicación, -1 = un mes antes).
•	AMT_BALANCE → Balance actual de la tarjeta ese mes.
•	AMT_CREDIT_LIMIT_ACTUAL → Límite de crédito vigente ese mes.
•	AMT_DRAWINGS_ATM_CURRENT → Monto retirado en cajero automático ese mes.
•	AMT_DRAWINGS_CURRENT → Monto total retirado ese mes (ATM + POS + otros).
•	AMT_DRAWINGS_OTHER_CURRENT → Monto retirado en canales distintos a cajero o POS.
•	AMT_DRAWINGS_POS_CURRENT → Monto gastado en POS (compras).
•	AMT_INST_MIN_REGULARITY → Pago mínimo requerido ese mes.
•	AMT_PAYMENT_CURRENT → Pago realizado ese mes.
•	AMT_PAYMENT_TOTAL_CURRENT → Pagos totales realizados ese mes (suma de todos los abonos).
•	AMT_RECEIVABLE_PRINCIPAL → Principal pendiente de pago.
•	AMT_RECIVABLE → Total pendiente (principal + intereses).
•	AMT_TOTAL_RECEIVABLE → Variante de cálculo del total pendiente.
•	CNT_DRAWINGS_ATM_CURRENT → Número de retiros en cajero ese mes.
•	CNT_DRAWINGS_CURRENT → Número total de operaciones con la tarjeta ese mes.
•	CNT_DRAWINGS_OTHER_CURRENT → Número de operaciones en otros canales.
•	CNT_DRAWINGS_POS_CURRENT → Número de operaciones de compra en POS.
•	CNT_INSTALMENT_MATURE_CUM → Número acumulado de cuotas ya pagadas.
•	NAME_CONTRACT_STATUS → Estado del contrato (ej. Active, Completed).
•	SK_DPD → Días de atraso ese mes.
•	SK_DPD_DEF → Días de atraso con criterio más estricto (ignora deudas pequeñas).
"""
df_info_summary(credit_card_balance_df)

                            Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                         3840312           0    0.00    int64
SK_ID_CURR                         3840312           0    0.00    int64
MONTHS_BALANCE                     3840312           0    0.00    int64
AMT_BALANCE                        3840312           0    0.00  float64
AMT_CREDIT_LIMIT_ACTUAL            3840312           0    0.00    int64
AMT_DRAWINGS_ATM_CURRENT           3090496      749816   19.52  float64
AMT_DRAWINGS_CURRENT               3840312           0    0.00  float64
AMT_DRAWINGS_OTHER_CURRENT         3090496      749816   19.52  float64
AMT_DRAWINGS_POS_CURRENT           3090496      749816   19.52  float64
AMT_INST_MIN_REGULARITY            3535076      305236    7.95  float64
AMT_PAYMENT_CURRENT                3072324      767988   20.00  float64
AMT_PAYMENT_TOTAL_CURRENT          3840312           0    0.00  float64
AMT_RECEIVABLE_PRINCIPAL           3840312           0    0.00  

In [98]:
# Por ahora reemplazamos nan con ceros, pero habría que ver si se puede mejorar.
# En este caso parece razonable rellenar con ceros obsvervando la informacion que arrojan las columnas con NaN.
credit_card_balance_df = credit_card_balance_df.fillna(0)
print("Columnas con valores NaN despues de rellenar:")
print(credit_card_balance_df.columns[credit_card_balance_df.isna().any()].tolist())

Columnas con valores NaN despues de rellenar:
[]


In [99]:
print("Count distinct of SK_ID_PREV: ", credit_card_balance_df["SK_ID_PREV"].nunique())
print("Count distinct of SK_ID_CURR: ", credit_card_balance_df["SK_ID_CURR"].nunique())

Count distinct of SK_ID_PREV:  104307
Count distinct of SK_ID_CURR:  103558


In [100]:
df_info_summary(credit_card_balance_df)

                            Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                         3840312           0     0.0    int64
SK_ID_CURR                         3840312           0     0.0    int64
MONTHS_BALANCE                     3840312           0     0.0    int64
AMT_BALANCE                        3840312           0     0.0  float64
AMT_CREDIT_LIMIT_ACTUAL            3840312           0     0.0    int64
AMT_DRAWINGS_ATM_CURRENT           3840312           0     0.0  float64
AMT_DRAWINGS_CURRENT               3840312           0     0.0  float64
AMT_DRAWINGS_OTHER_CURRENT         3840312           0     0.0  float64
AMT_DRAWINGS_POS_CURRENT           3840312           0     0.0  float64
AMT_INST_MIN_REGULARITY            3840312           0     0.0  float64
AMT_PAYMENT_CURRENT                3840312           0     0.0  float64
AMT_PAYMENT_TOTAL_CURRENT          3840312           0     0.0  float64
AMT_RECEIVABLE_PRINCIPAL           3840312           0     0.0  

In [101]:
# Sumarizar credit_card_balance_df por SK_ID_CURR para tener una fila por cada préstamo corriente
# Definir el diccionario de agregación según la columna
agg_dict = {
    # Columnas con media y renombrado
    'AMT_BALANCE': [('CCB_AMT_BALANCE_MEAN', 'mean')], # CCB_ para indicar que es de credit_card_balance
    'AMT_CREDIT_LIMIT_ACTUAL': [('CCB_AMT_CREDIT_LIMIT_ACTUAL_MEAN', 'mean')],
    'AMT_DRAWINGS_ATM_CURRENT': [('CCB_AMT_DRAWINGS_ATM_CURRENT_MEAN', 'mean')],
    'AMT_DRAWINGS_CURRENT': [('CCB_AMT_DRAWINGS_CURRENT_MEAN', 'mean')],
    'AMT_DRAWINGS_OTHER_CURRENT': [('CCB_AMT_DRAWINGS_OTHER_CURRENT_MEAN', 'mean')],
    'AMT_DRAWINGS_POS_CURRENT': [('CCB_AMT_DRAWINGS_POS_CURRENT_MEAN', 'mean')],
    'AMT_INST_MIN_REGULARITY': [('CCB_AMT_INST_MIN_REGULARITY_MEAN', 'mean')],
    'AMT_PAYMENT_CURRENT': [('CCB_AMT_PAYMENT_CURRENT_MEAN', 'mean')],
    'AMT_PAYMENT_TOTAL_CURRENT': [('CCB_AMT_PAYMENT_TOTAL_CURRENT_MEAN', 'mean')],
    'AMT_RECEIVABLE_PRINCIPAL': [('CCB_AMT_RECEIVABLE_PRINCIPAL_MEAN', 'mean')],
    'AMT_RECIVABLE': [('CCB_AMT_RECIVABLE_MEAN', 'mean')],
    'AMT_TOTAL_RECEIVABLE': [('CCB_AMT_TOTAL_RECEIVABLE_MEAN', 'mean')],
    'CNT_DRAWINGS_ATM_CURRENT': [('CCB_CNT_DRAWINGS_ATM_CURRENT_MEAN', 'mean')],
    'CNT_DRAWINGS_CURRENT': [('CCB_CNT_DRAWINGS_CURRENT_MEAN', 'mean')],
    'CNT_DRAWINGS_OTHER_CURRENT': [('CCB_CNT_DRAWINGS_OTHER_CURRENT_MEAN', 'mean')],
    'CNT_DRAWINGS_POS_CURRENT': [('CCB_CNT_DRAWINGS_POS_CURRENT_MEAN', 'mean')],
    'CNT_INSTALMENT_MATURE_CUM': [('CCB_CNT_INSTALMENT_MATURE_CUM_MEAN', 'mean')],
    'SK_DPD': [('CCB_SK_DPD_MEAN', 'mean')],
    'SK_DPD_DEF': [('CCB_SK_DPD_DEF_MEAN', 'mean')]
}

# Aplanar el diccionario para la agregación
flat_agg_dict = {}
for col, operations in agg_dict.items():
    for new_name, func in operations:
        flat_agg_dict[new_name] = (col, func)

# Aplicar la agregación con renombrado
credit_card_balance_df_agg = credit_card_balance_df.groupby(['SK_ID_CURR']).agg(**flat_agg_dict).reset_index()

# Mostrar las primeras filas
credit_card_balance_df_agg.head()

Unnamed: 0,SK_ID_CURR,CCB_AMT_BALANCE_MEAN,CCB_AMT_CREDIT_LIMIT_ACTUAL_MEAN,CCB_AMT_DRAWINGS_ATM_CURRENT_MEAN,CCB_AMT_DRAWINGS_CURRENT_MEAN,CCB_AMT_DRAWINGS_OTHER_CURRENT_MEAN,CCB_AMT_DRAWINGS_POS_CURRENT_MEAN,CCB_AMT_INST_MIN_REGULARITY_MEAN,CCB_AMT_PAYMENT_CURRENT_MEAN,CCB_AMT_PAYMENT_TOTAL_CURRENT_MEAN,CCB_AMT_RECEIVABLE_PRINCIPAL_MEAN,CCB_AMT_RECIVABLE_MEAN,CCB_AMT_TOTAL_RECEIVABLE_MEAN,CCB_CNT_DRAWINGS_ATM_CURRENT_MEAN,CCB_CNT_DRAWINGS_CURRENT_MEAN,CCB_CNT_DRAWINGS_OTHER_CURRENT_MEAN,CCB_CNT_DRAWINGS_POS_CURRENT_MEAN,CCB_CNT_INSTALMENT_MATURE_CUM_MEAN,CCB_SK_DPD_MEAN,CCB_SK_DPD_DEF_MEAN
0,100006,0.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100011,54482.111149,164189.189189,2432.432432,2432.432432,0.0,0.0,3902.759392,4843.064189,4520.067568,52402.088919,54433.179122,54433.179122,0.054054,0.054054,0.0,0.0,25.418919,0.0,0.0
2,100013,18159.919219,131718.75,5953.125,5953.125,0.0,0.0,1348.479375,7168.34625,6817.172344,17255.559844,18101.079844,18101.079844,0.239583,0.239583,0.0,0.0,17.354167,0.010417,0.010417
3,100021,0.0,675000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100023,0.0,135000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [102]:
len(credit_card_balance_df_agg)

103558

In [103]:
df_info_summary(credit_card_balance_df_agg)

                                     Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                                   103558           0     0.0    int64
CCB_AMT_BALANCE_MEAN                         103558           0     0.0  float64
CCB_AMT_CREDIT_LIMIT_ACTUAL_MEAN             103558           0     0.0  float64
CCB_AMT_DRAWINGS_ATM_CURRENT_MEAN            103558           0     0.0  float64
CCB_AMT_DRAWINGS_CURRENT_MEAN                103558           0     0.0  float64
CCB_AMT_DRAWINGS_OTHER_CURRENT_MEAN          103558           0     0.0  float64
CCB_AMT_DRAWINGS_POS_CURRENT_MEAN            103558           0     0.0  float64
CCB_AMT_INST_MIN_REGULARITY_MEAN             103558           0     0.0  float64
CCB_AMT_PAYMENT_CURRENT_MEAN                 103558           0     0.0  float64
CCB_AMT_PAYMENT_TOTAL_CURRENT_MEAN           103558           0     0.0  float64
CCB_AMT_RECEIVABLE_PRINCIPAL_MEAN            103558           0     0.0  float64
CCB_AMT_RECIVABLE_MEAN      

In [104]:
del credit_card_balance_df
gc.collect()

0

## Join with current application data

In [106]:
train_df = pd.read_csv("application_train.csv")

# Detectar columnas con solo "y"/"n" y mapear a 1/0
bool_cols = [c for c in train_df.columns if set(train_df[c].dropna().unique()) <= {"y", "n"}]
if bool_cols:
    train_df[bool_cols] = train_df[bool_cols].apply(lambda s: s.map({"y": 1, "n": 0}).astype("int8"))

In [107]:
len(train_df)

307511

In [108]:
df_info_summary(train_df)

                              Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                            307511           0    0.00    int64
TARGET                                307511           0    0.00    int64
NAME_CONTRACT_TYPE                    307511           0    0.00   object
CODE_GENDER                           307511           0    0.00   object
FLAG_OWN_CAR                          307511           0    0.00   object
FLAG_OWN_REALTY                       307511           0    0.00   object
CNT_CHILDREN                          307511           0    0.00    int64
AMT_INCOME_TOTAL                      307511           0    0.00  float64
AMT_CREDIT                            307511           0    0.00  float64
AMT_ANNUITY                           307499          12    0.00  float64
AMT_GOODS_PRICE                       307233         278    0.09  float64
NAME_TYPE_SUITE                       306219        1292    0.42   object
NAME_INCOME_TYPE                      

In [109]:
print("Count distinct of SK_ID_CURR: ", train_df["SK_ID_CURR"].nunique())

Count distinct of SK_ID_CURR:  307511


In [110]:
# Join train_df con bureau_df
train_df_join = train_df.merge(bureau_df_agg, on="SK_ID_CURR", how="left")
print("rows after join:", len(train_df_join))

rows after join: 307511


In [111]:
# Join train_df con previous_application_df
train_df_join = train_df_join.merge(previous_application_summary, on="SK_ID_CURR", how="left")
print("rows after join:", len(train_df_join))

rows after join: 307511


In [112]:
# Join train_df con pos_cash_balance_df
train_df_join = train_df_join.merge(pos_cash_balance_df_agg, on="SK_ID_CURR", how="left")
print("rows after join:", len(train_df_join))

rows after join: 307511


In [113]:
# Join train_df con installments_payments_df
train_df_join = train_df_join.merge(installments_payments_df_agg, on="SK_ID_CURR", how="left")
print("rows after join:", len(train_df_join))

rows after join: 307511


In [114]:
# Join train_df con credit_card_balance_df
train_df_join = train_df_join.merge(credit_card_balance_df_agg, on="SK_ID_CURR", how="left")
print("rows after join:", len(train_df_join))

rows after join: 307511


In [115]:
train_df_join.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,AMT_ANNUITY_x,AMT_GOODS_PRICE_x,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START_x,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DAYS_CREDIT_MEAN,DAYS_CREDIT_ENDDATE_MEAN,DAYS_ENDDATE_FACT_MEAN,AMT_CREDIT_MAX_OVERDUE_MEAN,AMT_CREDIT_SUM_MEAN,AMT_CREDIT_SUM_DEBT_MEAN,AMT_CREDIT_SUM_LIMIT_MEAN,AMT_CREDIT_SUM_OVERDUE_MEAN,DAYS_CREDIT_UPDATE_MEAN,AMT_ANNUITY_MEAN,CREDIT_DAY_OVERDUE_SUM,CNT_CREDIT_PROLONG_SUM,MONTHS_WITH_STATUS_DPD_0_SUM,MONTHS_WITH_STATUS_DPD_1to30_SUM,MONTHS_WITH_STATUS_DPD_31to60_SUM,MONTHS_WITH_STATUS_DPD_61to90_SUM,MONTHS_WITH_STATUS_DPD_91to120_SUM,MONTHS_WITH_STATUS_DPD_over120_SUM,MONTHS_WITH_STATUS_CLOSED_SUM,MONTHS_WITH_STATUS_UNKNOWN_SUM,CREDIT_ACTIVE_Active_SUM,CREDIT_ACTIVE_Bad debt_SUM,CREDIT_ACTIVE_Closed_SUM,CREDIT_ACTIVE_Sold_SUM,CREDIT_CURRENCY_currency 1_SUM,CREDIT_CURRENCY_currency 2_SUM,CREDIT_CURRENCY_currency 3_SUM,CREDIT_CURRENCY_currency 4_SUM,CREDIT_TYPE_Another type of loan_SUM,CREDIT_TYPE_Car loan_SUM,CREDIT_TYPE_Cash loan (non-earmarked)_SUM,CREDIT_TYPE_Consumer credit_SUM,CREDIT_TYPE_Credit card_SUM,CREDIT_TYPE_Interbank credit_SUM,CREDIT_TYPE_Loan for business development_SUM,CREDIT_TYPE_Loan for purchase of shares (margin lending)_SUM,CREDIT_TYPE_Loan for the purchase of equipment_SUM,CREDIT_TYPE_Loan for working capital replenishment_SUM,CREDIT_TYPE_Microloan_SUM,CREDIT_TYPE_Mobile operator loan_SUM,CREDIT_TYPE_Mortgage_SUM,CREDIT_TYPE_Real estate loan_SUM,CREDIT_TYPE_Unknown type of loan_SUM,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Consumer loans,NAME_CONTRACT_TYPE_Revolving loans,NAME_CONTRACT_TYPE_XNA,NAME_CONTRACT_TYPE_nan,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY,WEEKDAY_APPR_PROCESS_START_nan,FLAG_LAST_APPL_PER_CONTRACT_N,FLAG_LAST_APPL_PER_CONTRACT_Y,FLAG_LAST_APPL_PER_CONTRACT_nan,NAME_CASH_LOAN_PURPOSE_Building a house or an annex,NAME_CASH_LOAN_PURPOSE_Business development,NAME_CASH_LOAN_PURPOSE_Buying a garage,NAME_CASH_LOAN_PURPOSE_Buying a holiday home / land,NAME_CASH_LOAN_PURPOSE_Buying a home,NAME_CASH_LOAN_PURPOSE_Buying a new car,NAME_CASH_LOAN_PURPOSE_Buying a used car,NAME_CASH_LOAN_PURPOSE_Car repairs,NAME_CASH_LOAN_PURPOSE_Education,NAME_CASH_LOAN_PURPOSE_Everyday expenses,NAME_CASH_LOAN_PURPOSE_Furniture,NAME_CASH_LOAN_PURPOSE_Gasification / water supply,NAME_CASH_LOAN_PURPOSE_Hobby,NAME_CASH_LOAN_PURPOSE_Journey,NAME_CASH_LOAN_PURPOSE_Medicine,NAME_CASH_LOAN_PURPOSE_Money for a third person,NAME_CASH_LOAN_PURPOSE_Other,NAME_CASH_LOAN_PURPOSE_Payments on other loans,NAME_CASH_LOAN_PURPOSE_Purchase of electronic equipment,NAME_CASH_LOAN_PURPOSE_Refusal to name the goal,NAME_CASH_LOAN_PURPOSE_Repairs,NAME_CASH_LOAN_PURPOSE_Urgent needs,NAME_CASH_LOAN_PURPOSE_Wedding / gift / holiday,NAME_CASH_LOAN_PURPOSE_XAP,NAME_CASH_LOAN_PURPOSE_XNA,NAME_CASH_LOAN_PURPOSE_nan,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Canceled,NAME_CONTRACT_STATUS_Refused,NAME_CONTRACT_STATUS_Unused offer,NAME_CONTRACT_STATUS_nan,NAME_PAYMENT_TYPE_Cash through the bank,NAME_PAYMENT_TYPE_Cashless from the account of the employer,NAME_PAYMENT_TYPE_Non-cash from your account,NAME_PAYMENT_TYPE_XNA,NAME_PAYMENT_TYPE_nan,CODE_REJECT_REASON_CLIENT,CODE_REJECT_REASON_HC,CODE_REJECT_REASON_LIMIT,CODE_REJECT_REASON_SCO,CODE_REJECT_REASON_SCOFR,CODE_REJECT_REASON_SYSTEM,CODE_REJECT_REASON_VERIF,CODE_REJECT_REASON_XAP,CODE_REJECT_REASON_XNA,CODE_REJECT_REASON_nan,NAME_TYPE_SUITE_0,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,NAME_TYPE_SUITE_Group of people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,NAME_TYPE_SUITE_nan,NAME_CLIENT_TYPE_New,NAME_CLIENT_TYPE_Refreshed,NAME_CLIENT_TYPE_Repeater,NAME_CLIENT_TYPE_XNA,NAME_CLIENT_TYPE_nan,NAME_GOODS_CATEGORY_Additional Service,NAME_GOODS_CATEGORY_Animals,NAME_GOODS_CATEGORY_Audio/Video,NAME_GOODS_CATEGORY_Auto Accessories,NAME_GOODS_CATEGORY_Clothing and Accessories,NAME_GOODS_CATEGORY_Computers,NAME_GOODS_CATEGORY_Construction Materials,NAME_GOODS_CATEGORY_Consumer Electronics,NAME_GOODS_CATEGORY_Direct Sales,NAME_GOODS_CATEGORY_Education,NAME_GOODS_CATEGORY_Fitness,NAME_GOODS_CATEGORY_Furniture,NAME_GOODS_CATEGORY_Gardening,NAME_GOODS_CATEGORY_Homewares,NAME_GOODS_CATEGORY_House Construction,NAME_GOODS_CATEGORY_Insurance,NAME_GOODS_CATEGORY_Jewelry,NAME_GOODS_CATEGORY_Medical Supplies,NAME_GOODS_CATEGORY_Medicine,NAME_GOODS_CATEGORY_Mobile,NAME_GOODS_CATEGORY_Office Appliances,NAME_GOODS_CATEGORY_Other,NAME_GOODS_CATEGORY_Photo / Cinema Equipment,NAME_GOODS_CATEGORY_Sport and Leisure,NAME_GOODS_CATEGORY_Tourism,NAME_GOODS_CATEGORY_Vehicles,NAME_GOODS_CATEGORY_Weapon,NAME_GOODS_CATEGORY_XNA,NAME_GOODS_CATEGORY_nan,NAME_PORTFOLIO_Cards,NAME_PORTFOLIO_Cars,NAME_PORTFOLIO_Cash,NAME_PORTFOLIO_POS,NAME_PORTFOLIO_XNA,NAME_PORTFOLIO_nan,NAME_PRODUCT_TYPE_XNA,NAME_PRODUCT_TYPE_walk-in,NAME_PRODUCT_TYPE_x-sell,NAME_PRODUCT_TYPE_nan,CHANNEL_TYPE_AP+ (Cash loan),CHANNEL_TYPE_Car dealer,CHANNEL_TYPE_Channel of corporate sales,CHANNEL_TYPE_Contact center,CHANNEL_TYPE_Country-wide,CHANNEL_TYPE_Credit and cash offices,CHANNEL_TYPE_Regional / Local,CHANNEL_TYPE_Stone,CHANNEL_TYPE_nan,NAME_SELLER_INDUSTRY_Auto technology,NAME_SELLER_INDUSTRY_Clothing,NAME_SELLER_INDUSTRY_Connectivity,NAME_SELLER_INDUSTRY_Construction,NAME_SELLER_INDUSTRY_Consumer electronics,NAME_SELLER_INDUSTRY_Furniture,NAME_SELLER_INDUSTRY_Industry,NAME_SELLER_INDUSTRY_Jewelry,NAME_SELLER_INDUSTRY_MLM partners,NAME_SELLER_INDUSTRY_Tourism,NAME_SELLER_INDUSTRY_XNA,NAME_SELLER_INDUSTRY_nan,NAME_YIELD_GROUP_XNA,NAME_YIELD_GROUP_high,NAME_YIELD_GROUP_low_action,NAME_YIELD_GROUP_low_normal,NAME_YIELD_GROUP_middle,NAME_YIELD_GROUP_nan,PRODUCT_COMBINATION_0,PRODUCT_COMBINATION_Card Street,PRODUCT_COMBINATION_Card X-Sell,PRODUCT_COMBINATION_Cash,PRODUCT_COMBINATION_Cash Street: high,PRODUCT_COMBINATION_Cash Street: low,PRODUCT_COMBINATION_Cash Street: middle,PRODUCT_COMBINATION_Cash X-Sell: high,PRODUCT_COMBINATION_Cash X-Sell: low,PRODUCT_COMBINATION_Cash X-Sell: middle,PRODUCT_COMBINATION_POS household with interest,PRODUCT_COMBINATION_POS household without interest,PRODUCT_COMBINATION_POS industry with interest,PRODUCT_COMBINATION_POS industry without interest,PRODUCT_COMBINATION_POS mobile with interest,PRODUCT_COMBINATION_POS mobile without interest,PRODUCT_COMBINATION_POS other with interest,PRODUCT_COMBINATION_POS others without interest,PRODUCT_COMBINATION_nan,AMT_ANNUITY_y,AMT_APPLICATION,AMT_CREDIT_y,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE_y,HOUR_APPR_PROCESS_START_y,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,DAYS_DECISION,SELLERPLACE_AREA,CNT_PAYMENT,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL,PCB_CNT_INSTALMENT_MEAN,PCB_CNT_INSTALMENT_FUTURE_MEAN,PCB_SK_DPD_MEAN,PCB_SK_DPD_DEF_MEAN,PCB_MONTHS_WITH_STATUS_Active_SUM,PCB_MONTHS_WITH_STATUS_Amortized_debt_SUM,PCB_MONTHS_WITH_STATUS_Approved_SUM,PCB_MONTHS_WITH_STATUS_Canceled_SUM,PCB_MONTHS_WITH_STATUS_Completed_SUM,PCB_MONTHS_WITH_STATUS_Demand_SUM,PCB_MONTHS_WITH_STATUS_Returned_to_the_store_SUM,PCB_MONTHS_WITH_STATUS_Signed_SUM,PCB_MONTHS_WITH_STATUS_XNA_SUM,IP_NUM_INSTALMENT_VERSION_MAX,IP_DPD_MEAN,IP_AMOUNT_DEBT_MEAN,IP_INSTALMENTS_UNPAID_SUM,IP_INSTALMENTS_PARTIALLY_PAID_SUM,IP_INSTALMENTS_OVERDUE_SUM,CCB_AMT_BALANCE_MEAN,CCB_AMT_CREDIT_LIMIT_ACTUAL_MEAN,CCB_AMT_DRAWINGS_ATM_CURRENT_MEAN,CCB_AMT_DRAWINGS_CURRENT_MEAN,CCB_AMT_DRAWINGS_OTHER_CURRENT_MEAN,CCB_AMT_DRAWINGS_POS_CURRENT_MEAN,CCB_AMT_INST_MIN_REGULARITY_MEAN,CCB_AMT_PAYMENT_CURRENT_MEAN,CCB_AMT_PAYMENT_TOTAL_CURRENT_MEAN,CCB_AMT_RECEIVABLE_PRINCIPAL_MEAN,CCB_AMT_RECIVABLE_MEAN,CCB_AMT_TOTAL_RECEIVABLE_MEAN,CCB_CNT_DRAWINGS_ATM_CURRENT_MEAN,CCB_CNT_DRAWINGS_CURRENT_MEAN,CCB_CNT_DRAWINGS_OTHER_CURRENT_MEAN,CCB_CNT_DRAWINGS_POS_CURRENT_MEAN,CCB_CNT_INSTALMENT_MATURE_CUM_MEAN,CCB_SK_DPD_MEAN,CCB_SK_DPD_DEF_MEAN
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,0.0252,0.0383,0.9722,0.6341,0.0144,0.0,0.069,0.0833,0.125,0.0377,0.022,0.0198,0.0,0.0,0.025,0.0369,0.9722,0.6243,0.0144,0.0,0.069,0.0833,0.125,0.0375,0.0205,0.0193,0.0,0.0,reg oper account,block of flats,0.0149,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,-874.0,-261.75,-523.125,1050.643125,108131.945625,30722.625,3998.570625,0.0,-499.875,0.0,0.0,0.0,45.0,27.0,0.0,0.0,0.0,0.0,23.0,15.0,2.0,0.0,6.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9251.775,179055.0,179055.0,0.0,179055.0,9.0,1.0,0.0,0.0,0.0,-606.0,500.0,24.0,365243.0,-565.0,125.0,-25.0,-17.0,0.0,24.0,15.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,School,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.0924,0.0538,0.9851,0.804,0.0497,0.0806,0.0345,0.2917,0.3333,0.0128,0.079,0.0554,0.0,0.0,0.0968,0.0529,0.9851,0.7987,0.0608,0.08,0.0345,0.2917,0.3333,0.0132,0.0787,0.0558,0.0039,0.01,reg oper account,block of flats,0.0714,Block,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,-1400.75,-544.5,-823.0,0.0,254350.125,0.0,202500.0,0.0,-816.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,56553.99,435436.5,484191.0,2295.0,435436.5,14.666667,1.0,0.033354,0.0,0.0,-1305.0,533.0,10.0,365243.0,-1274.333333,-1004.333333,-1054.333333,-1047.333333,0.666667,10.107143,5.785714,0.0,0.0,26.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,Laborers,1.0,2,2,MONDAY,9,0,0,0,0,0,0,Government,,0.555912,0.729567,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,-867.0,-488.5,-532.5,0.0,94518.9,0.0,0.0,0.0,-532.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,5357.25,24282.0,20106.0,4860.0,24282.0,5.0,1.0,0.212008,0.0,0.0,-815.0,30.0,4.0,365243.0,-784.0,-694.0,-724.0,-714.0,0.0,3.75,2.25,0.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,-9833.0,-2437,,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.650442,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,2.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,6.0,1.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,5.0,3.0,1.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,8.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,1.0,0.0,3.0,2.0,3.0,0.0,5.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,4.0,2.0,0.0,2.0,1.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,15767.45,272203.26,291695.5,7742.26,272203.26,14.666667,1.0,0.036314,0.0,0.0,-272.444444,894.222222,15.333333,162330.222222,40474.0,40704.0,81101.111111,81103.0,0.0,11.428571,8.238095,0.0,0.0,18.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,-4311.0,-3458,,1,1,0,1,0,0,Core staff,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.322738,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,-1149.0,-783.0,-783.0,0.0,146250.0,0.0,0.0,0.0,-783.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,0.0,6.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,5.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,2.0,0.0,0.0,2.0,1.0,3.0,0.0,1.0,0.0,0.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,12278.805,150530.25,166638.75,1130.25,150530.25,12.333333,1.0,0.053172,0.0,0.0,-1222.833333,409.166667,20.666667,304369.166667,-1052.666667,-697.666667,60113.5,60119.833333,0.5,15.333333,8.969697,0.0,0.0,62.0,0.0,0.0,0.0,3.0,0.0,0.0,1.0,0.0,2.0,0.954545,452.384394,0.0,6.0,0.0,,,,,,,,,,,,,,,,,,,


In [116]:
df_info_summary(train_df_join)

                                                    Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                                                  307511           0    0.00    int64
TARGET                                                      307511           0    0.00    int64
NAME_CONTRACT_TYPE                                          307511           0    0.00   object
CODE_GENDER                                                 307511           0    0.00   object
FLAG_OWN_CAR                                                307511           0    0.00   object
FLAG_OWN_REALTY                                             307511           0    0.00   object
CNT_CHILDREN                                                307511           0    0.00    int64
AMT_INCOME_TOTAL                                            307511           0    0.00  float64
AMT_CREDIT_x                                                307511           0    0.00  float64
AMT_ANNUITY_x                           

In [119]:
# Guardar df final en formato parquet
train_df_join.to_parquet(f"./train_1_1_2.parquet")