# Adquisición de datos

## Functions and libraries

In [1]:
import os
# PATH = "/Users/luanagiusto/TP-1-ML"  # Cambia esto si tu path es diferente
PATH = "C:/Users/julia/OneDrive/Escritorio/Archivos/Capacitación/Maestría/03. Machine Learning/TP"

In [2]:
import pandas as pd
import numpy as np
# from ydata_profiling import ProfileReport
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)
import gc
# Nota: Antes de ejecutar este notebook, instala los requisitos con:
# !pip install -r requirements.txt

In [3]:
def data_profiling(df, output_file):
    # Opciones para que sea liviano
    profile = ProfileReport(
        df.sample(20000, random_state=42) if len(df) > 20000 else df,
        title=output_file,
        minimal=True,         # desactiva análisis costosos
        explorative=True      # agrega secciones útiles
    )

    profile.to_file(output_file)  # <-- abre este HTML en el navegador

In [4]:
# Funcion para mostrar un resumen del dataframe
def df_info_summary(df: pd.DataFrame):
    total = len(df)
    non_null = df.notnull().sum()
    nulls = df.isnull().sum()
    dtypes = df.dtypes
    
    resumen = pd.DataFrame({
        "Non-Null Count": non_null,
        "Null Count": nulls,
        "% Null": (nulls / total * 100).round(2),
        "Dtype": dtypes
    })
    print(resumen)

In [None]:
def resumir_por_id(df, id_col='ID', excluir_cols=None, verbose=False, nombre_conteo='n_registros'):
    """
    Sumariza un DataFrame agrupando por una columna ID.
    Calcula métricas estadísticas básicas para columnas numéricas,
    excluyendo las que se indiquen. Incluye conteo total de registros por ID.

    Parámetros:
    - df: DataFrame de entrada con múltiples registros por ID.
    - id_col: nombre de la columna que identifica cada entidad única.
    - excluir_cols: lista de columnas a excluir del resumen (opcional).
    - verbose: si True, imprime columnas incluidas y excluidas.
    - nombre_conteo: nombre de la columna que indica cantidad de registros por ID.

    Retorna:
    - DataFrame con una fila por ID y métricas estadísticas por columna.
    """
    if excluir_cols is None:
        excluir_cols = []

    excluir_set = set(excluir_cols)
    if id_col in excluir_set:
        excluir_set.remove(id_col)

    numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
    cols_a_resumir = [col for col in numeric_cols if col not in excluir_set and col != id_col]

    if verbose:
        print(f"Columnas excluidas: {sorted(excluir_set)}")
        print(f"Columnas resumidas: {sorted(cols_a_resumir)}")

    # Agregaciones estadísticas
    agg_funcs = ['mean', 'min', 'max', 'median', 'sum']
    agg_dict = {col: agg_funcs for col in cols_a_resumir}

    # Agregar conteo de registros por ID
    df[nombre_conteo] = 1
    agg_dict[nombre_conteo] = ['count']

    resumen = df.groupby(id_col).agg(agg_dict)
    resumen.columns = [f"{col}_{stat}" for col, stat in resumen.columns]
    resumen = resumen.reset_index()

    return resumen

## Bureau data

### Data import and overview

In [5]:
# Create data profiles for bureau and bureau_balance datasets

bureau_df = pd.read_csv(os.path.join(PATH, "home-credit-default-risk/bureau.csv"))
bureau_balance_df = pd.read_csv(os.path.join(PATH, "home-credit-default-risk/bureau_balance.csv"))

# data_profiling(bureau_df, "bureau_df_profile.html")
# data_profiling(bureau_balance_df, "bureau_df_balance_profile.html")


In [6]:
# Mostrar las primeras filas de bureau dataset
bureau_df.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


In [7]:
len(bureau_df)

1716428

In [8]:
# Display the first few rows of the bureau balance data set
bureau_balance_df.head()

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C


In [9]:
len(bureau_balance_df)

27299925

In [10]:
# Info sobre las columnas del bureau_df
"""
1. **SK_ID_CURR**
   * ID del cliente (llave para unir con `application_{train|test}.csv`).
2. **SK_ID_BUREAU**
   * ID único del préstamo en el Buró de Crédito (llave para unir con `bureau_balance.csv`).
3. **CREDIT_ACTIVE**
   * Estado actual del crédito reportado.
   * Valores: *Active, Closed, Sold, Bad debt*.
4. **CREDIT_CURRENCY**
   * Moneda en que está registrado el crédito en el Buró.
5. **DAYS_CREDIT**
   * Días relativos a la fecha de aplicación en Home Credit en que se otorgó este préstamo externo.
   * Ej: `-1000` → el préstamo fue otorgado 1000 días antes de la aplicación.
6. **CREDIT_DAY_OVERDUE**
   * Número de días de atraso en pagos en el momento de la aplicación (si aplica).
7. **DAYS_CREDIT_ENDDATE**
   * Duración **restante** del crédito (en días) al momento de la aplicación.
   * Positivo = le quedan días para terminar.
   * Negativo = ya debería haber finalizado.
8. **DAYS_ENDDATE_FACT**
   * Días desde la **finalización real** del crédito, al momento de la aplicación (solo si está cerrado).
   * Negativo = terminó antes de la aplicación.
9. **AMT_CREDIT_MAX_OVERDUE**
   * Monto máximo de deuda vencida registrado durante la vida de ese crédito.
10. **CNT_CREDIT_PROLONG**
    * Cantidad de veces que se extendió/prorrogó este crédito.
11. **AMT_CREDIT_SUM**
    * Monto actual del crédito según Buró.
12. **AMT_CREDIT_SUM_DEBT**
    * Monto actual de deuda pendiente de ese crédito.
13. **AMT_CREDIT_SUM_LIMIT**
    * Límite actual de crédito (si es aplicable, ej. tarjeta).
14. **AMT_CREDIT_SUM_OVERDUE**
    * Monto actual vencido en ese crédito.
15. **CREDIT_TYPE**
    * Tipo de crédito según Buró (ej: *Car loan, Consumer credit, Mortgage, Credit card*).
16. **DAYS_CREDIT_UPDATE**
    * Días relativos a la aplicación en que se actualizó por última vez la info del crédito en el Buró.
17. **AMT_ANNUITY**
    * Monto de la cuota periódica (anualidad) reportado en el Buró para este crédito.
"""
df_info_summary(bureau_df)

                        Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                     1716428           0    0.00    int64
SK_ID_BUREAU                   1716428           0    0.00    int64
CREDIT_ACTIVE                  1716428           0    0.00   object
CREDIT_CURRENCY                1716428           0    0.00   object
DAYS_CREDIT                    1716428           0    0.00    int64
CREDIT_DAY_OVERDUE             1716428           0    0.00    int64
DAYS_CREDIT_ENDDATE            1610875      105553    6.15  float64
DAYS_ENDDATE_FACT              1082775      633653   36.92  float64
AMT_CREDIT_MAX_OVERDUE          591940     1124488   65.51  float64
CNT_CREDIT_PROLONG             1716428           0    0.00    int64
AMT_CREDIT_SUM                 1716415          13    0.00  float64
AMT_CREDIT_SUM_DEBT            1458759      257669   15.01  float64
AMT_CREDIT_SUM_LIMIT           1124648      591780   34.48  float64
AMT_CREDIT_SUM_OVERDUE         1716428          

In [11]:
# Info sobre las columnas del bureau_balance_df
"""
SK_ID_BUREAU → vincula con bureau.
MONTHS_BALANCE → mes relativo a la aplicación actual (ej. -1 = mes anterior, -6 = seis meses antes).
STATUS → estado en ese mes:
0 = al día (DPD 0)
1 = atraso 1–30 días
2 = atraso 31–60
3 = atraso 61–90
4 = atraso 91–120
5 = atraso 120+ o vendido/castigado
C = cerrado
X = desconocido

**DPD = Days Past Due
"""

df_info_summary(bureau_balance_df)

                Non-Null Count  Null Count  % Null   Dtype
SK_ID_BUREAU          27299925           0     0.0   int64
MONTHS_BALANCE        27299925           0     0.0   int64
STATUS                27299925           0     0.0  object


### Data prep - bureau.csv 

In [12]:
# bureau_df.loc[bureau_df["DAYS_CREDIT_ENDDATE"].isna(), ["CREDIT_TYPE", "CREDIT_ACTIVE"]].value_counts()
# bureau_df.loc[bureau_df["DAYS_ENDDATE_FACT"].isna(), ["CREDIT_TYPE", "CREDIT_ACTIVE"]].value_counts()
# bureau_df.loc[bureau_df["AMT_CREDIT_SUM"].isna(), ["CREDIT_TYPE", "CREDIT_ACTIVE"]].value_counts()
# bureau_df.loc[bureau_df["AMT_CREDIT_SUM_DEBT"].isna(), ["CREDIT_TYPE", "CREDIT_ACTIVE"]].value_counts()
# bureau_df.loc[bureau_df["AMT_CREDIT_SUM_LIMIT"].isna(), ["CREDIT_TYPE", "CREDIT_ACTIVE"]].value_counts()
bureau_df.loc[bureau_df["AMT_ANNUITY"].isna(), ["CREDIT_TYPE", "CREDIT_ACTIVE"]].value_counts()


CREDIT_TYPE                                   CREDIT_ACTIVE
Consumer credit                               Closed           683665
                                              Active           208321
Credit card                                   Active           206898
                                              Closed            84338
Car loan                                      Closed            12090
Mortgage                                      Active             8047
Car loan                                      Active             6267
Microloan                                     Closed             4773
Mortgage                                      Closed             3457
Consumer credit                               Sold               2257
Credit card                                   Sold               1470
Microloan                                     Active             1406
Loan for business development                 Closed             1140
Another type of loan          

In [13]:
# Por ahora reemplazo nan con ceros, pero habría que ver si se puede mejorar
bureau_df = bureau_df.fillna(0)
print("Columnas con valores NaN despues de rellenar:")
print(bureau_df.columns[bureau_df.isna().any()].tolist())

Columnas con valores NaN despues de rellenar:
[]


In [14]:
df_info_summary(bureau_df)

                        Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                     1716428           0     0.0    int64
SK_ID_BUREAU                   1716428           0     0.0    int64
CREDIT_ACTIVE                  1716428           0     0.0   object
CREDIT_CURRENCY                1716428           0     0.0   object
DAYS_CREDIT                    1716428           0     0.0    int64
CREDIT_DAY_OVERDUE             1716428           0     0.0    int64
DAYS_CREDIT_ENDDATE            1716428           0     0.0  float64
DAYS_ENDDATE_FACT              1716428           0     0.0  float64
AMT_CREDIT_MAX_OVERDUE         1716428           0     0.0  float64
CNT_CREDIT_PROLONG             1716428           0     0.0    int64
AMT_CREDIT_SUM                 1716428           0     0.0  float64
AMT_CREDIT_SUM_DEBT            1716428           0     0.0  float64
AMT_CREDIT_SUM_LIMIT           1716428           0     0.0  float64
AMT_CREDIT_SUM_OVERDUE         1716428          

### Data prep - bureau_balance.csv

In [15]:
len(bureau_balance_df)

27299925

In [16]:
# Crear tabla de conteo de status por SK_ID_BUREAU
# OHE con get_dummies
status_dummies = pd.get_dummies(bureau_balance_df["STATUS"], prefix="SUM_STATUS", dtype="int32")

# Concatenar con SK_ID_BUREAU
tmp = pd.concat([bureau_balance_df[["SK_ID_BUREAU"]], status_dummies], axis=1)

# Agrupar por SK_ID_BUREAU y sumar
status_counts = tmp.groupby("SK_ID_BUREAU", as_index=False).sum()

# Renombrar columnas para que sean más claras
status_counts = status_counts.rename(columns={'SUM_STATUS_0': 'MONTHS_WITH_STATUS_DPD_0',
    'SUM_STATUS_1': 'MONTHS_WITH_STATUS_DPD_1to30',
    'SUM_STATUS_2': 'MONTHS_WITH_STATUS_DPD_31to60',
    'SUM_STATUS_3': 'MONTHS_WITH_STATUS_DPD_61to90',
    'SUM_STATUS_4': 'MONTHS_WITH_STATUS_DPD_91to120',
    'SUM_STATUS_5': 'MONTHS_WITH_STATUS_DPD_over120',
    'SUM_STATUS_C': 'MONTHS_WITH_STATUS_CLOSED',
    'SUM_STATUS_X': 'MONTHS_WITH_STATUS_UNKNOWN'
    })

In [17]:
status_counts.head()

Unnamed: 0,SK_ID_BUREAU,MONTHS_WITH_STATUS_DPD_0,MONTHS_WITH_STATUS_DPD_1to30,MONTHS_WITH_STATUS_DPD_31to60,MONTHS_WITH_STATUS_DPD_61to90,MONTHS_WITH_STATUS_DPD_91to120,MONTHS_WITH_STATUS_DPD_over120,MONTHS_WITH_STATUS_CLOSED,MONTHS_WITH_STATUS_UNKNOWN
0,5001709,0,0,0,0,0,0,86,11
1,5001710,5,0,0,0,0,0,48,30
2,5001711,3,0,0,0,0,0,0,1
3,5001712,10,0,0,0,0,0,9,0
4,5001713,0,0,0,0,0,0,0,22


In [18]:
df_info_summary(status_counts)

                                Non-Null Count  Null Count  % Null  Dtype
SK_ID_BUREAU                            817395           0     0.0  int64
MONTHS_WITH_STATUS_DPD_0                817395           0     0.0  int32
MONTHS_WITH_STATUS_DPD_1to30            817395           0     0.0  int32
MONTHS_WITH_STATUS_DPD_31to60           817395           0     0.0  int32
MONTHS_WITH_STATUS_DPD_61to90           817395           0     0.0  int32
MONTHS_WITH_STATUS_DPD_91to120          817395           0     0.0  int32
MONTHS_WITH_STATUS_DPD_over120          817395           0     0.0  int32
MONTHS_WITH_STATUS_CLOSED               817395           0     0.0  int32
MONTHS_WITH_STATUS_UNKNOWN              817395           0     0.0  int32


In [19]:
len(status_counts)

817395

### Bureau datasets join

In [20]:
print("rows before join:", len(bureau_df))

rows before join: 1716428


In [21]:
# Join final de tablas bureau y bureau_balance agregando los conteos de status
bureau_df_join = bureau_df.merge(status_counts, on="SK_ID_BUREAU", how="left")

# Rellenar NaN con 0 y casteo a int32
status_cols = [col for col in bureau_df_join.columns if col.startswith("MONTHS_WITH_STATUS_")]
bureau_df_join[status_cols] = bureau_df_join[status_cols].fillna(0).astype("int8")

In [22]:
print("rows after join:", len(bureau_df_join))

rows after join: 1716428


In [23]:
bureau_df_join.head()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY,MONTHS_WITH_STATUS_DPD_0,MONTHS_WITH_STATUS_DPD_1to30,MONTHS_WITH_STATUS_DPD_31to60,MONTHS_WITH_STATUS_DPD_61to90,MONTHS_WITH_STATUS_DPD_91to120,MONTHS_WITH_STATUS_DPD_over120,MONTHS_WITH_STATUS_CLOSED,MONTHS_WITH_STATUS_UNKNOWN
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,0.0,0,91323.0,0.0,0.0,0.0,Consumer credit,-131,0.0,0,0,0,0,0,0,0,0
1,215354,5714463,Active,currency 1,-208,0,1075.0,0.0,0.0,0,225000.0,171342.0,0.0,0.0,Credit card,-20,0.0,0,0,0,0,0,0,0,0
2,215354,5714464,Active,currency 1,-203,0,528.0,0.0,0.0,0,464323.5,0.0,0.0,0.0,Consumer credit,-16,0.0,0,0,0,0,0,0,0,0
3,215354,5714465,Active,currency 1,-203,0,0.0,0.0,0.0,0,90000.0,0.0,0.0,0.0,Credit card,-16,0.0,0,0,0,0,0,0,0,0
4,215354,5714466,Active,currency 1,-629,0,1197.0,0.0,77674.5,0,2700000.0,0.0,0.0,0.0,Consumer credit,-21,0.0,0,0,0,0,0,0,0,0


In [24]:
df_info_summary(bureau_df_join)

                                Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                             1716428           0     0.0    int64
SK_ID_BUREAU                           1716428           0     0.0    int64
CREDIT_ACTIVE                          1716428           0     0.0   object
CREDIT_CURRENCY                        1716428           0     0.0   object
DAYS_CREDIT                            1716428           0     0.0    int64
CREDIT_DAY_OVERDUE                     1716428           0     0.0    int64
DAYS_CREDIT_ENDDATE                    1716428           0     0.0  float64
DAYS_ENDDATE_FACT                      1716428           0     0.0  float64
AMT_CREDIT_MAX_OVERDUE                 1716428           0     0.0  float64
CNT_CREDIT_PROLONG                     1716428           0     0.0    int64
AMT_CREDIT_SUM                         1716428           0     0.0  float64
AMT_CREDIT_SUM_DEBT                    1716428           0     0.0  float64
AMT_CREDIT_S

In [25]:
del bureau_df
del bureau_balance_df
del tmp
del status_dummies
del status_counts
gc.collect()

0

### Joined data prep

In [26]:
# OHE con get_dummies
to_keep_numeric_and_boolean = bureau_df_join.select_dtypes(include=["number", "bool"]).columns.tolist()
to_do_ohe = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']
bureau_df_join = bureau_df_join[to_keep_numeric_and_boolean + to_do_ohe].copy()
gc.collect()
bureau_df_categorical = pd.get_dummies(bureau_df_join[to_do_ohe], dtype="int8")
bureau_df_join = pd.concat([bureau_df_join[to_keep_numeric_and_boolean], bureau_df_categorical], axis=1)

In [27]:
# bureau_df_join_ohe.columns
df_info_summary(bureau_df_join)

                                                    Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                                                 1716428           0     0.0    int64
SK_ID_BUREAU                                               1716428           0     0.0    int64
DAYS_CREDIT                                                1716428           0     0.0    int64
CREDIT_DAY_OVERDUE                                         1716428           0     0.0    int64
DAYS_CREDIT_ENDDATE                                        1716428           0     0.0  float64
DAYS_ENDDATE_FACT                                          1716428           0     0.0  float64
AMT_CREDIT_MAX_OVERDUE                                     1716428           0     0.0  float64
CNT_CREDIT_PROLONG                                         1716428           0     0.0    int64
AMT_CREDIT_SUM                                             1716428           0     0.0  float64
AMT_CREDIT_SUM_DEBT                     

In [28]:
len(bureau_df_join) 

1716428

In [29]:
bureau_df_join.columns
bureau_df_join.shape

(1716428, 45)

In [31]:
bureau_df_agg = resumir_por_id(
    bureau_df_join,
    id_col='SK_ID_CURR',
    excluir_cols=['SK_ID_BUREAU'],
    verbose=True,
    nombre_conteo='bureau_records'
)

# Mostrar las primeras filas
bureau_df_agg.head()

Columnas excluidas: ['SK_ID_BUREAU']
Columnas resumidas: ['AMT_ANNUITY', 'AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM_OVERDUE', 'CNT_CREDIT_PROLONG', 'CREDIT_ACTIVE_Active', 'CREDIT_ACTIVE_Bad debt', 'CREDIT_ACTIVE_Closed', 'CREDIT_ACTIVE_Sold', 'CREDIT_CURRENCY_currency 1', 'CREDIT_CURRENCY_currency 2', 'CREDIT_CURRENCY_currency 3', 'CREDIT_CURRENCY_currency 4', 'CREDIT_DAY_OVERDUE', 'CREDIT_TYPE_Another type of loan', 'CREDIT_TYPE_Car loan', 'CREDIT_TYPE_Cash loan (non-earmarked)', 'CREDIT_TYPE_Consumer credit', 'CREDIT_TYPE_Credit card', 'CREDIT_TYPE_Interbank credit', 'CREDIT_TYPE_Loan for business development', 'CREDIT_TYPE_Loan for purchase of shares (margin lending)', 'CREDIT_TYPE_Loan for the purchase of equipment', 'CREDIT_TYPE_Loan for working capital replenishment', 'CREDIT_TYPE_Microloan', 'CREDIT_TYPE_Mobile operator loan', 'CREDIT_TYPE_Mortgage', 'CREDIT_TYPE_Real estate loan', 'CREDIT_TYPE_Unknown type of loan

Unnamed: 0,SK_ID_CURR,DAYS_CREDIT_mean,DAYS_CREDIT_min,DAYS_CREDIT_max,DAYS_CREDIT_median,DAYS_CREDIT_sum,CREDIT_DAY_OVERDUE_mean,CREDIT_DAY_OVERDUE_min,CREDIT_DAY_OVERDUE_max,CREDIT_DAY_OVERDUE_median,CREDIT_DAY_OVERDUE_sum,DAYS_CREDIT_ENDDATE_mean,DAYS_CREDIT_ENDDATE_min,DAYS_CREDIT_ENDDATE_max,DAYS_CREDIT_ENDDATE_median,DAYS_CREDIT_ENDDATE_sum,DAYS_ENDDATE_FACT_mean,DAYS_ENDDATE_FACT_min,DAYS_ENDDATE_FACT_max,DAYS_ENDDATE_FACT_median,DAYS_ENDDATE_FACT_sum,AMT_CREDIT_MAX_OVERDUE_mean,AMT_CREDIT_MAX_OVERDUE_min,AMT_CREDIT_MAX_OVERDUE_max,AMT_CREDIT_MAX_OVERDUE_median,AMT_CREDIT_MAX_OVERDUE_sum,CNT_CREDIT_PROLONG_mean,CNT_CREDIT_PROLONG_min,CNT_CREDIT_PROLONG_max,CNT_CREDIT_PROLONG_median,CNT_CREDIT_PROLONG_sum,AMT_CREDIT_SUM_mean,AMT_CREDIT_SUM_min,AMT_CREDIT_SUM_max,AMT_CREDIT_SUM_median,AMT_CREDIT_SUM_sum,AMT_CREDIT_SUM_DEBT_mean,AMT_CREDIT_SUM_DEBT_min,AMT_CREDIT_SUM_DEBT_max,AMT_CREDIT_SUM_DEBT_median,AMT_CREDIT_SUM_DEBT_sum,AMT_CREDIT_SUM_LIMIT_mean,AMT_CREDIT_SUM_LIMIT_min,AMT_CREDIT_SUM_LIMIT_max,AMT_CREDIT_SUM_LIMIT_median,AMT_CREDIT_SUM_LIMIT_sum,AMT_CREDIT_SUM_OVERDUE_mean,AMT_CREDIT_SUM_OVERDUE_min,AMT_CREDIT_SUM_OVERDUE_max,AMT_CREDIT_SUM_OVERDUE_median,AMT_CREDIT_SUM_OVERDUE_sum,DAYS_CREDIT_UPDATE_mean,DAYS_CREDIT_UPDATE_min,DAYS_CREDIT_UPDATE_max,DAYS_CREDIT_UPDATE_median,DAYS_CREDIT_UPDATE_sum,AMT_ANNUITY_mean,AMT_ANNUITY_min,AMT_ANNUITY_max,AMT_ANNUITY_median,AMT_ANNUITY_sum,MONTHS_WITH_STATUS_DPD_0_mean,MONTHS_WITH_STATUS_DPD_0_min,MONTHS_WITH_STATUS_DPD_0_max,MONTHS_WITH_STATUS_DPD_0_median,MONTHS_WITH_STATUS_DPD_0_sum,MONTHS_WITH_STATUS_DPD_1to30_mean,MONTHS_WITH_STATUS_DPD_1to30_min,MONTHS_WITH_STATUS_DPD_1to30_max,MONTHS_WITH_STATUS_DPD_1to30_median,MONTHS_WITH_STATUS_DPD_1to30_sum,MONTHS_WITH_STATUS_DPD_31to60_mean,MONTHS_WITH_STATUS_DPD_31to60_min,MONTHS_WITH_STATUS_DPD_31to60_max,MONTHS_WITH_STATUS_DPD_31to60_median,MONTHS_WITH_STATUS_DPD_31to60_sum,MONTHS_WITH_STATUS_DPD_61to90_mean,MONTHS_WITH_STATUS_DPD_61to90_min,MONTHS_WITH_STATUS_DPD_61to90_max,MONTHS_WITH_STATUS_DPD_61to90_median,MONTHS_WITH_STATUS_DPD_61to90_sum,MONTHS_WITH_STATUS_DPD_91to120_mean,MONTHS_WITH_STATUS_DPD_91to120_min,MONTHS_WITH_STATUS_DPD_91to120_max,MONTHS_WITH_STATUS_DPD_91to120_median,MONTHS_WITH_STATUS_DPD_91to120_sum,MONTHS_WITH_STATUS_DPD_over120_mean,MONTHS_WITH_STATUS_DPD_over120_min,MONTHS_WITH_STATUS_DPD_over120_max,MONTHS_WITH_STATUS_DPD_over120_median,MONTHS_WITH_STATUS_DPD_over120_sum,MONTHS_WITH_STATUS_CLOSED_mean,MONTHS_WITH_STATUS_CLOSED_min,MONTHS_WITH_STATUS_CLOSED_max,MONTHS_WITH_STATUS_CLOSED_median,MONTHS_WITH_STATUS_CLOSED_sum,MONTHS_WITH_STATUS_UNKNOWN_mean,MONTHS_WITH_STATUS_UNKNOWN_min,MONTHS_WITH_STATUS_UNKNOWN_max,MONTHS_WITH_STATUS_UNKNOWN_median,MONTHS_WITH_STATUS_UNKNOWN_sum,CREDIT_ACTIVE_Active_mean,CREDIT_ACTIVE_Active_min,CREDIT_ACTIVE_Active_max,CREDIT_ACTIVE_Active_median,CREDIT_ACTIVE_Active_sum,CREDIT_ACTIVE_Bad debt_mean,CREDIT_ACTIVE_Bad debt_min,CREDIT_ACTIVE_Bad debt_max,CREDIT_ACTIVE_Bad debt_median,CREDIT_ACTIVE_Bad debt_sum,CREDIT_ACTIVE_Closed_mean,CREDIT_ACTIVE_Closed_min,CREDIT_ACTIVE_Closed_max,CREDIT_ACTIVE_Closed_median,CREDIT_ACTIVE_Closed_sum,CREDIT_ACTIVE_Sold_mean,CREDIT_ACTIVE_Sold_min,CREDIT_ACTIVE_Sold_max,CREDIT_ACTIVE_Sold_median,CREDIT_ACTIVE_Sold_sum,CREDIT_CURRENCY_currency 1_mean,CREDIT_CURRENCY_currency 1_min,CREDIT_CURRENCY_currency 1_max,CREDIT_CURRENCY_currency 1_median,CREDIT_CURRENCY_currency 1_sum,CREDIT_CURRENCY_currency 2_mean,CREDIT_CURRENCY_currency 2_min,CREDIT_CURRENCY_currency 2_max,CREDIT_CURRENCY_currency 2_median,CREDIT_CURRENCY_currency 2_sum,CREDIT_CURRENCY_currency 3_mean,CREDIT_CURRENCY_currency 3_min,CREDIT_CURRENCY_currency 3_max,CREDIT_CURRENCY_currency 3_median,CREDIT_CURRENCY_currency 3_sum,CREDIT_CURRENCY_currency 4_mean,CREDIT_CURRENCY_currency 4_min,CREDIT_CURRENCY_currency 4_max,CREDIT_CURRENCY_currency 4_median,CREDIT_CURRENCY_currency 4_sum,CREDIT_TYPE_Another type of loan_mean,CREDIT_TYPE_Another type of loan_min,CREDIT_TYPE_Another type of loan_max,CREDIT_TYPE_Another type of loan_median,CREDIT_TYPE_Another type of loan_sum,CREDIT_TYPE_Car loan_mean,CREDIT_TYPE_Car loan_min,CREDIT_TYPE_Car loan_max,CREDIT_TYPE_Car loan_median,CREDIT_TYPE_Car loan_sum,CREDIT_TYPE_Cash loan (non-earmarked)_mean,CREDIT_TYPE_Cash loan (non-earmarked)_min,CREDIT_TYPE_Cash loan (non-earmarked)_max,CREDIT_TYPE_Cash loan (non-earmarked)_median,CREDIT_TYPE_Cash loan (non-earmarked)_sum,CREDIT_TYPE_Consumer credit_mean,CREDIT_TYPE_Consumer credit_min,CREDIT_TYPE_Consumer credit_max,CREDIT_TYPE_Consumer credit_median,CREDIT_TYPE_Consumer credit_sum,CREDIT_TYPE_Credit card_mean,CREDIT_TYPE_Credit card_min,CREDIT_TYPE_Credit card_max,CREDIT_TYPE_Credit card_median,CREDIT_TYPE_Credit card_sum,CREDIT_TYPE_Interbank credit_mean,CREDIT_TYPE_Interbank credit_min,CREDIT_TYPE_Interbank credit_max,CREDIT_TYPE_Interbank credit_median,CREDIT_TYPE_Interbank credit_sum,CREDIT_TYPE_Loan for business development_mean,CREDIT_TYPE_Loan for business development_min,CREDIT_TYPE_Loan for business development_max,CREDIT_TYPE_Loan for business development_median,CREDIT_TYPE_Loan for business development_sum,CREDIT_TYPE_Loan for purchase of shares (margin lending)_mean,CREDIT_TYPE_Loan for purchase of shares (margin lending)_min,CREDIT_TYPE_Loan for purchase of shares (margin lending)_max,CREDIT_TYPE_Loan for purchase of shares (margin lending)_median,CREDIT_TYPE_Loan for purchase of shares (margin lending)_sum,CREDIT_TYPE_Loan for the purchase of equipment_mean,CREDIT_TYPE_Loan for the purchase of equipment_min,CREDIT_TYPE_Loan for the purchase of equipment_max,CREDIT_TYPE_Loan for the purchase of equipment_median,CREDIT_TYPE_Loan for the purchase of equipment_sum,CREDIT_TYPE_Loan for working capital replenishment_mean,CREDIT_TYPE_Loan for working capital replenishment_min,CREDIT_TYPE_Loan for working capital replenishment_max,CREDIT_TYPE_Loan for working capital replenishment_median,CREDIT_TYPE_Loan for working capital replenishment_sum,CREDIT_TYPE_Microloan_mean,CREDIT_TYPE_Microloan_min,CREDIT_TYPE_Microloan_max,CREDIT_TYPE_Microloan_median,CREDIT_TYPE_Microloan_sum,CREDIT_TYPE_Mobile operator loan_mean,CREDIT_TYPE_Mobile operator loan_min,CREDIT_TYPE_Mobile operator loan_max,CREDIT_TYPE_Mobile operator loan_median,CREDIT_TYPE_Mobile operator loan_sum,CREDIT_TYPE_Mortgage_mean,CREDIT_TYPE_Mortgage_min,CREDIT_TYPE_Mortgage_max,CREDIT_TYPE_Mortgage_median,CREDIT_TYPE_Mortgage_sum,CREDIT_TYPE_Real estate loan_mean,CREDIT_TYPE_Real estate loan_min,CREDIT_TYPE_Real estate loan_max,CREDIT_TYPE_Real estate loan_median,CREDIT_TYPE_Real estate loan_sum,CREDIT_TYPE_Unknown type of loan_mean,CREDIT_TYPE_Unknown type of loan_min,CREDIT_TYPE_Unknown type of loan_max,CREDIT_TYPE_Unknown type of loan_median,CREDIT_TYPE_Unknown type of loan_sum,bureau_records_count
0,100001,-735.0,-1572,-49,-857.0,-5145,0.0,0,0,0.0,0,82.428571,-1329.0,1778.0,-179.0,577.0,-471.714286,-1328.0,0.0,-544.0,-3302.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,207623.571429,85500.0,378000.0,168345.0,1453365.0,85240.928571,0.0,373239.0,0.0,596686.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-93.142857,-155,-6,-155.0,-652,3545.357143,0.0,10822.5,0.0,24817.5,4.428571,1,12,2.0,31,0.142857,0,1,0.0,1,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,15.714286,0,44,18.0,110,4.285714,0,9,6.0,30,0.428571,0,1,0.0,3,0.0,0,0,0.0,0,0.571429,0,1,1.0,4,0.0,0,0,0.0,0,1.0,1,1,1.0,7,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,1.0,1,1,1.0,7,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,7
1,100002,-874.0,-1437,-103,-1042.5,-6992,0.0,0,0,0.0,0,-261.75,-1072.0,780.0,0.0,-2094.0,-523.125,-1185.0,0.0,-479.5,-4185.0,1050.643125,0.0,5043.645,0.0,8405.145,0.0,0,0,0.0,0,108131.945625,0.0,450000.0,54130.5,865055.565,30722.625,0.0,245781.0,0.0,245781.0,3998.570625,0.0,31988.565,0.0,31988.565,0.0,0.0,0.0,0.0,0.0,-499.875,-1185,-7,-402.5,-3999,0.0,0.0,0.0,0.0,0.0,5.625,2,18,5.0,45,3.375,0,6,4.0,27,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,2.875,0,13,2.0,23,1.875,0,3,2.5,15,0.25,0,1,0.0,2,0.0,0,0,0.0,0,0.75,0,1,1.0,6,0.0,0,0,0.0,0,1.0,1,1,1.0,8,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.5,0,1,0.5,4,0.5,0,1,0.5,4,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,8
2,100003,-1400.75,-2586,-606,-1205.5,-5603,0.0,0,0,0.0,0,-544.5,-2434.0,1216.0,-480.0,-2178.0,-823.0,-2131.0,0.0,-580.5,-3292.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,254350.125,22248.0,810000.0,92576.25,1017400.5,0.0,0.0,0.0,0.0,0.0,202500.0,0.0,810000.0,0.0,810000.0,0.0,0.0,0.0,0.0,0.0,-816.0,-2131,-43,-545.0,-3264,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.25,0,1,0.0,1,0.0,0,0,0.0,0,0.75,0,1,1.0,3,0.0,0,0,0.0,0,1.0,1,1,1.0,4,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.5,0,1,0.5,2,0.5,0,1,0.5,2,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,4
3,100004,-867.0,-1326,-408,-867.0,-1734,0.0,0,0,0.0,0,-488.5,-595.0,-382.0,-488.5,-977.0,-532.5,-683.0,-382.0,-532.5,-1065.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,94518.9,94500.0,94537.8,94518.9,189037.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-532.0,-682,-382,-532.0,-1064,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,1.0,1,1,1.0,2,0.0,0,0,0.0,0,1.0,1,1,1.0,2,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,1.0,1,1,1.0,2,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,2
4,100005,-190.666667,-373,-62,-137.0,-572,0.0,0,0,0.0,0,439.333333,-128.0,1324.0,122.0,1318.0,-41.0,-123.0,0.0,0.0,-123.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,219042.0,29826.0,568800.0,58500.0,657126.0,189469.5,0.0,543087.0,25321.5,568408.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-54.333333,-121,-11,-31.0,-163,1420.5,0.0,4261.5,0.0,4261.5,4.666667,2,7,5.0,14,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,1.666667,0,5,0.0,5,0.666667,0,1,1.0,2,0.666667,0,1,1.0,2,0.0,0,0,0.0,0,0.333333,0,1,0.0,1,0.0,0,0,0.0,0,1.0,1,1,1.0,3,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.666667,0,1,1.0,2,0.333333,0,1,0.0,1,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,3


In [32]:
bureau_df_agg.shape

(305811, 217)

In [33]:
df_info_summary(bureau_df_agg)

                                                    Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                                                  305811           0     0.0    int64
DAYS_CREDIT_mean                                            305811           0     0.0  float64
DAYS_CREDIT_min                                             305811           0     0.0    int64
DAYS_CREDIT_max                                             305811           0     0.0    int64
DAYS_CREDIT_median                                          305811           0     0.0  float64
DAYS_CREDIT_sum                                             305811           0     0.0    int64
CREDIT_DAY_OVERDUE_mean                                     305811           0     0.0  float64
CREDIT_DAY_OVERDUE_min                                      305811           0     0.0    int64
CREDIT_DAY_OVERDUE_max                                      305811           0     0.0    int64
CREDIT_DAY_OVERDUE_median               

In [34]:
bureau_df_agg['SK_ID_CURR'].nunique()

305811

## Previous application data

### Data import and overview

In [35]:
# Create data profiles for bureau and bureau_balance datasets

previous_application_df = pd.read_csv(os.path.join(PATH, "home-credit-default-risk/previous_application.csv"))
pos_cash_balance_df = pd.read_csv(os.path.join(PATH, "home-credit-default-risk/POS_CASH_balance.csv"))
installments_payments_df = pd.read_csv(os.path.join(PATH, "home-credit-default-risk/installments_payments.csv"))
credit_card_balance_df = pd.read_csv(os.path.join(PATH, "home-credit-default-risk/credit_card_balance.csv"))

# data_profiling(previous_application_df, "previous_application_df.html")
# data_profiling(pos_cash_balance_df, "pos_cash_balance_df.html")
# data_profiling(installments_payments_df, "installments_payments_df.html")
# data_profiling(credit_card_balance_df, "credit_card_balance_df.html")

In [36]:
# Mostrar las primeras filas de previous_application_df
previous_application_df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,Y,1,0.0,0.182832,0.867336,XAP,Approved,-73,Cash through the bank,XAP,,Repeater,Mobile,POS,XNA,Country-wide,35,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,Y,1,,,,XNA,Approved,-164,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Contact center,-1,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,Y,1,,,,XNA,Approved,-301,Cash through the bank,XAP,"Spouse, partner",Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,Y,1,,,,XNA,Approved,-512,Cash through the bank,XAP,,Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,Y,1,,,,Repairs,Refused,-781,Cash through the bank,HC,,Repeater,XNA,Cash,walk-in,Credit and cash offices,-1,XNA,24.0,high,Cash Street: high,,,,,,


In [37]:
len(previous_application_df)

1670214

In [38]:
# Mostrar las primeras filas de pos_cash_balance_df
pos_cash_balance_df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [39]:
len(pos_cash_balance_df)

10001358

In [40]:
# Mostrar las primeras filas de installments_payments_df
installments_payments_df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


In [41]:
len(installments_payments_df)

13605401

In [42]:
# Mostrar las primeras filas de credit_card_balance_df
credit_card_balance_df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,2562384,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,1800.0,1800.0,0.0,0.0,0.0,0.0,1,0.0,1.0,35.0,Active,0,0
1,2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,2250.0,2250.0,60175.08,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
2,1740877,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,2250.0,2250.0,26926.425,31460.085,31460.085,0.0,0,0.0,0.0,30.0,Active,0,0
3,1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,11925.0,11925.0,224949.285,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
4,1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,27000.0,27000.0,443044.395,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0


In [43]:
len(credit_card_balance_df)

3840312

### Data prep - previous_application_df 

In [44]:
# Info sobre las columnas del previous_application_df

"""
•	SK_ID_PREV → ID de la aplicación previa.
•	SK_ID_CURR → ID del préstamo actual en nuestro dataset.
•	NAME_CONTRACT_TYPE → Tipo de producto solicitado (Cash loan, POS loan, etc.).
•	AMT_ANNUITY → Cuota periódica (anualidad) de esa solicitud.
•	AMT_APPLICATION → Monto que el cliente pidió originalmente.
•	AMT_CREDIT → Monto finalmente aprobado (puede diferir de lo solicitado).
•	AMT_DOWN_PAYMENT → Pago inicial hecho por el cliente.
•	AMT_GOODS_PRICE → Valor de los bienes financiados (si aplica).
•	WEEKDAY_APPR_PROCESS_START → Día de la semana en que se inició la aplicación.
•	HOUR_APPR_PROCESS_START → Hora del día de inicio (aprox., redondeada).
•	FLAG_LAST_APPL_PER_CONTRACT → Marca si fue la última solicitud para ese contrato.
•	NFLAG_LAST_APPL_IN_DAY → Marca si fue la última aplicación del cliente en ese día.
•	NFLAG_MICRO_CASH → Flag si era un microcrédito.
•	RATE_DOWN_PAYMENT → Porcentaje de pago inicial (normalizado).
•	RATE_INTEREST_PRIMARY / PRIVILEGED → Tasas de interés aplicables (normalizadas).
•	NAME_CASH_LOAN_PURPOSE → Propósito del préstamo en efectivo (educación, auto, etc.).
•	NAME_CONTRACT_STATUS → Estado de la aplicación (Approved, Refused, Canceled, etc.).
•	DAYS_DECISION → Días relativos a la aplicación actual en que se tomó la decisión.
•	NAME_PAYMENT_TYPE → Método de pago (Cash, Bank transfer, etc.).
•	CODE_REJECT_REASON → Razón de rechazo (CLIENT, HC, SCO, etc.).
•	NAME_TYPE_SUITE → Con quién estaba el cliente (Family, Alone, etc.).
•	NAME_CLIENT_TYPE → Si era cliente nuevo o recurrente.
•	NAME_GOODS_CATEGORY → Categoría del bien solicitado (Electronics, Furniture, etc.).
•	NAME_PORTFOLIO → Cartera (POS, Cash, Car, etc.).
•	NAME_PRODUCT_TYPE → Tipo de producto (X-Sell, Walk-in, etc.).
•	CHANNEL_TYPE → Canal de aplicación (Credit agent, Online, etc.).
•	SELLERPLACE_AREA → Tamaño del área de ventas del vendedor.
•	NAME_SELLER_INDUSTRY → Industria del vendedor.
•	CNT_PAYMENT → Número de pagos previstos (plazo).
•	NAME_YIELD_GROUP → Clasificación de la tasa de interés (baja, media, alta).
•	PRODUCT_COMBINATION → Detalle de la combinación de productos.
•	DAYS_FIRST_DRAWING → Días hasta la primera disposición de fondos.
•	DAYS_FIRST_DUE → Días hasta el primer pago esperado.
•	DAYS_LAST_DUE_1ST_VERSION → Último vencimiento esperado (versión inicial).
•	DAYS_LAST_DUE → Último vencimiento esperado (versión final).
•	DAYS_TERMINATION → Días hasta la finalización esperada del contrato.
•	NFLAG_INSURED_ON_APPROVAL → Si el cliente solicitó seguro.

"""
df_info_summary(previous_application_df)

                             Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                          1670214           0    0.00    int64
SK_ID_CURR                          1670214           0    0.00    int64
NAME_CONTRACT_TYPE                  1670214           0    0.00   object
AMT_ANNUITY                         1297979      372235   22.29  float64
AMT_APPLICATION                     1670214           0    0.00  float64
AMT_CREDIT                          1670213           1    0.00  float64
AMT_DOWN_PAYMENT                     774370      895844   53.64  float64
AMT_GOODS_PRICE                     1284699      385515   23.08  float64
WEEKDAY_APPR_PROCESS_START          1670214           0    0.00   object
HOUR_APPR_PROCESS_START             1670214           0    0.00    int64
FLAG_LAST_APPL_PER_CONTRACT         1670214           0    0.00   object
NFLAG_LAST_APPL_IN_DAY              1670214           0    0.00    int64
RATE_DOWN_PAYMENT                    774370      89

In [45]:
# previous_application_df.loc[previous_application_df["AMT_CREDIT"].isna(), ["NAME_CONTRACT_STATUS"]].value_counts()
previous_application_df.loc[previous_application_df["AMT_CREDIT"].isna()]

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
1127152,2204450,438387,Revolving loans,0.0,0.0,,,,FRIDAY,10,Y,1,,,,XAP,Approved,-608,XNA,XAP,,Repeater,XNA,Cards,walk-in,Country-wide,20,Connectivity,0.0,XNA,Card Street,,,,,,


In [46]:
# Por ahora reemplazo nan con ceros, pero habría que ver si se puede mejorar
previous_application_df = previous_application_df.fillna(0)
print("Columnas con valores NaN despues de rellenar:")
print(previous_application_df.columns[previous_application_df.isna().any()].tolist())

Columnas con valores NaN despues de rellenar:
[]


In [47]:
df_info_summary(previous_application_df)

                             Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                          1670214           0     0.0    int64
SK_ID_CURR                          1670214           0     0.0    int64
NAME_CONTRACT_TYPE                  1670214           0     0.0   object
AMT_ANNUITY                         1670214           0     0.0  float64
AMT_APPLICATION                     1670214           0     0.0  float64
AMT_CREDIT                          1670214           0     0.0  float64
AMT_DOWN_PAYMENT                    1670214           0     0.0  float64
AMT_GOODS_PRICE                     1670214           0     0.0  float64
WEEKDAY_APPR_PROCESS_START          1670214           0     0.0   object
HOUR_APPR_PROCESS_START             1670214           0     0.0    int64
FLAG_LAST_APPL_PER_CONTRACT         1670214           0     0.0   object
NFLAG_LAST_APPL_IN_DAY              1670214           0     0.0    int64
RATE_DOWN_PAYMENT                   1670214        

In [48]:
print("Count distinct of SK_ID_PREV: ", previous_application_df["SK_ID_PREV"].nunique())
print("Count distinct of SK_ID_CURR: ", previous_application_df["SK_ID_CURR"].nunique())

Count distinct of SK_ID_PREV:  1670214
Count distinct of SK_ID_CURR:  338857


In [49]:
# OHE de columnas categóricas
# Identificar columnas categóricas
cat_cols = previous_application_df.select_dtypes(include=["object"]).columns.tolist()
print("Columnas categóricas:", cat_cols)

# Aplicar One Hot Encoding
previous_application_ohe = pd.get_dummies(previous_application_df, columns=cat_cols, dummy_na=True)

previous_application_ohe.shape

Columnas categóricas: ['NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'FLAG_LAST_APPL_PER_CONTRACT', 'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE', 'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE', 'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION']


(1670214, 182)

In [50]:
df_info_summary(previous_application_ohe)

                                                    Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                                                 1670214           0     0.0    int64
SK_ID_CURR                                                 1670214           0     0.0    int64
AMT_ANNUITY                                                1670214           0     0.0  float64
AMT_APPLICATION                                            1670214           0     0.0  float64
AMT_CREDIT                                                 1670214           0     0.0  float64
AMT_DOWN_PAYMENT                                           1670214           0     0.0  float64
AMT_GOODS_PRICE                                            1670214           0     0.0  float64
HOUR_APPR_PROCESS_START                                    1670214           0     0.0    int64
NFLAG_LAST_APPL_IN_DAY                                     1670214           0     0.0    int64
RATE_DOWN_PAYMENT                       

In [51]:
previous_application_summary = resumir_por_id(
    previous_application_ohe,
    id_col='SK_ID_CURR',
    excluir_cols=None,
    verbose=True,
    nombre_conteo='previous_application_records'
)

# Mostrar las primeras filas
previous_application_summary.head()

Columnas excluidas: []
Columnas resumidas: ['AMT_ANNUITY', 'AMT_APPLICATION', 'AMT_CREDIT', 'AMT_DOWN_PAYMENT', 'AMT_GOODS_PRICE', 'CNT_PAYMENT', 'DAYS_DECISION', 'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE', 'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_TERMINATION', 'HOUR_APPR_PROCESS_START', 'NFLAG_INSURED_ON_APPROVAL', 'NFLAG_LAST_APPL_IN_DAY', 'RATE_DOWN_PAYMENT', 'RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED', 'SELLERPLACE_AREA', 'SK_ID_PREV']


Unnamed: 0,SK_ID_CURR,SK_ID_PREV_mean,SK_ID_PREV_min,SK_ID_PREV_max,SK_ID_PREV_median,SK_ID_PREV_sum,AMT_ANNUITY_mean,AMT_ANNUITY_min,AMT_ANNUITY_max,AMT_ANNUITY_median,AMT_ANNUITY_sum,AMT_APPLICATION_mean,AMT_APPLICATION_min,AMT_APPLICATION_max,AMT_APPLICATION_median,AMT_APPLICATION_sum,AMT_CREDIT_mean,AMT_CREDIT_min,AMT_CREDIT_max,AMT_CREDIT_median,AMT_CREDIT_sum,AMT_DOWN_PAYMENT_mean,AMT_DOWN_PAYMENT_min,AMT_DOWN_PAYMENT_max,AMT_DOWN_PAYMENT_median,AMT_DOWN_PAYMENT_sum,AMT_GOODS_PRICE_mean,AMT_GOODS_PRICE_min,AMT_GOODS_PRICE_max,AMT_GOODS_PRICE_median,AMT_GOODS_PRICE_sum,HOUR_APPR_PROCESS_START_mean,HOUR_APPR_PROCESS_START_min,HOUR_APPR_PROCESS_START_max,HOUR_APPR_PROCESS_START_median,HOUR_APPR_PROCESS_START_sum,NFLAG_LAST_APPL_IN_DAY_mean,NFLAG_LAST_APPL_IN_DAY_min,NFLAG_LAST_APPL_IN_DAY_max,NFLAG_LAST_APPL_IN_DAY_median,NFLAG_LAST_APPL_IN_DAY_sum,RATE_DOWN_PAYMENT_mean,RATE_DOWN_PAYMENT_min,RATE_DOWN_PAYMENT_max,RATE_DOWN_PAYMENT_median,RATE_DOWN_PAYMENT_sum,RATE_INTEREST_PRIMARY_mean,RATE_INTEREST_PRIMARY_min,RATE_INTEREST_PRIMARY_max,RATE_INTEREST_PRIMARY_median,RATE_INTEREST_PRIMARY_sum,RATE_INTEREST_PRIVILEGED_mean,RATE_INTEREST_PRIVILEGED_min,RATE_INTEREST_PRIVILEGED_max,RATE_INTEREST_PRIVILEGED_median,RATE_INTEREST_PRIVILEGED_sum,DAYS_DECISION_mean,DAYS_DECISION_min,DAYS_DECISION_max,DAYS_DECISION_median,DAYS_DECISION_sum,SELLERPLACE_AREA_mean,SELLERPLACE_AREA_min,SELLERPLACE_AREA_max,SELLERPLACE_AREA_median,SELLERPLACE_AREA_sum,CNT_PAYMENT_mean,CNT_PAYMENT_min,CNT_PAYMENT_max,CNT_PAYMENT_median,CNT_PAYMENT_sum,DAYS_FIRST_DRAWING_mean,DAYS_FIRST_DRAWING_min,DAYS_FIRST_DRAWING_max,DAYS_FIRST_DRAWING_median,DAYS_FIRST_DRAWING_sum,DAYS_FIRST_DUE_mean,DAYS_FIRST_DUE_min,DAYS_FIRST_DUE_max,DAYS_FIRST_DUE_median,DAYS_FIRST_DUE_sum,DAYS_LAST_DUE_1ST_VERSION_mean,DAYS_LAST_DUE_1ST_VERSION_min,DAYS_LAST_DUE_1ST_VERSION_max,DAYS_LAST_DUE_1ST_VERSION_median,DAYS_LAST_DUE_1ST_VERSION_sum,DAYS_LAST_DUE_mean,DAYS_LAST_DUE_min,DAYS_LAST_DUE_max,DAYS_LAST_DUE_median,DAYS_LAST_DUE_sum,DAYS_TERMINATION_mean,DAYS_TERMINATION_min,DAYS_TERMINATION_max,DAYS_TERMINATION_median,DAYS_TERMINATION_sum,NFLAG_INSURED_ON_APPROVAL_mean,NFLAG_INSURED_ON_APPROVAL_min,NFLAG_INSURED_ON_APPROVAL_max,NFLAG_INSURED_ON_APPROVAL_median,NFLAG_INSURED_ON_APPROVAL_sum,previous_application_records_count
0,100001,1369693.0,1369693,1369693,1369693.0,1369693,3951.0,3951.0,3951.0,3951.0,3951.0,24835.5,24835.5,24835.5,24835.5,24835.5,23787.0,23787.0,23787.0,23787.0,23787.0,2520.0,2520.0,2520.0,2520.0,2520.0,24835.5,24835.5,24835.5,24835.5,24835.5,13.0,13,13,13.0,13,1.0,1,1,1.0,1,0.104326,0.104326,0.104326,0.104326,0.104326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1740.0,-1740,-1740,-1740.0,-1740,23.0,23,23,23.0,23,8.0,8.0,8.0,8.0,8.0,365243.0,365243.0,365243.0,365243.0,365243.0,-1709.0,-1709.0,-1709.0,-1709.0,-1709.0,-1499.0,-1499.0,-1499.0,-1499.0,-1499.0,-1619.0,-1619.0,-1619.0,-1619.0,-1619.0,-1612.0,-1612.0,-1612.0,-1612.0,-1612.0,0.0,0.0,0.0,0.0,0.0,1
1,100002,1038818.0,1038818,1038818,1038818.0,1038818,9251.775,9251.775,9251.775,9251.775,9251.775,179055.0,179055.0,179055.0,179055.0,179055.0,179055.0,179055.0,179055.0,179055.0,179055.0,0.0,0.0,0.0,0.0,0.0,179055.0,179055.0,179055.0,179055.0,179055.0,9.0,9,9,9.0,9,1.0,1,1,1.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-606.0,-606,-606,-606.0,-606,500.0,500,500,500.0,500,24.0,24.0,24.0,24.0,24.0,365243.0,365243.0,365243.0,365243.0,365243.0,-565.0,-565.0,-565.0,-565.0,-565.0,125.0,125.0,125.0,125.0,125.0,-25.0,-25.0,-25.0,-25.0,-25.0,-17.0,-17.0,-17.0,-17.0,-17.0,0.0,0.0,0.0,0.0,0.0,1
2,100003,2281150.0,1810518,2636178,2396755.0,6843451,56553.99,6737.31,98356.995,64567.665,169661.97,435436.5,68809.5,900000.0,337500.0,1306309.5,484191.0,68053.5,1035882.0,348637.5,1452573.0,2295.0,0.0,6885.0,0.0,6885.0,435436.5,68809.5,900000.0,337500.0,1306309.5,14.666667,12,17,15.0,44,1.0,1,1,1.0,3,0.033354,0.0,0.100061,0.0,0.100061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1305.0,-2341,-746,-828.0,-3915,533.0,-1,1400,200.0,1599,10.0,6.0,12.0,12.0,30.0,365243.0,365243.0,365243.0,365243.0,1095729.0,-1274.333333,-2310.0,-716.0,-797.0,-3823.0,-1004.333333,-1980.0,-386.0,-647.0,-3013.0,-1054.333333,-1980.0,-536.0,-647.0,-3163.0,-1047.333333,-1976.0,-527.0,-639.0,-3142.0,0.666667,0.0,1.0,1.0,2.0,3
3,100004,1564014.0,1564014,1564014,1564014.0,1564014,5357.25,5357.25,5357.25,5357.25,5357.25,24282.0,24282.0,24282.0,24282.0,24282.0,20106.0,20106.0,20106.0,20106.0,20106.0,4860.0,4860.0,4860.0,4860.0,4860.0,24282.0,24282.0,24282.0,24282.0,24282.0,5.0,5,5,5.0,5,1.0,1,1,1.0,1,0.212008,0.212008,0.212008,0.212008,0.212008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-815.0,-815,-815,-815.0,-815,30.0,30,30,30.0,30,4.0,4.0,4.0,4.0,4.0,365243.0,365243.0,365243.0,365243.0,365243.0,-784.0,-784.0,-784.0,-784.0,-784.0,-694.0,-694.0,-694.0,-694.0,-694.0,-724.0,-724.0,-724.0,-724.0,-724.0,-714.0,-714.0,-714.0,-714.0,-714.0,0.0,0.0,0.0,0.0,0.0,1
4,100005,2176837.0,1857999,2495675,2176837.0,4353674,2406.6,0.0,4813.2,2406.6,4813.2,22308.75,0.0,44617.5,22308.75,44617.5,20076.75,0.0,40153.5,20076.75,40153.5,2232.0,0.0,4464.0,2232.0,4464.0,22308.75,0.0,44617.5,22308.75,44617.5,10.5,10,11,10.5,21,1.0,1,1,1.0,2,0.054482,0.0,0.108964,0.054482,0.108964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-536.0,-757,-315,-536.0,-1072,18.0,-1,37,18.0,36,6.0,0.0,12.0,6.0,12.0,182621.5,0.0,365243.0,182621.5,365243.0,-353.0,-706.0,0.0,-353.0,-706.0,-188.0,-376.0,0.0,-188.0,-376.0,-233.0,-466.0,0.0,-233.0,-466.0,-230.0,-460.0,0.0,-230.0,-460.0,0.0,0.0,0.0,0.0,0.0,2


In [52]:
previous_application_summary.shape

(338857, 102)

In [53]:
del previous_application_ohe
del previous_application_df
gc.collect()

0

### Data prep - pos_cash_balance

In [54]:
len(pos_cash_balance_df)

10001358

In [55]:
# Info sobre las columnas del pos_cash_balance_df
"""
•	SK_ID_PREV → ID del crédito previo en Home Credit (un préstamo en application puede tener 0, 1 o varios).
•	SK_ID_CURR → ID del préstamo en nuestro dataset principal (application).
•	MONTHS_BALANCE → Mes del reporte relativo a la fecha de aplicación del préstamo actual:
    0 = mes de la aplicación
    -1 = un mes antes
•	CNT_INSTALMENT → Número total de cuotas originalmente previstas (puede variar si se reestructura).
•	CNT_INSTALMENT_FUTURE → Número de cuotas pendientes en ese momento.
•	NAME_CONTRACT_STATUS → Estado del contrato ese mes (ej. Active, Completed, Signed, etc.).
•	SK_DPD → Days Past Due = cantidad de días de atraso en ese mes.
•	SK_DPD_DEF → Days Past Due con tolerancia (se ignoran deudas pequeñas).
"""
df_info_summary(pos_cash_balance_df)

                       Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                   10001358           0    0.00    int64
SK_ID_CURR                   10001358           0    0.00    int64
MONTHS_BALANCE               10001358           0    0.00    int64
CNT_INSTALMENT                9975287       26071    0.26  float64
CNT_INSTALMENT_FUTURE         9975271       26087    0.26  float64
NAME_CONTRACT_STATUS         10001358           0    0.00   object
SK_DPD                       10001358           0    0.00    int64
SK_DPD_DEF                   10001358           0    0.00    int64


In [56]:
print("Count distinct of SK_ID_PREV: ", pos_cash_balance_df["SK_ID_PREV"].nunique())
print("Count distinct of SK_ID_CURR: ", pos_cash_balance_df["SK_ID_CURR"].nunique())

Count distinct of SK_ID_PREV:  936325
Count distinct of SK_ID_CURR:  337252


In [57]:
# Por ahora reemplazo nan con ceros, pero habría que ver si se puede mejorar
pos_cash_balance_df = pos_cash_balance_df.fillna(0)
print("Columnas con valores NaN despues de rellenar:")
print(pos_cash_balance_df.columns[pos_cash_balance_df.isna().any()].tolist())

Columnas con valores NaN despues de rellenar:
[]


In [58]:
df_info_summary(pos_cash_balance_df)

                       Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                   10001358           0     0.0    int64
SK_ID_CURR                   10001358           0     0.0    int64
MONTHS_BALANCE               10001358           0     0.0    int64
CNT_INSTALMENT               10001358           0     0.0  float64
CNT_INSTALMENT_FUTURE        10001358           0     0.0  float64
NAME_CONTRACT_STATUS         10001358           0     0.0   object
SK_DPD                       10001358           0     0.0    int64
SK_DPD_DEF                   10001358           0     0.0    int64


In [59]:
# Crear tabla de conteo de status por SK_ID_PREV
# OHE con get_dummies
status_dummies = pd.get_dummies(pos_cash_balance_df["NAME_CONTRACT_STATUS"], prefix="STATUS_", dtype="int8")

# Concatenar pos_cash_balance_df
pos_cash_balance_df = pd.concat([pos_cash_balance_df, status_dummies], axis=1)

print("Filas de pos_cash_balance_df despues de agregar dummies:", len(pos_cash_balance_df))
pos_cash_balance_df.head()  

Filas de pos_cash_balance_df despues de agregar dummies: 10001358


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF,STATUS__Active,STATUS__Amortized debt,STATUS__Approved,STATUS__Canceled,STATUS__Completed,STATUS__Demand,STATUS__Returned to the store,STATUS__Signed,STATUS__XNA
0,1803195,182943,-31,48.0,45.0,Active,0,0,1,0,0,0,0,0,0,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0,1,0,0,0,0,0,0,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0,1,0,0,0,0,0,0,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0,1,0,0,0,0,0,0,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0,1,0,0,0,0,0,0,0,0


In [60]:
df_info_summary(pos_cash_balance_df)

                               Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                           10001358           0     0.0    int64
SK_ID_CURR                           10001358           0     0.0    int64
MONTHS_BALANCE                       10001358           0     0.0    int64
CNT_INSTALMENT                       10001358           0     0.0  float64
CNT_INSTALMENT_FUTURE                10001358           0     0.0  float64
NAME_CONTRACT_STATUS                 10001358           0     0.0   object
SK_DPD                               10001358           0     0.0    int64
SK_DPD_DEF                           10001358           0     0.0    int64
STATUS__Active                       10001358           0     0.0     int8
STATUS__Amortized debt               10001358           0     0.0     int8
STATUS__Approved                     10001358           0     0.0     int8
STATUS__Canceled                     10001358           0     0.0     int8
STATUS__Completed        

In [61]:
del status_dummies
gc.collect()

0

In [62]:
pos_cash_balance_df_agg = resumir_por_id(
    pos_cash_balance_df,
    id_col='SK_ID_CURR',
    excluir_cols=['SK_ID_PREV'],
    verbose=True,
    nombre_conteo='pos_cash_balance_records'
)

# Mostrar las primeras filas
pos_cash_balance_df_agg.head()

Columnas excluidas: ['SK_ID_PREV']
Columnas resumidas: ['CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE', 'MONTHS_BALANCE', 'SK_DPD', 'SK_DPD_DEF', 'STATUS__Active', 'STATUS__Amortized debt', 'STATUS__Approved', 'STATUS__Canceled', 'STATUS__Completed', 'STATUS__Demand', 'STATUS__Returned to the store', 'STATUS__Signed', 'STATUS__XNA']


Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE_mean,MONTHS_BALANCE_min,MONTHS_BALANCE_max,MONTHS_BALANCE_median,MONTHS_BALANCE_sum,CNT_INSTALMENT_mean,CNT_INSTALMENT_min,CNT_INSTALMENT_max,CNT_INSTALMENT_median,CNT_INSTALMENT_sum,CNT_INSTALMENT_FUTURE_mean,CNT_INSTALMENT_FUTURE_min,CNT_INSTALMENT_FUTURE_max,CNT_INSTALMENT_FUTURE_median,CNT_INSTALMENT_FUTURE_sum,SK_DPD_mean,SK_DPD_min,SK_DPD_max,SK_DPD_median,SK_DPD_sum,SK_DPD_DEF_mean,SK_DPD_DEF_min,SK_DPD_DEF_max,SK_DPD_DEF_median,SK_DPD_DEF_sum,STATUS__Active_mean,STATUS__Active_min,STATUS__Active_max,STATUS__Active_median,STATUS__Active_sum,STATUS__Amortized debt_mean,STATUS__Amortized debt_min,STATUS__Amortized debt_max,STATUS__Amortized debt_median,STATUS__Amortized debt_sum,STATUS__Approved_mean,STATUS__Approved_min,STATUS__Approved_max,STATUS__Approved_median,STATUS__Approved_sum,STATUS__Canceled_mean,STATUS__Canceled_min,STATUS__Canceled_max,STATUS__Canceled_median,STATUS__Canceled_sum,STATUS__Completed_mean,STATUS__Completed_min,STATUS__Completed_max,STATUS__Completed_median,STATUS__Completed_sum,STATUS__Demand_mean,STATUS__Demand_min,STATUS__Demand_max,STATUS__Demand_median,STATUS__Demand_sum,STATUS__Returned to the store_mean,STATUS__Returned to the store_min,STATUS__Returned to the store_max,STATUS__Returned to the store_median,STATUS__Returned to the store_sum,STATUS__Signed_mean,STATUS__Signed_min,STATUS__Signed_max,STATUS__Signed_median,STATUS__Signed_sum,STATUS__XNA_mean,STATUS__XNA_min,STATUS__XNA_max,STATUS__XNA_median,STATUS__XNA_sum,pos_cash_balance_records_count
0,100001,-72.555556,-96,-53,-57.0,-653,4.0,4.0,4.0,4.0,36.0,1.444444,0.0,4.0,1.0,13.0,0.777778,0,7,0.0,7,0.777778,0,7,0.0,7,0.777778,0,1,1.0,7,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.222222,0,1,0.0,2,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,9
1,100002,-10.0,-19,-1,-10.0,-190,24.0,24.0,24.0,24.0,456.0,15.0,6.0,24.0,15.0,285.0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,1.0,1,1,1.0,19,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,19
2,100003,-43.785714,-77,-18,-26.5,-1226,10.107143,6.0,12.0,12.0,283.0,5.785714,0.0,12.0,6.0,162.0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.928571,0,1,1.0,26,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.071429,0,1,0.0,2,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,28
3,100004,-25.5,-27,-24,-25.5,-102,3.75,3.0,4.0,4.0,15.0,2.25,0.0,4.0,2.5,9.0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.75,0,1,1.0,3,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.25,0,1,0.0,1,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,4
4,100005,-20.0,-25,-15,-20.0,-220,10.636364,0.0,12.0,12.0,117.0,6.545455,0.0,12.0,7.0,72.0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.818182,0,1,1.0,9,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.090909,0,1,0.0,1,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.090909,0,1,0.0,1,0.0,0,0,0.0,0,11


In [63]:
len(pos_cash_balance_df_agg)

337252

In [64]:
df_info_summary(pos_cash_balance_df_agg)

                                      Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                                    337252           0     0.0    int64
MONTHS_BALANCE_mean                           337252           0     0.0  float64
MONTHS_BALANCE_min                            337252           0     0.0    int64
MONTHS_BALANCE_max                            337252           0     0.0    int64
MONTHS_BALANCE_median                         337252           0     0.0  float64
MONTHS_BALANCE_sum                            337252           0     0.0    int64
CNT_INSTALMENT_mean                           337252           0     0.0  float64
CNT_INSTALMENT_min                            337252           0     0.0  float64
CNT_INSTALMENT_max                            337252           0     0.0  float64
CNT_INSTALMENT_median                         337252           0     0.0  float64
CNT_INSTALMENT_sum                            337252           0     0.0  float64
CNT_INSTALMENT_F

In [65]:
print("Count distinct of SK_ID_PREV: ", pos_cash_balance_df_agg["SK_ID_CURR"].nunique())

Count distinct of SK_ID_PREV:  337252


In [66]:
del pos_cash_balance_df
gc.collect()

0

### Data prep - installments_payments

In [67]:
len(installments_payments_df)

13605401

In [68]:
# Info sobre las columnas del installments_payments_df
"""
Histórico de pagos de cuotas de créditos anteriores.
•	SK_ID_PREV → ID del crédito previo.
•	SK_ID_CURR → ID del préstamo actual.
•	NUM_INSTALMENT_VERSION → Versión del calendario de pagos (0 si es tarjeta de crédito). Un cambio significa renegociación.
•	NUM_INSTALMENT_NUMBER → Número de cuota (1, 2, 3, …).
•	DAYS_INSTALMENT → Día en que debía pagarse la cuota (relativo a aplicación actual).
•	DAYS_ENTRY_PAYMENT → Día en que efectivamente se pagó (NaN si no se pagó).
•	AMT_INSTALMENT → Monto esperado de la cuota.
•	AMT_PAYMENT → Monto realmente pagado.
"""
df_info_summary(installments_payments_df)

                        Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                    13605401           0    0.00    int64
SK_ID_CURR                    13605401           0    0.00    int64
NUM_INSTALMENT_VERSION        13605401           0    0.00  float64
NUM_INSTALMENT_NUMBER         13605401           0    0.00    int64
DAYS_INSTALMENT               13605401           0    0.00  float64
DAYS_ENTRY_PAYMENT            13602496        2905    0.02  float64
AMT_INSTALMENT                13605401           0    0.00  float64
AMT_PAYMENT                   13602496        2905    0.02  float64


In [69]:
# Verificamos un ejemplo de filas con DAYS_ENTRY_PAYMENT NaN. Aparenteemente no se abonaron esas cuotas
installments_payments_df.loc[installments_payments_df["DAYS_ENTRY_PAYMENT"].isna()].head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
3764207,1531600,103793,1.0,7,-668.0,,49741.02,
3764208,1947105,159974,1.0,24,-36.0,,22849.515,
3764209,1843773,167270,1.0,22,-20.0,,48092.355,
3764210,1691592,192536,1.0,5,-2561.0,,7675.425,
3764211,1531299,157088,0.0,11,-1847.0,,67.5,


In [70]:
installments_payments_df.loc[installments_payments_df["SK_ID_PREV"] == 1531600].sort_values(by=["DAYS_INSTALMENT"], ascending=False) 

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
3794812,1531600,103793,1.0,27,-68.0,,49741.02,
3764294,1531600,103793,1.0,26,-98.0,,49741.02,
3764214,1531600,103793,1.0,25,-128.0,,49741.02,
3799630,1531600,103793,1.0,24,-158.0,,49741.02,
3774325,1531600,103793,1.0,23,-188.0,,49741.02,
3784883,1531600,103793,1.0,22,-218.0,,49741.02,
3764227,1531600,103793,1.0,21,-248.0,,49741.02,
3764224,1531600,103793,1.0,20,-278.0,,49741.02,
3779550,1531600,103793,1.0,19,-308.0,,49741.02,
3779605,1531600,103793,1.0,18,-338.0,,49741.02,


In [71]:
"""
Por lo visto anteriormente, los NaN en DAYS_ENTRY_PAYMENT y AMT_PAYMENT indican cuotas no pagadas. 
Vamos a crear las siguientes columnas:
- instalments_unpaid: cantidad de cuotas no abonadas (DAYS_ENTRY_PAYMENT no NaN)
- instalments_partially_paid: cantidad de cuotas abonadas parcialmente (DAYS_ENTRY_PAYMENT no NaN y AMT_PAYMENT < AMT_INSTALMENT)
- instalments_overdue: cantidad de cuotas vencidas y no pagadas (DAYS_ENTRY_PAYMENT NaN y DAYS_INSTALMENT < 0)
- amount debt: monto adeudado (AMT_INSTALMENT - AMT_PAYMENT) si no se pagó y si se pagó menos de lo debido
- dpd: días de atraso en el pago:
    - Si se pagó (DAYS_ENTRY_PAYMENT no es NaN): max(0, DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT)
    - Si no se pagó y está vencido: abs(DAYS_INSTALMENT) si DAYS_INSTALMENT < 0
"""

installments_payments_df["instalments_unpaid"] = installments_payments_df["DAYS_ENTRY_PAYMENT"].isna().astype(int)

installments_payments_df["instalments_partially_paid"] = (
    (~installments_payments_df["DAYS_ENTRY_PAYMENT"].isna()) & 
    (installments_payments_df["AMT_PAYMENT"] < installments_payments_df["AMT_INSTALMENT"]) &
    (installments_payments_df["AMT_PAYMENT"] > 0)
).astype(int)

installments_payments_df["instalments_overdue"] = ((installments_payments_df["DAYS_ENTRY_PAYMENT"].isna()) & (installments_payments_df["DAYS_INSTALMENT"] < 0)).astype(int)

# Calcular monto adeudado
installments_payments_df["amount_debt"] = installments_payments_df.apply(
    # lambda row: (row["AMT_INSTALMENT"] - row["AMT_PAYMENT"]) if (pd.notna(row["AMT_PAYMENT"]) or row["AMT_PAYMENT"] < row["AMT_INSTALMENT"]) else 0,
    # axis=1
    lambda row: max(0, row["AMT_INSTALMENT"] - row["AMT_PAYMENT"]) if pd.notna(row["AMT_PAYMENT"]) 
    else abs(row["AMT_INSTALMENT"]), 
    axis=1
).round(2)

# Calcular dpd considerando ambos casos
installments_payments_df["dpd"] = installments_payments_df.apply(
    lambda row: max(0, row["DAYS_ENTRY_PAYMENT"] - row["DAYS_INSTALMENT"]) if pd.notna(row["DAYS_ENTRY_PAYMENT"]) 
    else abs(row["DAYS_INSTALMENT"]) if row["DAYS_INSTALMENT"] < 0 else 0, 
    axis=1
).astype(int)

installments_payments_df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,instalments_unpaid,instalments_partially_paid,instalments_overdue,amount_debt,dpd
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36,0,0,0,0.0,0
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525,0,0,0,0.0,0
2,2085231,193053,2.0,1,-63.0,-63.0,25425.0,25425.0,0,0,0,0.0,0
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13,0,0,0,0.0,0
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585,0,1,0,4.45,17


In [72]:
installments_payments_df.loc[installments_payments_df["SK_ID_PREV"] == 1531600].sort_values(by=["DAYS_INSTALMENT"], ascending=False) 

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT,instalments_unpaid,instalments_partially_paid,instalments_overdue,amount_debt,dpd
3794812,1531600,103793,1.0,27,-68.0,,49741.02,,1,0,1,49741.02,68
3764294,1531600,103793,1.0,26,-98.0,,49741.02,,1,0,1,49741.02,98
3764214,1531600,103793,1.0,25,-128.0,,49741.02,,1,0,1,49741.02,128
3799630,1531600,103793,1.0,24,-158.0,,49741.02,,1,0,1,49741.02,158
3774325,1531600,103793,1.0,23,-188.0,,49741.02,,1,0,1,49741.02,188
3784883,1531600,103793,1.0,22,-218.0,,49741.02,,1,0,1,49741.02,218
3764227,1531600,103793,1.0,21,-248.0,,49741.02,,1,0,1,49741.02,248
3764224,1531600,103793,1.0,20,-278.0,,49741.02,,1,0,1,49741.02,278
3779550,1531600,103793,1.0,19,-308.0,,49741.02,,1,0,1,49741.02,308
3779605,1531600,103793,1.0,18,-338.0,,49741.02,,1,0,1,49741.02,338


In [73]:
# Removemos columnas con NaN ya reemplazadas con las nuevas columnas
installments_payments_df = installments_payments_df.drop(columns=['DAYS_ENTRY_PAYMENT', 'AMT_PAYMENT'])

# Pasamos NUM_INSTALMENT_VERSION a int32
installments_payments_df['NUM_INSTALMENT_VERSION'] = installments_payments_df['NUM_INSTALMENT_VERSION'].astype('int32')

In [74]:
print("Count distinct of SK_ID_PREV: ", installments_payments_df["SK_ID_CURR"].nunique())

Count distinct of SK_ID_PREV:  339587


In [75]:
df_info_summary(installments_payments_df)

                            Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                        13605401           0     0.0    int64
SK_ID_CURR                        13605401           0     0.0    int64
NUM_INSTALMENT_VERSION            13605401           0     0.0    int32
NUM_INSTALMENT_NUMBER             13605401           0     0.0    int64
DAYS_INSTALMENT                   13605401           0     0.0  float64
AMT_INSTALMENT                    13605401           0     0.0  float64
instalments_unpaid                13605401           0     0.0    int64
instalments_partially_paid        13605401           0     0.0    int64
instalments_overdue               13605401           0     0.0    int64
amount_debt                       13605401           0     0.0  float64
dpd                               13605401           0     0.0    int64


In [76]:
installments_payments_df_agg = resumir_por_id(
    installments_payments_df,
    id_col='SK_ID_CURR',
    excluir_cols=['SK_ID_PREV'],
    verbose=True,
    nombre_conteo='installments_payments_records'
)

# Mostrar las primeras filas
installments_payments_df_agg.head()

Columnas excluidas: ['SK_ID_PREV']
Columnas resumidas: ['AMT_INSTALMENT', 'DAYS_INSTALMENT', 'NUM_INSTALMENT_NUMBER', 'NUM_INSTALMENT_VERSION', 'amount_debt', 'dpd', 'instalments_overdue', 'instalments_partially_paid', 'instalments_unpaid']


Unnamed: 0,SK_ID_CURR,NUM_INSTALMENT_VERSION_mean,NUM_INSTALMENT_VERSION_min,NUM_INSTALMENT_VERSION_max,NUM_INSTALMENT_VERSION_median,NUM_INSTALMENT_VERSION_sum,NUM_INSTALMENT_NUMBER_mean,NUM_INSTALMENT_NUMBER_min,NUM_INSTALMENT_NUMBER_max,NUM_INSTALMENT_NUMBER_median,NUM_INSTALMENT_NUMBER_sum,DAYS_INSTALMENT_mean,DAYS_INSTALMENT_min,DAYS_INSTALMENT_max,DAYS_INSTALMENT_median,DAYS_INSTALMENT_sum,AMT_INSTALMENT_mean,AMT_INSTALMENT_min,AMT_INSTALMENT_max,AMT_INSTALMENT_median,AMT_INSTALMENT_sum,instalments_unpaid_mean,instalments_unpaid_min,instalments_unpaid_max,instalments_unpaid_median,instalments_unpaid_sum,instalments_partially_paid_mean,instalments_partially_paid_min,instalments_partially_paid_max,instalments_partially_paid_median,instalments_partially_paid_sum,instalments_overdue_mean,instalments_overdue_min,instalments_overdue_max,instalments_overdue_median,instalments_overdue_sum,amount_debt_mean,amount_debt_min,amount_debt_max,amount_debt_median,amount_debt_sum,dpd_mean,dpd_min,dpd_max,dpd_median,dpd_sum,installments_payments_records_count
0,100001,1.142857,1,2,1.0,8,2.714286,1,4,3.0,19,-2187.714286,-2916.0,-1619.0,-1709.0,-15314.0,5885.132143,3951.0,17397.9,3980.925,41195.925,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,1.571429,0,11,0.0,11,7
1,100002,1.052632,1,2,1.0,20,10.0,1,19,10.0,190,-295.0,-565.0,-25.0,-295.0,-5605.0,11559.247105,9251.775,53093.745,9251.775,219625.695,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,19
2,100003,1.04,1,2,1.0,26,5.08,1,12,5.0,127,-1378.16,-2310.0,-536.0,-797.0,-34454.0,64754.586,6662.97,560835.36,64275.615,1618864.65,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,25
3,100004,1.333333,1,2,1.0,4,2.0,1,3,2.0,6,-754.0,-784.0,-724.0,-754.0,-2262.0,7096.155,5357.25,10573.965,5357.25,21288.465,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,3
4,100005,1.111111,1,2,1.0,10,5.0,1,9,5.0,45,-586.0,-706.0,-466.0,-586.0,-5274.0,6240.205,4813.2,17656.245,4813.2,56161.845,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.111111,0,1,0.0,1,9


In [77]:
len(installments_payments_df_agg)

339587

In [78]:
df_info_summary(installments_payments_df_agg)

                                     Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                                   339587           0     0.0    int64
NUM_INSTALMENT_VERSION_mean                  339587           0     0.0  float64
NUM_INSTALMENT_VERSION_min                   339587           0     0.0    int32
NUM_INSTALMENT_VERSION_max                   339587           0     0.0    int32
NUM_INSTALMENT_VERSION_median                339587           0     0.0  float64
NUM_INSTALMENT_VERSION_sum                   339587           0     0.0    int32
NUM_INSTALMENT_NUMBER_mean                   339587           0     0.0  float64
NUM_INSTALMENT_NUMBER_min                    339587           0     0.0    int64
NUM_INSTALMENT_NUMBER_max                    339587           0     0.0    int64
NUM_INSTALMENT_NUMBER_median                 339587           0     0.0  float64
NUM_INSTALMENT_NUMBER_sum                    339587           0     0.0    int64
DAYS_INSTALMENT_mean        

In [79]:
print("Count distinct of SK_ID_CURR: ", installments_payments_df_agg["SK_ID_CURR"].nunique())

Count distinct of SK_ID_CURR:  339587


In [80]:
del installments_payments_df
gc.collect()

0

### Data prep - credit_card_balance

In [81]:
len(credit_card_balance_df)

3840312

In [82]:
# Info sobre las columnas del credit_card_balance_df
"""
Histórico mensual del comportamiento de las tarjetas de crédito.
•	SK_ID_PREV → ID del crédito previo (relacionado a tarjeta).
•	SK_ID_CURR → ID del préstamo en nuestro dataset principal.
•	MONTHS_BALANCE → Mes relativo a la aplicación actual (0 = aplicación, -1 = un mes antes).
•	AMT_BALANCE → Balance actual de la tarjeta ese mes.
•	AMT_CREDIT_LIMIT_ACTUAL → Límite de crédito vigente ese mes.
•	AMT_DRAWINGS_ATM_CURRENT → Monto retirado en cajero automático ese mes.
•	AMT_DRAWINGS_CURRENT → Monto total retirado ese mes (ATM + POS + otros).
•	AMT_DRAWINGS_OTHER_CURRENT → Monto retirado en canales distintos a cajero o POS.
•	AMT_DRAWINGS_POS_CURRENT → Monto gastado en POS (compras).
•	AMT_INST_MIN_REGULARITY → Pago mínimo requerido ese mes.
•	AMT_PAYMENT_CURRENT → Pago realizado ese mes.
•	AMT_PAYMENT_TOTAL_CURRENT → Pagos totales realizados ese mes (suma de todos los abonos).
•	AMT_RECEIVABLE_PRINCIPAL → Principal pendiente de pago.
•	AMT_RECIVABLE → Total pendiente (principal + intereses).
•	AMT_TOTAL_RECEIVABLE → Variante de cálculo del total pendiente.
•	CNT_DRAWINGS_ATM_CURRENT → Número de retiros en cajero ese mes.
•	CNT_DRAWINGS_CURRENT → Número total de operaciones con la tarjeta ese mes.
•	CNT_DRAWINGS_OTHER_CURRENT → Número de operaciones en otros canales.
•	CNT_DRAWINGS_POS_CURRENT → Número de operaciones de compra en POS.
•	CNT_INSTALMENT_MATURE_CUM → Número acumulado de cuotas ya pagadas.
•	NAME_CONTRACT_STATUS → Estado del contrato (ej. Active, Completed).
•	SK_DPD → Días de atraso ese mes.
•	SK_DPD_DEF → Días de atraso con criterio más estricto (ignora deudas pequeñas).
"""
df_info_summary(credit_card_balance_df)

                            Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                         3840312           0    0.00    int64
SK_ID_CURR                         3840312           0    0.00    int64
MONTHS_BALANCE                     3840312           0    0.00    int64
AMT_BALANCE                        3840312           0    0.00  float64
AMT_CREDIT_LIMIT_ACTUAL            3840312           0    0.00    int64
AMT_DRAWINGS_ATM_CURRENT           3090496      749816   19.52  float64
AMT_DRAWINGS_CURRENT               3840312           0    0.00  float64
AMT_DRAWINGS_OTHER_CURRENT         3090496      749816   19.52  float64
AMT_DRAWINGS_POS_CURRENT           3090496      749816   19.52  float64
AMT_INST_MIN_REGULARITY            3535076      305236    7.95  float64
AMT_PAYMENT_CURRENT                3072324      767988   20.00  float64
AMT_PAYMENT_TOTAL_CURRENT          3840312           0    0.00  float64
AMT_RECEIVABLE_PRINCIPAL           3840312           0    0.00  

In [83]:
# Por ahora reemplazamos nan con ceros, pero habría que ver si se puede mejorar.
# En este caso parece razonable rellenar con ceros obsvervando la informacion que arrojan las columnas con NaN.
credit_card_balance_df = credit_card_balance_df.fillna(0)
print("Columnas con valores NaN despues de rellenar:")
print(credit_card_balance_df.columns[credit_card_balance_df.isna().any()].tolist())

Columnas con valores NaN despues de rellenar:
[]


In [84]:
print("Count distinct of SK_ID_PREV: ", credit_card_balance_df["SK_ID_PREV"].nunique())
print("Count distinct of SK_ID_CURR: ", credit_card_balance_df["SK_ID_CURR"].nunique())

Count distinct of SK_ID_PREV:  104307
Count distinct of SK_ID_CURR:  103558


In [85]:
df_info_summary(credit_card_balance_df)

                            Non-Null Count  Null Count  % Null    Dtype
SK_ID_PREV                         3840312           0     0.0    int64
SK_ID_CURR                         3840312           0     0.0    int64
MONTHS_BALANCE                     3840312           0     0.0    int64
AMT_BALANCE                        3840312           0     0.0  float64
AMT_CREDIT_LIMIT_ACTUAL            3840312           0     0.0    int64
AMT_DRAWINGS_ATM_CURRENT           3840312           0     0.0  float64
AMT_DRAWINGS_CURRENT               3840312           0     0.0  float64
AMT_DRAWINGS_OTHER_CURRENT         3840312           0     0.0  float64
AMT_DRAWINGS_POS_CURRENT           3840312           0     0.0  float64
AMT_INST_MIN_REGULARITY            3840312           0     0.0  float64
AMT_PAYMENT_CURRENT                3840312           0     0.0  float64
AMT_PAYMENT_TOTAL_CURRENT          3840312           0     0.0  float64
AMT_RECEIVABLE_PRINCIPAL           3840312           0     0.0  

In [87]:
credit_card_balance_df_agg = resumir_por_id(
    credit_card_balance_df,
    id_col='SK_ID_CURR',
    excluir_cols=['SK_ID_PREV'],
    verbose=True,
    nombre_conteo='credit_card_balance_records'
)

# Mostrar las primeras filas
credit_card_balance_df_agg.head()

Columnas excluidas: ['SK_ID_PREV']
Columnas resumidas: ['AMT_BALANCE', 'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_DRAWINGS_ATM_CURRENT', 'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT', 'AMT_DRAWINGS_POS_CURRENT', 'AMT_INST_MIN_REGULARITY', 'AMT_PAYMENT_CURRENT', 'AMT_PAYMENT_TOTAL_CURRENT', 'AMT_RECEIVABLE_PRINCIPAL', 'AMT_RECIVABLE', 'AMT_TOTAL_RECEIVABLE', 'CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_CURRENT', 'CNT_DRAWINGS_OTHER_CURRENT', 'CNT_DRAWINGS_POS_CURRENT', 'CNT_INSTALMENT_MATURE_CUM', 'MONTHS_BALANCE', 'SK_DPD', 'SK_DPD_DEF', 'previous_application_records']


Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE_mean,MONTHS_BALANCE_min,MONTHS_BALANCE_max,MONTHS_BALANCE_median,MONTHS_BALANCE_sum,AMT_BALANCE_mean,AMT_BALANCE_min,AMT_BALANCE_max,AMT_BALANCE_median,AMT_BALANCE_sum,AMT_CREDIT_LIMIT_ACTUAL_mean,AMT_CREDIT_LIMIT_ACTUAL_min,AMT_CREDIT_LIMIT_ACTUAL_max,AMT_CREDIT_LIMIT_ACTUAL_median,AMT_CREDIT_LIMIT_ACTUAL_sum,AMT_DRAWINGS_ATM_CURRENT_mean,AMT_DRAWINGS_ATM_CURRENT_min,AMT_DRAWINGS_ATM_CURRENT_max,AMT_DRAWINGS_ATM_CURRENT_median,AMT_DRAWINGS_ATM_CURRENT_sum,AMT_DRAWINGS_CURRENT_mean,AMT_DRAWINGS_CURRENT_min,AMT_DRAWINGS_CURRENT_max,AMT_DRAWINGS_CURRENT_median,AMT_DRAWINGS_CURRENT_sum,AMT_DRAWINGS_OTHER_CURRENT_mean,AMT_DRAWINGS_OTHER_CURRENT_min,AMT_DRAWINGS_OTHER_CURRENT_max,AMT_DRAWINGS_OTHER_CURRENT_median,AMT_DRAWINGS_OTHER_CURRENT_sum,AMT_DRAWINGS_POS_CURRENT_mean,AMT_DRAWINGS_POS_CURRENT_min,AMT_DRAWINGS_POS_CURRENT_max,AMT_DRAWINGS_POS_CURRENT_median,AMT_DRAWINGS_POS_CURRENT_sum,AMT_INST_MIN_REGULARITY_mean,AMT_INST_MIN_REGULARITY_min,AMT_INST_MIN_REGULARITY_max,AMT_INST_MIN_REGULARITY_median,AMT_INST_MIN_REGULARITY_sum,AMT_PAYMENT_CURRENT_mean,AMT_PAYMENT_CURRENT_min,AMT_PAYMENT_CURRENT_max,AMT_PAYMENT_CURRENT_median,AMT_PAYMENT_CURRENT_sum,AMT_PAYMENT_TOTAL_CURRENT_mean,AMT_PAYMENT_TOTAL_CURRENT_min,AMT_PAYMENT_TOTAL_CURRENT_max,AMT_PAYMENT_TOTAL_CURRENT_median,AMT_PAYMENT_TOTAL_CURRENT_sum,AMT_RECEIVABLE_PRINCIPAL_mean,AMT_RECEIVABLE_PRINCIPAL_min,AMT_RECEIVABLE_PRINCIPAL_max,AMT_RECEIVABLE_PRINCIPAL_median,AMT_RECEIVABLE_PRINCIPAL_sum,AMT_RECIVABLE_mean,AMT_RECIVABLE_min,AMT_RECIVABLE_max,AMT_RECIVABLE_median,AMT_RECIVABLE_sum,AMT_TOTAL_RECEIVABLE_mean,AMT_TOTAL_RECEIVABLE_min,AMT_TOTAL_RECEIVABLE_max,AMT_TOTAL_RECEIVABLE_median,AMT_TOTAL_RECEIVABLE_sum,CNT_DRAWINGS_ATM_CURRENT_mean,CNT_DRAWINGS_ATM_CURRENT_min,CNT_DRAWINGS_ATM_CURRENT_max,CNT_DRAWINGS_ATM_CURRENT_median,CNT_DRAWINGS_ATM_CURRENT_sum,CNT_DRAWINGS_CURRENT_mean,CNT_DRAWINGS_CURRENT_min,CNT_DRAWINGS_CURRENT_max,CNT_DRAWINGS_CURRENT_median,CNT_DRAWINGS_CURRENT_sum,CNT_DRAWINGS_OTHER_CURRENT_mean,CNT_DRAWINGS_OTHER_CURRENT_min,CNT_DRAWINGS_OTHER_CURRENT_max,CNT_DRAWINGS_OTHER_CURRENT_median,CNT_DRAWINGS_OTHER_CURRENT_sum,CNT_DRAWINGS_POS_CURRENT_mean,CNT_DRAWINGS_POS_CURRENT_min,CNT_DRAWINGS_POS_CURRENT_max,CNT_DRAWINGS_POS_CURRENT_median,CNT_DRAWINGS_POS_CURRENT_sum,CNT_INSTALMENT_MATURE_CUM_mean,CNT_INSTALMENT_MATURE_CUM_min,CNT_INSTALMENT_MATURE_CUM_max,CNT_INSTALMENT_MATURE_CUM_median,CNT_INSTALMENT_MATURE_CUM_sum,SK_DPD_mean,SK_DPD_min,SK_DPD_max,SK_DPD_median,SK_DPD_sum,SK_DPD_DEF_mean,SK_DPD_DEF_min,SK_DPD_DEF_max,SK_DPD_DEF_median,SK_DPD_DEF_sum,previous_application_records_mean,previous_application_records_min,previous_application_records_max,previous_application_records_median,previous_application_records_sum,credit_card_balance_records_count
0,100006,-3.5,-6,-1,-3.5,-21,0.0,0.0,0.0,0.0,0.0,270000.0,270000,270000,270000.0,1620000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,1.0,1,1,1.0,6,6
1,100011,-38.5,-75,-2,-38.5,-2849,54482.111149,0.0,189000.0,0.0,4031676.225,164189.189189,90000,180000,180000.0,12150000,2432.432432,0.0,180000.0,0.0,180000.0,2432.432432,0.0,180000.0,0.0,180000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3902.759392,0.0,9000.0,0.0,288804.195,4843.064189,0.0,55485.0,563.355,358386.75,4520.067568,0.0,55485.0,0.0,334485.0,52402.088919,0.0,180000.0,0.0,3877754.58,54433.179122,-563.355,189000.0,0.0,4028055.255,54433.179122,-563.355,189000.0,0.0,4028055.255,0.054054,0.0,4.0,0.0,4.0,0.054054,0,4,0.0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.418919,0.0,33.0,33.0,1881.0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,1.0,1,1,1.0,74,74
2,100013,-48.5,-96,-1,-48.5,-4656,18159.919219,0.0,161420.22,0.0,1743352.245,131718.75,45000,157500,157500.0,12645000,5953.125,0.0,157500.0,0.0,571500.0,5953.125,0.0,157500.0,0.0,571500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1348.479375,0.0,7875.0,0.0,129454.02,7168.34625,0.0,153675.0,274.32,688161.24,6817.172344,0.0,153675.0,0.0,654448.545,17255.559844,0.0,157500.0,0.0,1656533.745,18101.079844,-274.32,161420.22,0.0,1737703.665,18101.079844,-274.32,161420.22,0.0,1737703.665,0.239583,0.0,7.0,0.0,23.0,0.239583,0,7,0.0,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.354167,0.0,22.0,22.0,1666.0,0.010417,0,1,0.0,1,0.010417,0,1,0.0,1,1.0,1,1,1.0,96,96
3,100021,-10.0,-18,-2,-10.0,-170,0.0,0.0,0.0,0.0,0.0,675000.0,675000,675000,675000.0,11475000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,1.0,1,1,1.0,17,17
4,100023,-7.5,-11,-4,-7.5,-60,0.0,0.0,0.0,0.0,0.0,135000.0,45000,225000,135000.0,1080000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0,0,0.0,0,1.0,1,1,1.0,8,8


In [88]:
len(credit_card_balance_df_agg)

103558

In [91]:
credit_card_balance_df_agg.shape

(103558, 107)

In [89]:
df_info_summary(credit_card_balance_df_agg)

                                     Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                                   103558           0     0.0    int64
MONTHS_BALANCE_mean                          103558           0     0.0  float64
MONTHS_BALANCE_min                           103558           0     0.0    int64
MONTHS_BALANCE_max                           103558           0     0.0    int64
MONTHS_BALANCE_median                        103558           0     0.0  float64
MONTHS_BALANCE_sum                           103558           0     0.0    int64
AMT_BALANCE_mean                             103558           0     0.0  float64
AMT_BALANCE_min                              103558           0     0.0  float64
AMT_BALANCE_max                              103558           0     0.0  float64
AMT_BALANCE_median                           103558           0     0.0  float64
AMT_BALANCE_sum                              103558           0     0.0  float64
AMT_CREDIT_LIMIT_ACTUAL_mean

In [90]:
del credit_card_balance_df
gc.collect()

0

## Join with current application data

In [92]:
train_df = pd.read_csv(os.path.join(PATH, "home-credit-default-risk/application_train.csv"))

# Detectar columnas con solo "y"/"n" y mapear a 1/0
bool_cols = [c for c in train_df.columns if set(train_df[c].dropna().unique()) <= {"y", "n"}]
if bool_cols:
    train_df[bool_cols] = train_df[bool_cols].apply(lambda s: s.map({"y": 1, "n": 0}).astype("int8"))

In [93]:
len(train_df)

307511

In [94]:
df_info_summary(train_df)

                              Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                            307511           0    0.00    int64
TARGET                                307511           0    0.00    int64
NAME_CONTRACT_TYPE                    307511           0    0.00   object
CODE_GENDER                           307511           0    0.00   object
FLAG_OWN_CAR                          307511           0    0.00   object
FLAG_OWN_REALTY                       307511           0    0.00   object
CNT_CHILDREN                          307511           0    0.00    int64
AMT_INCOME_TOTAL                      307511           0    0.00  float64
AMT_CREDIT                            307511           0    0.00  float64
AMT_ANNUITY                           307499          12    0.00  float64
AMT_GOODS_PRICE                       307233         278    0.09  float64
NAME_TYPE_SUITE                       306219        1292    0.42   object
NAME_INCOME_TYPE                      

In [95]:
print("Count distinct of SK_ID_CURR: ", train_df["SK_ID_CURR"].nunique())

Count distinct of SK_ID_CURR:  307511


In [96]:
# Antes de los joins, agregamos prefijos a las columnas de cada dataset para facilitar la identificación posterior
bureau_df_agg = bureau_df_agg.add_prefix("BB_")
previous_application_summary = previous_application_summary.add_prefix("PA_")
pos_cash_balance_df_agg = pos_cash_balance_df_agg.add_prefix("PCB_")    
installments_payments_df_agg = installments_payments_df_agg.add_prefix("IP_")
credit_card_balance_df_agg = credit_card_balance_df_agg.add_prefix("CCB_")

In [97]:
previous_application_summary.head()

Unnamed: 0,PA_SK_ID_CURR,PA_SK_ID_PREV_mean,PA_SK_ID_PREV_min,PA_SK_ID_PREV_max,PA_SK_ID_PREV_median,PA_SK_ID_PREV_sum,PA_AMT_ANNUITY_mean,PA_AMT_ANNUITY_min,PA_AMT_ANNUITY_max,PA_AMT_ANNUITY_median,PA_AMT_ANNUITY_sum,PA_AMT_APPLICATION_mean,PA_AMT_APPLICATION_min,PA_AMT_APPLICATION_max,PA_AMT_APPLICATION_median,PA_AMT_APPLICATION_sum,PA_AMT_CREDIT_mean,PA_AMT_CREDIT_min,PA_AMT_CREDIT_max,PA_AMT_CREDIT_median,PA_AMT_CREDIT_sum,PA_AMT_DOWN_PAYMENT_mean,PA_AMT_DOWN_PAYMENT_min,PA_AMT_DOWN_PAYMENT_max,PA_AMT_DOWN_PAYMENT_median,PA_AMT_DOWN_PAYMENT_sum,PA_AMT_GOODS_PRICE_mean,PA_AMT_GOODS_PRICE_min,PA_AMT_GOODS_PRICE_max,PA_AMT_GOODS_PRICE_median,PA_AMT_GOODS_PRICE_sum,PA_HOUR_APPR_PROCESS_START_mean,PA_HOUR_APPR_PROCESS_START_min,PA_HOUR_APPR_PROCESS_START_max,PA_HOUR_APPR_PROCESS_START_median,PA_HOUR_APPR_PROCESS_START_sum,PA_NFLAG_LAST_APPL_IN_DAY_mean,PA_NFLAG_LAST_APPL_IN_DAY_min,PA_NFLAG_LAST_APPL_IN_DAY_max,PA_NFLAG_LAST_APPL_IN_DAY_median,PA_NFLAG_LAST_APPL_IN_DAY_sum,PA_RATE_DOWN_PAYMENT_mean,PA_RATE_DOWN_PAYMENT_min,PA_RATE_DOWN_PAYMENT_max,PA_RATE_DOWN_PAYMENT_median,PA_RATE_DOWN_PAYMENT_sum,PA_RATE_INTEREST_PRIMARY_mean,PA_RATE_INTEREST_PRIMARY_min,PA_RATE_INTEREST_PRIMARY_max,PA_RATE_INTEREST_PRIMARY_median,PA_RATE_INTEREST_PRIMARY_sum,PA_RATE_INTEREST_PRIVILEGED_mean,PA_RATE_INTEREST_PRIVILEGED_min,PA_RATE_INTEREST_PRIVILEGED_max,PA_RATE_INTEREST_PRIVILEGED_median,PA_RATE_INTEREST_PRIVILEGED_sum,PA_DAYS_DECISION_mean,PA_DAYS_DECISION_min,PA_DAYS_DECISION_max,PA_DAYS_DECISION_median,PA_DAYS_DECISION_sum,PA_SELLERPLACE_AREA_mean,PA_SELLERPLACE_AREA_min,PA_SELLERPLACE_AREA_max,PA_SELLERPLACE_AREA_median,PA_SELLERPLACE_AREA_sum,PA_CNT_PAYMENT_mean,PA_CNT_PAYMENT_min,PA_CNT_PAYMENT_max,PA_CNT_PAYMENT_median,PA_CNT_PAYMENT_sum,PA_DAYS_FIRST_DRAWING_mean,PA_DAYS_FIRST_DRAWING_min,PA_DAYS_FIRST_DRAWING_max,PA_DAYS_FIRST_DRAWING_median,PA_DAYS_FIRST_DRAWING_sum,PA_DAYS_FIRST_DUE_mean,PA_DAYS_FIRST_DUE_min,PA_DAYS_FIRST_DUE_max,PA_DAYS_FIRST_DUE_median,PA_DAYS_FIRST_DUE_sum,PA_DAYS_LAST_DUE_1ST_VERSION_mean,PA_DAYS_LAST_DUE_1ST_VERSION_min,PA_DAYS_LAST_DUE_1ST_VERSION_max,PA_DAYS_LAST_DUE_1ST_VERSION_median,PA_DAYS_LAST_DUE_1ST_VERSION_sum,PA_DAYS_LAST_DUE_mean,PA_DAYS_LAST_DUE_min,PA_DAYS_LAST_DUE_max,PA_DAYS_LAST_DUE_median,PA_DAYS_LAST_DUE_sum,PA_DAYS_TERMINATION_mean,PA_DAYS_TERMINATION_min,PA_DAYS_TERMINATION_max,PA_DAYS_TERMINATION_median,PA_DAYS_TERMINATION_sum,PA_NFLAG_INSURED_ON_APPROVAL_mean,PA_NFLAG_INSURED_ON_APPROVAL_min,PA_NFLAG_INSURED_ON_APPROVAL_max,PA_NFLAG_INSURED_ON_APPROVAL_median,PA_NFLAG_INSURED_ON_APPROVAL_sum,PA_previous_application_records_count
0,100001,1369693.0,1369693,1369693,1369693.0,1369693,3951.0,3951.0,3951.0,3951.0,3951.0,24835.5,24835.5,24835.5,24835.5,24835.5,23787.0,23787.0,23787.0,23787.0,23787.0,2520.0,2520.0,2520.0,2520.0,2520.0,24835.5,24835.5,24835.5,24835.5,24835.5,13.0,13,13,13.0,13,1.0,1,1,1.0,1,0.104326,0.104326,0.104326,0.104326,0.104326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1740.0,-1740,-1740,-1740.0,-1740,23.0,23,23,23.0,23,8.0,8.0,8.0,8.0,8.0,365243.0,365243.0,365243.0,365243.0,365243.0,-1709.0,-1709.0,-1709.0,-1709.0,-1709.0,-1499.0,-1499.0,-1499.0,-1499.0,-1499.0,-1619.0,-1619.0,-1619.0,-1619.0,-1619.0,-1612.0,-1612.0,-1612.0,-1612.0,-1612.0,0.0,0.0,0.0,0.0,0.0,1
1,100002,1038818.0,1038818,1038818,1038818.0,1038818,9251.775,9251.775,9251.775,9251.775,9251.775,179055.0,179055.0,179055.0,179055.0,179055.0,179055.0,179055.0,179055.0,179055.0,179055.0,0.0,0.0,0.0,0.0,0.0,179055.0,179055.0,179055.0,179055.0,179055.0,9.0,9,9,9.0,9,1.0,1,1,1.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-606.0,-606,-606,-606.0,-606,500.0,500,500,500.0,500,24.0,24.0,24.0,24.0,24.0,365243.0,365243.0,365243.0,365243.0,365243.0,-565.0,-565.0,-565.0,-565.0,-565.0,125.0,125.0,125.0,125.0,125.0,-25.0,-25.0,-25.0,-25.0,-25.0,-17.0,-17.0,-17.0,-17.0,-17.0,0.0,0.0,0.0,0.0,0.0,1
2,100003,2281150.0,1810518,2636178,2396755.0,6843451,56553.99,6737.31,98356.995,64567.665,169661.97,435436.5,68809.5,900000.0,337500.0,1306309.5,484191.0,68053.5,1035882.0,348637.5,1452573.0,2295.0,0.0,6885.0,0.0,6885.0,435436.5,68809.5,900000.0,337500.0,1306309.5,14.666667,12,17,15.0,44,1.0,1,1,1.0,3,0.033354,0.0,0.100061,0.0,0.100061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1305.0,-2341,-746,-828.0,-3915,533.0,-1,1400,200.0,1599,10.0,6.0,12.0,12.0,30.0,365243.0,365243.0,365243.0,365243.0,1095729.0,-1274.333333,-2310.0,-716.0,-797.0,-3823.0,-1004.333333,-1980.0,-386.0,-647.0,-3013.0,-1054.333333,-1980.0,-536.0,-647.0,-3163.0,-1047.333333,-1976.0,-527.0,-639.0,-3142.0,0.666667,0.0,1.0,1.0,2.0,3
3,100004,1564014.0,1564014,1564014,1564014.0,1564014,5357.25,5357.25,5357.25,5357.25,5357.25,24282.0,24282.0,24282.0,24282.0,24282.0,20106.0,20106.0,20106.0,20106.0,20106.0,4860.0,4860.0,4860.0,4860.0,4860.0,24282.0,24282.0,24282.0,24282.0,24282.0,5.0,5,5,5.0,5,1.0,1,1,1.0,1,0.212008,0.212008,0.212008,0.212008,0.212008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-815.0,-815,-815,-815.0,-815,30.0,30,30,30.0,30,4.0,4.0,4.0,4.0,4.0,365243.0,365243.0,365243.0,365243.0,365243.0,-784.0,-784.0,-784.0,-784.0,-784.0,-694.0,-694.0,-694.0,-694.0,-694.0,-724.0,-724.0,-724.0,-724.0,-724.0,-714.0,-714.0,-714.0,-714.0,-714.0,0.0,0.0,0.0,0.0,0.0,1
4,100005,2176837.0,1857999,2495675,2176837.0,4353674,2406.6,0.0,4813.2,2406.6,4813.2,22308.75,0.0,44617.5,22308.75,44617.5,20076.75,0.0,40153.5,20076.75,40153.5,2232.0,0.0,4464.0,2232.0,4464.0,22308.75,0.0,44617.5,22308.75,44617.5,10.5,10,11,10.5,21,1.0,1,1,1.0,2,0.054482,0.0,0.108964,0.054482,0.108964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-536.0,-757,-315,-536.0,-1072,18.0,-1,37,18.0,36,6.0,0.0,12.0,6.0,12.0,182621.5,0.0,365243.0,182621.5,365243.0,-353.0,-706.0,0.0,-353.0,-706.0,-188.0,-376.0,0.0,-188.0,-376.0,-233.0,-466.0,0.0,-233.0,-466.0,-230.0,-460.0,0.0,-230.0,-460.0,0.0,0.0,0.0,0.0,0.0,2


In [98]:
# Join train_df con bureau_df
train_df_join = train_df.merge(bureau_df_agg, how="left", left_on="SK_ID_CURR", right_on="BB_SK_ID_CURR")
print("rows after join:", len(train_df_join))

rows after join: 307511


In [99]:
# Join train_df con previous_application_df
train_df_join = train_df_join.merge(previous_application_summary, how="left", left_on="SK_ID_CURR", right_on="PA_SK_ID_CURR")
print("rows after join:", len(train_df_join))

rows after join: 307511


In [100]:
# Join train_df con pos_cash_balance_df
train_df_join = train_df_join.merge(pos_cash_balance_df_agg, how="left", left_on="SK_ID_CURR", right_on="PCB_SK_ID_CURR")
print("rows after join:", len(train_df_join))

rows after join: 307511


In [101]:
# Join train_df con installments_payments_df
train_df_join = train_df_join.merge(installments_payments_df_agg, how="left", left_on="SK_ID_CURR", right_on="IP_SK_ID_CURR")
print("rows after join:", len(train_df_join))

rows after join: 307511


In [102]:
# Join train_df con credit_card_balance_df
train_df_join = train_df_join.merge(credit_card_balance_df_agg, how="left", left_on="SK_ID_CURR", right_on="CCB_SK_ID_CURR")
print("rows after join:", len(train_df_join))

rows after join: 307511


In [103]:
train_df_join.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,BB_SK_ID_CURR,BB_DAYS_CREDIT_mean,BB_DAYS_CREDIT_min,BB_DAYS_CREDIT_max,BB_DAYS_CREDIT_median,BB_DAYS_CREDIT_sum,BB_CREDIT_DAY_OVERDUE_mean,BB_CREDIT_DAY_OVERDUE_min,BB_CREDIT_DAY_OVERDUE_max,BB_CREDIT_DAY_OVERDUE_median,BB_CREDIT_DAY_OVERDUE_sum,BB_DAYS_CREDIT_ENDDATE_mean,BB_DAYS_CREDIT_ENDDATE_min,BB_DAYS_CREDIT_ENDDATE_max,BB_DAYS_CREDIT_ENDDATE_median,BB_DAYS_CREDIT_ENDDATE_sum,BB_DAYS_ENDDATE_FACT_mean,BB_DAYS_ENDDATE_FACT_min,BB_DAYS_ENDDATE_FACT_max,BB_DAYS_ENDDATE_FACT_median,BB_DAYS_ENDDATE_FACT_sum,BB_AMT_CREDIT_MAX_OVERDUE_mean,BB_AMT_CREDIT_MAX_OVERDUE_min,BB_AMT_CREDIT_MAX_OVERDUE_max,BB_AMT_CREDIT_MAX_OVERDUE_median,BB_AMT_CREDIT_MAX_OVERDUE_sum,BB_CNT_CREDIT_PROLONG_mean,BB_CNT_CREDIT_PROLONG_min,BB_CNT_CREDIT_PROLONG_max,BB_CNT_CREDIT_PROLONG_median,BB_CNT_CREDIT_PROLONG_sum,BB_AMT_CREDIT_SUM_mean,BB_AMT_CREDIT_SUM_min,BB_AMT_CREDIT_SUM_max,BB_AMT_CREDIT_SUM_median,BB_AMT_CREDIT_SUM_sum,BB_AMT_CREDIT_SUM_DEBT_mean,BB_AMT_CREDIT_SUM_DEBT_min,BB_AMT_CREDIT_SUM_DEBT_max,BB_AMT_CREDIT_SUM_DEBT_median,BB_AMT_CREDIT_SUM_DEBT_sum,BB_AMT_CREDIT_SUM_LIMIT_mean,BB_AMT_CREDIT_SUM_LIMIT_min,BB_AMT_CREDIT_SUM_LIMIT_max,BB_AMT_CREDIT_SUM_LIMIT_median,BB_AMT_CREDIT_SUM_LIMIT_sum,BB_AMT_CREDIT_SUM_OVERDUE_mean,BB_AMT_CREDIT_SUM_OVERDUE_min,BB_AMT_CREDIT_SUM_OVERDUE_max,BB_AMT_CREDIT_SUM_OVERDUE_median,BB_AMT_CREDIT_SUM_OVERDUE_sum,BB_DAYS_CREDIT_UPDATE_mean,BB_DAYS_CREDIT_UPDATE_min,BB_DAYS_CREDIT_UPDATE_max,BB_DAYS_CREDIT_UPDATE_median,BB_DAYS_CREDIT_UPDATE_sum,BB_AMT_ANNUITY_mean,BB_AMT_ANNUITY_min,BB_AMT_ANNUITY_max,BB_AMT_ANNUITY_median,BB_AMT_ANNUITY_sum,BB_MONTHS_WITH_STATUS_DPD_0_mean,BB_MONTHS_WITH_STATUS_DPD_0_min,BB_MONTHS_WITH_STATUS_DPD_0_max,BB_MONTHS_WITH_STATUS_DPD_0_median,BB_MONTHS_WITH_STATUS_DPD_0_sum,BB_MONTHS_WITH_STATUS_DPD_1to30_mean,BB_MONTHS_WITH_STATUS_DPD_1to30_min,BB_MONTHS_WITH_STATUS_DPD_1to30_max,BB_MONTHS_WITH_STATUS_DPD_1to30_median,BB_MONTHS_WITH_STATUS_DPD_1to30_sum,BB_MONTHS_WITH_STATUS_DPD_31to60_mean,BB_MONTHS_WITH_STATUS_DPD_31to60_min,BB_MONTHS_WITH_STATUS_DPD_31to60_max,BB_MONTHS_WITH_STATUS_DPD_31to60_median,BB_MONTHS_WITH_STATUS_DPD_31to60_sum,BB_MONTHS_WITH_STATUS_DPD_61to90_mean,BB_MONTHS_WITH_STATUS_DPD_61to90_min,BB_MONTHS_WITH_STATUS_DPD_61to90_max,BB_MONTHS_WITH_STATUS_DPD_61to90_median,BB_MONTHS_WITH_STATUS_DPD_61to90_sum,BB_MONTHS_WITH_STATUS_DPD_91to120_mean,BB_MONTHS_WITH_STATUS_DPD_91to120_min,BB_MONTHS_WITH_STATUS_DPD_91to120_max,BB_MONTHS_WITH_STATUS_DPD_91to120_median,BB_MONTHS_WITH_STATUS_DPD_91to120_sum,BB_MONTHS_WITH_STATUS_DPD_over120_mean,BB_MONTHS_WITH_STATUS_DPD_over120_min,BB_MONTHS_WITH_STATUS_DPD_over120_max,BB_MONTHS_WITH_STATUS_DPD_over120_median,BB_MONTHS_WITH_STATUS_DPD_over120_sum,BB_MONTHS_WITH_STATUS_CLOSED_mean,BB_MONTHS_WITH_STATUS_CLOSED_min,BB_MONTHS_WITH_STATUS_CLOSED_max,BB_MONTHS_WITH_STATUS_CLOSED_median,BB_MONTHS_WITH_STATUS_CLOSED_sum,BB_MONTHS_WITH_STATUS_UNKNOWN_mean,BB_MONTHS_WITH_STATUS_UNKNOWN_min,BB_MONTHS_WITH_STATUS_UNKNOWN_max,BB_MONTHS_WITH_STATUS_UNKNOWN_median,BB_MONTHS_WITH_STATUS_UNKNOWN_sum,BB_CREDIT_ACTIVE_Active_mean,BB_CREDIT_ACTIVE_Active_min,BB_CREDIT_ACTIVE_Active_max,BB_CREDIT_ACTIVE_Active_median,BB_CREDIT_ACTIVE_Active_sum,BB_CREDIT_ACTIVE_Bad debt_mean,BB_CREDIT_ACTIVE_Bad debt_min,BB_CREDIT_ACTIVE_Bad debt_max,BB_CREDIT_ACTIVE_Bad debt_median,BB_CREDIT_ACTIVE_Bad debt_sum,BB_CREDIT_ACTIVE_Closed_mean,BB_CREDIT_ACTIVE_Closed_min,BB_CREDIT_ACTIVE_Closed_max,BB_CREDIT_ACTIVE_Closed_median,BB_CREDIT_ACTIVE_Closed_sum,BB_CREDIT_ACTIVE_Sold_mean,BB_CREDIT_ACTIVE_Sold_min,BB_CREDIT_ACTIVE_Sold_max,BB_CREDIT_ACTIVE_Sold_median,BB_CREDIT_ACTIVE_Sold_sum,BB_CREDIT_CURRENCY_currency 1_mean,BB_CREDIT_CURRENCY_currency 1_min,BB_CREDIT_CURRENCY_currency 1_max,BB_CREDIT_CURRENCY_currency 1_median,BB_CREDIT_CURRENCY_currency 1_sum,BB_CREDIT_CURRENCY_currency 2_mean,BB_CREDIT_CURRENCY_currency 2_min,BB_CREDIT_CURRENCY_currency 2_max,BB_CREDIT_CURRENCY_currency 2_median,BB_CREDIT_CURRENCY_currency 2_sum,BB_CREDIT_CURRENCY_currency 3_mean,BB_CREDIT_CURRENCY_currency 3_min,BB_CREDIT_CURRENCY_currency 3_max,BB_CREDIT_CURRENCY_currency 3_median,BB_CREDIT_CURRENCY_currency 3_sum,BB_CREDIT_CURRENCY_currency 4_mean,BB_CREDIT_CURRENCY_currency 4_min,BB_CREDIT_CURRENCY_currency 4_max,BB_CREDIT_CURRENCY_currency 4_median,BB_CREDIT_CURRENCY_currency 4_sum,BB_CREDIT_TYPE_Another type of loan_mean,BB_CREDIT_TYPE_Another type of loan_min,BB_CREDIT_TYPE_Another type of loan_max,BB_CREDIT_TYPE_Another type of loan_median,BB_CREDIT_TYPE_Another type of loan_sum,BB_CREDIT_TYPE_Car loan_mean,BB_CREDIT_TYPE_Car loan_min,BB_CREDIT_TYPE_Car loan_max,BB_CREDIT_TYPE_Car loan_median,BB_CREDIT_TYPE_Car loan_sum,BB_CREDIT_TYPE_Cash loan (non-earmarked)_mean,BB_CREDIT_TYPE_Cash loan (non-earmarked)_min,BB_CREDIT_TYPE_Cash loan (non-earmarked)_max,BB_CREDIT_TYPE_Cash loan (non-earmarked)_median,BB_CREDIT_TYPE_Cash loan (non-earmarked)_sum,BB_CREDIT_TYPE_Consumer credit_mean,BB_CREDIT_TYPE_Consumer credit_min,BB_CREDIT_TYPE_Consumer credit_max,BB_CREDIT_TYPE_Consumer credit_median,BB_CREDIT_TYPE_Consumer credit_sum,BB_CREDIT_TYPE_Credit card_mean,BB_CREDIT_TYPE_Credit card_min,BB_CREDIT_TYPE_Credit card_max,BB_CREDIT_TYPE_Credit card_median,BB_CREDIT_TYPE_Credit card_sum,BB_CREDIT_TYPE_Interbank credit_mean,BB_CREDIT_TYPE_Interbank credit_min,BB_CREDIT_TYPE_Interbank credit_max,BB_CREDIT_TYPE_Interbank credit_median,BB_CREDIT_TYPE_Interbank credit_sum,BB_CREDIT_TYPE_Loan for business development_mean,BB_CREDIT_TYPE_Loan for business development_min,BB_CREDIT_TYPE_Loan for business development_max,BB_CREDIT_TYPE_Loan for business development_median,BB_CREDIT_TYPE_Loan for business development_sum,BB_CREDIT_TYPE_Loan for purchase of shares (margin lending)_mean,BB_CREDIT_TYPE_Loan for purchase of shares (margin lending)_min,BB_CREDIT_TYPE_Loan for purchase of shares (margin lending)_max,BB_CREDIT_TYPE_Loan for purchase of shares (margin lending)_median,BB_CREDIT_TYPE_Loan for purchase of shares (margin lending)_sum,BB_CREDIT_TYPE_Loan for the purchase of equipment_mean,BB_CREDIT_TYPE_Loan for the purchase of equipment_min,BB_CREDIT_TYPE_Loan for the purchase of equipment_max,BB_CREDIT_TYPE_Loan for the purchase of equipment_median,BB_CREDIT_TYPE_Loan for the purchase of equipment_sum,BB_CREDIT_TYPE_Loan for working capital replenishment_mean,BB_CREDIT_TYPE_Loan for working capital replenishment_min,BB_CREDIT_TYPE_Loan for working capital replenishment_max,BB_CREDIT_TYPE_Loan for working capital replenishment_median,BB_CREDIT_TYPE_Loan for working capital replenishment_sum,BB_CREDIT_TYPE_Microloan_mean,BB_CREDIT_TYPE_Microloan_min,BB_CREDIT_TYPE_Microloan_max,BB_CREDIT_TYPE_Microloan_median,BB_CREDIT_TYPE_Microloan_sum,BB_CREDIT_TYPE_Mobile operator loan_mean,BB_CREDIT_TYPE_Mobile operator loan_min,BB_CREDIT_TYPE_Mobile operator loan_max,BB_CREDIT_TYPE_Mobile operator loan_median,BB_CREDIT_TYPE_Mobile operator loan_sum,BB_CREDIT_TYPE_Mortgage_mean,BB_CREDIT_TYPE_Mortgage_min,BB_CREDIT_TYPE_Mortgage_max,BB_CREDIT_TYPE_Mortgage_median,BB_CREDIT_TYPE_Mortgage_sum,BB_CREDIT_TYPE_Real estate loan_mean,BB_CREDIT_TYPE_Real estate loan_min,BB_CREDIT_TYPE_Real estate loan_max,BB_CREDIT_TYPE_Real estate loan_median,BB_CREDIT_TYPE_Real estate loan_sum,BB_CREDIT_TYPE_Unknown type of loan_mean,BB_CREDIT_TYPE_Unknown type of loan_min,BB_CREDIT_TYPE_Unknown type of loan_max,BB_CREDIT_TYPE_Unknown type of loan_median,BB_CREDIT_TYPE_Unknown type of loan_sum,BB_bureau_records_count,PA_SK_ID_CURR,PA_SK_ID_PREV_mean,PA_SK_ID_PREV_min,PA_SK_ID_PREV_max,PA_SK_ID_PREV_median,PA_SK_ID_PREV_sum,PA_AMT_ANNUITY_mean,PA_AMT_ANNUITY_min,PA_AMT_ANNUITY_max,PA_AMT_ANNUITY_median,PA_AMT_ANNUITY_sum,PA_AMT_APPLICATION_mean,PA_AMT_APPLICATION_min,PA_AMT_APPLICATION_max,PA_AMT_APPLICATION_median,PA_AMT_APPLICATION_sum,PA_AMT_CREDIT_mean,PA_AMT_CREDIT_min,PA_AMT_CREDIT_max,PA_AMT_CREDIT_median,PA_AMT_CREDIT_sum,PA_AMT_DOWN_PAYMENT_mean,PA_AMT_DOWN_PAYMENT_min,PA_AMT_DOWN_PAYMENT_max,PA_AMT_DOWN_PAYMENT_median,PA_AMT_DOWN_PAYMENT_sum,PA_AMT_GOODS_PRICE_mean,PA_AMT_GOODS_PRICE_min,PA_AMT_GOODS_PRICE_max,PA_AMT_GOODS_PRICE_median,PA_AMT_GOODS_PRICE_sum,PA_HOUR_APPR_PROCESS_START_mean,PA_HOUR_APPR_PROCESS_START_min,PA_HOUR_APPR_PROCESS_START_max,PA_HOUR_APPR_PROCESS_START_median,PA_HOUR_APPR_PROCESS_START_sum,PA_NFLAG_LAST_APPL_IN_DAY_mean,PA_NFLAG_LAST_APPL_IN_DAY_min,PA_NFLAG_LAST_APPL_IN_DAY_max,PA_NFLAG_LAST_APPL_IN_DAY_median,PA_NFLAG_LAST_APPL_IN_DAY_sum,PA_RATE_DOWN_PAYMENT_mean,PA_RATE_DOWN_PAYMENT_min,PA_RATE_DOWN_PAYMENT_max,PA_RATE_DOWN_PAYMENT_median,PA_RATE_DOWN_PAYMENT_sum,PA_RATE_INTEREST_PRIMARY_mean,PA_RATE_INTEREST_PRIMARY_min,PA_RATE_INTEREST_PRIMARY_max,PA_RATE_INTEREST_PRIMARY_median,PA_RATE_INTEREST_PRIMARY_sum,PA_RATE_INTEREST_PRIVILEGED_mean,PA_RATE_INTEREST_PRIVILEGED_min,PA_RATE_INTEREST_PRIVILEGED_max,PA_RATE_INTEREST_PRIVILEGED_median,PA_RATE_INTEREST_PRIVILEGED_sum,PA_DAYS_DECISION_mean,PA_DAYS_DECISION_min,PA_DAYS_DECISION_max,PA_DAYS_DECISION_median,PA_DAYS_DECISION_sum,PA_SELLERPLACE_AREA_mean,PA_SELLERPLACE_AREA_min,PA_SELLERPLACE_AREA_max,PA_SELLERPLACE_AREA_median,PA_SELLERPLACE_AREA_sum,PA_CNT_PAYMENT_mean,PA_CNT_PAYMENT_min,PA_CNT_PAYMENT_max,PA_CNT_PAYMENT_median,PA_CNT_PAYMENT_sum,PA_DAYS_FIRST_DRAWING_mean,PA_DAYS_FIRST_DRAWING_min,PA_DAYS_FIRST_DRAWING_max,PA_DAYS_FIRST_DRAWING_median,PA_DAYS_FIRST_DRAWING_sum,PA_DAYS_FIRST_DUE_mean,PA_DAYS_FIRST_DUE_min,PA_DAYS_FIRST_DUE_max,PA_DAYS_FIRST_DUE_median,PA_DAYS_FIRST_DUE_sum,PA_DAYS_LAST_DUE_1ST_VERSION_mean,PA_DAYS_LAST_DUE_1ST_VERSION_min,PA_DAYS_LAST_DUE_1ST_VERSION_max,PA_DAYS_LAST_DUE_1ST_VERSION_median,PA_DAYS_LAST_DUE_1ST_VERSION_sum,PA_DAYS_LAST_DUE_mean,PA_DAYS_LAST_DUE_min,PA_DAYS_LAST_DUE_max,PA_DAYS_LAST_DUE_median,PA_DAYS_LAST_DUE_sum,PA_DAYS_TERMINATION_mean,PA_DAYS_TERMINATION_min,PA_DAYS_TERMINATION_max,PA_DAYS_TERMINATION_median,PA_DAYS_TERMINATION_sum,PA_NFLAG_INSURED_ON_APPROVAL_mean,PA_NFLAG_INSURED_ON_APPROVAL_min,PA_NFLAG_INSURED_ON_APPROVAL_max,PA_NFLAG_INSURED_ON_APPROVAL_median,PA_NFLAG_INSURED_ON_APPROVAL_sum,PA_previous_application_records_count,PCB_SK_ID_CURR,PCB_MONTHS_BALANCE_mean,PCB_MONTHS_BALANCE_min,PCB_MONTHS_BALANCE_max,PCB_MONTHS_BALANCE_median,PCB_MONTHS_BALANCE_sum,PCB_CNT_INSTALMENT_mean,PCB_CNT_INSTALMENT_min,PCB_CNT_INSTALMENT_max,PCB_CNT_INSTALMENT_median,PCB_CNT_INSTALMENT_sum,PCB_CNT_INSTALMENT_FUTURE_mean,PCB_CNT_INSTALMENT_FUTURE_min,PCB_CNT_INSTALMENT_FUTURE_max,PCB_CNT_INSTALMENT_FUTURE_median,PCB_CNT_INSTALMENT_FUTURE_sum,PCB_SK_DPD_mean,PCB_SK_DPD_min,PCB_SK_DPD_max,PCB_SK_DPD_median,PCB_SK_DPD_sum,PCB_SK_DPD_DEF_mean,PCB_SK_DPD_DEF_min,PCB_SK_DPD_DEF_max,PCB_SK_DPD_DEF_median,PCB_SK_DPD_DEF_sum,PCB_STATUS__Active_mean,PCB_STATUS__Active_min,PCB_STATUS__Active_max,PCB_STATUS__Active_median,PCB_STATUS__Active_sum,PCB_STATUS__Amortized debt_mean,PCB_STATUS__Amortized debt_min,PCB_STATUS__Amortized debt_max,PCB_STATUS__Amortized debt_median,PCB_STATUS__Amortized debt_sum,PCB_STATUS__Approved_mean,PCB_STATUS__Approved_min,PCB_STATUS__Approved_max,PCB_STATUS__Approved_median,PCB_STATUS__Approved_sum,PCB_STATUS__Canceled_mean,PCB_STATUS__Canceled_min,PCB_STATUS__Canceled_max,PCB_STATUS__Canceled_median,PCB_STATUS__Canceled_sum,PCB_STATUS__Completed_mean,PCB_STATUS__Completed_min,PCB_STATUS__Completed_max,PCB_STATUS__Completed_median,PCB_STATUS__Completed_sum,PCB_STATUS__Demand_mean,PCB_STATUS__Demand_min,PCB_STATUS__Demand_max,PCB_STATUS__Demand_median,PCB_STATUS__Demand_sum,PCB_STATUS__Returned to the store_mean,PCB_STATUS__Returned to the store_min,PCB_STATUS__Returned to the store_max,PCB_STATUS__Returned to the store_median,PCB_STATUS__Returned to the store_sum,PCB_STATUS__Signed_mean,PCB_STATUS__Signed_min,PCB_STATUS__Signed_max,PCB_STATUS__Signed_median,PCB_STATUS__Signed_sum,PCB_STATUS__XNA_mean,PCB_STATUS__XNA_min,PCB_STATUS__XNA_max,PCB_STATUS__XNA_median,PCB_STATUS__XNA_sum,PCB_pos_cash_balance_records_count,IP_SK_ID_CURR,IP_NUM_INSTALMENT_VERSION_mean,IP_NUM_INSTALMENT_VERSION_min,IP_NUM_INSTALMENT_VERSION_max,IP_NUM_INSTALMENT_VERSION_median,IP_NUM_INSTALMENT_VERSION_sum,IP_NUM_INSTALMENT_NUMBER_mean,IP_NUM_INSTALMENT_NUMBER_min,IP_NUM_INSTALMENT_NUMBER_max,IP_NUM_INSTALMENT_NUMBER_median,IP_NUM_INSTALMENT_NUMBER_sum,IP_DAYS_INSTALMENT_mean,IP_DAYS_INSTALMENT_min,IP_DAYS_INSTALMENT_max,IP_DAYS_INSTALMENT_median,IP_DAYS_INSTALMENT_sum,IP_AMT_INSTALMENT_mean,IP_AMT_INSTALMENT_min,IP_AMT_INSTALMENT_max,IP_AMT_INSTALMENT_median,IP_AMT_INSTALMENT_sum,IP_instalments_unpaid_mean,IP_instalments_unpaid_min,IP_instalments_unpaid_max,IP_instalments_unpaid_median,IP_instalments_unpaid_sum,IP_instalments_partially_paid_mean,IP_instalments_partially_paid_min,IP_instalments_partially_paid_max,IP_instalments_partially_paid_median,IP_instalments_partially_paid_sum,IP_instalments_overdue_mean,IP_instalments_overdue_min,IP_instalments_overdue_max,IP_instalments_overdue_median,IP_instalments_overdue_sum,IP_amount_debt_mean,IP_amount_debt_min,IP_amount_debt_max,IP_amount_debt_median,IP_amount_debt_sum,IP_dpd_mean,IP_dpd_min,IP_dpd_max,IP_dpd_median,IP_dpd_sum,IP_installments_payments_records_count,CCB_SK_ID_CURR,CCB_MONTHS_BALANCE_mean,CCB_MONTHS_BALANCE_min,CCB_MONTHS_BALANCE_max,CCB_MONTHS_BALANCE_median,CCB_MONTHS_BALANCE_sum,CCB_AMT_BALANCE_mean,CCB_AMT_BALANCE_min,CCB_AMT_BALANCE_max,CCB_AMT_BALANCE_median,CCB_AMT_BALANCE_sum,CCB_AMT_CREDIT_LIMIT_ACTUAL_mean,CCB_AMT_CREDIT_LIMIT_ACTUAL_min,CCB_AMT_CREDIT_LIMIT_ACTUAL_max,CCB_AMT_CREDIT_LIMIT_ACTUAL_median,CCB_AMT_CREDIT_LIMIT_ACTUAL_sum,CCB_AMT_DRAWINGS_ATM_CURRENT_mean,CCB_AMT_DRAWINGS_ATM_CURRENT_min,CCB_AMT_DRAWINGS_ATM_CURRENT_max,CCB_AMT_DRAWINGS_ATM_CURRENT_median,CCB_AMT_DRAWINGS_ATM_CURRENT_sum,CCB_AMT_DRAWINGS_CURRENT_mean,CCB_AMT_DRAWINGS_CURRENT_min,CCB_AMT_DRAWINGS_CURRENT_max,CCB_AMT_DRAWINGS_CURRENT_median,CCB_AMT_DRAWINGS_CURRENT_sum,CCB_AMT_DRAWINGS_OTHER_CURRENT_mean,CCB_AMT_DRAWINGS_OTHER_CURRENT_min,CCB_AMT_DRAWINGS_OTHER_CURRENT_max,CCB_AMT_DRAWINGS_OTHER_CURRENT_median,CCB_AMT_DRAWINGS_OTHER_CURRENT_sum,CCB_AMT_DRAWINGS_POS_CURRENT_mean,CCB_AMT_DRAWINGS_POS_CURRENT_min,CCB_AMT_DRAWINGS_POS_CURRENT_max,CCB_AMT_DRAWINGS_POS_CURRENT_median,CCB_AMT_DRAWINGS_POS_CURRENT_sum,CCB_AMT_INST_MIN_REGULARITY_mean,CCB_AMT_INST_MIN_REGULARITY_min,CCB_AMT_INST_MIN_REGULARITY_max,CCB_AMT_INST_MIN_REGULARITY_median,CCB_AMT_INST_MIN_REGULARITY_sum,CCB_AMT_PAYMENT_CURRENT_mean,CCB_AMT_PAYMENT_CURRENT_min,CCB_AMT_PAYMENT_CURRENT_max,CCB_AMT_PAYMENT_CURRENT_median,CCB_AMT_PAYMENT_CURRENT_sum,CCB_AMT_PAYMENT_TOTAL_CURRENT_mean,CCB_AMT_PAYMENT_TOTAL_CURRENT_min,CCB_AMT_PAYMENT_TOTAL_CURRENT_max,CCB_AMT_PAYMENT_TOTAL_CURRENT_median,CCB_AMT_PAYMENT_TOTAL_CURRENT_sum,CCB_AMT_RECEIVABLE_PRINCIPAL_mean,CCB_AMT_RECEIVABLE_PRINCIPAL_min,CCB_AMT_RECEIVABLE_PRINCIPAL_max,CCB_AMT_RECEIVABLE_PRINCIPAL_median,CCB_AMT_RECEIVABLE_PRINCIPAL_sum,CCB_AMT_RECIVABLE_mean,CCB_AMT_RECIVABLE_min,CCB_AMT_RECIVABLE_max,CCB_AMT_RECIVABLE_median,CCB_AMT_RECIVABLE_sum,CCB_AMT_TOTAL_RECEIVABLE_mean,CCB_AMT_TOTAL_RECEIVABLE_min,CCB_AMT_TOTAL_RECEIVABLE_max,CCB_AMT_TOTAL_RECEIVABLE_median,CCB_AMT_TOTAL_RECEIVABLE_sum,CCB_CNT_DRAWINGS_ATM_CURRENT_mean,CCB_CNT_DRAWINGS_ATM_CURRENT_min,CCB_CNT_DRAWINGS_ATM_CURRENT_max,CCB_CNT_DRAWINGS_ATM_CURRENT_median,CCB_CNT_DRAWINGS_ATM_CURRENT_sum,CCB_CNT_DRAWINGS_CURRENT_mean,CCB_CNT_DRAWINGS_CURRENT_min,CCB_CNT_DRAWINGS_CURRENT_max,CCB_CNT_DRAWINGS_CURRENT_median,CCB_CNT_DRAWINGS_CURRENT_sum,CCB_CNT_DRAWINGS_OTHER_CURRENT_mean,CCB_CNT_DRAWINGS_OTHER_CURRENT_min,CCB_CNT_DRAWINGS_OTHER_CURRENT_max,CCB_CNT_DRAWINGS_OTHER_CURRENT_median,CCB_CNT_DRAWINGS_OTHER_CURRENT_sum,CCB_CNT_DRAWINGS_POS_CURRENT_mean,CCB_CNT_DRAWINGS_POS_CURRENT_min,CCB_CNT_DRAWINGS_POS_CURRENT_max,CCB_CNT_DRAWINGS_POS_CURRENT_median,CCB_CNT_DRAWINGS_POS_CURRENT_sum,CCB_CNT_INSTALMENT_MATURE_CUM_mean,CCB_CNT_INSTALMENT_MATURE_CUM_min,CCB_CNT_INSTALMENT_MATURE_CUM_max,CCB_CNT_INSTALMENT_MATURE_CUM_median,CCB_CNT_INSTALMENT_MATURE_CUM_sum,CCB_SK_DPD_mean,CCB_SK_DPD_min,CCB_SK_DPD_max,CCB_SK_DPD_median,CCB_SK_DPD_sum,CCB_SK_DPD_DEF_mean,CCB_SK_DPD_DEF_min,CCB_SK_DPD_DEF_max,CCB_SK_DPD_DEF_median,CCB_SK_DPD_DEF_sum,CCB_previous_application_records_mean,CCB_previous_application_records_min,CCB_previous_application_records_max,CCB_previous_application_records_median,CCB_previous_application_records_sum,CCB_credit_card_balance_records_count
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,0.0252,0.0383,0.9722,0.6341,0.0144,0.0,0.069,0.0833,0.125,0.0377,0.022,0.0198,0.0,0.0,0.025,0.0369,0.9722,0.6243,0.0144,0.0,0.069,0.0833,0.125,0.0375,0.0205,0.0193,0.0,0.0,reg oper account,block of flats,0.0149,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,100002.0,-874.0,-1437.0,-103.0,-1042.5,-6992.0,0.0,0.0,0.0,0.0,0.0,-261.75,-1072.0,780.0,0.0,-2094.0,-523.125,-1185.0,0.0,-479.5,-4185.0,1050.643125,0.0,5043.645,0.0,8405.145,0.0,0.0,0.0,0.0,0.0,108131.945625,0.0,450000.0,54130.5,865055.565,30722.625,0.0,245781.0,0.0,245781.0,3998.570625,0.0,31988.565,0.0,31988.565,0.0,0.0,0.0,0.0,0.0,-499.875,-1185.0,-7.0,-402.5,-3999.0,0.0,0.0,0.0,0.0,0.0,5.625,2.0,18.0,5.0,45.0,3.375,0.0,6.0,4.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.875,0.0,13.0,2.0,23.0,1.875,0.0,3.0,2.5,15.0,0.25,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.75,0.0,1.0,1.0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,1.0,0.5,4.0,0.5,0.0,1.0,0.5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,100002.0,1038818.0,1038818.0,1038818.0,1038818.0,1038818.0,9251.775,9251.775,9251.775,9251.775,9251.775,179055.0,179055.0,179055.0,179055.0,179055.0,179055.0,179055.0,179055.0,179055.0,179055.0,0.0,0.0,0.0,0.0,0.0,179055.0,179055.0,179055.0,179055.0,179055.0,9.0,9.0,9.0,9.0,9.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-606.0,-606.0,-606.0,-606.0,-606.0,500.0,500.0,500.0,500.0,500.0,24.0,24.0,24.0,24.0,24.0,365243.0,365243.0,365243.0,365243.0,365243.0,-565.0,-565.0,-565.0,-565.0,-565.0,125.0,125.0,125.0,125.0,125.0,-25.0,-25.0,-25.0,-25.0,-25.0,-17.0,-17.0,-17.0,-17.0,-17.0,0.0,0.0,0.0,0.0,0.0,1.0,100002.0,-10.0,-19.0,-1.0,-10.0,-190.0,24.0,24.0,24.0,24.0,456.0,15.0,6.0,24.0,15.0,285.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,100002.0,1.052632,1.0,2.0,1.0,20.0,10.0,1.0,19.0,10.0,190.0,-295.0,-565.0,-25.0,-295.0,-5605.0,11559.247105,9251.775,53093.745,9251.775,219625.695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,School,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.0924,0.0538,0.9851,0.804,0.0497,0.0806,0.0345,0.2917,0.3333,0.0128,0.079,0.0554,0.0,0.0,0.0968,0.0529,0.9851,0.7987,0.0608,0.08,0.0345,0.2917,0.3333,0.0132,0.0787,0.0558,0.0039,0.01,reg oper account,block of flats,0.0714,Block,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,100003.0,-1400.75,-2586.0,-606.0,-1205.5,-5603.0,0.0,0.0,0.0,0.0,0.0,-544.5,-2434.0,1216.0,-480.0,-2178.0,-823.0,-2131.0,0.0,-580.5,-3292.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,254350.125,22248.0,810000.0,92576.25,1017400.5,0.0,0.0,0.0,0.0,0.0,202500.0,0.0,810000.0,0.0,810000.0,0.0,0.0,0.0,0.0,0.0,-816.0,-2131.0,-43.0,-545.0,-3264.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.75,0.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,1.0,0.5,2.0,0.5,0.0,1.0,0.5,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,100003.0,2281150.0,1810518.0,2636178.0,2396755.0,6843451.0,56553.99,6737.31,98356.995,64567.665,169661.97,435436.5,68809.5,900000.0,337500.0,1306309.5,484191.0,68053.5,1035882.0,348637.5,1452573.0,2295.0,0.0,6885.0,0.0,6885.0,435436.5,68809.5,900000.0,337500.0,1306309.5,14.666667,12.0,17.0,15.0,44.0,1.0,1.0,1.0,1.0,3.0,0.033354,0.0,0.100061,0.0,0.100061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1305.0,-2341.0,-746.0,-828.0,-3915.0,533.0,-1.0,1400.0,200.0,1599.0,10.0,6.0,12.0,12.0,30.0,365243.0,365243.0,365243.0,365243.0,1095729.0,-1274.333333,-2310.0,-716.0,-797.0,-3823.0,-1004.333333,-1980.0,-386.0,-647.0,-3013.0,-1054.333333,-1980.0,-536.0,-647.0,-3163.0,-1047.333333,-1976.0,-527.0,-639.0,-3142.0,0.666667,0.0,1.0,1.0,2.0,3.0,100003.0,-43.785714,-77.0,-18.0,-26.5,-1226.0,10.107143,6.0,12.0,12.0,283.0,5.785714,0.0,12.0,6.0,162.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.928571,0.0,1.0,1.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0,100003.0,1.04,1.0,2.0,1.0,26.0,5.08,1.0,12.0,5.0,127.0,-1378.16,-2310.0,-536.0,-797.0,-34454.0,64754.586,6662.97,560835.36,64275.615,1618864.65,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,Laborers,1.0,2,2,MONDAY,9,0,0,0,0,0,0,Government,,0.555912,0.729567,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,100004.0,-867.0,-1326.0,-408.0,-867.0,-1734.0,0.0,0.0,0.0,0.0,0.0,-488.5,-595.0,-382.0,-488.5,-977.0,-532.5,-683.0,-382.0,-532.5,-1065.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94518.9,94500.0,94537.8,94518.9,189037.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-532.0,-682.0,-382.0,-532.0,-1064.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,100004.0,1564014.0,1564014.0,1564014.0,1564014.0,1564014.0,5357.25,5357.25,5357.25,5357.25,5357.25,24282.0,24282.0,24282.0,24282.0,24282.0,20106.0,20106.0,20106.0,20106.0,20106.0,4860.0,4860.0,4860.0,4860.0,4860.0,24282.0,24282.0,24282.0,24282.0,24282.0,5.0,5.0,5.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0,0.212008,0.212008,0.212008,0.212008,0.212008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-815.0,-815.0,-815.0,-815.0,-815.0,30.0,30.0,30.0,30.0,30.0,4.0,4.0,4.0,4.0,4.0,365243.0,365243.0,365243.0,365243.0,365243.0,-784.0,-784.0,-784.0,-784.0,-784.0,-694.0,-694.0,-694.0,-694.0,-694.0,-724.0,-724.0,-724.0,-724.0,-724.0,-714.0,-714.0,-714.0,-714.0,-714.0,0.0,0.0,0.0,0.0,0.0,1.0,100004.0,-25.5,-27.0,-24.0,-25.5,-102.0,3.75,3.0,4.0,4.0,15.0,2.25,0.0,4.0,2.5,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,100004.0,1.333333,1.0,2.0,1.0,4.0,2.0,1.0,3.0,2.0,6.0,-754.0,-784.0,-724.0,-754.0,-2262.0,7096.155,5357.25,10573.965,5357.25,21288.465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,-9833.0,-2437,,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.650442,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,100006.0,1932462.0,1020698.0,2827850.0,2078043.0,17392159.0,15767.45,0.0,39954.51,13500.0,141907.05,272203.26,0.0,688500.0,270000.0,2449829.34,291695.5,0.0,906615.0,267930.0,2625259.5,7742.26,0.0,66987.0,0.0,69680.34,272203.26,0.0,688500.0,270000.0,2449829.34,14.666667,12.0,15.0,15.0,132.0,1.0,1.0,1.0,1.0,9.0,0.036314,0.0,0.21783,0.0,0.326824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-272.444444,-617.0,-181.0,-181.0,-2452.0,894.222222,-1.0,8025.0,-1.0,8048.0,15.333333,0.0,48.0,12.0,138.0,162330.222222,0.0,365243.0,0.0,1460972.0,40474.0,-545.0,365243.0,0.0,364266.0,40704.0,-215.0,365243.0,0.0,366336.0,81101.111111,-425.0,365243.0,0.0,729910.0,81103.0,-416.0,365243.0,0.0,729927.0,0.0,0.0,0.0,0.0,0.0,9.0,100006.0,-9.619048,-20.0,-1.0,-8.0,-202.0,11.428571,0.0,48.0,12.0,240.0,8.238095,0.0,48.0,8.0,173.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.857143,0.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.095238,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,100006.0,1.125,1.0,2.0,1.0,18.0,4.4375,1.0,10.0,4.0,71.0,-252.25,-545.0,-11.0,-206.0,-4036.0,62947.088438,2482.92,691786.89,29027.52,1007153.415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,100006.0,-3.5,-6.0,-1.0,-3.5,-21.0,0.0,0.0,0.0,0.0,0.0,270000.0,270000.0,270000.0,270000.0,1620000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,6.0,6.0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,-4311.0,-3458,,1,1,0,1,0,0,Core staff,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.322738,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,100007.0,-1149.0,-1149.0,-1149.0,-1149.0,-1149.0,0.0,0.0,0.0,0.0,0.0,-783.0,-783.0,-783.0,-783.0,-783.0,-783.0,-783.0,-783.0,-783.0,-783.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,146250.0,146250.0,146250.0,146250.0,146250.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-783.0,-783.0,-783.0,-783.0,-783.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,100007.0,2157812.0,1692033.0,2730157.0,2060607.5,12946871.0,12278.805,1834.29,22678.785,14524.3125,73672.83,150530.25,17176.5,247500.0,191250.0,903181.5,166638.75,14616.0,284400.0,197932.5,999832.5,1130.25,0.0,3676.5,0.0,6781.5,150530.25,17176.5,247500.0,191250.0,903181.5,12.333333,8.0,15.0,13.0,74.0,1.0,1.0,1.0,1.0,6.0,0.053172,0.0,0.21889,0.0,0.319033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1222.833333,-2357.0,-374.0,-986.5,-7337.0,409.166667,-1.0,1200.0,28.5,2455.0,20.666667,10.0,48.0,15.0,124.0,304369.166667,0.0,365243.0,365243.0,1826215.0,-1052.666667,-2326.0,0.0,-955.0,-6316.0,-697.666667,-2056.0,346.0,-535.0,-4186.0,60113.5,-2056.0,365243.0,-550.0,360681.0,60119.833333,-2041.0,365243.0,-543.0,360719.0,0.5,0.0,1.0,0.5,3.0,6.0,100007.0,-33.636364,-77.0,-1.0,-27.5,-2220.0,15.333333,10.0,24.0,12.0,1012.0,8.969697,0.0,24.0,8.0,592.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.939394,0.0,1.0,1.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015152,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,66.0,100007.0,1.166667,1.0,2.0,1.0,77.0,7.045455,1.0,17.0,7.0,465.0,-1028.606061,-2326.0,-14.0,-851.0,-67888.0,12666.444545,1821.78,22678.785,16037.64,835985.34,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,1.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,452.384394,0.0,22655.66,0.0,29857.37,0.954545,0.0,12.0,0.0,63.0,66.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [104]:
df_info_summary(train_df_join)

                                                    Non-Null Count  Null Count  % Null    Dtype
SK_ID_CURR                                                  307511           0    0.00    int64
TARGET                                                      307511           0    0.00    int64
NAME_CONTRACT_TYPE                                          307511           0    0.00   object
CODE_GENDER                                                 307511           0    0.00   object
FLAG_OWN_CAR                                                307511           0    0.00   object
FLAG_OWN_REALTY                                             307511           0    0.00   object
CNT_CHILDREN                                                307511           0    0.00    int64
AMT_INCOME_TOTAL                                            307511           0    0.00  float64
AMT_CREDIT                                                  307511           0    0.00  float64
AMT_ANNUITY                             

In [105]:
train_df_join.shape

(307511, 667)

In [106]:
# Guardar df final en formato parquet
train_df_join.to_parquet(os.path.join(PATH, "train.parquet"))