In [23]:
import pandas as pd
import numpy as np
import sklearn as sk
import gc
from tqdm import tqdm_notebook as tqdm

import warnings
warnings.filterwarnings('ignore')


#Have all columns appear when dataframes are displayed.
pd.set_option('display.max_columns', None) 
# Have 100 rows appear when a dataframe is displayed
pd.set_option('display.max_rows', 500)
# Display dimensions whenever a dataframe is printed out.
pd.set_option('display.show_dimensions', True)

bureau=pd.read_csv(r'C:/Users/Yeray/Desktop/DATA_SCIENCE_ML/Home-Credit-TFG/DATA/home-credit-default-risk/bureau.csv')
bureau_balance = pd.read_csv(r'C:/Users/Yeray/Desktop/DATA_SCIENCE_ML/Home-Credit-TFG/DATA/home-credit-default-risk/bureau_balance.csv')

In [24]:
#Vamos a usar solo aquellos que estén tanto en las tablas de bureau como de bureau_balance
bureau_borrowers = bureau[['SK_ID_CURR', 'SK_ID_BUREAU']]   
bureau_balance = bureau_borrowers.join(bureau_balance.set_index('SK_ID_BUREAU'), on='SK_ID_BUREAU', how='inner')

In [26]:
# Copiamos para no alterar bureau_balance original
df_balance_loan = bureau_balance.copy()

# Convertir STATUS a numérico (cat.codes)
df_balance_loan['STATUS'] = df_balance_loan['STATUS'].astype('category').cat.codes

# Definir agregaciones a nivel "loan"
months_balance_agg_by_loan = {
    'MONTHS_BALANCE': ['min', 'max', 'mean', 'size'],
    'STATUS':         ['mean', 'max', 'var']
}

# Hacer groupby a nivel SK_ID_BUREAU
df_loan_agg = df_balance_loan.groupby('SK_ID_BUREAU', as_index=False).agg(months_balance_agg_by_loan)

# Aplanar columnas
df_loan_agg.columns = [
    f"{col[0]}_{col[1].upper()}_BY_LOAN" if col[0] != 'SK_ID_BUREAU' else 'SK_ID_BUREAU'
    for col in df_loan_agg.columns
]

df_loan_agg.reset_index(inplace=True)

del df_balance_loan
gc.collect()

##############################################################################
#                  PARTE B: UNIR AL NIVEL "LOAN" CON LA TABLA bureau
##############################################################################

# bureau: (SK_ID_BUREAU, SK_ID_CURR, ...)
# Queremos un DF con 1 fila por SK_ID_BUREAU, pero ahora también teniendo SK_ID_CURR
df_loan_level = bureau[['SK_ID_BUREAU','SK_ID_CURR']].drop_duplicates()

# Merge con df_loan_agg
df_loan_level = df_loan_level.merge(df_loan_agg, on='SK_ID_BUREAU', how='left')
del df_loan_agg
gc.collect()

##############################################################################
#                  PARTE C: AGREGAR A NIVEL "BORROWER" (SK_ID_CURR)
##############################################################################

months_balance_agg_by_borrower = {
    'MONTHS_BALANCE_MIN_BY_LOAN':  ['min','max','mean'],
    'MONTHS_BALANCE_MAX_BY_LOAN':  ['min','max','mean'],
    'MONTHS_BALANCE_MEAN_BY_LOAN': ['min','max','mean'],
    'MONTHS_BALANCE_SIZE_BY_LOAN': ['min','max','mean'],
    'STATUS_MEAN_BY_LOAN':         ['min','max','mean'],
    'STATUS_MAX_BY_LOAN':          ['min','max','mean'],
    'STATUS_VAR_BY_LOAN':          ['min','max','mean']
}

df_borrower_agg = df_loan_level.groupby('SK_ID_CURR').agg(months_balance_agg_by_borrower)
df_borrower_agg.columns = [
    f"{col[0]}_{col[1].upper()}_BY_BORROWER"
    for col in df_borrower_agg.columns
]
df_borrower_agg.reset_index(inplace=True)

del df_loan_level
gc.collect()

0

In [29]:
# Si quieres, por ejemplo, contar cuántos meses DPD en 'bureau_balance'
# (donde 'STATUS' in ['2','3','4','5']) a nivel SK_ID_BUREAU y luego SK_ID_CURR
count_status_df = bureau_balance[['SK_ID_BUREAU','STATUS']].copy()
# Marcamos 1 si está en ['2','3','4','5']
count_status_df['IS_DPD'] = count_status_df['STATUS'].isin(['2','3','4','5']).astype(int)

# 1) a nivel SK_ID_BUREAU
dpd_loan = count_status_df.groupby('SK_ID_BUREAU', as_index=False)['IS_DPD'].sum()
dpd_loan.rename(columns={'IS_DPD':'COUNT_MONTHS_STATUS_WAS_DPD_BY_LOAN'}, inplace=True)


df_loan_level2 = bureau[['SK_ID_BUREAU','SK_ID_CURR']].drop_duplicates()
df_loan_level2 = df_loan_level2.merge(dpd_loan, on='SK_ID_BUREAU', how='left')

dpd_borrower = df_loan_level2.groupby('SK_ID_CURR', as_index=False)['COUNT_MONTHS_STATUS_WAS_DPD_BY_LOAN'].sum()
dpd_borrower.rename(columns={
        'COUNT_MONTHS_STATUS_WAS_DPD_BY_LOAN': 'COUNT_MONTHS_STATUS_WAS_DPD_(BUREAU_BALANCE)'
    }, inplace=True)

# 3) unimos con df_borrower_agg
df_borrower_agg = df_borrower_agg.merge(dpd_borrower, on='SK_ID_CURR', how='left')

del dpd_loan, dpd_borrower, count_status_df, df_loan_level2
gc.collect()

0

In [30]:
print(df_borrower_agg.shape)

(305811, 24)


In [34]:
df_borrower_agg.to_csv(r'C:/Users/Yeray/Desktop/DATA_SCIENCE_ML/Home-Credit-TFG/DATA/bureau_balance_agg_def_v3.csv', index=False)