In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import warnings
import os
# Ignore every warning raised during the rest of the session.
warnings.filterwarnings('ignore')
print("Directorio actual:", os.getcwd())

# Pandas display settings:
#   display.max_columns = None   -> show all columns of a displayed dataframe
#   display.max_rows = 500       -> show up to 500 rows of a displayed dataframe
#   display.show_dimensions      -> always print the dataframe dimensions
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 500),
    ('display.show_dimensions', True),
):
    pd.set_option(option_name, option_value)

# Load the raw installments table from the local data folder.
installment_payments = pd.read_csv(
    r'C:/Users/Yeray/Desktop/DATA_SCIENCE_ML/Home-Credit-TFG/DATA/home-credit-default-risk/installments_payments.csv'
)

Directorio actual: c:\Users\Yeray\Desktop\DATA_SCIENCE_ML\Home-Credit-TFG\JUPYTER_NOTEBOOKS


In [3]:
# Feature engineering

# First, features describing the amount paid versus the amount due.
# A zero AMT_INSTALMENT makes the division return +/-inf, and a single inf turns
# the per-client max/mean/sum aggregates into inf, so infinities are stored as NaN
# (pandas skips NaN when aggregating).
installment_payments['PAYMENT_INSTALLMENT_RATIO'] = (
    installment_payments['AMT_PAYMENT'] / installment_payments['AMT_INSTALMENT']
).replace([np.inf, -np.inf], np.nan)
installment_payments['PAYMENT_DIFFERENCE'] = installment_payments['AMT_PAYMENT'] - installment_payments['AMT_INSTALMENT']
# Flags: paid more than due / paid less than due.
installment_payments['PAGO_DE_MAS'] = (installment_payments['AMT_PAYMENT'] > installment_payments['AMT_INSTALMENT']).astype(int)
installment_payments['PAGO_DE_MENOS'] = (installment_payments['AMT_PAYMENT'] < installment_payments['AMT_INSTALMENT']).astype(int)

# Second, features describing payment timing relative to the due date.
# DAYS_PAST_DUE > 0 means the payment was entered after the due date;
# DAYS_BEFORE_DUE is the same quantity with the opposite sign.
installment_payments['DAYS_PAST_DUE'] = installment_payments['DAYS_ENTRY_PAYMENT'] - installment_payments['DAYS_INSTALMENT']
installment_payments['DAYS_BEFORE_DUE'] = installment_payments['DAYS_INSTALMENT'] - installment_payments['DAYS_ENTRY_PAYMENT']
# Same division-by-zero guard as above, here for DAYS_INSTALMENT == 0.
installment_payments['DAYS_PAST_DUE_RATIO'] = (
    installment_payments['DAYS_PAST_DUE'] / installment_payments['DAYS_INSTALMENT']
).replace([np.inf, -np.inf], np.nan)
installment_payments['DAYS_BEFORE_DUE_RATIO'] = (
    installment_payments['DAYS_BEFORE_DUE'] / installment_payments['DAYS_INSTALMENT']
).replace([np.inf, -np.inf], np.nan)

# Flags: late payment / early payment.
installment_payments['PAGO_RETRASADO'] = (installment_payments['DAYS_PAST_DUE'] > 0).astype(int)
installment_payments['PAGO_ANTICIPADO'] = (installment_payments['DAYS_BEFORE_DUE'] > 0).astype(int)

In [4]:
# A change of NUM_INSTALMENT_VERSION between consecutive installments of the same
# previous credit means the payment schedule was modified; build a feature that
# counts those changes.
pagos = installment_payments.sort_values(['SK_ID_PREV', 'NUM_INSTALMENT_NUMBER'])

# Flag consecutive changes of NUM_INSTALMENT_VERSION within each SK_ID_PREV.
# diff() is NaN on the first row of every group and NaN != 0 evaluates to True,
# so without fillna(0) every credit would be given one spurious change.
pagos['CAMBIO_PLAN_PAGOS'] = (
    pagos.groupby('SK_ID_PREV')['NUM_INSTALMENT_VERSION']
    .diff()
    .fillna(0)
    .ne(0)
    .astype(int)
)

# Number of schedule changes per previous credit.
cambios_por_pago = (
    pagos.groupby('SK_ID_PREV', as_index=False)['CAMBIO_PLAN_PAGOS'].sum()
    .rename(columns={'CAMBIO_PLAN_PAGOS': 'NUM_CAMBIOS_PLAN_PAGOS'})
)

# Attach the client id (SK_ID_CURR) to each previous credit.
prev_curr_map = pagos[['SK_ID_PREV', 'SK_ID_CURR']].drop_duplicates()
cambios_por_pago = cambios_por_pago.merge(prev_curr_map, on='SK_ID_PREV', how='left')

# Total number of schedule changes per client, summed over all of its credits.
cambios_por_cliente = (
    cambios_por_pago.groupby('SK_ID_CURR', as_index=False)['NUM_CAMBIOS_PLAN_PAGOS'].sum()
    .rename(columns={'NUM_CAMBIOS_PLAN_PAGOS': 'NUM_CAMBIOS_PLAN_PAGOS_TOTAL'})
)

# Broadcast the per-client total back onto every installment row of that client.
installment_payments = installment_payments.merge(cambios_por_cliente, on='SK_ID_CURR', how='left')

In [5]:
# New feature: per-client mean of the payment difference (AMT_PAYMENT - AMT_INSTALMENT), computed only over installments with DAYS_INSTALMENT > -1000.

def generate_installment_payment_ratio_1000_mean_mean(installments, days_window=1000):
    """Per-client mean of the per-credit mean payment difference on recent installments.

    Despite the name, the feature is built from the difference
    AMT_PAYMENT - AMT_INSTALMENT, not from a ratio.

    Parameters
    ----------
    installments : pandas.DataFrame
        Must contain SK_ID_PREV, SK_ID_CURR, DAYS_INSTALMENT, AMT_PAYMENT and
        AMT_INSTALMENT. The frame is not modified.
    days_window : int, default 1000
        Only rows with DAYS_INSTALMENT > -days_window are used.

    Returns
    -------
    pandas.DataFrame
        One row per SK_ID_CURR that has at least one row inside the window, with
        columns SK_ID_CURR and installment_payment_ratio_1000_mean_mean.
    """
    # Filter first and keep only the needed columns, so that the (potentially
    # very large) input frame is never copied in full.
    needed_columns = ['SK_ID_PREV', 'SK_ID_CURR', 'AMT_PAYMENT', 'AMT_INSTALMENT']
    in_window = installments['DAYS_INSTALMENT'] > -days_window
    recent = installments.loc[in_window, needed_columns].copy()
    recent['DIFF_PAYMENT'] = recent['AMT_PAYMENT'] - recent['AMT_INSTALMENT']

    # Mean payment difference per previous credit.
    mean_by_prev = recent.groupby('SK_ID_PREV', as_index=False)['DIFF_PAYMENT'].mean()

    # Each SK_ID_PREV is tied to one SK_ID_CURR; keep the first pairing seen.
    prev_to_curr = recent[['SK_ID_PREV', 'SK_ID_CURR']].drop_duplicates(subset='SK_ID_PREV')
    mean_by_prev = mean_by_prev.merge(prev_to_curr, on='SK_ID_PREV', how='left')

    # Mean of the per-credit means per client (a client may have several credits).
    mean_by_curr = (
        mean_by_prev
        .groupby('SK_ID_CURR', as_index=False)['DIFF_PAYMENT']
        .mean()
        .rename(columns={'DIFF_PAYMENT': 'installment_payment_ratio_1000_mean_mean'})
    )
    return mean_by_curr


# Compute the per-client feature, then attach it to every installment row of
# the matching client (left join on SK_ID_CURR).
installment_payment_ratio_1000_mean_mean = generate_installment_payment_ratio_1000_mean_mean(
    installment_payments
)
installment_payments = pd.merge(
    installment_payments,
    installment_payment_ratio_1000_mean_mean,
    how='left',
    on='SK_ID_CURR',
)

In [6]:
# Aggregations: statistics computed per client (SK_ID_CURR) for each feature.
# NOTE(review): NUM_CAMBIOS_PLAN_PAGOS_TOTAL and
# installment_payment_ratio_1000_mean_mean were merged in at client level, so
# they are constant within a client; their max and mean repeat that value and
# their sum scales with the number of rows — confirm all three are wanted.
agg = {
    'NUM_INSTALMENT_VERSION': ['max', 'nunique'],
    'PAYMENT_INSTALLMENT_RATIO': ['max', 'mean', 'sum'],
    'PAYMENT_DIFFERENCE': ['max', 'mean', 'sum'],
    'PAGO_DE_MAS': ['mean', 'sum'],
    'PAGO_DE_MENOS': ['mean', 'sum'],
    'DAYS_PAST_DUE': ['max', 'mean', 'sum'],
    'DAYS_BEFORE_DUE': ['max', 'mean', 'sum'],
    'DAYS_PAST_DUE_RATIO': ['max', 'mean', 'sum'],
    'DAYS_BEFORE_DUE_RATIO': ['max', 'mean', 'sum'],
    'PAGO_RETRASADO': ['mean', 'sum'],
    'PAGO_ANTICIPADO': ['mean', 'sum'],
    'NUM_CAMBIOS_PLAN_PAGOS_TOTAL': ['max', 'mean', 'sum'],
    'installment_payment_ratio_1000_mean_mean': ['max', 'mean', 'sum'],
}

installment_payments_agg = installment_payments.groupby('SK_ID_CURR').agg(agg)

# Flatten the (feature, statistic) column MultiIndex into names of the form
# FEATURE_STAT_(INSTALL_PAY).
installment_payments_agg.columns = pd.Index(
    [
        f"{feature}_{statistic.upper()}_(INSTALL_PAY)"
        for feature, statistic in installment_payments_agg.columns
    ]
)

In [7]:
# Add the number of installment rows recorded for each client
# (count of non-null NUM_INSTALMENT_NUMBER values per SK_ID_CURR).
pagos_realizados = (
    installment_payments
    .groupby('SK_ID_CURR', sort=False)['NUM_INSTALMENT_NUMBER']
    .count()
    .rename('NUM_PAGOS_REALIZADOS_(INSTALL_PAY)')
)
installment_payments_agg = installment_payments_agg.join(pagos_realizados, on='SK_ID_CURR', how='left')

# Release the temporary object; gc.collect() stays last so the cell still shows its return value.
del pagos_realizados
gc.collect()

0

In [8]:
# Add the instalment version of each client's last row after sorting by
# DAYS_INSTALMENT (ascending, keep='last').
# NOTE(review): the DAYS_INSTALMENT of that row is joined as well, under its raw
# name (no '_(INSTALL_PAY)' suffix) — confirm this extra column is intended.
last_version = (
    installment_payments[['SK_ID_CURR', 'DAYS_INSTALMENT', 'NUM_INSTALMENT_VERSION']]
    .sort_values(['SK_ID_CURR', 'DAYS_INSTALMENT'])
    .drop_duplicates('SK_ID_CURR', keep='last')
    .rename(columns={'NUM_INSTALMENT_VERSION': 'LAST_VERSION_(INSTALL_PAY)'})
    .set_index('SK_ID_CURR')
)
installment_payments_agg = installment_payments_agg.join(last_version, on='SK_ID_CURR', how='left')

# Release the temporary frame; gc.collect() stays last so the cell still shows its return value.
del last_version
gc.collect()

0

In [9]:
# Display the (rows, columns) shape of the per-client feature table.
installment_payments_agg.shape
# CSV export, currently disabled.
# NOTE(review): SK_ID_CURR is the index of installment_payments_agg (it is the
# groupby key), so writing with index=False would leave the client id out of the
# file — confirm before re-enabling.
#installment_payments_agg.to_csv(r'/home/yeray/TFG-Home-Credit-Default-Risk/JUPYTER_NOTEBOOKS/DATA/installment_payments_agg.csv', index = False)

(339587, 37)