In [60]:
import pandas as pd
import numpy as np
import sklearn as sk
import gc
from tqdm import tqdm_notebook as tqdm

import warnings
warnings.filterwarnings('ignore')


#Have all columns appear when dataframes are displayed.
pd.set_option('display.max_columns', None) 
# Have 100 rows appear when a dataframe is displayed
pd.set_option('display.max_rows', 500)
# Display dimensions whenever a dataframe is printed out.
pd.set_option('display.show_dimensions', True)

previous_application = pd.read_csv(r'/home/yeray/home-credit-default-risk/previous_application.csv')

In [61]:
print(previous_application['SK_ID_CURR'].dtype)

int64


In [62]:
#Primero, quitamos valores sin sentido
previous_application['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
previous_application['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
previous_application['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
previous_application['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
previous_application['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)

In [75]:
#Feature engineering
#RATIOS
previous_application['APP_CREDIT_RATIO'] = previous_application['AMT_APPLICATION'] / previous_application['AMT_CREDIT']
previous_application['APP_ANNUITY_RATIO'] = previous_application['AMT_APPLICATION'] / previous_application['AMT_ANNUITY']
previous_application['CREDIT_ANNUITY_RATIO'] = previous_application['AMT_CREDIT'] / previous_application['AMT_ANNUITY']
previous_application['DOWN_CREDIT_RATIO'] = previous_application['AMT_CREDIT'] / previous_application['AMT_DOWN_PAYMENT']
previous_application['DOWN_ANNUITY_RATIO'] = previous_application['AMT_ANNUITY'] / previous_application['AMT_DOWN_PAYMENT']

#DIFFERENCES
previous_application['DIFF_CREDIT_GOODS'] = previous_application['AMT_CREDIT'] - previous_application['AMT_GOODS_PRICE']
previous_application['DIFF_CREDIT_APP'] = previous_application['AMT_CREDIT'] - previous_application['AMT_APPLICATION']
previous_application['DIFF_APP_GOODS'] = previous_application['AMT_APPLICATION'] - previous_application['AMT_GOODS_PRICE']
previous_application['DIFF_DOWN_APP'] = previous_application['AMT_DOWN_PAYMENT'] - previous_application['AMT_APPLICATION']
previous_application['DIFF_DOWN_GOODS'] = previous_application['AMT_DOWN_PAYMENT'] - previous_application['AMT_GOODS_PRICE']
previous_application['DIFF_ANNUITY_GOODS'] = previous_application['AMT_ANNUITY'] - previous_application['AMT_GOODS_PRICE']
previous_application['DIFF_RATE_INTEREST'] = previous_application['RATE_INTEREST_PRIVILEGED'] - previous_application['RATE_INTEREST_PRIMARY']

#DIFFS Y RATIOS RELACIONADAS CON LOS DIAS
previous_application['DIFF_LAST_FIRST_DUE'] = previous_application['DAYS_LAST_DUE'] / previous_application['DAYS_FIRST_DUE']
previous_application['DIFF_LAST_DUE_1ST_VERSION'] = previous_application['DAYS_LAST_DUE'] / previous_application['DAYS_LAST_DUE_1ST_VERSION']
previous_application['DIFF_LAST_FIRST_DUE'] = previous_application['DAYS_LAST_DUE'] - previous_application['DAYS_FIRST_DUE']
previous_application['DIFF_LAST_TERMINATION'] = previous_application['DAYS_LAST_DUE'] - previous_application['DAYS_TERMINATION']
previous_application['DIFF_LAST_FIRST_DRAWING'] = previous_application['DAYS_LAST_DUE'] - previous_application['DAYS_FIRST_DRAWING']
previous_application['DIFF_FIRST_TERMINATION'] = previous_application['DAYS_FIRST_DUE'] - previous_application['DAYS_TERMINATION']
previous_application['DIFF_FIRST_DRAWING_TERMINATION'] = previous_application['DAYS_FIRST_DRAWING'] - previous_application['DAYS_TERMINATION']
previous_application['DIFF_TERMINATION_TO_DECISION'] = previous_application['DAYS_TERMINATION'] - previous_application['DAYS_DECISION']

#INTEREST
previous_application['INTEREST'] = previous_application['CNT_PAYMENT'] * previous_application['AMT_ANNUITY'] - previous_application['AMT_CREDIT']
previous_application['INTEREST_RATE'] = 2*12*previous_application['INTEREST']/(previous_application['AMT_CREDIT']*(previous_application['CNT_PAYMENT']+1))
previous_application['INTEREST_SHARE'] = previous_application['INTEREST'] / (previous_application['AMT_CREDIT'])

previous_application.replace([np.inf, -np.inf], np.nan, inplace=True)
all_nan_cols = [c for c in previous_application.columns if previous_application[c].isnull().all()]
if len(all_nan_cols)>0:
    print("Columnas con todo nulos: ", all_nan_cols)
previous_application.to_csv(r'/home/yeray/TFG-Home-Credit-Default-Risk/JUPYTER_NOTEBOOKS/DATA/prueba.csv')
print(previous_application['DIFF_APP_GOODS'])

0          0.0
1          0.0
2          0.0
3          0.0
4          0.0
          ... 
1670209    0.0
1670210    0.0
1670211    0.0
1670212    0.0
1670213    0.0
Name: DIFF_APP_GOODS, Length: 1670214, dtype: float64


In [70]:
previous_application_agg = previous_application.copy()

#Agregaciones    
num_aggregations = {
    #Primero de las variables base
    'AMT_ANNUITY': ['min', 'max', 'mean'],
    'AMT_APPLICATION': ['min', 'max', 'mean'],
    'AMT_CREDIT': ['min', 'max', 'mean'],
    'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
    'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'RATE_INTEREST_PRIMARY': ['min', 'max', 'mean'],
    'RATE_INTEREST_PRIVILEGED': ['min', 'max', 'mean'],
    'DAYS_DECISION': ['min', 'max', 'mean'],
    'DAYS_FIRST_DRAWING': ['min', 'max', 'mean'],
    'DAYS_FIRST_DUE': ['min', 'max', 'mean'],
    'DAYS_LAST_DUE_1ST_VERSION': ['min', 'max', 'mean'],
    'DAYS_LAST_DUE': ['min', 'max', 'mean'],
    'DAYS_TERMINATION': ['min', 'max', 'mean'],
    'CNT_PAYMENT': ['min', 'max', 'mean'],
    #Agregamos las variables nuevas
    'APP_CREDIT_RATIO': ['min', 'max', 'mean'],
    'APP_ANNUITY_RATIO': ['min', 'max', 'mean'],
    'CREDIT_ANNUITY_RATIO': ['min', 'max', 'mean'],
    'DOWN_CREDIT_RATIO': ['min', 'max', 'mean'],
    'DOWN_ANNUITY_RATIO': ['min', 'max', 'mean'],
    'DIFF_CREDIT_GOODS': ['min', 'max', 'mean'],
    'DIFF_CREDIT_APP': ['min', 'max', 'mean'],
    'DIFF_APP_GOODS': ['min', 'max', 'mean'],
    'DIFF_DOWN_APP': ['min', 'max', 'mean'],
    'DIFF_DOWN_GOODS': ['min', 'max', 'mean'],
    'INTEREST': ['min', 'max', 'mean'],
    'INTEREST_RATE': ['min', 'max', 'mean'],
    'INTEREST_SHARE': ['min', 'max', 'mean'],
    'DIFF_RATE_INTEREST': ['min', 'max', 'mean'],
    'DIFF_LAST_FIRST_DUE': ['min', 'max', 'mean'],
    'DIFF_LAST_DUE_1ST_VERSION': ['min', 'max', 'mean'],
    'DIFF_LAST_FIRST_DUE': ['min', 'max', 'mean'],
    'DIFF_LAST_TERMINATION': ['min', 'max', 'mean'],
    'DIFF_LAST_FIRST_DRAWING': ['min', 'max', 'mean'],
    'DIFF_FIRST_TERMINATION': ['min', 'max', 'mean'],
    'DIFF_FIRST_DRAWING_TERMINATION': ['min', 'max', 'mean'],
    'DIFF_TERMINATION_TO_DECISION': ['min', 'max', 'mean']             
}

categorical_features = ['NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START',
       'FLAG_LAST_APPL_PER_CONTRACT', 'NAME_CASH_LOAN_PURPOSE',
       'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON',
       'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE', 'NAME_GOODS_CATEGORY',
       'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE',
       'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION']

In [71]:
import gc
prev_app_df = previous_application.copy()
# Convertimos variables categóricas a códigos numéricos
previous_application_agg[categorical_features] = previous_application_agg[categorical_features].apply(lambda x: x.astype('category'))
cat_aggreagation = {}
for cat in categorical_features:
    previous_application_agg[cat] = previous_application_agg.loc[:, cat].cat.codes
    cat_aggreagation[cat] = ['mean']

# Agregamos variables numéricas y categóricas por SK_ID_CURR
previous_application_agg_df = previous_application_agg.groupby('SK_ID_CURR').agg({**numerical_agg, **cat_aggreagation})
previous_application_agg_df.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in previous_application_agg_df.columns.tolist()])

previous_application_agg_df.reset_index(inplace=True)
# Eliminamos previous_application_agg SOLO después de asegurarnos de que no lo necesitamos más
#del previous_application_agg
#gc.collect()

# ---- Agregaciones separadas para solicitudes aprobadas y rechazadas ----
previous_application_approved = previous_application[previous_application['NAME_CONTRACT_STATUS'] == 'Approved'].groupby('SK_ID_CURR').agg(numerical_agg)
cols = previous_application_approved.columns.tolist()
previous_application_approved.columns = pd.Index(['APPROVED_' + e[0] + "_" + e[1].upper() for e in cols])

# Unimos con previous_application_agg_df
previous_application_agg_df = previous_application_agg_df.join(previous_application_approved, how='left', on='SK_ID_CURR')

# Eliminamos previous_application_approved después de su uso
del previous_application_approved
gc.collect()

# Agregamos los datos de solicitudes rechazadas
previous_application_refused = previous_application[previous_application['NAME_CONTRACT_STATUS'] == 'Refused'].groupby('SK_ID_CURR').agg(numerical_agg)
previous_application_refused.columns = pd.Index(['REFUSED_' + e[0] + "_" + e[1].upper() for e in previous_application_refused.columns.tolist()])


# Unimos con previous_application_agg_df
previous_application_agg_df = previous_application_agg_df.join(previous_application_refused, how='left', on='SK_ID_CURR')
# Debug manual
refused_clients= set(previous_application_refused['SK_ID_CURR'])
nan_mask = previous_application_agg_df['REFUSED_AMT_CREDIT_MIN'].isnull()
no_refused_mask = ~previous_application_agg_df['SK_ID_CURR'].isin(refused_clients)
clients_sin_refused = final_df[nan_mask & no_refused_mask]
print("Estos clientes no tienen info en 'REFUSED_' => nulos porque no tenían previas 'Refused':")
print(clients_sin_refused[['SK_ID_CURR','REFUSED_AMT_CREDIT_MIN']])

# Eliminamos previous_application_refused después de su uso
del previous_application_refused
gc.collect()

# ---- Cálculo de Ratios entre solicitudes aprobadas y rechazadas ----
for e in cols:
    approved_col = 'APPROVED_' + e[0] + "_" + e[1].upper()
    refused_col = 'REFUSED_' + e[0] + "_" + e[1].upper()
    ratio_col = 'RATIO_APPROVED_TO_REFUSED_' + e[0] + "_" + e[1].upper()

    # Manejar posibles divisiones por cero
    previous_application_agg_df[ratio_col] = previous_application_agg_df[approved_col] / (previous_application_agg_df[refused_col] + 1e-6)


In [None]:
print(previous_application_agg_df.dtypes)

In [72]:
def imputar_valores_nulos(app_train):
    variables_continuas = app_train.select_dtypes(include=['float64','int64','int8','int32']).columns 
    for col in variables_continuas:
        app_train[col] = app_train[col].fillna(app_train[col].mean())
    variables_categoricas = app_train.select_dtypes(include=['object', 'category']).columns
    for col in variables_categoricas:
        app_train[col] = app_train[col].fillna(app_train[col].mode()[0])

    return app_train

previous_application_agg_df_nonulos = imputar_valores_nulos(prev_app_agg_final)

print("Valores nulos en app_train después de la imputación:")
print(previous_application_agg_df_nonulos.isnull().sum().sum())

Valores nulos en app_train después de la imputación:
30497130


In [73]:
null_cols = previous_application_agg_df_nonulos.columns[previous_application_agg_df_nonulos.isnull().any()]
print("Columnas con nulos tras la imputación:", null_cols.tolist())
# Cuántos nulos en cada una
#print(previous_application_agg_df_nonulos[null_cols].isnull().sum())
print(previous_application_agg_df_nonulos[null_cols].dtypes)

Columnas con nulos tras la imputación: ['REFUSED_RATE_INTEREST_PRIMARY_MIN_(PREV_APP)', 'REFUSED_RATE_INTEREST_PRIMARY_MAX_(PREV_APP)', 'REFUSED_RATE_INTEREST_PRIMARY_MEAN_(PREV_APP)', 'REFUSED_RATE_INTEREST_PRIVILEGED_MIN_(PREV_APP)', 'REFUSED_RATE_INTEREST_PRIVILEGED_MAX_(PREV_APP)', 'REFUSED_RATE_INTEREST_PRIVILEGED_MEAN_(PREV_APP)', 'REFUSED_DAYS_FIRST_DRAWING_MIN_(PREV_APP)', 'REFUSED_DAYS_FIRST_DRAWING_MAX_(PREV_APP)', 'REFUSED_DAYS_FIRST_DRAWING_MEAN_(PREV_APP)', 'REFUSED_DAYS_FIRST_DUE_MIN_(PREV_APP)', 'REFUSED_DAYS_FIRST_DUE_MAX_(PREV_APP)', 'REFUSED_DAYS_FIRST_DUE_MEAN_(PREV_APP)', 'REFUSED_DAYS_LAST_DUE_1ST_VERSION_MIN_(PREV_APP)', 'REFUSED_DAYS_LAST_DUE_1ST_VERSION_MAX_(PREV_APP)', 'REFUSED_DAYS_LAST_DUE_1ST_VERSION_MEAN_(PREV_APP)', 'REFUSED_DAYS_LAST_DUE_MIN_(PREV_APP)', 'REFUSED_DAYS_LAST_DUE_MAX_(PREV_APP)', 'REFUSED_DAYS_LAST_DUE_MEAN_(PREV_APP)', 'REFUSED_DAYS_TERMINATION_MIN_(PREV_APP)', 'REFUSED_DAYS_TERMINATION_MAX_(PREV_APP)', 'REFUSED_DAYS_TERMINATION_MEAN_(PREV_

In [22]:

previous_application_agg_df.to_csv(r'/home/yeray/TFG-Home-Credit-Default-Risk/JUPYTER_NOTEBOOKS/DATA/prev_app.csv')

In [None]:
print(previous_application_agg_df.dtypes)