In [15]:
import pandas as pd
import numpy as np
import sklearn as sk
import gc
from tqdm import tqdm_notebook as tqdm

import warnings
warnings.filterwarnings('ignore')


# DataFrame display settings, applied in a single call as (option, value) pairs.
pd.set_option(
    'display.max_columns', None,      # never truncate the column list
    'display.max_rows', 500,          # show up to 500 rows before truncating
    'display.show_dimensions', True,  # always print the dimensions footer
)


# Load the raw POS_CASH_balance table, then take a separate working copy so
# the frame read from disk is left untouched.
pos_cash = pd.read_csv(r'/home/yeray/home-credit-default-risk/POS_CASH_balance.csv')
pos_cash_agg = pos_cash.copy()

In [18]:
def engineer_pos_cash_features(pos_cash: pd.DataFrame) -> pd.DataFrame:
    """Aggregate POS_CASH_balance records into one feature row per client.

    Parameters
    ----------
    pos_cash : pandas.DataFrame
        Balance records. The columns read here are SK_ID_PREV, SK_ID_CURR,
        MONTHS_BALANCE, CNT_INSTALMENT, CNT_INSTALMENT_FUTURE,
        NAME_CONTRACT_STATUS, SK_DPD and SK_DPD_DEF. The input is copied and
        never modified.

    Returns
    -------
    pandas.DataFrame
        One row per SK_ID_CURR with, in order:
        - SK_ID_CURR as the first column;
        - ``POS_CASH_<column>_<STAT>`` aggregates of the raw and derived columns;
        - ``COUNT_POS_CASH_LOANS_(POS_CASH)``: number of distinct SK_ID_PREV;
        - ``NUMBER_CONTRACTS_MOST_RECENTLY_<STATUS>_CAT_(POS_CASH)`` for
          Completed and Signed: number of contracts whose record with the
          highest MONTHS_BALANCE carries that status.
    """
    df = pos_cash.copy()

    # Row-level ratio / difference features.
    df['DAYS_PAST_DUE_TO_CNT_INSTALMENT_RATIO'] = df['SK_DPD'] / df['CNT_INSTALMENT']
    df['DAYS_PAST_DUE_TO_CNT_INSTALMENT_FUTURE_RATIO'] = df['SK_DPD'] / df['CNT_INSTALMENT_FUTURE']
    df['SK_DPD_DEF_TO_CNT_INSTALMENT_RATIO'] = df['SK_DPD_DEF'] / df['CNT_INSTALMENT']
    df['SK_DPD_DEF_TO_CNT_INSTALMENT_FUTURE_RATIO'] = df['SK_DPD_DEF'] / df['CNT_INSTALMENT_FUTURE']

    df['SK_DPD_TO_SK_DPD_DEF_RATIO'] = df['SK_DPD'] / df['SK_DPD_DEF']
    df['SK_DPD_TO_SK_DPD_DEF_DIFF'] = df['SK_DPD'] - df['SK_DPD_DEF']

    df['CNT_INSTALMENT_TO_CNT_INSTALMENT_FUTURE_RATIO'] = df['CNT_INSTALMENT'] / df['CNT_INSTALMENT_FUTURE']
    df['CNT_INSTALMENT_FUTURE_TO_CNT_INSTALMENT_DIFF'] = df['CNT_INSTALMENT_FUTURE'] - df['CNT_INSTALMENT']

    # A zero denominator yields +/-inf, which would turn the per-client max and
    # mean into inf as well. Treat those rows as missing so they are skipped.
    ratio_cols = [
        'DAYS_PAST_DUE_TO_CNT_INSTALMENT_RATIO',
        'DAYS_PAST_DUE_TO_CNT_INSTALMENT_FUTURE_RATIO',
        'SK_DPD_DEF_TO_CNT_INSTALMENT_RATIO',
        'SK_DPD_DEF_TO_CNT_INSTALMENT_FUTURE_RATIO',
        'SK_DPD_TO_SK_DPD_DEF_RATIO',
        'CNT_INSTALMENT_TO_CNT_INSTALMENT_FUTURE_RATIO',
    ]
    df[ratio_cols] = df[ratio_cols].replace([np.inf, -np.inf], np.nan)

    # Aggregations per client.
    agg = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean', 'size'],
        'SK_DPD_DEF': ['max', 'mean', 'sum'],
        'CNT_INSTALMENT': ['max', 'mean', 'sum'],
        'CNT_INSTALMENT_FUTURE': ['max', 'mean', 'sum'],
        'NAME_CONTRACT_STATUS': ['max', 'mean'],

        # Derived features
        'DAYS_PAST_DUE_TO_CNT_INSTALMENT_RATIO': ['max', 'mean'],
        'DAYS_PAST_DUE_TO_CNT_INSTALMENT_FUTURE_RATIO': ['max', 'mean'],
        'SK_DPD_DEF_TO_CNT_INSTALMENT_RATIO': ['max', 'mean'],
        'SK_DPD_DEF_TO_CNT_INSTALMENT_FUTURE_RATIO': ['max', 'mean'],
        'SK_DPD_TO_SK_DPD_DEF_RATIO': ['max', 'mean'],
        'SK_DPD_TO_SK_DPD_DEF_DIFF': ['max', 'mean'],
        'CNT_INSTALMENT_TO_CNT_INSTALMENT_FUTURE_RATIO': ['max', 'mean'],
        'CNT_INSTALMENT_FUTURE_TO_CNT_INSTALMENT_DIFF': ['max', 'mean'],
    }

    # Snapshot the latest record of every previous contract BEFORE the status
    # column is turned into integer codes: the status counts below compare
    # against the original labels, and comparing codes with 'Completed' /
    # 'Signed' would never match (every count would be 0).
    latest_status = (
        df[['SK_ID_CURR', 'SK_ID_PREV', 'MONTHS_BALANCE', 'NAME_CONTRACT_STATUS']]
        .sort_values(['SK_ID_PREV', 'MONTHS_BALANCE'])
        .drop_duplicates('SK_ID_PREV', keep='last')
    )

    # Integer category codes so the status column can go through max / mean.
    df['NAME_CONTRACT_STATUS'] = df['NAME_CONTRACT_STATUS'].astype('category').cat.codes

    df_agg = df.groupby('SK_ID_CURR').agg(agg)
    df_agg.columns = [
        f"POS_CASH_{col}_{stat.upper()}"
        for col, stat in df_agg.columns
    ]
    # Expose SK_ID_CURR as a regular column so every merge below is a plain
    # column-on-column join.
    df_agg = df_agg.reset_index()

    # Number of distinct previous contracts per client.
    count_loans = (
        df.groupby('SK_ID_CURR')['SK_ID_PREV']
          .nunique()
          .reset_index()
          .rename(columns={'SK_ID_PREV': 'COUNT_POS_CASH_LOANS_(POS_CASH)'})
    )
    df_agg = df_agg.merge(count_loans, on='SK_ID_CURR', how='left')

    # Per client, count the contracts whose latest record has each status.
    recent_statuses = ['Completed', 'Signed']
    for status in recent_statuses:
        col_name = f"NUMBER_CONTRACTS_MOST_RECENTLY_{status.upper()}_CAT_(POS_CASH)"
        status_count = (
            (latest_status['NAME_CONTRACT_STATUS'] == status)
            .astype(int)
            .groupby(latest_status['SK_ID_CURR'])
            .sum()
            .rename(col_name)
            .reset_index()
        )
        df_agg = df_agg.merge(status_count, on='SK_ID_CURR', how='left')

    return df_agg




In [19]:
# Build the per-client POS_CASH feature table from the working copy, then
# print a summary (columns, non-null counts, dtypes) of the result.
pos_cash_agg_final = engineer_pos_cash_features(pos_cash_agg)
pos_cash_agg_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337252 entries, 0 to 337251
Data columns (total 37 columns):
 #   Column                                                       Non-Null Count   Dtype  
---  ------                                                       --------------   -----  
 0   SK_ID_CURR                                                   337252 non-null  int64  
 1   POS_CASH_MONTHS_BALANCE_MAX                                  337252 non-null  int64  
 2   POS_CASH_MONTHS_BALANCE_MEAN                                 337252 non-null  float64
 3   POS_CASH_MONTHS_BALANCE_SIZE                                 337252 non-null  int64  
 4   POS_CASH_SK_DPD_MAX                                          337252 non-null  int64  
 5   POS_CASH_SK_DPD_MEAN                                         337252 non-null  float64
 6   POS_CASH_SK_DPD_SIZE                                         337252 non-null  int64  
 7   POS_CASH_SK_DPD_DEF_MAX                                      3372

In [None]:
# Write the aggregated features to CSV; index=False leaves the DataFrame index
# out of the file.
# NOTE(review): the destination is an absolute, machine-specific path — confirm
# it exists on the machine running this notebook.
pos_cash_agg_final.to_csv(r'/home/yeray/TFG-Home-Credit-Default-Risk/JUPYTER_NOTEBOOKS/DATA/pos_cash_agg_final.csv', index = False)