# Preprocessing

In [156]:
from pathlib import Path

import pandas as pd
from lightgbm import LGBMRegressor, LGBMClassifier

Loading the data

In [157]:
def shape(df):
    """Describe the dimensions of ``df`` as a short sentence.

    Parameters
    ----------
    df : object exposing ``.shape`` with at least two entries
        Typically a pandas DataFrame.

    Returns
    -------
    str
        Text of the form ``"<rows> rows and <columns> columns"``.
    """
    return "{} rows and {} columns".format(df.shape[0], df.shape[1])

In [158]:
# Read the raw training applications; the path is relative to the notebook's working directory.
df = pd.read_csv('../../data/raw/application_train.csv')
shape(df)

'307511 rows and 122 columns'

Let's see the first rows of the dataset.


In [159]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


We are going to use SK_ID_CURR as the index of the dataset.

In [160]:
# Use the applicant id as the row index. The guard keeps this cell re-runnable:
# once SK_ID_CURR has been moved into the index it is no longer a column, and a
# second set_index call would raise a KeyError.
if df.index.name != 'SK_ID_CURR':
    # verify_integrity=True fails fast on duplicate ids; later steps
    # (label-based .loc assignment, join) align rows by index label.
    df = df.set_index('SK_ID_CURR', verify_integrity=True)
df.head()

Unnamed: 0_level_0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,...,0,0,0,0,,,,,,
100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


We start by converting boolean columns that are stored as strings (Y/N, Yes/No) or as integers (1/0) into proper boolean values.

In [161]:
def format_boolean_columns(df, boolean_columns, true_label='Y', false_label='N'):
    """Convert label-encoded flag columns of ``df`` to True/False.

    The conversion is idempotent: values that are already ``True``/``False``
    are kept, so re-running a notebook cell that calls this function does not
    wipe previously converted columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns. The listed columns are replaced in place.
    boolean_columns : str or iterable of str
        A single column name or several column names to convert.
    true_label, false_label : hashable
        Raw values that stand for True and False respectively.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.

    Notes
    -----
    Values matching neither label (including missing values) become NaN.
    """
    if isinstance(boolean_columns, str):
        boolean_columns = [boolean_columns]

    mapping = {true_label: True, false_label: False}
    # setdefault (rather than plain assignment) so that explicit labels equal
    # to a boolean key, e.g. 1/0, keep their caller-specified meaning and key type.
    mapping.setdefault(True, True)
    mapping.setdefault(False, False)

    for col in boolean_columns:
        df[col] = df[col].map(mapping)

    return df

In [162]:
# Flags stored as "Y"/"N".
y_n_cols = ["FLAG_OWN_CAR", "FLAG_OWN_REALTY"]
df = format_boolean_columns(df, y_n_cols, true_label="Y", false_label="N")

# Flag stored as "Yes"/"No".
df = format_boolean_columns(df, "EMERGENCYSTATE_MODE", true_label="Yes", false_label="No")

# Flags stored as 1/0; astype(bool) turns every non-zero value into True.
cols_1_0 = [
    "FLAG_MOBIL",
    "FLAG_EMP_PHONE",
    "FLAG_WORK_PHONE",
    "FLAG_CONT_MOBILE",
    "FLAG_PHONE",
    "FLAG_EMAIL",
    "REG_REGION_NOT_LIVE_REGION",
    "REG_REGION_NOT_WORK_REGION",
    "LIVE_REGION_NOT_WORK_REGION",
    "REG_CITY_NOT_LIVE_CITY",
    "REG_CITY_NOT_WORK_CITY",
    "LIVE_CITY_NOT_WORK_CITY",
    # FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21
    *[f"FLAG_DOCUMENT_{doc_number}" for doc_number in range(2, 22)],
]
df[cols_1_0] = df[cols_1_0].astype(bool)

We are going to explore the missing values percentages of the columns.

In [163]:
def missing_values_percentage(df):
    """Report the percentage of missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value,
        indexed by column name, with a single ``'Missing Percentage'`` column
        sorted from highest to lowest.
    """
    report = pd.DataFrame(df.isnull().mean() * 100, columns=['Missing Percentage'])
    has_missing = report['Missing Percentage'] > 0
    return report[has_missing].sort_values(by='Missing Percentage', ascending=False)

In [164]:
# Per-column share of missing values, highest first; displayed as the cell output.
missing_values_perc = missing_values_percentage(df)
missing_values_perc

Unnamed: 0,Missing Percentage
COMMONAREA_AVG,69.872297
COMMONAREA_MEDI,69.872297
COMMONAREA_MODE,69.872297
NONLIVINGAPARTMENTS_AVG,69.432963
NONLIVINGAPARTMENTS_MODE,69.432963
...,...
EXT_SOURCE_2,0.214626
AMT_GOODS_PRICE,0.090403
AMT_ANNUITY,0.003902
CNT_FAM_MEMBERS,0.000650


We are going to drop the columns with more than 45% of missing values.

In [165]:
def drop_columns(df, missing_df, threshold = 50):
    """Remove the columns whose missing-value percentage exceeds ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune. It is not modified; a new frame is returned.
    missing_df : pandas.DataFrame
        Report indexed by column name with a ``'Missing Percentage'`` column.
    threshold : number, default 50
        Columns with a strictly greater percentage are dropped.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the columns above the threshold.
    """
    too_sparse = missing_df['Missing Percentage'] > threshold
    return df.drop(columns=missing_df[too_sparse].index)

In [166]:
# Drop columns above the 45% cut-off (passed explicitly; drop_columns defaults to 50),
# then recompute the report so it only lists the columns that remain.
df = drop_columns(df,missing_values_perc,45)
missing_values_perc = missing_values_percentage(df)
missing_values_perc

Unnamed: 0,Missing Percentage
OCCUPATION_TYPE,31.345545
EXT_SOURCE_3,19.825307
AMT_REQ_CREDIT_BUREAU_WEEK,13.501631
AMT_REQ_CREDIT_BUREAU_MON,13.501631
AMT_REQ_CREDIT_BUREAU_YEAR,13.501631
AMT_REQ_CREDIT_BUREAU_QRT,13.501631
AMT_REQ_CREDIT_BUREAU_HOUR,13.501631
AMT_REQ_CREDIT_BUREAU_DAY,13.501631
NAME_TYPE_SUITE,0.420148
OBS_30_CNT_SOCIAL_CIRCLE,0.332021


We are going to check the remaining shape of the data.


In [167]:
# Confirm how many columns are left after the drop.
shape(df)

'307511 rows and 72 columns'

We are going to fill missing values with 0s in the following columns: 

AMT_REQ_CREDIT_BUREAU_HOUR, AMT_REQ_CREDIT_BUREAU_DAY, AMT_REQ_CREDIT_BUREAU_WEEK, AMT_REQ_CREDIT_BUREAU_MON, AMT_REQ_CREDIT_BUREAU_QRT, AMT_REQ_CREDIT_BUREAU_YEAR

This decision is made because these columns are related to the number of enquiries to the Credit Bureau and it is possible that the missing values are due to the fact that the client has not made any requests.

In [168]:
# AMT_REQ_CREDIT_BUREAU_<period> columns: a missing count is treated as "no enquiries".
amt_req_credit_bureau_cols = [
    f"AMT_REQ_CREDIT_BUREAU_{period}"
    for period in ("HOUR", "DAY", "WEEK", "MON", "QRT", "YEAR")
]
df[amt_req_credit_bureau_cols] = df[amt_req_credit_bureau_cols].fillna(0)

We are going to also fill missing values with 0s in the following columns:

OBS_30_CNT_SOCIAL_CIRCLE, DEF_30_CNT_SOCIAL_CIRCLE, OBS_60_CNT_SOCIAL_CIRCLE, DEF_60_CNT_SOCIAL_CIRCLE

This decision is made because these columns are related to the number of observations of clients' social surroundings and it is possible that the missing values are due to the fact that the client has not made any observations.

In [169]:
# OBS/DEF social-circle counts for the 30 and 60 variants: missing is treated as 0.
social_circle_cols = [
    f"{kind}_{days}_CNT_SOCIAL_CIRCLE"
    for days in (30, 60)
    for kind in ("OBS", "DEF")
]
df[social_circle_cols] = df[social_circle_cols].fillna(0)

We are going to fill missing values in the column CNT_FAM_MEMBERS with 1. This decision is made because, if there is no data about the number of family members, we can assume that the client lives alone.


In [170]:
# Missing family size is assumed to mean a single-person household.
df["CNT_FAM_MEMBERS"] = df["CNT_FAM_MEMBERS"].fillna(1)

We are going to fill missing values in the column NAME_TYPE_SUITE with "Unaccompanied". This decision is made because, if there is no data about who is accompanying the client, we can assume that the client is unaccompanied.

In [171]:
# No companion recorded is assumed to mean the client came alone.
df["NAME_TYPE_SUITE"] = df["NAME_TYPE_SUITE"].fillna("Unaccompanied")

Finally, we are going to check if there are missing values in the TARGET column. If there are, we are going to drop the rows with missing values in the TARGET column.

In [172]:
# Rows without a label cannot be used for supervised training, so drop them.
missing_target = df["TARGET"].isnull().sum()
print(f"Missing values in the TARGET column: {missing_target}")
if missing_target > 0:
    # Keep the SK_ID_CURR index as it is: resetting it with drop=True would
    # throw the applicant ids away.
    df = df.dropna(subset=["TARGET"])
    

Missing values in the TARGET column: 0


For the rest of missing values, we are going to impute them using LGBMRegressor for numerical columns and LGBMClassifier for categorical columns.

We chose these models because they are fast and can handle missing values in the training data.

In [173]:
def splits_creation(data,col):
    """Split ``data`` into training and prediction sets for imputing ``col``.

    Rows flagged with ``is_nan == 0`` form the training set and rows flagged
    with ``is_nan == 1`` form the set to predict.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``col`` and an ``'is_nan'`` indicator column.
    col : str
        Name of the column being imputed (the model target).

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test)``; the feature frames exclude both
        ``col`` and ``'is_nan'``.
    """
    flag = data['is_nan']
    non_features = [col, "is_nan"]
    observed = data[flag == 0]
    to_predict = data[flag == 1]
    return (
        observed.drop(non_features, axis=1),
        observed[col],
        to_predict.drop(non_features, axis=1),
    )

def factorize_categoricals(df):
    """Replace every object-dtype column of ``df`` with integer codes.

    Codes come from ``pd.factorize``, which assigns them in order of first
    appearance and encodes missing values as -1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. Its object columns are overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    object_columns = df.select_dtypes(include='object').columns
    for name in object_columns:
        codes, _uniques = pd.factorize(df[name])
        df[name] = codes
    return df

def train_predict(mode, X_train, y_train, X_test):
    """Fit a LightGBM model on the training split and predict the test split.

    Parameters
    ----------
    mode : str
        ``"regression"`` selects ``LGBMRegressor`` with default settings; any
        other value selects ``LGBMClassifier(num_leaves=100, max_depth=-1)``.
    X_train, y_train
        Features and target used for fitting.
    X_test
        Features of the rows to predict.

    Returns
    -------
    The output of the fitted model's ``predict(X_test)``.
    """
    estimator = (
        LGBMRegressor()
        if mode == "regression"
        else LGBMClassifier(num_leaves=100,max_depth = -1)
    )
    estimator.fit(X_train, y_train)
    return estimator.predict(X_test)

def impute_missing_values(data, cols_list, mode):
    """Fill the missing values of the listed columns with model predictions.

    For each column of ``cols_list`` that has missing values, a model (see
    ``train_predict``) is fitted on the rows where the column is observed,
    using all other columns as features, and its predictions replace the
    missing entries.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is left untouched; no helper column is added to it.
    cols_list : iterable of str
        Candidate columns; those without missing values are skipped.
    mode : str
        Forwarded to ``train_predict`` (``"regression"`` or classification).

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the missing values of the processed columns
        replaced by predictions.
    """
    new_df = data.copy()
    # Encode categoricals once, on a private copy. Missing-value detection below
    # always reads the raw `data`: factorize turns NaN into -1, so checking the
    # encoded frame would make object columns look complete and skip them.
    encoded = factorize_categoricals(data.copy())
    for col in cols_list:
        missing_mask = data[col].isna()
        if not missing_mask.any():
            continue
        nan_ixs = data.index[missing_mask.to_numpy()]
        # Features: every other column (encoded) plus the indicator that
        # splits_creation uses to separate observed rows from rows to predict.
        features = encoded.drop(columns=[col])
        features['is_nan'] = missing_mask.astype(int)
        # The target is taken from the raw frame so it keeps its original values.
        working = features.join(data[col])
        X_train, y_train, X_test = splits_creation(working, col)
        y_pred = train_predict(mode, X_train, y_train, X_test)
        new_df.loc[nan_ixs, col] = y_pred
    return new_df

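Before imputation, we need to separate the columns into numerical and categorical ones. Numeric columns that hold counts, ratings or hours (e.g. CNT_CHILDREN, REGION_RATING_CLIENT, HOUR_APPR_PROCESS_START) are removed from the numerical list, so they are treated as categorical together with every other non-numerical column.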

In [174]:
# Set the label aside so it is neither imputed nor used as a feature.
y = df["TARGET"]
df = df.drop(columns='TARGET')

# Numerical = numeric dtype minus the count / rating / hour columns listed here.
numerical_df = df.select_dtypes(include='number').drop(
    columns=[
        'CNT_CHILDREN',
        'CNT_FAM_MEMBERS',
        'REGION_RATING_CLIENT',
        'REGION_RATING_CLIENT_W_CITY',
        'HOUR_APPR_PROCESS_START',
        'OBS_30_CNT_SOCIAL_CIRCLE',
        'DEF_30_CNT_SOCIAL_CIRCLE',
        'OBS_60_CNT_SOCIAL_CIRCLE',
        'DEF_60_CNT_SOCIAL_CIRCLE',
    ]
)
numerical_cols = numerical_df.columns

# Everything that is not numerical is handled as categorical.
categorical_cols = df.columns.difference(numerical_cols)

In [175]:
# Impute the numerical columns first; the categorical pass then runs on the
# frame returned by the first call, i.e. with the numerical gaps already filled.
df = impute_missing_values(df, numerical_cols, "regression")
df = impute_missing_values(df, categorical_cols, "classification")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014736 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2911
[LightGBM] [Info] Number of data points in the train set: 307499, number of used features: 66
[LightGBM] [Info] Start training from score 27108.573909
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018367 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2912
[LightGBM] [Info] Number of data points in the train set: 307233, number of used features: 65
[LightGBM] [Info] Start training from score 538396.207429
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014771 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory i

In [176]:
# Sanity check: total number of missing cells left in the frame.
print(f"Missing values in the dataset: {df.isnull().sum().sum()}")

Missing values in the dataset: 0


In [177]:
# Split the imputed frame into the two column groups. TARGET is in neither:
# it was removed from df before the groups were defined.
categorical_df = df[categorical_cols]
numerical_df = df[numerical_cols]

We are going to save the data. We are going to save 3 files: df (complete df), numerical_df and categorical_df.

We are going to save the data using parquet format because it takes up less space and it is faster to read and write.

We need to rejoin the target column to the dataset.

In [178]:
# Re-attach TARGET; join aligns on the index, which y shares with df.
df = df.join(y)
df.head()

Unnamed: 0_level_0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,...,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,TARGET
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100002,Cash loans,M,False,True,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,...,False,False,False,0.0,0.0,0.0,0.0,0.0,1.0,1
100003,Cash loans,F,False,False,0,270000.0,1293502.5,35698.5,1129500.0,Family,...,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0
100004,Revolving loans,M,True,True,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,...,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0
100006,Cash loans,F,False,True,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,...,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0
100007,Cash loans,M,False,True,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,...,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0


Saving the data

In [179]:
# Create the output folder if needed, so a missing directory does not make the
# run fail at the very end, after the slow imputation step.
processed_dir = Path('../../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

df.to_parquet(processed_dir / 'df.parquet')
numerical_df.to_parquet(processed_dir / 'numerical_df.parquet')
categorical_df.to_parquet(processed_dir / 'categorical_df.parquet')