# Data Preprocessing

## Importing Libraries

We will start by importing the necessary libraries for this notebook.

In [None]:
import pandas as pd
import numpy as np
import category_encoders as ce
from scipy.signal import ShortTimeFFT
from sklearn.preprocessing import OrdinalEncoder,TargetEncoder,LabelEncoder
from src.preprocessing import *

## Loading Data

In [None]:
df = cargar_datos('../data/raw/application_train.csv')
desc = forma(df)
desc

Let's see the first rows of the dataset.

In [None]:
df.head()

We are going to store the target conlumn in a separate variable.

In [None]:
y = df["TARGET"]
df = df.drop("TARGET", axis=1)

We are going to convert SK_ID_CURR to string and store it in a separate variable.

In [None]:
df["SK_ID_CURR"] = df["SK_ID_CURR"].astype(str)
sk_id_curr = df["SK_ID_CURR"]
df = df.drop("SK_ID_CURR", axis=1)

We are going to start converting wrong formatted boolean columns to the correct format.

In [None]:
y_n_cols = ["FLAG_OWN_CAR","FLAG_OWN_REALTY"]
df = format_boolean_columns(df,y_n_cols,true_label="Y",false_label="N")
df = format_boolean_columns(df,"EMERGENCYSTATE_MODE",true_label="Yes",false_label="No")
cols_1_0 = ["FLAG_MOBIL","FLAG_EMP_PHONE","FLAG_WORK_PHONE","FLAG_CONT_MOBILE","FLAG_PHONE","FLAG_EMAIL","REG_REGION_NOT_LIVE_REGION","REG_REGION_NOT_WORK_REGION","LIVE_REGION_NOT_WORK_REGION","REG_CITY_NOT_LIVE_CITY","REG_CITY_NOT_WORK_CITY","LIVE_CITY_NOT_WORK_CITY","FLAG_DOCUMENT_2", "FLAG_DOCUMENT_3", "FLAG_DOCUMENT_4", "FLAG_DOCUMENT_5", "FLAG_DOCUMENT_6", "FLAG_DOCUMENT_7", "FLAG_DOCUMENT_8", "FLAG_DOCUMENT_9", "FLAG_DOCUMENT_10", "FLAG_DOCUMENT_11", "FLAG_DOCUMENT_12", "FLAG_DOCUMENT_13", "FLAG_DOCUMENT_14", "FLAG_DOCUMENT_15", "FLAG_DOCUMENT_16", "FLAG_DOCUMENT_17", "FLAG_DOCUMENT_18", "FLAG_DOCUMENT_19", "FLAG_DOCUMENT_20", "FLAG_DOCUMENT_21"]
df[cols_1_0] = df[cols_1_0].astype(bool)

We are going to explore the missing values percentages of the columns.

In [None]:
missing_values_perc = missing_values_percentage(df)
missing_values_perc

We are going to drop the columns with more than 45% of missing values.

In [None]:
df = drop_columns(df,missing_values_perc,45)
missing_values_perc = missing_values_percentage(df)

We are going to check the remaining columns in df.

In [None]:
forma(df)

Let's check the columns with missing values.

In [None]:
df[missing_values_perc.index]

We are going to fill missing values with 0s in the following columns: 

AMT_REQ_CREDIT_BUREAU_HOUR, AMT_REQ_CREDIT_BUREAU_DAY, AMT_REQ_CREDIT_BUREAU_WEEK, AMT_REQ_CREDIT_BUREAU_MON, AMT_REQ_CREDIT_BUREAU_QRT, AMT_REQ_CREDIT_BUREAU_YEAR

This decision is made because these columns are related to the number of enquiries to the Credit Bureau and it is possible that the missing values are due to the fact that the client has not made any requests.

In [None]:
amt_req_credit_bureau_cols = ["AMT_REQ_CREDIT_BUREAU_HOUR","AMT_REQ_CREDIT_BUREAU_DAY","AMT_REQ_CREDIT_BUREAU_WEEK","AMT_REQ_CREDIT_BUREAU_MON","AMT_REQ_CREDIT_BUREAU_QRT","AMT_REQ_CREDIT_BUREAU_YEAR"]
df[amt_req_credit_bureau_cols] = df[amt_req_credit_bureau_cols].fillna(0)

We are going to also fill missing values with the mode in the following columns:

OBS_30_CNT_SOCIAL_CIRCLE, DEF_30_CNT_SOCIAL_CIRCLE, OBS_60_CNT_SOCIAL_CIRCLE, DEF_60_CNT_SOCIAL_CIRCLE

This decision is made because these columns are related to the number of observations of clients' social surroundings and it is possible that the missing values are due to the fact that the client has not made any observations.

In [None]:
social_circle_cols = ["OBS_30_CNT_SOCIAL_CIRCLE","DEF_30_CNT_SOCIAL_CIRCLE","OBS_60_CNT_SOCIAL_CIRCLE","DEF_60_CNT_SOCIAL_CIRCLE"]
df[social_circle_cols] = df[social_circle_cols].fillna(0)

We are going to fill missing values in the column CNT_FAM_MEMBERS with 1. This decision is made because if there is not data about the number of family members, we can assume that the client is alone. 

In [None]:
df["CNT_FAM_MEMBERS"] = df["CNT_FAM_MEMBERS"].fillna(1)

We are going to fill missing values in the columns NAME_TYPE_SUITE with unaccompanied. This decision is made because if there is not data about wwho is accompanying the client, we can assume that the client is unaccompanied.

In [None]:
df["NAME_TYPE_SUITE"] = df["NAME_TYPE_SUITE"].fillna("Unaccompanied")

In [None]:
df.head()

In order to use the imputers, we need to encode the categorical columns.

One Hot Encoding for NAME_CONTRACT_TYPE and CODE_GENDER columns, as they only have two possible values, so we are not going to increase our dimensionality heavily.

In [None]:
#Delete Rows with CODE_GENDER = XNA, only 4 in all dataset
y = y[df['CODE_GENDER'] != "XNA"]
sk_id_curr = sk_id_curr[df['CODE_GENDER'] != "XNA"]
df = df[df['CODE_GENDER'] != "XNA"]

one_hot_encoding_columns = ["NAME_CONTRACT_TYPE","CODE_GENDER"]
df = pd.get_dummies(df,columns=one_hot_encoding_columns ,drop_first=True)

Categorical Encoding for NAME_EDUCATION_TYPE and WEEKDAY_APPR_PROCESS_START columns.

In [None]:
education_order = ['Lower secondary', 'Secondary / secondary special', 'Incomplete higher', 
                   'Higher education', 'Academic degree']
encoderOrdinalEdu = OrdinalEncoder(categories=[education_order])
df['NAME_EDUCATION_TYPE'] = encoderOrdinalEdu.fit_transform(df[['NAME_EDUCATION_TYPE']])

week_order = ['MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY', 'SUNDAY']
encoderOrdinalWeek = OrdinalEncoder(categories=[week_order])
df['WEEKDAY_APPR_PROCESS_START'] = encoderOrdinalWeek.fit_transform(df[['WEEKDAY_APPR_PROCESS_START']])

Target Encoding for the rest of the categorical columns, without OCCUPATION_TYPE.

In [None]:
# target_encoding_columns = ["NAME_TYPE_SUITE","NAME_INCOME_TYPE","NAME_FAMILY_STATUS","NAME_HOUSING_TYPE","NAME_FAMILY_STATUS","NAME_HOUSING_TYPE","ORGANIZATION_TYPE"]
#encoderTarget = TargetEncoder()
#df[target_encoding_columns] = encoderTarget.fit_transform(df[target_encoding_columns], y) 

In [None]:
label_encoding_columns = [
    "NAME_TYPE_SUITE",
    "NAME_INCOME_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    "ORGANIZATION_TYPE"
]

# A dictionary to store the encoders
encoders = {}

# Encoding each column
for column in label_encoding_columns:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    encoders[column] = encoder 

We are going to enconde OCCUPATION_TYPE using the One Hot Encoding.

In [None]:
df = pd.get_dummies(df,columns=["OCCUPATION_TYPE"],drop_first=True)

In [None]:
df_imputed = impute_with_knn(df, ['OCCUPATION_TYPE', 'EXT_SOURCE_3', 'EXT_SOURCE_2', 'AMT_GOODS_PRICE', 'AMT_ANNUITY',
                          'DAYS_LAST_PHONE_CHANGE'])

Saving the numerical imputed df. We are using the parquet format to save memory.

In [None]:
df_imputed.to_csv('../data/processed/num_imputed_df.csv', index=False) 

We now are going to decode the columns that were encoded.

First, we copy the dataframe.

In [None]:
df_imp_imported = pd.read_parquet('../data/processed/num_imputed_df.parquet') #Read the data after imputing to avoid loosing time.
df_cat = df_imp_imported.copy()

In [None]:
df_cat = df_imputed.copy()

We decode OCCUPATION_TYPE.

In [None]:
occupation_columns = df_cat.columns[df_cat.columns.str.startswith('OCCUPATION_TYPE_')]
df_cat['OCCUPATION_TYPE'] = df_cat[occupation_columns].idxmax(axis=1).str.replace('OCCUPATION_TYPE_', '')
df_cat = df_cat.drop(columns=occupation_columns)

We decode NAME_EDUCATION_TYPE using the inverse_transform function of the encoder.

In [None]:
df_cat['NAME_EDUCATION_TYPE'].head()

In [43]:
df_cat['WEEKDAY_APPR_PROCESS_START'] = encoderOrdinalWeek.inverse_transform(df_cat[['WEEKDAY_APPR_PROCESS_START']]).ravel()

df_cat['WEEKDAY_APPR_PROCESS_START']

ValueError: could not convert string to float: 'WEDNESDAY'

In [None]:
# Decoding NAME_EDUCATION_TYPE
df_cat['NAME_EDUCATION_TYPE'] = encoderOrdinalEdu.inverse_transform(df_cat[['NAME_EDUCATION_TYPE']]).ravel()

In [None]:
df_cat['NAME_EDUCATION_TYPE'].head()

 We decode CODE_GENDER mapping 1s in CODE_GENDER_M into M and 0s into F.

In [None]:
# Decoding CODE_GENDER
df_cat['CODE_GENDER'] = df_cat['CODE_GENDER_M'].map({1: 'M', 0: 'F'})

In [None]:
df_cat['CODE_GENDER'].head()

We decode NAME_CONTRACT_TYPE mapping 1s in column NAME_CONTRACT_TYPE_Revolving loans into Revolving loans and 0s into cash loans.

In [None]:
# Decoding NAME_CONTRACT_TYPE
df_cat['NAME_CONTRACT_TYPE'] = df_cat['NAME_CONTRACT_TYPE_Revolving loans'].map({1: 'Revolving loans', 0: 'Cash loans'})

In [None]:
df_cat['NAME_CONTRACT_TYPE'].head()

Save the df_cat to parquet in order to save memory

In [45]:
df_cat.to_parquet('../data/processed/imputed_df.parquet', index=False)

In [44]:
df_cat

Unnamed: 0,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,...,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,NAME_CONTRACT_TYPE_Revolving loans,CODE_GENDER_M,target,OCCUPATION_TYPE,CODE_GENDER,NAME_CONTRACT_TYPE
0,0.0,1.0,0.0,202500.0,406597.5,24700.5,351000.0,18.0,20.0,Secondary / secondary special,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,Laborers,M,Cash loans
1,0.0,0.0,0.0,270000.0,1293502.5,35698.5,1129500.0,6.0,10.0,Higher education,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Core staff,F,Cash loans
2,1.0,1.0,0.0,67500.0,135000.0,6750.0,135000.0,16.0,17.0,Secondary / secondary special,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,Laborers,M,Revolving loans
3,0.0,1.0,0.0,135000.0,312682.5,29686.5,297000.0,20.0,16.0,Secondary / secondary special,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Laborers,F,Cash loans
4,0.0,1.0,0.0,121500.0,513000.0,21865.5,513000.0,16.0,17.0,Secondary / secondary special,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Core staff,M,Cash loans
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307502,0.0,0.0,0.0,157500.0,254700.0,27558.0,225000.0,17.0,19.0,Secondary / secondary special,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Sales staff,M,Cash loans
307503,0.0,1.0,0.0,72000.0,269550.0,12001.5,225000.0,19.0,5.0,Secondary / secondary special,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cleaning staff,F,Cash loans
307504,0.0,1.0,0.0,153000.0,677664.0,29979.0,585000.0,17.0,19.0,Higher education,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,Managers,F,Cash loans
307505,0.0,1.0,0.0,171000.0,370107.0,20205.0,319500.0,17.0,13.0,Secondary / secondary special,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Laborers,F,Cash loans
