## A. Data Preparation & Data Preprocessing
-----------------------------------------------------------------

### 1. Read Data

In [183]:
# Import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load config from utils file
import src.utils as utils

Create a configuration file that stores the paths used to import and dump data, then load it.

In [184]:
# Load the project configuration through the src.utils helper and display it
CONFIG_DATA = utils.load_config()
CONFIG_DATA

{'raw_data_path': 'data/raw/credit_dataset.csv',
 'data_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'target_set_path': 'data/output/target.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'target_variable': 'Credit_Score',
 'test_size': 0.3,
 'num_columns': ['Age',
  'Annual_Income',
  'Num_of_Loan',
  'Num_of_Delayed_Payment',
  'Outstanding_Debt',
  'Monthly_Inhand_Salary',
  'Num_Credit_Inquiries',
  'Credit_Utilization_Ratio',
  'Total_EMI_per_month',
  'Num_Bank_Accounts',
  'Num_C

Create an `open_data()` function that reads the raw dataset and dumps it to pickle format.

In [185]:
def open_data():
    """Read the raw CSV named in the config, report its shape, and pickle it.

    The source path is taken from ``CONFIG_DATA['raw_data_path']`` and the
    pickle destination from ``CONFIG_DATA['data_path']``.

    Returns
    -------
    pandas.DataFrame
        The frame parsed from the raw CSV file.
    """
    raw_frame = pd.read_csv(CONFIG_DATA['raw_data_path'])

    # Print the (rows, columns) tuple as a quick check on what was read
    print("Shape of data    :", raw_frame.shape)

    # Persist the frame at the configured path via the project pickle helper
    utils.dump_pickle(raw_frame, CONFIG_DATA['data_path'])

    return raw_frame

In [186]:
# Run the loader defined above and preview the first rows of the result
data = open_data()
data.head()

Shape of data    : (7328, 23)


Unnamed: 0,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,Auto Loan,3,...,Good,809.98,26.82262,23,No,49.574949,80.415295,High_spent_Small_value_payments,312.494089,1
1,28.0,_______,34847.84,3037.986667,2.0,4.0,6.0,1.0,Credit-Builder Loan,3,...,Good,605.03,24.464031,33,No,18.816215,104.291825,Low_spent_Small_value_payments,470.690627,1
2,34.0,_______,143162.64,12187.22,1.0,5.0,8.0,3.0,Auto Loan,5,...,Good,1303.01,28.616735,26,No,246.99232,168.413703,!@9#%8,1043.315978,1
3,54.0,Entrepreneur,30689.89,2612.490833,2.0,5.0,4.0,1.0,Personal Loan,0,...,Good,632.46,26.544229,20,No,16.415452,81.228859,Low_spent_Large_value_payments,433.604773,1
4,33.0,Lawyer,131313.4,11242.78333,0.0,1.0,8.0,2.0,Mortgage Loan,0,...,Good,352.16,32.200509,37,NM,137.644605,378.171253,High_spent_Medium_value_payments,858.462474,1


### 2. Splitting the Sample

Split the data into input (predictors) and output (target), then dump both.

Update the configuration file with the following information:
- The path for the input and output data
- The name of the output variable
- The names of the input columns

In [187]:
# Reload the configuration through utils.load_config() and display it
CONFIG_DATA = utils.load_config()
CONFIG_DATA

{'raw_data_path': 'data/raw/credit_dataset.csv',
 'data_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'target_set_path': 'data/output/target.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'target_variable': 'Credit_Score',
 'test_size': 0.3,
 'num_columns': ['Age',
  'Annual_Income',
  'Num_of_Loan',
  'Num_of_Delayed_Payment',
  'Outstanding_Debt',
  'Monthly_Inhand_Salary',
  'Num_Credit_Inquiries',
  'Credit_Utilization_Ratio',
  'Total_EMI_per_month',
  'Num_Bank_Accounts',
  'Num_C

In [188]:
def input_output_splitting():
    """Split the pickled dataset into predictors (X) and target (y), then dump both.

    The dataset is loaded from ``CONFIG_DATA['data_path']`` and the target
    column name is read from ``CONFIG_DATA['target_variable']``. The resulting
    X and y are written to ``CONFIG_DATA['predictors_set_path']`` and
    ``CONFIG_DATA['target_set_path']`` respectively.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the dataset without the target column and
        ``y`` is the target column.

    Raises
    ------
    KeyError
        If the configured target column is not present in the dataset.
    """
    # Load dataset (assumes the pickle holds a pandas DataFrame, as written
    # by open_data to the same config path)
    data_path = CONFIG_DATA['data_path']
    data = utils.load_pickle(data_path)

    # Fail early with an explicit message instead of a bare KeyError
    target_var = CONFIG_DATA['target_variable']
    if target_var not in data.columns:
        raise KeyError(
            f"Target column '{target_var}' not found in dataset loaded from '{data_path}'"
        )

    # Define target variable (y)
    y = data[target_var]

    # Define predictors variable (X); `columns=` already selects the axis,
    # so passing `axis` as well is redundant
    X = data.drop(columns=[target_var])

    # Report shapes to confirm the split
    print('Shape of y :', y.shape)
    print('Shape of X :', X.shape)

    # Dump the output
    utils.dump_pickle(X, CONFIG_DATA['predictors_set_path'])
    utils.dump_pickle(y, CONFIG_DATA['target_set_path'])

    return X, y
    

In [189]:
# Separate predictors and target; the function also dumps both to pickle
X, y = input_output_splitting()

Shape of y : (7328,)
Shape of X : (7328, 22)


In [190]:
# Preview the first rows of the predictors
X.head()

Unnamed: 0,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,...,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance
0,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,Auto Loan,3,...,4.0,Good,809.98,26.82262,23,No,49.574949,80.415295,High_spent_Small_value_payments,312.494089
1,28.0,_______,34847.84,3037.986667,2.0,4.0,6.0,1.0,Credit-Builder Loan,3,...,2.0,Good,605.03,24.464031,33,No,18.816215,104.291825,Low_spent_Small_value_payments,470.690627
2,34.0,_______,143162.64,12187.22,1.0,5.0,8.0,3.0,Auto Loan,5,...,3.0,Good,1303.01,28.616735,26,No,246.99232,168.413703,!@9#%8,1043.315978
3,54.0,Entrepreneur,30689.89,2612.490833,2.0,5.0,4.0,1.0,Personal Loan,0,...,4.0,Good,632.46,26.544229,20,No,16.415452,81.228859,Low_spent_Large_value_payments,433.604773
4,33.0,Lawyer,131313.4,11242.78333,0.0,1.0,8.0,2.0,Mortgage Loan,0,...,2.0,Good,352.16,32.200509,37,NM,137.644605,378.171253,High_spent_Medium_value_payments,858.462474


Next, split the predictors (X) and the response (y) into training and testing sets.
- Set `stratify = y` so that the training and testing sets keep the same class proportions as the response y.
- Set `test_size = 0.3` to hold out 30% of the sample as the testing set.
- Set `random_state = 42` so that the split is reproducible.

In [191]:
# Import library 
from sklearn.model_selection import train_test_split

Update the configuration file with the test size and the train and test data paths.

In [192]:
# Reload the configuration through utils.load_config() and display it
CONFIG_DATA = utils.load_config()
CONFIG_DATA

{'raw_data_path': 'data/raw/credit_dataset.csv',
 'data_path': 'data/output/data.pkl',
 'predictors_set_path': 'data/output/predictors.pkl',
 'target_set_path': 'data/output/target.pkl',
 'train_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'test_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'data_train_path': 'data/output/data_train.pkl',
 'data_train_binned_path': 'data/output/data_train_binned.pkl',
 'crosstab_list_path': 'data/output/crosstab_list.pkl',
 'WOE_table_path': 'data/output/WOE_table.pkl',
 'IV_table_path': 'data/output/IV_table.pkl',
 'WOE_map_dict_path': 'data/output/WOE_map_dict.pkl',
 'X_train_woe_path': 'data/output/X_train_woe.pkl',
 'target_variable': 'Credit_Score',
 'test_size': 0.3,
 'num_columns': ['Age',
  'Annual_Income',
  'Num_of_Loan',
  'Num_of_Delayed_Payment',
  'Outstanding_Debt',
  'Monthly_Inhand_Salary',
  'Num_Credit_Inquiries',
  'Credit_Utilization_Ratio',
  'Total_EMI_per_month',
  'Num_Bank_Accounts',
  'Num_C

In [193]:
def train_test_splitting():
    """Split the pickled predictors and target into train/test sets and dump them.

    Predictors and target are loaded from ``CONFIG_DATA['predictors_set_path']``
    and ``CONFIG_DATA['target_set_path']``. The split is stratified on the
    target, uses ``CONFIG_DATA['test_size']`` as the test fraction and a fixed
    ``random_state`` of 42. The four parts are written to the paths listed in
    ``CONFIG_DATA['train_path']`` and ``CONFIG_DATA['test_path']``
    (index 0 for X, index 1 for y).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    predictors = utils.load_pickle(CONFIG_DATA['predictors_set_path'])
    target = utils.load_pickle(CONFIG_DATA['target_set_path'])

    X_train, X_test, y_train, y_test = train_test_split(
        predictors,
        target,
        test_size=CONFIG_DATA['test_size'],
        random_state=42,
        stratify=target,
    )

    # Report the shape of every part, in the same order as before
    shape_report = (
        ('X_train shape :', X_train),
        ('y_train shape :', y_train),
        ('X_test shape  :', X_test),
        ('y_test shape  :', y_test),
    )
    for label, part in shape_report:
        print(label, part.shape)

    # Dump each part; the config lookup stays inside the loop so lookups and
    # writes happen in the original order
    dump_plan = (
        (X_train, 'train_path', 0),
        (y_train, 'train_path', 1),
        (X_test, 'test_path', 0),
        (y_test, 'test_path', 1),
    )
    for part, config_key, position in dump_plan:
        utils.dump_pickle(part, CONFIG_DATA[config_key][position])

    return X_train, X_test, y_train, y_test

In [194]:
# Run the stratified train/test split; the function also dumps the four parts
X_train, X_test, y_train, y_test = train_test_splitting()

X_train shape : (5129, 22)
y_train shape : (5129,)
X_test shape  : (2199, 22)
y_test shape  : (2199,)


In [195]:
# Count the distinct values in each predictor column of the training set
X_train.nunique()

Age                           42
Occupation                    16
Annual_Income               5127
Monthly_Inhand_Salary       5128
Num_Bank_Accounts             11
Num_Credit_Card               10
Interest_Rate                 34
Num_of_Loan                   10
Type_of_Loan                   8
Delay_from_due_date           71
Num_of_Delayed_Payment        40
Changed_Credit_Limit        2150
Num_Credit_Inquiries          13
Credit_Mix                     3
Outstanding_Debt            5075
Credit_Utilization_Ratio    5129
Credit_History_Age            42
Payment_of_Min_Amount          3
Total_EMI_per_month         5129
Amount_invested_monthly     4878
Payment_Behaviour              7
Monthly_Balance             5129
dtype: int64