In [15]:
import pandas as pd
import src.utils as utils

from sklearn.model_selection import train_test_split

# Load Configuration File

In [16]:
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/data.csv',
 'data_set_path': 'data/output/data.pkl',
 'input_set_path': 'data/output/input.pkl',
 'output_set_path': 'data/output/output.pkl',
 'input_columns_path': 'data/output/input_columns.pkl',
 'train_set_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'valid_set_path': ['data/output/X_valid.pkl', 'data/output/y_valid.pkl'],
 'test_set_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'index_column': 'Unnamed: 0',
 'output_column': 'SeriousDlqin2yrs',
 'seed': 42,
 'test_size': 0.2,
 'clean_late_col': 'NumberOfTimes90DaysLate',
 'clean_late_val': 96,
 'clean_unsecure_col': 'RevolvingUtilizationOfUnsecuredLines',
 'constant_imputer_col': 'NumberOfDependents',
 'constant_imputer_path': 'data/output/constant_imputer.pkl',
 'constant_imputer_val': 0.0,
 'median_imputer_col': 'MonthlyIncome',
 'median_imputer_path': 'data/output/median_imputer.pkl',
 'standardizer_path': 'data/output/standardizer.pkl',
 'preprocessor_path': 'd

# Data Collection

In [17]:
def read_data(return_file=True):
    # Read data
    data = pd.read_csv(CONFIG_DATA['raw_dataset_path'], 
                       sep=',',
                       index_col=CONFIG_DATA['index_column'])

    # Print data
    print('data shape   :', data.shape)

    # Dump data
    utils.pickle_dump(data, CONFIG_DATA['data_set_path'])

    # Return data
    if return_file:
        return data

In [18]:
data = read_data()
data.head().T

data shape   : (150000, 11)


Unnamed: 0,1,2,3,4,5
SeriousDlqin2yrs,1.0,0.0,0.0,0.0,0.0
RevolvingUtilizationOfUnsecuredLines,0.766127,0.957151,0.65818,0.23381,0.907239
age,45.0,40.0,38.0,30.0,49.0
NumberOfTime30-59DaysPastDueNotWorse,2.0,0.0,1.0,0.0,1.0
DebtRatio,0.802982,0.121876,0.085113,0.03605,0.024926
MonthlyIncome,9120.0,2600.0,3042.0,3300.0,63588.0
NumberOfOpenCreditLinesAndLoans,13.0,4.0,2.0,5.0,7.0
NumberOfTimes90DaysLate,0.0,0.0,1.0,0.0,0.0
NumberRealEstateLoansOrLines,6.0,0.0,0.0,0.0,1.0
NumberOfTime60-89DaysPastDueNotWorse,0.0,0.0,0.0,0.0,0.0


In [19]:
data.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,146076.0
mean,0.06684,6.048438,52.295207,0.421033,353.005076,6670.221,8.45276,0.265973,1.01824,0.240387,0.757222
std,0.249746,249.755371,14.771866,4.192781,2037.818523,14384.67,5.145951,4.169304,1.129771,4.155179,1.115086
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029867,41.0,0.0,0.175074,3400.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.154181,52.0,0.0,0.366508,5400.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.559046,63.0,0.0,0.868254,8249.0,11.0,0.0,2.0,0.0,1.0
max,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


# Data Splitting

In [20]:
def split_input_output(return_file=True):
    # Read data
    data = utils.pickle_load(CONFIG_DATA['data_set_path'])

    # Split input & output
    y = data[CONFIG_DATA['output_column']]
    X = data.drop([CONFIG_DATA['output_column']], axis=1)

    # Print splitting
    print('Input shape  :', X.shape)
    print('Output shape :', y.shape)
    print('Input NAN    :')
    print(X.isnull().sum())
    print('Benchmark    :')
    print(y.value_counts(normalize=True))
    
    # Dump file
    utils.pickle_dump(X, CONFIG_DATA['input_set_path'])
    utils.pickle_dump(y, CONFIG_DATA['output_set_path'])
    utils.pickle_dump(X.columns, CONFIG_DATA['input_columns_path'])     # dump input columns

    if return_file:
        return X, y

In [21]:
X, y = split_input_output()

Input shape  : (150000, 10)
Output shape : (150000,)
Input NAN    :
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64
Benchmark    :
0    0.93316
1    0.06684
Name: SeriousDlqin2yrs, dtype: float64


In [22]:
def split_train_test(return_file=True):
    # Load data
    X = utils.pickle_load(CONFIG_DATA['input_set_path'])
    y = utils.pickle_load(CONFIG_DATA['output_set_path'])

    # Split test & rest (train & valid)
    X_train, X_test, y_train, y_test = train_test_split(
                                            X,
                                            y,
                                            test_size = CONFIG_DATA['test_size'],
                                            random_state = CONFIG_DATA['seed']
                                        )
    
    # Split train & valid
    X_train, X_valid, y_train, y_valid = train_test_split(
                                            X_train,
                                            y_train,
                                            test_size = CONFIG_DATA['test_size'],
                                            random_state = CONFIG_DATA['seed']
                                        )
    
    # Print splitting
    print('X_train shape :', X_train.shape)
    print('y_train shape :', y_train.shape)
    print('X_valid shape  :', X_valid.shape)
    print('y_valid shape  :', y_valid.shape)
    print('X_test shape  :', X_test.shape)
    print('y_test shape  :', y_test.shape)

    # Dump file
    utils.pickle_dump(X_train, CONFIG_DATA['train_set_path'][0])
    utils.pickle_dump(y_train, CONFIG_DATA['train_set_path'][1])
    utils.pickle_dump(X_valid, CONFIG_DATA['valid_set_path'][0])
    utils.pickle_dump(y_valid, CONFIG_DATA['valid_set_path'][1])
    utils.pickle_dump(X_test, CONFIG_DATA['test_set_path'][0])
    utils.pickle_dump(y_test, CONFIG_DATA['test_set_path'][1])

    if return_file:
        return X_train, X_valid, X_test, y_train, y_valid, y_test

In [23]:
X_train, X_valid, X_test, y_train, y_valid, y_test = split_train_test()

X_train shape : (96000, 10)
y_train shape : (96000,)
X_valid shape  : (24000, 10)
y_valid shape  : (24000,)
X_test shape  : (30000, 10)
y_test shape  : (30000,)


Get sample for testing

In [24]:
import numpy as np

In [30]:
np.random.seed(123)
y_sample_0 = y_test[y_test==0].sample(10)
y_sample_1 = y_test[y_test==1].sample(10)

y_sample = pd.concat((y_sample_0, y_sample_1), axis=0)
y_sample

111662    0
38064     0
46449     0
118458    0
109521    0
123210    0
109262    0
125596    0
86219     0
38840     0
21418     1
75003     1
67909     1
33771     1
10119     1
43455     1
62372     1
7034      1
55058     1
104990    1
Name: SeriousDlqin2yrs, dtype: int64

In [31]:
X_sample = X_test.loc[y_sample.index]
X_sample

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
111662,0.673854,49,0,0.638269,4436.0,6,0,2,0,2.0
38064,0.231724,63,0,0.454444,4488.0,19,0,2,0,0.0
46449,0.137089,78,0,2518.0,,8,0,2,0,0.0
118458,0.21992,51,0,0.353223,6250.0,9,0,2,0,1.0
109521,0.179662,69,0,0.13788,12916.0,9,0,0,0,0.0
123210,0.477672,31,0,0.114401,7700.0,11,0,0,0,1.0
109262,0.06913,53,0,0.244548,10500.0,24,0,2,0,3.0
125596,0.710694,37,0,0.155645,23257.0,4,0,1,0,0.0
86219,0.0,71,0,0.564551,5150.0,5,0,0,0,0.0
38840,0.385848,40,1,0.452293,4600.0,6,0,1,0,3.0


In [32]:
X_sample.to_csv('data/output/X_sample.csv', index=False)

In [33]:
y_sample.to_csv('data/output/y_sample.csv', index=False)