# Libraries, Modules, and Configuration File

In [2]:
import pandas as pd
import sys
# NOTE(review): hard-coded absolute project root; presumably required on sys.path
# so that the project-local `src.util` import below resolves — confirm and consider
# making this path configurable.
sys.path.append('/root/ml_process_feb23/')
import src.util as utils
# Load the project configuration once; later cells read file paths and valid ranges from it.
config = utils.load_config()
from sklearn.model_selection import train_test_split

# Data Collection

In [3]:
# Load the original dataset from the CSV path stored in the config
credit_data = pd.read_csv(config["dataset_original_path"])
# Bare expression: renders the DataFrame as the cell output
credit_data


Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,yes,0,37.66667,4.5200,0.033270,124.983300,yes,no,3,54,1,12
1,yes,0,33.25000,2.4200,0.005217,9.854167,no,no,3,34,1,13
2,yes,0,33.66667,4.5000,0.004156,15.000000,yes,no,4,58,1,5
3,yes,0,30.50000,2.5400,0.065214,137.869200,no,no,0,25,1,7
4,yes,0,32.16667,9.7867,0.067051,546.503300,yes,no,2,64,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1314,yes,0,33.58333,4.5660,0.002146,7.333333,yes,no,0,94,1,19
1315,no,5,23.91667,3.1920,0.000376,0.000000,no,no,3,12,1,5
1316,yes,0,40.58333,4.6000,0.026513,101.298300,yes,no,2,1,1,2
1317,yes,0,32.83333,3.7000,0.008999,26.996670,no,yes,0,60,1,7


# Data Definition


In [4]:
# Concise DataFrame summary (verbose=False omits the per-column listing)
credit_data.info(verbose = False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Columns: 12 entries, card to active
dtypes: float64(4), int64(5), object(3)
memory usage: 123.8+ KB


The summary above gives a general overview of `credit_data`.

It has 1319 rows and 12 columns: 9 numerical columns (4 float, 5 integer) and 3 categorical (object-typed) columns.








In [5]:
# Check the feature names
credit_data.columns

Index(['card', 'reports', 'age', 'income', 'share', 'expenditure', 'owner',
       'selfemp', 'dependents', 'months', 'majorcards', 'active'],
      dtype='object')

Features definition:


*   `card`: Acceptance status of credit card application
*   `reports`: Number of major derogatory reports
*   `age`: Age, in years plus twelfths of a year
*   `income`: Yearly income (divided by 10,000)
*   `share`: Ratio of monthly credit card expenditure to yearly income
*   `expenditure`: Average monthly credit card expenditure
*   `owner`: Home ownership
*   `selfemp`: Self-employed status; "no" means the applicant is an employee of a company
*   `dependents`: Number of dependents
*   `months`: Months living at current address
*   `majorcards`: Number of major credit cards held
*   `active`: Number of active credit accounts








# Data Validation


## Checking Null Values and Data Types in Each Feature

In [6]:
# Detailed DataFrame information: per-column non-null counts and dtypes
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   card         1319 non-null   object 
 1   reports      1319 non-null   int64  
 2   age          1319 non-null   float64
 3   income       1319 non-null   float64
 4   share        1319 non-null   float64
 5   expenditure  1319 non-null   float64
 6   owner        1319 non-null   object 
 7   selfemp      1319 non-null   object 
 8   dependents   1319 non-null   int64  
 9   months       1319 non-null   int64  
 10  majorcards   1319 non-null   int64  
 11  active       1319 non-null   int64  
dtypes: float64(4), int64(5), object(3)
memory usage: 123.8+ KB


There is no null value in this dataset.

The object (string) data types will be later converted to numeric.

## Checking The General Data Distribution

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical features
credit_data.describe()


Unnamed: 0,reports,age,income,share,expenditure,dependents,months,majorcards,active
count,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0,1319.0
mean,0.456406,33.213103,3.365376,0.068732,185.057071,0.993935,55.267627,0.817286,6.996967
std,1.345267,10.142783,1.693902,0.094656,272.218917,1.247745,66.271746,0.386579,6.305812
min,0.0,0.166667,0.21,0.000109,0.0,0.0,0.0,0.0,0.0
25%,0.0,25.41667,2.24375,0.002316,4.583333,0.0,12.0,1.0,2.0
50%,0.0,31.25,2.9,0.038827,101.2983,1.0,30.0,1.0,6.0
75%,0.0,39.41667,4.0,0.093617,249.0358,2.0,72.0,1.0,11.0
max,14.0,83.5,13.5,0.90632,3099.505,6.0,540.0,1.0,46.0


Notes on the numerical data.
*  The count is the same (1319) for every feature, meaning that `credit_data` has no null values.
* The minimum recorded age is about 2 months (0.166667 years), which is clearly impossible for a credit card applicant.
* Applicant(s) who have zero expenditure will also be checked thoroughly later to make sure the data is correctly filled in.






In [8]:
# Summary (count, unique, top, freq) of the object-typed (categorical) features
credit_data.describe(include=['O'])

Unnamed: 0,card,owner,selfemp
count,1319,1319,1319
unique,2,2,2
top,yes,no,no
freq,1023,738,1228


All the categorical features are binary, taking the values "yes" and "no".

## Data Deletion

The legal age to have a credit card is 18 years old, so rows whose `age` is below 18 will be dropped.

This deletion is done in advance so that the data-defense step can enforce a restricted minimum for the `age` range.

In [9]:
def del_rows(dataset, col, value):
    """Keep only the rows whose `col` entry is greater than or equal to `value`.

    Rows with a smaller entry in `col` are left out of the result. The input
    DataFrame itself is not modified; a new DataFrame with its index
    renumbered from 0 is returned.
    """

    keep_mask = dataset[col] >= value
    kept_rows = dataset.loc[keep_mask]
    return kept_rows.reset_index(drop = True)

In [10]:
# Execute the data deletion function: keep rows with age >= 18 and overwrite `credit_data`
credit_data = del_rows(credit_data, 'age', 18)

# Data Defense

Create a defense mechanism for the input data.

In [11]:
def check_data(input_data, config):
    """Validate the dtypes and value ranges of `input_data` against `config`.

    Checks run in a fixed order and stop at the first failure:

    1. The lists of object, int and float column names found in `input_data`
       must equal `config["object_columns"]`, `config["int_columns"]` and
       `config["float_columns"]` (order-sensitive list equality).
    2. The values of the 2nd and 3rd object columns must be a subset of
       `config["range_owner"]` and `config["range_selfemp"]`.
    3. Every value of each numeric column must lie within the inclusive
       `[low, high]` pair stored under the matching `config["range_<name>"]`.

    The first entry of `config["object_columns"]` is dtype-checked but not
    range-checked.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Data to validate.
    config : dict
        Configuration holding the `*_columns` lists and `range_*` entries.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If any check fails. It is raised explicitly rather than through the
        `assert` statement so that validation is not stripped when Python runs
        with optimizations enabled (`-O`).
    """

    def _require(condition, message):
        # Explicit raise keeps the same exception type callers already see,
        # but unlike `assert` it survives `python -O`.
        if not condition:
            raise AssertionError(message)

    # Check data types
    dtype_checks = (
        ("object", "object_columns",
         "input error, please fill the column(s) with 'yes' or 'no'"),
        ("int", "int_columns",
         "input error, please fill the column(s) with any numeric character."),
        ("float", "float_columns",
         "input error, please fill the column(s) with any numeric (decimal value is allowed) character."),
    )
    for dtype_name, columns_key, message in dtype_checks:
        found_columns = input_data.select_dtypes(dtype_name).columns.to_list()
        _require(found_columns == config[columns_key], message)

    # Check range of data: categorical columns must only hold allowed values.
    # Columns are located by their position in the config list, as in the
    # dtype check above.
    for position, name in ((1, "owner"), (2, "selfemp")):
        column = config["object_columns"][position]
        allowed_values = set(config[f"range_{name}"])
        _require(set(input_data[column]).issubset(allowed_values),
                 f"an error occurs in {name} range.")

    # Check range of data: numeric columns must lie within [low, high] inclusive.
    numeric_checks = (
        ("int_columns", 0, "reports"),
        ("float_columns", 0, "age"),
        ("float_columns", 1, "income"),
        ("float_columns", 2, "share"),
        ("float_columns", 3, "expenditure"),
        ("int_columns", 1, "dependents"),
        ("int_columns", 2, "months"),
        ("int_columns", 3, "majorcards"),
        ("int_columns", 4, "active"),
    )
    for columns_key, position, name in numeric_checks:
        column = config[columns_key][position]
        low = config[f"range_{name}"][0]
        high = config[f"range_{name}"][1]
        # `between` yields False for missing values, so NaN also fails the check.
        in_range = input_data[column].between(low, high)
        _require(in_range.sum() == len(input_data),
                 f"an error occurs in {name} range.")
   


In [12]:
# Run the data-defense checks on the filtered data; the call returns nothing,
# so an empty cell output means no assertion failed
check_data(credit_data, config)

# Data Splitting

In [13]:
def split_input_output(dataset,
                       target_column,
                       save_file = True,
                       return_file = True):
    """Separate `dataset` into its target column and its predictor columns.

    The target is `dataset[target_column]`; the predictors are `dataset`
    without that column. When `save_file` is true, both are dumped with
    `utils.pkl_dump` to the paths stored in `config` under
    "dataset_output_df_path" and "dataset_input_df_path". When `return_file`
    is true, the pair `(target, predictors)` is returned, target first;
    otherwise the function returns None.
    """

    target = dataset[target_column]
    predictors = dataset.drop(columns = [target_column])

    if save_file:
        to_save = ((target, "dataset_output_df_path"),
                   (predictors, "dataset_input_df_path"))
        for frame, path_key in to_save:
            utils.pkl_dump(frame, config[path_key])

    if not return_file:
        return None
    return target, predictors


def split_train_test(x, y, TEST_SIZE, random_state=46):
    """Split predictors and target into two stratified subsets.

    Thin wrapper around `train_test_split`; it is called more than once to
    build the train/validation/test partitions.

    Parameters
    ----------
    x : predictors, forwarded unchanged to `train_test_split`.
    y : target, forwarded unchanged and also used as the `stratify` labels.
    TEST_SIZE : forwarded as `test_size`.
    random_state : seed forwarded to `train_test_split`. Defaults to 46, the
        value that was previously hard-coded, so existing calls are unchanged.

    Returns
    -------
    tuple
        `(x_train, x_test, y_train, y_test)`.
    """

    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=TEST_SIZE,
                                                        random_state=random_state,
                                                        stratify=y)

    return x_train, x_test, y_train, y_test

def split_data(data_input, 
               data_output, 
               save_file = True,
               return_file=True, 
               TEST_SIZE=0.17):
    """Split the data into training, validation and test subsets.

    Two passes of `split_train_test` are made with the same `TEST_SIZE`:
    the first separates the test subset from the rest, the second separates
    the validation subset from what remains. With the default 0.17 this gives
    roughly 17% test, 14% validation (0.17 of the remaining 83%) and 69% train.

    When `save_file` is true, each subset is dumped with `utils.pkl_dump` to
    the paths in `config` ("dataset_train_path", "dataset_valid_path",
    "dataset_test_path"; index 0 for predictors, index 1 for the target).
    When `return_file` is true, the tuple
    `(x_train, y_train, x_valid, y_valid, x_test, y_test)` is returned;
    otherwise the function returns None.
    """

    # First pass: hold out the test subset.
    x_rest, x_test, y_rest, y_test = split_train_test(data_input,
                                                      data_output,
                                                      TEST_SIZE)

    # Second pass: carve the validation subset out of the remainder.
    x_train, x_valid, y_train, y_valid = split_train_test(x_rest,
                                                          y_rest,
                                                          TEST_SIZE)

    if save_file:
        subsets = (
            ("dataset_train_path", x_train, y_train),
            ("dataset_valid_path", x_valid, y_valid),
            ("dataset_test_path", x_test, y_test),
        )
        for path_key, predictors, target in subsets:
            utils.pkl_dump(predictors, config[path_key][0])
            utils.pkl_dump(target, config[path_key][1])

    if not return_file:
        return None
    return x_train, y_train, x_valid, y_valid, x_test, y_test


In [14]:
# Execute the data-splitting functions

# Separate the target (`card`) from the predictors; save_file = False skips pickling these two objects
output_df, input_df = split_input_output(credit_data,
                                        target_column = "card", 
                                        save_file = False)

# Split into train/validation/test; split_data's defaults apply here
# (save_file = True, TEST_SIZE = 0.17), so the six subsets are pickled
x_train, y_train, x_valid, y_valid, x_test, y_test = split_data(input_df,
                                                               output_df)


# Data Review

The `card` feature in `credit_data` is set as the dependent variable (saved as `output_df`), while the remaining features are the independent variables (saved as `input_df`).


In [15]:
# Recheck the separation of the dependent and independent variables taken from `credit_data`
print(f'The dependent variable (target):\n{output_df} \n')
print("-------------------------------------\n")
print(f'The independent variables (predictors):\n {input_df}')

The dependent variable (target):
0       yes
1       yes
2       yes
3       yes
4       yes
       ... 
1307    yes
1308     no
1309    yes
1310    yes
1311    yes
Name: card, Length: 1312, dtype: object 

-------------------------------------

The independent variables (predictors):
       reports       age  income     share  expenditure owner selfemp  \
0           0  37.66667  4.5200  0.033270   124.983300   yes      no   
1           0  33.25000  2.4200  0.005217     9.854167    no      no   
2           0  33.66667  4.5000  0.004156    15.000000   yes      no   
3           0  30.50000  2.5400  0.065214   137.869200    no      no   
4           0  32.16667  9.7867  0.067051   546.503300   yes      no   
...       ...       ...     ...       ...          ...   ...     ...   
1307        0  33.58333  4.5660  0.002146     7.333333   yes      no   
1308        5  23.91667  3.1920  0.000376     0.000000    no      no   
1309        0  40.58333  4.6000  0.026513   101.298300   yes     

Both `input_df` and `output_df` are split into three subsets that will be used for training, validating, and testing the model.

The training data has the majority proportion.




In [16]:
# Inspect the data-splitting result: shapes of the full data and of each
# training/validation/test subset, for predictors and then for the target
print(f"input_df shape: {input_df.shape}")
print(f"x_train shape: {x_train.shape}")
print(f"x_valid shape:{x_valid.shape}")
print(f"x_test shape: {x_test.shape}\n")
print("-------------------------------------\n")
print(f"output_df shape: {output_df.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_valid shape: {y_valid.shape}")
print(f"y_test shape: {y_test.shape}")


input_df shape: (1312, 11)
x_train shape: (903, 11)
x_valid shape:(185, 11)
x_test shape: (224, 11)

-------------------------------------

output_df shape: (1312,)
y_train shape: (903,)
y_valid shape: (185,)
y_test shape: (224,)


Stratified splitting on `output_df` keeps the proportion of each target value approximately the same across the training, validation, and test sets.

In [17]:
# Recheck the data stratification: normalized target value counts in each subset
print(f"y_train value proportion\n{y_train.value_counts(normalize = True)}")
print("-------------------------------------\n")
print(f"y_valid value proportion\n{y_valid.value_counts(normalize = True)}")
print("-------------------------------------\n")
print(f"y_test value proportion\n{y_test.value_counts(normalize = True)}")

y_train value proportion
yes    0.775194
no     0.224806
Name: card, dtype: float64
-------------------------------------

y_valid value proportion
yes    0.772973
no     0.227027
Name: card, dtype: float64
-------------------------------------

y_test value proportion
yes    0.776786
no     0.223214
Name: card, dtype: float64
