In [13]:
# import modules 
import sys
from pathlib import Path 

# The project root is taken to be the parent of the directory the notebook runs from.
# This only computes a path; the working directory itself is left unchanged.
# NOTE(review): assumes the notebook lives exactly one level below the project root - confirm.
project_root = Path.cwd().resolve().parent

# Give the project root the highest import priority so that libraries kept in
# other project directories can be imported from this notebook.
sys.path.insert(0, str(project_root))

import pandas as pd
from typing import Tuple, Optional
from sklearn.model_selection import train_test_split
import joblib

In [14]:
# define load data function
def load_data(fname: str) -> pd.DataFrame:
    '''
    Read a CSV file into a dataframe and print its shape.

    Param:
    fname <str> : path of the raw CSV dataset to read.

    Return:
    <pd.DataFrame> : dataframe parsed from the CSV file.
    '''
    frame = pd.read_csv(fname)

    # report (rows, columns) so the caller can sanity-check the load
    shape = frame.shape
    print('Data Shape:', shape)

    return frame

In [15]:
# location of the raw credit-risk CSV inside the project tree
FNAME = project_root.joinpath('data', 'raw', 'credit_risk_dataset.csv')

# read the raw dataset, then preview its first rows as the cell output
data = load_data(fname=FNAME)
data.head()

Data Shape: (32581, 12)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [16]:
# define split input output func
def split_input_output(data: pd.DataFrame, target_col: str) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Separate a dataset into its feature columns (X) and its target column (y).

    The input dataframe is not modified; the features are returned as a new
    dataframe without the target column. The shapes of the original data, the
    features and the target are printed.

    Params:
    data <pd.DataFrame> : Dataset holding both the features and the target column.
    target_col <str> : Name of the column to use as the target.

    Returns:
    Tuple<pd.DataFrame, pd.Series>
        X : pd.DataFrame
            Every column of `data` except `target_col`.
        y : pd.Series
            The `target_col` column of `data`.
    """
    # features first, target second: same lookup order as before, so a missing
    # column fails at the same step
    features = data.drop(labels=target_col, axis='columns')
    target = data[target_col]

    for label, part in (('Original data', data), ('X data', features), ('y data', target)):
        print(f'{label} shape:', part.shape)

    return features, target

In [17]:
# name of the label column that is separated from the features
TARGET_COL = 'loan_status'

# separate the feature columns (X) from the target column (y)
X, y = split_input_output(
    data=data,
    target_col=TARGET_COL,
)

Original data shape: (32581, 12)
X data shape: (32581, 11)
y data shape: (32581,)


In [18]:
# define split train & test function
def split_train_test(X: pd.DataFrame, y: pd.Series, test_size: float, random_state: Optional[int] = None) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:

    '''
    Divide features and target into two parts with sklearn's train_test_split.

    The split is always stratified on `y` (`stratify=y` is passed through), and
    the shape of each resulting part is printed.

    Params:
    X <pd.DataFrame> : Feature matrix.
    y <pd.Series> : Target variable; also used as the stratification labels.
    test_size <float> : Proportion of the rows to place in the second ("test") part (e.g., 0.2 for 20%).
    random_state <Optional[int], default=None> : Seed forwarded to train_test_split. Pass an integer for reproducible output.

    Returns:
    Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]
        X_train : pd.DataFrame
            Features of the first part.
        X_test : pd.DataFrame
            Features of the second part.
        y_train : pd.Series
            Target of the first part.
        y_test : pd.Series
            Target of the second part.
    '''

    parts = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )
    X_train, X_test, y_train, y_test = parts

    # the labels always say train/test, whatever the caller names the parts
    labelled = (
        ('X train', X_train),
        ('X test', X_test),
        ('y train', y_train),
        ('y test', y_test),
    )
    for label, part in labelled:
        print(f'{label} shape:', part.shape)

    return X_train, X_test, y_train, y_test


In [19]:
# first split: hold out 20% of the rows (X_not_train / y_not_train) and keep
# the remaining rows as the training set
X_train, X_not_train, y_train, y_not_train = split_train_test(X=X, y=y, test_size=0.2, random_state=42)

X train shape: (26064, 11)
X test shape: (6517, 11)
y train shape: (26064,)
y test shape: (6517,)


In [20]:
# second split: halve the held-out rows into a validation set and a test set
# (split_train_test labels its first part "train" when printing shapes; here
# that first part is the validation set)
X_valid, X_test, y_valid, y_test = split_train_test(X=X_not_train, y=y_not_train, test_size=0.5, random_state=42)

X train shape: (3258, 11)
X test shape: (3259, 11)
y train shape: (3258,)
y test shape: (3259,)


In [21]:
# define serialize function
def serialize_data(data: pd.DataFrame, path: str) -> None:
    """
    Serialize and save a Python object to disk using joblib.

    When `path` is a string or a pathlib.Path, its parent directory is created
    first if it does not exist yet, so saving also works when the target folder
    (e.g. data/interim on a fresh checkout) is missing. Any other kind of
    `path` is handed to joblib unchanged.

    Params
    data <pd.DataFrame> : Python object to be serialized and saved.
    path <str> : File path where the serialized object will be stored.

    Return : None
    """
    # announce the work before doing it; the success line below is only
    # reached when the dump did not raise
    print('Saving object. . .')

    if isinstance(path, (str, Path)):
        # joblib.dump does not create missing folders itself
        Path(path).parent.mkdir(parents=True, exist_ok=True)

    joblib.dump(data, path)
    print(f'Your object has been successfully saved and stored into: {path}\n')

In [22]:
# collect every split under the file stem it will be saved as
data_dict = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
    'X_valid': X_valid,
    'y_valid': y_valid
}

# write each split to data/interim/<name>.pkl
# The loop variable is deliberately not called `data`: a for-loop target stays
# bound after the loop ends, so reusing that name would silently replace the raw
# dataset loaded earlier with the last split and break re-running earlier cells.
for name, split in data_dict.items():
    serialize_data(data=split, path=project_root/'data'/'interim'/f'{name}.pkl')

Saving object. . .
Your object has been successfully saved and stored into: /home/bagaskoroah/ml_process/BAGAS_MLPROCESS/data/interim/X_train.pkl

Saving object. . .
Your object has been successfully saved and stored into: /home/bagaskoroah/ml_process/BAGAS_MLPROCESS/data/interim/y_train.pkl

Saving object. . .
Your object has been successfully saved and stored into: /home/bagaskoroah/ml_process/BAGAS_MLPROCESS/data/interim/X_test.pkl

Saving object. . .
Your object has been successfully saved and stored into: /home/bagaskoroah/ml_process/BAGAS_MLPROCESS/data/interim/y_test.pkl

Saving object. . .
Your object has been successfully saved and stored into: /home/bagaskoroah/ml_process/BAGAS_MLPROCESS/data/interim/X_valid.pkl

Saving object. . .
Your object has been successfully saved and stored into: /home/bagaskoroah/ml_process/BAGAS_MLPROCESS/data/interim/y_valid.pkl



In [23]:
# define deserialize function
def deserialize_data(path: str) -> pd.DataFrame:
    """
    Load and deserialize a Python object from disk using joblib.

    The success message is printed only after joblib has returned the object,
    so a failed load raises without reporting success.

    Param
    path <str> : File path of the serialized object.

    Return
    data <pd.DataFrame> : The deserialized Python object loaded from disk.
    """
    print('Load object. . .')

    # NOTE(review): joblib files are pickle-based and unpickling can execute
    # arbitrary code - only load files that this project wrote itself.
    data = joblib.load(path)

    # reached only when the load above did not raise
    print(f'{path} has been successfully loaded!.')
    return data

In [24]:
# round-trip check: read the serialized X_test split back from data/interim
# and show it as the cell output
data = deserialize_data(
    path=project_root.joinpath('data', 'interim', 'X_test.pkl')
)
data

Load object. . .
/home/bagaskoroah/ml_process/BAGAS_MLPROCESS/data/interim/X_test.pkl has been successfully loaded!.


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
20439,30,31000,RENT,7.0,MEDICAL,D,4750,14.84,0.15,N,9
30809,36,60000,RENT,3.0,EDUCATION,A,9500,7.14,0.16,N,12
22978,29,33000,RENT,5.0,HOMEIMPROVEMENT,A,8000,5.79,0.24,N,8
19435,27,40000,RENT,3.0,PERSONAL,A,3000,7.05,0.07,N,8
31011,39,56000,RENT,16.0,DEBTCONSOLIDATION,C,10000,13.06,0.18,N,11
...,...,...,...,...,...,...,...,...,...,...,...
30578,36,34000,RENT,2.0,DEBTCONSOLIDATION,B,8000,,0.24,N,16
14216,24,50000,RENT,4.0,DEBTCONSOLIDATION,A,14400,,0.29,N,2
30342,38,52000,MORTGAGE,13.0,DEBTCONSOLIDATION,B,5000,10.99,0.10,N,15
19525,27,34000,OWN,7.0,EDUCATION,C,8000,12.53,0.24,N,10
