In [1]:
import pandas as pd

In [2]:
def load_data(fname):
    """Read a CSV file into a pandas DataFrame and report its shape.

    The file is read as-is with ``pd.read_csv``; no rows are dropped or
    modified (in particular, duplicate rows are kept).

    Args:
        fname (str | os.PathLike): Path to the CSV file.

    Returns:
        pd.DataFrame: Contents of the CSV file.

    Raises:
        TypeError: If `fname` is neither a string nor a path-like object.
    """
    # Function-scope import so this cell does not depend on another cell
    # having imported `os`.
    import os

    # Accept pathlib.Path and other path-like objects as well as plain
    # strings; anything else is rejected up front with a clear message.
    if not isinstance(fname, (str, os.PathLike)):
        raise TypeError("`fname` must be a string or a path-like object.")

    data = pd.read_csv(fname)
    print(f'Data shape  : {data.shape}')
    return data

In [3]:
# Location of the raw credit-risk CSV (relative path).
FNAME = 'data/raw/credit_risk_dataset.csv'

# Read the file, then show the first rows as the cell output.
data = load_data(FNAME)
data.head()

Data shape  : (32581, 12)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [4]:
def split_input_output(data, target_col):
    """Separate a dataset into input features (X) and target (y).

    The target column is removed from a copy of `data` to form X, and is
    selected on its own to form y. The shapes of the original data, X and y
    are printed. `data` itself is not modified.

    Args:
        data (pd.DataFrame): DataFrame containing the complete dataset.
        target_col (str): Name of the column holding the target values.

    Returns:
        tuple: ``(X, y)`` where X is `data` without `target_col` and y is
        ``data[target_col]`` (a Series when `target_col` names one column).
    """
    # Everything except the target column becomes the feature matrix.
    X = data.drop(target_col, axis="columns")

    # The target column alone becomes the output.
    y = data[target_col]

    print(f'Original data shape : {data.shape}')
    print(f'X data shape        : {X.shape}')
    print(f'y data shape        : {y.shape}')

    return X, y

In [5]:
TARGET_COL = 'loan_status'
X, y = split_input_output(data=data,
                          target_col=TARGET_COL)

Original data shape : (32581, 12)
X data shape        : (32581, 11)
y data shape        : (32581,)


In [6]:
from sklearn.model_selection import train_test_split

In [8]:
def split_train_test(X, y, test_size, random_state):
    """Split features and target into two stratified partitions.

    Thin wrapper around ``train_test_split`` that passes ``stratify=y`` and
    prints the shape of each of the four resulting pieces.

    Args:
        X: Features (independent variables).
        y: Target (dependent variable); also used as the stratification labels.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )

    # Report the size of every partition.
    print(f"X_train shape   : {X_train.shape}")
    print(f"y_train shape   : {y_train.shape}")
    print(f"X_test shape    : {X_test.shape}")
    print(f"y_test shape    : {y_test.shape}")

    return X_train, X_test, y_train, y_test

In [9]:
# Split the data
# First, split the train & not train
# Stage 1: hold out 20% of the data; the remaining 80% is the training set.
X_train, X_not_train, y_train, y_not_train = split_train_test(
    X, y, test_size=0.2, random_state=42
)
print()  # blank line between the two shape reports
# Stage 2: split the held-out part in half into validation and test sets.
X_valid, X_test, y_valid, y_test = split_train_test(
    X_not_train, y_not_train, test_size=0.5, random_state=42
)

X_train shape   : (26064, 11)
y_train shape   : (26064,)
X_test shape    : (6517, 11)
y_test shape    : (6517,)

X_train shape   : (3258, 11)
y_train shape   : (3258,)
X_test shape    : (3259, 11)
y_test shape    : (3259,)


In [10]:
import joblib

In [11]:
def serialize_data(data, path):
    """Persist an object to disk with ``joblib.dump``.

    Args:
        data: Object to be serialized.
        path: Destination file path.

    Returns:
        None: The object is written to `path`; nothing is returned.
    """
    joblib.dump(value=data, filename=path)
    return None

In [12]:
# Training split
serialize_data(X_train, "X_train.pkl")
serialize_data(y_train, "y_train.pkl")

# Validation split
serialize_data(X_valid, "X_valid.pkl")
serialize_data(y_valid, "y_valid.pkl")

# Test split
serialize_data(X_test, "X_test.pkl")
serialize_data(y_test, "y_test.pkl")

In [13]:
def deserialized_data(path):
    """Load a previously serialized object with ``joblib.load``.

    Args:
        path: Path to the file where the data is stored.

    Returns:
        The object read back from `path`.
    """
    # NOTE(review): joblib files are pickle-based, so this should only be
    # pointed at files from a trusted source — confirm against callers.
    return joblib.load(path)
