In [60]:
# Importing necessary libraries (Python packages) for the data manipulation, preprocessing, and splitting.
import pandas as pd
import numpy as np
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split

In [61]:
# Load the credit dataset from 'credit.csv' into the working DataFrame `df`.
# The path is relative, so it is resolved against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='credit.csv')

In [62]:
# Columns treated as unwanted; they are removed from `df` before any further preprocessing.
unwanted_features = [
    'ID',
    'Customer_ID',
    'Name',
    'Age',
    'SSN',
    'Type_of_Loan',
    'Num_Credit_Inquiries',
]
# Rebind `df` to the reduced frame instead of dropping in place.
df = df.drop(columns=unwanted_features)

In [63]:
# Numerical columns that the quantile transformation is applied to.
# Every column appears exactly once: 'Credit_Utilization_Ratio' used to be listed twice, which made
# the same column be selected (and assigned back) twice.
# NOTE(review): the duplicate may have been a typo for another numerical column of the dataset —
# confirm against the schema of credit.csv.
numerical_cols = ['Month', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts', 'Num_Credit_Card',
                  'Interest_Rate', 'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Num_of_Loan',
                  'Credit_Utilization_Ratio', 'Changed_Credit_Limit',
                  'Outstanding_Debt', 'Credit_History_Age', 'Total_EMI_per_month',
                  'Amount_invested_monthly']

In [64]:
# Map every numerical column onto a normal distribution with a quantile (rank-based) transform.
# random_state is fixed so that repeated runs give the same result.
# NOTE(review): the transformer is fitted on the whole DataFrame, i.e. before the
# train/validation/test split done later in this notebook, so the quantiles are estimated from
# rows that end up in the validation and test sets. Consider fitting on the training rows only.
transformer = QuantileTransformer(random_state=0, output_distribution='normal')
numerical_transformed = transformer.fit_transform(df[numerical_cols])
df[numerical_cols] = numerical_transformed

In [65]:
# Categorical columns that are one-hot encoded in the next step.
# 'Delay_from_due_date' and 'Num_of_Delayed_Payment' are intentionally NOT listed here: both are in
# numerical_cols and have already been quantile-transformed into continuous values, so one-hot
# encoding them would create one indicator column per distinct value and discard the numeric feature.
categorical_cols = ['Occupation', 'Credit_Mix', 'Payment_Behaviour', 'Payment_of_Min_Amount']

In [66]:
# One-hot encode the categorical columns: each listed column is replaced by indicator columns,
# and `df` is rebound to the encoded frame.
df = pd.get_dummies(data=df, columns=categorical_cols)

In [67]:
# Separate the target column 'Credit_Score' (y) from the remaining feature columns (X).
# The actual train/validation/test split happens in a later cell.
# NumPy's global seed is fixed here for reproducibility.
np.random.seed(0)
y = df['Credit_Score']
X = df.drop(columns=['Credit_Score'])

In [68]:
# Split the data into training (80%), validation (10%) and test (10%) sets.
# Both splits are stratified on the target so every subset keeps the class proportions of
# 'Credit_Score'; previously no `stratify` argument was passed, so the splits were plain random
# splits despite being described as stratified.
# Step 1: hold out 20% of the rows; step 2: cut that hold-out in half into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=0
)

In [69]:
# Report how many rows ended up in each of the three subsets.
for label, subset in (
    ('Training set size: ', X_train),
    ('Validation set size: ', X_val),
    ('Testing set size: ', X_test),
):
    print(label, len(subset))

Training set size:  80000
Validation set size:  10000
Testing set size:  10000
