## Importing libraries

In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Load dataset

In [26]:
df=pd.read_csv("../data/credit_risk_dataset.csv")

In [27]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


## Separate target and feature variables

In [28]:
X=df.drop("loan_status",axis=1)
y=df["loan_status"]

In [29]:
X.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,0.55,Y,4


In [30]:
y.head()

0    1
1    0
2    1
3    1
4    1
Name: loan_status, dtype: int64

## Numeric vs categorical columns

In [31]:
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

In [32]:
num_cols

Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length'],
      dtype='object')

In [33]:
cat_cols

Index(['person_home_ownership', 'loan_intent', 'loan_grade',
       'cb_person_default_on_file'],
      dtype='object')

## Handle missing values

In [34]:
from sklearn.impute import SimpleImputer

num_imputer= SimpleImputer(strategy="median")
X[num_cols]=num_imputer.fit_transform(X[num_cols])

In [35]:
X.isnull().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

## Feature Engineering

In [36]:
X["age_group"] = pd.cut(X["person_age"],bins=[18, 25, 35, 50, 100],labels=["18-25", "26-35", "36-50", "50+"])

In [37]:
X=X.drop("person_age",axis=1)

In [38]:
X.head()

Unnamed: 0,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,age_group
0,59000.0,RENT,123.0,PERSONAL,D,35000.0,16.02,0.59,Y,3.0,18-25
1,9600.0,OWN,5.0,EDUCATION,B,1000.0,11.14,0.1,N,2.0,18-25
2,9600.0,MORTGAGE,1.0,MEDICAL,C,5500.0,12.87,0.57,N,3.0,18-25
3,65500.0,RENT,4.0,MEDICAL,C,35000.0,15.23,0.53,N,2.0,18-25
4,54400.0,RENT,8.0,MEDICAL,C,35000.0,14.27,0.55,Y,4.0,18-25


In [39]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   person_income               32581 non-null  float64 
 1   person_home_ownership       32581 non-null  object  
 2   person_emp_length           32581 non-null  float64 
 3   loan_intent                 32581 non-null  object  
 4   loan_grade                  32581 non-null  object  
 5   loan_amnt                   32581 non-null  float64 
 6   loan_int_rate               32581 non-null  float64 
 7   loan_percent_income         32581 non-null  float64 
 8   cb_person_default_on_file   32581 non-null  object  
 9   cb_person_cred_hist_length  32581 non-null  float64 
 10  age_group                   32576 non-null  category
dtypes: category(1), float64(6), object(4)
memory usage: 2.5+ MB


## Encode categorical variables

In [40]:
X = pd.get_dummies(X, drop_first=True)

In [41]:
X.head()

Unnamed: 0,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_EDUCATION,...,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_Y,age_group_26-35,age_group_36-50,age_group_50+
0,59000.0,123.0,35000.0,16.02,0.59,3.0,False,False,True,False,...,False,False,True,False,False,False,True,False,False,False
1,9600.0,5.0,1000.0,11.14,0.1,2.0,False,True,False,True,...,True,False,False,False,False,False,False,False,False,False
2,9600.0,1.0,5500.0,12.87,0.57,3.0,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
3,65500.0,4.0,35000.0,15.23,0.53,2.0,False,False,True,False,...,False,True,False,False,False,False,False,False,False,False
4,54400.0,8.0,35000.0,14.27,0.55,4.0,False,False,True,False,...,False,True,False,False,False,False,True,False,False,False


In [42]:
X.shape

(32581, 24)

## Scale numerical features

In [43]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [45]:
X_scaled

array([[-0.11414329, 28.92661352,  4.01940376, ..., -0.8552042 ,
        -0.32876163, -0.09360637],
       [-0.91114671,  0.05676262, -1.35864998, ..., -0.8552042 ,
        -0.32876163, -0.09360637],
       [-0.91114671, -0.92187639, -0.64684875, ..., -0.8552042 ,
        -0.32876163, -0.09360637],
       ...,
       [ 0.16012914, -0.43255688,  4.01940376, ..., -0.8552042 ,
        -0.32876163, 10.68303367],
       [ 1.35402091,  0.05676262,  0.85584274, ..., -0.8552042 ,
        -0.32876163, 10.68303367],
       [-0.38841572, -0.67721664, -0.49262515, ..., -0.8552042 ,
        -0.32876163, 10.68303367]], shape=(32581, 24))

## Train-Test split

In [46]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y,test_size=0.2,stratify=y,random_state=42)

In [47]:
X_train.shape, X_test.shape

((26064, 24), (6517, 24))

In [48]:
y_train.shape, y_test.shape

((26064,), (6517,))