### Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Loading Dataset

In [2]:
df = pd.read_csv('./data/credit_risk_dataset.csv')
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


In [4]:
df.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32581.0,32581.0,31686.0,32581.0,29465.0,32581.0,32581.0,32581.0
mean,27.7346,66074.85,4.789686,9589.371106,11.011695,0.218164,0.170203,5.804211
std,6.348078,61983.12,4.14263,6322.086646,3.240459,0.413006,0.106782,4.055001
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38500.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,79200.0,7.0,12200.0,13.47,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


In [5]:
df.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

### Filling NA values with mean values

In [6]:
df['loan_int_rate'] = df['loan_int_rate'].fillna(value=df['loan_int_rate'].mean())
df['person_emp_length'] = df['person_emp_length'].fillna(value=df['person_emp_length'].mean())

### Changing categorical values to numerical values and performing standardization on them 

In [7]:
categorical_cols = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
numerical_cols = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 
                    'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [8]:
bool_cols = df.select_dtypes(include='bool').columns
df[bool_cols] = df[bool_cols].astype(int)
X = df.drop(columns='loan_status')
y = df['loan_status']
df.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_Y
0,-0.903374,-0.114143,28.935738,4.019404,1.625251,1,3.931411,-0.691554,0,0,...,0,1,0,0,0,1,0,0,0,1
1,-1.060904,-0.911147,0.051481,-1.35865,0.041636,0,-0.657458,-0.938167,0,1,...,0,0,0,1,0,0,0,0,0,0
2,-0.430783,-0.911147,-0.927646,-0.646849,0.603041,1,3.74411,-0.691554,0,0,...,1,0,0,0,1,0,0,0,0,0
3,-0.745843,-0.009274,-0.193301,4.019404,1.368887,1,3.369508,-0.938167,0,0,...,1,0,0,0,1,0,0,0,0,0
4,-0.588313,-0.188358,0.785826,4.019404,1.057357,1,3.556809,-0.444942,0,0,...,1,0,0,0,1,0,0,0,0,1


### Splitting data into training and testing sets and saving them in train and test files

In [9]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [10]:
x_train.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_Y
32377,5.712903,-0.323881,-0.682865,-0.757573,0.025411,-0.657458,4.487315,0,0,1,...,0,1,0,0,1,0,0,0,0,1
1338,-0.273252,-0.646554,-1.172428,-0.172315,1.764791,1.496501,-0.691554,0,1,0,...,0,0,0,0,0,0,1,0,0,0
7047,-0.745843,-0.243213,-0.438083,1.014021,0.680924,1.3092,-0.691554,0,0,0,...,0,1,0,0,1,0,0,0,0,1
8225,-0.903374,-0.16248,0.296263,-0.56776,-1.01627,-0.563808,-0.444942,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7178,-0.588313,0.20853,-0.438083,-0.409582,0.495952,-0.751109,-0.691554,0,0,1,...,0,1,0,0,1,0,0,0,0,0


In [11]:
train_data = pd.concat([pd.DataFrame(x_train), pd.DataFrame(y_train)], axis=1)
test_data = pd.concat([pd.DataFrame(x_test), pd.DataFrame(y_test)], axis=1)
train_data.to_csv('./data/train.csv', index=False)
test_data.to_csv('./data/test.csv', index=False)

In [12]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26064 entries, 32377 to 23654
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   person_age                   26064 non-null  float64
 1   person_income                26064 non-null  float64
 2   person_emp_length            26064 non-null  float64
 3   loan_amnt                    26064 non-null  float64
 4   loan_int_rate                26064 non-null  float64
 5   loan_percent_income          26064 non-null  float64
 6   cb_person_cred_hist_length   26064 non-null  float64
 7   person_home_ownership_OTHER  26064 non-null  int32  
 8   person_home_ownership_OWN    26064 non-null  int32  
 9   person_home_ownership_RENT   26064 non-null  int32  
 10  loan_intent_EDUCATION        26064 non-null  int32  
 11  loan_intent_HOMEIMPROVEMENT  26064 non-null  int32  
 12  loan_intent_MEDICAL          26064 non-null  int32  
 13  loan_intent_PERSO

In [13]:
y_test.info()

<class 'pandas.core.series.Series'>
Index: 6517 entries, 14668 to 24385
Series name: loan_status
Non-Null Count  Dtype
--------------  -----
6517 non-null   int64
dtypes: int64(1)
memory usage: 101.8 KB
