In [24]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import joblib
import sklearn


In [2]:
# Read the raw loan records from the CSV next to the notebook and preview
# the first five rows.
df = pd.read_csv('loan_data.csv')
df.head(n=5)

Unnamed: 0,age,income_lpa,credit_score,loan_amount,loan_tenure_months,employment_type,education_level,default_risk
0,56,24.2,570,18.43,48,unemployed,highschool,High
1,38,5.95,766,2.95,24,salaried,postgraduate,Medium
2,41,11.01,430,1.0,12,self-employed,highschool,High
3,41,25.22,685,4.95,48,salaried,highschool,Low
4,50,10.17,321,0.67,6,unemployed,highschool,High


In [3]:
# Inspect the distinct education levels present in the data
df['education_level'].unique()

array(['highschool', 'postgraduate', 'graduate'], dtype=object)

In [4]:
# Inspect the distinct employment types present in the data
df['employment_type'].unique()

array(['unemployed', 'salaried', 'self-employed'], dtype=object)

In [5]:
# Work on an independent copy so feature engineering never touches the raw frame
df_feat = df.copy(deep=True)

In [6]:
# Inspect the distinct age values before bucketing them into groups
df['age'].unique()

array([56, 38, 41, 50, 44, 64, 54, 21, 43, 24, 57, 46, 31, 42, 32, 25, 52,
       45, 30, 47, 39, 26, 29, 34, 40, 19, 23, 37, 59, 49, 33, 35, 36, 61,
       63, 20, 51, 62, 18, 53, 58, 48, 22, 27, 28, 60, 55], dtype=int64)

In [7]:
# Feature 1: age group
def age_group(age):
    """Return the age-group label for ``age``.

    Brackets (upper bounds are exclusive):
        age < 24  -> 'young'
        age < 40  -> 'adult'
        age < 60  -> 'Senior'
        otherwise -> 'Old'

    The labels' mixed casing is kept exactly as-is, because these strings
    are the category values stored in the ``age_group`` column.
    """
    brackets = ((24, 'young'), (40, 'adult'), (60, 'Senior'))
    # Brackets are ordered by ascending bound, so the first match wins.
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return 'Old'

In [8]:
# Derive the categorical age_group feature from the raw age column
df_feat['age_group'] = df['age'].map(age_group)

In [9]:
# Preview the frame with the new age_group column
df_feat.head()

Unnamed: 0,age,income_lpa,credit_score,loan_amount,loan_tenure_months,employment_type,education_level,default_risk,age_group
0,56,24.2,570,18.43,48,unemployed,highschool,High,Senior
1,38,5.95,766,2.95,24,salaried,postgraduate,Medium,adult
2,41,11.01,430,1.0,12,self-employed,highschool,High,Senior
3,41,25.22,685,4.95,48,salaried,highschool,Low,Senior
4,50,10.17,321,0.67,6,unemployed,highschool,High,Senior


In [10]:
# Drop the raw age column; the age_group feature is used in its place.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once 'age' is gone.
df_feat = df_feat.drop(columns=['age'], errors='ignore')

In [11]:
# Preview the frame after removing the age column
df_feat.head()

Unnamed: 0,income_lpa,credit_score,loan_amount,loan_tenure_months,employment_type,education_level,default_risk,age_group
0,24.2,570,18.43,48,unemployed,highschool,High,Senior
1,5.95,766,2.95,24,salaried,postgraduate,Medium,adult
2,11.01,430,1.0,12,self-employed,highschool,High,Senior
3,25.22,685,4.95,48,salaried,highschool,Low,Senior
4,10.17,321,0.67,6,unemployed,highschool,High,Senior


In [12]:
# List the columns available for building features and target
df_feat.columns

Index(['income_lpa', 'credit_score', 'loan_amount', 'loan_tenure_months',
       'employment_type', 'education_level', 'default_risk', 'age_group'],
      dtype='object')

In [13]:
# Build the feature matrix (X) and the target vector (y)
X = df_feat[
    [
        'income_lpa',
        'credit_score',
        'loan_amount',
        'loan_tenure_months',
        'employment_type',
        'education_level',
        'age_group',
    ]
]
y = df_feat['default_risk']

In [14]:
# Preview the feature matrix
X.head()

Unnamed: 0,income_lpa,credit_score,loan_amount,loan_tenure_months,employment_type,education_level,age_group
0,24.2,570,18.43,48,unemployed,highschool,Senior
1,5.95,766,2.95,24,salaried,postgraduate,adult
2,11.01,430,1.0,12,self-employed,highschool,Senior
3,25.22,685,4.95,48,salaried,highschool,Senior
4,10.17,321,0.67,6,unemployed,highschool,Senior


In [15]:
# Preview the target labels
y.head()

0      High
1    Medium
2      High
3       Low
4      High
Name: default_risk, dtype: object

In [16]:
# Feature names grouped by type: string-valued categorical columns
# and numeric columns.
categorical_columns = [
    'employment_type',
    'education_level',
    'age_group',
]
numerical_columns = [
    'income_lpa',
    'credit_score',
    'loan_amount',
    'loan_tenure_months',
]

In [17]:
# Preprocessing: one-hot encode the categorical features and pass the numeric
# ones through unchanged.
# handle_unknown='ignore' makes the encoder emit all-zero indicator columns for
# a category it did not see during fit, instead of raising at transform time.
# This matters because the fitted pipeline is persisted and reused for
# prediction. Encoding of categories seen during fit is unaffected.
processor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ('num', 'passthrough', numerical_columns),
    ]
)

In [18]:
# Display the configured ColumnTransformer
processor

In [19]:
# Chain preprocessing and the random forest so both are fitted and applied
# together as a single estimator.
pipeline = Pipeline(
    [
        ('processor', processor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

In [20]:
# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split),
# then fit the whole pipeline on the remaining rows.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)
pipeline.fit(x_train, y_train)

In [26]:
# Score the fitted pipeline on the held-out split
y_pred = pipeline.predict(x_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.975

In [27]:
# save the model with pickle (disabled alternative; the joblib.dump cell is the one in use)
# import pickle
# pickle_model_path= 'load_risk_model.pkl'
# with open(pickle_model_path, 'wb') as f:
#     pickle.dump(pipeline, f)

In [22]:
# Persist the fitted pipeline (preprocessing + classifier) to disk with joblib
joblib.dump(value=pipeline, filename="model.pkl")

['model.pkl']

In [25]:
# Print the scikit-learn version installed in this kernel
print(sklearn.__version__)

1.2.1
