<a href="https://colab.research.google.com/github/arfanrifqi/loan_prediction/blob/main/loan_data_prediction(modeling).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LIBRARY

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


#READ DATASET

In [77]:
load_data = pd.read_csv('/content/gdrive/MyDrive/dataset/credit_risk/data_clean.csv')

In [78]:
load_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466285 entries, 0 to 466284
Data columns (total 26 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   loan_amnt                    466285 non-null  int64  
 1   grade                        466285 non-null  object 
 2   annual_inc                   466285 non-null  float64
 3   verification_status          466285 non-null  object 
 4   purpose                      466285 non-null  object 
 5   dti                          466285 non-null  float64
 6   delinq_2yrs                  466285 non-null  float64
 7   inq_last_6mths               466285 non-null  float64
 8   open_acc                     466285 non-null  float64
 9   pub_rec                      466285 non-null  float64
 10  revol_bal                    466285 non-null  int64  
 11  revol_util                   466285 non-null  float64
 12  total_acc                    466285 non-null  float64
 13 

# FEATURE SCALING AND TRANSFORMATION

## using one hot encoding

In [79]:
cat_features = [col for col in load_data.select_dtypes(include='object').columns.tolist()]

In [80]:
onehot = pd.get_dummies(load_data[cat_features], drop_first=True)
onehot.head()

Unnamed: 0,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,verification_status_Source Verified,verification_status_Verified,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,initial_list_status_w
0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


## Standardization

In [81]:
num_features = [col for col in load_data.columns.tolist() if col not in cat_features + ['bad_flag']]

In [82]:
ss = StandardScaler()
std = pd.DataFrame(ss.fit_transform(load_data[num_features]), columns=num_features)

In [83]:
std.head()

Unnamed: 0,loan_amnt,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,tot_cur_bal,tot_coll_amt,mths_since_last_delinq,total_rec_late_fee,recoveries,collections_12_mths_ex_med,acc_now_delinq,term_months,emp_length_int,mths_since_earliest_cr_line,mths_since_issue_d
0,-1.124392,-0.896551,1.328632,-0.357012,0.17892,-1.641166,-0.31429,-0.124888,1.159498,-1.384557,-0.792648,-0.012089,-0.708792,-0.123464,-0.154549,-0.083608,-0.058307,-0.616225,1.138605,1.764941,1.446626
1,-1.426088,-0.787387,-2.065791,-0.357012,3.843328,-1.641166,-0.31429,-0.703378,-1.96598,-1.815538,-0.792648,-0.012089,-0.708792,-0.123464,0.05747,-0.083608,-0.058307,1.622784,-1.523744,-0.212426,1.446626
2,-1.438156,-1.110294,-1.082491,-0.357012,1.095022,-1.841641,-0.31429,-0.642003,1.78207,-1.298361,-0.792648,-0.012089,-0.708792,-0.123464,-0.154549,-0.083608,-0.058307,-0.616225,1.138605,-0.570896,1.446626
3,-0.521001,-0.438063,0.354248,-0.357012,0.17892,-0.237839,-0.31429,-0.514224,-1.478018,1.028934,-0.792648,-0.012089,0.860811,3.099264,-0.154549,-0.083608,-0.058307,-0.616225,1.138605,0.226989,1.446626
4,-1.365749,0.122311,0.091865,-0.357012,-0.737182,0.764538,-0.31429,0.558748,-0.094058,1.11513,-0.792648,-0.012089,0.991612,-0.123464,-0.154549,-0.083608,-0.058307,1.622784,-1.257509,0.238553,1.446626


In [84]:
onehot_data_model = pd.concat([onehot, load_data[['bad_flag']]], axis=1)

In [85]:
onehot_data_model.head()

Unnamed: 0,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,verification_status_Source Verified,verification_status_Verified,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,initial_list_status_w,bad_flag
0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


## using Advance Categorical Preprocessing

In [111]:
cat_data = load_data.pivot_table(index=cat_features,
                                values=['recoveries'],
                                aggfunc=[np.mean, np.median, np.max, np.min])
cat_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,mean,median,amax,amin
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,recoveries,recoveries,recoveries,recoveries
grade,verification_status,purpose,initial_list_status,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
A,Not Verified,car,f,9.707613,0.0,1152.23,0.0
A,Not Verified,car,w,3.716667,0.0,590.95,0.0
A,Not Verified,credit_card,f,21.158087,0.0,16502.69,0.0
A,Not Verified,credit_card,w,2.031246,0.0,2160.2,0.0
A,Not Verified,debt_consolidation,f,17.835116,0.0,7002.1,0.0


In [112]:
cat_mean = cat_data['mean'].reset_index()
cat_median = cat_data['median'].reset_index()
cat_amax = cat_data['amax'].reset_index()
cat_amin = cat_data['amin'].reset_index()

In [113]:
temp_name = []
for column in cat_mean.columns:
    if column == cat_features:
        temp_name.append(column)
    else:
        name = 'mean' + '_' + column
        temp_name.append(name)

cat_mean.columns = temp_name

In [114]:
temp_name = []
for column in cat_median.columns:
    if column == cat_features:
        temp_name.append(column)
    else:
        name = 'median' + '_' + column
        temp_name.append(name)

cat_median.columns = temp_name

In [115]:
temp_name = []
for column in cat_amax.columns:
    if column == cat_features:
        temp_name.append(column)
    else:
        name = 'amax' + '_' + column
        temp_name.append(name)

cat_amax.columns = temp_name

In [116]:
temp_name = []
for column in cat_amin.columns:
    if column == cat_features:
        temp_name.append(column)
    else:
        name = 'amin' + '_' + column
        temp_name.append(name)

cat_amin.columns = temp_name

In [117]:
cat_amin.head()

Unnamed: 0,amin_grade,amin_verification_status,amin_purpose,amin_initial_list_status,amin_recoveries
0,A,Not Verified,car,f,0.0
1,A,Not Verified,car,w,0.0
2,A,Not Verified,credit_card,f,0.0
3,A,Not Verified,credit_card,w,0.0
4,A,Not Verified,debt_consolidation,f,0.0


In [118]:
join_data = pd.concat([cat_mean, cat_median, cat_amax, cat_amin], axis=1)

In [119]:
join_data.head()

Unnamed: 0,mean_grade,mean_verification_status,mean_purpose,mean_initial_list_status,mean_recoveries,median_grade,median_verification_status,median_purpose,median_initial_list_status,median_recoveries,amax_grade,amax_verification_status,amax_purpose,amax_initial_list_status,amax_recoveries,amin_grade,amin_verification_status,amin_purpose,amin_initial_list_status,amin_recoveries
0,A,Not Verified,car,f,9.707613,A,Not Verified,car,f,0.0,A,Not Verified,car,f,1152.23,A,Not Verified,car,f,0.0
1,A,Not Verified,car,w,3.716667,A,Not Verified,car,w,0.0,A,Not Verified,car,w,590.95,A,Not Verified,car,w,0.0
2,A,Not Verified,credit_card,f,21.158087,A,Not Verified,credit_card,f,0.0,A,Not Verified,credit_card,f,16502.69,A,Not Verified,credit_card,f,0.0
3,A,Not Verified,credit_card,w,2.031246,A,Not Verified,credit_card,w,0.0,A,Not Verified,credit_card,w,2160.2,A,Not Verified,credit_card,w,0.0
4,A,Not Verified,debt_consolidation,f,17.835116,A,Not Verified,debt_consolidation,f,0.0,A,Not Verified,debt_consolidation,f,7002.1,A,Not Verified,debt_consolidation,f,0.0


In [120]:
advance_data = pd.concat([join_data, std, load_data['bad_flag']], axis=1)
advance_data.head()

Unnamed: 0,mean_grade,mean_verification_status,mean_purpose,mean_initial_list_status,mean_recoveries,median_grade,median_verification_status,median_purpose,median_initial_list_status,median_recoveries,amax_grade,amax_verification_status,amax_purpose,amax_initial_list_status,amax_recoveries,amin_grade,amin_verification_status,amin_purpose,amin_initial_list_status,amin_recoveries,loan_amnt,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,tot_cur_bal,tot_coll_amt,mths_since_last_delinq,total_rec_late_fee,recoveries,collections_12_mths_ex_med,acc_now_delinq,term_months,emp_length_int,mths_since_earliest_cr_line,mths_since_issue_d,bad_flag
0,A,Not Verified,car,f,9.707613,A,Not Verified,car,f,0.0,A,Not Verified,car,f,1152.23,A,Not Verified,car,f,0.0,-1.124392,-0.896551,1.328632,-0.357012,0.17892,-1.641166,-0.31429,-0.124888,1.159498,-1.384557,-0.792648,-0.012089,-0.708792,-0.123464,-0.154549,-0.083608,-0.058307,-0.616225,1.138605,1.764941,1.446626,0
1,A,Not Verified,car,w,3.716667,A,Not Verified,car,w,0.0,A,Not Verified,car,w,590.95,A,Not Verified,car,w,0.0,-1.426088,-0.787387,-2.065791,-0.357012,3.843328,-1.641166,-0.31429,-0.703378,-1.96598,-1.815538,-0.792648,-0.012089,-0.708792,-0.123464,0.05747,-0.083608,-0.058307,1.622784,-1.523744,-0.212426,1.446626,1
2,A,Not Verified,credit_card,f,21.158087,A,Not Verified,credit_card,f,0.0,A,Not Verified,credit_card,f,16502.69,A,Not Verified,credit_card,f,0.0,-1.438156,-1.110294,-1.082491,-0.357012,1.095022,-1.841641,-0.31429,-0.642003,1.78207,-1.298361,-0.792648,-0.012089,-0.708792,-0.123464,-0.154549,-0.083608,-0.058307,-0.616225,1.138605,-0.570896,1.446626,0
3,A,Not Verified,credit_card,w,2.031246,A,Not Verified,credit_card,w,0.0,A,Not Verified,credit_card,w,2160.2,A,Not Verified,credit_card,w,0.0,-0.521001,-0.438063,0.354248,-0.357012,0.17892,-0.237839,-0.31429,-0.514224,-1.478018,1.028934,-0.792648,-0.012089,0.860811,3.099264,-0.154549,-0.083608,-0.058307,-0.616225,1.138605,0.226989,1.446626,0
4,A,Not Verified,debt_consolidation,f,17.835116,A,Not Verified,debt_consolidation,f,0.0,A,Not Verified,debt_consolidation,f,7002.1,A,Not Verified,debt_consolidation,f,0.0,-1.365749,0.122311,0.091865,-0.357012,-0.737182,0.764538,-0.31429,0.558748,-0.094058,1.11513,-0.792648,-0.012089,0.991612,-0.123464,-0.154549,-0.083608,-0.058307,1.622784,-1.257509,0.238553,1.446626,0


# TRAINING AND TESTING

## training and testing using one hot encode

In [95]:
X = onehot_data_model.drop('bad_flag', axis=1)
y = onehot_data_model['bad_flag']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### balance model

In [96]:
pd.Series(y_train).value_counts()

0    332250
1     40778
Name: bad_flag, dtype: int64

#### undersampling

In [97]:
from imblearn.under_sampling import RandomUnderSampler
undersample = RandomUnderSampler(sampling_strategy='majority')
X_under, y_under = undersample.fit_resample(X_train, y_train)

In [98]:
pd.Series(y_under).value_counts()

0    40778
1    40778
Name: bad_flag, dtype: int64

#### oversampling

In [99]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X_over, y_over = oversample.fit_resample(X_train, y_train)

### logistic reggression

In [100]:
logreg_over = LogisticRegression(solver='liblinear', random_state=0)
logreg_under = LogisticRegression(solver='liblinear', random_state=0)

In [101]:
logreg_over.fit(X_over, y_over)
logreg_under.fit(X_under, y_under)

LogisticRegression(random_state=0, solver='liblinear')

In [102]:
y_pred_testOver = logreg_over.predict(X_test)
y_pred_testUnder = logreg_under.predict(X_test)

####training model

##### performance model using oversampling data

In [103]:
from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred_testOver ))

              precision    recall  f1-score   support

           0       0.93      0.52      0.67     83067
           1       0.15      0.70      0.25     10190

    accuracy                           0.54     93257
   macro avg       0.54      0.61      0.46     93257
weighted avg       0.85      0.54      0.62     93257



##### performance model using undersampling data

In [105]:
from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred_testUnder ))

              precision    recall  f1-score   support

           0       0.93      0.59      0.72     83067
           1       0.16      0.65      0.26     10190

    accuracy                           0.59     93257
   macro avg       0.55      0.62      0.49     93257
weighted avg       0.85      0.59      0.67     93257



In [106]:
from sklearn.metrics import accuracy_score

print('Model accuracy using oversampling score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_testOver)))
print('Model accuracy using undersampling score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_testUnder)))

Model accuracy using oversampling score: 0.5425
Model accuracy using undersampling score: 0.5947


In [107]:
# print the scores on training and test set

print('Training set Oversampling score: {:.4f}'.format(logreg_over.score(X_over, y_over)))
print('Test set score: {:.4f}'.format(logreg_over.score(X_test, y_test)))
print("====================================")
print('Training set Undersampling score: {:.4f}'.format(logreg_under.score(X_under, y_under)))
print('Test set score: {:.4f}'.format(logreg_under.score(X_test, y_test)))

Training set Oversampling score: 0.6058
Test set score: 0.5425
Training set Undersampling score: 0.6100
Test set score: 0.5947


##training and testing using advance data

In [132]:
advance_data.fillna(0, inplace=True)

In [133]:
x = advance_data.drop(['mean_grade','mean_verification_status','mean_purpose',
                      'mean_initial_list_status','median_grade','median_verification_status', 
                      'median_purpose','median_initial_list_status','amax_grade','amax_verification_status', 
                      'amax_purpose','amax_initial_list_status','amin_grade','amin_verification_status','amin_purpose', 
                      'amin_initial_list_status','bad_flag'], axis=1)
y = advance_data['bad_flag']

x_train, x_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [134]:
pd.Series(Y_train).value_counts()

0    332250
1     40778
Name: bad_flag, dtype: int64

In [135]:
undersample = RandomUnderSampler(sampling_strategy='majority')
x_under, Y_under = undersample.fit_resample(x_train, Y_train)

In [136]:
oversample = SMOTE()
x_over, Y_over = oversample.fit_resample(x_train, Y_train)

In [137]:
logreg_over.fit(x_over, Y_over)
logreg_under.fit(x_under, Y_under)

LogisticRegression(random_state=0, solver='liblinear')

In [138]:
Y_pred_testOver = logreg_over.predict(x_test)
Y_pred_testUnder = logreg_under.predict(x_test)

#### training

##### oversampling

In [140]:
print(classification_report(Y_test,Y_pred_testOver ))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95     83067
           1       0.59      0.56      0.57     10190

    accuracy                           0.91     93257
   macro avg       0.77      0.76      0.76     93257
weighted avg       0.91      0.91      0.91     93257



#####undersampling

In [141]:
print(classification_report(Y_test,Y_pred_testUnder ))

              precision    recall  f1-score   support

           0       0.95      0.94      0.95     83067
           1       0.56      0.56      0.56     10190

    accuracy                           0.90     93257
   macro avg       0.75      0.75      0.75     93257
weighted avg       0.90      0.90      0.90     93257



In [142]:
# print the scores on training and test set

print('Training set Oversampling score: {:.4f}'.format(logreg_over.score(x_over, Y_over)))
print('Test set score: {:.4f}'.format(logreg_over.score(x_test, Y_test)))
print("====================================")
print('Training set Undersampling score: {:.4f}'.format(logreg_under.score(x_under, Y_under)))
print('Test set score: {:.4f}'.format(logreg_under.score(x_test, Y_test)))

Training set Oversampling score: 0.7676
Test set score: 0.9088
Training set Undersampling score: 0.7513
Test set score: 0.9031
