In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Importing Data**

In [None]:
import pandas as pd
import numpy as np
import math
 
# To display all the columns of dataframe
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("default")

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv("/content/drive/MyDrive/Technocolabs/Bondora_Final.csv")

In [None]:
df

Unnamed: 0,BidsPortfolioManager,BidsApi,BidsManual,LanguageCode,Age,Interest,LoanDuration,Education,MaritalStatus,EmploymentDurationCurrentEmployer,HomeOwnershipType,IncomeTotal,ExistingLiabilities,LiabilitiesTotal,DebtToIncome,FreeCash,MonthlyPaymentDay,Rating,PreviousRepaymentsBeforeLoan,Amount,RepaymentYears,ROI,EMI,Defaulted
0,0,0,115.0410,Estonian,61,0.3000,12,Vocational Education,Married,UpTo3Years,Owner,10500.0,0,0.00,0.00,0.00,25,F,0.0000,115.0408,0.416667,23.076923,11.214997,0
1,0,0,140.6057,Estonian,48,0.2500,1,Higher Education,Divorced,MoreThan5Years,Owner,10800.0,0,0.00,0.00,0.00,15,F,258.6256,140.6057,0.166667,20.000000,143.534985,0
2,0,0,319.5580,Estonian,58,0.2500,20,Secondary Education,Married,UpTo4Years,Owner,7000.0,0,0.00,0.00,0.00,25,F,0.0000,319.5409,0.500000,20.000000,19.699580,1
3,0,0,57.5205,Estonian,23,0.4500,15,Basic Education,Single,UpTo2Years,Owner,11600.0,0,0.00,0.00,0.00,15,F,0.0000,57.5205,0.500000,31.034483,5.083429,0
4,0,0,319.5582,Estonian,25,0.3000,12,Secondary Education,Cohabitant,UpTo2Years,Owner,6800.0,0,0.00,0.00,0.00,25,F,0.0000,319.5436,0.416667,23.076923,31.151388,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73899,2515,0,485.0000,Finnish,37,0.3101,60,Vocational Education,Single,MoreThan5Years,Council House,1400.0,1,500.00,7.69,792.32,1,E,0.0000,3000.0000,0.583333,23.669949,98.929540,0
73900,1880,0,1120.0000,Finnish,35,0.1825,60,Vocational Education,Single,MoreThan5Years,Owner,2265.0,5,1777.46,31.58,520.67,7,B,389.3600,3000.0000,0.583333,15.433404,76.588868,0
73901,1975,0,525.0000,Finnish,40,0.2483,60,Vocational Education,Married,MoreThan5Years,Owner,2500.0,2,1350.00,3.22,1069.58,1,D,0.0000,2500.0000,0.583333,19.891052,73.129355,1
73902,1840,0,1160.0000,Estonian,47,0.1774,60,Higher Education,Married,MoreThan5Years,"Tenant, Unfurnished Property",540.0,8,732.05,44.35,50.49,1,B,96.8000,3000.0000,0.583333,15.067097,75.756600,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73904 entries, 0 to 73903
Data columns (total 24 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   BidsPortfolioManager               73904 non-null  int64  
 1   BidsApi                            73904 non-null  int64  
 2   BidsManual                         73904 non-null  float64
 3   LanguageCode                       73904 non-null  object 
 4   Age                                73904 non-null  int64  
 5   Interest                           73904 non-null  float64
 6   LoanDuration                       73904 non-null  int64  
 7   Education                          73904 non-null  object 
 8   MaritalStatus                      73904 non-null  object 
 9   EmploymentDurationCurrentEmployer  73904 non-null  object 
 10  HomeOwnershipType                  73904 non-null  object 
 11  IncomeTotal                        73904 non-null  flo

**Re-Converting Categorical Variables**

In [None]:
# Columns that hold string-valued, categorical loan attributes.
cat_cols = [
    'LanguageCode',
    'Education',
    'MaritalStatus',
    'EmploymentDurationCurrentEmployer',
    'HomeOwnershipType',
    'Rating',
]

# Cast all of them to pandas' category dtype in a single astype call.
df = df.astype({name: "category" for name in cat_cols})

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73904 entries, 0 to 73903
Data columns (total 24 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   BidsPortfolioManager               73904 non-null  int64   
 1   BidsApi                            73904 non-null  int64   
 2   BidsManual                         73904 non-null  float64 
 3   LanguageCode                       73904 non-null  category
 4   Age                                73904 non-null  int64   
 5   Interest                           73904 non-null  float64 
 6   LoanDuration                       73904 non-null  int64   
 7   Education                          73904 non-null  category
 8   MaritalStatus                      73904 non-null  category
 9   EmploymentDurationCurrentEmployer  73904 non-null  category
 10  HomeOwnershipType                  73904 non-null  category
 11  IncomeTotal                        73904 

In [None]:
df.select_dtypes("float").columns

Index(['BidsManual', 'Interest', 'IncomeTotal', 'LiabilitiesTotal',
       'DebtToIncome', 'FreeCash', 'PreviousRepaymentsBeforeLoan', 'Amount',
       'RepaymentYears', 'ROI', 'EMI'],
      dtype='object')

In [None]:
df["LanguageCode"].value_counts()

Estonian     31712
Finnish      19670
Spanish      15614
Russian       6254
English        577
Slovakian       72
German           5
Name: LanguageCode, dtype: int64

In [None]:
df["Education"].value_counts()

Secondary Education     27177
Higher Education        20081
Vocational Education    16556
Primary Education        5193
Basic Education          4894
Not Set                     3
Name: Education, dtype: int64

In [None]:
df["MaritalStatus"].value_counts()

Not Specified    44222
Single           10366
Married           8499
Cohabitant        7648
Divorced          2728
Widow              441
Name: MaritalStatus, dtype: int64

In [None]:
df["EmploymentDurationCurrentEmployer"].value_counts()

MoreThan5Years    28712
UpTo5Years        13609
UpTo1Year         13588
UpTo2Years         5064
UpTo3Years         4174
Retiree            3344
UpTo4Years         2785
Other              2027
TrialPeriod         601
Name: EmploymentDurationCurrentEmployer, dtype: int64

In [None]:
df["HomeOwnershipType"].value_counts()

Owner                             25354
Tenant, Pre-Furnished Property    16144
Living with Parents               12076
Mortgage                           8414
Tenant, Unfurnished Property       3741
Other                              3179
Joint Ownership                    2433
Joint Tenant                       1253
Council House                       823
Owner with Encumbrance              485
Not Specified                         2
Name: HomeOwnershipType, dtype: int64

In [None]:
df["Rating"].value_counts()

F     18001
E     11978
HR    11639
D     11262
C      9996
B      6683
A      2732
AA     1613
Name: Rating, dtype: int64

In [None]:
df["LoanDuration"].value_counts()

60    32928
36    23620
48     5765
24     3469
12     3245
18     1837
6       920
9       811
3       612
30      333
1       174
2        42
5        32
4        27
10       20
20       15
8        13
15       12
7         9
16        7
22        2
14        2
11        1
19        1
21        1
13        1
17        1
42        1
27        1
52        1
38        1
Name: LoanDuration, dtype: int64

# **Removing Outliers**

In [None]:
# Per-column overview: dtype, cardinality and missing-value statistics.
null_counts = df.isnull().sum()

dtypes = pd.DataFrame({
    "Data Type": df.dtypes,
    "Unique Values": df.nunique(),
    "Null Values": null_counts,
    # Despite the label this is a fraction of rows (0-1), not a percentage.
    "% null Values": null_counts / len(df),
})

# Highest-cardinality columns first, shaded column-wise by magnitude.
dtypes.sort_values(by="Unique Values", ascending=False).style.background_gradient(cmap='YlOrRd', axis=0)

Unnamed: 0,Data Type,Unique Values,Null Values,% null Values
EMI,float64,41328,0,0.0
LiabilitiesTotal,float64,28657,0,0.0
PreviousRepaymentsBeforeLoan,float64,26359,0,0.0
FreeCash,float64,20899,0,0.0
ROI,float64,8022,0,0.0
DebtToIncome,float64,6668,0,0.0
Interest,float64,5981,0,0.0
BidsPortfolioManager,int64,5241,0,0.0
Amount,float64,4705,0,0.0
BidsManual,float64,3781,0,0.0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73904 entries, 0 to 73903
Data columns (total 24 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   BidsPortfolioManager               73904 non-null  int64   
 1   BidsApi                            73904 non-null  int64   
 2   BidsManual                         73904 non-null  float64 
 3   LanguageCode                       73904 non-null  category
 4   Age                                73904 non-null  int64   
 5   Interest                           73904 non-null  float64 
 6   LoanDuration                       73904 non-null  int64   
 7   Education                          73904 non-null  category
 8   MaritalStatus                      73904 non-null  category
 9   EmploymentDurationCurrentEmployer  73904 non-null  category
 10  HomeOwnershipType                  73904 non-null  category
 11  IncomeTotal                        73904 

In [None]:
df.describe()

Unnamed: 0,BidsPortfolioManager,BidsApi,BidsManual,Age,Interest,LoanDuration,IncomeTotal,ExistingLiabilities,LiabilitiesTotal,DebtToIncome,FreeCash,MonthlyPaymentDay,PreviousRepaymentsBeforeLoan,Amount,RepaymentYears,ROI,EMI,Defaulted
count,73904.0,73904.0,73904.0,73904.0,73904.0,73904.0,73904.0,73904.0,73904.0,73904.0,73904.0,73904.0,73904.0,73904.0,73904.0,73904.0,73904.0,73904.0
mean,1162.355312,34.313853,540.308158,39.688407,0.408538,44.441762,1894.096,3.235914,715.5684,11.457649,177.403072,11.326126,894.152579,2423.297085,0.527727,26.642468,112.625924,0.587302
std,1467.016327,167.215926,784.486776,12.214057,0.312053,16.433666,6330.991,3.407826,45631.97,18.855534,858.731279,7.357694,1828.419069,2138.611968,0.061944,11.350549,102.812285,0.492323
min,0.0,0.0,0.0,0.0,0.03,1.0,0.0,0.0,0.0,0.0,-2332.0,0.0,0.0,12.7789,0.166667,2.912621,0.756816,0.0
25%,240.0,0.0,26.0,30.0,0.2323,36.0,900.0,1.0,100.0,0.0,0.0,5.0,0.0,700.0,0.5,18.850929,40.767093,0.0
50%,577.0,0.0,280.0,38.0,0.3246,48.0,1300.0,2.0,375.0,0.0,0.0,10.0,450.895,2020.0,0.5,24.505511,91.219153,1.0
75%,1565.0,6.0,711.0,48.0,0.5201,60.0,2000.0,5.0,725.9,18.92,210.11,16.0,861.138387,3185.0,0.583333,34.214854,145.139532,1.0
max,10625.0,7570.0,10630.0,77.0,2.6431,60.0,1012019.0,36.0,12400000.0,198.02,158748.64,28.0,34077.42,10632.0,0.583333,72.55085,3501.371672,1.0


In [None]:
# Names of every int64/float64 column, in the DataFrame's column order.
Numeric_Cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
print(len(Numeric_Cols))

Numeric_Cols

18


['BidsPortfolioManager',
 'BidsApi',
 'BidsManual',
 'Age',
 'Interest',
 'LoanDuration',
 'IncomeTotal',
 'ExistingLiabilities',
 'LiabilitiesTotal',
 'DebtToIncome',
 'FreeCash',
 'MonthlyPaymentDay',
 'PreviousRepaymentsBeforeLoan',
 'Amount',
 'RepaymentYears',
 'ROI',
 'EMI',
 'Defaulted']

**Removing Outliers with Percentile**

In [None]:
X = df.copy()

# Trim each numeric column, one after another, to the band between its
# 0.1th and 99th percentile. The cut-offs for a column are computed on the
# rows that survived the filtering of the previous columns.
for column in Numeric_Cols:
    lower, upper = X[column].quantile([0.001, 0.99]).tolist()
    X = X.loc[X[column].between(lower, upper)]

X.describe()

Unnamed: 0,BidsPortfolioManager,BidsApi,BidsManual,Age,Interest,LoanDuration,IncomeTotal,ExistingLiabilities,LiabilitiesTotal,DebtToIncome,FreeCash,MonthlyPaymentDay,PreviousRepaymentsBeforeLoan,Amount,RepaymentYears,ROI,EMI,Defaulted
count,64477.0,64477.0,64477.0,64477.0,64477.0,64477.0,64477.0,64477.0,64477.0,64477.0,64477.0,64477.0,64477.0,64477.0,64477.0,64477.0,64477.0,64477.0
mean,1055.10168,20.2772,506.962213,39.447353,0.380772,44.928285,1566.405013,3.013958,459.263193,10.246107,154.948703,11.275835,785.701407,2265.20747,0.529644,26.02273,102.176199,0.586814
std,1194.298204,66.786132,624.43093,11.953203,0.219553,16.182228,1320.467876,2.867229,448.944437,17.290782,293.333065,7.253064,1267.434941,1833.552711,0.059226,10.023692,78.839161,0.492409
min,0.0,0.0,0.0,18.0,0.0858,1.0,200.0,0.0,0.0,0.0,0.0,1.0,0.0,38.35,0.166667,7.902008,6.230519,0.0
25%,240.0,0.0,40.0,30.0,0.2308,36.0,860.0,1.0,100.0,0.0,0.0,5.0,0.0,740.0,0.5,18.752031,38.805432,0.0
50%,562.0,0.0,290.0,38.0,0.3222,48.0,1250.0,2.0,355.1,0.0,0.0,10.0,529.18,2020.0,0.5,24.368477,87.978006,1.0
75%,1492.0,5.0,713.0,48.0,0.5056,60.0,1900.0,4.0,660.79,16.72,204.96,15.0,861.138387,3185.0,0.583333,33.581296,137.520876,1.0
max,7365.0,670.0,3575.0,68.0,1.5445,60.0,14000.0,15.0,2480.46,68.34,1501.34,27.0,9650.69,10630.0,0.583333,60.699548,498.177132,1.0


In [None]:
X

Unnamed: 0,BidsPortfolioManager,BidsApi,BidsManual,LanguageCode,Age,Interest,LoanDuration,Education,MaritalStatus,EmploymentDurationCurrentEmployer,HomeOwnershipType,IncomeTotal,ExistingLiabilities,LiabilitiesTotal,DebtToIncome,FreeCash,MonthlyPaymentDay,Rating,PreviousRepaymentsBeforeLoan,Amount,RepaymentYears,ROI,EMI,Defaulted
0,0,0,115.0410,Estonian,61,0.3000,12,Vocational Education,Married,UpTo3Years,Owner,10500.0,0,0.00,0.00,0.00,25,F,0.0000,115.0408,0.416667,23.076923,11.214997,0
1,0,0,140.6057,Estonian,48,0.2500,1,Higher Education,Divorced,MoreThan5Years,Owner,10800.0,0,0.00,0.00,0.00,15,F,258.6256,140.6057,0.166667,20.000000,143.534985,0
2,0,0,319.5580,Estonian,58,0.2500,20,Secondary Education,Married,UpTo4Years,Owner,7000.0,0,0.00,0.00,0.00,25,F,0.0000,319.5409,0.500000,20.000000,19.699580,1
4,0,0,319.5582,Estonian,25,0.3000,12,Secondary Education,Cohabitant,UpTo2Years,Owner,6800.0,0,0.00,0.00,0.00,25,F,0.0000,319.5436,0.416667,23.076923,31.151388,0
5,0,0,300.3845,Estonian,22,0.3000,24,Secondary Education,Single,UpTo2Years,Owner,9500.0,0,0.00,0.00,0.00,9,F,0.0000,300.4314,0.500000,23.076923,16.797967,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73899,2515,0,485.0000,Finnish,37,0.3101,60,Vocational Education,Single,MoreThan5Years,Council House,1400.0,1,500.00,7.69,792.32,1,E,0.0000,3000.0000,0.583333,23.669949,98.929540,0
73900,1880,0,1120.0000,Finnish,35,0.1825,60,Vocational Education,Single,MoreThan5Years,Owner,2265.0,5,1777.46,31.58,520.67,7,B,389.3600,3000.0000,0.583333,15.433404,76.588868,0
73901,1975,0,525.0000,Finnish,40,0.2483,60,Vocational Education,Married,MoreThan5Years,Owner,2500.0,2,1350.00,3.22,1069.58,1,D,0.0000,2500.0000,0.583333,19.891052,73.129355,1
73902,1840,0,1160.0000,Estonian,47,0.1774,60,Higher Education,Married,MoreThan5Years,"Tenant, Unfurnished Property",540.0,8,732.05,44.35,50.49,1,B,96.8000,3000.0000,0.583333,15.067097,75.756600,0


In [None]:
df = X
df

Unnamed: 0,BidsPortfolioManager,BidsApi,BidsManual,LanguageCode,Age,Interest,LoanDuration,Education,MaritalStatus,EmploymentDurationCurrentEmployer,HomeOwnershipType,IncomeTotal,ExistingLiabilities,LiabilitiesTotal,DebtToIncome,FreeCash,MonthlyPaymentDay,Rating,PreviousRepaymentsBeforeLoan,Amount,RepaymentYears,ROI,EMI,Defaulted
0,0,0,115.0410,Estonian,61,0.3000,12,Vocational Education,Married,UpTo3Years,Owner,10500.0,0,0.00,0.00,0.00,25,F,0.0000,115.0408,0.416667,23.076923,11.214997,0
1,0,0,140.6057,Estonian,48,0.2500,1,Higher Education,Divorced,MoreThan5Years,Owner,10800.0,0,0.00,0.00,0.00,15,F,258.6256,140.6057,0.166667,20.000000,143.534985,0
2,0,0,319.5580,Estonian,58,0.2500,20,Secondary Education,Married,UpTo4Years,Owner,7000.0,0,0.00,0.00,0.00,25,F,0.0000,319.5409,0.500000,20.000000,19.699580,1
4,0,0,319.5582,Estonian,25,0.3000,12,Secondary Education,Cohabitant,UpTo2Years,Owner,6800.0,0,0.00,0.00,0.00,25,F,0.0000,319.5436,0.416667,23.076923,31.151388,0
5,0,0,300.3845,Estonian,22,0.3000,24,Secondary Education,Single,UpTo2Years,Owner,9500.0,0,0.00,0.00,0.00,9,F,0.0000,300.4314,0.500000,23.076923,16.797967,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73899,2515,0,485.0000,Finnish,37,0.3101,60,Vocational Education,Single,MoreThan5Years,Council House,1400.0,1,500.00,7.69,792.32,1,E,0.0000,3000.0000,0.583333,23.669949,98.929540,0
73900,1880,0,1120.0000,Finnish,35,0.1825,60,Vocational Education,Single,MoreThan5Years,Owner,2265.0,5,1777.46,31.58,520.67,7,B,389.3600,3000.0000,0.583333,15.433404,76.588868,0
73901,1975,0,525.0000,Finnish,40,0.2483,60,Vocational Education,Married,MoreThan5Years,Owner,2500.0,2,1350.00,3.22,1069.58,1,D,0.0000,2500.0000,0.583333,19.891052,73.129355,1
73902,1840,0,1160.0000,Estonian,47,0.1774,60,Higher Education,Married,MoreThan5Years,"Tenant, Unfurnished Property",540.0,8,732.05,44.35,50.49,1,B,96.8000,3000.0000,0.583333,15.067097,75.756600,0




---



# **Encoding Categorical Features with Label Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label encoding for categoricals.
# factorize() numbers the values of a column in order of first appearance, so
# the integer codes only have meaning together with the `uniques` it returns.
# That mapping is kept (column name -> uniques, positioned by code) instead of
# being thrown away, so the same codes can be reproduced for new data later.
df = df.copy()  # df is a filtered slice of the loaded frame; encode an independent copy

Category_Maps = {}
for colname in df.select_dtypes(["object","category","bool"]):
    codes, uniques = df[colname].factorize()
    df[colname] = codes
    Category_Maps[colname] = uniques

df

Unnamed: 0,BidsPortfolioManager,BidsApi,BidsManual,LanguageCode,Age,Interest,LoanDuration,Education,MaritalStatus,EmploymentDurationCurrentEmployer,HomeOwnershipType,IncomeTotal,ExistingLiabilities,LiabilitiesTotal,DebtToIncome,FreeCash,MonthlyPaymentDay,Rating,PreviousRepaymentsBeforeLoan,Amount,RepaymentYears,ROI,EMI,Defaulted
0,0,0,115.0410,0,61,0.3000,12,0,0,0,0,10500.0,0,0.00,0.00,0.00,25,0,0.0000,115.0408,0.416667,23.076923,11.214997,0
1,0,0,140.6057,0,48,0.2500,1,1,1,1,0,10800.0,0,0.00,0.00,0.00,15,0,258.6256,140.6057,0.166667,20.000000,143.534985,0
2,0,0,319.5580,0,58,0.2500,20,2,0,2,0,7000.0,0,0.00,0.00,0.00,25,0,0.0000,319.5409,0.500000,20.000000,19.699580,1
4,0,0,319.5582,0,25,0.3000,12,2,2,3,0,6800.0,0,0.00,0.00,0.00,25,0,0.0000,319.5436,0.416667,23.076923,31.151388,0
5,0,0,300.3845,0,22,0.3000,24,2,3,3,0,9500.0,0,0.00,0.00,0.00,9,0,0.0000,300.4314,0.500000,23.076923,16.797967,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73899,2515,0,485.0000,2,37,0.3101,60,0,3,1,4,1400.0,1,500.00,7.69,792.32,1,4,0.0000,3000.0000,0.583333,23.669949,98.929540,0
73900,1880,0,1120.0000,2,35,0.1825,60,0,3,1,0,2265.0,5,1777.46,31.58,520.67,7,6,389.3600,3000.0000,0.583333,15.433404,76.588868,0
73901,1975,0,525.0000,2,40,0.2483,60,0,0,1,0,2500.0,2,1350.00,3.22,1069.58,1,3,0.0000,2500.0000,0.583333,19.891052,73.129355,1
73902,1840,0,1160.0000,0,47,0.1774,60,1,0,1,1,540.0,8,732.05,44.35,50.49,1,6,96.8000,3000.0000,0.583333,15.067097,75.756600,0


In [None]:
df.dtypes

BidsPortfolioManager                   int64
BidsApi                                int64
BidsManual                           float64
LanguageCode                           int64
Age                                    int64
Interest                             float64
LoanDuration                           int64
Education                              int64
MaritalStatus                          int64
EmploymentDurationCurrentEmployer      int64
HomeOwnershipType                      int64
IncomeTotal                          float64
ExistingLiabilities                    int64
LiabilitiesTotal                     float64
DebtToIncome                         float64
FreeCash                             float64
MonthlyPaymentDay                      int64
Rating                                 int64
PreviousRepaymentsBeforeLoan         float64
Amount                               float64
RepaymentYears                       float64
ROI                                  float64
EMI       



---



# **Classification Pipelines**

In [None]:
from sklearn.preprocessing import StandardScaler,PowerTransformer

# Shared feature-preprocessing chain: a StandardScaler step followed by a
# PowerTransformer step, both constructed with their default arguments.
preprocessing_steps = [
    ("Scaler", StandardScaler()),
    ("Transformer", PowerTransformer()),
]
Preprocessing_Pipeline = Pipeline(steps=preprocessing_steps)

In [None]:
# Features for the default classifier: remove the three regression targets,
# then split the "Defaulted" label off the feature frame.
X = df.drop(columns=["RepaymentYears", "ROI", "EMI"]).copy()
y = X.pop("Defaulted")

# Stratified split with a fixed seed so the partition (and every score derived
# from it) is reproducible on a fresh run, like the seeded regression splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

In [None]:
X_train = Preprocessing_Pipeline.fit_transform(X_train)

pd.DataFrame(X_train, columns =X.columns)

Unnamed: 0,BidsPortfolioManager,BidsApi,BidsManual,LanguageCode,Age,Interest,LoanDuration,Education,MaritalStatus,EmploymentDurationCurrentEmployer,HomeOwnershipType,IncomeTotal,ExistingLiabilities,LiabilitiesTotal,DebtToIncome,FreeCash,MonthlyPaymentDay,Rating,PreviousRepaymentsBeforeLoan,Amount
0,-1.316387,-0.600245,1.869170,-1.047156,0.709448,0.349462,1.014835,0.524040,0.764476,0.994578,-1.195958,0.851771,-1.431772,-1.371679,-0.707968,-0.665713,-0.077332,0.822795,0.662499,0.814884
1,-0.467955,-0.600245,-0.052783,0.681916,1.379589,-0.087241,1.014835,0.524040,0.764476,0.994578,1.123351,1.929676,0.934356,1.392578,-0.707968,-0.665713,-0.698172,0.822795,0.075495,-0.170517
2,-0.695417,0.685693,0.412321,-0.093734,-0.884037,0.002125,1.014835,0.524040,0.764476,-0.942409,-1.195958,-0.107312,-0.730425,-0.970301,-0.707968,-0.665713,0.338451,0.822795,-0.731237,-0.670371
3,-0.280824,-0.600245,-1.208270,-1.047156,-1.544587,0.191674,1.014835,0.524040,-1.282370,0.177459,-0.522562,-1.837594,0.313306,0.185794,1.495865,0.158578,-0.863364,0.822795,-1.120984,-1.260290
4,-0.403325,-0.600245,0.182249,0.681916,-1.207680,1.136471,1.014835,-0.375324,0.764476,1.652384,0.481263,0.462140,-1.431772,-1.371679,-0.707968,-0.665713,-1.032099,-1.324639,0.662499,0.241755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48352,-0.686818,-0.420308,-0.354221,1.333559,-0.005801,1.530703,1.014835,0.524040,0.764476,-0.942409,1.996517,0.339237,-0.147845,-0.750823,-0.707968,-0.665713,-1.204201,-0.118285,0.241794,-1.260290
48353,-0.462011,-0.600245,0.190420,0.681916,-1.098274,-0.413172,1.014835,-1.432991,0.764476,0.994578,-1.195958,0.674275,0.313306,0.153462,-0.707968,-0.665713,-0.698172,0.383355,0.662499,1.455697
48354,-1.224821,-0.600245,0.238486,1.333559,-0.095570,0.860153,-0.736109,-0.375324,0.764476,0.994578,0.481263,-0.473135,-1.431772,-1.371679,-0.707968,-0.665713,-0.379249,-1.324639,0.662499,0.241755
48355,-1.265256,-0.420308,0.033273,-1.047156,-0.377363,-0.929885,-0.736109,-0.375324,0.764476,0.177459,0.039233,-2.249628,0.934356,-0.575473,-0.707968,-0.665713,1.917613,-0.692073,-0.615889,-1.260290


In [None]:
X_test = Preprocessing_Pipeline.transform(X_test)

pd.DataFrame(X_test, columns =X.columns)

Unnamed: 0,BidsPortfolioManager,BidsApi,BidsManual,LanguageCode,Age,Interest,LoanDuration,Education,MaritalStatus,EmploymentDurationCurrentEmployer,HomeOwnershipType,IncomeTotal,ExistingLiabilities,LiabilitiesTotal,DebtToIncome,FreeCash,MonthlyPaymentDay,Rating,PreviousRepaymentsBeforeLoan,Amount
0,-0.323447,-0.600245,-1.099447,-1.047156,-1.544587,-1.681474,-0.736109,0.524040,0.764476,0.614225,0.039233,-1.978241,1.355163,-0.425277,-0.707968,-0.665713,1.179487,1.222118,0.881885,-1.260290
1,2.136419,-0.600245,0.176776,0.681916,1.624971,-0.567549,1.014835,0.524040,-1.282370,0.614225,1.604005,0.615789,1.937945,2.325855,1.785143,0.584413,-1.032099,0.383355,2.189821,1.862932
2,-0.583497,-0.600245,1.595674,-1.047156,-1.318559,0.133337,-1.474568,1.287714,-0.982604,-1.608633,0.039233,-1.538324,0.660683,-0.657686,1.685115,1.418131,-0.077332,-0.118285,-1.120984,0.153791
3,-1.098707,0.884955,0.184978,1.333559,-0.779296,2.592052,-0.736109,0.524040,0.764476,1.652384,0.039233,-0.107312,-1.431772,-1.371679,-0.707968,-0.665713,-1.032099,-0.118285,0.254783,-1.260290
4,0.331902,0.685693,-0.650286,-1.047156,-1.430873,-1.838390,1.014835,0.524040,0.764476,0.994578,0.481263,-0.677394,1.355163,0.348771,-0.707968,-0.665713,-0.226014,1.222118,0.183627,-0.670371
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16115,-0.280824,-0.600245,-1.208270,-1.047156,0.249782,-1.127619,-0.736109,1.287714,-0.982604,-0.942409,-1.195958,-1.386640,0.660683,-0.103749,1.093096,1.337675,-0.077332,-0.692073,0.857646,-1.260290
16116,-0.485868,-0.600245,0.428124,-1.047156,-0.005801,-1.368838,1.014835,0.524040,0.764476,0.614225,0.481263,0.203589,0.934356,-0.635390,-0.707968,-0.665713,-0.379249,1.592862,-0.079704,1.028284
16117,-1.069643,-0.250699,-0.736864,-1.047156,0.409544,-1.647303,-0.736109,0.524040,0.764476,0.614225,-1.195958,-1.378935,0.660683,-0.433850,-0.707968,-0.665713,0.835550,1.592862,0.662499,-1.260290
16118,-0.738785,-0.600245,0.461193,1.333559,1.743896,0.995052,-0.021519,-0.375324,0.764476,-0.942409,0.039233,-0.376397,0.660683,0.266737,-0.707968,-0.665713,-1.379514,-1.324639,0.662499,0.989124


In [None]:
RF_Classifier = RandomForestClassifier(random_state=0)

In [None]:
RF_Classifier.fit(X_train, y_train)

RF_Classifier.score(X_train, y_train)

1.0

In [None]:
RF_Classifier.score(X_test, y_test)

0.7351116625310173

In [None]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, RF_Classifier.predict(X_test)))

0.7351116625310173


In [None]:
RF_Classifier.predict(X_test)

array([1, 1, 0, ..., 1, 1, 1])

In [None]:
Main_Classification_Pipeline = Pipeline([
        ("Scaler", StandardScaler() ),
        ("Transformer", PowerTransformer() ),
        ("Classifier", RF_Classifier )
])

# **Regression Pipelines**

In [None]:
from sklearn.ensemble import AdaBoostRegressor

from sklearn.metrics import mean_squared_error,r2_score

### **Predicting Repayment Years**

In [None]:
# Features for the RepaymentYears regressor: drop the other three targets,
# then split the RepaymentYears column off as the label.
X = df.drop(columns=["EMI","ROI","Defaulted"]).copy()
y = X.pop("RepaymentYears")


# NOTE(review): stratify=y is applied to a regression target; presumably this
# only works because RepaymentYears takes a small number of distinct values — confirm.
X_train, X_test, y_train, y_test = train_test_split( X , y, stratify=y, random_state=0 )

# Re-fits the shared Preprocessing_Pipeline on this split's training rows and
# replaces X_train with the transformed output (re-running this cell alone
# would therefore transform already-transformed data).
X_train = Preprocessing_Pipeline.fit_transform(X_train)

In [None]:
pd.DataFrame(X_train, columns =X.columns)

Unnamed: 0,BidsPortfolioManager,BidsApi,BidsManual,LanguageCode,Age,Interest,LoanDuration,Education,MaritalStatus,EmploymentDurationCurrentEmployer,HomeOwnershipType,IncomeTotal,ExistingLiabilities,LiabilitiesTotal,DebtToIncome,FreeCash,MonthlyPaymentDay,Rating,PreviousRepaymentsBeforeLoan,Amount
0,0.914897,2.308582,-1.206671,-0.096439,0.782828,-0.183967,-0.739233,-1.432588,-0.977484,-0.941868,-1.195306,-0.280612,-0.730468,-0.059278,0.680691,1.749129,0.063418,0.388729,-1.122341,0.235500
1,-1.257456,0.773562,-0.008595,1.331608,0.412229,1.575881,-0.024960,-0.374459,0.766901,-0.941868,1.126661,0.977087,-0.145491,0.534271,-0.710544,-0.667953,1.815525,-0.113645,0.183085,-1.263408
2,-0.792102,2.159483,0.169029,0.679344,1.800898,1.019805,-0.024960,-0.374459,0.766901,-0.941868,-1.195306,1.537018,-0.730468,-0.945288,-0.710544,-0.667953,-1.204882,-0.113645,0.661991,0.235500
3,-0.549259,0.190770,-0.628904,-1.048780,0.782828,-0.174218,-0.739233,0.527018,0.766901,0.613300,-1.195306,-1.921278,-1.435218,-1.376408,-0.710544,-0.667953,-0.228755,0.388729,-1.122341,-1.263408
4,-0.553346,-0.600015,-0.278351,0.679344,0.638780,1.140038,1.013045,-1.432588,0.766901,-0.941868,-1.195306,1.005653,-1.435218,-1.376408,-0.710544,-0.667953,-0.700102,-1.324582,0.661991,1.636884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48352,-0.039441,-0.600015,0.538426,-1.048780,-0.373648,-1.668970,-0.739233,0.527018,0.766901,-0.941868,1.126661,1.532247,1.819983,1.298041,-0.710544,-0.667953,0.713127,1.597913,0.601944,-0.176655
48353,1.057888,-0.600015,-0.310176,1.331608,0.922135,1.061428,1.013045,-0.374459,-1.608550,-0.343012,-0.519495,-0.632871,-0.145491,0.152456,1.201314,1.580210,1.712961,-0.113645,-1.122341,0.147485
48354,1.422446,2.082284,-1.206671,-1.048780,0.333446,-0.213455,-0.024960,1.294330,-0.977484,0.177078,-1.195306,-0.435334,1.944956,1.663844,1.788515,-0.020602,0.201838,0.388729,-1.122341,0.555017
48355,-0.615397,2.125882,1.046175,1.331608,-1.204081,1.210154,-0.739233,0.527018,0.766901,0.613300,0.043465,-0.470579,-0.730468,-0.157402,-0.710544,-0.667953,-0.381773,-1.324582,0.661991,0.235500


In [None]:
Ada_Repay = AdaBoostRegressor(random_state=0)

Ada_Repay.fit(X_train,y_train)
Ada_Repay.score(X_train,y_train)

0.887467838353813

In [None]:
X_test = Preprocessing_Pipeline.transform(X_test)

pd.DataFrame(X_test, columns =X.columns)

Unnamed: 0,BidsPortfolioManager,BidsApi,BidsManual,LanguageCode,Age,Interest,LoanDuration,Education,MaritalStatus,EmploymentDurationCurrentEmployer,HomeOwnershipType,IncomeTotal,ExistingLiabilities,LiabilitiesTotal,DebtToIncome,FreeCash,MonthlyPaymentDay,Rating,PreviousRepaymentsBeforeLoan,Amount
0,1.514511,-0.600015,-0.735639,0.679344,-0.571035,-0.578475,1.013045,-1.432588,-1.471876,0.177078,0.485540,0.057449,-0.730468,0.412731,0.540829,1.760084,0.590578,0.388729,-1.122341,0.726504
1,0.889354,-0.600015,-1.221087,0.679344,-0.002399,0.820720,-0.739233,-1.432588,0.766901,-0.941868,0.485540,0.676577,0.665015,1.232844,-0.710544,-0.667953,1.712961,-1.324582,-0.426560,0.161908
2,-0.345073,-0.600015,1.318550,0.679344,1.800898,0.662758,-0.739233,-0.374459,0.766901,1.942641,-1.195306,0.854043,-1.435218,-1.376408,-0.710544,-0.667953,-1.033241,0.828334,0.661991,1.205649
3,1.759552,-0.600015,0.487119,-1.048780,-0.183889,-1.341600,1.013045,0.527018,0.766901,0.613300,1.126661,-0.065421,-0.145491,-0.711937,-0.710544,-0.667953,-0.538976,1.597913,1.774609,1.636884
4,-0.848062,-0.600015,0.897450,1.331608,1.188655,-0.358838,-1.861874,1.294330,-0.977484,-0.941868,-1.195306,-0.190918,-0.730468,0.003686,1.150903,1.725165,1.712961,0.388729,-1.122341,-0.737592
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16115,0.652554,-0.600015,0.385007,0.679344,-0.775521,1.140038,1.013045,0.527018,0.766901,-0.941868,0.485540,0.464558,-1.435218,-1.376408,-0.710544,-0.667953,0.949883,-1.324582,0.661991,0.305505
16116,1.546881,-0.600015,-1.230728,-0.096439,-1.315038,-0.189556,-1.257476,-0.374459,-1.278986,-1.607736,0.043465,-1.245315,1.166394,0.275705,1.738680,1.271243,0.590578,-0.113645,-1.122341,0.726504
16117,-0.935007,-0.600015,-0.404619,1.331608,0.333446,0.428687,-0.739233,-0.374459,0.766901,0.993044,-1.195306,0.404721,-0.145491,0.708142,-0.710544,-0.667953,-1.557531,0.828334,0.661991,0.235500
16118,-0.776609,-0.600015,-0.212621,1.331608,-0.092065,2.033069,-0.739233,-0.374459,0.766901,0.613300,-1.195306,-0.782524,-0.730468,0.288772,-0.710544,-0.667953,-0.864922,-0.113645,-1.122341,-1.263408


In [None]:
Ada_Repay.score(X_test, y_test)

0.8864752380501444

In [None]:
print(r2_score(y_test, Ada_Repay.predict(X_test)))

0.8864752380501444


In [None]:
# Predict once and derive both error metrics from the same MSE value.
repay_mse = mean_squared_error(y_test, Ada_Repay.predict(X_test))
print("MSE : " , repay_mse)
print("RMSE : " , repay_mse ** 0.5)

MSE :  0.0003981140263195639
RMSE :  0.019952794950070626


In [None]:
results = list(zip(X, Ada_Repay.feature_importances_))

importance = pd.DataFrame(results, columns = ["Feature", "Importance"])

importance = importance.sort_values(by="Importance", ascending=False)

importance.sort_values(by="Importance" , ascending=False).style.background_gradient(cmap='YlOrRd',axis=0)

Unnamed: 0,Feature,Importance
6,LoanDuration,0.866137
5,Interest,0.133845
4,Age,1.8e-05
10,HomeOwnershipType,0.0
2,BidsManual,0.0
3,LanguageCode,0.0
7,Education,0.0
8,MaritalStatus,0.0
9,EmploymentDurationCurrentEmployer,0.0
1,BidsApi,0.0


In [None]:
Ada_Repay.predict(X_test)

array([0.5818956 , 0.52323265, 0.52323265, ..., 0.52323265, 0.5       ,
       0.58333333])

### **Predicting EMI(Monthly Payment)**

In [None]:
X = df.drop(columns=["RepaymentYears","ROI","Defaulted"]).copy()
y = X.pop("EMI")


X_train, X_test, y_train, y_test = train_test_split( X , y, random_state=0 )

X_train = Preprocessing_Pipeline.fit_transform(X_train)

In [None]:
Ada_EMI = AdaBoostRegressor(random_state=0, n_estimators=50, learning_rate=0.1)

Ada_EMI.fit(X_train,y_train)

Ada_EMI.score(X_train,y_train)

0.838683049545949

In [None]:
X_test = Preprocessing_Pipeline.transform(X_test)

In [None]:
Ada_EMI.score(X_test, y_test)

0.8342705498125706

In [None]:
print(r2_score(y_test, Ada_EMI.predict(X_test)))

0.8342705498125706


In [None]:
# Predict once and derive both error metrics from the same MSE value.
emi_mse = mean_squared_error(y_test, Ada_EMI.predict(X_test))
print("MSE : " , emi_mse)
print("RMSE : " , emi_mse ** 0.5)

MSE :  1021.0314973818529
RMSE :  31.95358348263701


In [None]:
results = list(zip(X, Ada_EMI.feature_importances_))

importance = pd.DataFrame(results, columns = ["Feature", "Importance"])

importance = importance.sort_values(by="Importance", ascending=False)

importance.sort_values(by="Importance" , ascending=False).style.background_gradient(cmap='YlOrRd',axis=0)

Unnamed: 0,Feature,Importance
19,Amount,0.761423
6,LoanDuration,0.195004
5,Interest,0.042169
2,BidsManual,0.001404
11,IncomeTotal,0.0
3,LanguageCode,0.0
4,Age,0.0
7,Education,0.0
8,MaritalStatus,0.0
9,EmploymentDurationCurrentEmployer,0.0


In [None]:
Ada_EMI.predict(X_test)

array([ 90.47590901,  39.82482369, 148.1251853 , ..., 248.11167995,
        47.0554509 , 148.1251853 ])

### **Predicting ROI**

In [None]:
X = df.drop(columns=["RepaymentYears","EMI","Defaulted"]).copy()
y = X.pop("ROI")


X_train, X_test, y_train, y_test = train_test_split( X , y, random_state=0 )

X_train = Preprocessing_Pipeline.fit_transform(X_train)

In [None]:
Ada_ROI = AdaBoostRegressor(random_state=0)

Ada_ROI.fit(X_train,y_train)

Ada_ROI.score(X_train,y_train)

0.9956883536562728

In [None]:
Ada_ROI.get_params()

{'base_estimator': None,
 'learning_rate': 1.0,
 'loss': 'linear',
 'n_estimators': 50,
 'random_state': 0}

In [None]:
X_test = Preprocessing_Pipeline.transform(X_test)

In [None]:
Ada_ROI.score(X_test, y_test)

0.9956011239200778

In [None]:
print(r2_score(y_test, Ada_ROI.predict(X_test)))

0.9956011239200778


In [None]:
# Predict once and derive both error metrics from the same MSE value.
roi_mse = mean_squared_error(y_test, Ada_ROI.predict(X_test))
print("MSE : " , roi_mse)
print("RMSE : " , roi_mse ** 0.5)

MSE :  0.44010806372809125
RMSE :  0.6634064091701943


In [None]:
results = list(zip(X, Ada_ROI.feature_importances_))

importance = pd.DataFrame(results, columns = ["Feature", "Importance"])

importance = importance.sort_values(by="Importance", ascending=False)

importance.sort_values(by="Importance" , ascending=False).style.background_gradient(cmap='YlOrRd',axis=0)

Unnamed: 0,Feature,Importance
5,Interest,1.0
0,BidsPortfolioManager,0.0
2,BidsManual,0.0
3,LanguageCode,0.0
4,Age,0.0
6,LoanDuration,0.0
7,Education,0.0
8,MaritalStatus,0.0
9,EmploymentDurationCurrentEmployer,0.0
1,BidsApi,0.0


In [None]:
Ada_ROI.predict(X_test)

array([36.55629313, 16.89779458, 25.78616304, ..., 42.59333545,
       11.23493058, 18.34331085])

# **Saving Models**

**Saving Preprocessing Pipeline**

In [None]:
import pickle

# Persist the fitted preprocessing pipeline. The context manager closes (and
# therefore flushes) the file even if pickling raises; the original left the
# handle returned by open() dangling.
with open('/content/drive/MyDrive/Technocolabs/Models/Preprocessing_Pipeline.pkl', 'wb') as pipeline_file:
    pickle.dump(Preprocessing_Pipeline, pipeline_file)

**Saving Classification Models**

In [None]:
import joblib

# Persist the classifier with joblib (compression level 3). The file is opened
# in a context manager so the handle is always closed; the original never
# closed the object returned by open().
with open('/content/drive/MyDrive/Technocolabs/Models/RF_Classifier.pkl', 'wb') as classifier_file:
    joblib.dump(RF_Classifier, classifier_file, compress=3)

**Saving Regression Models**

In [None]:
# Persist the repayment-years regressor; `with` guarantees the file handle is
# closed (the original left it open).
with open('/content/drive/MyDrive/Technocolabs/Models/Ada_Repay.pkl', 'wb') as repay_file:
    pickle.dump(Ada_Repay, repay_file)

In [None]:
# Persist the EMI regressor; `with` guarantees the file handle is closed
# (the original left it open).
with open('/content/drive/MyDrive/Technocolabs/Models/Ada_EMI.pkl', 'wb') as emi_file:
    pickle.dump(Ada_EMI, emi_file)

In [None]:
# Persist the ROI regressor; `with` guarantees the file handle is closed
# (the original left it open).
with open('/content/drive/MyDrive/Technocolabs/Models/Ada_ROI.pkl', 'wb') as roi_file:
    pickle.dump(Ada_ROI, roi_file)

# **Making Prediction Function With All Target Variables**

In [None]:
def Make_Predictions(df, model_dir="."):
    """Predict all four target variables for a frame of loan features.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns (the notebook calls this with the four target columns
        dropped). Must contain every column listed in ``cat_cols`` below.
        The frame is copied first, so the caller's object is left untouched.
    model_dir : str, optional
        Directory containing ``RF_Classifier.pkl``, ``Ada_Repay.pkl``,
        ``Ada_EMI.pkl`` and ``Ada_ROI.pkl``. Defaults to the current working
        directory, which is where the bare file names used before resolved.
        The saving cells of this notebook write the models to a Drive folder;
        pass that folder here to load them from where they were saved.

    Returns
    -------
    pandas.DataFrame
        One prediction per input row, in columns ``Defaulted``,
        ``RepaymentYears``, ``EMI`` and ``ROI``.

    Notes
    -----
    * Preprocessing uses the notebook-level ``Preprocessing_Pipeline`` object
      (it is not loaded from disk), so that object must already be fitted.
    * The model files are unpickled; unpickling can execute arbitrary code,
      so only point ``model_dir`` at files you created or trust.
    """
    import os  # local import: keeps the function self-contained in the notebook

    cat_cols = ['LanguageCode', 'Education', 'MaritalStatus',
                'EmploymentDurationCurrentEmployer', 'HomeOwnershipType', 'Rating']

    # Work on a copy: the column assignments below would otherwise silently
    # rewrite the caller's DataFrame.
    df = df.copy()

    # Converting Categoricals
    for col in cat_cols:
        df[col] = df[col].astype("category")

    # Encoding Categoricals
    # NOTE(review): factorize() numbers the categories in order of first
    # appearance in *this* frame. The codes only agree with the ones used for
    # training if the input presents categories in the same order — confirm,
    # or persist the training-time mapping and reuse it here.
    for colname in df.select_dtypes(["object", "category", "bool"]):
        df[colname], _ = df[colname].factorize()

    # Preprocessing Data (uses the in-memory, already-fitted pipeline)
    features = Preprocessing_Pipeline.transform(df)

    def _load_model(loader, filename):
        # Open the model file in a context manager so the handle is closed
        # as soon as the model has been deserialised.
        with open(os.path.join(model_dir, filename), 'rb') as model_file:
            return loader(model_file)

    Predictions = {}

    # Making Classification Predictions
    RF_Classifier = _load_model(joblib.load, 'RF_Classifier.pkl')
    Predictions["Defaulted"] = RF_Classifier.predict(features)

    # Making Regression Predictions
    # Repayment Years Prediction
    Ada_Repay = _load_model(pickle.load, 'Ada_Repay.pkl')
    Predictions["RepaymentYears"] = Ada_Repay.predict(features)

    # Making EMI Predictions
    Ada_EMI = _load_model(pickle.load, 'Ada_EMI.pkl')
    Predictions["EMI"] = Ada_EMI.predict(features)

    # Making ROI Predictions
    Ada_ROI = _load_model(pickle.load, 'Ada_ROI.pkl')
    Predictions["ROI"] = Ada_ROI.predict(features)

    # Converting to DataFrame (once; the original wrapped the result twice)
    return pd.DataFrame(Predictions)

In [None]:
# Predict every target for the whole dataset; the four target columns are
# removed first so that only feature columns reach the models.
preds = Make_Predictions(
    df.drop(["RepaymentYears", "ROI", "EMI", "Defaulted"], axis="columns")
)
preds

Unnamed: 0,Defaulted,RepaymentYears,EMI,ROI
0,0,0.416667,47.196177,22.831531
1,0,0.166667,131.774631,20.604271
2,1,0.523233,39.824824,20.604271
3,0,0.416667,47.196177,22.831531
4,0,0.523233,39.824824,22.831531
...,...,...,...,...
64472,0,0.581896,117.372498,23.572128
64473,1,0.583333,117.372498,15.926540
64474,1,0.581896,117.243216,20.604271
64475,0,0.583333,117.372498,15.271707


In [None]:
# Summary statistics of the four prediction columns.
preds.describe()

Unnamed: 0,Defaulted,RepaymentYears,EMI,ROI
count,64477.0,64477.0,64477.0,64477.0
mean,0.616685,0.529908,107.429671,26.005362
std,0.486198,0.051688,66.108809,9.919905
min,0.0,0.166667,39.824824,11.234931
25%,0.0,0.5,45.107868,18.176329
50%,1.0,0.523233,107.469293,24.021237
75%,1.0,0.581896,148.125185,35.008566
max,1.0,0.583333,360.31633,58.704386


**Evaluating Performance on the Full Dataset**

In [None]:
# Accuracy of the predicted default flag against the true labels, over every row of df.
# NOTE(review): df presumably includes the rows the classifier was trained on,
# so this is not a held-out estimate — confirm.
accuracy_score(df["Defaulted"], preds["Defaulted"])

0.9334956651208958

In [None]:
# R^2 of the predicted repayment years against the true values, over every row of df.
# NOTE(review): presumably includes the model's training rows — confirm.
r2_score(df["RepaymentYears"],preds["RepaymentYears"])

0.8871930256175925

In [None]:
# R^2 of the predicted EMI against the true values, over every row of df.
# NOTE(review): presumably includes the model's training rows — confirm.
r2_score(df["EMI"],preds["EMI"])

0.8375933131332061

In [None]:
# R^2 of the predicted ROI against the true values, over every row of df.
# The ROI model was trained on a split of this same df, so the rows scored
# here include its training data.
r2_score(df["ROI"],preds["ROI"])

0.9956666370955746