### Load libraries

In [1]:
from warnings import simplefilter

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (f1_score, precision_score, recall_score)
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.pipeline import (Pipeline, make_pipeline)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

simplefilter(action='ignore', category=FutureWarning)
simplefilter("always", ConvergenceWarning)

### Load data

In [2]:
data = pd.read_csv('Train_nyOWmfK.csv', encoding='latin1')

### Data informations, statistics and describe

In [3]:
data.shape

(87020, 26)

In [4]:
data.head(10)

Unnamed: 0,ID,Gender,City,Monthly_Income,DOB,Lead_Creation_Date,Loan_Amount_Applied,Loan_Tenure_Applied,Existing_EMI,Employer_Name,...,Interest_Rate,Processing_Fee,EMI_Loan_Submitted,Filled_Form,Device_Type,Var2,Source,Var4,LoggedIn,Disbursed
0,ID000002C20,Female,Delhi,20000,23-May-78,15-May-15,300000.0,5.0,0.0,CYBOSOL,...,,,,N,Web-browser,G,S122,1,0,0
1,ID000004E40,Male,Mumbai,35000,07-Oct-85,04-May-15,200000.0,2.0,0.0,TATA CONSULTANCY SERVICES LTD (TCS),...,13.25,,6762.9,N,Web-browser,G,S122,3,0,0
2,ID000007H20,Male,Panchkula,22500,10-Oct-81,19-May-15,600000.0,4.0,0.0,ALCHEMIST HOSPITALS LTD,...,,,,N,Web-browser,B,S143,1,0,0
3,ID000008I30,Male,Saharsa,35000,30-Nov-87,09-May-15,1000000.0,5.0,0.0,BIHAR GOVERNMENT,...,,,,N,Web-browser,B,S143,3,0,0
4,ID000009J40,Male,Bengaluru,100000,17-Feb-84,20-May-15,500000.0,2.0,25000.0,GLOBAL EDGE SOFTWARE,...,,,,N,Web-browser,B,S134,3,1,0
5,ID000010K00,Male,Bengaluru,45000,21-Apr-82,20-May-15,300000.0,5.0,15000.0,COGNIZANT TECHNOLOGY SOLUTIONS INDIA PVT LTD,...,13.99,1500.0,6978.92,N,Web-browser,B,S143,3,1,0
6,ID000011L10,Female,Sindhudurg,70000,23-Oct-87,01-May-15,6.0,5.0,0.0,CARNIVAL CRUISE LINE,...,,,,N,Web-browser,B,S133,1,0,0
7,ID000012M20,Male,Bengaluru,20000,25-Jul-75,20-May-15,200000.0,5.0,2597.0,GOLDEN TULIP FLORITECH PVT. LTD,...,,,,N,Web-browser,B,S159,3,0,0
8,ID000013N30,Male,Kochi,75000,26-Jan-72,02-May-15,0.0,0.0,0.0,SIIS PVT LTD,...,14.85,26000.0,30824.65,Y,Mobile,C,S122,5,0,0
9,ID000014O40,Female,Mumbai,30000,12-Sep-89,03-May-15,300000.0,3.0,0.0,SOUNDCLOUD.COM,...,18.25,1500.0,10883.38,N,Web-browser,B,S133,1,0,0


In [5]:
data.dtypes

ID                        object
Gender                    object
City                      object
Monthly_Income             int64
DOB                       object
Lead_Creation_Date        object
Loan_Amount_Applied      float64
Loan_Tenure_Applied      float64
Existing_EMI             float64
Employer_Name             object
Salary_Account            object
Mobile_Verified           object
Var5                       int64
Var1                      object
Loan_Amount_Submitted    float64
Loan_Tenure_Submitted    float64
Interest_Rate            float64
Processing_Fee           float64
EMI_Loan_Submitted       float64
Filled_Form               object
Device_Type               object
Var2                      object
Source                    object
Var4                       int64
LoggedIn                   int64
Disbursed                  int64
dtype: object

In [6]:
data.describe(percentiles=[0.2, 0.4, 0.6, 0.8])

Unnamed: 0,Monthly_Income,Loan_Amount_Applied,Loan_Tenure_Applied,Existing_EMI,Var5,Loan_Amount_Submitted,Loan_Tenure_Submitted,Interest_Rate,Processing_Fee,EMI_Loan_Submitted,Var4,LoggedIn,Disbursed
count,87020.0,86949.0,86949.0,86949.0,87020.0,52407.0,52407.0,27726.0,27420.0,27726.0,87020.0,87020.0,87020.0
mean,58849.97,230250.7,2.131399,3696.228,4.961503,395010.6,3.891369,19.197474,5131.150839,10999.528377,2.949805,0.02935,0.014629
std,2177511.0,354206.8,2.014193,39810.21,5.670385,308248.1,1.165359,5.834213,4725.837644,7512.32305,1.69772,0.168785,0.120062
min,0.0,0.0,0.0,0.0,0.0,50000.0,1.0,11.99,200.0,1176.41,0.0,0.0,0.0
20%,15000.0,0.0,0.0,0.0,0.0,190000.0,3.0,14.85,2000.0,5422.58,1.0,0.0,0.0
40%,20609.2,100000.0,1.0,0.0,1.0,290000.0,4.0,16.35,3000.0,8342.71,3.0,0.0,0.0
50%,25000.0,100000.0,2.0,0.0,2.0,300000.0,4.0,18.0,4000.0,9392.97,3.0,0.0,0.0
60%,28000.0,200000.0,3.0,0.0,3.0,370000.0,4.0,18.4,4400.0,10597.55,3.0,0.0,0.0
80%,45000.0,300000.0,5.0,5000.0,11.0,500000.0,5.0,20.0,7500.0,14362.93,5.0,0.0,0.0
max,444554400.0,10000000.0,10.0,10000000.0,18.0,3000000.0,6.0,37.0,50000.0,144748.28,7.0,1.0,1.0


### train-test split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(["LoggedIn", "Disbursed"], axis=1),
                                          data.Disbursed,
                                          test_size=10000,
                                          random_state=123)

### data preparation for machine learning, pre-processing and processing data  by type

In [8]:
def  Preproces(data):
    data = data.drop(["ID", "Lead_Creation_Date", "Employer_Name", "City", "Salary_Account"], axis=1)
    data["Gender"].replace({'Female': 0, 'Male':1}, inplace = True)
    data["Mobile_Verified"].replace({'Y': 0, 'N':1}, inplace = True)
    data["Filled_Form"].replace({'Y': 0, 'N':1}, inplace = True)
    data["Device_Type"].replace({'Web-browser': 0, 'Mobile':1}, inplace = True)
    data["DOB"] = 2015 -('19' + data["DOB"].str[-2:]).astype('int')
    
    return data

def Proces():
    num_features = ["Monthly_Income", "DOB", "Loan_Amount_Applied", "Loan_Tenure_Applied", "Existing_EMI",
                   "Loan_Amount_Submitted", "Loan_Tenure_Submitted", "Interest_Rate", "Processing_Fee",
                   "EMI_Loan_Submitted", "Var5"]
    num_pipeline = make_pipeline(SimpleImputer(strategy = "constant", fill_value = 0), 
                                 StandardScaler())
    cat_feauters = ["Var1", "Var2", "Var4", "Source"]
    cat_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"),
                                OneHotEncoder(handle_unknown="ignore"))
    other_feauters = ["Gender", "Mobile_Verified", "Filled_Form", "Device_Type"]
    other_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"))

    processor = ColumnTransformer(transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_feauters),
        ("cabin", other_pipeline, other_feauters)
    ])
    return processor

In [9]:
X_test = Preproces(X_test)
X_train = Preproces(X_train)

### machine learning process pipeline with two models and a grid search including data imbalance

In [21]:
num_splits = 10

param_grid_lr =  [{
    'penalty': ['l2'],
    'C' : np.logspace(-2.5, 0.7, 10),
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'saga'],
    'class_weight': ['balanced']
    },
    {
    'penalty': ['l1'],
    'C' : np.logspace(-2.5, 0.7, 10),
    'solver' : ['liblinear', 'saga'],
    'class_weight': ['balanced']
    },
    {
    'penalty': ['elasticnet'],
    'C' : np.logspace(-2.5, 0.7, 10),
    'solver' : ['saga'],
    'class_weight': ['balanced']
    }
]
param_grid_rf =  [{
    'n_estimators' : list(range(61,101,15)),
    'max_features' : list(range(12,53,15)),
    'min_samples_split': list(range(500,1103,150)),
    'class_weight': ['balanced', 'balanced_subsample']
    }]

kf = KFold(n_splits=num_splits)
grid_lr = GridSearchCV(LogisticRegression(), param_grid=param_grid_lr, 
                       cv=kf, verbose=10, refit=True,  n_jobs=-1)
grid_rf = GridSearchCV(RandomForestClassifier(), param_grid=param_grid_rf, 
                       cv=kf, verbose=10, refit=True, n_jobs=-1)
                                 
classification_process_logistic_regression = make_pipeline(
    Proces(),
    SelectFromModel(RandomForestClassifier(n_estimators=25, max_features='auto'), 0.0001),
    grid_lr)

classification_process_random_forest = make_pipeline(
    Proces(),
    SelectFromModel(RandomForestClassifier(n_estimators=25, max_features='auto'), 0.0001),
    grid_rf)

In [11]:
classification_process_logistic_regression.fit(X_train, y_train)
classification_process_logistic_regression.predict(X_test)

Fitting 10 folds for each of 70 candidates, totalling 700 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   21.5s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   36.4s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   59.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  4

array([0, 0, 1, ..., 0, 1, 1], dtype=int64)

In [15]:
print(precision_score(y_test, classification_process_logistic_regression.predict_proba(X_test)[:,1]>0.7, 
                      average='macro', zero_division=0))
print(recall_score(y_test, classification_process_logistic_regression.predict_proba(X_test)[:,1]>0.7, 
                   average='macro', zero_division=0))
print(f1_score(y_test, classification_process_logistic_regression.predict_proba(X_test)[:,1]>0.7,
               average='macro', zero_division=0))

0.526307029096567
0.6917680379080985
0.5208957958281591


In [22]:
classification_process_random_forest.fit(X_train, y_train)
classification_process_random_forest.predict(X_test)

Fitting 10 folds for each of 90 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 15.1min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 17.4min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 20.3min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 22.8min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 25.7min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 28.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 32.0min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed: 41

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [23]:
print(precision_score(y_test, classification_process_random_forest.predict_proba(X_test)[:,1]>0.7,
                       average='macro', zero_division=0))
print(recall_score(y_test, classification_process_random_forest.predict_proba(X_test)[:,1]>0.7,
                   average='macro', zero_division=0))
print(f1_score(y_test, classification_process_random_forest.predict_proba(X_test)[:,1]>0.7,
               average='macro', zero_division=0))

0.5323147412903991
0.6245503922431503
0.5432502902179572


### for comparison, the same pipline with the wrong assumption or ignorance that the data is unbalanced

In [26]:
num_splits = 10

param_grid_lr =  [{
    'penalty': ['l2'],
    'C' : np.logspace(-2.5, 0.7, 10),
    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'saga'],
    'class_weight': [None]
    },
    {
    'penalty': ['l1'],
    'C' : np.logspace(-2.5, 0.7, 10),
    'solver' : ['liblinear', 'saga'],
    'class_weight': [None]
    },
    {
    'penalty': ['elasticnet'],
    'C' : np.logspace(-2.5, 0.7, 10),
    'solver' : ['saga'],
    'class_weight': [None]
    }
]
param_grid_rf =  [{
    'n_estimators' : list(range(61,101,15)),
    'max_features' : list(range(12,53,15)),
    'min_samples_split': list(range(500,1103,150)),
    'class_weight': [None]
    }]

kf = KFold(n_splits=num_splits)
grid_lr = GridSearchCV(LogisticRegression(), param_grid=param_grid_lr, 
                       cv=kf, verbose=10, refit=True,  n_jobs=-1)
grid_rf = GridSearchCV(RandomForestClassifier(), param_grid=param_grid_rf, 
                       cv=kf, verbose=10, refit=True, n_jobs=-1)
                                 
classification_process_logistic_regression_all = make_pipeline(
    Proces(),
    SelectFromModel(RandomForestClassifier(n_estimators=25, max_features='auto'), 0.0001),
    grid_lr)

classification_process_random_forest_all = make_pipeline(
    Proces(),
    SelectFromModel(RandomForestClassifier(n_estimators=25, max_features='auto'), 0.0001),
    grid_rf)

In [27]:
classification_process_logistic_regression_all.fit(X_train, y_train)
classification_process_logistic_regression_all.predict(X_test)

Fitting 10 folds for each of 70 candidates, totalling 700 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   33.4s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   54.1s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  3

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [28]:
print(precision_score(y_test, classification_process_logistic_regression_all.predict_proba(X_test)[:,1]>0.7, 
                      average='macro', zero_division=0))
print(recall_score(y_test, classification_process_logistic_regression_all.predict_proba(X_test)[:,1]>0.7, 
                   average='macro', zero_division=0))
print(f1_score(y_test, classification_process_logistic_regression_all.predict_proba(X_test)[:,1]>0.7,
               average='macro', zero_division=0))

0.49235
0.5
0.4961455131757948


In [29]:
classification_process_random_forest_all.fit(X_train, y_train)
classification_process_random_forest_all.predict(X_test)

Fitting 10 folds for each of 45 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed: 16.1min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 18.7min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed: 21.1min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 23.7min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 26.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 29.7min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed: 38

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [30]:
print(precision_score(y_test, classification_process_random_forest_all.predict_proba(X_test)[:,1]>0.7,
                       average='macro', zero_division=0))
print(recall_score(y_test, classification_process_random_forest_all.predict_proba(X_test)[:,1]>0.7,
                   average='macro', zero_division=0))
print(f1_score(y_test, classification_process_random_forest_all.predict_proba(X_test)[:,1]>0.7,
               average='macro', zero_division=0))

0.49235
0.5
0.4961455131757948


### Conclusion
For unbalanced data, balancing or choosing methods that can oppose this is important, as shown in the attached example.
Both methods, after taking into account the balance between classes, came out better than without taking into account. The methods without balancing were below 50% prediction.
The method is more resistant (by itself more resistant) to such a problem, the random forest did better in this case, obtaining f1 score 54%.