In [1]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Load the datasets
raw_train = pd.read_csv("train.csv")
raw_test = pd.read_csv("test.csv")

In [3]:
# Check df
raw_train.head()

Unnamed: 0,customer_id,rate,amount,purpose,period,cus_age,gender,education_level,marital_status,has_children,living_situation,total_experience,income,job_sector,DTI,APR,ccr_tot_mounth_amt,ccr_payed_loan_tot_amt,ccr_act_loan_tot_rest_amt,loan_status
0,3683,20.506515,4835.783206,Personal,11,35,Female,Educated,Single,Yes,Dependent,137,1063.261168,Public,31.798407,19.605675,63.301509,0.0,168.928244,0
1,11464,19.46815,5043.251855,Personal,8,43,Male,Educated,Single,Yes,Independent,184,165.723689,Public,41.770757,30.610585,46.914918,12824.121594,1168.763167,1
2,2170,19.825773,4110.494869,Maintenance,17,39,Female,Unknown/Other,Single,No,Dependent,0,87.185314,Private,35.620239,19.827671,9.626135,5874.631144,345.921464,1
3,10330,19.112241,3977.84654,Personal,31,33,Female,Unknown/Other,Single,No,Dependent,65,727.132601,Private,31.287542,21.566299,0.0,6897.890318,0.0,0
4,14564,19.995923,3864.3562,Maintenance,12,26,Male,Unknown/Other,Married,Yes,Dependent,107,1507.508695,Private,27.803383,21.229908,16.979558,2100.455986,0.0,0


In [4]:
# Check shape
raw_train.shape

(13852, 20)

In [5]:
# Check number of unique values in the columns
raw_train.nunique()

customer_id                  13852
rate                         13852
amount                       11837
purpose                          2
period                          54
cus_age                         59
gender                           2
education_level                  2
marital_status                   2
has_children                     2
living_situation                 2
total_experience               404
income                       13311
job_sector                       2
DTI                          13843
APR                          13852
ccr_tot_mounth_amt           11008
ccr_payed_loan_tot_amt       10855
ccr_act_loan_tot_rest_amt    10281
loan_status                      2
dtype: int64

In [6]:
# Copying datasets.
train_df = raw_train.copy()
test_df = raw_test.copy()

In [7]:
# Check general information about dataset.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13852 entries, 0 to 13851
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customer_id                13852 non-null  int64  
 1   rate                       13852 non-null  float64
 2   amount                     12460 non-null  float64
 3   purpose                    13852 non-null  object 
 4   period                     13852 non-null  int64  
 5   cus_age                    13852 non-null  int64  
 6   gender                     13852 non-null  object 
 7   education_level            13852 non-null  object 
 8   marital_status             12523 non-null  object 
 9   has_children               13852 non-null  object 
 10  living_situation           13852 non-null  object 
 11  total_experience           13852 non-null  int64  
 12  income                     13852 non-null  float64
 13  job_sector                 12466 non-null  obj

In [8]:
# loan_status column is the target column and missing in the test data.
# So we can use this test data only for generating the prediction.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3464 entries, 0 to 3463
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customer_id                3464 non-null   int64  
 1   rate                       3464 non-null   float64
 2   amount                     3125 non-null   float64
 3   purpose                    3464 non-null   object 
 4   period                     3464 non-null   int64  
 5   cus_age                    3464 non-null   int64  
 6   gender                     3464 non-null   object 
 7   education_level            3464 non-null   object 
 8   marital_status             3106 non-null   object 
 9   has_children               3464 non-null   object 
 10  living_situation           3464 non-null   object 
 11  total_experience           3464 non-null   int64  
 12  income                     3464 non-null   float64
 13  job_sector                 3102 non-null   objec

In [9]:
# To continue the preprocessing we will separate loan_status column from the train_df dataframe.
train_y = train_df["loan_status"].copy()

In [10]:
# Let's drop loan_status column.
train_df.drop(columns=["loan_status"], inplace = True)

In [11]:
# Let's validate.
# train_df and test_df both are now in sync.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13852 entries, 0 to 13851
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customer_id                13852 non-null  int64  
 1   rate                       13852 non-null  float64
 2   amount                     12460 non-null  float64
 3   purpose                    13852 non-null  object 
 4   period                     13852 non-null  int64  
 5   cus_age                    13852 non-null  int64  
 6   gender                     13852 non-null  object 
 7   education_level            13852 non-null  object 
 8   marital_status             12523 non-null  object 
 9   has_children               13852 non-null  object 
 10  living_situation           13852 non-null  object 
 11  total_experience           13852 non-null  int64  
 12  income                     13852 non-null  float64
 13  job_sector                 12466 non-null  obj

In [12]:
# Dropping the unnecessary columns
train_df.drop(columns="customer_id", inplace = True)
test_df.drop(columns="customer_id", inplace = True)

In [13]:
# Check columns in train_df
train_df.columns

Index(['rate', 'amount', 'purpose', 'period', 'cus_age', 'gender',
       'education_level', 'marital_status', 'has_children', 'living_situation',
       'total_experience', 'income', 'job_sector', 'DTI', 'APR',
       'ccr_tot_mounth_amt', 'ccr_payed_loan_tot_amt',
       'ccr_act_loan_tot_rest_amt'],
      dtype='object')

In [14]:
# Let's check duplicates.
# There are no duplicates.
train_df[train_df.duplicated()]

Unnamed: 0,rate,amount,purpose,period,cus_age,gender,education_level,marital_status,has_children,living_situation,total_experience,income,job_sector,DTI,APR,ccr_tot_mounth_amt,ccr_payed_loan_tot_amt,ccr_act_loan_tot_rest_amt


In [15]:
test_df[test_df.duplicated()]

Unnamed: 0,rate,amount,purpose,period,cus_age,gender,education_level,marital_status,has_children,living_situation,total_experience,income,job_sector,DTI,APR,ccr_tot_mounth_amt,ccr_payed_loan_tot_amt,ccr_act_loan_tot_rest_amt


In [16]:
# Missing Values Analysis.
train_df.isna().sum()

rate                            0
amount                       1392
purpose                         0
period                          0
cus_age                         0
gender                          0
education_level                 0
marital_status               1329
has_children                    0
living_situation                0
total_experience                0
income                          0
job_sector                   1386
DTI                             0
APR                             0
ccr_tot_mounth_amt              0
ccr_payed_loan_tot_amt          0
ccr_act_loan_tot_rest_amt       0
dtype: int64

In [17]:
# Let's impute missing values.
# Numeric --> mean
# Categorical --> mode

num_cols = ["rate", "amount", "period", "cus_age", "total_experience", "ccr_act_loan_tot_rest_amt",
            "income", "DTI", "APR", "ccr_tot_mounth_amt", "ccr_payed_loan_tot_amt"]

cat_cols = ["purpose", "gender", "education_level", "marital_status", "has_children", "living_situation", "job_sector"]

In [18]:
# Let's impute categorical columns.
cat_imputer = SimpleImputer(strategy="most_frequent")
cat_imputer.fit(train_df[cat_cols])

train_df[cat_cols] = cat_imputer.transform(train_df[cat_cols])
test_df[cat_cols] = cat_imputer.transform(test_df[cat_cols])

In [19]:
# Let's impute numerical columns.
num_imputer = SimpleImputer(strategy="mean")
num_imputer.fit(train_df[num_cols])

train_df[num_cols] = num_imputer.transform(train_df[num_cols])
test_df[num_cols] = num_imputer.transform(test_df[num_cols])

In [20]:
# Recheck missing values.
train_df.isna().sum()

rate                         0
amount                       0
purpose                      0
period                       0
cus_age                      0
gender                       0
education_level              0
marital_status               0
has_children                 0
living_situation             0
total_experience             0
income                       0
job_sector                   0
DTI                          0
APR                          0
ccr_tot_mounth_amt           0
ccr_payed_loan_tot_amt       0
ccr_act_loan_tot_rest_amt    0
dtype: int64

In [21]:
# Let's apply simple feature engineering
# Convert monthly income to annual income
train_df["annual_income"] = train_df["income"] * 12
test_df["annual_income"] = test_df["income"] * 12

# Creating Income to Loan Amount ratio column
train_df["income_to_loan_ratio"] = train_df["annual_income"] / train_df["amount"]
test_df["income_to_loan_ratio"] = test_df["annual_income"] / test_df["amount"]

# We'll just use annual income, so let's drop income column.
train_df.drop(columns="income", inplace = True)
test_df.drop(columns="income", inplace = True)

In [22]:
# Let's check train_df
train_df.head()

Unnamed: 0,rate,amount,purpose,period,cus_age,gender,education_level,marital_status,has_children,living_situation,total_experience,job_sector,DTI,APR,ccr_tot_mounth_amt,ccr_payed_loan_tot_amt,ccr_act_loan_tot_rest_amt,annual_income,income_to_loan_ratio
0,20.506515,4835.783206,Personal,11.0,35.0,Female,Educated,Single,Yes,Dependent,137.0,Public,31.798407,19.605675,63.301509,0.0,168.928244,12759.13402,2.638483
1,19.46815,5043.251855,Personal,8.0,43.0,Male,Educated,Single,Yes,Independent,184.0,Public,41.770757,30.610585,46.914918,12824.121594,1168.763167,1988.684268,0.394326
2,19.825773,4110.494869,Maintenance,17.0,39.0,Female,Unknown/Other,Single,No,Dependent,0.0,Private,35.620239,19.827671,9.626135,5874.631144,345.921464,1046.223771,0.254525
3,19.112241,3977.84654,Personal,31.0,33.0,Female,Unknown/Other,Single,No,Dependent,65.0,Private,31.287542,21.566299,0.0,6897.890318,0.0,8725.591215,2.193546
4,19.995923,3864.3562,Maintenance,12.0,26.0,Male,Unknown/Other,Married,Yes,Dependent,107.0,Private,27.803383,21.229908,16.979558,2100.455986,0.0,18090.104336,4.681272


In [23]:
# Application of Label Encoder

* Each of the categorical column has two unique values, so we can safely apply just label encoder, otherwise we should consider using one-hot encoding to maintain data integrity and prevent unintended ordinality or bias.

In [24]:
for col in cat_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

In [25]:
# Let's check train_df
train_df.head()

Unnamed: 0,rate,amount,purpose,period,cus_age,gender,education_level,marital_status,has_children,living_situation,total_experience,job_sector,DTI,APR,ccr_tot_mounth_amt,ccr_payed_loan_tot_amt,ccr_act_loan_tot_rest_amt,annual_income,income_to_loan_ratio
0,20.506515,4835.783206,1,11.0,35.0,0,0,1,1,0,137.0,1,31.798407,19.605675,63.301509,0.0,168.928244,12759.13402,2.638483
1,19.46815,5043.251855,1,8.0,43.0,1,0,1,1,1,184.0,1,41.770757,30.610585,46.914918,12824.121594,1168.763167,1988.684268,0.394326
2,19.825773,4110.494869,0,17.0,39.0,0,1,1,0,0,0.0,0,35.620239,19.827671,9.626135,5874.631144,345.921464,1046.223771,0.254525
3,19.112241,3977.84654,1,31.0,33.0,0,1,1,0,0,65.0,0,31.287542,21.566299,0.0,6897.890318,0.0,8725.591215,2.193546
4,19.995923,3864.3562,0,12.0,26.0,1,1,0,1,0,107.0,0,27.803383,21.229908,16.979558,2100.455986,0.0,18090.104336,4.681272


In [26]:
# Change numerical columns list
num_cols.append("annual_income")
num_cols.append("income_to_loan_ratio")
num_cols.remove("income")

In [27]:
num_cols

['rate',
 'amount',
 'period',
 'cus_age',
 'total_experience',
 'ccr_act_loan_tot_rest_amt',
 'DTI',
 'APR',
 'ccr_tot_mounth_amt',
 'ccr_payed_loan_tot_amt',
 'annual_income',
 'income_to_loan_ratio']

In [28]:
# Apply log transofmation on numerical columns
# Adding a small constant to avoid log(0)
constant = 1e-6
# Applying log transformation with added constant for safety
train_df[num_cols] = np.log(train_df[num_cols] + constant)
test_df[num_cols] = np.log(test_df[num_cols] + constant)

In [29]:
# Applying MinMax Scaling
sc = StandardScaler()
train_df = sc.fit_transform(train_df)
test_df = sc.transform(test_df)

In [30]:
# Building the model
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_df, train_y, test_size=0.3, random_state=0)

In [31]:
# Let's initialize the model
from sklearn.linear_model import LogisticRegression
log = LogisticRegression()
log.fit(X_train, y_train)

In [32]:
y_pred_test = log.predict(X_test)

In [33]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.94      0.97      0.96      2900
           1       0.92      0.87      0.89      1256

    accuracy                           0.94      4156
   macro avg       0.93      0.92      0.92      4156
weighted avg       0.94      0.94      0.94      4156



In [34]:
# Serialization & Deserialization
import joblib
joblib.dump(log, "log_trained_model_v1.pkl")

['log_trained_model_v1.pkl']

In [35]:
final_model = joblib.load("log_trained_model_v1.pkl")

In [36]:
# Check final model
final_model

In [37]:
final_model.intercept_, final_model.coef_

(array([-2.10390973]),
 array([[ 1.64370645, -0.02871501, -0.10247344, -0.00856863, -0.01678236,
         -0.01824755, -0.03344421,  0.04077326,  0.00984667,  0.08341515,
         -0.46804013, -0.04629651,  2.50561617,  1.99936365, -0.0681306 ,
         -0.13449706,  0.07461028, -0.1532065 , -0.0837062 ]]))

In [38]:
log.intercept_, log.coef_

(array([-2.10390973]),
 array([[ 1.64370645, -0.02871501, -0.10247344, -0.00856863, -0.01678236,
         -0.01824755, -0.03344421,  0.04077326,  0.00984667,  0.08341515,
         -0.46804013, -0.04629651,  2.50561617,  1.99936365, -0.0681306 ,
         -0.13449706,  0.07461028, -0.1532065 , -0.0837062 ]]))