## ðŸ’² Loan Payment Prediction

Given *data about loans*, let's try to predict whether a given loan will be **paid off** or not.

We will use six different models to make our predictions.

Data source: https://www.kaggle.com/datasets/zhijinzhai/loandata

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
data = pd.read_csv('archive/Loan payments data.csv')
data

Unnamed: 0,Loan_ID,loan_status,Principal,terms,effective_date,due_date,paid_off_time,past_due_days,age,education,Gender
0,xqd20166231,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/14/2016 19:31,,45,High School or Below,male
1,xqd20168902,PAIDOFF,1000,30,9/8/2016,10/7/2016,10/7/2016 9:00,,50,Bechalor,female
2,xqd20160003,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/25/2016 16:58,,33,Bechalor,female
3,xqd20160004,PAIDOFF,1000,15,9/8/2016,9/22/2016,9/22/2016 20:00,,27,college,male
4,xqd20160005,PAIDOFF,1000,30,9/9/2016,10/8/2016,9/23/2016 21:36,,28,college,female
...,...,...,...,...,...,...,...,...,...,...,...
495,xqd20160496,COLLECTION_PAIDOFF,1000,30,9/12/2016,10/11/2016,10/14/2016 19:08,3.0,28,High School or Below,male
496,xqd20160497,COLLECTION_PAIDOFF,1000,15,9/12/2016,9/26/2016,10/10/2016 20:02,14.0,26,High School or Below,male
497,xqd20160498,COLLECTION_PAIDOFF,800,15,9/12/2016,9/26/2016,9/29/2016 11:49,3.0,30,college,male
498,xqd20160499,COLLECTION_PAIDOFF,1000,30,9/12/2016,11/10/2016,11/11/2016 22:40,1.0,38,college,female


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Loan_ID         500 non-null    object 
 1   loan_status     500 non-null    object 
 2   Principal       500 non-null    int64  
 3   terms           500 non-null    int64  
 4   effective_date  500 non-null    object 
 5   due_date        500 non-null    object 
 6   paid_off_time   400 non-null    object 
 7   past_due_days   200 non-null    float64
 8   age             500 non-null    int64  
 9   education       500 non-null    object 
 10  Gender          500 non-null    object 
dtypes: float64(1), int64(3), object(7)
memory usage: 43.1+ KB


### Preprocessing

In [47]:
df = data.copy()

In [48]:
df

Unnamed: 0,Loan_ID,loan_status,Principal,terms,effective_date,due_date,paid_off_time,past_due_days,age,education,Gender
0,xqd20166231,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/14/2016 19:31,,45,High School or Below,male
1,xqd20168902,PAIDOFF,1000,30,9/8/2016,10/7/2016,10/7/2016 9:00,,50,Bechalor,female
2,xqd20160003,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/25/2016 16:58,,33,Bechalor,female
3,xqd20160004,PAIDOFF,1000,15,9/8/2016,9/22/2016,9/22/2016 20:00,,27,college,male
4,xqd20160005,PAIDOFF,1000,30,9/9/2016,10/8/2016,9/23/2016 21:36,,28,college,female
...,...,...,...,...,...,...,...,...,...,...,...
495,xqd20160496,COLLECTION_PAIDOFF,1000,30,9/12/2016,10/11/2016,10/14/2016 19:08,3.0,28,High School or Below,male
496,xqd20160497,COLLECTION_PAIDOFF,1000,15,9/12/2016,9/26/2016,10/10/2016 20:02,14.0,26,High School or Below,male
497,xqd20160498,COLLECTION_PAIDOFF,800,15,9/12/2016,9/26/2016,9/29/2016 11:49,3.0,30,college,male
498,xqd20160499,COLLECTION_PAIDOFF,1000,30,9/12/2016,11/10/2016,11/11/2016 22:40,1.0,38,college,female


In [49]:
df['loan_status'].unique()

array(['PAIDOFF', 'COLLECTION', 'COLLECTION_PAIDOFF'], dtype=object)

In [50]:
df.isna().sum()

Loan_ID             0
loan_status         0
Principal           0
terms               0
effective_date      0
due_date            0
paid_off_time     100
past_due_days     300
age                 0
education           0
Gender              0
dtype: int64

In [51]:
{column: len(df[column].unique()) for column in df.columns}

{'Loan_ID': 500,
 'loan_status': 3,
 'Principal': 6,
 'terms': 3,
 'effective_date': 7,
 'due_date': 25,
 'paid_off_time': 321,
 'past_due_days': 34,
 'age': 33,
 'education': 4,
 'Gender': 2}

In [52]:
# Drop Loan_ID column
df = df.drop('Loan_ID', axis=1)

In [53]:
df

Unnamed: 0,loan_status,Principal,terms,effective_date,due_date,paid_off_time,past_due_days,age,education,Gender
0,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/14/2016 19:31,,45,High School or Below,male
1,PAIDOFF,1000,30,9/8/2016,10/7/2016,10/7/2016 9:00,,50,Bechalor,female
2,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/25/2016 16:58,,33,Bechalor,female
3,PAIDOFF,1000,15,9/8/2016,9/22/2016,9/22/2016 20:00,,27,college,male
4,PAIDOFF,1000,30,9/9/2016,10/8/2016,9/23/2016 21:36,,28,college,female
...,...,...,...,...,...,...,...,...,...,...
495,COLLECTION_PAIDOFF,1000,30,9/12/2016,10/11/2016,10/14/2016 19:08,3.0,28,High School or Below,male
496,COLLECTION_PAIDOFF,1000,15,9/12/2016,9/26/2016,10/10/2016 20:02,14.0,26,High School or Below,male
497,COLLECTION_PAIDOFF,800,15,9/12/2016,9/26/2016,9/29/2016 11:49,3.0,30,college,male
498,COLLECTION_PAIDOFF,1000,30,9/12/2016,11/10/2016,11/11/2016 22:40,1.0,38,college,female


In [54]:
# Create date/time columns
for column in ['effective_date', 'due_date', 'paid_off_time']:
    df[column] = pd.to_datetime(df[column])

# df['effective_year'] = df['effective_date'].apply(lambda x: x.year) # single value columns
# df['effective_month'] = df['effective_date'].apply(lambda x: x.month) # single value columns
df['effective_day'] = df['effective_date'].apply(lambda x: x.day)

# df['due_year'] = df['due_date'].apply(lambda x: x.year) # single value columns
df['due_month'] = df['due_date'].apply(lambda x: x.month)
df['due_day'] = df['due_date'].apply(lambda x: x.day)

# df['paid_off_year'] = df['paid_off_time'].apply(lambda x: x.year)
df['paid_off_month'] = df['paid_off_time'].apply(lambda x: x.month)
df['paid_off_day'] = df['paid_off_time'].apply(lambda x: x.day)
df['paid_off_hour'] = df['paid_off_time'].apply(lambda x: x.hour)

df = df.drop(['effective_date', 'due_date', 'paid_off_time'], axis=1)

In [55]:
df

Unnamed: 0,loan_status,Principal,terms,past_due_days,age,education,Gender,effective_day,due_month,due_day,paid_off_month,paid_off_day,paid_off_hour
0,PAIDOFF,1000,30,,45,High School or Below,male,8,10,7,9.0,14.0,19.0
1,PAIDOFF,1000,30,,50,Bechalor,female,8,10,7,10.0,7.0,9.0
2,PAIDOFF,1000,30,,33,Bechalor,female,8,10,7,9.0,25.0,16.0
3,PAIDOFF,1000,15,,27,college,male,8,9,22,9.0,22.0,20.0
4,PAIDOFF,1000,30,,28,college,female,9,10,8,9.0,23.0,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,COLLECTION_PAIDOFF,1000,30,3.0,28,High School or Below,male,12,10,11,10.0,14.0,19.0
496,COLLECTION_PAIDOFF,1000,15,14.0,26,High School or Below,male,12,9,26,10.0,10.0,20.0
497,COLLECTION_PAIDOFF,800,15,3.0,30,college,male,12,9,26,9.0,29.0,11.0
498,COLLECTION_PAIDOFF,1000,30,1.0,38,college,female,12,11,10,11.0,11.0,22.0


In [56]:
df.isna().sum()

loan_status         0
Principal           0
terms               0
past_due_days     300
age                 0
education           0
Gender              0
effective_day       0
due_month           0
due_day             0
paid_off_month    100
paid_off_day      100
paid_off_hour     100
dtype: int64

In [59]:
# Fill missing values with column means
for column in ['past_due_days', 'paid_off_month', 'paid_off_day', 'paid_off_hour']:
    df[column] = df[column].fillna(df[column].mean())

In [60]:
df.isna().sum()

loan_status       0
Principal         0
terms             0
past_due_days     0
age               0
education         0
Gender            0
effective_day     0
due_month         0
due_day           0
paid_off_month    0
paid_off_day      0
paid_off_hour     0
dtype: int64

In [61]:
{column: df[column].unique() for column in df.select_dtypes('object').columns}

{'loan_status': array(['PAIDOFF', 'COLLECTION', 'COLLECTION_PAIDOFF'], dtype=object),
 'education': array(['High School or Below', 'Bechalor', 'college', 'Master or Above'],
       dtype=object),
 'Gender': array(['male', 'female'], dtype=object)}

In [62]:
def binary_encode(df, column, positive_value):
    df = df.copy()
    df[column] = df[column].apply(lambda x: 1 if x == positive_value else 0)
    return df

def ordinal_encode(df, column, ordering):
    df = df.copy()
    df[column] = df[column].apply(lambda x: ordering.index(x))
    return df

In [63]:
# Binary encode the Gender column
df = binary_encode(df, 'Gender', 'male')

education_ordering = [
    'High School or Below',
    'college',
    'Bechalor',
    'Master or Above'
]

# Ordinal encode the education column
df = ordinal_encode(df, 'education', education_ordering)

In [64]:
df

Unnamed: 0,loan_status,Principal,terms,past_due_days,age,education,Gender,effective_day,due_month,due_day,paid_off_month,paid_off_day,paid_off_hour
0,PAIDOFF,1000,30,36.01,45,0,1,8,10,7,9.0,14.0,19.0
1,PAIDOFF,1000,30,36.01,50,2,0,8,10,7,10.0,7.0,9.0
2,PAIDOFF,1000,30,36.01,33,2,0,8,10,7,9.0,25.0,16.0
3,PAIDOFF,1000,15,36.01,27,1,1,8,9,22,9.0,22.0,20.0
4,PAIDOFF,1000,30,36.01,28,1,0,9,10,8,9.0,23.0,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,COLLECTION_PAIDOFF,1000,30,3.00,28,0,1,12,10,11,10.0,14.0,19.0
496,COLLECTION_PAIDOFF,1000,15,14.00,26,0,1,12,9,26,10.0,10.0,20.0
497,COLLECTION_PAIDOFF,800,15,3.00,30,1,1,12,9,26,9.0,29.0,11.0
498,COLLECTION_PAIDOFF,1000,30,1.00,38,1,0,12,11,10,11.0,11.0,22.0


In [65]:
# Encode the label (loan_status) column
label_mapping = {'COLLECTION': 0, 
                 'PAIDOFF': 1, 
                 'COLLECTION_PAIDOFF': 2}

df['loan_status'] = df['loan_status'].replace(label_mapping)

  df['loan_status'] = df['loan_status'].replace(label_mapping)


In [66]:
df

Unnamed: 0,loan_status,Principal,terms,past_due_days,age,education,Gender,effective_day,due_month,due_day,paid_off_month,paid_off_day,paid_off_hour
0,1,1000,30,36.01,45,0,1,8,10,7,9.0,14.0,19.0
1,1,1000,30,36.01,50,2,0,8,10,7,10.0,7.0,9.0
2,1,1000,30,36.01,33,2,0,8,10,7,9.0,25.0,16.0
3,1,1000,15,36.01,27,1,1,8,9,22,9.0,22.0,20.0
4,1,1000,30,36.01,28,1,0,9,10,8,9.0,23.0,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,2,1000,30,3.00,28,0,1,12,10,11,10.0,14.0,19.0
496,2,1000,15,14.00,26,0,1,12,9,26,10.0,10.0,20.0
497,2,800,15,3.00,30,1,1,12,9,26,9.0,29.0,11.0
498,2,1000,30,1.00,38,1,0,12,11,10,11.0,11.0,22.0


In [67]:
# Split df into X and y
y = df['loan_status'].copy()
X = df.drop('loan_status', axis=1).copy()

In [68]:
y

0      1
1      1
2      1
3      1
4      1
      ..
495    2
496    2
497    2
498    2
499    2
Name: loan_status, Length: 500, dtype: int64

In [69]:
X

Unnamed: 0,Principal,terms,past_due_days,age,education,Gender,effective_day,due_month,due_day,paid_off_month,paid_off_day,paid_off_hour
0,1000,30,36.01,45,0,1,8,10,7,9.0,14.0,19.0
1,1000,30,36.01,50,2,0,8,10,7,10.0,7.0,9.0
2,1000,30,36.01,33,2,0,8,10,7,9.0,25.0,16.0
3,1000,15,36.01,27,1,1,8,9,22,9.0,22.0,20.0
4,1000,30,36.01,28,1,0,9,10,8,9.0,23.0,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...
495,1000,30,3.00,28,0,1,12,10,11,10.0,14.0,19.0
496,1000,15,14.00,26,0,1,12,9,26,10.0,10.0,20.0
497,800,15,3.00,30,1,1,12,9,26,9.0,29.0,11.0
498,1000,30,1.00,38,1,0,12,11,10,11.0,11.0,22.0


In [70]:
# Scale X with standard scaler
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [71]:
X

Unnamed: 0,Principal,terms,past_due_days,age,education,Gender,effective_day,due_month,due_day,paid_off_month,paid_off_day,paid_off_hour
0,0.493377,0.897891,0.000000,2.284043,-1.022825,0.426653,-3.126073,0.664986,-1.303142,-1.035098,-0.463997,1.339835
1,0.493377,0.897891,0.000000,3.106587,1.771779,-2.343823,-3.126073,0.664986,-1.303142,0.690066,-1.475829,-1.072109
2,0.493377,0.897891,0.000000,0.309935,1.771779,-2.343823,-3.126073,0.664986,-1.303142,-1.035098,1.126025,0.616252
3,0.493377,-0.978972,0.000000,-0.677119,0.374477,0.426653,-3.126073,-1.094236,0.724148,-1.035098,0.692382,1.581030
4,0.493377,0.897891,0.000000,-0.512610,0.374477,-2.343823,-2.209336,0.664986,-1.167989,-1.035098,0.836930,1.822224
...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.493377,0.897891,-1.780899,-0.512610,-1.022825,0.426653,0.540875,0.664986,-0.762531,0.690066,-0.463997,1.339835
496,0.493377,-0.978972,-1.187446,-0.841628,-1.022825,0.426653,0.540875,-1.094236,1.264758,0.690066,-1.042187,1.581030
497,-1.243866,-0.978972,-1.780899,-0.183592,0.374477,0.426653,0.540875,-1.094236,1.264758,-1.035098,1.704214,-0.589721
498,0.493377,0.897891,-1.888799,1.132480,0.374477,-2.343823,0.540875,2.424209,-0.897684,2.415229,-0.897640,2.063419


In [72]:
{column: len(X[column].unique()) for column in X.columns}

{'Principal': 6,
 'terms': 3,
 'past_due_days': 34,
 'age': 33,
 'education': 4,
 'Gender': 2,
 'effective_day': 7,
 'due_month': 3,
 'due_day': 19,
 'paid_off_month': 5,
 'paid_off_day': 32,
 'paid_off_hour': 24}

In [73]:
y

0      1
1      1
2      1
3      1
4      1
      ..
495    2
496    2
497    2
498    2
499    2
Name: loan_status, Length: 500, dtype: int64

### Training

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=123)

In [76]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((350, 12), (150, 12), (350,), (150,))

In [77]:
models = [
    LogisticRegression(),
    SVC(),
    DecisionTreeClassifier(),
    MLPClassifier(),
    RandomForestClassifier(),
    XGBClassifier()
]

for model in models:
    model.fit(X_train, y_train)



In [78]:
model_names = {
    "   Logistic Regression",
    "Support Vector Machine",
    "         Decision Tree",
    "        Neural Network",
    "         Random Forest",
    "               XGBoost"
}

In [79]:
list(zip(models, model_names))

[(LogisticRegression(), '        Neural Network'),
 (SVC(), '         Decision Tree'),
 (DecisionTreeClassifier(), '   Logistic Regression'),
 (MLPClassifier(), 'Support Vector Machine'),
 (RandomForestClassifier(), '         Random Forest'),
 (XGBClassifier(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=None, device=None, early_stopping_rounds=None,
                enable_categorical=False, eval_metric=None, feature_types=None,
                feature_weights=None, gamma=None, grow_policy=None,
                importance_type=None, interaction_constraints=None,
                learning_rate=None, max_bin=None, max_cat_threshold=None,
                max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
                max_leaves=None, min_child_weight=None, missing=nan,
                monotone_constraints=None, multi_strategy=None, n_estimators=None,
                n_jobs=None, num

In [81]:
for model, name in zip(models, model_names):
    print(name + ": {:.4f}%".format(model.score(X_test, y_test)*100))

        Neural Network: 98.6667%
         Decision Tree: 98.6667%
   Logistic Regression: 100.0000%
Support Vector Machine: 100.0000%
         Random Forest: 100.0000%
               XGBoost: 100.0000%
