## Loan Payment Prediction

Given *data about loans*, let's try to predict whether a given loan will be **paid off** or not.

We will use six different models to make our predictions.

Data source: https://www.kaggle.com/datasets/zhijinzhai/loandata

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
data = pd.read_csv('Loan payments data.csv')
data

Unnamed: 0,Loan_ID,loan_status,Principal,terms,effective_date,due_date,paid_off_time,past_due_days,age,education,Gender
0,xqd20166231,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/14/2016 19:31,,45,High School or Below,male
1,xqd20168902,PAIDOFF,1000,30,9/8/2016,10/7/2016,10/7/2016 9:00,,50,Bechalor,female
2,xqd20160003,PAIDOFF,1000,30,9/8/2016,10/7/2016,9/25/2016 16:58,,33,Bechalor,female
3,xqd20160004,PAIDOFF,1000,15,9/8/2016,9/22/2016,9/22/2016 20:00,,27,college,male
4,xqd20160005,PAIDOFF,1000,30,9/9/2016,10/8/2016,9/23/2016 21:36,,28,college,female
...,...,...,...,...,...,...,...,...,...,...,...
495,xqd20160496,COLLECTION_PAIDOFF,1000,30,9/12/2016,10/11/2016,10/14/2016 19:08,3.0,28,High School or Below,male
496,xqd20160497,COLLECTION_PAIDOFF,1000,15,9/12/2016,9/26/2016,10/10/2016 20:02,14.0,26,High School or Below,male
497,xqd20160498,COLLECTION_PAIDOFF,800,15,9/12/2016,9/26/2016,9/29/2016 11:49,3.0,30,college,male
498,xqd20160499,COLLECTION_PAIDOFF,1000,30,9/12/2016,11/10/2016,11/11/2016 22:40,1.0,38,college,female


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Loan_ID         500 non-null    object 
 1   loan_status     500 non-null    object 
 2   Principal       500 non-null    int64  
 3   terms           500 non-null    int64  
 4   effective_date  500 non-null    object 
 5   due_date        500 non-null    object 
 6   paid_off_time   400 non-null    object 
 7   past_due_days   200 non-null    float64
 8   age             500 non-null    int64  
 9   education       500 non-null    object 
 10  Gender          500 non-null    object 
dtypes: float64(1), int64(3), object(7)
memory usage: 43.1+ KB


### Preprocessing

In [4]:
data.isna().sum()

Loan_ID             0
loan_status         0
Principal           0
terms               0
effective_date      0
due_date            0
paid_off_time     100
past_due_days     300
age                 0
education           0
Gender              0
dtype: int64

In [5]:
data['loan_status'].unique()

array(['PAIDOFF', 'COLLECTION', 'COLLECTION_PAIDOFF'], dtype=object)

In [6]:
{column: len(data[column].unique()) for column in data.columns}

{'Loan_ID': 500,
 'loan_status': 3,
 'Principal': 6,
 'terms': 3,
 'effective_date': 7,
 'due_date': 25,
 'paid_off_time': 321,
 'past_due_days': 34,
 'age': 33,
 'education': 4,
 'Gender': 2}

In [7]:
# Drop Loan_ID column
data = data.drop('Loan_ID', axis=1)

In [8]:
# Create date/time columns
for column in ['effective_date', 'due_date', 'paid_off_time']:
    data[column] = pd.to_datetime(data[column])

data['effective_year'] = data['effective_date'].apply(lambda x: x.year)
data['effective_month'] = data['effective_date'].apply(lambda x: x.month)
data['effective_day'] = data['effective_date'].apply(lambda x: x.day)

In [9]:
data['due_year'] = data['due_date'].apply(lambda x: x.year)
data['due_month'] = data['due_date'].apply(lambda x: x.month)
data['due_day'] = data['due_date'].apply(lambda x: x.day)

In [10]:
data

Unnamed: 0,loan_status,Principal,terms,effective_date,due_date,paid_off_time,past_due_days,age,education,Gender,effective_year,effective_month,effective_day,due_year,due_month,due_day
0,PAIDOFF,1000,30,2016-09-08,2016-10-07,2016-09-14 19:31:00,,45,High School or Below,male,2016,9,8,2016,10,7
1,PAIDOFF,1000,30,2016-09-08,2016-10-07,2016-10-07 09:00:00,,50,Bechalor,female,2016,9,8,2016,10,7
2,PAIDOFF,1000,30,2016-09-08,2016-10-07,2016-09-25 16:58:00,,33,Bechalor,female,2016,9,8,2016,10,7
3,PAIDOFF,1000,15,2016-09-08,2016-09-22,2016-09-22 20:00:00,,27,college,male,2016,9,8,2016,9,22
4,PAIDOFF,1000,30,2016-09-09,2016-10-08,2016-09-23 21:36:00,,28,college,female,2016,9,9,2016,10,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,COLLECTION_PAIDOFF,1000,30,2016-09-12,2016-10-11,2016-10-14 19:08:00,3.0,28,High School or Below,male,2016,9,12,2016,10,11
496,COLLECTION_PAIDOFF,1000,15,2016-09-12,2016-09-26,2016-10-10 20:02:00,14.0,26,High School or Below,male,2016,9,12,2016,9,26
497,COLLECTION_PAIDOFF,800,15,2016-09-12,2016-09-26,2016-09-29 11:49:00,3.0,30,college,male,2016,9,12,2016,9,26
498,COLLECTION_PAIDOFF,1000,30,2016-09-12,2016-11-10,2016-11-11 22:40:00,1.0,38,college,female,2016,9,12,2016,11,10


In [11]:
data['paid_off_year'] = data['paid_off_time'].apply(lambda x: x.year)
data['paid_off_month'] = data['paid_off_time'].apply(lambda x: x.month)
data['paid_off_day'] = data['paid_off_time'].apply(lambda x: x.day)
data['paid_off_hour'] = data['paid_off_time'].apply(lambda x: x.hour)

In [12]:
data = data.drop(['effective_date', 'due_date', 'paid_off_time'], axis=1)
data

Unnamed: 0,loan_status,Principal,terms,past_due_days,age,education,Gender,effective_year,effective_month,effective_day,due_year,due_month,due_day,paid_off_year,paid_off_month,paid_off_day,paid_off_hour
0,PAIDOFF,1000,30,,45,High School or Below,male,2016,9,8,2016,10,7,2016.0,9.0,14.0,19.0
1,PAIDOFF,1000,30,,50,Bechalor,female,2016,9,8,2016,10,7,2016.0,10.0,7.0,9.0
2,PAIDOFF,1000,30,,33,Bechalor,female,2016,9,8,2016,10,7,2016.0,9.0,25.0,16.0
3,PAIDOFF,1000,15,,27,college,male,2016,9,8,2016,9,22,2016.0,9.0,22.0,20.0
4,PAIDOFF,1000,30,,28,college,female,2016,9,9,2016,10,8,2016.0,9.0,23.0,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,COLLECTION_PAIDOFF,1000,30,3.0,28,High School or Below,male,2016,9,12,2016,10,11,2016.0,10.0,14.0,19.0
496,COLLECTION_PAIDOFF,1000,15,14.0,26,High School or Below,male,2016,9,12,2016,9,26,2016.0,10.0,10.0,20.0
497,COLLECTION_PAIDOFF,800,15,3.0,30,college,male,2016,9,12,2016,9,26,2016.0,9.0,29.0,11.0
498,COLLECTION_PAIDOFF,1000,30,1.0,38,college,female,2016,9,12,2016,11,10,2016.0,11.0,11.0,22.0


In [13]:
{column: len(data[column].unique()) for column in data.columns}

{'loan_status': 3,
 'Principal': 6,
 'terms': 3,
 'past_due_days': 34,
 'age': 33,
 'education': 4,
 'Gender': 2,
 'effective_year': 1,
 'effective_month': 1,
 'effective_day': 7,
 'due_year': 1,
 'due_month': 3,
 'due_day': 19,
 'paid_off_year': 2,
 'paid_off_month': 5,
 'paid_off_day': 32,
 'paid_off_hour': 24}

In [14]:
data.isna().sum()

loan_status          0
Principal            0
terms                0
past_due_days      300
age                  0
education            0
Gender               0
effective_year       0
effective_month      0
effective_day        0
due_year             0
due_month            0
due_day              0
paid_off_year      100
paid_off_month     100
paid_off_day       100
paid_off_hour      100
dtype: int64

In [15]:
# Fill missing values with column means
for column in ['past_due_days', 'paid_off_year', 'paid_off_month', 'paid_off_day', 'paid_off_hour']:
    data[column] = data[column].fillna(data[column].mean())

In [16]:
data.isna().sum().sum()

0

In [17]:
{column: list(data[column].unique()) for column in data.select_dtypes('object').columns}

{'loan_status': ['PAIDOFF', 'COLLECTION', 'COLLECTION_PAIDOFF'],
 'education': ['High School or Below',
  'Bechalor',
  'college',
  'Master or Above'],
 'Gender': ['male', 'female']}

In [18]:
def binary_encode(df, column, positive_value):
    df = df.copy()
    df[column] = df[column].apply(lambda x: 1 if x == positive_value else 0)
    return df

def ordinal_encode(df, column, ordering):
    df = df.copy()
    df[column] = df[column].apply(lambda x: ordering.index(x))
    return df

In [19]:
# Binary encode the Gender column
data = binary_encode(data, 'Gender', positive_value='male')

# Ordinal encode the education column
data = ordinal_encode(data, 'education', 
    [
        'High School or Below',
        'college',
        'Bechalor',
        'Master or Above'
    ]
)

In [20]:
data

Unnamed: 0,loan_status,Principal,terms,past_due_days,age,education,Gender,effective_year,effective_month,effective_day,due_year,due_month,due_day,paid_off_year,paid_off_month,paid_off_day,paid_off_hour
0,PAIDOFF,1000,30,36.01,45,0,1,2016,9,8,2016,10,7,2016.0,9.0,14.0,19.0
1,PAIDOFF,1000,30,36.01,50,2,0,2016,9,8,2016,10,7,2016.0,10.0,7.0,9.0
2,PAIDOFF,1000,30,36.01,33,2,0,2016,9,8,2016,10,7,2016.0,9.0,25.0,16.0
3,PAIDOFF,1000,15,36.01,27,1,1,2016,9,8,2016,9,22,2016.0,9.0,22.0,20.0
4,PAIDOFF,1000,30,36.01,28,1,0,2016,9,9,2016,10,8,2016.0,9.0,23.0,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,COLLECTION_PAIDOFF,1000,30,3.00,28,0,1,2016,9,12,2016,10,11,2016.0,10.0,14.0,19.0
496,COLLECTION_PAIDOFF,1000,15,14.00,26,0,1,2016,9,12,2016,9,26,2016.0,10.0,10.0,20.0
497,COLLECTION_PAIDOFF,800,15,3.00,30,1,1,2016,9,12,2016,9,26,2016.0,9.0,29.0,11.0
498,COLLECTION_PAIDOFF,1000,30,1.00,38,1,0,2016,9,12,2016,11,10,2016.0,11.0,11.0,22.0


In [21]:
# Encode the label (loan_status) column
label_mapping = {'COLLECTION': 0, 'PAIDOFF': 1, 'COLLECTION_PAIDOFF': 2}

data['loan_status'] = data['loan_status'].replace(label_mapping)

data

  data['loan_status'] = data['loan_status'].replace(label_mapping)


Unnamed: 0,loan_status,Principal,terms,past_due_days,age,education,Gender,effective_year,effective_month,effective_day,due_year,due_month,due_day,paid_off_year,paid_off_month,paid_off_day,paid_off_hour
0,1,1000,30,36.01,45,0,1,2016,9,8,2016,10,7,2016.0,9.0,14.0,19.0
1,1,1000,30,36.01,50,2,0,2016,9,8,2016,10,7,2016.0,10.0,7.0,9.0
2,1,1000,30,36.01,33,2,0,2016,9,8,2016,10,7,2016.0,9.0,25.0,16.0
3,1,1000,15,36.01,27,1,1,2016,9,8,2016,9,22,2016.0,9.0,22.0,20.0
4,1,1000,30,36.01,28,1,0,2016,9,9,2016,10,8,2016.0,9.0,23.0,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,2,1000,30,3.00,28,0,1,2016,9,12,2016,10,11,2016.0,10.0,14.0,19.0
496,2,1000,15,14.00,26,0,1,2016,9,12,2016,9,26,2016.0,10.0,10.0,20.0
497,2,800,15,3.00,30,1,1,2016,9,12,2016,9,26,2016.0,9.0,29.0,11.0
498,2,1000,30,1.00,38,1,0,2016,9,12,2016,11,10,2016.0,11.0,11.0,22.0


#### Splitting and Scaling

In [22]:
# Split df into X and y
y = data['loan_status'].copy()
X = data.drop('loan_status', axis=1).copy()

In [23]:
y

0      1
1      1
2      1
3      1
4      1
      ..
495    2
496    2
497    2
498    2
499    2
Name: loan_status, Length: 500, dtype: int64

In [24]:
X

Unnamed: 0,Principal,terms,past_due_days,age,education,Gender,effective_year,effective_month,effective_day,due_year,due_month,due_day,paid_off_year,paid_off_month,paid_off_day,paid_off_hour
0,1000,30,36.01,45,0,1,2016,9,8,2016,10,7,2016.0,9.0,14.0,19.0
1,1000,30,36.01,50,2,0,2016,9,8,2016,10,7,2016.0,10.0,7.0,9.0
2,1000,30,36.01,33,2,0,2016,9,8,2016,10,7,2016.0,9.0,25.0,16.0
3,1000,15,36.01,27,1,1,2016,9,8,2016,9,22,2016.0,9.0,22.0,20.0
4,1000,30,36.01,28,1,0,2016,9,9,2016,10,8,2016.0,9.0,23.0,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,1000,30,3.00,28,0,1,2016,9,12,2016,10,11,2016.0,10.0,14.0,19.0
496,1000,15,14.00,26,0,1,2016,9,12,2016,9,26,2016.0,10.0,10.0,20.0
497,800,15,3.00,30,1,1,2016,9,12,2016,9,26,2016.0,9.0,29.0,11.0
498,1000,30,1.00,38,1,0,2016,9,12,2016,11,10,2016.0,11.0,11.0,22.0


In [25]:
# Scale X with a Standard Scaler

scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [26]:
{column: len(X[column].unique()) for column in X.columns}

{'Principal': 6,
 'terms': 3,
 'past_due_days': 34,
 'age': 33,
 'education': 4,
 'Gender': 2,
 'effective_year': 1,
 'effective_month': 1,
 'effective_day': 7,
 'due_year': 1,
 'due_month': 3,
 'due_day': 19,
 'paid_off_year': 1,
 'paid_off_month': 5,
 'paid_off_day': 32,
 'paid_off_hour': 24}

In [27]:
X = X.drop(['effective_year', 'effective_month', 'due_year', 'paid_off_year'], axis=1) # single value columns

In [28]:
X

Unnamed: 0,Principal,terms,past_due_days,age,education,Gender,effective_day,due_month,due_day,paid_off_month,paid_off_day,paid_off_hour
0,0.493377,0.897891,0.000000,2.284043,-1.022825,0.426653,-3.126073,0.664986,-1.303142,-1.035098,-0.463997,1.339835
1,0.493377,0.897891,0.000000,3.106587,1.771779,-2.343823,-3.126073,0.664986,-1.303142,0.690066,-1.475829,-1.072109
2,0.493377,0.897891,0.000000,0.309935,1.771779,-2.343823,-3.126073,0.664986,-1.303142,-1.035098,1.126025,0.616252
3,0.493377,-0.978972,0.000000,-0.677119,0.374477,0.426653,-3.126073,-1.094236,0.724148,-1.035098,0.692382,1.581030
4,0.493377,0.897891,0.000000,-0.512610,0.374477,-2.343823,-2.209336,0.664986,-1.167989,-1.035098,0.836930,1.822224
...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.493377,0.897891,-1.780899,-0.512610,-1.022825,0.426653,0.540875,0.664986,-0.762531,0.690066,-0.463997,1.339835
496,0.493377,-0.978972,-1.187446,-0.841628,-1.022825,0.426653,0.540875,-1.094236,1.264758,0.690066,-1.042187,1.581030
497,-1.243866,-0.978972,-1.780899,-0.183592,0.374477,0.426653,0.540875,-1.094236,1.264758,-1.035098,1.704214,-0.589721
498,0.493377,0.897891,-1.888799,1.132480,0.374477,-2.343823,0.540875,2.424209,-0.897684,2.415229,-0.897640,2.063419


In [30]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)

### Training

In [31]:
models = [
    LogisticRegression(),
    SVC(),
    DecisionTreeClassifier(),
    MLPClassifier(),
    RandomForestClassifier(),
    XGBClassifier()
]

In [32]:
for model in models:
    model.fit(X_train, y_train)



In [33]:
model_names = {
    "   Logistic Regression",
    "Support Vector Machine",
    "         Decision Tree",
    "        Neural Network",
    "         Random Forest",
    "               XGBoost"
}

In [35]:
for model, name in zip(models, model_names):
    print(name + ": {:.4f}%".format(model.score(X_test, y_test)*100))

   Logistic Regression: 98.6667%
Support Vector Machine: 98.6667%
         Random Forest: 98.6667%
        Neural Network: 100.0000%
               XGBoost: 100.0000%
         Decision Tree: 100.0000%
