In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier 

In [None]:
# Load the raw loan-payments table from CSV into a DataFrame.
# NOTE(review): this is a hardcoded absolute path (looks like a Colab
# session directory) — adjust it when running the notebook elsewhere.
data = pd.read_csv('/content/Loan payments data.csv')

In [None]:
# Display the loaded frame for a first look at its columns and values.
data

In [None]:
# Summarise column dtypes and non-null counts of the raw frame.
data.info()

Preprocessing

In [None]:
# Count missing values per column of the raw frame.
data.isna().sum()

In [None]:
# List the distinct labels found in the target column `loan_status`.
data['loan_status'].unique()

array(['PAIDOFF', 'COLLECTION', 'COLLECTION_PAIDOFF'], dtype=object)

In [None]:
# Number of distinct values per column; missing values count as one value.
data.nunique(dropna=False).to_dict()

In [None]:
# encoding
def binary_encode(df, column, positive_value):
  """Return a copy of ``df`` with ``column`` recoded as 0/1 flags.

  Entries equal to ``positive_value`` become 1; every other entry becomes 0.
  The frame passed in is left unmodified.
  """
  encoded = df.copy()
  encoded[column] = [1 if entry == positive_value else 0 for entry in encoded[column]]
  return encoded


def ordinal_encode(df, column, ordering):
    """Return a copy of ``df`` with ``column`` replaced by ordinal ranks.

    Each value is replaced by its zero-based position in ``ordering``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is left unmodified.
    column : str
        Name of the column to encode.
    ordering : iterable
        Categories listed from lowest to highest rank.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose ``column`` holds integer positions.

    Raises
    ------
    ValueError
        If ``column`` contains a value (including a missing value) that is
        not listed in ``ordering``.
    """
    df = df.copy()
    # Materialise once so any iterable (tuple, generator, ...) is accepted.
    ordering = list(ordering)

    # Validate up front: a bare ``list.index`` failure only says
    # "'x' is not in list", without naming the column or the full set of
    # offending values.
    unexpected = [value for value in df[column].unique() if value not in ordering]
    if unexpected:
        raise ValueError(
            "Column {!r} contains values missing from ordering: {!r}".format(column, unexpected)
        )

    df[column] = df[column].apply(lambda value: ordering.index(value))
    return df

In [None]:
def preprocess_inputs(df):
  """Turn the raw loan table into a scaled feature frame and an encoded target.

  Steps, in order: drop ``Loan_ID``; parse the three date columns and keep
  selected calendar parts; mean-impute the paid-off parts and
  ``past_due_days``; encode ``Gender`` (``'male'`` -> 1, anything else -> 0)
  and ``education`` (ordinal rank); map ``loan_status`` to integer codes;
  standard-scale every feature column.

  Parameters
  ----------
  df : pandas.DataFrame
      Raw data with at least the columns ``Loan_ID``, ``loan_status``,
      ``effective_date``, ``due_date``, ``paid_off_time``, ``past_due_days``,
      ``Gender`` and ``education``. All remaining columns are handed to the
      scaler unchanged. The frame itself is not modified.

  Returns
  -------
  X : pandas.DataFrame
      Standard-scaled features, carrying the same index as ``df``.
  y : pandas.Series
      ``loan_status`` encoded as COLLECTION=0, PAIDOFF=1,
      COLLECTION_PAIDOFF=2, carrying the same index as ``df``.

  Raises
  ------
  ValueError
      If ``loan_status`` or ``education`` contains a value outside the
      expected categories.
  """
  df = df.copy()

  # Loan_ID is dropped rather than used as a feature.
  df = df.drop('Loan_ID', axis=1)

  # Parse the date/time columns so their calendar parts can be extracted.
  date_columns = ['effective_date', 'due_date', 'paid_off_time']
  for column in date_columns:
    df[column] = pd.to_datetime(df[column])

  # Keep only selected calendar parts. The vectorised `.dt` accessor yields
  # NaN for missing timestamps; those are filled by the imputation below.
  df['effective_year'] = df['effective_date'].dt.year
  df['due_month'] = df['due_date'].dt.month
  df['due_day'] = df['due_date'].dt.day
  df['paid_off_month'] = df['paid_off_time'].dt.month
  df['paid_off_day'] = df['paid_off_time'].dt.day
  df['paid_off_hour'] = df['paid_off_time'].dt.hour

  # The raw date/time columns are no longer needed once the parts exist.
  df = df.drop(date_columns, axis=1)

  # Fill missing values with the column mean.
  # NOTE(review): missingness in the paid-off fields and in past_due_days may
  # itself track the loan outcome, which would leak the target into X and
  # could explain the near-perfect scores — confirm before trusting metrics.
  for column in ['paid_off_month', 'paid_off_day', 'paid_off_hour', 'past_due_days']:
    df[column] = df[column].fillna(df[column].mean())

  # Binary-encode the Gender column.
  df = binary_encode(df, 'Gender', positive_value='male')

  # Ordinal-encode the education column.
  # NOTE(review): the 'Bechalor' spelling presumably mirrors the raw data —
  # verify against the CSV before "correcting" it.
  education_ordering = ['High School or Below', 'college', 'Bechalor', 'Master or Above']
  df = ordinal_encode(df, 'education', ordering=education_ordering)

  # Encode the target. `.map` returns an integer column directly instead of
  # relying on the deprecated silent downcasting of `Series.replace`; labels
  # outside the mapping are rejected rather than passed through unencoded.
  label_mapping = {'COLLECTION': 0, 'PAIDOFF': 1, 'COLLECTION_PAIDOFF': 2}
  unknown_labels = [label for label in df['loan_status'].unique() if label not in label_mapping]
  if unknown_labels:
    raise ValueError("Unexpected loan_status values: {!r}".format(unknown_labels))
  df['loan_status'] = df['loan_status'].map(label_mapping)

  # Separate the target from the features.
  y = df['loan_status'].copy()
  X = df.drop('loan_status', axis=1).copy()

  # Standard-scale the features. Passing `index=X.index` keeps X aligned
  # with y; without it X silently falls back to a fresh 0..n-1 index.
  # NOTE(review): the scaler is fitted on every row passed in, i.e. before
  # any train/test split performed by the caller.
  scaler = StandardScaler()
  X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
  return X, y

In [None]:
# Build the scaled feature frame and the encoded target from the raw data,
# then display the features.
X, y = preprocess_inputs(data)
X

In [None]:
# Count missing values per feature column after preprocessing.
X.isna().sum()

In [None]:
# Distinct values of any feature column still stored with object dtype.
object_features = X.select_dtypes('object')
{name: list(values.unique()) for name, values in object_features.items()}

In [None]:
# Display the encoded target returned by preprocess_inputs.
y

In [None]:
# Number of distinct values per feature column; missing values count as one value.
X.nunique(dropna=False).to_dict()

Training


In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# partition repeatable.
# NOTE(review): no `stratify=y` is passed, so the class proportions in the
# two partitions are left to the random shuffle.
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.7, random_state=123)

In [None]:
# Seed for the estimators that draw random numbers while fitting; it is the
# same value the train/test split uses, so a Restart & Run All reproduces
# the printed scores.
SEED = 123

# One estimator per family, otherwise left at library defaults.
# LogisticRegression and SVC are not seeded: with the default arguments used
# here their fits do not depend on a random state.
models = [
    LogisticRegression(),
    SVC(),
    DecisionTreeClassifier(random_state=SEED),
    MLPClassifier(random_state=SEED),
    RandomForestClassifier(random_state=SEED),
    XGBClassifier(random_state=SEED),
]

# Fit every estimator on the training partition.
for model in models:
  model.fit(X_train, y_train)

# Display labels, in the same order as `models`.
model_names = ["Logistic Regression", "SVC", "Decision Tree Classifier", "Nueral Networks", "Random Forest Classifier", "XG Boost"]

# Report mean accuracy on the held-out partition as a percentage.
for model, name in zip(models, model_names):
  print(name + ": {:.4f}%".format(model.score(X_test, y_test) * 100))



Logistic Regression: 98.6667%
SVC: 99.3333%
Decision Tree Classifier: 100.0000%
Nueral Networks: 100.0000%
Random Forest Classifier: 100.0000%
XG Boost: 100.0000%


In [None]:
# Pair each fitted estimator with its display label.
list(zip(models, model_names))