# In this notebook we train and evaluate our models

In [9]:
import pandas as pd 
import numpy as np
import random 
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

# Seed both the standard-library and the NumPy global generators.
# random.seed() alone does not affect NumPy, and scikit-learn falls back to
# NumPy's global RNG whenever no explicit random_state is passed.
random.seed(42)
np.random.seed(42)

# Load the processed dataset; the relative path is resolved against the
# kernel's current working directory.
data = pd.read_csv("../data/processed/processed_credit_risk_dataset.csv")

## Split test and train data



In [None]:
# Features are every column except the target; the target is 'loan_status'.
X = data.drop(columns='loan_status').to_numpy()
y = data['loan_status'].to_numpy()

# Hold out 25% of the rows for testing. Stratifying on y keeps the class
# proportions of the target labels the same in both splits, which matters
# because the target is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

## Define metrics and scoring

In [55]:
def score_model(model, X_test, y_test):
    """Print the confusion matrix, precision, recall and F-score of a fitted classifier.

    Predictions are obtained with ``model.predict(X_test)`` and compared
    against ``y_test``. Precision, recall and F-score are computed for the
    positive class (label 1) only.

    Parameters:
         model: fitted model exposing a ``predict`` method
         X_test (numpy array): features to predict on
         y_test (numpy array): true labels matching ``X_test``

    Returns:
         None. The results are only printed.
    """
    y_pred = model.predict(X_test)

    # The fourth element of the result (support) is not reported, so it is discarded.
    precision, recall, fscore, _ = precision_recall_fscore_support(
        y_test, y_pred, pos_label=1, average='binary'
    )

    print('Confusion matrix\n', confusion_matrix(y_test, y_pred))
    print('Precision is {:0.2f} %'.format(precision * 100))
    print('Recall is {:0.2f} %'.format(recall * 100))
    print('Fscore is {:0.2f} %'.format(fscore * 100))


## Models

### Decision Tree Classifier

In [56]:
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier

# Build a depth-limited decision tree (fixed random_state for reproducible
# results) and fit it on the training split; fit() returns the estimator itself.
model_DTC = DecisionTreeClassifier(
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)

# Report confusion matrix, precision, recall and F-score on the held-out test split.
score_model(model_DTC, X_test, y_test)


Confusion matrix)
 [[6329   40]
 [ 475 1302]]
Precision is 97.02 %
Recall is 73.27 %
Fscore is 83.49 %


### Logistic Regression

### KNN