# ML Task 1

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_curve, precision_score, recall_score
from sklearn.metrics import roc_auc_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

In [2]:
def preprocess_data(X,y,training=True,drop_columns=[]):
    global book

    if drop_columns:
        X = X.drop(columns=drop_columns)
    
    cat_columns = X.select_dtypes(['object']).columns
    num_columns = list(set(X.columns)-set(cat_columns))

    if training:
        for col in cat_columns:
            book[f'{col}_label_encode'] = LabelEncoder().fit(X[col].values)
            X[col] = book[f'{col}_label_encode'].transform(X[col].values)
            
        book[f'min_max_scaler'] = MinMaxScaler().fit(X[num_columns])
        X[num_columns] = book[f'min_max_scaler'].transform(X[num_columns])
    
        book[f'y_label_encode'] = LabelEncoder().fit(y)
        y = book[f'y_label_encode'].transform(y) 
    else:
        for col in cat_columns:
            X[col] = book[f'{col}_label_encode'].transform(X[col].values)

        book[f'y_label_encode'] = LabelEncoder().fit(y)
        y = book[f'y_label_encode'].transform(y) 

    return X,y

def train_model(X,y,model_type=None):
    """
    Choose from below
    model_type : ['svm','logistic','decision_tree']
    """
    match model_type:
        case 'logistic':
            model = LogisticRegression()
        case 'decision_tree':
            model = DecisionTreeClassifier()
        case _: 
            model = SVC(probability=True)

    model.fit(X,y)
    return model

def model_scoring(model,X):
    return model.predict_proba(X)[:,1]

def evaluate_model(model,X,y,data_group,cutoff=0.5):
    """
    data_group: ['train','test','evaluate'] just give a name
    """
    y_true = y
    y_score = model_scoring(model,X)
    y_predict = [1 if prob>cutoff else 0 for prob in y_score]

    return {
        'model_name':type(model),
        'data_group':data_group,
        'roc_auc_score':roc_auc_score(y_true,y_predict),
        'classification_report':classification_report(y_true,y_predict), 
        'accuracy':accuracy_score(y_true,y_predict), 
        'precision_recall_curve':precision_recall_curve(y_true,y_score),
        'precision_score':precision_score(y_true,y_predict),
        'recall_score':recall_score(y_true,y_predict),
        'f1_score':f1_score(y_true,y_predict)
    }

def metric_to_df(metric):
    metric_df = pd.DataFrame(data = [[metric['model_name'],
                                     metric['data_group'],
                                     metric['roc_auc_score'],
                                     metric['accuracy'],
                                     metric['precision_score'],
                                     metric['recall_score'],
                                     metric['f1_score']]
                                    ],
                             columns=['model_name','data_group','roc_score','accuracy',
                                      'precision_score','recall_score',
                                      'f1_score'],)
    return metric_df

In [None]:
dataset = '../4.EDA with Pandas/loan_predication/loan_predication.csv'

# Read file
loan = pd.read_csv(dataset)
loan = loan.dropna(how='any')

# Create holder
book = {}
all_metrics = pd.DataFrame(columns=['model_name','data_group','roc_score','accuracy',
                                      'precision_score','recall_score',
                                      'f1_score'])

# Split data
X_train, X_test, y_train, y_test = train_test_split(loan.drop(columns=['Gender']), 
                                                    loan['Gender'], test_size=0.33, random_state=42,
                                                    stratify=loan['Gender'])

In [5]:
loan.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y


In [6]:
# Preprocess data
drop_col = ['Loan_ID']
X_train_process, y_train_process = preprocess_data(X_train,y_train,drop_columns=drop_col)
X_test_process, y_test_process = preprocess_data(X_test,y_test,training=False,drop_columns=drop_col)

In [8]:
# train SVM model
model = train_model(X_train_process,y_train_process,model_type='SVM')

####### Train Set #######
# score model
score = model_scoring(model,X_train_process)
# get_metrics
metrics = evaluate_model(model,X_train_process,y_train_process,data_group='train')
metrics_df = metric_to_df(metrics)
all_metrics = pd.concat([all_metrics,metrics_df])

####### Test Set #######
# score model
score = model_scoring(model,X_test_process)
# get_metrics
metrics = evaluate_model(model,X_test_process,y_test_process,data_group='test')
metrics_df = metric_to_df(metrics)
all_metrics = pd.concat([all_metrics,metrics_df])

print('SVM model')
print(metrics['classification_report'])

SVM model
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        28
           1       0.82      1.00      0.90       131

    accuracy                           0.82       159
   macro avg       0.41      0.50      0.45       159
weighted avg       0.68      0.82      0.74       159



In [9]:
# train Decision Tree model
model = train_model(X_train_process,y_train_process,model_type='decision_tree')

####### Train Set #######
# score model
score = model_scoring(model,X_train_process)
# get_metrics
metrics = evaluate_model(model,X_train_process,y_train_process,data_group='train')
metrics_df = metric_to_df(metrics)
all_metrics = pd.concat([all_metrics,metrics_df])

####### Test Set #######
# score model
score = model_scoring(model,X_test_process)
# get_metrics
metrics = evaluate_model(model,X_test_process,y_test_process,data_group='test')
metrics_df = metric_to_df(metrics)
all_metrics = pd.concat([all_metrics,metrics_df])

print('Decision Tree model')
print(metrics['classification_report'])

Decision Tree model
              precision    recall  f1-score   support

           0       0.32      0.71      0.44        28
           1       0.92      0.68      0.78       131

    accuracy                           0.69       159
   macro avg       0.62      0.70      0.61       159
weighted avg       0.81      0.69      0.72       159



In [10]:
# train Logistic Regression model
model = train_model(X_train_process,y_train_process,model_type='logistic')

####### Train Set #######
# score model
score = model_scoring(model,X_train_process)
# get_metrics
metrics = evaluate_model(model,X_train_process,y_train_process,data_group='train')
metrics_df = metric_to_df(metrics)
all_metrics = pd.concat([all_metrics,metrics_df])

####### Test Set #######
# score model
score = model_scoring(model,X_test_process)
# get_metrics
metrics = evaluate_model(model,X_test_process,y_test_process,data_group='test')
metrics_df = metric_to_df(metrics)
all_metrics = pd.concat([all_metrics,metrics_df])

print('Logistic Regression model')
print(metrics['classification_report'])

Logistic Regression model
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        28
           1       0.82      1.00      0.90       131

    accuracy                           0.82       159
   macro avg       0.41      0.50      0.45       159
weighted avg       0.68      0.82      0.74       159



In [11]:
all_metrics

Unnamed: 0,model_name,data_group,roc_score,accuracy,precision_score,recall_score,f1_score
0,<class 'sklearn.svm._classes.SVC'>,train,0.556543,0.834891,0.836538,0.992395,0.907826
0,<class 'sklearn.svm._classes.SVC'>,test,0.5,0.823899,0.823899,1.0,0.903448
0,<class 'sklearn.tree._classes.DecisionTreeClas...,train,0.998099,0.996885,1.0,0.996198,0.998095
0,<class 'sklearn.tree._classes.DecisionTreeClas...,test,0.696838,0.685535,0.917526,0.679389,0.780702
0,<class 'sklearn.linear_model._logistic.Logisti...,train,0.511538,0.816199,0.822785,0.988593,0.8981
0,<class 'sklearn.linear_model._logistic.Logisti...,test,0.5,0.823899,0.823899,1.0,0.903448
