# ML Task 2

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_curve, precision_score, recall_score
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

In [28]:
def preprocess_data(X,training=True,drop_columns=[]):
    global book

    if drop_columns:
        X = X.drop(columns=drop_columns)
    
    cat_columns = X.select_dtypes(['object']).columns
    num_columns = list(set(X.columns)-set(cat_columns))

    if training:
        for col in cat_columns:
            book[f'{col}_label_encode'] = LabelEncoder().fit(X[col].values)
            X[col] = book[f'{col}_label_encode'].transform(X[col].values)
            
        book[f'min_max_scaler'] = MinMaxScaler().fit(X[num_columns])
        X[num_columns] = book[f'min_max_scaler'].transform(X[num_columns])
    else:
        for col in cat_columns:
            X[col] = book[f'{col}_label_encode'].transform(X[col].values)

    return X

def train_model(X,y,model_type=None):
    """
    Choose from below
    model_type : ['linear_regression','random_forest']
    """
    match model_type:
        case 'linear_regression':
            model = LogisticRegression()
        case _: 
            model = RandomForestRegressor()

    model.fit(X,y)
    return model

def model_scoring(model,X):
    return model.predict(X)

def evaluate_model(model,X,y,data_group,cutoff=0.5):
    """
    data_group: ['train','test','evaluate'] just give a name
    """
    y_true = y
    y_score = model_scoring(model,X)
    y_predict = [1 if prob>cutoff else 0 for prob in y_score]

    return {
        'model_name':type(model),
        'data_group':data_group,
        'r2_score':r2_score(y_true,y_predict),
        'MAE':mean_absolute_error(y_true,y_predict), 
        'MSE':mean_squared_error(y_true,y_predict), 
        'RMSE':root_mean_squared_error(y_true,y_score)
    }

def metric_to_df(metric):
    metric_df = pd.DataFrame(data = [[metric['model_name'],
                                     metric['data_group'],
                                     metric['r2_score'],
                                     metric['MAE'],
                                     metric['MSE'],
                                     metric['RMSE']]],
                             columns=['model_name','data_group','r2_score','MAE',
                                      'MSE','RMSE'],)
    return metric_df

In [29]:
dataset = '../4.EDA with Pandas/loan_predication/loan_predication.csv'

# Read file
loan = pd.read_csv(dataset)
loan = loan.dropna(how='any')

# Create holder
book = {}
all_metrics = pd.DataFrame(columns=['model_name','data_group','r2_score','MAE',
                                      'MSE','RMSE'])

# Split data
train_col = 'ApplicantIncome'
X_train, X_test, y_train, y_test = train_test_split(loan.drop(columns=[train_col]), 
                                                    loan[train_col], test_size=0.33, random_state=42)

In [30]:
# Preprocess data
drop_col = ['Loan_ID']
X_train_process = preprocess_data(X_train,drop_columns=drop_col)
X_test_process = preprocess_data(X_test,training=False,drop_columns=drop_col)

In [31]:
# train Linear regression model
model = train_model(X_train_process,y_train,model_type='linear_regression')

####### Train Set #######
# score model
score = model_scoring(model,X_train_process)
# get_metrics
metrics = evaluate_model(model,X_train_process,y_train,data_group='train')
metrics_df = metric_to_df(metrics)
all_metrics = pd.concat([all_metrics,metrics_df])

####### Test Set #######
# score model
score = model_scoring(model,X_test_process)
# get_metrics
metrics = evaluate_model(model,X_test_process,y_test,data_group='test')
metrics_df = metric_to_df(metrics)
all_metrics = pd.concat([all_metrics,metrics_df])

print('Linear Regression')
display(metrics_df)

Linear Regression


Unnamed: 0,model_name,data_group,r2_score,MAE,MSE,RMSE
0,<class 'sklearn.linear_model._logistic.Logisti...,test,-2.248243,5016.515723,36358810.0,4672.118247


In [32]:
# train Linear regression model
model = train_model(X_train_process,y_train,model_type='random_forest')

####### Train Set #######
# score model
score = model_scoring(model,X_train_process)
# get_metrics
metrics = evaluate_model(model,X_train_process,y_train,data_group='train')
metrics_df = metric_to_df(metrics)
all_metrics = pd.concat([all_metrics,metrics_df])

####### Test Set #######
# score model
score = model_scoring(model,X_test_process)
# get_metrics
metrics = evaluate_model(model,X_test_process,y_test,data_group='test')
metrics_df = metric_to_df(metrics)
all_metrics = pd.concat([all_metrics,metrics_df])

print('Random Forest')
display(metrics_df)

Random Forest


Unnamed: 0,model_name,data_group,r2_score,MAE,MSE,RMSE
0,<class 'sklearn.ensemble._forest.RandomForestR...,test,-2.248243,5016.515723,36358810.0,15606.855576


In [34]:
all_metrics

Unnamed: 0,model_name,data_group,r2_score,MAE,MSE,RMSE
0,<class 'sklearn.linear_model._logistic.Logisti...,train,-0.724082,5534.968847,72945840.0,4537.758336
0,<class 'sklearn.linear_model._logistic.Logisti...,test,-2.248243,5016.515723,36358810.0,4672.118247
0,<class 'sklearn.ensemble._forest.RandomForestR...,train,-0.724082,5534.968847,72945840.0,2048.442775
0,<class 'sklearn.ensemble._forest.RandomForestR...,test,-2.248243,5016.515723,36358810.0,15606.855576
