## Classification Model Metrics

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split

def income_model_data_prep(data, test_size=0.2, random_state=4):
    """Build the modelling matrix from the income data and split it into train/test sets.

    Steps:
      1. Add a binary ``target`` column: 0 where ``SalStat`` equals
         " less than or equal to 50,000", 1 otherwise.
      2. Merge a few category levels into other levels
         (" Holand-Netherlands" -> " Germany", " Never-worked" -> " Without-pay",
         " Armed-Forces" -> " ?"). The leading spaces are part of the values.
      3. Drop ``SalStat`` and one-hot encode the remaining categorical columns
         with ``drop_first=True``.
      4. Split features and target with ``train_test_split``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns ``SalStat``, ``nativecountry``,
        ``JobType`` and ``occupation``. It is not modified.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 4
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # Derive every column from the `data` argument (not from a notebook-level
    # global) so the function works on whichever frame the caller passes in.
    prepared = data.assign(
        target=np.where(data["SalStat"] == " less than or equal to 50,000", 0, 1),
        # regex=False: these are literal substring replacements, not patterns.
        nativecountry=data["nativecountry"].str.replace(
            " Holand-Netherlands", " Germany", regex=False
        ),
        JobType=data["JobType"].replace({" Never-worked": " Without-pay"}),
        occupation=data["occupation"].str.replace(" Armed-Forces", " ?", regex=False),
    ).drop(columns=["SalStat"])

    encoded = pd.get_dummies(prepared, drop_first=True)

    X = encoded.drop(columns=["target"])
    y = encoded["target"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [3]:
## Split train and test datasets

# Read the raw income data from disk.
income = pd.read_csv("../Data/income.csv")

# Encode the features and hold out a test set.
X_train, X_test, y_train, y_test = income_model_data_prep(data=income)

In [6]:
## fit the classification model using Logistic Regression Algorithm
from sklearn.linear_model import LogisticRegression

# Baseline model: fitted on the features exactly as returned by the data prep
# step (no scaling applied yet).
lr = LogisticRegression(
    solver="saga",
    max_iter=1000,
).fit(X_train, y_train)




In [9]:
## generate the confusion matrix for the model

from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=lr.predict(X_test))

array([[4691,  161],
       [1147,  397]])

In [11]:
## generate accuracy, precision, recall and F1 score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict once and reuse the result for every metric instead of re-running
# the model four times.
lr_test_pred = lr.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, lr_test_pred))
print("Precision: ", precision_score(y_test, lr_test_pred))
print("Recall: ", recall_score(y_test, lr_test_pred))
print("F1 Score: ", f1_score(y_test, lr_test_pred))

Accuracy:  0.7954971857410882
Precision:  0.7114695340501792
Recall:  0.25712435233160624
F1 Score:  0.37773549000951473


In [12]:
## Scale the features
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sd = StandardScaler().fit(X_train)
X_train_std = sd.transform(X_train)
X_test_std = sd.transform(X_test)

In [13]:
# Same model settings as the baseline, now trained on the standardized features.
lr_std = LogisticRegression(
    solver="saga",
    max_iter=1000,
).fit(X_train_std, y_train)

In [15]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=lr_std.predict(X_test_std))

array([[4525,  327],
       [ 594,  950]])

In [16]:
# Predict once and reuse the result for every metric instead of re-running
# the model four times.
lr_std_test_pred = lr_std.predict(X_test_std)

print("Accuracy: ", accuracy_score(y_test, lr_std_test_pred))
print("Precision: ", precision_score(y_test, lr_std_test_pred))
print("Recall: ", recall_score(y_test, lr_std_test_pred))
print("F1 Score: ", f1_score(y_test, lr_std_test_pred))

Accuracy:  0.8560037523452158
Precision:  0.7439310884886453
Recall:  0.6152849740932642
F1 Score:  0.673520028358738


In [17]:
## tune the model through regularization (tuning the hyperparameters)

from sklearn.model_selection import GridSearchCV

# One sub-grid per penalty type. Every sub-grid searches the same five C values
# with the saga solver; only the elastic-net grid varies l1_ratio.
param_grid = [
    {
        "penalty": [penalty],
        "C": np.linspace(0.1, 2, 5),
        "solver": ["saga"],
        "l1_ratio": l1_ratios,
    }
    for penalty, l1_ratios in (
        ("l1", [None]),
        ("l2", [None]),
        ("elasticnet", np.linspace(0, 1, 3)),
    )
]

model = LogisticRegression(solver="saga", max_iter=500)

# 5-fold cross-validated search over all sub-grids, using every available core.
gridsearch = GridSearchCV(
    model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)

gridsearch.fit(X_train_std, y_train)

In [18]:
gridsearch.best_params_

{'C': np.float64(0.1),
 'l1_ratio': np.float64(0.5),
 'penalty': 'elasticnet',
 'solver': 'saga'}

In [20]:
# Refit on the full standardized training set with the hyperparameters the
# grid search selected; solver, iteration cap and parallelism are set explicitly.
lr_reg = LogisticRegression(
    solver="saga",
    max_iter=1000,
    n_jobs=-1,
    **{
        name: gridsearch.best_params_[name]
        for name in ("penalty", "C", "l1_ratio")
    },
).fit(X_train_std, y_train)
    

In [22]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=lr_reg.predict(X_test_std))

array([[4530,  322],
       [ 603,  941]])

In [24]:
# Predict once and reuse the result for every metric instead of re-running
# the model four times.
lr_reg_test_pred = lr_reg.predict(X_test_std)

print("Accuracy: ", accuracy_score(y_test, lr_reg_test_pred))
print("Precision: ", precision_score(y_test, lr_reg_test_pred))
print("Recall: ", recall_score(y_test, lr_reg_test_pred))
print("F1 Score: ", f1_score(y_test, lr_reg_test_pred))

Accuracy:  0.8553783614759225
Precision:  0.7450514647664291
Recall:  0.6094559585492227
F1 Score:  0.6704666904168151
