# ML Assignment 2 â€“ Classification Models

## Problem Statement
Implement multiple classification models on a public dataset and compare their performance using evaluation metrics.

## Dataset
Adult Income Dataset (Binary Classification)
Target: Predict whether income >50K or <=50K


In [1]:
!pip install xgboost




In [2]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score, matthews_corrcoef
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier


In [3]:
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv"

columns = [
    "age","workclass","fnlwgt","education","education-num",
    "marital-status","occupation","relationship","race","sex",
    "capital-gain","capital-loss","hours-per-week","native-country","income"
]

df = pd.read_csv(url, names=columns)
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.replace("?", np.nan, inplace=True)
df.dropna(inplace=True)

label_encoders = {}

for col in df.columns:
    if df[col].dtype == "object":
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

X = df.drop("income", axis=1)
y = df["income"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=25
)

print("Training shape:", X_train.shape)
print("Testing shape:", X_test.shape)


Training shape: (36177, 14)
Testing shape: (9045, 14)


In [5]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}


In [6]:
model_results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    model_results[name] = {
        "Accuracy": round(accuracy_score(y_test, y_pred), 4),
        "AUC": round(roc_auc_score(y_test, y_pred), 4),
        "Precision": round(precision_score(y_test, y_pred), 4),
        "Recall": round(recall_score(y_test, y_pred), 4),
        "F1 Score": round(f1_score(y_test, y_pred), 4),
        "MCC": round(matthews_corrcoef(y_test, y_pred), 4)
    }

    pickle.dump(model, open(f"{name}.pkl", "wb"))

model_results


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


{'Logistic Regression': {'Accuracy': 0.8272,
  'AUC': np.float64(0.7062),
  'Precision': 0.723,
  'Recall': 0.4705,
  'F1 Score': 0.57,
  'MCC': np.float64(0.4848)},
 'Decision Tree': {'Accuracy': 0.8155,
  'AUC': np.float64(0.7575),
  'Precision': 0.6156,
  'Recall': 0.6444,
  'F1 Score': 0.6297,
  'MCC': np.float64(0.5071)},
 'KNN': {'Accuracy': 0.8318,
  'AUC': np.float64(0.7575),
  'Precision': 0.6688,
  'Recall': 0.6126,
  'F1 Score': 0.6395,
  'MCC': np.float64(0.531)},
 'Naive Bayes': {'Accuracy': 0.8014,
  'AUC': np.float64(0.6467),
  'Precision': 0.6822,
  'Recall': 0.3451,
  'F1 Score': 0.4584,
  'MCC': np.float64(0.3832)},
 'Random Forest': {'Accuracy': 0.862,
  'AUC': np.float64(0.7887),
  'Precision': 0.7524,
  'Recall': 0.6458,
  'F1 Score': 0.695,
  'MCC': np.float64(0.6095)},
 'XGBoost': {'Accuracy': 0.8755,
  'AUC': np.float64(0.8105),
  'Precision': 0.7779,
  'Recall': 0.6839,
  'F1 Score': 0.7279,
  'MCC': np.float64(0.6499)}}

In [7]:
comparison_table = pd.DataFrame(model_results).T
comparison_table


Unnamed: 0,Accuracy,AUC,Precision,Recall,F1 Score,MCC
Logistic Regression,0.8272,0.7062,0.723,0.4705,0.57,0.4848
Decision Tree,0.8155,0.7575,0.6156,0.6444,0.6297,0.5071
KNN,0.8318,0.7575,0.6688,0.6126,0.6395,0.531
Naive Bayes,0.8014,0.6467,0.6822,0.3451,0.4584,0.3832
Random Forest,0.862,0.7887,0.7524,0.6458,0.695,0.6095
XGBoost,0.8755,0.8105,0.7779,0.6839,0.7279,0.6499
