In [2]:
# ===============================
# Cell 1: Import Required Libraries
# ===============================

from pathlib import Path

import pandas as pd
import numpy as np

# Visualization (for confusion matrix later if needed)
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn utilities
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Metrics (ALL required in PDF)
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    confusion_matrix,
    classification_report
)

# Save models
import joblib


In [3]:
# Environment check: show the path of the Python interpreter running this kernel.
import sys

kernel_python = sys.executable
print(kernel_python)

d:\ML_ASSIGNMENT_2\ML_Assignment_2\.venv\Scripts\python.exe


In [4]:
# ===============================
# Cell 2: Load Dataset
# ===============================

# Read the CSV that sits one directory above the notebook.
DATA_PATH = "../adult.csv"
df = pd.read_csv(DATA_PATH)

# Report (rows, columns), then preview the first five records.
row_count, column_count = df.shape
print("Dataset Shape:", (row_count, column_count))
df.head(5)


Dataset Shape: (32561, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [5]:
# ===============================
# Cell 3: Handle Missing Values
# ===============================

# The file marks missing entries with a literal "?"; convert those to NaN so
# that pandas' null handling can see them.
df = df.replace("?", np.nan)

null_counts = df.isnull().sum()
print("Missing Values Before Dropping:")
print(null_counts)

# Discard every row that has at least one missing value.
df = df.dropna()

print("\nDataset Shape After Dropping Missing Values:", df.shape)


Missing Values Before Dropping:
age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64

Dataset Shape After Dropping Missing Values: (30162, 15)


In [6]:
# ===============================
# Cell 4: Separate X and y
# ===============================

# Features are every column except the target.
X = df.drop(columns="income")

# Encode the target as 0 (<=50K) / 1 (>50K).
income_labels = {"<=50K": 0, ">50K": 1}
y = df["income"].map(income_labels)

# Series.map yields NaN for any label that is not a key of the mapping.
# Fail loudly here instead of letting NaN targets reach the models.
unmapped_labels = df.loc[y.isna(), "income"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected income labels: {list(unmapped_labels)}")

print("Unique target values:", y.unique())


Unique target values: [0 1]


In [7]:
# ===============================
# Cell 5: Identify Feature Types
# ===============================

# Select columns by dtype family instead of by one exact dtype name:
#   * "number" matches every numeric dtype, whereas an exact "int64" filter
#     would silently leave out columns stored as int32 or float64.
#   * Every non-numeric column is treated as categorical. This avoids
#     include=["object"], which pandas deprecates as a way of selecting
#     string columns.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(exclude="number").columns.tolist()

print("Numeric Features:", numeric_features)
print("Categorical Features:", categorical_features)


Numeric Features: ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']
Categorical Features: ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']


See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  categorical_features = X.select_dtypes(include=["object"]).columns.tolist()


In [8]:
# ===============================
# Cell 6: Create Preprocessor
# ===============================

# Column-wise preprocessing: standardize the numeric columns and one-hot
# encode the categorical ones. handle_unknown="ignore" makes the encoder
# tolerate categories at transform time that were not present during fit.
numeric_step = ("num", StandardScaler(), numeric_features)
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore"),
    categorical_features,
)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


In [9]:
# ===============================
# Cell 7: Train-Test Split
# ===============================

# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed seed so that it is repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train size: {X_train.shape}")
print(f"Test size: {X_test.shape}")


Train size: (24129, 14)
Test size: (6033, 14)


In [10]:
# ===============================
# Cell 8: Evaluation Function
# ===============================

def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test
        Feature rows, passed to the model unchanged.
    y_test
        True labels corresponding to ``X_test``.

    Returns
    -------
    dict
        Metric values keyed, in this order, by "Accuracy", "AUC",
        "Precision", "Recall", "F1" and "MCC".
    """
    hard_predictions = model.predict(X_test)
    # AUC is computed from scores rather than labels: use the second
    # predict_proba column.
    positive_class_scores = model.predict_proba(X_test)[:, 1]

    scores = {
        "Accuracy": accuracy_score(y_test, hard_predictions),
        "AUC": roc_auc_score(y_test, positive_class_scores),
    }

    # The remaining metrics all compare true labels with predicted labels.
    label_metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
        ("MCC", matthews_corrcoef),
    )
    for metric_name, metric_fn in label_metrics:
        scores[metric_name] = metric_fn(y_test, hard_predictions)

    return scores


In [11]:
# ===============================
# Cell 9: Logistic Regression
# ===============================

# Give this pipeline its own copy of the preprocessor. Pipeline does not copy
# its steps, so passing `preprocessor` directly would make every pipeline
# built from it hold (and refit) the very same ColumnTransformer object.
logistic_pipeline = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("classifier", LogisticRegression(max_iter=1000))
])

logistic_pipeline.fit(X_train, y_train)

logistic_results = evaluate_model(logistic_pipeline, X_test, y_test)

print("Logistic Regression Results:")
print(logistic_results)


Logistic Regression Results:
{'Accuracy': 0.8543013426156141, 'AUC': 0.9135886206017961, 'Precision': 0.7502008032128514, 'Recall': 0.6218375499334221, 'F1': 0.6800145613396432, 'MCC': 0.5910876116044996}


In [12]:
# ===============================
# Cell 10: Decision Tree
# ===============================

# Give this pipeline its own copy of the preprocessor. Pipeline does not copy
# its steps, so passing `preprocessor` directly would make every pipeline
# built from it hold (and refit) the very same ColumnTransformer object.
dt_pipeline = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("classifier", DecisionTreeClassifier(random_state=42))
])

dt_pipeline.fit(X_train, y_train)

dt_results = evaluate_model(dt_pipeline, X_test, y_test)

print("Decision Tree Results:")
print(dt_results)


Decision Tree Results:
{'Accuracy': 0.8151831592905685, 'AUC': 0.7510019010920774, 'Precision': 0.6303030303030303, 'Recall': 0.6231691078561917, 'F1': 0.6267157683294275, 'MCC': 0.5039245876397218}


In [13]:
# ===============================
# Cell 11: KNN
# ===============================

# Give this pipeline its own copy of the preprocessor. Pipeline does not copy
# its steps, so passing `preprocessor` directly would make every pipeline
# built from it hold (and refit) the very same ColumnTransformer object.
knn_pipeline = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("classifier", KNeighborsClassifier())
])

knn_pipeline.fit(X_train, y_train)

knn_results = evaluate_model(knn_pipeline, X_test, y_test)

print("KNN Results:")
print(knn_results)


KNN Results:
{'Accuracy': 0.8340792308967346, 'AUC': 0.8672780440469134, 'Precision': 0.6832479882955377, 'Recall': 0.6218375499334221, 'F1': 0.6510979435343325, 'MCC': 0.5436094581799102}


In [14]:
# ===============================
# Cell 12: Naive Bayes
# ===============================

# Naive Bayes gets a dedicated preprocessor whose one-hot encoder is
# configured with sparse_output=False, i.e. it emits a dense array.
dense_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
preprocessor_nb = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", dense_encoder, categorical_features),
    ]
)

nb_steps = [
    ("preprocessor", preprocessor_nb),
    ("classifier", GaussianNB()),
]
nb_pipeline = Pipeline(steps=nb_steps)
nb_pipeline.fit(X_train, y_train)

nb_results = evaluate_model(nb_pipeline, X_test, y_test)

print("Naive Bayes Results:")
print(nb_results)


Naive Bayes Results:
{'Accuracy': 0.6010276810873529, 'AUC': 0.8300161691275459, 'Precision': 0.3794940079893475, 'Recall': 0.9487350199733688, 'F1': 0.5421342971276394, 'MCC': 0.38756067140860234}


In [15]:
# ===============================
# Cell 13: Random Forest
# ===============================

# Give this pipeline its own copy of the preprocessor. Pipeline does not copy
# its steps, so passing `preprocessor` directly would make every pipeline
# built from it hold (and refit) the very same ColumnTransformer object.
rf_pipeline = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("classifier", RandomForestClassifier(random_state=42))
])

rf_pipeline.fit(X_train, y_train)

rf_results = evaluate_model(rf_pipeline, X_test, y_test)

print("Random Forest Results:")
print(rf_results)


Random Forest Results:
{'Accuracy': 0.8562904027846843, 'AUC': 0.9105267867664714, 'Precision': 0.7490196078431373, 'Recall': 0.6358189081225033, 'F1': 0.6877925819229385, 'MCC': 0.5986355120997638}


In [16]:
# ===============================
# Cell 14: XGBoost
# ===============================

# Give this pipeline its own copy of the preprocessor. Pipeline does not copy
# its steps, so passing `preprocessor` directly would make every pipeline
# built from it hold (and refit) the very same ColumnTransformer object.
#
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# ignores it and prints a "Parameters ... are not used" warning when it is set.
xgb_pipeline = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("classifier", XGBClassifier(
        eval_metric="logloss",
        random_state=42
    ))
])

xgb_pipeline.fit(X_train, y_train)

xgb_results = evaluate_model(xgb_pipeline, X_test, y_test)

print("XGBoost Results:")
print(xgb_results)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Results:
{'Accuracy': 0.8728659041936019, 'AUC': 0.9340653277422204, 'Precision': 0.789598108747045, 'Recall': 0.6671105193075899, 'F1': 0.7232046192710213, 'MCC': 0.6452821913060274}


In [None]:
# ===============================
# Cell 15: Comparison Table
# ===============================

# Collect each model's metric dictionary under its display name, then
# transpose so that models become rows and metrics become columns.
metrics_by_model = {
    "Logistic Regression": logistic_results,
    "Decision Tree": dt_results,
    "KNN": knn_results,
    "Naive Bayes": nb_results,
    "Random Forest": rf_results,
    "XGBoost": xgb_results,
}
results_df = pd.DataFrame(metrics_by_model).transpose()

results_df


Unnamed: 0,Accuracy,AUC,Precision,Recall,F1,MCC
Logistic Regression,0.854301,0.913589,0.750201,0.621838,0.680015,0.591088
Decision Tree,0.815183,0.751002,0.630303,0.623169,0.626716,0.503925
KNN,0.834079,0.867278,0.683248,0.621838,0.651098,0.543609
Naive Bayes,0.601028,0.830016,0.379494,0.948735,0.542134,0.387561
Random Forest,0.85629,0.910527,0.74902,0.635819,0.687793,0.598636
XGBoost,0.872866,0.934065,0.789598,0.667111,0.723205,0.645282


In [20]:
# ===============================
# Dataset Information for README
# ===============================

# Print the figures quoted in the README: cleaned dataset shape, feature
# count, and the shapes of the train/test feature matrices.
print("Final Dataset Shape (after cleaning):", df.shape, sep="\n")

feature_count = X.shape[1]
print("\nNumber of Features (excluding target):", feature_count, sep="\n")

print("\nTrain-Test Split Sizes:")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")


Final Dataset Shape (after cleaning):
(30162, 15)

Number of Features (excluding target):
14

Train-Test Split Sizes:
X_train shape: (24129, 14)
X_test shape: (6033, 14)


In [18]:
# ===============================
# Cell 16: Save Models
# ===============================

# Make sure the output directory exists before writing: joblib.dump does not
# create missing parent directories and would raise FileNotFoundError.
model_dir = Path("../model")
model_dir.mkdir(parents=True, exist_ok=True)

# File name -> fitted pipeline to persist under that name.
fitted_pipelines = {
    "logistic.pkl": logistic_pipeline,
    "decision_tree.pkl": dt_pipeline,
    "knn.pkl": knn_pipeline,
    "naive_bayes.pkl": nb_pipeline,
    "random_forest.pkl": rf_pipeline,
    "xgboost.pkl": xgb_pipeline,
}
for model_filename, fitted_pipeline in fitted_pipelines.items():
    joblib.dump(fitted_pipeline, model_dir / model_filename)

print("All models saved successfully ✅")


All models saved successfully ✅
