In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Parse the CSV once. `data_raw` is kept as an untouched reference copy, while
# `data` is the working frame that the cleaning step below modifies.
data_raw = pd.read_csv('adult.csv')
data = data_raw.copy()  # independent copy, so edits to `data` never reach `data_raw`
print("Shape of the dataset:", data.shape)
data.head()

Shape of the dataset: (48842, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


# Basic Cleaning

In [3]:
# Column groups. Names must match the CSV header exactly; the years-of-education
# column is spelled "educational-num" in this file.
numeric_cols = ["age", "fnlwgt", "educational-num", "capital-gain", "capital-loss", "hours-per-week"]
categorical_cols = ["workclass", "education", "marital-status", "occupation", "relationship", "race", "gender", "native-country"]

# Fail fast on a misspelled or absent column instead of carrying a bad name around.
missing_cols = [
    col for col in numeric_cols + categorical_cols + ["income"]
    if col not in data_raw.columns
]
assert not missing_cols, f"Columns missing from dataset: {missing_cols}"

# Binary target encoding: 1 for '>50K', 0 for '<=50K'.
income_map = {'>50K': 1, '<=50K': 0}

# Rebuild from the untouched raw frame (no in-place mutation) so that re-running
# this cell always yields the same result instead of failing on an already
# encoded `income` column.
data = (
    data_raw
    .dropna()
    .assign(income=lambda frame: frame["income"].str.strip().map(income_map))
)

# Series.map() turns any label not present in `income_map` into NaN; surface that
# here rather than letting NaN targets reach the models.
assert data["income"].notna().all(), "Unexpected income label(s) found in the dataset"

# One-hot encode the categoricals, dropping the first level of each.
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

print("Data shape after dropping NAs and encoding:", data.shape)
data.head()

Data shape after dropping NAs and encoding: (45222, 97)


Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,25,226802,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,38,89814,9,0,0,50,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,28,336951,12,0,0,40,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,44,160323,10,7688,0,40,1,0,1,0,...,0,0,0,0,0,0,0,1,0,0
5,34,198693,6,0,0,30,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


# Train/Test Split

In [4]:
# Separate the target column from the feature matrix.
target_col = "income"
y = data[target_col]
X = data.drop(columns=[target_col])

# Hold out 20% of the rows for testing; the split is stratified on the target
# and seeded so it is repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Report the shapes of both partitions.
for label, features, target in (
    ("Train set:", X_train, y_train),
    ("Test set: ", X_test, y_test),
):
    print(label, features.shape, target.shape)

Train set: (36177, 96) (36177,)
Test set:  (9045, 96) (9045,)


# ID3-like Decision Tree

In [5]:
# Depth-limited decision tree that splits on entropy (the "ID3-like" variant).
id3_params = {"criterion": "entropy", "max_depth": 5, "random_state": 42}
id3_clf = DecisionTreeClassifier(**id3_params)
id3_clf.fit(X_train, y_train)

# Score the fitted tree on the held-out test rows.
id3_preds = id3_clf.predict(X_test)
id3_acc = accuracy_score(y_test, id3_preds)
id3_report = classification_report(y_test, id3_preds)

print("ID3  Decision Tree Accuracy:", id3_acc)
print("\nClassification Report:")
print(id3_report)

ID3  Decision Tree Accuracy: 0.8443338861249309

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.95      0.90      6803
           1       0.78      0.51      0.62      2242

    accuracy                           0.84      9045
   macro avg       0.82      0.73      0.76      9045
weighted avg       0.84      0.84      0.83      9045



# CART Decision Tree

In [6]:
# Depth-limited decision tree that splits on Gini impurity (the CART variant).
cart_clf = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=42)
cart_clf.fit(X_train, y_train)

# Predict on the test rows, then report accuracy and per-class metrics.
cart_preds = cart_clf.predict(X_test)
cart_acc = accuracy_score(y_test, cart_preds)
cart_report = classification_report(y_test, cart_preds)

print("CART (Gini) Decision Tree Accuracy:", cart_acc)
print("\nClassification Report:", cart_report, sep="\n")

CART (Gini) Decision Tree Accuracy: 0.8458817025981205

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.95      0.90      6803
           1       0.79      0.52      0.62      2242

    accuracy                           0.85      9045
   macro avg       0.82      0.74      0.76      9045
weighted avg       0.84      0.85      0.83      9045



# Random Forest

In [7]:
# Random forest of 100 trees, each limited to depth 5, with a fixed seed.
rf_settings = dict(n_estimators=100, max_depth=5, random_state=42)
rf_clf = RandomForestClassifier(**rf_settings)
rf_clf.fit(X_train, y_train)

# Evaluate on the held-out test rows.
rf_preds = rf_clf.predict(X_test)
rf_acc = accuracy_score(y_test, rf_preds)

print("Random Forest Accuracy:", rf_acc)
print("\nClassification Report:")
print(classification_report(y_test, rf_preds))

Random Forest Accuracy: 0.8288557213930349

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.99      0.90      6803
           1       0.90      0.35      0.50      2242

    accuracy                           0.83      9045
   macro avg       0.86      0.67      0.70      9045
weighted avg       0.84      0.83      0.80      9045



# Gradient Boosting

In [8]:
# Gradient-boosted trees: 100 boosting rounds, learning rate 0.1, depth-5 trees.
gb_clf = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42
)
gb_clf.fit(X_train, y_train)

# Evaluate on the held-out test rows.
gb_preds = gb_clf.predict(X_test)
gb_acc = accuracy_score(y_test, gb_preds)
gb_report = classification_report(y_test, gb_preds)

print("Gradient Boosted Trees Accuracy:", gb_acc)
print("\nClassification Report:", gb_report, sep="\n")

Gradient Boosted Trees Accuracy: 0.8656716417910447

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.94      0.91      6803
           1       0.78      0.64      0.70      2242

    accuracy                           0.87      9045
   macro avg       0.83      0.79      0.81      9045
weighted avg       0.86      0.87      0.86      9045



# XGBoost

In [9]:
# XGBoost classifier configured to mirror the gradient-boosting settings above
# (100 rounds, learning rate 0.1, depth 5), with log-loss as its eval metric.
xgb_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 5,
    "random_state": 42,
    "eval_metric": "logloss",
}
xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(X_train, y_train)

# Evaluate on the held-out test rows.
xgb_preds = xgb_clf.predict(X_test)
xgb_acc = accuracy_score(y_test, xgb_preds)

print("XGBoost Accuracy:", xgb_acc)
print("\nClassification Report:")
print(classification_report(y_test, xgb_preds))

XGBoost Accuracy: 0.8650082918739636

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.94      0.91      6803
           1       0.78      0.64      0.70      2242

    accuracy                           0.87      9045
   macro avg       0.83      0.79      0.81      9045
weighted avg       0.86      0.87      0.86      9045



# Results Comparison

In [10]:
# Pair every model label with its test-set accuracy; dict insertion order sets
# the row order of the summary table.
accuracy_by_model = {
    "ID3 (Entropy)": id3_acc,
    "CART (Gini)": cart_acc,
    "Random Forest": rf_acc,
    "Gradient Boosting": gb_acc,
    "XGBoost": xgb_acc,
}
models = list(accuracy_by_model)
accuracies = list(accuracy_by_model.values())

# Summary table with a clean 0..n-1 index.
results_df = pd.DataFrame({"Model": models, "Accuracy": accuracies}).reset_index(drop=True)

dataname = "adult income"
print(f"Dataset: {dataname}, Train Shape: {X_train.shape}, Test Shape: {X_test.shape}")
print(results_df)

Dataset: adult income, Train Shape: (36177, 96), Test Shape: (9045, 96)
               Model  Accuracy
0      ID3 (Entropy)  0.844334
1        CART (Gini)  0.845882
2      Random Forest  0.828856
3  Gradient Boosting  0.865672
4            XGBoost  0.865008
