In [None]:
# ----- Core Python & Visualization ----- #
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ----- Data Preprocessing ----- #
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# ----- Models ----- #
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

# ----- Evaluation Metrics ----- #
from sklearn.metrics import r2_score, mean_squared_error, classification_report, confusion_matrix, accuracy_score

# ----- Neural Network (Keras) ----- #
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, SGD



In [4]:
# loading data
df = pd.read_csv("/Users/anhuynh/Downloads/income_evaluation.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# some data cleaning
df[' income'] = df[' income'].astype(str).str.strip()
df[' income'] = df[' income'].map({'>50K': 1, '<=50K': 0})

In [76]:
# First, make sure there are no leading spaces in column names
df.columns = df.columns.str.strip()

# Now count 0s and 1s
print(df['income'].value_counts())


income
0    24720
1     7841
Name: count, dtype: int64


In [24]:
print(df.dtypes)


age                  int64
 workclass          object
 fnlwgt              int64
 education          object
 education-num       int64
 marital-status     object
 occupation         object
 relationship       object
 race               object
 sex                object
 capital-gain        int64
 capital-loss        int64
 hours-per-week      int64
 native-country     object
 income            float64
dtype: object


In [60]:
# ---- (1) Load raw data ---- #
df = pd.read_csv("/Users/anhuynh/Downloads/income_evaluation.csv")
df.columns = df.columns.str.strip()


# Clean 'income' column
df['income'] = df['income'].astype(str).str.strip()
df['income'] = df['income'].map({'>50K': 1, '<=50K': 0})

# Now you can split and preprocess!
X = df.drop(columns=['income'])
y = df['income']


categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include='number').columns.tolist()

# ---- (2) Train/test split BEFORE preprocessing ---- #
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ---- (3) Define Preprocessor ---- #
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])



In [73]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 1. Define the Logistic Regression model
logreg = LogisticRegression(solver='liblinear', random_state=42)

# 2. Build the Logistic Regression pipeline
log_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # your preprocessor for scaling + encoding
    ('classifier', logreg)
])

# 3. Fit the pipeline
log_pipeline.fit(X_train, y_train)

# 4. Predict
y_pred_log = log_pipeline.predict(X_test)

# 5. Evaluate
print("Accuracy of the Logistic Regression model:", accuracy_score(y_test, y_pred_log))
print("Confusion matrix of the Logistic Regression model:\n", confusion_matrix(y_test, y_pred_log))
print("\nClassification report of the Logistic Regression model:\n", classification_report(y_test, y_pred_log))


Accuracy of the Logistic Regression model: 0.8582834331337326
Confusion matrix of the Logistic Regression model:
 [[4623  319]
 [ 604  967]]

Classification report of the Logistic Regression model:
               precision    recall  f1-score   support

           0       0.88      0.94      0.91      4942
           1       0.75      0.62      0.68      1571

    accuracy                           0.86      6513
   macro avg       0.82      0.78      0.79      6513
weighted avg       0.85      0.86      0.85      6513



In [74]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# 1. Define Decision Tree model
dt = DecisionTreeClassifier(max_depth=5, random_state=42)

# 2. Build Decision Tree pipeline
dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', dt)
])

# 3. Fit
dt_pipeline.fit(X_train, y_train)

# 4. Predict
y_pred_dt = dt_pipeline.predict(X_test)

# 5. Evaluate
print("Accuracy of the Decision Tree model:", accuracy_score(y_test, y_pred_dt))
print("Confusion matrix of the Decision Tree model:\n", confusion_matrix(y_test, y_pred_dt))
print("\nClassification report of the Decision Tree model:\n", classification_report(y_test, y_pred_dt))


Accuracy of the Decision Tree model: 0.8504529402732995
Confusion matrix of the Decision Tree model:
 [[4724  218]
 [ 756  815]]

Classification report of the Decision Tree model:
               precision    recall  f1-score   support

           0       0.86      0.96      0.91      4942
           1       0.79      0.52      0.63      1571

    accuracy                           0.85      6513
   macro avg       0.83      0.74      0.77      6513
weighted avg       0.84      0.85      0.84      6513



In [75]:
from sklearn.ensemble import RandomForestClassifier

# 1. Define Random Forest model
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

# 2. Build Random Forest pipeline
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf)
])

# 3. Fit
rf_pipeline.fit(X_train, y_train)

# 4. Predict
y_pred_rf = rf_pipeline.predict(X_test)

# 5. Evaluate
print("Accuracy of the Random Forest model:", accuracy_score(y_test, y_pred_rf))
print("Confusion matrix of the Random Forest model:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification report of the Random Forest model:\n", classification_report(y_test, y_pred_rf))


Accuracy of the Random Forest model: 0.8556732688469215
Confusion matrix of the Random Forest model:
 [[4727  215]
 [ 725  846]]

Classification report of the Random Forest model:
               precision    recall  f1-score   support

           0       0.87      0.96      0.91      4942
           1       0.80      0.54      0.64      1571

    accuracy                           0.86      6513
   macro avg       0.83      0.75      0.78      6513
weighted avg       0.85      0.86      0.85      6513



In [78]:
# 1. Define the Gaussian Naive Bayes model
gnb = GaussianNB()

# 2. Build the GNB pipeline
log_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # same preprocessor as others
    ('classifier', gnb)
])

# 3. Fit the GNB pipeline
log_pipeline.fit(X_train, y_train)

# 4. Predict
y_pred_gnb = gnb_pipeline.predict(X_test)

# 5. Evaluate
print("Accuracy of the Gaussian Naive Bayes model:", accuracy_score(y_test, y_pred_gnb))
print("Confusion matrix of the Gaussian Naive Bayes model:\n", confusion_matrix(y_test, y_pred_gnb))
print("\nClassification report of the Gaussian Naive Bayes model:\n", classification_report(y_test, y_pred_gnb))

Accuracy of the Gaussian Naive Bayes model: 0.5312452019038846
Confusion matrix of the Gaussian Naive Bayes model:
 [[1948 2994]
 [  59 1512]]

Classification report of the Gaussian Naive Bayes model:
               precision    recall  f1-score   support

           0       0.97      0.39      0.56      4942
           1       0.34      0.96      0.50      1571

    accuracy                           0.53      6513
   macro avg       0.65      0.68      0.53      6513
weighted avg       0.82      0.53      0.55      6513



In [71]:
# ---- (4) Build Model Pipelines ---- #
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline

xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
])

ada_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', AdaBoostClassifier(random_state=42))
])

# ---- (5) Train Pipelines ---- #
xgb_pipeline.fit(X_train, y_train)
ada_pipeline.fit(X_train, y_train)

# ---- (6) Predict & Evaluate ---- #
from sklearn.metrics import accuracy_score, classification_report

xgb_preds = xgb_pipeline.predict(X_test)
ada_preds = ada_pipeline.predict(X_test)

print("XGBoost Accuracy:", accuracy_score(y_test, xgb_preds))
print("Confusion matrix of the XGboost:\n", confusion_matrix(y_test, xgb_preds))
print("\nClassification report of the XGboost model:\n", classification_report(y_test, xgb_preds))



Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.8739444188545985
Confusion matrix of the XGboost:
 [[4641  301]
 [ 520 1051]]

Classification report of the XGboost model:
               precision    recall  f1-score   support

           0       0.90      0.94      0.92      4942
           1       0.78      0.67      0.72      1571

    accuracy                           0.87      6513
   macro avg       0.84      0.80      0.82      6513
weighted avg       0.87      0.87      0.87      6513



In [72]:
print("AdaBoost Accuracy:", accuracy_score(y_test, ada_preds))
print("Confusion matrix of the XGboost:\n", confusion_matrix(y_test, ada_preds))
print("\nClassification report of the XGboost model:\n", classification_report(y_test, ada_preds))

AdaBoost Accuracy: 0.8633502226316597
Confusion matrix of the XGboost:
 [[4637  305]
 [ 585  986]]

Classification report of the XGboost model:
               precision    recall  f1-score   support

           0       0.89      0.94      0.91      4942
           1       0.76      0.63      0.69      1571

    accuracy                           0.86      6513
   macro avg       0.83      0.78      0.80      6513
weighted avg       0.86      0.86      0.86      6513



In [68]:
from sklearn.naive_bayes import GaussianNB

# 1. Define the Gaussian Naive Bayes model
gnb = GaussianNB()

# 2. Build the GNB pipeline
gnb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # same preprocessor as others
    ('classifier', gnb)
])

# 3. Fit the GNB pipeline
gnb_pipeline.fit(X_train, y_train)

# 4. Predict
y_pred_gnb = gnb_pipeline.predict(X_test)

# 5. Evaluate
print("Accuracy of the Gaussian Naive Bayes model:", accuracy_score(y_test, y_pred_gnb))
print("Confusion matrix of the Gaussian Naive Bayes model:\n", confusion_matrix(y_test, y_pred_gnb))
print("\nClassification report of the Gaussian Naive Bayes model:\n", classification_report(y_test, y_pred_gnb))

Accuracy of the Gaussian Naive Bayes model: 0.5312452019038846
Confusion matrix of the Gaussian Naive Bayes model:
 [[1948 2994]
 [  59 1512]]

Classification report of the Gaussian Naive Bayes model:
               precision    recall  f1-score   support

           0       0.97      0.39      0.56      4942
           1       0.34      0.96      0.50      1571

    accuracy                           0.53      6513
   macro avg       0.65      0.68      0.53      6513
weighted avg       0.82      0.53      0.55      6513



In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# 1. Define the MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)

# 2. Build the pipeline
mlp_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # reuse the same preprocessor
    ('classifier', mlp)
])

# 3. Fit the pipeline
mlp_pipeline.fit(X_train, y_train)

# 4. Predict
y_pred_mlp = mlp_pipeline.predict(X_test)

# 5. Evaluate
print("Accuracy of the Neural Network (MLP) model:", accuracy_score(y_test, y_pred_mlp))
print("Confusion matrix of the Neural Network (MLP) model:\n", confusion_matrix(y_test, y_pred_mlp))
print("\nClassification report of the Neural Network (MLP) model:\n", classification_report(y_test, y_pred_mlp))


Accuracy of the Neural Network (MLP) model: 0.8369415016121603
Confusion matrix of the Neural Network (MLP) model:
 [[4453  489]
 [ 573  998]]

Classification report of the Neural Network (MLP) model:
               precision    recall  f1-score   support

           0       0.89      0.90      0.89      4942
           1       0.67      0.64      0.65      1571

    accuracy                           0.84      6513
   macro avg       0.78      0.77      0.77      6513
weighted avg       0.83      0.84      0.84      6513



My XGBoost model demonstrated the strongest performance among all models tested. It achieved a high overall accuracy of 86% in predicting whether an individual earns more or less than $50,000 per year. The model was particularly effective at correctly identifying individuals earning less than $50,000, while also maintaining solid performance in recognizing higher earners. Overall, these results indicate that the model is both reliable and well-balanced, making it the most suitable option for accurately assessing income levels based on the available data