## Adult Income Data

In [10]:
import pandas as pd

In [11]:
# Load the raw file. It has no header row, so columns get integer labels here.
# Fields are separated by ", "; skipinitialspace=True strips the blank after
# each comma so string values are stored without a leading space, and the
# missing-value marker can then be matched as a plain '?' and parsed as NaN.
data = pd.read_csv(
    'adult.data',
    header=None,
    na_values='?',
    skipinitialspace=True,
)

In [12]:
# Assign column names: the file is read without a header, so the 15 fields are
# labelled here. The last one, 'income', is the prediction target.
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
    'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
    'hours_per_week', 'native_country', 'income'
]
data.columns = columns

# Look at dataset
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()

   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          2174             0              40   United-States   <=50

In [13]:
# Drop every row that contains at least one NaN (i.e. the missing-value markers
# that read_csv converted on load). inplace=True mutates `data` itself instead
# of returning a new frame, so the unfiltered rows are no longer available
# after this cell has run.
data.dropna(inplace=True)

In [14]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the target column. The fitted encoder stays available as
# `label_encoder`, so the original class labels can be looked up afterwards.
label_encoder = LabelEncoder()
data = data.assign(income=label_encoder.fit_transform(data['income']))

# Expand the remaining non-numeric columns into indicator columns;
# drop_first=True omits the first level of each encoded column.
data = pd.get_dummies(data, drop_first=True)

In [7]:
from sklearn.preprocessing import StandardScaler

# Standardise the continuous features: learn the per-column statistics first,
# then overwrite the columns with their scaled values.
numerical_cols = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
scaler = StandardScaler().fit(data[numerical_cols])
data[numerical_cols] = scaler.transform(data[numerical_cols])
# NOTE(review): the scaler is fitted on every row, before any train/test split
# is made, so rows that later end up in a test set contribute to the scaling
# statistics. Consider fitting on training rows only (e.g. inside a Pipeline).

In [8]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature matrix.
y = data['income']
X = data.drop(columns=['income'])

# Build the three fixed partitions in one pass, all with the same seed.
# The fraction is the share of rows held out for testing:
#   0.8 -> 20/80 split (unsuffixed names)
#   0.5 -> 50/50 split (*_50 names)
#   0.2 -> 80/20 split (*_80 names)
(
    (X_train, X_test, y_train, y_test),
    (X_train_50, X_test_50, y_train_50, y_test_50),
    (X_train_80, X_test_80, y_train_80, y_test_80),
) = (
    train_test_split(X, y, test_size=held_out, random_state=42)
    for held_out in (0.8, 0.5, 0.2)
)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
import numpy as np
import pandas as pd

# Define classifiers and their hyper-parameter grids.
# Each entry maps a display name to:
#   "model"  - the unfitted estimator to tune
#   "params" - the grid handed to GridSearchCV as `param_grid`
classifiers = {
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10]
        }
    },
    "SVM": {
        "model": SVC(random_state=42),
        # List-of-grids form: `gamma` is a coefficient of the rbf kernel and has
        # no effect on the linear kernel, so it is only searched together with
        # 'rbf'. Crossing it with 'linear' would fit every linear candidate
        # twice (12 candidates instead of the 9 distinct ones below).
        # When a linear candidate wins, best_params_ carries no 'gamma' key.
        "params": [
            {
                'kernel': ['linear'],
                'C': [0.1, 1, 10]
            },
            {
                'kernel': ['rbf'],
                'C': [0.1, 1, 10],
                'gamma': ['scale', 'auto']
            }
        ]
    },
    "KNN": {
        "model": KNeighborsClassifier(),
        "params": {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan']
        }
    }
}

# Data partition settings: "train/test" label -> fraction of rows held out for
# testing (the value is used as `test_size`).
partitions = dict([
    ("20/80", 0.8),
    ("50/50", 0.5),
    ("80/20", 0.2),
])

# Function to tune hyperparameters and evaluate classifier
def tune_and_evaluate(classifier_name, model, param_grid, X, y, test_size,
                      *, n_trials=3, cv=3, n_jobs=None):
    """Grid-search a model on repeated random splits and average its test score.

    For each trial the data are split with ``train_test_split`` (seeded with
    the trial index, so every call sees the same sequence of splits),
    ``GridSearchCV`` selects the parameter combination with the best
    cross-validated accuracy on the training part, and the refitted best
    estimator is scored on the held-out part.

    Parameters
    ----------
    classifier_name : str
        Display name of the classifier. Not used by the computation; kept so
        existing callers continue to work.
    model : estimator
        Unfitted estimator passed to ``GridSearchCV``.
    param_grid : dict or list of dict
        Hyper-parameter grid passed to ``GridSearchCV``.
    X, y
        Features and target, passed to ``train_test_split``.
    test_size : float
        Passed to ``train_test_split`` as ``test_size``.
    n_trials : int, default 3
        Number of independent splits to evaluate; must be at least 1.
    cv : int, default 3
        Passed to ``GridSearchCV`` as ``cv``.
    n_jobs : int or None, default None
        Passed to ``GridSearchCV`` as ``n_jobs`` (e.g. ``-1`` to run the
        candidate fits in parallel). Does not change which candidate wins.

    Returns
    -------
    tuple
        ``(mean_score, best_params)`` where ``mean_score`` is the mean over
        trials of the best estimator's ``score`` on the held-out data (mean
        accuracy for scikit-learn classifiers) and ``best_params`` is a list
        with one ``best_params_`` dict per trial, in trial order.

    Raises
    ------
    ValueError
        If ``n_trials`` is smaller than 1 (the mean of zero trials is
        undefined and would otherwise come back as NaN).
    """
    if n_trials < 1:
        raise ValueError(f"n_trials must be at least 1, got {n_trials}")

    best_params = []
    accuracies = []
    for trial in range(n_trials):
        # The trial index doubles as the seed: each trial gets a different
        # split, but the whole experiment is repeatable.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=trial
        )
        grid_search = GridSearchCV(
            model, param_grid, cv=cv, scoring='accuracy', n_jobs=n_jobs
        )
        grid_search.fit(X_train, y_train)
        best_params.append(grid_search.best_params_)
        # best_estimator_ is the winning candidate refitted on all of X_train.
        accuracies.append(grid_search.best_estimator_.score(X_test, y_test))
    return np.mean(accuracies), best_params

# Accumulators filled by the experiment loop below:
#   results          - one (classifier, partition, mean accuracy) row per run
#   tuned_parameters - (classifier, partition) -> per-trial best parameters
results = []
tuned_parameters = {}

# Run experiments: every classifier is tuned and scored on every partition.
for partition_name, test_size in partitions.items():
    for classifier_name, classifier_data in classifiers.items():
        avg_accuracy, best_params = tune_and_evaluate(
            classifier_name=classifier_name,
            model=classifier_data["model"],
            param_grid=classifier_data["params"],
            X=X,
            y=y,
            test_size=test_size,
        )
        result_row = (classifier_name, partition_name, avg_accuracy)
        results.append(result_row)
        tuned_parameters[(classifier_name, partition_name)] = best_params
        print(f"{classifier_name} ({partition_name}): {avg_accuracy:.4f}")

# Tabulate the collected rows and print them under a heading.
results_df = pd.DataFrame(
    data=results,
    columns=["Classifier", "Partition", "Average Accuracy"],
)
print("\nFinal Results:\n", results_df, sep="\n")

# Report the winning hyper-parameters: one heading per (classifier, partition)
# pair, followed by one line per trial, numbered from 1.
print("\nTuned Hyperparameters:\n")
for key in tuned_parameters:
    print(f"{key}:")
    for trial_number, trial_params in enumerate(tuned_parameters[key], start=1):
        print(f"  Trial {trial_number}: {trial_params}")


Random Forest (20/80): 0.8548
SVM (20/80): 0.8481
KNN (20/80): 0.8227
Random Forest (50/50): 0.8563
SVM (50/50): 0.8511
KNN (50/50): 0.8277
Random Forest (80/20): 0.8597
SVM (80/20): 0.8515
KNN (80/20): 0.8311

Final Results:

      Classifier Partition  Average Accuracy
0  Random Forest     20/80          0.854787
1            SVM     20/80          0.848114
2            KNN     20/80          0.822724
3  Random Forest     50/50          0.856331
4            SVM     50/50          0.851115
5            KNN     50/50          0.827730
6  Random Forest     80/20          0.859661
7            SVM     80/20          0.851539
8            KNN     80/20          0.831096

Tuned Hyperparameters:

('Random Forest', '20/80'):
  Trial 1: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 200}
  Trial 2: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 200}
  Trial 3: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 100}
('SVM', '20/80'):
  Trial 1: {'C': 10, 'g