# Applying MLP

#### 1a: Load and preprocess the dataset

In [122]:
# Import packages and libraries
import sys
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from itertools import combinations

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [123]:
# Locations of the labelled training data and of the validation inputs
file_path = 'data/project_adult.csv'
file_path2 = 'data/project_validation_inputs.csv'

# Load both CSVs with identical reader settings (first row is the header)
df_adult, df_validation = (
    pd.read_csv(csv_path, header=0, encoding='utf-8')
    for csv_path in (file_path, file_path2)
)

In [124]:
# Turn the '?' entries into real NaNs, then tally the missing values per column
df_adult.replace(to_replace='?', value=np.nan, inplace=True)
null_counts = df_adult.isna().sum()
print(null_counts)

Unnamed: 0           0
age                  0
workclass         1447
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1454
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     458
income               0
dtype: int64


In [125]:
# Apply the same '?' -> NaN conversion to the validation inputs and count the
# missing values per column so they can be compared with the training data
df_validation.replace(to_replace='?', value=np.nan, inplace=True)
null_counts = df_validation.isna().sum()
print(null_counts)

Unnamed: 0          0
age                 0
workclass         389
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        389
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country    125
dtype: int64


Both the training and the validation data have missing values only in `workclass`, `occupation`, and `native-country`. All three columns are categorical, so we can fill the gaps with mode imputation, i.e. replace each missing entry with the most frequent value of its column.

In [126]:
# Preprocess the dataset: handle missing values, encode categorical features, standardize numerical features,
# and separate features from target.
def preprocess_data(df, target_column='income', fit_scaler=True, return_split=False, test_size=0.2, random_state=42):
    """Impute, one-hot encode and optionally standardize/split a tabular dataset.

    The input frame is never modified; all work happens on a copy.

    Steps:
      1. Replace '?' with NaN and fill missing values of every object/category
         column with that column's mode.
      2. Separate ``target_column`` (if given and present) from the features.
      3. One-hot encode the categorical feature columns (``drop_first=True``).
      4. If ``fit_scaler`` is true, standardize all feature columns with a
         ``StandardScaler``. When a train/test split is requested, the scaler is
         fitted on the training portion only and then applied to the test
         portion, so no test-set statistics leak into the training features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Missing values may be encoded as '?' or NaN.
    target_column : str or None
        Name of the label column. If None or not present in ``df``, every
        column is treated as a feature and no target is returned.
    fit_scaler : bool
        Whether to standardize the features. If False, features are returned
        unscaled.
    return_split : bool
        Whether to return a stratified train/test split instead of the full data.
    test_size : float
        Fraction of rows assigned to the test set (only used when splitting).
    random_state : int
        Seed for the train/test split (only used when splitting).

    Returns
    -------
    (X_train, X_test, y_train, y_test) if ``return_split`` is true;
    (X, y) if a target column is available and no split is requested;
    X alone if no target column is available.

    Raises
    ------
    ValueError
        If ``return_split`` is true but no target column is available.

    Notes
    -----
    * Every non-target column is used as a feature, including a CSV index
      column such as 'Unnamed: 0' if the frame contains one.
    * The one-hot columns are derived from the categories present in ``df``,
      so two frames with different category sets produce different columns.
    """
    print(f"Original data shape: {df.shape}")
    print(f"Original data columns: {df.columns.tolist()}")

    #####################[ Handle missing values ]#####################

    # Convert ? to null for easier handling. `replace` returns a new frame, so
    # the caller's DataFrame is left untouched by everything that follows.
    df = df.replace('?', np.nan)

    # Use mode imputation to handle missing values in categorical columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    print(f"Categorical columns: {categorical_cols.tolist()}\n\n")

    print("MODE IMPUTATION: \n")

    for col in categorical_cols:
        null_count = df[col].isnull().sum()
        print(f"Null count for {col} : {null_count}")
        if null_count > 0:
            modes = df[col].mode()
            if modes.empty:
                # Column has no observed values at all, so there is no mode to
                # impute with; leave it as-is instead of failing on modes[0].
                print(f"-> Skipped '{col}': no observed values to impute from")
                continue
            col_mode = modes.iloc[0]
            # Assign the result back rather than calling fillna(inplace=True) on
            # the selected column: the chained in-place form is deprecated and
            # does not update the frame when pandas Copy-on-Write is enabled.
            df[col] = df[col].fillna(col_mode)
            print(f"-> Imputed '{col}' with mode: {col_mode}")

    #####################[ Encoding features ]#####################

    # Use a clean 0..n-1 row index
    df = df.reset_index(drop=True)

    # Separate X and y only if target_column is provided and exists
    if target_column is not None and target_column in df.columns:
        y = df[target_column]
        X = df.drop(columns=[target_column])
        print(f"\nAfter separation: \n\tX shape: {X.shape} \n\ty shape: {y.shape}")
    else:
        X = df.copy()
        y = None
        print(f"No target column specified or found. Using all columns as features. X shape: {X.shape}")

    # One-hot encode categorical features (only on X, never on y)
    cols_to_encode = [col for col in categorical_cols if col in X.columns]
    X = pd.get_dummies(X, columns=cols_to_encode, drop_first=True)

    print(f"After encoding categorical features X shape: {X.shape}")

    # Ensure all columns are numeric (anything unparseable becomes NaN)
    for col in X.columns:
        if X[col].dtype == 'object':
            X[col] = pd.to_numeric(X[col], errors='coerce')

    #####################[ Train/test split + standardization ]#####################

    if return_split:
        if y is None:
            raise ValueError("Cannot return split when no target column is available")

        # Split first, then scale: the scaler must only see training rows.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )
        if fit_scaler:
            split_scaler = StandardScaler()
            X_train = pd.DataFrame(
                split_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
            )
            X_test = pd.DataFrame(
                split_scaler.transform(X_test), columns=X_test.columns, index=X_test.index
            )
        print(f"Train/test split: \n\tX_train: {X_train.shape} \n\tX_test: {X_test.shape}")
        return X_train, X_test, y_train, y_test

    # No split requested: scale all features together
    if fit_scaler:
        final_scaler = StandardScaler()
        X = pd.DataFrame(final_scaler.fit_transform(X), columns=X.columns, index=X.index)

    if y is not None:
        return X, y
    return X  # Return only X if no target column

In [127]:
# Run the preprocessing pipeline on the labelled data and keep a stratified
# 80/20 train/test split of the resulting features and income labels
X_train, X_test, y_train, y_test = preprocess_data(
    df_adult, target_column='income', fit_scaler=True,
    return_split=True, test_size=0.2, random_state=42,
)

Original data shape: (26048, 16)
Original data columns: ['Unnamed: 0', 'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
Categorical columns: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']


MODE IMPUTATION: 

Null count for workclass : 1447
-> Imputed 'workclass' with mode: Private
Null count for education : 0
Null count for marital-status : 0
Null count for occupation : 1454
-> Imputed 'occupation' with mode: Prof-specialty
Null count for relationship : 0
Null count for race : 0
Null count for sex : 0
Null count for native-country : 458
-> Imputed 'native-country' with mode: United-States
Null count for income : 0

After separation: 
	X shape: (26048, 15) 
	y shape: (26048,)
After encoding categorical features X shape: (26048, 98)
Train/test split: 
	X_train: (20838

#### 1b: Train and test MLP model on dataset while tuning parameters to develop multiple candidates
#### 1c: Evaluate models using appropriate metrics