# Applying MLP

#### 1a: Load and preprocess the dataset

In [1]:
# Import packages and libraries
import sys
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from itertools import combinations

In [None]:
# Paths to the two project CSV files (adult dataset and validation inputs)
file_path, file_path2 = (
    'data/project_adult.csv',
    'data/project_validation_inputs.csv',
)

# Load each CSV into its own DataFrame; the first row holds the column names
df_adult = pd.read_csv(file_path, encoding='utf-8', header=0)
df_validation = pd.read_csv(file_path2, encoding='utf-8', header=0)

In [None]:
# Preprocess the dataset: handle missing values, encode categorical features, standardize numerical features, 
# and separate features from target.
def preprocess_data(df, target_column='income', fit_scaler=True, return_split=False, test_size=0.2, random_state=42):
    """Turn a raw DataFrame into a numeric feature matrix (and target, if present).

    Steps, in order: mark ``'?'`` entries as missing, separate the target
    column, one-hot encode object-dtype feature columns, coerce any remaining
    object columns to numbers, fill missing values with 0, optionally
    standardize every feature, and optionally split into train/test sets.

    Args:
        df: Input data. It is not modified; all work happens on new frames.
        target_column: Name of the label column. When ``None`` or absent from
            ``df``, every column is treated as a feature and no target is
            returned.
        fit_scaler: When true, fit a ``StandardScaler`` on the features,
            replace them with the scaled values, and store the fitted scaler
            in the module-level ``trained_scaler``. When false, the features
            are returned unscaled.
        return_split: When true, return a stratified train/test split instead
            of the full feature matrix and target.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` when ``return_split`` is true;
        otherwise ``(X, y)`` when a target column was found, or just ``X``
        when it was not.

    Raises:
        ValueError: If ``return_split`` is true but no target column exists.
    """
    global trained_scaler

    print(f"Original data shape: {df.shape}")
    print(f"Original data columns: {df.columns.tolist()}")

    # TODO: revisit the missing-value strategy. Rows are deliberately kept:
    # '?' is only marked as NaN here and filled with 0 further below.
    # replace() without inplace returns a new frame, so the caller's
    # DataFrame is left untouched.
    df = df.replace('?', np.nan)

    print(f"After handling missing values shape: {df.shape}")

    # Use a fresh 0..n-1 index so X and y stay aligned after X is rebuilt
    # from the scaler's output below.
    df = df.reset_index(drop=True)

    # Separate X and y only if target_column is provided and exists
    if target_column is not None and target_column in df.columns:
        y = df[target_column]
        X = df.drop(columns=[target_column])
        print(f"After separation - X shape: {X.shape}, y shape: {y.shape}")
    else:
        X = df.copy()
        y = None
        print(f"No target column specified or found. Using all columns as features. X shape: {X.shape}")

    # Encode categorical features (only on X, not y)
    categorical_cols = X.select_dtypes(include=['object']).columns
    print(f"Categorical columns to encode: {categorical_cols.tolist()}")

    X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

    print(f"After encoding categorical features shape: {X.shape}")

    # Ensure all columns are numeric; values that cannot be parsed become NaN
    for col in X.columns:
        if X[col].dtype == 'object':
            X[col] = pd.to_numeric(X[col], errors='coerce')

    # Fill any NaN values
    X = X.fillna(0)

    # Global scaling for ALL features (including the one-hot columns)
    if fit_scaler:
        final_scaler = StandardScaler()
        X = pd.DataFrame(
            final_scaler.fit_transform(X), columns=X.columns, index=X.index
        )
        # Keep the fitted scaler instead of discarding it, so the same
        # transform can be reused on other data later.
        trained_scaler = final_scaler
    # NOTE(review): with fit_scaler=False the features stay unscaled; confirm
    # whether callers are expected to apply trained_scaler themselves.

    print(f"Final processed features shape: {X.shape}")

    if return_split:
        if y is None:
            raise ValueError("Cannot return split when no target column is available")
        # Split into train and test sets, preserving class proportions
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )
        print(f"Train/test split - X_train: {X_train.shape}, X_test: {X_test.shape}")
        return X_train, X_test, y_train, y_test

    if y is None:
        return X  # Return only X if no target column
    return X, y

#### 1b: Train and test MLP model on dataset while tuning parameters to develop multiple candidates
#### 1c: Evaluate models using appropriate metrics