In [2]:
import numpy as np
import pandas as pd

# -------------------------
# Utility Functions
# -------------------------
def relu(Z):
    return np.maximum(0, Z)

def relu_derivative(Z):
    return (Z > 0).astype(float)

def softmax(Z):
    # subtract max for numerical stability
    expZ = np.exp(Z - np.max(Z, axis=1, keepdims=True))
    return expZ / np.sum(expZ, axis=1, keepdims=True)

def mse_loss(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

def cross_entropy_loss(y_true, y_pred):
    eps = 1e-15
    # Clip probabilities for numerical stability.
    y_pred_clipped = np.clip(y_pred, eps, 1 - eps)
    return -np.mean(np.sum(y_true * np.log(y_pred_clipped), axis=1))

# -------------------------
# Neural Network Class
# -------------------------
class NeuralNetwork:
    def __init__(self, layer_sizes, loss_type='cross_entropy', dropout_prob=0.0, learning_rate=0.01, seed=None):
        """
        layer_sizes: list containing the number of neurons in each layer.
                     For example, for 3 hidden layers: [input_dim, h1, h2, h3, output_dim]
        loss_type: 'cross_entropy' or 'mse'
        dropout_prob: probability of dropping a neuron (applied in hidden layers)
        learning_rate: learning rate for gradient descent
        """
        self.layer_sizes = layer_sizes
        self.num_layers = len(layer_sizes) - 1  # number of weight layers
        self.loss_type = loss_type
        self.dropout_prob = dropout_prob
        self.learning_rate = learning_rate
        if seed is not None:
            np.random.seed(seed)
        self.params = {}
        # Initialize weights and biases (using He initialization for ReLU)
        for l in range(1, len(layer_sizes)):
            self.params['W' + str(l)] = np.random.randn(layer_sizes[l-1], layer_sizes[l]) * np.sqrt(2.0 / layer_sizes[l-1])
            self.params['b' + str(l)] = np.zeros((1, layer_sizes[l]))

    def forward(self, X, training=True):
        """
        Forward propagation through the network.
        Applies ReLU activation in hidden layers. For the output layer:
          - if using cross-entropy loss, applies softmax activation.
          - if using MSE loss, uses a linear activation.
        Also applies dropout in hidden layers during training.
        """
        caches = {}
        A = X
        caches['A0'] = X
        for l in range(1, self.num_layers + 1):
            W = self.params['W' + str(l)]
            b = self.params['b' + str(l)]
            Z = np.dot(A, W) + b
            caches['Z' + str(l)] = Z
            if l != self.num_layers:  # hidden layers
                A = relu(Z)
                if training and self.dropout_prob > 0:
                    # create dropout mask: neurons with value 0 are dropped
                    D = (np.random.rand(*A.shape) > self.dropout_prob).astype(float)
                    A = A * D / (1 - self.dropout_prob)  # scale activations to maintain expectation
                    caches['D' + str(l)] = D
            else:
                # output layer activation depends on loss function choice
                if self.loss_type == 'cross_entropy':
                    A = softmax(Z)
                else:
                    A = Z  # linear activation for MSE
            caches['A' + str(l)] = A
        return A, caches

    def compute_loss(self, y_true, y_pred):
        if self.loss_type == 'cross_entropy':
            return cross_entropy_loss(y_true, y_pred)
        else:
            return mse_loss(y_true, y_pred)

    def backward(self, X, y, caches):
        """
        Backpropagation through the network.
        Computes gradients for weights and biases.
        """
        grads = {}
        m = X.shape[0]
        L = self.num_layers
        A_final = caches['A' + str(L)]
        # Derivative at output layer:
        if self.loss_type == 'cross_entropy':
            dZ = A_final - y  # derivative for softmax + cross entropy
        else:
            dZ = 2 * (A_final - y) / m  # derivative for MSE with linear output

        for l in range(L, 0, -1):
            A_prev = caches['A' + str(l-1)]
            W = self.params['W' + str(l)]
            dW = np.dot(A_prev.T, dZ)
            db = np.sum(dZ, axis=0, keepdims=True)
            grads['dW' + str(l)] = dW
            grads['db' + str(l)] = db
            if l > 1:  # no need to propagate for input layer
                dA_prev = np.dot(dZ, W.T)
                # If dropout was applied in the previous layer, use the same mask for backprop.
                if self.dropout_prob > 0 and ('D' + str(l-1)) in caches:
                    dA_prev = dA_prev * caches['D' + str(l-1)] / (1 - self.dropout_prob)
                dZ = dA_prev * relu_derivative(caches['Z' + str(l-1)])
        return grads

    def update_params(self, grads):
        """Update weights and biases using gradient descent."""
        for l in range(1, self.num_layers + 1):
            self.params['W' + str(l)] -= self.learning_rate * grads['dW' + str(l)]
            self.params['b' + str(l)] -= self.learning_rate * grads['db' + str(l)]

    def train(self, X_train, y_train, X_val, y_val, batch_size, max_epochs=200, patience=10, verbose=False):
        """
        Trains the network using either stochastic (batch_size=1) or mini-batch gradient descent.
        Early stopping is applied based on validation loss.
        Returns the training history and the epoch number with the best validation loss.
        """
        best_val_loss = np.inf
        best_epoch = 0
        best_params = None
        n_train = X_train.shape[0]
        history = {'train_loss': [], 'val_loss': []}

        for epoch in range(1, max_epochs + 1):
            # Shuffle the training data
            indices = np.arange(n_train)
            np.random.shuffle(indices)
            X_train = X_train[indices]
            y_train = y_train[indices]
            epoch_loss = 0
            num_batches = int(np.ceil(n_train / batch_size))
            for i in range(num_batches):
                start = i * batch_size
                end = min(start + batch_size, n_train)
                X_batch = X_train[start:end]
                y_batch = y_train[start:end]
                # Forward propagation on the batch
                y_pred, caches = self.forward(X_batch, training=True)
                loss = self.compute_loss(y_batch, y_pred)
                epoch_loss += loss
                # Backpropagation
                grads = self.backward(X_batch, y_batch, caches)
                self.update_params(grads)
            epoch_loss /= num_batches

            # Compute validation loss (without dropout)
            y_val_pred, _ = self.forward(X_val, training=False)
            val_loss = self.compute_loss(y_val, y_val_pred)
            history['train_loss'].append(epoch_loss)
            history['val_loss'].append(val_loss)
            if verbose:
                print(f"Epoch {epoch:3d}: Train Loss = {epoch_loss:.4f}, Val Loss = {val_loss:.4f}")

            # Early stopping check
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_epoch = epoch
                best_params = {key: val.copy() for key, val in self.params.items()}
            elif epoch - best_epoch >= patience:
                if verbose:
                    print(f"Early stopping at epoch {epoch}. Best epoch was {best_epoch}")
                break

        # Restore best parameters
        self.params = best_params
        return history, best_epoch

    def predict(self, X):
        """
        Returns the predicted class labels.
        (For regression with MSE, we assume one-hot encoded targets and use argmax.)
        """
        y_pred, _ = self.forward(X, training=False)
        return np.argmax(y_pred, axis=1)

# -------------------------
# Data Preprocessing
# -------------------------
# Load dataset (adjust the file path if necessary)
data = pd.read_csv("./obesity_data.csv")

# Assume that all columns except the last are features and the last column is the label.
features = data.iloc[:, :-1].values
labels = data.iloc[:, -1].values

# Convert labels to numerical values and then to one-hot encoding.
# (If labels are already numeric, this step may be simplified.)
classes, labels_encoded = np.unique(labels, return_inverse=True)
num_classes = len(classes)
y_onehot = np.eye(num_classes)[labels_encoded]

numerical_features = data.select_dtypes(include=np.number).iloc[:, :-1].values # Select numerical features only, excluding the label column
categorical_features = data.select_dtypes(exclude=np.number).iloc[:, :-1] # Select categorical features only, excluding the label column

# Standardize features (zero mean, unit variance)
X_num = numerical_features.astype(float)
X_num_mean = X_num.mean(axis=0)
X_num_std = X_num.std(axis=0) + 1e-8
X_num = (X_num - X_num_mean) / X_num_std

# One-hot encode categorical features
X_cat = pd.get_dummies(categorical_features).values  # Use pandas get_dummies to one-hot encode

# Combine standardized numerical features and one-hot encoded categorical features
X = np.concatenate([X_num, X_cat], axis=1)

# Split data into training and validation sets (80/20 split)
indices = np.arange(X.shape[0])
np.random.shuffle(indices)
train_size = int(0.8 * X.shape[0])
train_idx = indices[:train_size]
val_idx = indices[train_size:]
X_train, X_val = X[train_idx], X[val_idx]
y_train, y_val = y_onehot[train_idx], y_onehot[val_idx]

# -------------------------
# Experiment Settings
# -------------------------
# Define two architectures:
# N1: 3 hidden layers, e.g., [input_dim, 64, 32, 16, output_dim]
# N2: 4 hidden layers, e.g., [input_dim, 64, 32, 16, 8, output_dim]
input_dim = X_train.shape[1]
output_dim = num_classes

architecture_N1 = [input_dim, 64, 32, 16, output_dim]
architecture_N2 = [input_dim, 64, 32, 16, 8, output_dim]

# Settings for dropout, learning rate, and training
dropout_prob = 0.2
learning_rate = 0.01
max_epochs = 200
patience = 10

# We will run experiments for both loss functions and both gradient descent strategies:
loss_options = ['cross_entropy', 'mse']
batch_options = {
    'stochastic': 1,    # stochastic gradient descent (one sample at a time)
    'mini-batch': 40    # mini-batch gradient descent with batch size 40
}

# -------------------------
# Run Experiments
# -------------------------
results = {}

for net_name, arch in zip(["N1 (3 hidden layers)", "N2 (4 hidden layers)"], [architecture_N1, architecture_N2]):
    for loss in loss_options:
        for grad_desc, batch_size in batch_options.items():
            exp_name = f"{net_name} | Loss: {loss} | {grad_desc}"
            print("Running experiment:", exp_name)
            nn = NeuralNetwork(layer_sizes=arch, loss_type=loss, dropout_prob=dropout_prob,
                               learning_rate=learning_rate, seed=42)
            history, best_epoch = nn.train(X_train, y_train, X_val, y_val, batch_size=batch_size,
                                           max_epochs=max_epochs, patience=patience, verbose=True)
            y_pred = nn.predict(X_val)
            y_true = np.argmax(y_val, axis=1)
            accuracy = np.mean(y_pred == y_true)
            results[exp_name] = {"Best Epoch": best_epoch,
                                 "Validation Accuracy": accuracy,
                                 "Final Train Loss": history['train_loss'][-1],
                                 "Final Val Loss": history['val_loss'][-1]}
            print(f"Experiment: {exp_name}")
            print(f"  Best Epoch: {best_epoch}")
            print(f"  Final Validation Loss: {history['val_loss'][-1]:.4f}")
            print(f"  Validation Accuracy: {accuracy*100:.2f}%")
            print("-" * 50)

# -------------------------
# Summary of Results
# -------------------------
print("\nSummary of all experiments:")
for exp_name, res in results.items():
    print(exp_name)
    print(f"  Best Epoch: {res['Best Epoch']}")
    print(f"  Final Validation Loss: {res['Final Val Loss']:.4f}")
    print(f"  Validation Accuracy: {res['Validation Accuracy']*100:.2f}%")
    print("-" * 50)


Running experiment: N1 (3 hidden layers) | Loss: cross_entropy | stochastic
Epoch   1: Train Loss = 0.9258, Val Loss = 0.4753
Epoch   2: Train Loss = 0.6058, Val Loss = 0.4299
Epoch   3: Train Loss = 0.5278, Val Loss = 0.2991
Epoch   4: Train Loss = 0.4416, Val Loss = 0.2482
Epoch   5: Train Loss = 0.4443, Val Loss = 0.2464
Epoch   6: Train Loss = 0.4107, Val Loss = 0.2088
Epoch   7: Train Loss = 0.3522, Val Loss = 0.3344
Epoch   8: Train Loss = 0.3340, Val Loss = 0.1403
Epoch   9: Train Loss = 0.3229, Val Loss = 0.1511
Epoch  10: Train Loss = 0.3053, Val Loss = 0.1416
Epoch  11: Train Loss = 0.2740, Val Loss = 0.1715
Epoch  12: Train Loss = 0.2620, Val Loss = 0.1753
Epoch  13: Train Loss = 0.2687, Val Loss = 0.1273
Epoch  14: Train Loss = 0.2397, Val Loss = 0.1682
Epoch  15: Train Loss = 0.2547, Val Loss = 0.2205
Epoch  16: Train Loss = 0.2322, Val Loss = 0.1037
Epoch  17: Train Loss = 0.2057, Val Loss = 0.1024
Epoch  18: Train Loss = 0.2417, Val Loss = 0.1311
Epoch  19: Train Loss = 