# # Multiclass Logistic Regression on the Iris Dataset
# 
# **Objective:** Implement a multiclass logistic regression classifier on the Iris dataset.
# 
# **Steps:**
# 1. Load and split the dataset.
# 2. Preprocess using three approaches:
#    - Unprocessed
#    - Normalized (Min-Max Scaling)
#    - Standardized (Z-score Normalization)
# 3. Define the softmax function and cross-entropy loss.
# 4. Implement training with both gradient descent (GD) and stochastic gradient descent (SGD).
# 5. Evaluate and visualize the training performance.

# ## 1. Import Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns  # For a prettier confusion matrix plot (optional)
import pandas as pd

# ## 2. Load the Iris Dataset
# 
# We load the dataset using `sklearn.datasets.load_iris()`, extract the four features and class labels.

In [None]:
# Fetch the bundled Iris data and pull out features and targets
iris = load_iris()
X, y = iris.data, iris.target  # X: (150, 4) feature matrix, y: (150,) class ids

# Sanity-check the dimensions and the set of class labels
for caption, value in (
    ("Features shape:", X.shape),
    ("Labels shape:", y.shape),
    ("Unique classes:", np.unique(y)),
):
    print(caption, value)

# ## 3. Preprocess the Data
# 
# We first split the data into training (60%) and testing (40%) sets. Then we define two preprocessing functions:
# 
# - **Normalization (Min-Max Scaling):** Rescales features to the range [0, 1].  
# - **Standardization (Z-score):** Transforms features to have mean 0 and standard deviation 1.

In [None]:
# Hold out 40% of the samples for testing. Stratifying on y keeps the class
# proportions equal in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
    stratify=y,
)

def normalize(X, reference=None):
    """Min-Max Normalization to [0,1]

    Parameters
    ----------
    X : array-like
        Data to rescale, one sample per row and one feature per column.
    reference : array-like, optional
        Data whose per-column minimum and maximum define the scaling.
        Defaults to ``X`` itself. Pass the training set here when scaling
        test data so both splits share the same transformation (values of
        ``X`` outside the reference range then fall outside [0, 1]).

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``X``.
    """
    X = np.asarray(X, dtype=float)
    source = X if reference is None else np.asarray(reference, dtype=float)
    col_min = source.min(axis=0)
    col_range = source.max(axis=0) - col_min
    # A constant column has zero range; dividing by 1 maps it to 0 instead of NaN.
    col_range = np.where(col_range == 0, 1.0, col_range)
    return (X - col_min) / col_range

def standardize(X, reference=None):
    """Z-score Standardization

    Parameters
    ----------
    X : array-like
        Data to rescale, one sample per row and one feature per column.
    reference : array-like, optional
        Data whose per-column mean and (population) standard deviation
        define the scaling. Defaults to ``X`` itself. Pass the training set
        here when scaling test data so both splits share the same
        transformation.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``X``.
    """
    X = np.asarray(X, dtype=float)
    source = X if reference is None else np.asarray(reference, dtype=float)
    col_mean = source.mean(axis=0)
    col_std = source.std(axis=0)
    # A constant column has zero spread; dividing by 1 maps it to 0 instead of NaN.
    col_std = np.where(col_std == 0, 1.0, col_std)
    return (X - col_mean) / col_std

# Create preprocessed versions of the data.
# The scaling statistics are taken from the training split only and then
# reused for the test split: scaling the test split with its own statistics
# would put train and test features on different scales and leak test-set
# information into preprocessing.
X_train_norm = normalize(X_train)
train_min = X_train.min(axis=0)
train_range = X_train.max(axis=0) - train_min
# Test values outside the training range land slightly outside [0, 1].
X_test_norm  = (X_test - train_min) / train_range

X_train_std = standardize(X_train)
train_mean = X_train.mean(axis=0)
train_std = X_train.std(axis=0)
X_test_std  = (X_test - train_mean) / train_std

# You can also train on the unprocessed data (X_train and X_test)

# ## 4. Define Helper Functions
# 
# We now define the following:
# 
# - **Softmax function:** Computes probabilities from logits.
# - **One-hot encoding:** Converts integer class labels into one-hot vectors.
# - **Categorical cross-entropy loss:** Computes the loss over m samples.


In [None]:
def softmax(z):
    """
    Compute the softmax of each row of the input z.

    z is an array-like of logits. A 2-D input is normalised row by row
    (along axis 1, which is also used for higher-dimensional input); a 1-D
    input is treated as a single row of logits.

    Returns a float array of the same shape whose entries along the
    normalised axis are non-negative and sum to 1.
    """
    z = np.asarray(z, dtype=float)
    axis = 1 if z.ndim > 1 else 0
    # subtract max for numerical stability (exp never sees a positive value)
    exp_z = np.exp(z - np.max(z, axis=axis, keepdims=True))
    return exp_z / np.sum(exp_z, axis=axis, keepdims=True)

def one_hot_encode(y, num_classes):
    """
    Convert label vector y to one-hot encoded matrix.

    y is a 1-D array-like of integer class indices in [0, num_classes).
    Returns a float array of shape (len(y), num_classes) with a single 1
    per row, in the column given by that row's label.

    Raises IndexError if any label lies outside [0, num_classes); without
    this check a negative label would silently wrap around to the last
    columns.
    """
    labels = np.asarray(y)
    m = labels.shape[0]
    y_encoded = np.zeros((m, num_classes))
    if m == 0:
        return y_encoded
    if labels.min() < 0 or labels.max() >= num_classes:
        raise IndexError(
            f"labels must lie in [0, {num_classes}); "
            f"got values from {labels.min()} to {labels.max()}"
        )
    y_encoded[np.arange(m), labels] = 1
    return y_encoded

def compute_loss(y_true, y_pred):
    """
    Return the categorical cross-entropy averaged over the samples.

    y_true holds the one-hot targets and y_pred the predicted
    probabilities; both have one row per sample.
    """
    eps = 1e-15  # keeps log() finite when a predicted probability is exactly 0
    log_probs = np.log(y_pred + eps)
    total = np.sum(y_true * log_probs)
    n_samples = y_true.shape[0]
    return -total / n_samples

# ## 5. Implement the Multiclass Logistic Regression Model
# 
# We create a class `LogisticRegressionMulti` that implements:
# 
# - **Batch Gradient Descent (GD):** The weight update is performed using all samples.
# - **Stochastic Gradient Descent (SGD):** The weight update is performed on one sample at a time.
# 
# Both methods track the training loss at every epoch.

In [None]:
class LogisticRegressionMulti:
    """Multiclass (softmax) logistic regression trained with GD or SGD.

    The model computes ``softmax(X @ W)`` with a single weight matrix and no
    separate intercept term.

    Parameters
    ----------
    learning_rate : float
        Step size applied to every weight update.
    epochs : int
        Number of passes over the training data.
    verbose : bool
        When true, print the training loss every 100 epochs.
    random_state : int or None
        Seed for the per-epoch shuffling in ``fit_sgd``. ``None`` (the
        default) draws from NumPy's global random state.

    Attributes
    ----------
    W : numpy.ndarray
        Weights of shape (n_features, n_classes); set by ``fit_gd`` / ``fit_sgd``.
    classes_ : numpy.ndarray or None
        Sorted distinct training labels; column k of ``W`` belongs to
        ``classes_[k]``. ``None`` until a fit method has run.
    loss_history : list of float
        Training loss recorded once per epoch by the last fit.
    """

    def __init__(self, learning_rate=0.01, epochs=1000, verbose=False,
                 random_state=None):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.verbose = verbose
        self.random_state = random_state
        self.classes_ = None
        self.loss_history = []

    def _start_training(self, X, y):
        """Reset weights and loss history; return (X as float array, one-hot targets)."""
        X = np.asarray(X, dtype=float)
        # Map arbitrary labels (not necessarily 0..K-1) to column indices 0..K-1.
        self.classes_, y_index = np.unique(np.ravel(y), return_inverse=True)
        num_classes = len(self.classes_)
        self.W = np.zeros((X.shape[1], num_classes))
        self.loss_history = []
        return X, one_hot_encode(y_index, num_classes)

    def fit_gd(self, X, y):
        """
        Train using Batch Gradient Descent.

        Every epoch performs one update from the gradient averaged over all
        samples. The loss stored for an epoch is the one measured before
        that epoch's update. Returns self.
        """
        X, y_encoded = self._start_training(X, y)
        m = X.shape[0]

        for epoch in range(self.epochs):
            logits = np.dot(X, self.W)
            probs = softmax(logits)
            loss = compute_loss(y_encoded, probs)
            self.loss_history.append(loss)

            # Gradient of the mean cross-entropy with respect to W
            grad = np.dot(X.T, (probs - y_encoded)) / m
            self.W -= self.learning_rate * grad

            if self.verbose and epoch % 100 == 0:
                print(f"[GD] Epoch {epoch}/{self.epochs} - Loss: {loss:.4f}")
        return self

    def fit_sgd(self, X, y):
        """
        Train using Stochastic Gradient Descent.

        Every epoch visits the samples once in a freshly shuffled order and
        updates the weights after each sample. The loss stored for an epoch
        is measured on the full training set after that epoch's updates.
        Returns self.
        """
        X, y_encoded = self._start_training(X, y)
        m = X.shape[0]

        # Without a seed keep using NumPy's global random state, so code that
        # calls np.random.seed(...) beforehand behaves exactly as before.
        if self.random_state is None:
            shuffle = np.random.shuffle
        else:
            shuffle = np.random.RandomState(self.random_state).shuffle

        for epoch in range(self.epochs):
            indices = np.arange(m)
            shuffle(indices)
            for i in indices:
                xi = X[i:i+1]  # shape (1, n)
                yi = y_encoded[i:i+1]  # shape (1, num_classes)
                probs = softmax(np.dot(xi, self.W))
                grad = np.dot(xi.T, (probs - yi))
                self.W -= self.learning_rate * grad
            # Compute loss at the end of epoch
            probs = softmax(np.dot(X, self.W))
            loss = compute_loss(y_encoded, probs)
            self.loss_history.append(loss)

            if self.verbose and epoch % 100 == 0:
                print(f"[SGD] Epoch {epoch}/{self.epochs} - Loss: {loss:.4f}")
        return self

    def predict(self, X):
        """
        Return the most probable class for every row of X.

        The result holds the original training labels when the model was
        fitted, or plain column indices when ``W`` was set by hand.
        """
        logits = np.dot(X, self.W)
        # softmax preserves the ordering within a row, so the largest logit
        # is also the largest probability.
        best = np.argmax(logits, axis=1)
        classes = getattr(self, "classes_", None)
        return best if classes is None else classes[best]

# ## 6. Train and Evaluate the Model
# 
# We now run experiments using both GD and SGD. In each case, we demonstrate training on (for example) normalized data.
# 
# You can repeat the following steps with:
# - The unprocessed data: `X_train, X_test`
# - The standardized data: `X_train_std, X_test_std`

# ### 6.1 Training with Batch Gradient Descent (GD) on Normalized Data

In [None]:
# Fit a softmax classifier with full-batch GD on the min-max scaled features
model_gd = LogisticRegressionMulti(learning_rate=0.1, epochs=1000, verbose=True)
model_gd.fit_gd(X_train_norm, y_train)

# Predict labels for the held-out samples
y_pred_gd = model_gd.predict(X_test_norm)

# Report accuracy, per-class metrics, and the confusion matrix
accuracy_gd = accuracy_score(y_test, y_pred_gd)
gd_report = classification_report(y_test, y_pred_gd)
gd_confusion = confusion_matrix(y_test, y_pred_gd)
print("\n[GD] Test Accuracy: {:.2f}%".format(accuracy_gd * 100))
print("\nClassification Report:\n", gd_report)
print("Confusion Matrix:\n", gd_confusion)

# ### 6.2 Visualize the Loss Curve for GD

In [None]:
# Plot the per-epoch training loss recorded during the GD run
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(model_gd.loss_history, label="GD Loss")
ax.set_xlabel("Epochs")
ax.set_ylabel("Cross-Entropy Loss")
ax.set_title("Training Loss Curve (Gradient Descent)")
ax.legend()
ax.grid(True)
plt.show()

# ### 6.3 Training with Stochastic Gradient Descent (SGD) on Normalized Data

In [None]:
# Fit a softmax classifier with per-sample SGD on the min-max scaled features
model_sgd = LogisticRegressionMulti(learning_rate=0.01, epochs=1000, verbose=True)
model_sgd.fit_sgd(X_train_norm, y_train)

# Predict labels for the held-out samples
y_pred_sgd = model_sgd.predict(X_test_norm)

# Report accuracy, per-class metrics, and the confusion matrix
accuracy_sgd = accuracy_score(y_test, y_pred_sgd)
sgd_report = classification_report(y_test, y_pred_sgd)
sgd_confusion = confusion_matrix(y_test, y_pred_sgd)
print("\n[SGD] Test Accuracy: {:.2f}%".format(accuracy_sgd * 100))
print("\nClassification Report:\n", sgd_report)
print("Confusion Matrix:\n", sgd_confusion)

# ### 6.4 Visualize the Loss Curve for SGD

In [None]:
# Plot the per-epoch training loss recorded during the SGD run
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(model_sgd.loss_history, label="SGD Loss", color='orange')
ax.set_xlabel("Epochs")
ax.set_ylabel("Cross-Entropy Loss")
ax.set_title("Training Loss Curve (Stochastic Gradient Descent)")
ax.legend()
ax.grid(True)
plt.show()