## Data Loading

In [2]:
import pandas as pd
import numpy as np
import re
import pickle # To save vocabulary
from collections import Counter

# --- Load Data ---
# Raw CSV files for the train and test splits, hosted on GitHub
train_url = 'https://raw.githubusercontent.com/islnlp/Advance-NLP-assingment-/main/Corona_NLP_train.csv'
test_url = 'https://raw.githubusercontent.com/islnlp/Advance-NLP-assingment-/main/Corona_NLP_test.csv'

# Decode as latin1 explicitly instead of relying on the pandas default
# (presumably the files are not valid UTF-8 -- TODO confirm)
df_train = pd.read_csv(train_url, encoding='latin1')
df_test = pd.read_csv(test_url, encoding='latin1')

# Column dtypes / non-null counts, followed by the first rows of the training split
print("--- Training Data Info ---")
df_train.info()
print("\n--- Sample Training Data ---")
print(df_train.head())

# --- Basic Exploration ---
# Row/column counts of both splits and the class balance of the training labels
print(f"\nTraining data shape: {df_train.shape}")
print(f"Test data shape: {df_test.shape}")
print(f"\nSentiment distribution in training data:\n{df_train['Sentiment'].value_counts()}")

--- Training Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB

--- Sample Training Data ---
   UserName  ScreenName   Location     TweetAt  \
0      3799       48751     London  16-03-2020   
1      3800       48752         UK  16-03-2020   
2      3801       48753  Vagabonds  16-03-2020   
3      3802       48754        NaN  16-03-2020   
4      3803       48755        NaN  16-03-2020   

                                       OriginalTweet           Sentiment  
0  @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...             

In [3]:
# Distinct sentiment labels present in the training split
# (Series.unique() keeps them in order of first appearance, not sorted)
classes = df_train['Sentiment'].unique()
print("Available Classes:", classes)

Available Classes: ['Neutral' 'Positive' 'Extremely Negative' 'Negative' 'Extremely Positive']


## Part 1. Data Preprocessing Step

### 1. Text Cleaning and Tokenization

In [4]:
def preprocess_text(text):
    """
    Cleans and tokenizes a single text string.

    Steps, in order: strip URLs, HTML tags and Twitter handles, drop every
    character that is not an ASCII letter or whitespace (digits and
    punctuation included), lowercase, then split on whitespace.

    Args:
        text: The raw tweet. Non-string values (e.g. None, or the float NaN
            pandas uses for a missing cell) are treated as empty text.

    Returns:
        list[str]: Lowercase alphabetic tokens; empty if nothing survives
        the cleaning.
    """
    # re.sub raises TypeError on non-strings, so a single missing cell would
    # abort a whole .apply(); treat such values as an empty tweet instead.
    if not isinstance(text, str):
        return []

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove Twitter handles
    text = re.sub(r'@\w+', '', text)
    # Remove everything except ASCII letters and whitespace. Characters are
    # deleted, not replaced by a space, so "it's" becomes "its".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize by splitting on whitespace
    return text.split()

# Tokenize every tweet in both splits; the result is stored as a list of
# strings per row in a new 'tokens' column
df_train['tokens'] = df_train['OriginalTweet'].apply(preprocess_text)
df_test['tokens'] = df_test['OriginalTweet'].apply(preprocess_text)

# Show raw tweets next to their token lists as a sanity check
print("\n--- Sample Processed Tokens ---")
print(df_train[['OriginalTweet', 'tokens']].head())


--- Sample Processed Tokens ---
                                       OriginalTweet  \
0  @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...   
1  advice Talk to your neighbours family to excha...   
2  Coronavirus Australia: Woolworths to give elde...   
3  My food stock is not the only one which is emp...   
4  Me, ready to go at supermarket during the #COV...   

                                              tokens  
0                                         [and, and]  
1  [advice, talk, to, your, neighbours, family, t...  
2  [coronavirus, australia, woolworths, to, give,...  
3  [my, food, stock, is, not, the, only, one, whi...  
4  [me, ready, to, go, at, supermarket, during, t...  


In [5]:
# Number of tokens in each cleaned training tweet (len of the token list)
df_train['token_len'] = df_train['tokens'].apply(len)

In [6]:
# Summary statistics of the per-tweet token counts, including the upper
# percentiles (90/95/98/99) alongside the default min/median/max
print("\n--- Analysis of Tweet Lengths ---")
print(df_train['token_len'].describe(percentiles=[0.90, 0.95, 0.98, 0.99]))


--- Analysis of Tweet Lengths ---
count    41157.000000
mean        28.770051
std         11.526695
min          0.000000
50%         30.000000
90%         43.000000
95%         46.000000
98%         49.000000
99%         51.000000
max         62.000000
Name: token_len, dtype: float64


### 2. Vocabulary Building

In [7]:
def build_vocab(token_lists, max_vocab_size=10000):
    """
    Creates a token -> integer id lookup from tokenized documents.

    Ids 0 and 1 are reserved for '<PAD>' (padding) and '<UNK>' (unknown
    word). The remaining ids are handed out to the most frequent tokens in
    descending order of frequency, so at most `max_vocab_size` entries are
    returned in total.

    Args:
        token_lists: Iterable of token sequences.
        max_vocab_size: Upper bound on the vocabulary size, including the two
            reserved tokens.

    Returns:
        dict: Mapping from token to its integer id.
    """
    # Tally how often every token occurs across all documents
    frequencies = Counter()
    for tokens in token_lists:
        for word in tokens:
            frequencies[word] += 1

    # Two slots are taken by the special tokens, the rest go to frequent words
    specials = ['<PAD>', '<UNK>']
    frequent = [word for word, _ in frequencies.most_common(max_vocab_size - 2)]

    return {token: index for index, token in enumerate(specials + frequent)}

# Build the vocabulary from the training tokens only (the test split is not
# passed in), using build_vocab's default size limit
vocab = build_vocab(df_train['tokens'])
print(f"\nVocabulary size: {len(vocab)}")

# Persist the token -> id mapping to vocab.pkl in the working directory
with open('vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)


Vocabulary size: 10000


### 3. Text Encoding and Padding

In [8]:
def encode_and_pad(tokens, word_to_idx, max_len=60):
    """
    Converts tokens to vocabulary ids and forces the result to `max_len` items.

    Tokens missing from `word_to_idx` are replaced by the '<UNK>' id. Short
    sequences are right-padded with the '<PAD>' id; long ones keep only their
    first `max_len` ids.

    Args:
        tokens: Sequence of string tokens.
        word_to_idx: Token -> id mapping containing '<PAD>' and '<UNK>'.
        max_len: Length of the returned sequence.

    Returns:
        list[int]: The encoded sequence.
    """
    sequence = []
    for token in tokens:
        sequence.append(word_to_idx.get(token, word_to_idx['<UNK>']))

    shortfall = max_len - len(sequence)
    if shortfall > 0:
        # Too short: fill the tail with the padding id
        sequence.extend([word_to_idx['<PAD>']] * shortfall)
        return sequence

    # Long enough already: cut off anything beyond max_len
    return sequence[:max_len]

# Fixed sequence length for every encoded tweet. It is passed explicitly
# below, so encode_and_pad's own default max_len is not used here.
MAX_SEQUENCE_LENGTH = 50

# Encode both splits with the training vocabulary, padding/truncating each
# token list to MAX_SEQUENCE_LENGTH ids
df_train['encoded'] = df_train['tokens'].apply(lambda x: encode_and_pad(x, vocab, MAX_SEQUENCE_LENGTH))
df_test['encoded'] = df_test['tokens'].apply(lambda x: encode_and_pad(x, vocab, MAX_SEQUENCE_LENGTH))

# Show the first training example before and after encoding
print("\n--- Sample Encoded and Padded Sequence ---")
print(f"Original Tokens: {df_train['tokens'].iloc[0]}")
print(f"Encoded Sequence: {df_train['encoded'].iloc[0]}")


--- Sample Encoded and Padded Sequence ---
Original Tokens: ['and', 'and']
Encoded Sequence: [4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### 4. Label Encoding

In [9]:
# Mapping from sentiment string to integer class id, ordered from the most
# negative (0) to the most positive (4) sentiment
label_map = {
    'Extremely Negative': 0,
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3,
    'Extremely Positive': 4
}
# And the reverse mapping for interpretation later
idx_to_label = {v: k for k, v in label_map.items()}

df_train['label'] = df_train['Sentiment'].map(label_map)
df_test['label'] = df_test['Sentiment'].map(label_map)

# Series.map leaves NaN for any value that is not a key of label_map, which
# would silently turn the label column into floats and break the integer
# indexing done by the loss later on. Fail here, where the cause is obvious.
assert df_train['label'].notna().all(), "Unmapped Sentiment values in training data"
assert df_test['label'].notna().all(), "Unmapped Sentiment values in test data"

print("\n--- Label Mapping ---")
print(label_map)


--- Label Mapping ---
{'Extremely Negative': 0, 'Negative': 1, 'Neutral': 2, 'Positive': 3, 'Extremely Positive': 4}


### 5. Final Data Preparation

In [10]:
# Convert final data into NumPy arrays: each 'encoded' cell is a list of ids,
# so stacking them gives one row per tweet; labels become a 1-D array
X_train = np.array(df_train['encoded'].tolist())
y_train = np.array(df_train['label'].tolist())

X_test = np.array(df_test['encoded'].tolist())
y_test = np.array(df_test['label'].tolist())

# Report the resulting array shapes for both splits
print(f"\nFinal training data shape: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Final testing data shape: X_test={X_test.shape}, y_test={y_test.shape}")


Final training data shape: X_train=(41157, 50), y_train=(41157,)
Final testing data shape: X_test=(3798, 50), y_test=(3798,)


## Part 2. Building Essentials for Model (Layers, Activations, Optimizers, Loss Function)

### 1. Base Layer and Activation Classes

In [11]:
class Layer:
    """
    Common interface shared by the network layers.

    Every layer owns two dictionaries keyed by parameter name: `params` for
    the trainable arrays and `grads` for the matching gradients. Subclasses
    must override `forward` and `backward`.
    """
    def __init__(self):
        # Trainable arrays and their gradients, both keyed by parameter name
        self.params, self.grads = {}, {}

    def forward(self, inputs):
        """Computes the layer output; must be provided by the subclass."""
        raise NotImplementedError

    def backward(self, grad):
        """Propagates `grad` back through the layer; must be provided by the subclass."""
        raise NotImplementedError

class Tanh:
    """Element-wise hyperbolic tangent activation."""
    def forward(self, x):
        """Returns tanh(x) and keeps the result for the backward pass."""
        activated = np.tanh(x)
        self.y = activated
        return activated

    def backward(self, grad):
        """Scales the incoming gradient by d tanh / dx = 1 - tanh(x)^2."""
        local_slope = 1 - self.y**2
        return grad * local_slope

class Sigmoid:
    """Sigmoid activation function."""
    def forward(self, x):
        """
        Returns 1 / (1 + exp(-x)) element-wise and caches it for backward.

        The naive formula evaluates exp(-x), which overflows (RuntimeWarning)
        for large negative x. Here only exp(-|x|) <= 1 is ever computed:
            x >= 0:  1 / (1 + exp(-x))
            x <  0:  exp(x) / (1 + exp(x))
        Both branches are the same function, so the result is unchanged up to
        floating-point rounding.
        """
        z = np.exp(-np.abs(x))
        self.y = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
        return self.y

    def backward(self, grad):
        """Scales the incoming gradient by the sigmoid derivative y * (1 - y)."""
        return grad * self.y * (1 - self.y)

### 2. Core Network Layers

#### Embedding Layer

In [12]:
class Embedding(Layer):
    """
    Embedding layer: turns non-negative integer indices into dense vectors of
    fixed size by looking up rows of a trainable (vocab_size, embed_dim) matrix.
    e.g. with embed_dim=2: [[4], [20]] -> [[[0.25, 0.1]], [[0.6, -0.2]]]
    """
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        # Xavier Glorot initialization
        limit = np.sqrt(6 / (vocab_size + embed_dim))
        self.params['W'] = np.random.uniform(-limit, limit, (vocab_size, embed_dim))
        self.grads['W'] = np.zeros_like(self.params['W'])

    def forward(self, inputs):
        """
        Inputs shape: (batch_size, seq_len), integer indices into the table
        Output shape: (batch_size, seq_len, embed_dim)
        """
        # Keep the indices: backward needs to know which rows were used
        self.inputs = inputs
        return self.params['W'][inputs]

    def backward(self, grad):
        """
        Gradient shape: (batch_size, seq_len, embed_dim)

        Stores in grads['W'] the gradient of this call only and returns None,
        because the integer inputs have no gradient.
        """
        # Start from zeros on every call. Accumulating into the previous
        # grads['W'] would carry gradients over from earlier batches unless
        # the caller zeroed them first, while every other layer in this file
        # overwrites its gradients on each backward pass.
        grad_W = np.zeros_like(self.params['W'])

        # Each row of W receives the sum of the gradients of all positions
        # where that index appeared. np.add.at performs unbuffered addition,
        # so an index occurring several times in the batch is added each time
        # (a plain `grad_W[self.inputs] += grad` would count it only once).
        np.add.at(grad_W, self.inputs, grad)
        self.grads['W'] = grad_W
        return None # This is the first layer, so no gradient to pass back

#### Dense (Fully-Connected) Layer

In [13]:
class Dense(Layer):
    """
    A fully-connected layer computing `inputs @ W + b` over the last axis.
    Accepts inputs with any number of leading dimensions (2D, 3D, ...).
    """
    def __init__(self, input_size, output_size):
        super().__init__()
        # Xavier Glorot initialization
        limit = np.sqrt(6 / (input_size + output_size))
        self.params['W'] = np.random.uniform(-limit, limit, (input_size, output_size))
        self.params['b'] = np.zeros(output_size)

    def forward(self, inputs):
        """
        Inputs shape: (batch_size, ..., input_size)
        Output shape: (batch_size, ..., output_size)
        """
        # Cached for the weight gradient in backward
        self.inputs = inputs
        return inputs @ self.params['W'] + self.params['b']

    def backward(self, grad):
        """
        grad shape: (batch_size, ..., output_size), matching the forward output.

        Stores the gradients for 'W' and 'b' in self.grads and returns the
        gradient w.r.t. the inputs, shaped like the forward inputs.
        """
        # For the bias gradient, sum over all dimensions except the last one (features).
        sum_axes = tuple(range(grad.ndim - 1))
        self.grads['b'] = np.sum(grad, axis=sum_axes)

        # For the weight gradient we need (D_in, N) @ (N, D_out). Collapsing
        # every leading dimension into a single N gives the same result as the
        # dedicated 2D and 3D code paths and also covers other ranks.
        d_in = self.inputs.shape[-1]
        d_out = grad.shape[-1]
        inputs_2d = self.inputs.reshape(-1, d_in)
        grad_2d = grad.reshape(-1, d_out)
        self.grads['W'] = inputs_2d.T @ grad_2d

        # The gradient w.r.t input is a matmul over the last axis only
        return grad @ self.params['W'].T

### 3. Loss Function

In [14]:
class SoftmaxCrossEntropy:
    """
    Computes the cross-entropy loss after applying softmax.
    y_pred is expected to be logits (raw outputs from the last dense layer).
    y_true is expected to be integer class labels.
    """
    def forward(self, y_pred, y_true):
        """
        y_pred shape: (batch_size, num_classes)
        y_true shape: (batch_size,)

        Returns the mean negative log-likelihood of the true classes and
        caches the softmax probabilities in `self.probs` for backward().
        """
        self.y_true = y_true
        batch_size = y_pred.shape[0]

        # Work in log space: log_softmax = shifted - log(sum(exp(shifted))).
        # Subtracting the row max keeps exp() from overflowing, and taking
        # the log of the *sum* (always >= 1 after the shift) instead of the
        # log of an individual probability avoids log(0) = -inf when a
        # probability underflows to zero.
        shifted = y_pred - np.max(y_pred, axis=1, keepdims=True)
        log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        self.probs = np.exp(log_probs)

        # Pick the log-probability of the correct class in every row
        rows = np.arange(batch_size)
        loss = -np.sum(log_probs[rows, y_true]) / batch_size
        return loss

    def backward(self):
        """
        Calculates the gradient of the loss with respect to y_pred (the logits).
        Must be called after forward(); returns an array shaped like y_pred.
        """
        batch_size = self.probs.shape[0]
        grad = self.probs.copy()

        # d loss / d logits = probs - one_hot(y_true), averaged over the batch
        grad[np.arange(batch_size), self.y_true] -= 1
        grad /= batch_size
        return grad

### 4. The Adam Optimizer


In [15]:
class Adam:
    """
    Adam optimizer with optional per-parameter gradient clipping.

    Operates on a list of layers. Layers without a `params` attribute are
    skipped; for the others, each entry of `layer.params` that has a matching
    entry in `layer.grads` is updated in place.
    """
    def __init__(self, layers, learning_rate=0.001, beta1=0.9, beta2=0.999,
                 epsilon=1e-8, clip_value=1.0, clip_norm=None):
        self.layers = layers
        self.lr = learning_rate
        self.beta1, self.beta2 = beta1, beta2
        self.epsilon = epsilon
        self.clip_value = clip_value  # Element-wise clipping bound (None disables it)
        self.clip_norm = clip_norm    # Per-array norm bound (None disables it)
        self.t = 0                    # Number of steps taken so far

        # First (m) and second (v) moment estimates, one zero array per
        # parameter, keyed by 'layer_<position>_<parameter name>'
        self.m = {}
        self.v = {}
        for index, layer in enumerate(self.layers):
            if not hasattr(layer, 'params'):
                continue
            for name, value in layer.params.items():
                slot = f'layer_{index}_{name}'
                self.m[slot] = np.zeros_like(value)
                self.v[slot] = np.zeros_like(value)

    def _clip_gradients(self, grads):
        """
        Returns `grads` limited by the configured clipping rules.

        Value clipping is applied first (each element bounded to
        [-clip_value, clip_value]); norm clipping then rescales the array if
        its norm is still above `clip_norm`.
        """
        clipped = grads
        if self.clip_value is not None:
            clipped = np.clip(clipped, -self.clip_value, self.clip_value)

        if self.clip_norm is not None:
            norm = np.linalg.norm(clipped)
            if norm > self.clip_norm:
                clipped = clipped * (self.clip_norm / (norm + self.epsilon))

        return clipped

    def step(self):
        """
        Applies one Adam update to every parameter that has a gradient.
        """
        self.t += 1

        # Bias-correction denominators only depend on the step count
        first_correction = 1 - self.beta1**self.t
        second_correction = 1 - self.beta2**self.t

        for index, layer in enumerate(self.layers):
            if not hasattr(layer, 'params'):
                continue

            for name, weights in layer.params.items():
                if name not in layer.grads:
                    continue
                slot = f'layer_{index}_{name}'

                # Clip a copy so the layer's stored gradient stays untouched
                g = self._clip_gradients(layer.grads[name].copy())

                # Exponential moving averages of the gradient and its square
                self.m[slot] = self.beta1 * self.m[slot] + (1 - self.beta1) * g
                self.v[slot] = self.beta2 * self.v[slot] + (1 - self.beta2) * (g**2)

                # Undo the bias towards zero caused by the zero initialisation
                m_hat = self.m[slot] / first_correction
                v_hat = self.v[slot] / second_correction

                # In-place subtraction: the layer's own array is modified
                weights -= self.lr * m_hat / (np.sqrt(v_hat) + self.epsilon)

    def zero_grad(self):
        """
        Replaces every stored gradient with a zero array of the same shape.
        """
        for layer in self.layers:
            if not hasattr(layer, 'grads'):
                continue
            for name in layer.grads:
                layer.grads[name] = np.zeros_like(layer.grads[name])

### 5. Classification Report Function

In [16]:
import numpy as np

def classification_report_from_scratch(y_true, y_pred, class_names=None):
    """
    Generates and prints a text report showing the main classification metrics:
    per-class precision / recall / F1 / support, overall accuracy, and the
    micro, macro and weighted averages.

    Args:
        y_true: 1-D sequence of ground-truth labels (list or array).
        y_pred: 1-D sequence of predicted labels, same length as y_true.
        class_names: Optional display names. If there is exactly one name per
            label that occurs in the data, names are matched to the sorted
            labels by position. Otherwise, when the labels are integers that
            are valid indices into class_names, each row uses
            class_names[label], so classes absent from the data do not shift
            the names of the others.

    Returns:
        None. The report is printed to stdout.
    """
    # Work on arrays: with plain lists `y_true == y_pred` is a single bool
    # (whole-list equality), which would make the accuracy below wrong.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Get unique class labels, sorted for consistent ordering
    unique_labels = sorted(np.unique(np.concatenate((y_true, y_pred))))

    if class_names is None:
        class_names = [f"Class {label}" for label in unique_labels]

    # Decide which name goes on which row (see the docstring)
    row_names = list(class_names)
    labels_are_indices = len(row_names) != len(unique_labels) and all(
        isinstance(label, (int, np.integer)) and 0 <= label < len(row_names)
        for label in unique_labels
    )
    if labels_are_indices:
        row_names = [row_names[int(label)] for label in unique_labels]

    max_name_len = max([len(name) for name in class_names] + [len("weighted avg")])

    # --- Print Header ---
    header = (
        f"{'':<{max_name_len}}  "
        f"{'precision':>10}  {'recall':>10}  {'f1-score':>10}  {'support':>10}\n\n"
    )
    report_str = header

    # Initialize for calculations
    precisions, recalls, f1_scores, supports = [], [], [], []
    total_tp, total_fp, total_fn = 0, 0, 0

    # Small constant keeping the divisions defined when a class has no
    # predictions or no true samples
    epsilon = 1e-8

    # --- Calculate and format metrics for each class ---
    for i, label in enumerate(unique_labels):
        true_class = (y_true == label)
        pred_class = (y_pred == label)

        tp = np.sum(true_class & pred_class)
        fp = np.sum(~true_class & pred_class)
        fn = np.sum(true_class & ~pred_class)

        support = np.sum(true_class)

        # Aggregate totals for micro-average calculation
        total_tp += tp
        total_fp += fp
        total_fn += fn

        precision = tp / (tp + fp + epsilon)
        recall = tp / (tp + fn + epsilon)
        f1_score = 2 * (precision * recall) / (precision + recall + epsilon)

        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1_score)
        supports.append(support)

        report_str += (
            f"{row_names[i]:<{max_name_len}}  "
            f"{precision:>10.2f}  "
            f"{recall:>10.2f}  "
            f"{f1_score:>10.2f}  "
            f"{support:>10}\n"
        )

    report_str += "\n"
    total_samples = np.sum(supports)

    # --- Calculate and format overall and average metrics ---
    accuracy = np.sum(y_true == y_pred) / total_samples

    # Accuracy
    report_str += (
        f"{'accuracy':<{max_name_len}}  "
        f"{'':>10}  {'':>10}  "
        f"{accuracy:>10.2f}  {total_samples:>10}\n"
    )

    # --- Micro Average Calculation ---
    # Note: In multiclass, micro-precision, micro-recall, and micro-f1 are all equal to accuracy.
    micro_precision = total_tp / (total_tp + total_fp + 1e-8)
    micro_recall = total_tp / (total_tp + total_fn + 1e-8)
    micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall + 1e-8)
    report_str += (
        f"{'micro avg':<{max_name_len}}  "
        f"{micro_precision:>10.2f}  "
        f"{micro_recall:>10.2f}  "
        f"{micro_f1:>10.2f}  "
        f"{total_samples:>10}\n"
    )

    # Macro average: unweighted mean over classes
    report_str += (
        f"{'macro avg':<{max_name_len}}  "
        f"{np.mean(precisions):>10.2f}  "
        f"{np.mean(recalls):>10.2f}  "
        f"{np.mean(f1_scores):>10.2f}  "
        f"{total_samples:>10}\n"
    )

    # Weighted average: mean over classes weighted by their support
    report_str += (
        f"{'weighted avg':<{max_name_len}}  "
        f"{np.sum(np.array(precisions) * np.array(supports)) / total_samples:>10.2f}  "
        f"{np.sum(np.array(recalls) * np.array(supports)) / total_samples:>10.2f}  "
        f"{np.sum(np.array(f1_scores) * np.array(supports)) / total_samples:>10.2f}  "
        f"{total_samples:>10}\n"
    )

    print(report_str)

### 6. Dropout Layer

In [17]:
class Dropout(Layer):
    """
    Implements the Dropout layer for regularization (inverted dropout).
    """
    def __init__(self, p=0.5):
        """
        Args:
            p: Probability of dropping each element, in [0, 1).

        Raises:
            ValueError: If p is outside [0, 1). p == 1 would divide by zero
                when scaling the kept activations.
        """
        super().__init__()
        if not 0 <= p < 1:
            raise ValueError(f"Dropout probability must be in [0, 1), got {p}")
        self.p = p
        self.mask = None

    def forward(self, x, training=True):
        """
        Applies dropout during training.
        During evaluation (training=False), it returns x unchanged.
        """
        if training:
            # Create a mask and apply inverted dropout: kept elements are
            # scaled by 1/(1-p) so the expected activation stays the same
            # and no rescaling is needed at evaluation time.
            self.mask = (np.random.rand(*x.shape) > self.p) / (1 - self.p)
            return x * self.mask

        # Forget any mask from an earlier training pass so that backward()
        # cannot apply it to the gradients of this (identity) forward pass.
        self.mask = None
        return x

    def backward(self, grad):
        """
        Applies the mask of the last forward pass to the gradients.
        If that pass ran with training=False (or no pass ran yet), the layer
        acted as the identity and the gradient is returned unchanged.
        """
        if self.mask is None:
            return grad
        # Gradients only flow through the neurons that were not dropped out
        return grad * self.mask

### 7. Save the Model

In [18]:
# --- Saving Weights while Training ---

def save_weights(model, path):
    """
    Saves model weights to a NumPy .npz archive.

    Every parameter is stored under the key 'layer_<position>_<name>', where
    <position> is the layer's index in `model.layers`. Layers without a
    `params` attribute (e.g. plain activation objects) are skipped instead
    of raising AttributeError.

    Args:
        model: Object exposing a `layers` list.
        path: Destination file path passed to np.savez.
    """
    weights = {}
    for i, layer in enumerate(model.layers):
        # Parameter-free layers contribute nothing but still keep their
        # position, so the indices in the keys match model.layers
        for key, param in getattr(layer, 'params', {}).items():
            weights[f'layer_{i}_{key}'] = param
    np.savez(path, **weights)
    print(f"Weights saved to {path}")

## Part 3. Model 1 - RNN from scratch

### 1. Simple RNN

In [19]:
import numpy as np

# Note: The classes from Part 2 (Layer, Tanh, Embedding, Dense, etc.) are assumed to be defined.

class SimpleRNN(Layer):
    """
    Single-layer vanilla RNN with a tanh non-linearity:
        h_t = tanh(x_t @ W_xh + h_{t-1} @ W_hh + b_h),   h_0 = 0
    Only the last hidden state is returned by the forward pass.
    """
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Input weights: normal with std sqrt(2 / (fan_in + fan_out)).
        # Recurrent weights: small fixed scale of 0.01.
        input_scale = np.sqrt(2.0 / (input_size + hidden_size))
        self.params['W_xh'] = np.random.randn(input_size, hidden_size) * input_scale
        self.params['W_hh'] = np.random.randn(hidden_size, hidden_size) * 0.01
        self.params['b_h'] = np.zeros(hidden_size)

    def forward(self, inputs):
        """
        Runs the recurrence over the whole sequence.

        inputs: (batch_size, seq_len, input_size)
        returns: (batch_size, hidden_size), the hidden state after the last step
        """
        self.inputs = inputs
        n_samples, n_steps, _ = inputs.shape
        W_xh, W_hh, b_h = self.params['W_xh'], self.params['W_hh'], self.params['b_h']

        # h_states[:, k] holds h_k, so slot 0 is the all-zero initial state;
        # tanh_inputs[:, k] holds the pre-activation that produced h_{k+1}
        self.h_states = np.zeros((n_samples, n_steps + 1, self.hidden_size))
        self.tanh_inputs = np.zeros((n_samples, n_steps, self.hidden_size))

        h = self.h_states[:, 0, :]
        for step in range(n_steps):
            pre_activation = inputs[:, step, :] @ W_xh + h @ W_hh + b_h
            h = np.tanh(pre_activation)
            self.tanh_inputs[:, step, :] = pre_activation
            self.h_states[:, step + 1, :] = h

        return h

    def backward(self, grad):
        """
        Backpropagation through time from the final hidden state.

        grad: (batch_size, hidden_size), gradient w.r.t. the returned state
        returns: (batch_size, seq_len, input_size), gradient w.r.t. the inputs
        Parameter gradients are summed over all time steps into self.grads.
        """
        _, n_steps, _ = self.inputs.shape
        W_xh, W_hh = self.params['W_xh'], self.params['W_hh']

        # Fresh accumulators; the in-place += below updates the arrays that
        # are stored in self.grads
        dW_xh = self.grads['W_xh'] = np.zeros_like(W_xh)
        dW_hh = self.grads['W_hh'] = np.zeros_like(W_hh)
        db_h = self.grads['b_h'] = np.zeros_like(self.params['b_h'])
        grad_inputs = np.zeros_like(self.inputs)

        # Gradient arriving at h_{step+1}; starts as the upstream gradient
        upstream = grad
        for step in range(n_steps - 1, -1, -1):
            h_after = self.h_states[:, step + 1, :]
            h_before = self.h_states[:, step, :]

            # Through tanh: d tanh(z)/dz = 1 - tanh(z)^2
            d_pre = (1 - h_after**2) * upstream

            db_h += np.sum(d_pre, axis=0)
            dW_xh += self.inputs[:, step, :].T @ d_pre
            dW_hh += h_before.T @ d_pre

            grad_inputs[:, step, :] = d_pre @ W_xh.T

            # What reaches the previous hidden state becomes next iteration's upstream
            upstream = d_pre @ W_hh.T

        return grad_inputs

### 2. Assembling the Full RNN Classifier

In [20]:
class RNNClassifier:
    """
    Sequence classifier: Embedding -> SimpleRNN -> Dropout -> Dense.

    Takes batches of token-id sequences and produces one row of class logits
    per sequence.
    """
    def __init__(self, vocab_size, embed_dim, hidden_size, num_classes, dropout_p=0.3):
        self.embedding = Embedding(vocab_size, embed_dim)
        self.rnn = SimpleRNN(embed_dim, hidden_size)
        self.dropout = Dropout(dropout_p)
        self.dense = Dense(hidden_size, num_classes)
        # Same layers in forward order, e.g. for the optimizer
        self.layers = [self.embedding, self.rnn, self.dropout, self.dense]

    def forward(self, inputs, training=True):
        """
        Computes the logits for a batch of inputs.
        `training` is forwarded to the dropout layer only.
        """
        embedded = self.embedding.forward(inputs)
        last_hidden = self.rnn.forward(embedded)
        regularized = self.dropout.forward(last_hidden, training=training)
        return self.dense.forward(regularized)

    def backward(self, grad):
        """
        Sends the gradient of the logits back through the layers in reverse
        order, filling each layer's `grads`. Returns nothing.
        """
        d_regularized = self.dense.backward(grad)
        d_hidden = self.dropout.backward(d_regularized)
        d_embedded = self.rnn.backward(d_hidden)
        self.embedding.backward(d_embedded)

    def get_params(self):
        """Returns all parameter arrays as one flat list, in layer order."""
        return [value for layer in self.layers for value in layer.params.values()]

    def get_grads(self):
        """Returns all gradient arrays as one flat list, in layer order."""
        return [value for layer in self.layers for value in layer.grads.values()]

### 3. Training RNN and Weight Download

In [21]:
# import os
# import requests
# import io
# from tqdm import tqdm # For a nice progress bar


# # --- Model and Training Hyperparameters ---

# VOCAB_SIZE = len(vocab)
# EMBED_DIM = 100 # As required by the assignment
# HIDDEN_SIZE = 256
# NUM_CLASSES = 5
# EPOCHS = 10 # A few epochs for demonstration
# BATCH_SIZE = 32
# LEARNING_RATE = 0.0005

# # --- Initialization ---

# model = RNNClassifier(VOCAB_SIZE, EMBED_DIM, HIDDEN_SIZE, NUM_CLASSES)
# loss_fn = SoftmaxCrossEntropy()
# optimizer = Adam(
#     model.layers,
#     learning_rate=0.0005,
#     beta1=0.9,
#     beta2=0.999,
#     epsilon=1e-8,
#     clip_value=1.0,      # Clip gradient values between -1.0 and 1.0
#     clip_norm=5.0        # Additionally clip gradient norm to 5.0
# )

# # --- Training Loop ---

# num_batches = len(X_train) // BATCH_SIZE

# for epoch in range(EPOCHS):
#     epoch_loss = 0
#     # Shuffle training data
#     permutation = np.random.permutation(len(X_train))
#     X_train_shuffled = X_train[permutation]
#     y_train_shuffled = y_train[permutation]

#     # Create a progress bar
#     pbar = tqdm(range(num_batches), desc=f"Epoch {epoch+1}/{EPOCHS}")

#     for i in pbar:
#         # Create a mini-batch
#         start = i * BATCH_SIZE
#         end = start + BATCH_SIZE
#         X_batch = X_train_shuffled[start:end]
#         y_batch = y_train_shuffled[start:end]

#         # 1. Forward pass
#         logits = model.forward(X_batch, training=True)

#         # 2. Compute loss
#         loss = loss_fn.forward(logits, y_batch)
#         epoch_loss += loss

#         # 3. Backward pass
#         grad = loss_fn.backward()
#         model.backward(grad)

#         # 4. Update weights
#         optimizer.step()

#         # Update progress bar description
#         pbar.set_postfix({'loss': f'{loss:.4f}'})

#     print(f"Epoch {epoch+1} Average Loss: {epoch_loss / num_batches:.4f}")

# # --- Save the trained weights ---
# if not os.path.exists('saved_weights'):
#     os.makedirs('saved_weights')

# save_weights(model, 'saved_weights/rnn_model_weights.npz')

# # --- Evaluation ---
# def evaluate(model, X, y):
#     logits = model.forward(X, training=False)
#     predictions = np.argmax(logits, axis=1)
#     accuracy = np.mean(predictions == y)
#     return accuracy

# train_accuracy = evaluate(model, X_train[:500], y_train[:500]) # On a subset for speed
# test_accuracy = evaluate(model, X_test, y_test)
# print(f"\nTraining Accuracy: {train_accuracy:.4f}")
# print(f"Test Accuracy: {test_accuracy:.4f}")

## Part 5. Model 2 - LSTM from scratch

### 1. LSTM Layer

In [23]:
class LSTM(Layer):
    """
    A Long Short-Term Memory (LSTM) layer.

    The weights of the four gates are stored side by side along the last
    axis, in the order forget (f), input (i), candidate (g), output (o):
        c_t = f_t * c_{t-1} + i_t * g_t
        h_t = o_t * tanh(c_t)
    Only the last hidden state is returned by the forward pass.
    """
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # We concatenate weights for all 4 gates for efficiency
        # W_x maps input to the 4 gates, W_h maps hidden state to the 4 gates
        size_sum = input_size + hidden_size
        limit_x = np.sqrt(6 / (size_sum))
        limit_h = np.sqrt(6 / (hidden_size + hidden_size))

        self.params['W_x'] = np.random.uniform(-limit_x, limit_x, (input_size, 4 * hidden_size))
        self.params['W_h'] = np.random.uniform(-limit_h, limit_h, (hidden_size, 4 * hidden_size))
        self.params['b'] = np.zeros(4 * hidden_size)

    def forward(self, inputs):
        """
        Inputs shape: (batch_size, seq_len, input_size)
        Output shape: (batch_size, hidden_size) -> the final hidden state
        """
        self.inputs = inputs
        batch_size, seq_len, _ = inputs.shape
        h_size = self.hidden_size

        # Caches for backpropagation. Index k of h_states / c_states holds the
        # state after k steps, so index 0 is the initial state.
        self.h_states = np.zeros((batch_size, seq_len + 1, h_size))
        self.c_states = np.zeros((batch_size, seq_len + 1, h_size))
        self.gates_pre = np.zeros((batch_size, seq_len, 4 * h_size))
        self.gates = np.zeros((batch_size, seq_len, 4 * h_size))

        # Initial hidden and cell states are zeros
        h = self.h_states[:, 0, :]
        c = self.c_states[:, 0, :]

        for t in range(seq_len):
            x_t = inputs[:, t, :]

            # Combined matrix multiplication for all gates
            pre_activation = x_t @ self.params['W_x'] + h @ self.params['W_h'] + self.params['b']
            self.gates_pre[:, t, :] = pre_activation

            # Split into individual gates (column order: f, i, g, o)
            f_gate = self._sigmoid(pre_activation[:, :h_size])
            i_gate = self._sigmoid(pre_activation[:, h_size:2*h_size])
            g_gate = np.tanh(pre_activation[:, 2*h_size:3*h_size])
            o_gate = self._sigmoid(pre_activation[:, 3*h_size:])
            self.gates[:, t, :] = np.hstack((f_gate, i_gate, g_gate, o_gate))

            # Update cell and hidden states
            c = f_gate * c + i_gate * g_gate
            h = o_gate * np.tanh(c)

            self.c_states[:, t+1, :] = c
            self.h_states[:, t+1, :] = h

        return h

    def backward(self, grad_h_final):
        """
        Backpropagation through time from the final hidden state.

        grad_h_final shape: (batch_size, hidden_size)
        Returns gradient w.r.t. inputs: (batch_size, seq_len, input_size)
        Parameter gradients are summed over all time steps into self.grads.
        """
        batch_size, seq_len, _ = self.inputs.shape
        h_size = self.hidden_size

        # Initialize gradients
        self.grads['W_x'] = np.zeros_like(self.params['W_x'])
        self.grads['W_h'] = np.zeros_like(self.params['W_h'])
        self.grads['b'] = np.zeros_like(self.params['b'])
        grad_inputs = np.zeros_like(self.inputs)

        # Initialize gradients for hidden and cell states flowing backwards.
        # The cell state has no direct upstream gradient, so it starts at zero.
        grad_h = grad_h_final
        grad_c = np.zeros_like(grad_h)

        for t in reversed(range(seq_len)):
            # Retrieve cached values for this timestep
            h_prev = self.h_states[:, t, :]
            c_prev = self.c_states[:, t, :]
            c_t = self.c_states[:, t+1, :]
            x_t = self.inputs[:, t, :]

            f_t, i_t, g_t, o_t = (
                self.gates[:, t, :h_size],
                self.gates[:, t, h_size:2*h_size],
                self.gates[:, t, 2*h_size:3*h_size],
                self.gates[:, t, 3*h_size:]
            )

            # Backprop through h_t = o_t * tanh(c_t). The cell gradient
            # collects both the part coming through h_t and the part carried
            # over from the following timestep.
            grad_o = grad_h * np.tanh(c_t)
            grad_c += grad_h * o_t * (1 - np.tanh(c_t)**2)

            # Backprop through c_t = f_t * c_prev + i_t * g_t
            grad_f = grad_c * c_prev
            grad_i = grad_c * g_t
            grad_g = grad_c * i_t
            grad_c_prev = grad_c * f_t

            # Backprop through gate activations
            d_o_pre = grad_o * o_t * (1 - o_t) # Sigmoid derivative
            d_g_pre = grad_g * (1 - g_t**2)      # Tanh derivative
            d_i_pre = grad_i * i_t * (1 - i_t) # Sigmoid derivative
            d_f_pre = grad_f * f_t * (1 - f_t) # Sigmoid derivative

            # Concatenate gate gradients in the same f, i, g, o column order
            # as the weights
            d_gates = np.hstack((d_f_pre, d_i_pre, d_g_pre, d_o_pre))

            # Calculate gradients for parameters and inputs
            self.grads['b'] += np.sum(d_gates, axis=0)
            self.grads['W_x'] += x_t.T @ d_gates
            self.grads['W_h'] += h_prev.T @ d_gates

            grad_inputs[:, t, :] = d_gates @ self.params['W_x'].T

            # Update gradients for the next (previous) timestep
            grad_h = d_gates @ self.params['W_h'].T
            grad_c = grad_c_prev

        return grad_inputs

    def _sigmoid(self, x):
        """
        Element-wise logistic function 1 / (1 + exp(-x)).

        Only exp(-|x|) <= 1 is evaluated, so large negative pre-activations
        cannot overflow exp() the way the direct formula does:
            x >= 0:  1 / (1 + exp(-x))
            x <  0:  exp(x) / (1 + exp(x))
        """
        z = np.exp(-np.abs(x))
        return np.where(x >= 0, 1 / (1 + z), z / (1 + z))

### 2. Assembling LSTM Classifier

In [24]:
class LSTMClassifier:
    """
    Sequence classifier built as Embedding -> LSTM -> Dense.

    ``layers`` lists the three stages in forward order so that code working
    on ``model.layers`` (e.g. the optimizer or weight loading) can iterate them.
    """
    def __init__(self, vocab_size, embed_dim, hidden_size, num_classes):
        self.embedding = Embedding(vocab_size, embed_dim)
        self.lstm = LSTM(embed_dim, hidden_size)
        self.dense = Dense(hidden_size, num_classes)
        self.layers = [self.embedding, self.lstm, self.dense]

    def forward(self, inputs):
        """Run ``inputs`` through the three stages in order and return the dense output."""
        activations = inputs
        for stage in (self.embedding, self.lstm, self.dense):
            activations = stage.forward(activations)
        return activations

    def backward(self, grad):
        """Propagate ``grad`` through dense, LSTM and embedding in that order; returns nothing."""
        upstream = self.lstm.backward(self.dense.backward(grad))
        # The embedding is the first stage, so its returned value (if any) is not used.
        self.embedding.backward(upstream)

# --- Model and Training Hyperparameters ---
# These assignments (re)bind notebook-level constants, so any later cell that
# reads them sees the values below.
VOCAB_SIZE = len(vocab)   # `vocab` is built earlier in the notebook
EMBED_DIM = 100           # embedding width, passed to LSTMClassifier
HIDDEN_SIZE = 128         # LSTM hidden-state width, passed to LSTMClassifier
NUM_CLASSES = 5           # size of the classifier output
EPOCHS = 5                # read by the (currently commented-out) training loop
BATCH_SIZE = 64           # read by the (currently commented-out) training loop
LEARNING_RATE = 0.001     # passed to Adam below

# --- Initialization ---
# Build the model, the loss and an optimizer over the model's layer list.
lstm_model = LSTMClassifier(VOCAB_SIZE, EMBED_DIM, HIDDEN_SIZE, NUM_CLASSES)
loss_fn = SoftmaxCrossEntropy()
optimizer = Adam(lstm_model.layers, learning_rate=LEARNING_RATE)

### 3. Training the LSTM Classifier and Weight Download

In [25]:
# # --- Training Loop (This code is identical to the RNN training loop) ---
# num_batches = len(X_train) // BATCH_SIZE

# for epoch in range(EPOCHS):
#     epoch_loss = 0
#     permutation = np.random.permutation(len(X_train))
#     X_train_shuffled = X_train[permutation]
#     y_train_shuffled = y_train[permutation]

#     pbar = tqdm(range(num_batches), desc=f"Epoch {epoch+1}/{EPOCHS} (LSTM)")

#     for i in pbar:
#         start = i * BATCH_SIZE
#         end = start + BATCH_SIZE
#         X_batch, y_batch = X_train_shuffled[start:end], y_train_shuffled[start:end]

#         logits = lstm_model.forward(X_batch)
#         loss = loss_fn.forward(logits, y_batch)
#         epoch_loss += loss

#         grad = loss_fn.backward()
#         lstm_model.backward(grad)
#         optimizer.step()

#         pbar.set_postfix({'loss': f'{loss:.4f}'})

#     print(f"Epoch {epoch+1} Average Loss: {epoch_loss / num_batches:.4f}")

# # --- Save and Evaluate ---
# save_weights(lstm_model, 'saved_weights/lstm_model_weights.npz')

# # --- Evaluation ---
# def evaluate(model, X, y):
#     logits = model.forward(X)
#     predictions = np.argmax(logits, axis=1)
#     accuracy = np.mean(predictions == y)
#     return accuracy

# print("\n--- LSTM Model Evaluation ---")
# train_accuracy = evaluate(lstm_model, X_train[:500], y_train[:500])
# test_accuracy = evaluate(lstm_model, X_test, y_test)
# print(f"Training Accuracy: {train_accuracy:.4f}")
# print(f"Test Accuracy: {test_accuracy:.4f}")

## Part 5. Model 3 - Transformer from Scratch

### 1. Positional Encoding

In [27]:
import numpy as np

class PositionalEncoding(Layer):
    """
    Adds a fixed sinusoidal position table to the input embeddings.

    The table is computed once in the constructor: even feature columns hold
    sin(position * frequency) and odd columns hold cos(position * frequency).
    The layer stores nothing in ``params``, so it has nothing to train.

    Args:
        max_seq_len: Number of positions to pre-compute.
        embed_dim: Width of the embeddings (odd widths are supported).
    """
    def __init__(self, max_seq_len, embed_dim):
        super().__init__()
        self.embed_dim = embed_dim
        self.max_seq_len = max_seq_len

        # Pre-calculate the positional encoding matrix
        pe = np.zeros((max_seq_len, embed_dim))
        position = np.arange(0, max_seq_len).reshape(-1, 1)
        div_term = np.exp(np.arange(0, embed_dim, 2) * -(np.log(10000.0) / embed_dim))
        angles = position * div_term  # (max_seq_len, ceil(embed_dim / 2))

        pe[:, 0::2] = np.sin(angles)
        # For an odd embed_dim there is one fewer odd column than even columns,
        # so the cosine part uses only the first embed_dim // 2 frequencies.
        pe[:, 1::2] = np.cos(angles[:, :embed_dim // 2])

        # Add a batch dimension for broadcasting
        self.pe = pe.reshape(1, max_seq_len, embed_dim)

    def forward(self, x):
        """
        Adds positional encoding to the input.

        Args:
            x: Array of shape (batch_size, seq_len, embed_dim).

        Returns:
            ``x`` plus the first ``seq_len`` rows of the position table.

        Raises:
            ValueError: If ``seq_len`` exceeds the ``max_seq_len`` given at construction.
        """
        self.inputs = x
        seq_len = x.shape[1]
        if seq_len > self.max_seq_len:
            # Without this check the addition would fail with an opaque broadcast
            # error, or silently broadcast a single row when max_seq_len == 1.
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_seq_len {self.max_seq_len} "
                "of the positional encoding table."
            )

        # Add the pre-computed encodings to the input embeddings
        return x + self.pe[:, :seq_len, :]

    def backward(self, grad):
        """
        Returns the upstream gradient unchanged.

        The forward pass only adds a constant table, so d(output)/d(x) is the identity.
        """
        return grad

### 2. Multi-Head Self Attention

In [28]:
def softmax(x):
    """
    Softmax over the last axis.

    The per-row maximum is subtracted before exponentiating so that large
    inputs do not overflow; this shift does not change the result.
    """
    row_max = np.max(x, axis=-1, keepdims=True)
    exps = np.exp(x - row_max)
    return exps / np.sum(exps, axis=-1, keepdims=True)

class MultiHeadAttention(Layer):
    """
    Multi-head scaled dot-product self-attention with a hand-written backward pass.

    Queries, keys and values come from one fused projection (``W_qkv``) and the
    concatenated head outputs are mixed by ``W_o``. Neither projection has a bias,
    and no attention mask is applied, so every position attends to every position.

    Args:
        embed_dim: Width of the input/output features; must be divisible by ``num_heads``.
        num_heads: Number of attention heads; each head works on ``embed_dim // num_heads`` features.
    """
    def __init__(self, embed_dim, num_heads):
        # NOTE(review): self.params / self.grads are presumably created by Layer.__init__ -- confirm.
        super().__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by number of heads."

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # Xavier-style scale computed for a square (embed_dim, embed_dim) matrix;
        # the same std is used for both W_qkv and W_o.
        xavier_std = np.sqrt(2.0 / (embed_dim * 2))

        # We can combine W_q, W_k, W_v into one matrix for efficiency
        self.params['W_qkv'] = np.random.randn(embed_dim, embed_dim * 3) * xavier_std
        self.params['W_o'] = np.random.randn(embed_dim, embed_dim) * xavier_std

    def forward(self, x):
        """
        Forward pass for Multi-Head Attention.

        Args:
            x: Array of shape (batch_size, seq_len, embed_dim).

        Returns:
            Array of shape (batch_size, seq_len, embed_dim).

        Side effects:
            Stores ``self.x_shape``, ``self.attention_weights`` and ``self.cache``
            for use by ``backward``.
        """
        batch_size, seq_len, _ = x.shape
        self.x_shape = x.shape # Store original shape

        # 1. Project to Q, K, V -> (batch_size, seq_len, 3 * embed_dim)
        qkv = x @ self.params['W_qkv']

        # 2. Reshape and split Q, K, V for each head
        qkv = qkv.reshape(batch_size, seq_len, 3, self.num_heads, self.head_dim)
        qkv = qkv.transpose(2, 0, 3, 1, 4) # (3, batch_size, num_heads, seq_len, head_dim)
        q, k, v = qkv[0], qkv[1], qkv[2]

        # 3. Scaled Dot-Product Attention -> (batch_size, num_heads, seq_len, seq_len)
        scores = (q @ k.transpose(0, 1, 3, 2)) / np.sqrt(self.head_dim)

        # 4. Apply softmax to get attention weights (kept on self for backward)
        self.attention_weights = softmax(scores)

        # 5. Apply weights to values -> (batch_size, num_heads, seq_len, head_dim)
        weighted_v = self.attention_weights @ v

        # 6. Concatenate heads
        # Transpose back: (batch_size, seq_len, num_heads, head_dim)
        weighted_v = weighted_v.transpose(0, 2, 1, 3)
        # Reshape to (batch_size, seq_len, embed_dim)
        concat_v = weighted_v.reshape(batch_size, seq_len, self.embed_dim)

        # 7. Final linear projection
        output = concat_v @ self.params['W_o']

        # Cache values for backward pass
        self.cache = (x, q, k, v, concat_v)
        return output

    def backward(self, grad_output):
        """
        Backward pass for Multi-Head Attention.

        Walks the forward steps in reverse, overwrites ``self.grads['W_o']`` and
        ``self.grads['W_qkv']`` (assignment, not accumulation) and returns the
        gradient with respect to the layer input.

        Args:
            grad_output: Upstream gradient of shape (batch_size, seq_len, embed_dim).

        Returns:
            Gradient with respect to ``x``, shape (batch_size, seq_len, embed_dim).
        """
        x, q, k, v, concat_v = self.cache
        batch_size, seq_len, _ = self.x_shape

        # 1. Gradient of the final projection (batch and sequence axes are flattened
        #    so the weight gradient is a single 2-D matrix product)
        self.grads['W_o'] = concat_v.reshape(-1, self.embed_dim).T @ grad_output.reshape(-1, self.embed_dim)
        grad_concat_v = grad_output @ self.params['W_o'].T

        # 2. Un-concatenate heads
        grad_weighted_v = grad_concat_v.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
        grad_weighted_v = grad_weighted_v.transpose(0, 2, 1, 3) # (batch, heads, seq_len, head_dim)

        # 3. Gradient through attention_weights @ v
        grad_attention_weights = grad_weighted_v @ v.transpose(0, 1, 3, 2)
        grad_v = self.attention_weights.transpose(0, 1, 3, 2) @ grad_weighted_v

        # 4. Gradient through softmax: for each row, dz = s * (g - sum(g * s))
        s = self.attention_weights
        grad_scores = s * (grad_attention_weights - np.sum(grad_attention_weights * s, axis=-1, keepdims=True))

        # 5. Gradient through scaling (in place; grad_scores is a fresh array from step 4)
        grad_scores /= np.sqrt(self.head_dim)

        # 6. Gradient through q @ k.T
        grad_q = grad_scores @ k
        grad_k = grad_scores.transpose(0, 1, 3, 2) @ q

        # 7. Combine head gradients for Q, K, V
        # Stack the three (batch, head, seq, dim) gradients into (3, batch, head, seq, dim)
        grad_qkv = np.array([grad_q, grad_k, grad_v])
        # Transpose back to (batch, seq, 3, head, dim) -- the inverse of the forward transpose
        grad_qkv = grad_qkv.transpose(1, 3, 0, 2, 4)
        # Reshape to (batch, seq, 3 * embed_dim)
        grad_qkv = grad_qkv.reshape(batch_size, seq_len, 3 * self.embed_dim)

        # 8. Gradient of the initial projection W_qkv
        self.grads['W_qkv'] = x.reshape(-1, self.embed_dim).T @ grad_qkv.reshape(-1, 3 * self.embed_dim)

        # 9. Gradient for the input x
        grad_x = grad_qkv @ self.params['W_qkv'].T

        return grad_x

### 3. Position Wise Feedforward Network

In [29]:
class PositionwiseFeedForward(Layer):
    """
    Two-layer feed-forward network (linear -> ReLU -> linear) applied over the last axis.
    """
    def __init__(self, embed_dim, ffn_dim):
        super().__init__()
        # Gaussian weights scaled by sqrt(2 / fan_in) (He initialization); biases start at zero.
        self.params['W_1'] = np.random.randn(embed_dim, ffn_dim) * np.sqrt(2.0 / embed_dim)
        self.params['b_1'] = np.zeros(ffn_dim)
        self.params['W_2'] = np.random.randn(ffn_dim, embed_dim) * np.sqrt(2.0 / ffn_dim)
        self.params['b_2'] = np.zeros(embed_dim)

    @staticmethod
    def _as_rows(arr):
        """Collapse every leading axis so that ``arr`` becomes a 2-D (rows, features) matrix."""
        return arr.reshape(-1, arr.shape[-1])

    def forward(self, x):
        """
        Apply the network to ``x`` of shape (batch_size, seq_len, embed_dim).

        The input, the pre-activation and the ReLU output are cached for ``backward``.
        """
        hidden_pre = x @ self.params['W_1'] + self.params['b_1']
        hidden = np.maximum(0, hidden_pre)
        projected = hidden @ self.params['W_2'] + self.params['b_2']

        self.cache = (x, hidden_pre, hidden)
        return projected

    def backward(self, grad_output):
        """
        Store gradients for W_2, b_2, W_1 and b_1 in ``self.grads`` and
        return the gradient with respect to the layer input.
        """
        x, hidden_pre, hidden = self.cache
        rows = self._as_rows

        # Output linear layer
        self.grads['W_2'] = rows(hidden).T @ rows(grad_output)
        self.grads['b_2'] = np.sum(grad_output, axis=(0, 1))

        # ReLU passes gradient only where its input was strictly positive
        grad_hidden_pre = (grad_output @ self.params['W_2'].T) * (hidden_pre > 0)

        # Input linear layer
        self.grads['W_1'] = rows(x).T @ rows(grad_hidden_pre)
        self.grads['b_1'] = np.sum(grad_hidden_pre, axis=(0, 1))

        return grad_hidden_pre @ self.params['W_1'].T

### 4. Layer Normalization

In [30]:
class LayerNormalization(Layer):
    """
    Layer normalization over the last (feature) axis with a learnable scale and shift.

    Args:
        embed_dim: Size of the feature axis; ``gamma`` and ``beta`` have this length.
        epsilon: Added to the variance before the square root to avoid dividing by zero.
    """
    def __init__(self, embed_dim, epsilon=1e-5):
        super().__init__()
        self.epsilon = epsilon
        # Learnable parameters: gamma (scale) and beta (shift)
        self.params['gamma'] = np.ones(embed_dim)
        self.params['beta'] = np.zeros(embed_dim)

    def forward(self, x):
        """
        Normalize ``x`` along its last axis, then apply ``gamma`` and ``beta``.

        Args:
            x: Array whose last axis has length ``embed_dim`` (any number of leading axes).

        Returns:
            Array of the same shape as ``x``.
        """
        # Calculate mean and variance along the last dimension (features)
        mean = np.mean(x, axis=-1, keepdims=True)
        var = np.var(x, axis=-1, keepdims=True)
        std = np.sqrt(var + self.epsilon)

        # Normalize x
        x_norm = (x - mean) / std

        # Apply scale and shift
        output = self.params['gamma'] * x_norm + self.params['beta']

        self.cache = (x, mean, std, x_norm)
        return output

    def backward(self, grad_output):
        """
        Store gradients for ``gamma`` and ``beta`` and return the gradient w.r.t. the input.

        Works for inputs of any rank: parameter gradients are summed over every
        axis except the last, mirroring the per-feature normalization in ``forward``.

        Args:
            grad_output: Upstream gradient with the same shape as the forward input.

        Returns:
            Gradient with respect to ``x``, same shape as ``x``.
        """
        x, mean, std, x_norm = self.cache
        D = x.shape[-1]
        # Every axis except the feature axis; equals (0, 1) for rank-3 input.
        leading_axes = tuple(range(x.ndim - 1))
        centered = x - mean

        # Gradients for learnable parameters gamma and beta
        self.grads['gamma'] = np.sum(grad_output * x_norm, axis=leading_axes)
        self.grads['beta'] = np.sum(grad_output, axis=leading_axes)

        # Gradient for the normalized output
        grad_x_norm = grad_output * self.params['gamma']

        # Chain rule through x_norm = centered / std, std = sqrt(var + eps)
        grad_std = -np.sum(grad_x_norm * centered, axis=-1, keepdims=True) / (std**2)
        grad_var = 0.5 * grad_std / std

        # The mean is reached both directly and through the variance
        grad_mean = -np.sum(grad_x_norm / std, axis=-1, keepdims=True) - (2.0/D) * grad_var * np.sum(centered, axis=-1, keepdims=True)

        # Sum of the direct path, the variance path and the mean path
        grad_x = (grad_x_norm / std) + (2.0/D) * grad_var * centered + (1.0/D) * grad_mean

        return grad_x

### 5. Transformer Encoder Block

In [31]:
class TransformerEncoderBlock(Layer):
    """
    One encoder block: self-attention and a feed-forward network, each followed
    by dropout, a residual connection and layer normalization (post-norm).
    """
    def __init__(self, embed_dim, num_heads, ffn_dim, dropout_p=0.1):
        # super().__init__() is deliberately not called: this class is a container,
        # and its params/grads are served by the read-only properties defined below.
        self.attention = MultiHeadAttention(embed_dim, num_heads)
        self.norm1 = LayerNormalization(embed_dim)
        self.feed_forward = PositionwiseFeedForward(embed_dim, ffn_dim)
        self.norm2 = LayerNormalization(embed_dim)
        self.dropout1 = Dropout(dropout_p)
        self.dropout2 = Dropout(dropout_p)

        # The position of each sub-layer in this list is part of its parameter key.
        self.sub_layers = [
            self.attention,
            self.norm1,
            self.feed_forward,
            self.norm2,
            self.dropout1,
            self.dropout2,
        ]

    def forward(self, x, training=True):
        """Run the block on ``x``; ``training`` is forwarded to both dropout layers."""
        # Attention branch, then residual add + norm
        attended = self.dropout1.forward(self.attention.forward(x), training=training)
        after_attention = self.norm1.forward(x + attended)

        # Feed-forward branch, then residual add + norm
        transformed = self.dropout2.forward(
            self.feed_forward.forward(after_attention), training=training
        )
        return self.norm2.forward(after_attention + transformed)

    def backward(self, grad_output):
        """Backpropagate through the block and return the gradient w.r.t. its input."""
        # Second residual: the norm2 gradient feeds both the skip path and the FFN path.
        grad_after_attention = self.norm2.backward(grad_output)
        grad_after_attention += self.feed_forward.backward(
            self.dropout2.backward(grad_after_attention)
        )

        # First residual: the norm1 gradient feeds both the skip path and the attention path.
        grad_input = self.norm1.backward(grad_after_attention)
        grad_input += self.attention.backward(self.dropout1.backward(grad_input))

        return grad_input

    def _gather(self, attr_name):
        """Merge the ``attr_name`` dict of every sub-layer into one dict with prefixed keys."""
        merged = {}
        for idx, layer in enumerate(self.sub_layers):
            prefix = f"{layer.__class__.__name__}_{idx}"
            for key, val in getattr(layer, attr_name).items():
                merged[f'{prefix}_{key}'] = val
        return merged

    # Both properties are rebuilt on every access from the sub-layers' current dicts.
    @property
    def params(self):
        return self._gather('params')

    @property
    def grads(self):
        return self._gather('grads')

### 6. Global Average Pooling Layer

In [32]:
class GlobalAveragePooling1D(Layer):
    """
    Averages the input over axis 1 (the sequence axis).
    """
    def __init__(self):
        super().__init__()

    def forward(self, x):
        """
        Map (batch_size, seq_len, embed_dim) to (batch_size, embed_dim) by taking the
        mean over axis 1. The input shape is remembered for ``backward``.
        """
        self.cache_input_shape = x.shape
        return x.mean(axis=1)

    def backward(self, grad):
        """
        Spread ``grad`` of shape (batch_size, embed_dim) evenly over the sequence axis.

        Each time step contributes 1 / seq_len to the mean, so it receives
        ``grad / seq_len``.
        """
        _batch, seq_len, _features = self.cache_input_shape

        # Scale once, then repeat the same slice for every time step.
        per_step = np.expand_dims(grad, 1) / seq_len
        return np.tile(per_step, (1, seq_len, 1))

### 7. Transformer Classifier

In [33]:
class TransformerClassifier:
    """
    Text classifier: embedding + positional encoding + dropout, a stack of
    encoder blocks, global average pooling and a dense output layer.

    ``layers`` holds every stage in forward order; ``encoder_blocks`` holds only
    the encoder stack.
    """
    def __init__(self, vocab_size, max_seq_len, embed_dim, num_heads, ffn_dim, num_layers, num_classes, dropout_p=0.1):
        # Input stages
        self.embedding = Embedding(vocab_size, embed_dim)
        self.pos_encoding = PositionalEncoding(max_seq_len, embed_dim)
        self.embed_dropout = Dropout(dropout_p)

        # num_layers identically configured encoder blocks
        self.encoder_blocks = [
            TransformerEncoderBlock(embed_dim, num_heads, ffn_dim, dropout_p)
            for _ in range(num_layers)
        ]

        # Output stages
        self.pooling = GlobalAveragePooling1D()
        self.dense = Dense(embed_dim, num_classes)

        self.layers = [
            self.embedding,
            self.pos_encoding,
            self.embed_dropout,
            *self.encoder_blocks,
            self.pooling,
            self.dense,
        ]

    def forward(self, inputs, training=True):
        """Return the dense-layer output for ``inputs``; ``training`` is passed to dropout and the blocks."""
        hidden = self.pos_encoding.forward(self.embedding.forward(inputs))
        hidden = self.embed_dropout.forward(hidden, training=training)

        for block in self.encoder_blocks:
            hidden = block.forward(hidden, training=training)

        return self.dense.forward(self.pooling.forward(hidden))

    def backward(self, grad):
        """Backpropagate ``grad`` through every stage in reverse order; returns nothing."""
        grad = self.pooling.backward(self.dense.backward(grad))

        for block in self.encoder_blocks[::-1]:
            grad = block.backward(grad)

        grad = self.pos_encoding.backward(self.embed_dropout.backward(grad))
        # The embedding is the first stage, so its returned value (if any) is not used.
        self.embedding.backward(grad)

### 8. Model Training and Weight Download

In [34]:
# from tqdm.notebook import tqdm
# import os

# MAX_SEQ_LEN = 50

# VOCAB_SIZE = len(vocab)
# EMBED_DIM = 100
# NUM_HEADS = 5          # Number of attention heads (standard choice)
# FFN_DIM = 400         # Hidden layer size in FFN (usually 4 * embed_dim)
# NUM_LAYERS = 4         # Number of Transformer blocks to stack (a deep model)
# NUM_CLASSES = 5
# DROPOUT_P = 0.1        # Standard dropout for Transformers

# EPOCHS = 10            # Train for longer to allow the model to converge
# BATCH_SIZE = 64        # Can use a larger batch size if memory allows
# LEARNING_RATE = 0.0001 # Transformers often prefer a smaller learning rate

# # --- Initialization ---

# print("Initializing Transformer Classifier...")
# transformer_model = TransformerClassifier(
#     vocab_size=VOCAB_SIZE,
#     max_seq_len=MAX_SEQ_LEN,
#     embed_dim=EMBED_DIM,
#     num_heads=NUM_HEADS,
#     ffn_dim=FFN_DIM,
#     num_layers=NUM_LAYERS,
#     num_classes=NUM_CLASSES,
#     dropout_p=DROPOUT_P
# )

# loss_fn = SoftmaxCrossEntropy()
# optimizer = Adam(
#     transformer_model.layers,
#     learning_rate=LEARNING_RATE,
#     beta1=0.9,
#     beta2=0.98,        # Values commonly used for Transformers
#     epsilon=1e-9,      # Values commonly used for Transformers
#     clip_norm=1.0      # Gradient norm clipping is very important
# )

# # --- Training Loop ---
# num_batches = len(X_train) // BATCH_SIZE

# for epoch in range(EPOCHS):
#     epoch_loss = 0
#     permutation = np.random.permutation(len(X_train))
#     X_train_shuffled = X_train[permutation]
#     y_train_shuffled = y_train[permutation]

#     pbar = tqdm(range(num_batches), desc=f"Epoch {epoch+1}/{EPOCHS} (transformer)")

#     for i in pbar:
#         start = i * BATCH_SIZE
#         end = start + BATCH_SIZE
#         X_batch, y_batch = X_train_shuffled[start:end], y_train_shuffled[start:end]

#         logits = transformer_model.forward(X_batch, training=True)
#         loss = loss_fn.forward(logits, y_batch)
#         epoch_loss += loss

#         grad = loss_fn.backward()
#         transformer_model.backward(grad)
#         optimizer.step()

#         pbar.set_postfix({'loss': f'{loss:.4f}'})

#     print(f"Epoch {epoch+1} Average Loss: {epoch_loss / num_batches:.4f}")

# # --- Save the trained weights ---
# if not os.path.exists('saved_weights'):
#     os.makedirs('saved_weights')

# # --- Save and Evaluate ---
# save_weights(transformer_model, 'saved_weights/transformer_model_weights.npz')

# # --- Evaluation ---
# def evaluate(model, X, y):
#     logits = model.forward(X, training=False)
#     predictions = np.argmax(logits, axis=1)
#     accuracy = np.mean(predictions == y)
#     return accuracy

# print("\n--- Transformer Model Evaluation ---")
# train_accuracy = evaluate(transformer_model, X_train[:500], y_train[:500])
# test_accuracy = evaluate(transformer_model, X_test, y_test)
# print(f"Training Accuracy: {train_accuracy:.4f}")
# print(f"Test Accuracy: {test_accuracy:.4f}")

In [35]:
# print("Evaluating on the test set...")
# logits_test = transformer_model.forward(X_test, training=False)  # forward() defaults to training=True, which would keep dropout on
# predictions_test = np.argmax(logits_test, axis=1)


# # Class Names
# class_labels = ['Neutral','Positive','Extremely Negative','Negative','Extremely Positive']

# # Generate and print the report
# print("\n--- Classification Report ---")
# classification_report_from_scratch(y_test, predictions_test, class_names=class_labels)


# test_accuracy = np.mean(predictions_test == y_test)
# print(f"\nFinal Test Accuracy: {test_accuracy:.4f}")

## Loading Weights for Each Model from Github Repo

### Defining Load Weight Function

In [36]:
def load_weights_from_url(model, url):
    """
    Download an ``.npz`` archive and copy its arrays into ``model``'s parameters.

    For layer ``i`` and parameter ``key`` the archive entry ``layer_{i}_{key}`` is
    copied *into* the existing parameter array. Copying in place (rather than
    rebinding ``layer.params[key]``) also works for layers whose ``params`` is a
    dict rebuilt on each access, where an assignment to the dict would be lost.

    Args:
        model: Object with a ``layers`` list; each layer exposes a ``params`` mapping
            of NumPy arrays.
        url: Address of the ``.npz`` file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the archive lacks an entry for one of the model's parameters.
        ValueError: If a stored array's shape differs from the model parameter's shape.
    """
    print(f"Downloading weights from {url}...")
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    response.raise_for_status() # Raise an exception for bad status codes
    with io.BytesIO(response.content) as f:
        data = np.load(f)
        for i, layer in enumerate(model.layers):
            for key, param in layer.params.items():
                saved = data[f'layer_{i}_{key}']
                if param.shape != saved.shape:
                    raise ValueError(
                        f"Shape mismatch for 'layer_{i}_{key}': model expects "
                        f"{param.shape}, file has {saved.shape}"
                    )
                param[...] = saved
    print("Weights loaded successfully from URL.")

### For RNN

In [45]:
import requests
import io

# --- 1. Define Model Hyperparameters ---
# These must match the parameters used to train the saved model.
# NOTE: these assignments rebind the notebook-level constants of the same names,
# so later cells see the values below until they are reassigned.
VOCAB_SIZE = len(vocab)
EMBED_DIM = 100
HIDDEN_SIZE = 256
NUM_CLASSES = 5

# --- 2. Initialize a New, Untrained Model Instance ---
print("Initializing a model structure...")
rnn_model_from_git = RNNClassifier(VOCAB_SIZE, EMBED_DIM, HIDDEN_SIZE, NUM_CLASSES)

# --- 3. Define the Raw URL for the Weights File ---
rnn_weights_url = "https://raw.githubusercontent.com/anirudha22-stack/nlp_assignment_models/main/rnn_model_weights.npz"

# --- Evaluation ---

def evaluate_and_report_rnn(model, X, y, class_names=None, batch_size=64):
    """
    Predict ``X`` in mini-batches, print a classification report and return accuracy.

    Args:
        model: Object whose ``forward(batch)`` returns one row of class scores per sample.
        X: Inputs, sliceable along the first axis.
        y: Reference labels, compared element-wise with the predictions.
        class_names: Passed through to ``classification_report_from_scratch``.
        batch_size: Number of samples per forward call.

    Returns:
        Fraction of predictions equal to ``y``.
    """
    total = len(X)
    n_batches = (total + batch_size - 1) // batch_size  # ceiling division

    print(f"Evaluating on {total} samples in batches of {batch_size}...")

    collected = []
    for batch_no in tqdm(range(n_batches), desc="RNN Evaluation Progress"):
        lo = batch_no * batch_size
        # forward() is called without a training flag; the last slice may be shorter.
        batch_logits = model.forward(X[lo:lo + batch_size])
        collected.extend(np.argmax(batch_logits, axis=1).tolist())

    y_pred = np.array(collected)

    # Detailed per-class report
    print("\n--- Classification Report for RNN from GitHub ---")
    classification_report_from_scratch(y, y_pred, class_names=class_names)

    return np.mean(y_pred == y)

# --- 4. Load the Weights from the URL ---
# Everything below runs inside one try block, so the handler at the bottom also
# catches failures from evaluation and reporting, not only from the download.
try:
    print("Attempting to load RNN weights from URL...")
    load_weights_from_url(rnn_model_from_git, rnn_weights_url)
    print(" RNN weights successfully loaded.")

    # --- 5. Evaluate the Model's Performance ---

    # Define class names for the report
    # NOTE(review): the list order presumably has to match the integer label
    # encoding used for y_test -- confirm against the label-encoding cell.
    class_labels = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']

    # Call the new function to get the report and accuracy
    test_accuracy = evaluate_and_report_rnn(
        rnn_model_from_git,
        X_test,
        y_test,
        class_names=class_labels
    )

    # --- 6. Show the Final Accuracy ---
    print("\n" + "="*40)
    print(f"Final RNN Model Accuracy from GitHub: {test_accuracy:.4f}")
    print("="*40)

# NOTE(review): this catches every Exception, so the URL hint below is also
# printed for errors that have nothing to do with the download.
except Exception as e:
    print(f"\n An error occurred: {e}")
    print("Please ensure the URL is correct and the file is accessible.")

Initializing a model structure...
Attempting to load RNN weights from URL...
Downloading weights from https://raw.githubusercontent.com/anirudha22-stack/nlp_assignment_models/main/rnn_model_weights.npz...
Downloaded file contains the following keys: ['layer_0_W', 'layer_1_W_xh', 'layer_1_W_hh', 'layer_1_b_h', 'layer_3_W', 'layer_3_b']

--- Attempting to match and load weights into model layers ---
 Successfully loaded layer_0_W
 Successfully loaded layer_1_W_xh
 Successfully loaded layer_1_W_hh
 Successfully loaded layer_1_b_h
 Successfully loaded layer_3_W
 Successfully loaded layer_3_b
 RNN weights successfully loaded.
Evaluating on 3798 samples in batches of 64...


RNN Evaluation Progress:   0%|          | 0/60 [00:00<?, ?it/s]


--- Classification Report for RNN from GitHub ---
                     precision      recall    f1-score     support

Extremely Negative        0.22        0.11        0.15         592
Negative                  0.30        0.22        0.25        1041
Neutral                   0.15        0.10        0.12         619
Positive                  0.25        0.54        0.35         947
Extremely Positive        0.24        0.12        0.16         599

accuracy                                          0.25        3798
micro avg                 0.25        0.25        0.25        3798
macro avg                 0.23        0.22        0.20        3798
weighted avg              0.24        0.25        0.22        3798


Final RNN Model Accuracy from GitHub: 0.2480


### For LSTM

In [47]:
# --- 1. Define Model Hyperparameters ---
# These must match the parameters used to train the saved model.
# NOTE: these assignments rebind the notebook-level constants of the same names,
# so later cells see the values below until they are reassigned.
VOCAB_SIZE = len(vocab)
EMBED_DIM = 100
HIDDEN_SIZE = 128
NUM_CLASSES = 5

# --- 2. Initialize a New, Untrained Model Instance ---
print("Initializing a model structure...")
lstm_model_from_git = LSTMClassifier(VOCAB_SIZE, EMBED_DIM, HIDDEN_SIZE, NUM_CLASSES)

# --- 3. Define the Raw URL for the Weights File ---
lstm_weights_url = "https://raw.githubusercontent.com/anirudha22-stack/nlp_assignment_models/main/lstm_model_weights.npz"

# --- Evaluation ---

def evaluate_and_report_lstm(model, X, y, class_names=None, batch_size=64):
    """
    Predict ``X`` in mini-batches, print a classification report and return accuracy.

    Args:
        model: Object whose ``forward(batch)`` returns one row of class scores per sample.
        X: Inputs, sliceable along the first axis.
        y: Reference labels, compared element-wise with the predictions.
        class_names: Passed through to ``classification_report_from_scratch``.
        batch_size: Number of samples per forward call.

    Returns:
        Fraction of predictions equal to ``y``.
    """
    total = len(X)
    n_batches = (total + batch_size - 1) // batch_size  # ceiling division

    print(f"Evaluating on {total} samples in batches of {batch_size}...")

    collected = []
    for batch_no in tqdm(range(n_batches), desc="LSTM Evaluation Progress"):
        lo = batch_no * batch_size
        # forward() is called with the batch only -- no training flag is passed.
        batch_logits = model.forward(X[lo:lo + batch_size])
        collected.extend(np.argmax(batch_logits, axis=1).tolist())

    y_pred = np.array(collected)

    # Detailed per-class report
    print("\n--- Classification Report for LSTM from GitHub ---")
    classification_report_from_scratch(y, y_pred, class_names=class_names)

    return np.mean(y_pred == y)

# --- 4. Load the Weights from the URL ---
# Everything below runs inside one try block, so the handler at the bottom also
# catches failures from evaluation and reporting, not only from the download.
try:
    print("Attempting to load LSTM weights from URL...")
    load_weights_from_url(lstm_model_from_git, lstm_weights_url)
    print("LSTM weights successfully loaded.")

    # --- 5. Evaluate the Model's Performance ---

    # Define class names for the report
    # NOTE(review): the list order presumably has to match the integer label
    # encoding used for y_test -- confirm against the label-encoding cell.
    class_labels = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']

    # Call the new function to get the report and accuracy
    test_accuracy = evaluate_and_report_lstm(
        lstm_model_from_git,
        X_test,
        y_test,
        class_names=class_labels
    )

    # --- 6. Show the Final Accuracy ---
    print("\n" + "="*40)
    print(f"Final LSTM Model Accuracy from GitHub: {test_accuracy:.4f}")
    print("="*40)

# NOTE(review): this catches every Exception, so the URL hint below is also
# printed for errors that have nothing to do with the download.
except Exception as e:
    print(f"\n An error occurred: {e}")
    print("Please ensure the URL is correct and the file is accessible.")

Initializing a model structure...
Attempting to load LSTM weights from URL...
Downloading weights from https://raw.githubusercontent.com/anirudha22-stack/nlp_assignment_models/main/lstm_model_weights.npz...
Downloaded file contains the following keys: ['layer_0_W', 'layer_1_W_x', 'layer_1_W_h', 'layer_1_b', 'layer_2_W', 'layer_2_b']

--- Attempting to match and load weights into model layers ---
 Successfully loaded layer_0_W
 Successfully loaded layer_1_W_x
 Successfully loaded layer_1_W_h
 Successfully loaded layer_1_b
 Successfully loaded layer_2_W
 Successfully loaded layer_2_b
LSTM weights successfully loaded.
Evaluating on 3798 samples in batches of 64...


LSTM Evaluation Progress:   0%|          | 0/60 [00:00<?, ?it/s]


--- Classification Report for LSTM from GitHub ---
                     precision      recall    f1-score     support

Extremely Negative        0.73        0.66        0.69         592
Negative                  0.66        0.67        0.67        1041
Neutral                   0.82        0.77        0.79         619
Positive                  0.66        0.72        0.69         947
Extremely Positive        0.76        0.75        0.76         599

accuracy                                          0.71        3798
micro avg                 0.71        0.71        0.71        3798
macro avg                 0.73        0.71        0.72        3798
weighted avg              0.71        0.71        0.71        3798


Final LSTM Model Accuracy from GitHub: 0.7098


### For Transformer

In [43]:
import requests
import io

def load_weights_from_url(model, url):
    """
    Download an ``.npz`` archive and copy matching arrays into ``model`` in place.

    For layer ``i`` and parameter ``key`` the archive entry ``layer_{i}_{key}`` is
    looked up. Entries that exist are copied into the existing parameter array
    (so layers that rebuild their ``params`` dict on every access are updated too);
    missing entries only produce a warning line and leave the parameter untouched.

    Args:
        model: Object with a ``layers`` list; each layer exposes a ``params`` mapping
            of NumPy arrays.
        url: Address of the ``.npz`` file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        AssertionError: If a stored array's shape differs from the model parameter's shape.
    """
    print(f"Downloading weights from {url}...")
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=60)
    response.raise_for_status() # Will raise an error if download fails

    # Open the archive from memory; the context manager closes it when done.
    with np.load(io.BytesIO(response.content)) as weights_from_file:
        print("Downloaded file contains the following keys:", list(weights_from_file.keys()))

        print("\n--- Attempting to match and load weights into model layers ---")

        # The key scheme must mirror the one used when the weights were saved.
        for i, layer in enumerate(model.layers):
            for key, param_obj in layer.params.items():
                param_key = f'layer_{i}_{key}'

                if param_key not in weights_from_file:
                    print(f" WARNING: Weight key '{param_key}' not found in the downloaded file.")
                    continue

                # Read the array once: every NpzFile lookup re-reads it from the archive.
                saved = weights_from_file[param_key]

                # Sanity check: ensure the shapes match
                assert param_obj.shape == saved.shape, \
                    f"Shape mismatch for '{param_key}': Model expects {param_obj.shape}, file has {saved.shape}"

                # In-place copy so every holder of this array sees the loaded values
                param_obj[:] = saved
                print(f" Successfully loaded {param_key}")

In [48]:
from tqdm.notebook import tqdm

# --- 1. Define Model Hyperparameters ---
# The saved weights only fit a network built with exactly these values, so they
# have to stay in sync with the configuration the model was trained with.
VOCAB_SIZE = len(vocab)  # `vocab` is defined outside this cell
MAX_SEQ_LEN = 50
EMBED_DIM = 100
NUM_HEADS = 5
FFN_DIM = 400
NUM_LAYERS = 4
NUM_CLASSES = 5
DROPOUT_P = 0.1

# --- 2. Initialize a New, Untrained Model Instance ---
# Only the structure is created here; the parameter values are replaced once
# the weights file has been loaded.
print("Initializing a new model structure...")
transformer_model_from_git = TransformerClassifier(**{
    "vocab_size": VOCAB_SIZE,
    "max_seq_len": MAX_SEQ_LEN,
    "embed_dim": EMBED_DIM,
    "num_heads": NUM_HEADS,
    "ffn_dim": FFN_DIM,
    "num_layers": NUM_LAYERS,
    "num_classes": NUM_CLASSES,
    "dropout_p": DROPOUT_P,
})

# --- 3. Define the Raw URL for the Weights File ---
transformer_weights_url = "https://raw.githubusercontent.com/anirudha22-stack/nlp_assignment_models/main/transformer_model_weights.npz"

# --- 4. Define the Batched Evaluation and Reporting Function ---
def evaluate_and_report_transformer(model, X, y, class_names=None, batch_size=64):
    """
    Run the Transformer over ``X`` one mini-batch at a time, print a full
    classification report and hand back the overall accuracy.

    Args:
        model: Object whose ``forward(batch, training=False)`` returns one row
            of class scores per sample in the batch.
        X: Sliceable collection of inputs; ``len(X)`` is the sample count.
        y: True labels, compared element-wise against the predictions.
        class_names: Passed through unchanged to
            ``classification_report_from_scratch``.
        batch_size: Number of samples sent through the model per forward pass.

    Returns:
        The mean of ``predictions == y``, i.e. the fraction of correct predictions.
    """
    sample_count = len(X)
    # Ceiling division: a final, shorter batch picks up any leftover samples.
    batch_count = (sample_count + batch_size - 1) // batch_size

    print(f"Evaluating on {sample_count} samples in batches of {batch_size}...")
    progress = tqdm(range(batch_count), desc="Transformer Evaluation Progress")

    def _predicted_labels():
        """Yield the arg-max class of every sample, batch after batch."""
        for batch_no in progress:
            first = batch_no * batch_size
            scores = model.forward(X[first:first + batch_size], training=False)
            yield from np.argmax(scores, axis=1).tolist()

    all_predictions = np.array(list(_predicted_labels()))

    # --- Generate the Detailed Report ---
    print("\n--- Classification Report for Transformer from GitHub ---")
    classification_report_from_scratch(y, all_predictions, class_names=class_names)

    # Share of predictions that agree with the true labels
    return np.mean(all_predictions == y)

# --- 5. Load Weights and Evaluate the Model ---
# Loading and evaluation are guarded separately so that a failure is reported
# against the step that actually caused it: the URL hint is only shown when the
# download/load step fails, never for an error raised during evaluation.
try:
    # --- Step 5a: Load the Weights ---
    print("\nAttempting to load Transformer weights from URL...")
    load_weights_from_url(transformer_model_from_git, transformer_weights_url)

except Exception as e:
    print(f"\nAn error occurred: {e}")
    print("Please ensure the URL is correct and the weights file is accessible.")

else:
    print("Transformer weights successfully loaded.")

    try:
        # --- Step 5b: Evaluate the Model's Performance ---

        # Define class names for the report
        class_labels = ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive']

        # Prints the classification report and returns the overall accuracy
        test_accuracy = evaluate_and_report_transformer(
            transformer_model_from_git,
            X_test,
            y_test,
            class_names=class_labels,
            batch_size=64
        )

        # --- Step 5c: Show the Final Accuracy ---
        print("\n" + "="*40)
        print(f"Final Transformer Model Accuracy from GitHub: {test_accuracy:.4f}")
        print("="*40)

    except Exception as e:
        # Still best-effort (the notebook keeps running), but name the failing
        # step and the exception type instead of blaming the download.
        print(f"\nAn error occurred during evaluation: {type(e).__name__}: {e}")
        print("The weights were loaded, so check the model configuration and the test data rather than the URL.")

Initializing a new model structure...

Attempting to load Transformer weights from URL...
Downloading weights from https://raw.githubusercontent.com/anirudha22-stack/nlp_assignment_models/main/transformer_model_weights.npz...
Downloaded file contains the following keys: ['layer_0_W', 'layer_3_MultiHeadAttention_0_W_qkv', 'layer_3_MultiHeadAttention_0_W_o', 'layer_3_LayerNormalization_1_gamma', 'layer_3_LayerNormalization_1_beta', 'layer_3_PositionwiseFeedForward_2_W_1', 'layer_3_PositionwiseFeedForward_2_b_1', 'layer_3_PositionwiseFeedForward_2_W_2', 'layer_3_PositionwiseFeedForward_2_b_2', 'layer_3_LayerNormalization_3_gamma', 'layer_3_LayerNormalization_3_beta', 'layer_4_MultiHeadAttention_0_W_qkv', 'layer_4_MultiHeadAttention_0_W_o', 'layer_4_LayerNormalization_1_gamma', 'layer_4_LayerNormalization_1_beta', 'layer_4_PositionwiseFeedForward_2_W_1', 'layer_4_PositionwiseFeedForward_2_b_1', 'layer_4_PositionwiseFeedForward_2_W_2', 'layer_4_PositionwiseFeedForward_2_b_2', 'layer_4_Layer

Transformer Evaluation Progress:   0%|          | 0/60 [00:00<?, ?it/s]


--- Classification Report for Transformer from GitHub ---
                     precision      recall    f1-score     support

Extremely Negative        0.67        0.78        0.72         592
Negative                  0.67        0.67        0.67        1041
Neutral                   0.86        0.78        0.82         619
Positive                  0.73        0.65        0.69         947
Extremely Positive        0.74        0.82        0.78         599

accuracy                                          0.72        3798
micro avg                 0.72        0.72        0.72        3798
macro avg                 0.73        0.74        0.73        3798
weighted avg              0.73        0.72        0.72        3798


Final Transformer Model Accuracy from GitHub: 0.7227
