In [3]:
import numpy as np
import pandas as pd
import json
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from collections import Counter

# ============================================================================
# DATA LOADING AND PREPROCESSING
# ============================================================================

def parse_atis_sample(sample):
    """Parse ATIS JSON sample"""
    if 'tokens' in sample:
        tokens = sample['tokens']
    elif 'text' in sample:
        tokens = sample['text'].split() if isinstance(sample['text'], str) else sample['text']
    else:
        tokens = []

    intent = sample.get('intent', sample.get('label', sample.get('intent_label', '')))

    if 'slots' in sample:
        slots = sample['slots']
    elif 'slot_labels' in sample:
        slots = sample['slot_labels']
    else:
        slots = ['O'] * len(tokens)

    return tokens, intent, slots

def parse_slurp_sample(sample):
    """Parse SLURP JSON sample - handles multiple formats"""
    # Try to get tokens from various possible fields
    tokens = []
    if 'tokens' in sample:
        tokens = sample['tokens']
    elif 'sentence' in sample:
        # If sentence is a string, split it
        sentence = sample['sentence']
        if isinstance(sentence, str):
            tokens = sentence.split()
        else:
            tokens = sentence
    elif 'text' in sample:
        text = sample['text']
        if isinstance(text, str):
            tokens = text.split()
        else:
            tokens = text
    
    # Try to get intent from various possible fields
    scenario = sample.get('scenario', '')
    action = sample.get('action', '')
    
    # Check if intent is directly provided
    if 'intent' in sample:
        intent = sample['intent']
    elif scenario and action:
        intent = f"{scenario}_{action}"
    elif scenario:
        intent = scenario
    elif action:
        intent = action
    else:
        intent = ''
    
    # Get entities/slots
    entities = sample.get('entities', [])
    slots = ['O'] * len(tokens)

    for entity in entities:
        slot_type = entity.get('type', 'entity')
        start = entity.get('start', 0)
        end = entity.get('end', 0)

        if start < len(slots):
            slots[start] = f'B-{slot_type}'
        for i in range(start + 1, min(end, len(slots))):
            slots[i] = f'I-{slot_type}'

    return tokens, intent, slots

def load_json_dataset(file_path, dataset_type='atis'):
    """Load dataset from JSON Lines (.jsonl) file"""
    print(f"Loading {file_path}...")

    parsed_data = []
    parse_fn = parse_atis_sample if dataset_type == 'atis' else parse_slurp_sample
    error_count = 0
    sample_shown = False

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:  # Skip empty lines
                    continue
                    
                try:
                    sample = json.loads(line)
                    
                    # Show first sample for debugging
                    if not sample_shown and line_num == 1:
                        print(f"\n  First sample structure: {list(sample.keys())}")
                        print(f"  Sample preview: {str(sample)[:200]}...")
                        sample_shown = True
                    
                    tokens, intent, slots = parse_fn(sample)
                    
                    # Validate the sample
                    if tokens and intent:
                        # Ensure slots match tokens length
                        if len(slots) != len(tokens):
                            slots = slots[:len(tokens)] + ['O'] * max(0, len(tokens) - len(slots))
                        parsed_data.append((tokens, intent, slots))
                    else:
                        error_count += 1
                        if error_count <= 3:  # Show first 3 errors with details
                            print(f"  Warning line {line_num}: Invalid sample")
                            print(f"    - Has tokens: {bool(tokens)} (length: {len(tokens)})")
                            print(f"    - Has intent: {bool(intent)} (value: '{intent}')")
                            
                except json.JSONDecodeError as e:
                    error_count += 1
                    if error_count <= 3:
                        print(f"  Error line {line_num}: JSON decode error - {e}")
                except Exception as e:
                    error_count += 1
                    if error_count <= 3:
                        print(f"  Error line {line_num}: {e}")
    
    except FileNotFoundError:
        print(f"ERROR: File not found - {file_path}")
        return []
    except Exception as e:
        print(f"ERROR reading file: {e}")
        return []

    if error_count > 3:
        print(f"  ... and {error_count - 3} more errors")
    
    print(f"Loaded {len(parsed_data)} valid samples")
    
    if len(parsed_data) == 0:
        print(f"\n⚠️  WARNING: No valid samples loaded from {file_path}!")
        print("\nPlease check the file format. The parser is looking for:")
        if dataset_type == 'slurp':
            print("  - 'tokens' OR 'sentence' OR 'text': the input words")
            print("  - 'scenario' + 'action' OR 'intent': the intent label")
            print("  - 'entities': entity annotations (optional)")
        else:
            print("  - 'tokens' or 'text': words/sentence")
            print("  - 'intent': intent label")
            print("  - 'slots': slot labels")
    
    return parsed_data

def build_vocab_and_mappings(train_data, val_data, min_freq=1):
    """Build vocabulary and label mappings from data"""
    print("\nBuilding vocabulary and mappings...")

    if len(train_data) == 0 and len(val_data) == 0:
        print("ERROR: No training or validation data available!")
        return {'<PAD>': 0, '<UNK>': 1}, {}, {}

    # Collect all tokens, intents, and slots
    all_tokens = []
    all_intents = []
    all_slots = []

    for tokens, intent, slots in train_data + val_data:
        all_tokens.extend([t.lower() for t in tokens])
        all_intents.append(intent)
        all_slots.extend(slots)

    # Build vocabulary
    token_counter = Counter(all_tokens)
    vocab = {'<PAD>': 0, '<UNK>': 1}
    idx = 2
    for token, freq in token_counter.most_common():
        if freq >= min_freq:
            vocab[token] = idx
            idx += 1

    # Build intent mapping
    unique_intents = sorted(set(all_intents))
    intent2idx = {intent: idx for idx, intent in enumerate(unique_intents)}

    # Build slot mapping
    unique_slots = sorted(set(all_slots))
    slot2idx = {slot: idx for idx, slot in enumerate(unique_slots)}

    print(f"Vocabulary size: {len(vocab)}")
    print(f"Number of intents: {len(intent2idx)}")
    print(f"Number of slots: {len(slot2idx)}")

    return vocab, intent2idx, slot2idx

def convert_to_ids(data, vocab, intent2idx, slot2idx):
    """Convert tokens and labels to IDs"""
    processed = []

    for tokens, intent, slots in data:
        # Convert tokens to IDs
        input_ids = [vocab.get(t.lower(), vocab['<UNK>']) for t in tokens]

        # Convert intent to ID
        intent_id = intent2idx.get(intent, 0)

        # Convert slots to IDs
        slot_ids = [slot2idx.get(s, 0) for s in slots]

        processed.append({
            'input_ids': input_ids,
            'intent': intent_id,
            'slots': slot_ids
        })

    return processed



def load_atis_csv(file_path):
    """Load ATIS data from CSV file - handles flexible column names"""
    print(f"Loading ATIS CSV from {file_path}...")
    
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"ERROR: File not found - {file_path}")
        return []
    
    print(f"CSV columns found: {list(df.columns)}")
    
    parsed_data = []
    
    # Find tokens column (could be 'tokens', 'text', 'sentence', etc.)
    tokens_col = None
    for col in ['tokens', 'text', 'sentence', 'words']:
        if col in df.columns:
            tokens_col = col
            break
    
    if tokens_col is None:
        print(f"ERROR: No tokens column found. Available columns: {list(df.columns)}")
        print("Expected one of: 'tokens', 'text', 'sentence', or 'words'")
        return []
    
    print(f"Using '{tokens_col}' column for tokens")
    
    # Find intent column
    if 'intent' not in df.columns:
        print(f"ERROR: 'intent' column not found. Available columns: {list(df.columns)}")
        return []
    
    # Slots are optional
    has_slots = 'slots' in df.columns
    
    for idx, row in df.iterrows():
        tokens_str = str(row[tokens_col]).strip()
        
        # Parse tokens - handle different formats
        if tokens_str.startswith('['):
            # JSON-like format: "[token1, token2, ...]"
            try:
                tokens = eval(tokens_str)
            except:
                tokens = tokens_str.split()
        else:
            # Space-separated: "token1 token2 ..."
            tokens = tokens_str.split()
        
        intent = str(row['intent']).strip()
        
        if has_slots:
            slots_str = str(row['slots']).strip()
            if slots_str.startswith('['):
                try:
                    slots = eval(slots_str)
                except:
                    slots = slots_str.split()
            else:
                slots = slots_str.split()
        else:
            slots = ['O'] * len(tokens)
        
        if tokens and intent:
            # Ensure slots match token length
            if len(slots) != len(tokens):
                slots = slots[:len(tokens)] + ['O'] * max(0, len(tokens) - len(slots))
            
            parsed_data.append((tokens, intent, slots))
    
    print(f"Loaded {len(parsed_data)} samples from CSV\n")
    return parsed_data


def create_train_val_split(train_data, val_split=0.2, random_state=42):
    """Split training data into train and validation sets"""
    train_split, val_split_data = train_test_split(
        train_data, 
        test_size=val_split, 
        random_state=random_state
    )
    return train_split, val_split_data


# ============================================================================
# ACTIVATION FUNCTIONS
# ============================================================================

def tanh(x):
    return np.tanh(x)

def tanh_derivative(x):
    return 1 - np.tanh(x)**2

def sigmoid(x):
    return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

# ============================================================================
# RNN CELL FROM SCRATCH
# ============================================================================

class RNNCell:
    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size

        limit_ih = np.sqrt(6 / (input_size + hidden_size))
        limit_hh = np.sqrt(6 / (hidden_size + hidden_size))

        self.W_ih = np.random.uniform(-limit_ih, limit_ih, (input_size, hidden_size))
        self.W_hh = np.random.uniform(-limit_hh, limit_hh, (hidden_size, hidden_size))
        self.b_h = np.zeros((1, hidden_size))

        self.m_W_ih = np.zeros_like(self.W_ih)
        self.v_W_ih = np.zeros_like(self.W_ih)
        self.m_W_hh = np.zeros_like(self.W_hh)
        self.v_W_hh = np.zeros_like(self.W_hh)
        self.m_b_h = np.zeros_like(self.b_h)
        self.v_b_h = np.zeros_like(self.b_h)

    def forward(self, x, h_prev):
        self.x = x
        self.h_prev = h_prev
        self.h = tanh(x @ self.W_ih + h_prev @ self.W_hh + self.b_h)
        return self.h

    def backward(self, dh_next):
        dh_raw = dh_next * tanh_derivative(self.h)

        self.dW_ih = self.x.T @ dh_raw
        self.dW_hh = self.h_prev.T @ dh_raw
        self.db_h = np.sum(dh_raw, axis=0, keepdims=True)

        dx = dh_raw @ self.W_ih.T
        dh_prev = dh_raw @ self.W_hh.T

        return dx, dh_prev

# ============================================================================
# BIDIRECTIONAL RNN LAYER
# ============================================================================

class BiRNN:
    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.forward_cells = []
        self.backward_cells = []
        self.max_cells = 0

    def forward(self, X, seq_lengths=None):
        batch_size, max_seq_len, _ = X.shape

        # Dynamically create cells if needed
        while len(self.forward_cells) < max_seq_len:
            self.forward_cells.append(RNNCell(self.input_size, self.hidden_size))
            self.backward_cells.append(RNNCell(self.input_size, self.hidden_size))
        
        self.max_cells = max(self.max_cells, max_seq_len)

        h_forward = np.zeros((batch_size, self.hidden_size))
        forward_hiddens = []

        for t in range(max_seq_len):
            h_forward = self.forward_cells[t].forward(X[:, t, :], h_forward)
            forward_hiddens.append(h_forward)

        h_backward = np.zeros((batch_size, self.hidden_size))
        backward_hiddens = []

        for t in range(max_seq_len - 1, -1, -1):
            h_backward = self.backward_cells[t].forward(X[:, t, :], h_backward)
            backward_hiddens.insert(0, h_backward)

        outputs = []
        for t in range(max_seq_len):
            outputs.append(np.concatenate([forward_hiddens[t], backward_hiddens[t]], axis=1))

        outputs = np.stack(outputs, axis=1)
        return outputs

    def backward(self, doutputs):
        batch_size, max_seq_len, _ = doutputs.shape

        dforward = doutputs[:, :, :self.hidden_size]
        dbackward = doutputs[:, :, self.hidden_size:]

        dh_next = np.zeros((batch_size, self.hidden_size))
        for t in range(max_seq_len - 1, -1, -1):
            dh = dforward[:, t, :] + dh_next
            _, dh_next = self.forward_cells[t].backward(dh)

        dh_next = np.zeros((batch_size, self.hidden_size))
        for t in range(max_seq_len):
            dh = dbackward[:, t, :] + dh_next
            _, dh_next = self.backward_cells[t].backward(dh)


import numpy as np

# ============================================================================
# LSTM CELL FROM SCRATCH
# ============================================================================

import numpy as np

# ============================================================================
# LSTM CELL FROM SCRATCH
# ============================================================================

import numpy as np

# ============================================================================
# LSTM CELL FROM SCRATCH
# ============================================================================

class LSTMCell:
    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Xavier initialization
        limit_ih = np.sqrt(6 / (input_size + hidden_size))
        limit_hh = np.sqrt(6 / (hidden_size + hidden_size))

        # Forget gate parameters
        self.W_if = np.random.uniform(-limit_ih, limit_ih, (input_size, hidden_size))
        self.W_hf = np.random.uniform(-limit_hh, limit_hh, (hidden_size, hidden_size))
        self.b_f = np.zeros((1, hidden_size))

        # Input gate parameters
        self.W_ii = np.random.uniform(-limit_ih, limit_ih, (input_size, hidden_size))
        self.W_hi = np.random.uniform(-limit_hh, limit_hh, (hidden_size, hidden_size))
        self.b_i = np.zeros((1, hidden_size))

        # Cell gate parameters
        self.W_ig = np.random.uniform(-limit_ih, limit_ih, (input_size, hidden_size))
        self.W_hg = np.random.uniform(-limit_hh, limit_hh, (hidden_size, hidden_size))
        self.b_g = np.zeros((1, hidden_size))

        # Output gate parameters
        self.W_io = np.random.uniform(-limit_ih, limit_ih, (input_size, hidden_size))
        self.W_ho = np.random.uniform(-limit_hh, limit_hh, (hidden_size, hidden_size))
        self.b_o = np.zeros((1, hidden_size))

        # Initialize Adam optimizer parameters for all weights
        self.m_W_if = np.zeros_like(self.W_if)
        self.v_W_if = np.zeros_like(self.W_if)
        self.m_W_hf = np.zeros_like(self.W_hf)
        self.v_W_hf = np.zeros_like(self.W_hf)
        self.m_b_f = np.zeros_like(self.b_f)
        self.v_b_f = np.zeros_like(self.b_f)

        self.m_W_ii = np.zeros_like(self.W_ii)
        self.v_W_ii = np.zeros_like(self.W_ii)
        self.m_W_hi = np.zeros_like(self.W_hi)
        self.v_W_hi = np.zeros_like(self.W_hi)
        self.m_b_i = np.zeros_like(self.b_i)
        self.v_b_i = np.zeros_like(self.b_i)

        self.m_W_ig = np.zeros_like(self.W_ig)
        self.v_W_ig = np.zeros_like(self.W_ig)
        self.m_W_hg = np.zeros_like(self.W_hg)
        self.v_W_hg = np.zeros_like(self.W_hg)
        self.m_b_g = np.zeros_like(self.b_g)
        self.v_b_g = np.zeros_like(self.b_g)

        self.m_W_io = np.zeros_like(self.W_io)
        self.v_W_io = np.zeros_like(self.W_io)
        self.m_W_ho = np.zeros_like(self.W_ho)
        self.v_W_ho = np.zeros_like(self.W_ho)
        self.m_b_o = np.zeros_like(self.b_o)
        self.v_b_o = np.zeros_like(self.b_o)

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def tanh(self, x):
        return np.tanh(x)

    def forward(self, x, h_prev, c_prev):
        self.x = x
        self.h_prev = h_prev
        self.c_prev = c_prev

        # Forget gate
        self.f_gate = self.sigmoid(x @ self.W_if + h_prev @ self.W_hf + self.b_f)
        
        # Input gate
        self.i_gate = self.sigmoid(x @ self.W_ii + h_prev @ self.W_hi + self.b_i)
        
        # Cell gate (candidate)
        self.g_gate = self.tanh(x @ self.W_ig + h_prev @ self.W_hg + self.b_g)
        
        # Output gate
        self.o_gate = self.sigmoid(x @ self.W_io + h_prev @ self.W_ho + self.b_o)

        # New cell state
        self.c = self.f_gate * c_prev + self.i_gate * self.g_gate
        
        # New hidden state
        self.h = self.o_gate * self.tanh(self.c)

        return self.h, self.c

    def backward(self, dh_next, dc_next):
        # Gradient through output gate
        dtanh_c = dh_next * self.o_gate
        dc = dc_next + dtanh_c * (1 - self.tanh(self.c) ** 2)

        # Gradient through output gate
        do_raw = dh_next * self.tanh(self.c)
        do = do_raw * self.o_gate * (1 - self.o_gate)

        # Gradient through cell state
        df_raw = dc * self.c_prev
        df = df_raw * self.f_gate * (1 - self.f_gate)

        di_raw = dc * self.g_gate
        di = di_raw * self.i_gate * (1 - self.i_gate)

        dg_raw = dc * self.i_gate
        dg = dg_raw * (1 - self.g_gate ** 2)

        # Compute weight gradients
        self.dW_if = self.x.T @ df
        self.dW_hf = self.h_prev.T @ df
        self.db_f = np.sum(df, axis=0, keepdims=True)

        self.dW_ii = self.x.T @ di
        self.dW_hi = self.h_prev.T @ di
        self.db_i = np.sum(di, axis=0, keepdims=True)

        self.dW_ig = self.x.T @ dg
        self.dW_hg = self.h_prev.T @ dg
        self.db_g = np.sum(dg, axis=0, keepdims=True)

        self.dW_io = self.x.T @ do
        self.dW_ho = self.h_prev.T @ do
        self.db_o = np.sum(do, axis=0, keepdims=True)

        # Compute gradients for inputs
        dx = (df @ self.W_if.T + di @ self.W_ii.T + 
              dg @ self.W_ig.T + do @ self.W_io.T)
        
        dh_prev = (df @ self.W_hf.T + di @ self.W_hi.T + 
                   dg @ self.W_hg.T + do @ self.W_ho.T)
        
        dc_prev = dc * self.f_gate

        return dx, dh_prev, dc_prev


# ============================================================================
# BIDIRECTIONAL LSTM LAYER
# ============================================================================

class BiLSTM:
    def __init__(self, input_size, hidden_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.forward_cells = []
        self.backward_cells = []
        self.max_cells = 0

    def forward(self, X, seq_lengths=None):
        batch_size, max_seq_len, _ = X.shape

        # Dynamically create cells if needed
        while len(self.forward_cells) < max_seq_len:
            self.forward_cells.append(LSTMCell(self.input_size, self.hidden_size))
            self.backward_cells.append(LSTMCell(self.input_size, self.hidden_size))
        
        self.max_cells = max(self.max_cells, max_seq_len)

        # Forward pass
        h_forward = np.zeros((batch_size, self.hidden_size))
        c_forward = np.zeros((batch_size, self.hidden_size))
        forward_hiddens = []

        for t in range(max_seq_len):
            h_forward, c_forward = self.forward_cells[t].forward(
                X[:, t, :], h_forward, c_forward
            )
            forward_hiddens.append(h_forward)

        # Backward pass
        h_backward = np.zeros((batch_size, self.hidden_size))
        c_backward = np.zeros((batch_size, self.hidden_size))
        backward_hiddens = []

        for t in range(max_seq_len - 1, -1, -1):
            h_backward, c_backward = self.backward_cells[t].forward(
                X[:, t, :], h_backward, c_backward
            )
            backward_hiddens.insert(0, h_backward)

        # Concatenate forward and backward outputs
        outputs = []
        for t in range(max_seq_len):
            outputs.append(np.concatenate([forward_hiddens[t], backward_hiddens[t]], axis=1))

        outputs = np.stack(outputs, axis=1)
        return outputs

    def backward(self, doutputs):
        batch_size, max_seq_len, _ = doutputs.shape

        dforward = doutputs[:, :, :self.hidden_size]
        dbackward = doutputs[:, :, self.hidden_size:]

        # Backward pass through forward direction
        dh_next = np.zeros((batch_size, self.hidden_size))
        dc_next = np.zeros((batch_size, self.hidden_size))
        
        for t in range(max_seq_len - 1, -1, -1):
            dh = dforward[:, t, :] + dh_next
            _, dh_next, dc_next = self.forward_cells[t].backward(dh, dc_next)

        # Backward pass through backward direction
        dh_next = np.zeros((batch_size, self.hidden_size))
        dc_next = np.zeros((batch_size, self.hidden_size))
        
        for t in range(max_seq_len):
            dh = dbackward[:, t, :] + dh_next
            _, dh_next, dc_next = self.backward_cells[t].backward(dh, dc_next)


# ============================================================================
# INTENT CLASSIFIER WITH LSTM
# ============================================================================

class IntentClassifierLSTM:
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_intents):
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_intents = num_intents

        self.embeddings = np.random.randn(vocab_size, embed_dim) * 0.01
        self.encoder = BiLSTM(embed_dim, hidden_dim)

        self.W_out = np.random.randn(2 * hidden_dim, num_intents) * 0.01
        self.b_out = np.zeros((1, num_intents))

        self.m_W_out = np.zeros_like(self.W_out)
        self.v_W_out = np.zeros_like(self.W_out)
        self.m_b_out = np.zeros_like(self.b_out)
        self.v_b_out = np.zeros_like(self.b_out)
        self.m_emb = np.zeros_like(self.embeddings)
        self.v_emb = np.zeros_like(self.embeddings)

        self.t = 0

    def forward(self, input_ids, training=True):
        self.input_ids = input_ids
        batch_size, seq_len = input_ids.shape

        self.embeds = self.embeddings[input_ids]

        if training:
            self.dropout_mask = (np.random.rand(*self.embeds.shape) > 0.3).astype(float) / 0.7
            embeds_dropped = self.embeds * self.dropout_mask
        else:
            embeds_dropped = self.embeds

        self.rnn_outputs = self.encoder.forward(embeds_dropped)
        self.final_hidden = self.rnn_outputs[:, -1, :]
        self.logits = self.final_hidden @ self.W_out + self.b_out

        return self.logits

    def backward(self, dloss):
        batch_size = dloss.shape[0]

        dW_out = self.final_hidden.T @ dloss
        db_out = np.sum(dloss, axis=0, keepdims=True)

        dfinal_hidden = dloss @ self.W_out.T

        drnn_outputs = np.zeros_like(self.rnn_outputs)
        drnn_outputs[:, -1, :] = dfinal_hidden

        self.encoder.backward(drnn_outputs)

        return dW_out, db_out

    def update(self, dW_out, db_out, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        self.t += 1

        # Update output layer
        self.m_W_out = beta1 * self.m_W_out + (1 - beta1) * dW_out
        self.v_W_out = beta2 * self.v_W_out + (1 - beta2) * (dW_out ** 2)
        m_hat = self.m_W_out / (1 - beta1 ** self.t)
        v_hat = self.v_W_out / (1 - beta2 ** self.t)
        self.W_out -= lr * m_hat / (np.sqrt(v_hat) + eps)

        self.m_b_out = beta1 * self.m_b_out + (1 - beta1) * db_out
        self.v_b_out = beta2 * self.v_b_out + (1 - beta2) * (db_out ** 2)
        m_hat = self.m_b_out / (1 - beta1 ** self.t)
        v_hat = self.v_b_out / (1 - beta2 ** self.t)
        self.b_out -= lr * m_hat / (np.sqrt(v_hat) + eps)

        # Update LSTM cells (same as IntentClassifierLSTM)
        for cell in self.encoder.forward_cells + self.encoder.backward_cells:
            # Update forget gate weights
            cell.m_W_if = beta1 * cell.m_W_if + (1 - beta1) * cell.dW_if
            cell.v_W_if = beta2 * cell.v_W_if + (1 - beta2) * (cell.dW_if ** 2)
            m_hat_if = cell.m_W_if / (1 - beta1 ** self.t)
            v_hat_if = cell.v_W_if / (1 - beta2 ** self.t)
            cell.W_if -= lr * m_hat_if / (np.sqrt(v_hat_if) + eps)

            cell.m_W_hf = beta1 * cell.m_W_hf + (1 - beta1) * cell.dW_hf
            cell.v_W_hf = beta2 * cell.v_W_hf + (1 - beta2) * (cell.dW_hf ** 2)
            m_hat_hf = cell.m_W_hf / (1 - beta1 ** self.t)
            v_hat_hf = cell.v_W_hf / (1 - beta2 ** self.t)
            cell.W_hf -= lr * m_hat_hf / (np.sqrt(v_hat_hf) + eps)

            cell.m_b_f = beta1 * cell.m_b_f + (1 - beta1) * cell.db_f
            cell.v_b_f = beta2 * cell.v_b_f + (1 - beta2) * (cell.db_f ** 2)
            m_hat_bf = cell.m_b_f / (1 - beta1 ** self.t)
            v_hat_bf = cell.v_b_f / (1 - beta2 ** self.t)
            cell.b_f -= lr * m_hat_bf / (np.sqrt(v_hat_bf) + eps)

            # Update input gate weights
            cell.m_W_ii = beta1 * cell.m_W_ii + (1 - beta1) * cell.dW_ii
            cell.v_W_ii = beta2 * cell.v_W_ii + (1 - beta2) * (cell.dW_ii ** 2)
            m_hat_ii = cell.m_W_ii / (1 - beta1 ** self.t)
            v_hat_ii = cell.v_W_ii / (1 - beta2 ** self.t)
            cell.W_ii -= lr * m_hat_ii / (np.sqrt(v_hat_ii) + eps)

            cell.m_W_hi = beta1 * cell.m_W_hi + (1 - beta1) * cell.dW_hi
            cell.v_W_hi = beta2 * cell.v_W_hi + (1 - beta2) * (cell.dW_hi ** 2)
            m_hat_hi = cell.m_W_hi / (1 - beta1 ** self.t)
            v_hat_hi = cell.v_W_hi / (1 - beta2 ** self.t)
            cell.W_hi -= lr * m_hat_hi / (np.sqrt(v_hat_hi) + eps)

            cell.m_b_i = beta1 * cell.m_b_i + (1 - beta1) * cell.db_i
            cell.v_b_i = beta2 * cell.v_b_i + (1 - beta2) * (cell.db_i ** 2)
            m_hat_bi = cell.m_b_i / (1 - beta1 ** self.t)
            v_hat_bi = cell.v_b_i / (1 - beta2 ** self.t)
            cell.b_i -= lr * m_hat_bi / (np.sqrt(v_hat_bi) + eps)

            # Update cell gate weights
            cell.m_W_ig = beta1 * cell.m_W_ig + (1 - beta1) * cell.dW_ig
            cell.v_W_ig = beta2 * cell.v_W_ig + (1 - beta2) * (cell.dW_ig ** 2)
            m_hat_ig = cell.m_W_ig / (1 - beta1 ** self.t)
            v_hat_ig = cell.v_W_ig / (1 - beta2 ** self.t)
            cell.W_ig -= lr * m_hat_ig / (np.sqrt(v_hat_ig) + eps)

            cell.m_W_hg = beta1 * cell.m_W_hg + (1 - beta1) * cell.dW_hg
            cell.v_W_hg = beta2 * cell.v_W_hg + (1 - beta2) * (cell.dW_hg ** 2)
            m_hat_hg = cell.m_W_hg / (1 - beta1 ** self.t)
            v_hat_hg = cell.v_W_hg / (1 - beta2 ** self.t)
            cell.W_hg -= lr * m_hat_hg / (np.sqrt(v_hat_hg) + eps)

            cell.m_b_g = beta1 * cell.m_b_g + (1 - beta1) * cell.db_g
            cell.v_b_g = beta2 * cell.v_b_g + (1 - beta2) * (cell.db_g ** 2)
            m_hat_bg = cell.m_b_g / (1 - beta1 ** self.t)
            v_hat_bg = cell.v_b_g / (1 - beta2 ** self.t)
            cell.b_g -= lr * m_hat_bg / (np.sqrt(v_hat_bg) + eps)

            # Update output gate weights
            cell.m_W_io = beta1 * cell.m_W_io + (1 - beta1) * cell.dW_io
            cell.v_W_io = beta2 * cell.v_W_io + (1 - beta2) * (cell.dW_io ** 2)
            m_hat_io = cell.m_W_io / (1 - beta1 ** self.t)
            v_hat_io = cell.v_W_io / (1 - beta2 ** self.t)
            cell.W_io -= lr * m_hat_io / (np.sqrt(v_hat_io) + eps)

            cell.m_W_ho = beta1 * cell.m_W_ho + (1 - beta1) * cell.dW_ho
            cell.v_W_ho = beta2 * cell.v_W_ho + (1 - beta2) * (cell.dW_ho ** 2)
            m_hat_ho = cell.m_W_ho / (1 - beta1 ** self.t)
            v_hat_ho = cell.v_W_ho / (1 - beta2 ** self.t)
            cell.W_ho -= lr * m_hat_ho / (np.sqrt(v_hat_ho) + eps)

            cell.m_b_o = beta1 * cell.m_b_o + (1 - beta1) * cell.db_o
            cell.v_b_o = beta2 * cell.v_b_o + (1 - beta2) * (cell.db_o ** 2)
            m_hat_bo = cell.m_b_o / (1 - beta1 ** self.t)
            v_hat_bo = cell.v_b_o / (1 - beta2 ** self.t)
            cell.b_o -= lr * m_hat_bo / (np.sqrt(v_hat_bo) + eps)

    def predict(self, input_ids, intent_ids):
        logits = self.forward(input_ids, intent_ids, training=False)
        return np.argmax(logits, axis=2)

            

    def predict(self, input_ids):
        logits = self.forward(input_ids, training=False)
        return np.argmax(logits, axis=1)


# ============================================================================
# SLOT FILLING MODEL WITH LSTM
# ============================================================================

class SlotFillingWithIntentLSTM:
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_slots, num_intents):
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_slots = num_slots
        self.num_intents = num_intents

        self.word_embeddings = np.random.randn(vocab_size, embed_dim) * 0.01
        self.intent_embeddings = np.random.randn(num_intents, 32) * 0.01

        self.encoder = BiLSTM(embed_dim + 32, hidden_dim)

        self.W_out = np.random.randn(2 * hidden_dim, num_slots) * 0.01
        self.b_out = np.zeros((1, num_slots))

        self.m_W_out = np.zeros_like(self.W_out)
        self.v_W_out = np.zeros_like(self.W_out)
        self.m_b_out = np.zeros_like(self.b_out)
        self.v_b_out = np.zeros_like(self.b_out)

        self.t = 0

    def forward(self, input_ids, intent_ids, training=True):
        self.input_ids = input_ids
        self.intent_ids = intent_ids
        batch_size, seq_len = input_ids.shape

        self.word_embeds = self.word_embeddings[input_ids]

        intent_embeds = self.intent_embeddings[intent_ids]
        self.intent_embeds = np.repeat(intent_embeds[:, np.newaxis, :], seq_len, axis=1)

        self.combined = np.concatenate([self.word_embeds, self.intent_embeds], axis=2)

        if training:
            self.dropout_mask = (np.random.rand(*self.combined.shape) > 0.3).astype(float) / 0.7
            combined_dropped = self.combined * self.dropout_mask
        else:
            combined_dropped = self.combined

        self.rnn_outputs = self.encoder.forward(combined_dropped)

        batch_size, seq_len, hidden_size = self.rnn_outputs.shape
        rnn_flat = self.rnn_outputs.reshape(-1, hidden_size)
        logits_flat = rnn_flat @ self.W_out + self.b_out
        self.logits = logits_flat.reshape(batch_size, seq_len, self.num_slots)

        return self.logits

    def backward(self, dloss):
        batch_size, seq_len, _ = dloss.shape
        hidden_size = 2 * self.hidden_dim

        dloss_flat = dloss.reshape(-1, self.num_slots)
        rnn_flat = self.rnn_outputs.reshape(-1, hidden_size)

        dW_out = rnn_flat.T @ dloss_flat
        db_out = np.sum(dloss_flat, axis=0, keepdims=True)

        drnn_flat = dloss_flat @ self.W_out.T
        drnn_outputs = drnn_flat.reshape(batch_size, seq_len, hidden_size)

        self.encoder.backward(drnn_outputs)

        return dW_out, db_out

    def update(self, dW_out, db_out, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        self.t += 1

        # Update output layer
        self.m_W_out = beta1 * self.m_W_out + (1 - beta1) * dW_out
        self.v_W_out = beta2 * self.v_W_out + (1 - beta2) * (dW_out ** 2)
        m_hat = self.m_W_out / (1 - beta1 ** self.t)
        v_hat = self.v_W_out / (1 - beta2 ** self.t)
        self.W_out -= lr * m_hat / (np.sqrt(v_hat) + eps)

        self.m_b_out = beta1 * self.m_b_out + (1 - beta1) * db_out
        self.v_b_out = beta2 * self.v_b_out + (1 - beta2) * (db_out ** 2)
        m_hat = self.m_b_out / (1 - beta1 ** self.t)
        v_hat = self.v_b_out / (1 - beta2 ** self.t)
        self.b_out -= lr * m_hat / (np.sqrt(v_hat) + eps)

        # Update LSTM cells
        for cell in self.encoder.forward_cells + self.encoder.backward_cells:
            # Update forget gate weights
            cell.m_W_if = beta1 * cell.m_W_if + (1 - beta1) * cell.dW_if
            cell.v_W_if = beta2 * cell.v_W_if + (1 - beta2) * (cell.dW_if ** 2)
            m_hat_if = cell.m_W_if / (1 - beta1 ** self.t)
            v_hat_if = cell.v_W_if / (1 - beta2 ** self.t)
            cell.W_if -= lr * m_hat_if / (np.sqrt(v_hat_if) + eps)

            cell.m_W_hf = beta1 * cell.m_W_hf + (1 - beta1) * cell.dW_hf
            cell.v_W_hf = beta2 * cell.v_W_hf + (1 - beta2) * (cell.dW_hf ** 2)
            m_hat_hf = cell.m_W_hf / (1 - beta1 ** self.t)
            v_hat_hf = cell.v_W_hf / (1 - beta2 ** self.t)
            cell.W_hf -= lr * m_hat_hf / (np.sqrt(v_hat_hf) + eps)

            cell.m_b_f = beta1 * cell.m_b_f + (1 - beta1) * cell.db_f
            cell.v_b_f = beta2 * cell.v_b_f + (1 - beta2) * (cell.db_f ** 2)
            m_hat_bf = cell.m_b_f / (1 - beta1 ** self.t)
            v_hat_bf = cell.v_b_f / (1 - beta2 ** self.t)
            cell.b_f -= lr * m_hat_bf / (np.sqrt(v_hat_bf) + eps)

            # Update input gate weights
            cell.m_W_ii = beta1 * cell.m_W_ii + (1 - beta1) * cell.dW_ii
            cell.v_W_ii = beta2 * cell.v_W_ii + (1 - beta2) * (cell.dW_ii ** 2)
            m_hat_ii = cell.m_W_ii / (1 - beta1 ** self.t)
            v_hat_ii = cell.v_W_ii / (1 - beta2 ** self.t)
            cell.W_ii -= lr * m_hat_ii / (np.sqrt(v_hat_ii) + eps)

            cell.m_W_hi = beta1 * cell.m_W_hi + (1 - beta1) * cell.dW_hi
            cell.v_W_hi = beta2 * cell.v_W_hi + (1 - beta2) * (cell.dW_hi ** 2)
            m_hat_hi = cell.m_W_hi / (1 - beta1 ** self.t)
            v_hat_hi = cell.v_W_hi / (1 - beta2 ** self.t)
            cell.W_hi -= lr * m_hat_hi / (np.sqrt(v_hat_hi) + eps)

            cell.m_b_i = beta1 * cell.m_b_i + (1 - beta1) * cell.db_i
            cell.v_b_i = beta2 * cell.v_b_i + (1 - beta2) * (cell.db_i ** 2)
            m_hat_bi = cell.m_b_i / (1 - beta1 ** self.t)
            v_hat_bi = cell.v_b_i / (1 - beta2 ** self.t)
            cell.b_i -= lr * m_hat_bi / (np.sqrt(v_hat_bi) + eps)

            # Update cell gate weights
            cell.m_W_ig = beta1 * cell.m_W_ig + (1 - beta1) * cell.dW_ig
            cell.v_W_ig = beta2 * cell.v_W_ig + (1 - beta2) * (cell.dW_ig ** 2)
            m_hat_ig = cell.m_W_ig / (1 - beta1 ** self.t)
            v_hat_ig = cell.v_W_ig / (1 - beta2 ** self.t)
            cell.W_ig -= lr * m_hat_ig / (np.sqrt(v_hat_ig) + eps)

            cell.m_W_hg = beta1 * cell.m_W_hg + (1 - beta1) * cell.dW_hg
            cell.v_W_hg = beta2 * cell.v_W_hg + (1 - beta2) * (cell.dW_hg ** 2)
            m_hat_hg = cell.m_W_hg / (1 - beta1 ** self.t)
            v_hat_hg = cell.v_W_hg / (1 - beta2 ** self.t)
            cell.W_hg -= lr * m_hat_hg / (np.sqrt(v_hat_hg) + eps)

            cell.m_b_g = beta1 * cell.m_b_g + (1 - beta1) * cell.db_g
            cell.v_b_g = beta2 * cell.v_b_g + (1 - beta2) * (cell.db_g ** 2)
            m_hat_bg = cell.m_b_g / (1 - beta1 ** self.t)
            v_hat_bg = cell.v_b_g / (1 - beta2 ** self.t)
            cell.b_g -= lr * m_hat_bg / (np.sqrt(v_hat_bg) + eps)

            # Update output gate weights
            cell.m_W_io = beta1 * cell.m_W_io + (1 - beta1) * cell.dW_io
            cell.v_W_io = beta2 * cell.v_W_io + (1 - beta2) * (cell.dW_io ** 2)
            m_hat_io = cell.m_W_io / (1 - beta1 ** self.t)
            v_hat_io = cell.v_W_io / (1 - beta2 ** self.t)
            cell.W_io -= lr * m_hat_io / (np.sqrt(v_hat_io) + eps)

            cell.m_W_ho = beta1 * cell.m_W_ho + (1 - beta1) * cell.dW_ho
            cell.v_W_ho = beta2 * cell.v_W_ho + (1 - beta2) * (cell.dW_ho ** 2)
            m_hat_ho = cell.m_W_ho / (1 - beta1 ** self.t)
            v_hat_ho = cell.v_W_ho / (1 - beta2 ** self.t)
            cell.W_ho -= lr * m_hat_ho / (np.sqrt(v_hat_ho) + eps)

            cell.m_b_o = beta1 * cell.m_b_o + (1 - beta1) * cell.db_o
            cell.v_b_o = beta2 * cell.v_b_o + (1 - beta2) * (cell.db_o ** 2)
            m_hat_bo = cell.m_b_o / (1 - beta1 ** self.t)
            v_hat_bo = cell.v_b_o / (1 - beta2 ** self.t)
            cell.b_o -= lr * m_hat_bo / (np.sqrt(v_hat_bo) + eps)

    def predict(self, input_ids, intent_ids):
        logits = self.forward(input_ids, intent_ids, training=False)
        return np.argmax(logits, axis=2)
# ============================================================================
# INTENT CLASSIFIER MODEL
# ============================================================================

class IntentClassifier:
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_intents):
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_intents = num_intents

        self.embeddings = np.random.randn(vocab_size, embed_dim) * 0.01
        self.encoder = BiRNN(embed_dim, hidden_dim)

        self.W_out = np.random.randn(2 * hidden_dim, num_intents) * 0.01
        self.b_out = np.zeros((1, num_intents))

        self.m_W_out = np.zeros_like(self.W_out)
        self.v_W_out = np.zeros_like(self.W_out)
        self.m_b_out = np.zeros_like(self.b_out)
        self.v_b_out = np.zeros_like(self.b_out)
        self.m_emb = np.zeros_like(self.embeddings)
        self.v_emb = np.zeros_like(self.embeddings)

        self.t = 0

    def forward(self, input_ids, training=True):
        self.input_ids = input_ids
        batch_size, seq_len = input_ids.shape

        self.embeds = self.embeddings[input_ids]

        if training:
            self.dropout_mask = (np.random.rand(*self.embeds.shape) > 0.3).astype(float) / 0.7
            embeds_dropped = self.embeds * self.dropout_mask
        else:
            embeds_dropped = self.embeds

        self.rnn_outputs = self.encoder.forward(embeds_dropped)
        self.final_hidden = self.rnn_outputs[:, -1, :]
        self.logits = self.final_hidden @ self.W_out + self.b_out

        return self.logits

    def backward(self, dloss):
        batch_size = dloss.shape[0]

        dW_out = self.final_hidden.T @ dloss
        db_out = np.sum(dloss, axis=0, keepdims=True)

        dfinal_hidden = dloss @ self.W_out.T

        drnn_outputs = np.zeros_like(self.rnn_outputs)
        drnn_outputs[:, -1, :] = dfinal_hidden

        self.encoder.backward(drnn_outputs)

        return dW_out, db_out

    def update(self, dW_out, db_out, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        self.t += 1

        self.m_W_out = beta1 * self.m_W_out + (1 - beta1) * dW_out
        self.v_W_out = beta2 * self.v_W_out + (1 - beta2) * (dW_out ** 2)
        m_hat = self.m_W_out / (1 - beta1 ** self.t)
        v_hat = self.v_W_out / (1 - beta2 ** self.t)
        self.W_out -= lr * m_hat / (np.sqrt(v_hat) + eps)

        self.m_b_out = beta1 * self.m_b_out + (1 - beta1) * db_out
        self.v_b_out = beta2 * self.v_b_out + (1 - beta2) * (db_out ** 2)
        m_hat = self.m_b_out / (1 - beta1 ** self.t)
        v_hat = self.v_b_out / (1 - beta2 ** self.t)
        self.b_out -= lr * m_hat / (np.sqrt(v_hat) + eps)

        for cell in self.encoder.forward_cells + self.encoder.backward_cells:
            m_W_ih = beta1 * cell.m_W_ih + (1 - beta1) * cell.dW_ih
            v_W_ih = beta2 * cell.v_W_ih + (1 - beta2) * (cell.dW_ih ** 2)
            m_hat = m_W_ih / (1 - beta1 ** self.t)
            v_hat = v_W_ih / (1 - beta2 ** self.t)
            cell.W_ih -= lr * m_hat / (np.sqrt(v_hat) + eps)
            cell.m_W_ih = m_W_ih
            cell.v_W_ih = v_W_ih

            m_W_hh = beta1 * cell.m_W_hh + (1 - beta1) * cell.dW_hh
            v_W_hh = beta2 * cell.v_W_hh + (1 - beta2) * (cell.dW_hh ** 2)
            m_hat = m_W_hh / (1 - beta1 ** self.t)
            v_hat = v_W_hh / (1 - beta2 ** self.t)
            cell.W_hh -= lr * m_hat / (np.sqrt(v_hat) + eps)
            cell.m_W_hh = m_W_hh
            cell.v_W_hh = v_W_hh

            m_b_h = beta1 * cell.m_b_h + (1 - beta1) * cell.db_h
            v_b_h = beta2 * cell.v_b_h + (1 - beta2) * (cell.db_h ** 2)
            m_hat = m_b_h / (1 - beta1 ** self.t)
            v_hat = v_b_h / (1 - beta2 ** self.t)
            cell.b_h -= lr * m_hat / (np.sqrt(v_hat) + eps)
            cell.m_b_h = m_b_h
            cell.v_b_h = v_b_h

    def predict(self, input_ids):
        logits = self.forward(input_ids, training=False)
        return np.argmax(logits, axis=1)

# ============================================================================
# SLOT FILLING MODEL WITH INTENT
# ============================================================================

class SlotFillingWithIntent:
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_slots, num_intents):
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_slots = num_slots
        self.num_intents = num_intents

        self.word_embeddings = np.random.randn(vocab_size, embed_dim) * 0.01
        self.intent_embeddings = np.random.randn(num_intents, 32) * 0.01

        self.encoder = BiRNN(embed_dim + 32, hidden_dim)

        self.W_out = np.random.randn(2 * hidden_dim, num_slots) * 0.01
        self.b_out = np.zeros((1, num_slots))

        self.m_W_out = np.zeros_like(self.W_out)
        self.v_W_out = np.zeros_like(self.W_out)
        self.m_b_out = np.zeros_like(self.b_out)
        self.v_b_out = np.zeros_like(self.b_out)

        self.t = 0

    def forward(self, input_ids, intent_ids, training=True):
        self.input_ids = input_ids
        self.intent_ids = intent_ids
        batch_size, seq_len = input_ids.shape

        self.word_embeds = self.word_embeddings[input_ids]

        intent_embeds = self.intent_embeddings[intent_ids]
        self.intent_embeds = np.repeat(intent_embeds[:, np.newaxis, :], seq_len, axis=1)

        self.combined = np.concatenate([self.word_embeds, self.intent_embeds], axis=2)

        if training:
            self.dropout_mask = (np.random.rand(*self.combined.shape) > 0.3).astype(float) / 0.7
            combined_dropped = self.combined * self.dropout_mask
        else:
            combined_dropped = self.combined

        self.rnn_outputs = self.encoder.forward(combined_dropped)

        batch_size, seq_len, hidden_size = self.rnn_outputs.shape
        rnn_flat = self.rnn_outputs.reshape(-1, hidden_size)
        logits_flat = rnn_flat @ self.W_out + self.b_out
        self.logits = logits_flat.reshape(batch_size, seq_len, self.num_slots)

        return self.logits

    def backward(self, dloss):
        batch_size, seq_len, _ = dloss.shape
        hidden_size = 2 * self.hidden_dim

        dloss_flat = dloss.reshape(-1, self.num_slots)
        rnn_flat = self.rnn_outputs.reshape(-1, hidden_size)

        dW_out = rnn_flat.T @ dloss_flat
        db_out = np.sum(dloss_flat, axis=0, keepdims=True)

        drnn_flat = dloss_flat @ self.W_out.T
        drnn_outputs = drnn_flat.reshape(batch_size, seq_len, hidden_size)

        self.encoder.backward(drnn_outputs)

        return dW_out, db_out

    def update(self, dW_out, db_out, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        self.t += 1

        self.m_W_out = beta1 * self.m_W_out + (1 - beta1) * dW_out
        self.v_W_out = beta2 * self.v_W_out + (1 - beta2) * (dW_out ** 2)
        m_hat = self.m_W_out / (1 - beta1 ** self.t)
        v_hat = self.v_W_out / (1 - beta2 ** self.t)
        self.W_out -= lr * m_hat / (np.sqrt(v_hat) + eps)

        self.m_b_out = beta1 * self.m_b_out + (1 - beta1) * db_out
        self.v_b_out = beta2 * self.v_b_out + (1 - beta2) * (db_out ** 2)
        m_hat = self.m_b_out / (1 - beta1 ** self.t)
        v_hat = self.v_b_out / (1 - beta2 ** self.t)
        self.b_out -= lr * m_hat / (np.sqrt(v_hat) + eps)

        for cell in self.encoder.forward_cells + self.encoder.backward_cells:
            m_W_ih = beta1 * cell.m_W_ih + (1 - beta1) * cell.dW_ih
            v_W_ih = beta2 * cell.v_W_ih + (1 - beta2) * (cell.dW_ih ** 2)
            m_hat = m_W_ih / (1 - beta1 ** self.t)
            v_hat = v_W_ih / (1 - beta2 ** self.t)
            cell.W_ih -= lr * m_hat / (np.sqrt(v_hat) + eps)
            cell.m_W_ih = m_W_ih
            cell.v_W_ih = v_W_ih

            m_W_hh = beta1 * cell.m_W_hh + (1 - beta1) * cell.dW_hh
            v_W_hh = beta2 * cell.v_W_hh + (1 - beta2) * (cell.dW_hh ** 2)
            m_hat = m_W_hh / (1 - beta1 ** self.t)
            v_hat = v_W_hh / (1 - beta2 ** self.t)
            cell.W_hh -= lr * m_hat / (np.sqrt(v_hat) + eps)
            cell.m_W_hh = m_W_hh
            cell.v_W_hh = v_W_hh

            m_b_h = beta1 * cell.m_b_h + (1 - beta1) * cell.db_h
            v_b_h = beta2 * cell.v_b_h + (1 - beta2) * (cell.db_h ** 2)
            m_hat = m_b_h / (1 - beta1 ** self.t)
            v_hat = v_b_h / (1 - beta2 ** self.t)
            cell.b_h -= lr * m_hat / (np.sqrt(v_hat) + eps)
            cell.m_b_h = m_b_h
            cell.v_b_h = v_b_h

    def predict(self, input_ids, intent_ids):
        logits = self.forward(input_ids, intent_ids, training=False)
        return np.argmax(logits, axis=2)

# ============================================================================
# TRAINING FUNCTIONS
# ============================================================================

def train_intent_classifier(model, train_data, val_data, epochs=15, batch_size=32, lr=0.001):
    print("Training Intent Classifier...")
    
    if len(train_data) == 0:
        print("ERROR: No training data available!")
        return model

    best_acc = 0

    for epoch in range(epochs):
        np.random.shuffle(train_data)

        total_loss = 0
        num_batches = max(1, len(train_data) // batch_size)
        correct = 0
        total = 0

        # Create progress bar
        pbar = tqdm(range(num_batches), desc=f"Epoch {epoch+1}/{epochs}", ncols=100)
        
        for i in pbar:
            batch = train_data[i*batch_size:(i+1)*batch_size]
            
            if len(batch) == 0:
                continue

            max_len = max(len(item['input_ids']) for item in batch)
            input_ids = np.zeros((len(batch), max_len), dtype=int)
            intents = np.zeros(len(batch), dtype=int)

            for j, item in enumerate(batch):
                length = len(item['input_ids'])
                input_ids[j, :length] = item['input_ids']
                intents[j] = item['intent']

            logits = model.forward(input_ids, training=True)

            probs = softmax(logits)
            loss = -np.mean(np.log(probs[range(len(batch)), intents] + 1e-10))
            total_loss += loss

            # Calculate training accuracy
            preds = np.argmax(logits, axis=1)
            correct += np.sum(preds == intents)
            total += len(batch)

            dloss = probs.copy()
            dloss[range(len(batch)), intents] -= 1
            dloss /= len(batch)

            dW_out, db_out = model.backward(dloss)
            model.update(dW_out, db_out, lr=lr)
            
            # Update progress bar with current loss and accuracy
            train_acc = correct / total if total > 0 else 0
            pbar.set_postfix({'loss': f'{loss:.4f}', 'acc': f'{train_acc:.4f}'})

        train_acc = correct / total if total > 0 else 0
        val_acc = evaluate_intent(model, val_data, batch_size) if len(val_data) > 0 else 0
        print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/num_batches:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}")

        if val_acc > best_acc:
            best_acc = val_acc
            print(f"  ✓ New best validation accuracy: {val_acc:.4f}")
    return model

def train_slot_filler(slot_model, intent_model, train_data, val_data, epochs=15, batch_size=32, lr=0.001):
    print("Training Slot Filler...")
    
    if len(train_data) == 0:
        print("ERROR: No training data available!")
        return slot_model

    best_f1 = 0
    

    for epoch in range(epochs):
        np.random.shuffle(train_data)

        total_loss = 0
        num_batches = max(1, len(train_data) // batch_size)

        # Create progress bar
        pbar = tqdm(range(num_batches), desc=f"Epoch {epoch+1}/{epochs}", ncols=100)
        
        for i in pbar:
            batch = train_data[i*batch_size:(i+1)*batch_size]
            
            if len(batch) == 0:
                continue

            max_len = max(len(item['input_ids']) for item in batch)
            input_ids = np.zeros((len(batch), max_len), dtype=int)
            slots = np.full((len(batch), max_len), -100, dtype=int)

            for j, item in enumerate(batch):
                length = len(item['input_ids'])
                input_ids[j, :length] = item['input_ids']
                slots[j, :length] = item['slots']

            predicted_intents = intent_model.predict(input_ids)

            logits = slot_model.forward(input_ids, predicted_intents, training=True)

            probs = softmax(logits)
            loss = 0
            count = 0

            for b in range(len(batch)):
                for t in range(max_len):
                    if slots[b, t] != -100:
                        loss += -np.log(probs[b, t, slots[b, t]] + 1e-10)
                        count += 1

            if count > 0:
                loss /= count
                total_loss += loss

                dloss = probs.copy()
                for b in range(len(batch)):
                    for t in range(max_len):
                        if slots[b, t] != -100:
                            dloss[b, t, slots[b, t]] -= 1
                dloss /= count

                dW_out, db_out = slot_model.backward(dloss)
                slot_model.update(dW_out, db_out, lr=lr)
                
                # Update progress bar with current loss
                pbar.set_postfix({'loss': f'{loss:.4f}'})

        metrics = evaluate_slot(slot_model, intent_model, val_data, batch_size) if len(val_data) > 0 else {'f1': 0, 'accuracy': 0}
        print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/num_batches:.4f}, Val F1: {metrics['f1']:.4f}, Val Acc: {metrics['accuracy']:.4f}")

        if metrics['f1'] > best_f1:
            best_f1 = metrics['f1']
            print(f"  ✓ New best validation F1: {metrics['f1']:.4f}")
    return slot_model

# ============================================================================
# EVALUATION FUNCTIONS
# ============================================================================

def evaluate_intent(model, data, batch_size=32):
    """Evaluate intent classifier"""
    if len(data) == 0:
        return 0.0
        
    all_preds = []
    all_labels = []

    num_batches = max(1, (len(data) + batch_size - 1) // batch_size)

    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(data))
        batch = data[start_idx:end_idx]
        
        if len(batch) == 0:
            continue

        max_len = max(len(item['input_ids']) for item in batch)
        input_ids = np.zeros((len(batch), max_len), dtype=int)
        intents = np.zeros(len(batch), dtype=int)

        for j, item in enumerate(batch):
            length = len(item['input_ids'])
            input_ids[j, :length] = item['input_ids']
            intents[j] = item['intent']

        intent_preds = model.predict(input_ids)

        all_preds.extend(intent_preds)
        all_labels.extend(intents)

    if len(all_preds) == 0:
        print("WARNING: No predictions generated!")
        return 0.0
    
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)
    
    print(f"  Total predictions: {len(all_preds)}, Total labels: {len(all_labels)}")
    print(f"  Pred range: [{all_preds.min()}, {all_preds.max()}], Label range: [{all_labels.min()}, {all_labels.max()}]")
    print(f"  Sample predictions: {all_preds[:5]}")
    print(f"  Sample labels: {all_labels[:5]}")
        
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"  Calculated accuracy: {accuracy:.4f}")
    return accuracy


def evaluate_slot(slot_model, intent_model, data, batch_size=32):
    """Evaluate slot filling model"""
    if len(data) == 0:
        return {'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
        
    all_preds = []
    all_labels = []

    num_batches = max(1, (len(data) + batch_size - 1) // batch_size)

    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(data))
        batch = data[start_idx:end_idx]
        
        if len(batch) == 0:
            continue

        max_len = max(len(item['input_ids']) for item in batch)
        input_ids = np.zeros((len(batch), max_len), dtype=int)
        slots = np.full((len(batch), max_len), -100, dtype=int)
        lengths = []

        for j, item in enumerate(batch):
            length = len(item['input_ids'])
            input_ids[j, :length] = item['input_ids']
            slots[j, :length] = item['slots']
            lengths.append(length)

        predicted_intents = intent_model.predict(input_ids)
        
        # Ensure predicted_intents is 1D
        if predicted_intents.ndim > 1:
            predicted_intents = predicted_intents.flatten()
        
        slot_preds = slot_model.predict(input_ids, predicted_intents)

        # Extract only valid slot predictions (not -100 padding)
        for j in range(len(batch)):
            length = lengths[j]
            preds_slice = slot_preds[j, :length]
            labels_slice = slots[j, :length]
            
            # Filter out -100 padding
            valid_mask = labels_slice != -100
            if np.any(valid_mask):
                all_preds.extend(preds_slice[valid_mask])
                all_labels.extend(labels_slice[valid_mask])

    if len(all_preds) == 0:
        print("WARNING: No slot predictions generated!")
        return {'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)

    print(f"  Total slot predictions: {len(all_preds)}, Total slot labels: {len(all_labels)}")
    print(f"  Pred range: [{all_preds.min()}, {all_preds.max()}], Label range: [{all_labels.min()}, {all_labels.max()}]")
    print(f"  Unique predicted slots: {len(set(all_preds))}, Unique label slots: {len(set(all_labels))}")
    print(f"  Sample predictions: {all_preds[:10]}")
    print(f"  Sample labels: {all_labels[:10]}")

    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='weighted', zero_division=0
    )
    accuracy = accuracy_score(all_labels, all_preds)

    print(f"  Calculated accuracy: {accuracy:.4f}")
    print(f"  Calculated F1: {f1:.4f}")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


def main(train_file=None, val_file=None, test_file=None, dataset_type='slurp', model_type='rnn'):
    """Main function to run the entire pipeline"""
    import sys
    import os

    # Check if running in notebook or command line
    if train_file is None:
        if len(sys.argv) < 4:
            print("⚠️  No file paths provided. Please call main() with file paths:")
            print("Example for SLURP: main('train.jsonl', 'val.jsonl', 'test.jsonl', 'slurp')")
            print("Example for ATIS: main('train.csv', 'test.csv', dataset_type='atis')")
            return None
        
        train_file = sys.argv[1]
        test_file = sys.argv[2]
        val_file = sys.argv[3] if len(sys.argv) > 3 else None
        dataset_type = sys.argv[4] if len(sys.argv) > 4 else 'slurp'
        model_type = sys.argv[5] if len(sys.argv) > 5 else 'rnn'
    
    print(f"Dataset type: {dataset_type}")
    print("="*80)
    
    # Load datasets based on type
    if dataset_type == 'atis':
        # For ATIS: train_file and val_file are actually train.csv and test.csv
        print("Loading ATIS CSV datasets...")
        train_data = load_atis_csv(train_file)
        test_data = load_atis_csv(val_file) if val_file else []
        
        if len(train_data) == 0:
            print("\nERROR: No training data loaded!")
            return None
        
        # Auto split train into train/val
        print("\nSplitting training data into train/val (80/20)...")
        train_data, val_data = create_train_val_split(train_data, val_split=0.2)
        print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")
        
    else:  # slurp or jsonl format
        # Validate files exist
        for file_path in [train_file, val_file, test_file]:
            if not os.path.exists(file_path):
                print(f"Error: File not found - {file_path}")
                return None
        
        # Load datasets
        train_data = load_json_dataset(train_file, dataset_type)
        val_data = load_json_dataset(val_file, dataset_type)
        test_data = load_json_dataset(test_file, dataset_type)
    
    # Check if we have any data
    if len(train_data) == 0:
        print("\nCRITICAL ERROR: No training data loaded!")
        return None
    
    # Build vocabulary and mappings
    vocab, intent2idx, slot2idx = build_vocab_and_mappings(train_data, val_data)
    
    if len(intent2idx) == 0:
        print("\nERROR: No intents found in data!")
        return None
    
    # Convert to IDs
    train_processed = convert_to_ids(train_data, vocab, intent2idx, slot2idx)
    val_processed = convert_to_ids(val_data, vocab, intent2idx, slot2idx)
    test_processed = convert_to_ids(test_data, vocab, intent2idx, slot2idx)
    
    # Model hyperparameters
    EMBED_DIM = 256
    HIDDEN_DIM = 256
    BATCH_SIZE = 32
    EPOCHS = 25
    LR = 0.0005
    
    print("\n" + "="*80)
    print(f"PHASE 1: Training Intent Classifier ({model_type.upper()})")
    print("="*80)
    
    # Initialize intent classifier based on model type
    if model_type == 'lstm':
        intent_model = IntentClassifierLSTM(
            vocab_size=len(vocab),
            embed_dim=EMBED_DIM,
            hidden_dim=HIDDEN_DIM,
            num_intents=len(intent2idx)
        )
    else:  # 'rnn'
        intent_model = IntentClassifier(
            vocab_size=len(vocab),
            embed_dim=EMBED_DIM,
            hidden_dim=HIDDEN_DIM,
            num_intents=len(intent2idx)
        )
    
    intent_model = train_intent_classifier(
        intent_model, 
        train_processed, 
        val_processed,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        lr=LR
    )
    
    # Evaluate intent classifier on test set
    print("\nEvaluating Intent Classifier on Test Set...")
    test_intent_acc = evaluate_intent(intent_model, test_processed, batch_size=BATCH_SIZE)
    print(f"Test Intent Accuracy: {test_intent_acc:.4f}")
    
    print("\n" + "="*80)
    print(f"PHASE 2: Training Slot Filler ({model_type.upper()})")
    print("="*80)
    
    # Initialize slot filler based on model type
    if model_type == 'lstm':
        slot_model = SlotFillingWithIntentLSTM(
            vocab_size=len(vocab),
            embed_dim=EMBED_DIM,
            hidden_dim=HIDDEN_DIM,
            num_slots=len(slot2idx),
            num_intents=len(intent2idx)
        )
    else:  # 'rnn'
        slot_model = SlotFillingWithIntent(
            vocab_size=len(vocab),
            embed_dim=EMBED_DIM,
            hidden_dim=HIDDEN_DIM,
            num_slots=len(slot2idx),
            num_intents=len(intent2idx)
        )
    
    slot_model = train_slot_filler(
        slot_model,
        intent_model,
        train_processed,
        val_processed,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        lr=LR
    )
    
    # Evaluate slot filler on test set
    print("\nEvaluating Slot Filler on Test Set...")
    test_metrics = evaluate_slot(slot_model, intent_model, test_processed, batch_size=BATCH_SIZE)
    print(f"Test Slot Accuracy: {test_metrics['accuracy']:.4f}")
    print(f"Test Slot Precision: {test_metrics['precision']:.4f}")
    print(f"Test Slot Recall: {test_metrics['recall']:.4f}")
    print(f"Test Slot F1: {test_metrics['f1']:.4f}")
    
    print("\n" + "="*80)
    print("FINAL RESULTS")
    print("="*80)
    print(f"Intent Classification - Test Accuracy: {test_intent_acc:.4f}")
    print(f"Slot Filling - Test F1: {test_metrics['f1']:.4f}")
    print(f"Slot Filling - Test Accuracy: {test_metrics['accuracy']:.4f}")
    print("="*80)
    
    return {
        'intent_model': intent_model,
        'slot_model': slot_model,
        'vocab': vocab,
        'intent2idx': intent2idx,
        'slot2idx': slot2idx,
        'test_intent_acc': test_intent_acc,
        'test_slot_metrics': test_metrics
    }

if __name__ == "__main__":
    try:
        get_ipython()
        IN_NOTEBOOK = True
    except NameError:
        IN_NOTEBOOK = False
    
    if IN_NOTEBOOK:
        print("Running in Jupyter Notebook mode")
        
        # For SLURP:
        # results = main(
        #     train_file='../cleaned-datasets/slurp/train.jsonl',
        #     val_file='../cleaned-datasets/slurp/devel.jsonl',
        #     test_file='../cleaned-datasets/slurp/test.jsonl',
        #     dataset_type='slurp',
        #     model_type='lstm'
        # )
        
        # For ATIS (uncomment below):
        results = main(
            train_file='../cleaned-datasets/atis/train.csv',
            val_file='../cleaned-datasets/atis/test.csv',
            test_file='None',
            dataset_type='atis',
            model_type='rnn'
        )
    else:
        print("Running in command-line mode")
        main()

Running in Jupyter Notebook mode
Dataset type: atis
Loading ATIS CSV datasets...
Loading ATIS CSV from ../cleaned-datasets/atis/train.csv...
CSV columns found: ['intent', 'text', 'slots']
Using 'text' column for tokens
Loaded 4501 samples from CSV

Loading ATIS CSV from ../cleaned-datasets/atis/test.csv...
CSV columns found: ['intent', 'text', 'slots']
Using 'text' column for tokens
Loaded 754 samples from CSV


Splitting training data into train/val (80/20)...
Train: 3600, Val: 901, Test: 754

Building vocabulary and mappings...
Vocabulary size: 799
Number of intents: 4
Number of slots: 117

PHASE 1: Training Intent Classifier (RNN)
Training Intent Classifier...


Epoch 1/25: 100%|████████████████████████| 112/112 [00:08<00:00, 13.53it/s, loss=1.1029, acc=0.6850]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 3], Label range: [0, 3]
  Sample predictions: [1 1 1 1 1]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.4206
Epoch 1/25 - Loss: 1.0178, Train Acc: 0.6850, Val Acc: 0.4206
  ✓ New best validation accuracy: 0.4206


Epoch 2/25: 100%|████████████████████████| 112/112 [00:08<00:00, 13.98it/s, loss=0.5376, acc=0.7815]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 2], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.7325
Epoch 2/25 - Loss: 0.8208, Train Acc: 0.7815, Val Acc: 0.7325
  ✓ New best validation accuracy: 0.7325


Epoch 3/25: 100%|████████████████████████| 112/112 [00:08<00:00, 13.86it/s, loss=0.6453, acc=0.8181]


  Total predictions: 901, Total labels: 901
  Pred range: [1, 3], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.6681
Epoch 3/25 - Loss: 0.6759, Train Acc: 0.8181, Val Acc: 0.6681


Epoch 4/25: 100%|████████████████████████| 112/112 [00:07<00:00, 14.39it/s, loss=0.8173, acc=0.8119]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 2], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.7403
Epoch 4/25 - Loss: 0.7017, Train Acc: 0.8119, Val Acc: 0.7403
  ✓ New best validation accuracy: 0.7403


Epoch 5/25: 100%|████████████████████████| 112/112 [00:08<00:00, 13.58it/s, loss=0.4927, acc=0.8186]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 3], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.7703
Epoch 5/25 - Loss: 0.6643, Train Acc: 0.8186, Val Acc: 0.7703
  ✓ New best validation accuracy: 0.7703


Epoch 6/25: 100%|████████████████████████| 112/112 [00:08<00:00, 12.91it/s, loss=0.4635, acc=0.8178]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 3], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.6482
Epoch 6/25 - Loss: 0.6799, Train Acc: 0.8178, Val Acc: 0.6482


Epoch 7/25: 100%|████████████████████████| 112/112 [00:08<00:00, 13.23it/s, loss=0.7828, acc=0.8198]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 3], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.7458
Epoch 7/25 - Loss: 0.6539, Train Acc: 0.8198, Val Acc: 0.7458


Epoch 8/25: 100%|████████████████████████| 112/112 [00:07<00:00, 14.60it/s, loss=0.6771, acc=0.8158]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 3], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.7503
Epoch 8/25 - Loss: 0.6640, Train Acc: 0.8158, Val Acc: 0.7503


Epoch 9/25: 100%|████████████████████████| 112/112 [00:07<00:00, 14.49it/s, loss=0.5302, acc=0.8189]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 3], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.7026
Epoch 9/25 - Loss: 0.6682, Train Acc: 0.8189, Val Acc: 0.7026


Epoch 10/25: 100%|███████████████████████| 112/112 [00:07<00:00, 14.14it/s, loss=0.7276, acc=0.8186]


  Total predictions: 901, Total labels: 901
  Pred range: [1, 3], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.7469
Epoch 10/25 - Loss: 0.6611, Train Acc: 0.8186, Val Acc: 0.7469


Epoch 11/25: 100%|███████████████████████| 112/112 [00:08<00:00, 13.05it/s, loss=0.7158, acc=0.8186]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 3], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.7469
Epoch 11/25 - Loss: 0.6562, Train Acc: 0.8186, Val Acc: 0.7469


Epoch 12/25: 100%|███████████████████████| 112/112 [00:08<00:00, 12.74it/s, loss=0.8286, acc=0.8178]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 3], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.7514
Epoch 12/25 - Loss: 0.6526, Train Acc: 0.8178, Val Acc: 0.7514


Epoch 13/25: 100%|███████████████████████| 112/112 [00:09<00:00, 12.12it/s, loss=0.6496, acc=0.8133]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 3], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.6504
Epoch 13/25 - Loss: 0.6603, Train Acc: 0.8133, Val Acc: 0.6504


Epoch 14/25: 100%|███████████████████████| 112/112 [00:09<00:00, 12.22it/s, loss=0.7008, acc=0.8170]


  Total predictions: 901, Total labels: 901
  Pred range: [1, 2], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.7048
Epoch 14/25 - Loss: 0.6708, Train Acc: 0.8170, Val Acc: 0.7048


Epoch 15/25: 100%|███████████████████████| 112/112 [00:08<00:00, 13.25it/s, loss=0.4978, acc=0.8184]


  Total predictions: 901, Total labels: 901
  Pred range: [1, 2], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.6804
Epoch 15/25 - Loss: 0.6651, Train Acc: 0.8184, Val Acc: 0.6804


Epoch 16/25: 100%|███████████████████████| 112/112 [00:08<00:00, 12.81it/s, loss=0.5612, acc=0.8186]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 2], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.7081
Epoch 16/25 - Loss: 0.6694, Train Acc: 0.8186, Val Acc: 0.7081


Epoch 17/25: 100%|███████████████████████| 112/112 [00:09<00:00, 12.03it/s, loss=0.8140, acc=0.8186]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 3], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.6859
Epoch 17/25 - Loss: 0.6606, Train Acc: 0.8186, Val Acc: 0.6859


Epoch 18/25: 100%|███████████████████████| 112/112 [00:09<00:00, 12.35it/s, loss=0.4344, acc=0.8158]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 2], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.7070
Epoch 18/25 - Loss: 0.6623, Train Acc: 0.8158, Val Acc: 0.7070


Epoch 19/25: 100%|███████████████████████| 112/112 [00:08<00:00, 12.55it/s, loss=0.7428, acc=0.8181]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 2], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.7059
Epoch 19/25 - Loss: 0.6681, Train Acc: 0.8181, Val Acc: 0.7059


Epoch 20/25: 100%|███████████████████████| 112/112 [00:08<00:00, 12.86it/s, loss=0.6800, acc=0.8181]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 2], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.6903
Epoch 20/25 - Loss: 0.6663, Train Acc: 0.8181, Val Acc: 0.6903


Epoch 21/25: 100%|███████████████████████| 112/112 [00:09<00:00, 12.41it/s, loss=0.5845, acc=0.8189]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 3], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.7725
Epoch 21/25 - Loss: 0.6771, Train Acc: 0.8189, Val Acc: 0.7725
  ✓ New best validation accuracy: 0.7725


Epoch 22/25: 100%|███████████████████████| 112/112 [00:09<00:00, 12.33it/s, loss=0.7303, acc=0.8184]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 2], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.7048
Epoch 22/25 - Loss: 0.6814, Train Acc: 0.8184, Val Acc: 0.7048


Epoch 23/25: 100%|███████████████████████| 112/112 [00:09<00:00, 12.04it/s, loss=0.7183, acc=0.8181]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 2], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.7137
Epoch 23/25 - Loss: 0.6755, Train Acc: 0.8181, Val Acc: 0.7137


Epoch 24/25: 100%|███████████████████████| 112/112 [00:09<00:00, 12.35it/s, loss=0.8479, acc=0.8186]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 3], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.7181
Epoch 24/25 - Loss: 0.6757, Train Acc: 0.8186, Val Acc: 0.7181


Epoch 25/25: 100%|███████████████████████| 112/112 [00:08<00:00, 12.70it/s, loss=0.4926, acc=0.8189]


  Total predictions: 901, Total labels: 901
  Pred range: [0, 2], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 2 2 2 2]
  Calculated accuracy: 0.7403
Epoch 25/25 - Loss: 0.6581, Train Acc: 0.8189, Val Acc: 0.7403

Evaluating Intent Classifier on Test Set...
  Total predictions: 754, Total labels: 754
  Pred range: [1, 2], Label range: [0, 3]
  Sample predictions: [2 2 2 2 2]
  Sample labels: [2 0 2 2 2]
  Calculated accuracy: 0.8077
Test Intent Accuracy: 0.8077

PHASE 2: Training Slot Filler (RNN)
Training Slot Filler...


Epoch 1/25: 100%|████████████████████████████████████| 112/112 [00:12<00:00,  9.26it/s, loss=1.7912]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [11, 116], Label range: [0, 116]
  Unique predicted slots: 21, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.5083
  Calculated F1: 0.4508
Epoch 1/25 - Loss: 2.0837, Val F1: 0.4508, Val Acc: 0.5083
  ✓ New best validation F1: 0.4508


Epoch 2/25: 100%|████████████████████████████████████| 112/112 [00:11<00:00,  9.57it/s, loss=1.7820]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [44, 116], Label range: [0, 116]
  Unique predicted slots: 3, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6379
  Calculated F1: 0.4974
Epoch 2/25 - Loss: 1.8768, Val F1: 0.4974, Val Acc: 0.6379
  ✓ New best validation F1: 0.4974


Epoch 3/25: 100%|████████████████████████████████████| 112/112 [00:12<00:00,  9.32it/s, loss=1.6666]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [0, 116], Label range: [0, 116]
  Unique predicted slots: 19, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6347
  Calculated F1: 0.4964
Epoch 3/25 - Loss: 1.6773, Val F1: 0.4964, Val Acc: 0.6347


Epoch 4/25: 100%|████████████████████████████████████| 112/112 [00:12<00:00,  8.63it/s, loss=1.7384]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [5, 116], Label range: [0, 116]
  Unique predicted slots: 20, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6136
  Calculated F1: 0.4904
Epoch 4/25 - Loss: 1.6768, Val F1: 0.4904, Val Acc: 0.6136


Epoch 5/25: 100%|████████████████████████████████████| 112/112 [00:11<00:00,  9.61it/s, loss=1.7381]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [5, 116], Label range: [0, 116]
  Unique predicted slots: 22, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.4966
  Calculated F1: 0.4461
Epoch 5/25 - Loss: 1.7332, Val F1: 0.4461, Val Acc: 0.4966


Epoch 6/25: 100%|████████████████████████████████████| 112/112 [00:12<00:00,  8.65it/s, loss=1.7782]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [7, 116], Label range: [0, 116]
  Unique predicted slots: 22, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.5823
  Calculated F1: 0.4796
Epoch 6/25 - Loss: 1.7231, Val F1: 0.4796, Val Acc: 0.5823


Epoch 7/25: 100%|████████████████████████████████████| 112/112 [00:14<00:00,  7.94it/s, loss=1.6224]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [8, 116], Label range: [0, 116]
  Unique predicted slots: 19, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6335
  Calculated F1: 0.4961
Epoch 7/25 - Loss: 1.7723, Val F1: 0.4961, Val Acc: 0.6335


Epoch 8/25: 100%|████████████████████████████████████| 112/112 [00:13<00:00,  8.42it/s, loss=1.6405]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [23, 116], Label range: [0, 116]
  Unique predicted slots: 6, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6355
  Calculated F1: 0.4970
Epoch 8/25 - Loss: 1.6711, Val F1: 0.4970, Val Acc: 0.6355


Epoch 9/25: 100%|████████████████████████████████████| 112/112 [00:12<00:00,  8.89it/s, loss=1.6249]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [11, 116], Label range: [0, 116]
  Unique predicted slots: 20, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6316
  Calculated F1: 0.4955
Epoch 9/25 - Loss: 1.6665, Val F1: 0.4955, Val Acc: 0.6316


Epoch 10/25: 100%|███████████████████████████████████| 112/112 [00:16<00:00,  6.98it/s, loss=1.8063]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [11, 116], Label range: [0, 116]
  Unique predicted slots: 20, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.4785
  Calculated F1: 0.4409
Epoch 10/25 - Loss: 1.6731, Val F1: 0.4409, Val Acc: 0.4785


Epoch 11/25: 100%|███████████████████████████████████| 112/112 [00:15<00:00,  7.21it/s, loss=1.6787]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [23, 116], Label range: [0, 116]
  Unique predicted slots: 7, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6354
  Calculated F1: 0.4973
Epoch 11/25 - Loss: 1.7824, Val F1: 0.4973, Val Acc: 0.6354


Epoch 12/25: 100%|███████████████████████████████████| 112/112 [00:20<00:00,  5.56it/s, loss=1.8474]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [23, 116], Label range: [0, 116]
  Unique predicted slots: 6, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6352
  Calculated F1: 0.4971
Epoch 12/25 - Loss: 1.6668, Val F1: 0.4971, Val Acc: 0.6352


Epoch 13/25: 100%|███████████████████████████████████| 112/112 [00:26<00:00,  4.26it/s, loss=1.5331]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [23, 116], Label range: [0, 116]
  Unique predicted slots: 6, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6353
  Calculated F1: 0.4973
Epoch 13/25 - Loss: 1.6681, Val F1: 0.4973, Val Acc: 0.6353


Epoch 14/25: 100%|███████████████████████████████████| 112/112 [00:19<00:00,  5.66it/s, loss=1.5571]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [23, 116], Label range: [0, 116]
  Unique predicted slots: 6, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6349
  Calculated F1: 0.4971
Epoch 14/25 - Loss: 1.6643, Val F1: 0.4971, Val Acc: 0.6349


Epoch 15/25: 100%|███████████████████████████████████| 112/112 [00:16<00:00,  6.69it/s, loss=1.7053]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [23, 116], Label range: [0, 116]
  Unique predicted slots: 6, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6347
  Calculated F1: 0.4970
Epoch 15/25 - Loss: 1.6661, Val F1: 0.4970, Val Acc: 0.6347


Epoch 16/25: 100%|███████████████████████████████████| 112/112 [00:17<00:00,  6.44it/s, loss=1.6979]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [11, 116], Label range: [0, 116]
  Unique predicted slots: 7, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6346
  Calculated F1: 0.4970
Epoch 16/25 - Loss: 1.6651, Val F1: 0.4970, Val Acc: 0.6346


Epoch 17/25: 100%|███████████████████████████████████| 112/112 [00:22<00:00,  5.09it/s, loss=1.7629]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [11, 116], Label range: [0, 116]
  Unique predicted slots: 7, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6346
  Calculated F1: 0.4970
Epoch 17/25 - Loss: 1.6603, Val F1: 0.4970, Val Acc: 0.6346


Epoch 18/25: 100%|███████████████████████████████████| 112/112 [00:19<00:00,  5.81it/s, loss=1.7603]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [11, 116], Label range: [0, 116]
  Unique predicted slots: 6, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6340
  Calculated F1: 0.4968
Epoch 18/25 - Loss: 1.6595, Val F1: 0.4968, Val Acc: 0.6340


Epoch 19/25: 100%|███████████████████████████████████| 112/112 [00:20<00:00,  5.46it/s, loss=1.6227]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [11, 116], Label range: [0, 116]
  Unique predicted slots: 21, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6217
  Calculated F1: 0.4920
Epoch 19/25 - Loss: 1.6827, Val F1: 0.4920, Val Acc: 0.6217


Epoch 20/25: 100%|███████████████████████████████████| 112/112 [00:16<00:00,  6.90it/s, loss=1.7558]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [5, 116], Label range: [0, 116]
  Unique predicted slots: 20, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6325
  Calculated F1: 0.4961
Epoch 20/25 - Loss: 1.7668, Val F1: 0.4961, Val Acc: 0.6325


Epoch 21/25: 100%|███████████████████████████████████| 112/112 [00:17<00:00,  6.27it/s, loss=1.6808]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [3, 116], Label range: [0, 116]
  Unique predicted slots: 20, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6331
  Calculated F1: 0.4962
Epoch 21/25 - Loss: 1.6588, Val F1: 0.4962, Val Acc: 0.6331


Epoch 22/25: 100%|███████████████████████████████████| 112/112 [00:19<00:00,  5.69it/s, loss=1.7315]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [3, 116], Label range: [0, 116]
  Unique predicted slots: 19, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6338
  Calculated F1: 0.4980
Epoch 22/25 - Loss: 1.6581, Val F1: 0.4980, Val Acc: 0.6338
  ✓ New best validation F1: 0.4980


Epoch 23/25: 100%|███████████████████████████████████| 112/112 [00:20<00:00,  5.49it/s, loss=1.7320]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [30, 116], Label range: [0, 116]
  Unique predicted slots: 6, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6349
  Calculated F1: 0.4985
Epoch 23/25 - Loss: 1.6570, Val F1: 0.4985, Val Acc: 0.6349
  ✓ New best validation F1: 0.4985


Epoch 24/25: 100%|███████████████████████████████████| 112/112 [00:17<00:00,  6.27it/s, loss=1.6107]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [30, 116], Label range: [0, 116]
  Unique predicted slots: 6, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6347
  Calculated F1: 0.4982
Epoch 24/25 - Loss: 1.6545, Val F1: 0.4982, Val Acc: 0.6347


Epoch 25/25: 100%|███████████████████████████████████| 112/112 [00:28<00:00,  3.90it/s, loss=1.7280]


  Total slot predictions: 10659, Total slot labels: 10659
  Pred range: [30, 116], Label range: [0, 116]
  Unique predicted slots: 6, Unique label slots: 90
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116  44 116  71 116  23]
  Calculated accuracy: 0.6345
  Calculated F1: 0.4981
Epoch 25/25 - Loss: 1.6513, Val F1: 0.4981, Val Acc: 0.6345

Evaluating Slot Filler on Test Set...
  Total slot predictions: 8013, Total slot labels: 8013
  Pred range: [11, 116], Label range: [0, 116]
  Unique predicted slots: 11, Unique label slots: 86
  Sample predictions: [116 116 116 116 116 116 116 116 116 116]
  Sample labels: [116 116 116 116 116 116 116 116  44 116]
  Calculated accuracy: 0.5741
  Calculated F1: 0.4356
Test Slot Accuracy: 0.5741
Test Slot Precision: 0.4231
Test Slot Recall: 0.5741
Test Slot F1: 0.4356

FINAL RESULTS
Intent Classification - Test Accuracy: 0.8077
Slot Filling - Test F1: 0.4356
Slot Filling - Test Accuracy: 0.5741
