In [1]:
import os, sys
import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, top_k_accuracy_score
from torch.utils.data import DataLoader, TensorDataset
import preprocessor as p
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import torch

# Make the parent of the current working directory importable so that the
# `src.*` imports that follow can be resolved (this assumes the notebook is
# run from a direct subdirectory of the project root).
parent_dir = os.path.join(os.getcwd(), "..")
project_root = os.path.abspath(parent_dir)
already_importable = project_root in sys.path
if not already_importable:
    sys.path.append(project_root)

from src.dataset import load_dataset
from src.model.BiLSTMClassifier import EnhancedBiLSTM
from src.embedding_matrix import *

In [None]:
# Report GPU availability, fix the PyTorch random seed and pick the device
# that every model in this notebook is moved to.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# are repeatable on a fresh "Restart & Run All". The value matches the
# random_state used for the train/test split.
# NOTE(review): some GPU kernels may still be non-deterministic; see the
# PyTorch reproducibility notes if bit-exact runs are required.
SEED = 42
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# BiLSTM Model
- On GloVe Twitter Embeddings(100d)
- On GloVe Twitter Embeddings(200d)
- On FastText Crawl Subwords Embeddings(300d)

In [3]:
# load_dataset() returns four objects; only the first one is used in this notebook.
train, _, _, _ = load_dataset()

# Texts as a NumPy unicode array, and the labels from the "Label" column.
X = train["TEXT"].values.astype("U")
y = train["Label"].values

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build the vocabulary, then build one embedding matrix per pretrained source
# (GloVe Twitter 100d, GloVe Twitter 200d, FastText).
# NOTE(review): the return values of the create_embedding_matrix_* helpers are
# discarded, so they presumably write their result to disk themselves (later
# cells read .npy files from ../data/embeddings/) — confirm in src/embedding_matrix.
# NOTE(review): all three pretrained embedding objects stay referenced after
# this cell, which may hold a lot of memory for the rest of the session.
vocab = create_vocab()

# GloVe Twitter vectors identified by "glove-twitter-100".
glove_embeddings = load_pretrained_embeddings("glove-twitter-100")
create_embedding_matrix_glove(vocab, glove_embeddings)

# GloVe Twitter vectors identified by "glove-twitter-200"; same helper as above.
glove_embeddings_200d = load_pretrained_embeddings("glove-twitter-200")
create_embedding_matrix_glove(vocab, glove_embeddings_200d)

# FastText vectors use their own loader and matrix builder.
fasttext_embeddings = load_fasttext()
create_embedding_matrix_fasttext(vocab, fasttext_embeddings)

In [6]:
def top_k_accuracy(predict, y_true=None, num_classes=20, ks=(1, 3, 5)):
    """Compute top-k accuracy for several values of k from per-sample score dicts.

    Args:
        predict: Iterable of ``(anything, scores)`` pairs, one per evaluated
            sample and in the same order as ``y_true``. ``scores`` maps an
            integer class label to its score; values may be Python numbers or
            0-d tensors (each is converted with ``float``).
        y_true: True labels aligned with ``predict``. Defaults to the
            notebook-level ``y_test``.
        num_classes: Number of columns of the dense score matrix. Labels are
            used directly as column indices, so they are assumed to be integers
            in ``[0, num_classes)``. Defaults to 20, the value that was
            previously hard-coded.
        ks: The values of k to evaluate.

    Returns:
        dict mapping ``"top_{k}_accuracy"`` to the accuracy for each k in ``ks``.
    """
    if y_true is None:
        y_true = y_test

    # Materialise first: callers pass one-shot iterators such as enumerate(...),
    # and the matrix must be sized from the predictions actually supplied.
    samples = list(predict)
    y_scores = np.zeros((len(samples), num_classes))

    for row, (_, scores) in enumerate(samples):
        for label, score in scores.items():
            # float() also accepts 0-d torch tensors, including ones on a GPU.
            y_scores[row, label] = float(score)

    # Passing the full label set keeps the metric defined even when some class
    # does not occur in y_true.
    all_labels = np.arange(num_classes)
    top_k_accs = {}
    for k in ks:
        acc = top_k_accuracy_score(y_true, y_scores, k=k, labels=all_labels)
        top_k_accs[f"top_{k}_accuracy"] = acc

    return top_k_accs

## GloVe Twitter Embeddings(100d)

In [None]:
# Load the saved GloVe Twitter (100d) embedding matrix and the vocabulary.
embedding_matrix_glove = np.load("../data/embeddings/embedding_matrix_glove_twitter_100.npy")
# vocab.npy stores a pickled Python object; .item() extracts it from the
# 0-d array that np.load returns.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading —
# only load this file from a trusted source.
vocab = np.load("../data/embeddings/vocab.npy", allow_pickle=True).item()

# The matrix is unpacked as (vocabulary size, embedding dimension).
vocab_size, embedding_dim_glove = embedding_matrix_glove.shape
print(f"Vocabulary Size: {vocab_size}")

In [8]:
tokenizer = TweetTokenizer()

def encode(text, vocab, max_len=50):
    """Turn one raw text into a list of ``max_len`` vocabulary ids.

    The text goes through ``p.tokenize``, is lower-cased and is then split by
    the module-level ``tokenizer``. Tokens that are not keys of ``vocab`` get
    the ``"<UNK>"`` id. Shorter sequences are right-padded with the ``"<PAD>"``
    id; longer ones are cut off after ``max_len`` ids.
    """
    normalized = p.tokenize(text).lower()

    token_ids = []
    for token in tokenizer.tokenize(normalized):
        token_ids.append(vocab.get(token, vocab["<UNK>"]))

    missing = max_len - len(token_ids)
    if missing > 0:
        return token_ids + [vocab["<PAD>"]] * missing
    return token_ids[:max_len]

# Every text is encoded to this many token ids.
max_len = 50

# Encode both splits with the loaded vocabulary; one row of ids per text.
X_train_ids_glove = np.array([encode(sample_text, vocab, max_len) for sample_text in X_train])
X_test_ids_glove = np.array([encode(sample_text, vocab, max_len) for sample_text in X_test])

In [9]:

batch_size = 64
num_classes = len(np.unique(y_train))

# The whole training split is created as tensors on `device`, so every batch
# drawn from the loader already lives there.
train_ids_glove = torch.tensor(X_train_ids_glove, dtype=torch.long, device=device)
train_labels_glove = torch.tensor(y_train, dtype=torch.long, device=device)
train_dataset_glove = TensorDataset(train_ids_glove, train_labels_glove)
train_loader_glove = DataLoader(
    train_dataset_glove,
    batch_size=batch_size,
    shuffle=True,
)

# Two-layer BiLSTM on top of the frozen GloVe (100d) embedding matrix.
model_glove = EnhancedBiLSTM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim_glove,
    hidden_dim=256,
    num_classes=num_classes,
    embedding_matrix=embedding_matrix_glove,
    freeze_embeddings=True,
    num_layers=2,
    dropout=0.5,
)

# NOTE(review): `optimizer` and `criterion` are rebound by each model-setup
# cell in this notebook, so run the matching training cell right after this one.
optimizer = optim.Adam(model_glove.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [None]:
# Train the GloVe (100d) BiLSTM for `epochs` epochs. A checkpoint is written
# after every epoch, and the epoch with the lowest validation loss is also
# written to "models/bilstm_glove_twitter_best.pth", the file the evaluation
# cell loads.
# NOTE(review): `optimizer` and `criterion` are notebook globals that every
# model-setup cell rebinds; run this cell right after the GloVe (100d) setup cell.
model_glove.to(device)

train_losses_glove = []
val_losses_glove = []
epochs = 20

# torch.save does not create missing directories.
os.makedirs("models", exist_ok=True)
best_val_loss_glove = float("inf")

# The held-out tensors are the same in every epoch, so build them once. A
# dedicated name is used so the raw-text array `X` is not overwritten.
X_val_tensor = torch.tensor(X_test_ids_glove, dtype=torch.long).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.long).to(device)

for epoch in range(epochs):
    model_glove.train()
    total_loss = 0
    total = 0

    for x_batch, y_batch in train_loader_glove:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        y_pred = model_glove(x_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        # The criterion returns a batch mean; weight it by the batch size so
        # the epoch figure is a per-sample mean even with a smaller last batch.
        total_loss += loss.item() * x_batch.size(0)
        total += x_batch.size(0)

    # Save the model
    torch.save(model_glove.state_dict(), f"models/bilstm_glove_twitter_{epoch+1}_epochs.pth")

    # Validation (whole held-out split in one forward pass)
    model_glove.eval()
    with torch.no_grad():
        logits = model_glove(X_val_tensor)
        pred = logits.argmax(dim=1)
        accuracy = (pred == y_test_tensor).float().mean().item()
        val_loss = criterion(logits, y_test_tensor).item()

    # Keep a copy of the best epoch so far (lowest validation loss).
    if val_loss < best_val_loss_glove:
        best_val_loss_glove = val_loss
        torch.save(model_glove.state_dict(), "models/bilstm_glove_twitter_best.pth")

    train_losses_glove.append(total_loss / total)
    val_losses_glove.append(val_loss)

    print(f"Epoch {epoch+1}, "
          f"Training Loss={total_loss / total:.4f}, "
          f"Validation Loss={val_loss:.4f}, "
          f"Validation Accuracy={accuracy:.4f}")

In [None]:
# Plot the per-epoch training and validation loss of the GloVe (100d) run.
# The x-axis is derived from the recorded histories rather than the global
# `epochs` (rebound by every training cell), so x and y lengths always match.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(range(1, len(train_losses_glove) + 1), train_losses_glove, label='Training Loss', marker='o')
ax.plot(range(1, len(val_losses_glove) + 1), val_losses_glove, label='Validation Loss', marker='x')
ax.set_title('BiLSTM (GloVe): Training vs Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss (Cross Entropy)')
ax.legend()
ax.grid(True)

plt.show()

# Evaluate the selected GloVe (100d) checkpoint on the held-out split.
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
model_glove.load_state_dict(
    torch.load("models/bilstm_glove_twitter_best.pth", map_location=device)
)

model_glove.eval()
with torch.no_grad():
    # A dedicated name keeps the raw-text array `X` from being overwritten.
    X_val_tensor = torch.tensor(X_test_ids_glove, dtype=torch.long).to(device)
    y_test_tensor = torch.tensor(y_test, dtype=torch.long).to(device)

    logits = model_glove(X_val_tensor)
    pred = logits.argmax(dim=1)

    # Convert the class probabilities to plain Python floats in one transfer;
    # iterating a tensor element by element creates one tensor per score and
    # leaves the values on the compute device.
    probabilities = logits.softmax(dim=1).cpu().tolist()
    preds = [dict(enumerate(sample_scores)) for sample_scores in probabilities]

    # top_k_accuracy expects (anything, {label: score}) pairs.
    top_k_accs = top_k_accuracy(enumerate(preds))
    print("Top-K Accuracies:", top_k_accs)

    report = classification_report(y_test_tensor.cpu().numpy(), pred.cpu().numpy())
    print("Classification Report:\n", report)

    # y_test_tensor is already a long tensor on `device`; no re-wrapping needed.
    val_loss = criterion(logits, y_test_tensor).item()
    print(f"Final Validation Loss: {val_loss:.4f}")


## GloVe Twitter Embeddings(200d)

In [None]:
# Load the saved GloVe Twitter (200d) embedding matrix and the vocabulary.
embedding_matrix_glove_200d = np.load("../data/embeddings/embedding_matrix_glove_twitter_200.npy")
# vocab.npy stores a pickled Python object; .item() extracts it from the
# 0-d array that np.load returns.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading —
# only load this file from a trusted source.
vocab = np.load("../data/embeddings/vocab.npy", allow_pickle=True).item()

# The matrix is unpacked as (vocabulary size, embedding dimension).
vocab_size, embedding_dim_glove_200d = embedding_matrix_glove_200d.shape
print(f"Vocabulary Size: {vocab_size}")

In [13]:
tokenizer = TweetTokenizer()

# NOTE(review): this redefines the `encode` helper that an earlier cell of this
# notebook already defines with the same body; consider defining it only once.
def encode(text, vocab, max_len=50):
    """Turn one raw text into a list of ``max_len`` vocabulary ids.

    The text goes through ``p.tokenize``, is lower-cased and is then split by
    the module-level ``tokenizer``. Tokens that are not keys of ``vocab`` get
    the ``"<UNK>"`` id. Shorter sequences are right-padded with the ``"<PAD>"``
    id; longer ones are cut off after ``max_len`` ids.
    """
    normalized = p.tokenize(text).lower()

    token_ids = []
    for token in tokenizer.tokenize(normalized):
        token_ids.append(vocab.get(token, vocab["<UNK>"]))

    missing = max_len - len(token_ids)
    if missing > 0:
        return token_ids + [vocab["<PAD>"]] * missing
    return token_ids[:max_len]

# Every text is encoded to this many token ids.
max_len = 50

# Encode both splits with the loaded vocabulary; one row of ids per text.
X_train_ids_glove_200d = np.array([encode(sample_text, vocab, max_len) for sample_text in X_train])
X_test_ids_glove_200d = np.array([encode(sample_text, vocab, max_len) for sample_text in X_test])

In [29]:

batch_size = 64
num_classes = len(np.unique(y_train))

# The whole training split is created as tensors on `device`, so every batch
# drawn from the loader already lives there.
train_ids_glove_200d = torch.tensor(X_train_ids_glove_200d, dtype=torch.long, device=device)
train_labels_glove_200d = torch.tensor(y_train, dtype=torch.long, device=device)
train_dataset_glove_200d = TensorDataset(train_ids_glove_200d, train_labels_glove_200d)
train_loader_glove_200d = DataLoader(
    train_dataset_glove_200d,
    batch_size=batch_size,
    shuffle=True,
)

# Two-layer BiLSTM on top of the frozen GloVe (200d) embedding matrix.
model_glove_200d = EnhancedBiLSTM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim_glove_200d,
    hidden_dim=256,
    num_classes=num_classes,
    embedding_matrix=embedding_matrix_glove_200d,
    freeze_embeddings=True,
    num_layers=2,
    dropout=0.5,
)

# NOTE(review): `optimizer` and `criterion` are rebound by each model-setup
# cell in this notebook, so run the matching training cell right after this one.
optimizer = optim.Adam(model_glove_200d.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [None]:
# Train the GloVe (200d) BiLSTM for `epochs` epochs. A checkpoint is written
# after every epoch, and the epoch with the lowest validation loss is also
# written to "models/bilstm_glove_twitter_200d_best.pth", the file the
# evaluation cell loads.
# NOTE(review): `optimizer` and `criterion` are notebook globals that every
# model-setup cell rebinds; run this cell right after the GloVe (200d) setup cell.
train_losses_glove_200d = []
val_losses_glove_200d = []
epochs = 20

model_glove_200d.to(device)

# torch.save does not create missing directories.
os.makedirs("models", exist_ok=True)
best_val_loss_glove_200d = float("inf")

# The held-out tensors are the same in every epoch, so build them once. A
# dedicated name is used so the raw-text array `X` is not overwritten.
X_val_tensor = torch.tensor(X_test_ids_glove_200d, dtype=torch.long).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.long).to(device)

for epoch in range(epochs):
    model_glove_200d.train()
    total_loss = 0
    total = 0

    for x_batch, y_batch in train_loader_glove_200d:
        # No-op when the dataset tensors already live on `device`; kept so the
        # loop matches the GloVe (100d) training loop.
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        y_pred = model_glove_200d(x_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        # The criterion returns a batch mean; weight it by the batch size so
        # the epoch figure is a per-sample mean even with a smaller last batch.
        total_loss += loss.item() * x_batch.size(0)
        total += x_batch.size(0)

    # Save the model
    torch.save(model_glove_200d.state_dict(), f"models/bilstm_glove_twitter_200d_{epoch+1}_epochs.pth")

    # Validation (whole held-out split in one forward pass)
    model_glove_200d.eval()
    with torch.no_grad():
        logits = model_glove_200d(X_val_tensor)
        pred = logits.argmax(dim=1)
        accuracy = (pred == y_test_tensor).float().mean().item()
        val_loss = criterion(logits, y_test_tensor).item()

    # Keep a copy of the best epoch so far (lowest validation loss).
    if val_loss < best_val_loss_glove_200d:
        best_val_loss_glove_200d = val_loss
        torch.save(model_glove_200d.state_dict(), "models/bilstm_glove_twitter_200d_best.pth")

    train_losses_glove_200d.append(total_loss / total)
    val_losses_glove_200d.append(val_loss)

    print(f"Epoch {epoch+1}, Training Loss={total_loss / total:.4f}, Validation Loss={val_loss:.4f}, Validation Accuracy={accuracy:.4f}")



In [None]:
# Plot the per-epoch training and validation loss of the GloVe (200d) run.
# The x-axis is derived from the recorded histories rather than the global
# `epochs` (rebound by every training cell), so x and y lengths always match.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(range(1, len(train_losses_glove_200d) + 1), train_losses_glove_200d, label='Training Loss', marker='o')
ax.plot(range(1, len(val_losses_glove_200d) + 1), val_losses_glove_200d, label='Validation Loss', marker='x')
ax.set_title('BiLSTM (GloVe 200d): Training vs Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss (Cross Entropy)')
ax.legend()
ax.grid(True)

plt.show()

# Evaluate the selected GloVe (200d) checkpoint on the held-out split.
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
model_glove_200d.load_state_dict(
    torch.load("models/bilstm_glove_twitter_200d_best.pth", map_location=device)
)

model_glove_200d.eval()
with torch.no_grad():
    # A dedicated name keeps the raw-text array `X` from being overwritten.
    X_val_tensor = torch.tensor(X_test_ids_glove_200d, dtype=torch.long).to(device)
    y_test_tensor = torch.tensor(y_test, dtype=torch.long).to(device)

    logits = model_glove_200d(X_val_tensor)
    pred = logits.argmax(dim=1)

    # Convert the class probabilities to plain Python floats in one transfer;
    # iterating a tensor element by element creates one tensor per score and
    # leaves the values on the compute device.
    probabilities = logits.softmax(dim=1).cpu().tolist()
    preds = [dict(enumerate(sample_scores)) for sample_scores in probabilities]

    # top_k_accuracy expects (anything, {label: score}) pairs.
    top_k_accs = top_k_accuracy(enumerate(preds))
    print("Top-K Accuracies:", top_k_accs)

    report = classification_report(y_test_tensor.cpu().numpy(), pred.cpu().numpy())
    print("Classification Report:\n", report)

    # y_test_tensor is already a long tensor on `device`; no re-wrapping needed.
    val_loss = criterion(logits, y_test_tensor).item()
    print(f"Final Validation Loss: {val_loss:.4f}")


## FastText Crawl Subwords Embeddings(300d)

In [None]:
# Load the saved FastText crawl-subword (300d) embedding matrix and the vocabulary.
embedding_matrix_fasttext = np.load("../data/embeddings/embedding_matrix_crawl_subword_300.npy")
# vocab.npy stores a pickled Python object; .item() extracts it from the
# 0-d array that np.load returns.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading —
# only load this file from a trusted source.
vocab = np.load("../data/embeddings/vocab.npy", allow_pickle=True).item()

# The matrix is unpacked as (vocabulary size, embedding dimension).
vocab_size, embedding_dim_fasttext = embedding_matrix_fasttext.shape
print(f"Vocabulary Size: {vocab_size}")

In [22]:
tokenizer = TweetTokenizer()

# NOTE(review): this redefines the `encode` helper that earlier cells of this
# notebook already define with the same body; consider defining it only once.
def encode(text, vocab, max_len=50):
    """Turn one raw text into a list of ``max_len`` vocabulary ids.

    The text goes through ``p.tokenize``, is lower-cased and is then split by
    the module-level ``tokenizer``. Tokens that are not keys of ``vocab`` get
    the ``"<UNK>"`` id. Shorter sequences are right-padded with the ``"<PAD>"``
    id; longer ones are cut off after ``max_len`` ids.
    """
    normalized = p.tokenize(text).lower()

    token_ids = []
    for token in tokenizer.tokenize(normalized):
        token_ids.append(vocab.get(token, vocab["<UNK>"]))

    missing = max_len - len(token_ids)
    if missing > 0:
        return token_ids + [vocab["<PAD>"]] * missing
    return token_ids[:max_len]

# Every text is encoded to this many token ids.
max_len = 50

# Encode both splits with the loaded vocabulary; one row of ids per text.
X_train_ids_fasttext = np.array([encode(sample_text, vocab, max_len) for sample_text in X_train])
X_test_ids_fasttext = np.array([encode(sample_text, vocab, max_len) for sample_text in X_test])

In [23]:

batch_size = 64
num_classes = len(np.unique(y_train))

# The whole training split is created as tensors on `device`, so every batch
# drawn from the loader already lives there.
train_ids_fasttext = torch.tensor(X_train_ids_fasttext, dtype=torch.long, device=device)
train_labels_fasttext = torch.tensor(y_train, dtype=torch.long, device=device)
train_dataset_fasttext = TensorDataset(train_ids_fasttext, train_labels_fasttext)
train_loader_fasttext = DataLoader(
    train_dataset_fasttext,
    batch_size=batch_size,
    shuffle=True,
)

# Two-layer BiLSTM on top of the frozen FastText embedding matrix; this model
# uses hidden_dim=128, unlike the 256 used for the GloVe models.
model_fasttext = EnhancedBiLSTM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim_fasttext,
    hidden_dim=128,
    num_classes=num_classes,
    embedding_matrix=embedding_matrix_fasttext,
    freeze_embeddings=True,
    num_layers=2,
    dropout=0.5,
)

# NOTE(review): `optimizer` and `criterion` are rebound by each model-setup
# cell in this notebook, so run the matching training cell right after this one.
optimizer = optim.Adam(model_fasttext.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

In [None]:
# Train the FastText BiLSTM for `epochs` epochs. A checkpoint is written after
# every epoch, and the epoch with the lowest validation loss is also written
# to "models/bilstm_fasttext_best.pth", the file the evaluation cell loads.
# NOTE(review): `optimizer` and `criterion` are notebook globals that every
# model-setup cell rebinds; run this cell right after the FastText setup cell.
train_losses_fasttext = []
val_losses_fasttext = []
epochs = 20

model_fasttext.to(device)

# torch.save does not create missing directories.
os.makedirs("models", exist_ok=True)
best_val_loss_fasttext = float("inf")

# The held-out tensors are the same in every epoch, so build them once. A
# dedicated name is used so the raw-text array `X` is not overwritten.
X_val_tensor = torch.tensor(X_test_ids_fasttext, dtype=torch.long).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.long).to(device)

for epoch in range(epochs):
    model_fasttext.train()
    total_loss = 0
    total = 0

    for x_batch, y_batch in train_loader_fasttext:
        # No-op when the dataset tensors already live on `device`; kept so the
        # loop matches the GloVe (100d) training loop.
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        y_pred = model_fasttext(x_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        # The criterion returns a batch mean; weight it by the batch size so
        # the epoch figure is a per-sample mean even with a smaller last batch.
        total_loss += loss.item() * x_batch.size(0)
        total += x_batch.size(0)

    # Save the model (file name kept as-is so existing checkpoints still match)
    torch.save(model_fasttext.state_dict(), f"models/bilstm_fasttext_{epoch+1}_epoches.pth")

    # Validation (whole held-out split in one forward pass)
    model_fasttext.eval()
    with torch.no_grad():
        logits = model_fasttext(X_val_tensor)
        pred = logits.argmax(dim=1)
        accuracy = (pred == y_test_tensor).float().mean().item()
        val_loss = criterion(logits, y_test_tensor).item()

    # Keep a copy of the best epoch so far (lowest validation loss).
    if val_loss < best_val_loss_fasttext:
        best_val_loss_fasttext = val_loss
        torch.save(model_fasttext.state_dict(), "models/bilstm_fasttext_best.pth")

    train_losses_fasttext.append(total_loss / total)
    val_losses_fasttext.append(val_loss)

    print(f"Epoch {epoch+1}, Training Loss={total_loss / total:.4f}, Validation Loss={val_loss:.4f}, Validation Accuracy={accuracy:.4f}")



In [None]:
# Plot the per-epoch training and validation loss of the FastText run.
# The x-axis is derived from the recorded histories rather than the global
# `epochs` (rebound by every training cell), so x and y lengths always match.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(range(1, len(train_losses_fasttext) + 1), train_losses_fasttext, label='Training Loss', marker='o')
ax.plot(range(1, len(val_losses_fasttext) + 1), val_losses_fasttext, label='Validation Loss', marker='x')
ax.set_title('BiLSTM (FastText): Training vs Validation Loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss (Cross Entropy)')
ax.legend()
ax.grid(True)

plt.show()

# Evaluate the selected FastText checkpoint on the held-out split.
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
model_fasttext.load_state_dict(
    torch.load("models/bilstm_fasttext_best.pth", map_location=device)
)

model_fasttext.eval()
with torch.no_grad():
    # A dedicated name keeps the raw-text array `X` from being overwritten.
    X_val_tensor = torch.tensor(X_test_ids_fasttext, dtype=torch.long).to(device)
    y_test_tensor = torch.tensor(y_test, dtype=torch.long).to(device)

    logits = model_fasttext(X_val_tensor)
    pred = logits.argmax(dim=1)

    # Convert the class probabilities to plain Python floats in one transfer;
    # iterating a tensor element by element creates one tensor per score and
    # leaves the values on the compute device.
    probabilities = logits.softmax(dim=1).cpu().tolist()
    preds = [dict(enumerate(sample_scores)) for sample_scores in probabilities]

    # top_k_accuracy expects (anything, {label: score}) pairs.
    top_k_accs = top_k_accuracy(enumerate(preds))
    print("Top-K Accuracies:", top_k_accs)

    report = classification_report(y_test_tensor.cpu().numpy(), pred.cpu().numpy())
    print("Classification Report:\n", report)

    # y_test_tensor is already a long tensor on `device`; no re-wrapping needed.
    val_loss = criterion(logits, y_test_tensor).item()
    print(f"Final Validation Loss: {val_loss:.4f}")
