# 2025 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## 1.1 Import Libraries


In [None]:
import json
import os
import pickle
import random
import re
from collections import Counter, defaultdict
from pathlib import Path

import faiss
import nltk
import spacy
from lemminflect import getAllInflections
from nltk.corpus import stopwords as nltk_stopwords, wordnet
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import sent_tokenize

from datasets import Dataset
from sentence_transformers import CrossEncoder, InputExample, SentenceTransformer, losses
from transformers import (
    BertForSequenceClassification,
    BertModel,
    BertTokenizer,
    Trainer,
    TrainingArguments,
    logging as hf_logging
)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset


import numpy as np
import pandas as pd
from tqdm import tqdm

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/felikskong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/felikskong/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/felikskong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felikskong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 1.2 Define file path

In [9]:
train_claims_path = './data/train-claims.json'
dev_claims_path = './data/dev-claims.json'
evidence_path = './data/evidence.json'

## 1.3 Task 1 - Preprocessing

In [None]:
# ✅
with open(train_claims_path, 'r') as f:
    train_claims = json.load(f)
with open(evidence_path, 'r') as f:
    evidence_dict = json.load(f)

all_nouns = []
for claim_obj in train_claims.values():
    doc = nlp(claim_obj["claim_text"])
    nouns = [token.lemma_.lower() for token in doc if token.pos_ == "NOUN"]
    all_nouns.extend(nouns)

top_keywords = set(word for word, _ in Counter(all_nouns).most_common(100))

all_forms = set()
for lemma in top_keywords:
    all_forms.add(lemma)
    infl_map = getAllInflections(lemma, upos="NOUN")
    for forms in infl_map.values():
        all_forms.update(forms)

def contains_climate_keywords(text: str, all_forms: set) -> bool:
    words = re.findall(r"\b[a-z']+\b", text.lower())
    return any(word in all_forms for word in words)


def is_english(text: str, threshold: float = 0.5) -> bool:
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    if len(text) == 0:
        return False
    english_char_count = sum(1 for char in text if char.isalpha())
    return (english_char_count / len(text)) >= threshold

def clean_and_split(eid, text):
    result_ids = []
    result_texts = []
    sentences = sent_tokenize(text)
    for i, sent in enumerate(sentences):
        sent = sent.lower()
        sent = re.sub(r'[^a-z0-9\s.,!?]', '', sent)
        sent = re.sub(r'\s+', ' ', sent).strip()
        if len(sent.split()) >= 5:
            result_ids.append(f"{eid}_s{i}")
            result_texts.append(sent)
    return result_ids, result_texts
word_embedding_path = './word_embedding/evidence_embeddings.npy'
word_embedding_meta_path = "./word_embedding/evidence_meta.pkl"


with open(evidence_path, 'r') as f:
    evidence_dict = json.load(f)
eids  = list(evidence_dict.keys())
texts = list(evidence_dict.values())
english_pairs = [
    (eid, txt)
    for eid, txt in zip(eids, texts)
    if is_english(txt)
]
print(f"Step1: English keep {len(english_pairs)}/{len(texts)}")

climate_pairs = [
    (eid, txt)
    for eid, txt in english_pairs
    if contains_climate_keywords(txt, all_forms)
]
print(f"Step2: Climate-related keep {len(climate_pairs)}/{len(english_pairs)}")


cleaned_evidence_ids = []
cleaned_evidence_texts = []
original_evidence_ids = []

for eid, text in climate_pairs:
    cleaned_ids, cleaned_texts = clean_and_split(eid, text)
    cleaned_evidence_ids.extend(cleaned_ids)
    cleaned_evidence_texts.extend(cleaned_texts)
    original_evidence_ids.extend([eid] * len(cleaned_ids))

Step1: English keep 1207838/1208827
Step2: Climate-related keep 385471/1207838


## 1.4 Task 2 - Preprocessing

### 1.4.1 Data Processing for BERT

In [None]:
# ✅
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

label2id = {
    "SUPPORTS": 0,
    "REFUTES": 1,
    "NOT_ENOUGH_INFO": 2,
    "DISPUTED": 3
}

class ClaimEvidenceDataset(Dataset):
    def __init__(self, claims, evidence_dict, tokenizer, max_length=512):
        self.encodings = []
        self.labels = []
        for claim_data in claims.values():
            claim_text = claim_data["claim_text"]
            label_str = claim_data["claim_label"]
            for eid in claim_data.get("evidences", []):
                if eid in evidence_dict:
                    evidence_text = evidence_dict[eid]
                    encoded = tokenizer(
                        claim_text,
                        evidence_text,
                        padding="max_length",
                        truncation=True,
                        max_length=max_length,
                        return_tensors="pt"
                    )
                    self.encodings.append({k: v.squeeze() for k, v in encoded.items()})
                    self.labels.append(label2id[label_str])

    def __getitem__(self, idx):
        item = self.encodings[idx]
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
PAD, MASK = tokenizer.pad_token_id, tokenizer.mask_token_id

label2id = {"SUPPORTS": 0, "REFUTES": 1, "NOT_ENOUGH_INFO": 2, "DISPUTED": 3}
id2label = {v: k for k, v in label2id.items()}


class ClaimEvidenceDataset(Dataset):
    def __init__(self,
                 claims: dict,
                 evidence_dict: dict,
                 tokenizer,
                 max_length: int = 512,
                 balance: bool = True,
                 target_ratio: float = 1.0,
                 augmenters=None,
                 aug_params=None,
                 seed: int = 42):

        random.seed(seed)
        self.tokenizer, self.max_length = tokenizer, max_length
        self.encodings, self.labels = [], []
        self.augmenters = set(augmenters or ['dropout', 'swap', 'pad', 'cutmix'])

        _default = dict(dropout_prob=0.15,
                        swap_prob=0.10,
                        pad_prob=0.05,
                        cutmix_min=0.3,
                        cutmix_max=0.7)
        self.aug_params = {**_default, **(aug_params or {})}

        for cdict in claims.values():
            claim_text = cdict["claim_text"]
            lab = label2id[cdict["claim_label"]]
            for eid in cdict.get("evidences", []):
                if eid in evidence_dict:
                    evi = evidence_dict[eid]
                    toks = tokenizer(claim_text, evi,
                                     truncation=True,
                                     padding="max_length",
                                     max_length=max_length,
                                     return_tensors="pt")
                    self.encodings.append({k: v.squeeze(0) for k, v in toks.items()})
                    self.labels.append(lab)

        if balance:
            self._balance_dataset(target_ratio)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {k: v.clone() for k, v in self.encodings[idx].items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def _balance_dataset(self, target_ratio: float):
        by_label = defaultdict(list)
        for i, y in enumerate(self.labels):
            by_label[y].append(i)

        max_count = int(max(len(v) for v in by_label.values()) * target_ratio)

        for lab, idx_list in by_label.items():
            need = max_count - len(idx_list)
            for _ in range(max(0, need)):
                base_idx = random.choice(idx_list)
                base_enc = self.encodings[base_idx]
                aug_enc = self._augment_encoding(base_enc, lab)
                self.encodings.append(aug_enc)
                self.labels.append(lab)

    def _augment_encoding(self, enc, lab):
        enc = {k: v.clone() for k, v in enc.items()} 
        choice = random.choice(list(self.augmenters))
        if choice == 'dropout':
            self._token_dropout(enc)
        elif choice == 'swap':
            self._swap_neighbor(enc)
        elif choice == 'pad':
            self._random_pad(enc)
        elif choice == 'cutmix':
            self._cutmix(enc, lab)
        return enc

    def _token_dropout(self, enc):
        ids = enc['input_ids']
        mask = torch.rand_like(ids.float()) < self.aug_params['dropout_prob']
        ids[mask & (ids != PAD)] = MASK
        enc['input_ids'] = ids

    def _swap_neighbor(self, enc):
        ids = enc['input_ids']
        for i in range(1, len(ids) - 1):
            if random.random() < self.aug_params['swap_prob'] and ids[i] not in (PAD, MASK):
                ids[i], ids[i + 1] = ids[i + 1].clone(), ids[i].clone()
        enc['input_ids'] = ids

    def _random_pad(self, enc):
        ids, mask = enc['input_ids'], enc['attention_mask']
        pad_prob = self.aug_params['pad_prob']
        new_ids, new_mask = [], []
        for tok, m in zip(ids, mask):
            if m.item() == 0:  
                break
            new_ids.append(tok.item())
            new_mask.append(1)
            if random.random() < pad_prob and len(new_ids) < self.max_length - 1:
                new_ids.append(PAD)
                new_mask.append(0)
        new_ids = (new_ids + [PAD] * self.max_length)[:self.max_length]
        new_mask = (new_mask + [0] * self.max_length)[:self.max_length]
        enc['input_ids'] = torch.tensor(new_ids, dtype=torch.long)
        enc['attention_mask'] = torch.tensor(new_mask, dtype=torch.long)

    def _cutmix(self, enc, lab):
        same_idxs = [i for i, y in enumerate(self.labels) if y == lab]
        other = {k: v.clone() for k, v in self.encodings[random.choice(same_idxs)].items()}
        lam = random.uniform(self.aug_params['cutmix_min'],
                             self.aug_params['cutmix_max'])
        cut_point = int(lam * self.max_length)
        enc['input_ids'][cut_point:] = other['input_ids'][cut_point:]
        enc['attention_mask'][cut_point:] = other['attention_mask'][cut_point:]

### 1.4.2 Data Processing for RNN

In [None]:
# ✅
stopwords = set(nltk_stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')

MAX_LEN = 50
BATCH_SIZE = 64

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word, 'v')
    return lemmatizer.lemmatize(lemma, 'n')

def preprocess(text, remove_stopwords=True, lemma=True, stem=False):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [t for t in tokens if re.match('^[a-zA-Z0-9-]+$', t)]
    if remove_stopwords:
        tokens = [t for t in tokens if t not in stopwords]
    if lemma:
        tokens = [lemmatize(t) for t in tokens]
    if stem:
        tokens = [stemmer.stem(t) for t in tokens]
    return ' '.join(tokens)

def load_data(claims_file, evidence_file):
    with open(claims_file, 'r', encoding='utf-8') as f:
        claims_data = json.load(f)
    with open(evidence_file, 'r', encoding='utf-8') as f:
        evid_data = json.load(f)
    
    claim_texts, evid_texts, labels = [], [], []
    for cid, cdata in claims_data.items():
        claim = preprocess(cdata['claim_text'])
        evid_ids = cdata['evidences']
        evids = ' '.join([evid_data.get(eid, '') for eid in evid_ids])
        evid = preprocess(evids)
        claim_texts.append(claim)
        evid_texts.append(evid)
        labels.append(cdata['claim_label'])
    
    df = pd.DataFrame({
        'claim': claim_texts,
        'evidence': evid_texts,
        'label': labels
    })
    return df

train_df = load_data(train_claims_path, evidence_path)
dev_df = load_data(dev_claims_path, evidence_path)

all_text = train_df['claim'].tolist() + train_df['evidence'].tolist()
token_counts = Counter(w for text in all_text for w in text.split())
vocab = {w: idx+1 for idx, (w, _) in enumerate(token_counts.items())}
vocab_size = len(vocab) + 1

def text_to_seq(text):
    seq = [vocab.get(w, 0) for w in text.split()]
    return seq + [0]*(MAX_LEN - len(seq)) if len(seq) < MAX_LEN else seq[:MAX_LEN]

train_claims = [text_to_seq(t) for t in train_df['claim']]
train_evids = [text_to_seq(t) for t in train_df['evidence']]
dev_claims = [text_to_seq(t) for t in dev_df['claim']]
dev_evids = [text_to_seq(t) for t in dev_df['evidence']]

label_enc = LabelEncoder()
train_labels = label_enc.fit_transform(train_df['label'])
dev_labels = label_enc.transform(dev_df['label'])

class ClaimDataset(Dataset):
    def __init__(self, claims, evidences, labels):
        self.claims = torch.tensor(claims, dtype=torch.long)
        self.evidences = torch.tensor(evidences, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.long)
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return self.claims[idx], self.evidences[idx], self.labels[idx]

train_ds = ClaimDataset(train_claims, train_evids, train_labels)
dev_ds = ClaimDataset(dev_claims, dev_evids, dev_labels)
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
dev_dl = DataLoader(dev_ds, batch_size=BATCH_SIZE)

class SelfAttentionPooling(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.attention = nn.Linear(input_dim, 1)

    def forward(self, x):
        weights = torch.softmax(self.attention(x), dim=1)
        pooled = torch.sum(weights * x, dim=1)
        return pooled

# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## 2.1 Task 1

### 2.1.1 Train MiniLM

In [None]:
# ✅
with open(train_claims_path, 'r') as f:
    train_claims = json.load(f)
with open(evidence_path, 'r') as f:
    evidence_dict = json.load(f)

train_samples = []
missed = 0

for claim in train_claims.values():
    claim_text = claim['claim_text']
    evidence_ids = claim.get('evidences', [])
    for eid in evidence_ids:
        if eid in evidence_dict:
            ev_text = evidence_dict[eid]
            train_samples.append(InputExample(texts=[claim_text, ev_text], label=1.0))
        else:
            missed += 1


print(f"Total training pairs: {len(train_samples)}")
print(f"Missing evidence ids: {missed}")

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=32)

train_loss = losses.MultipleNegativesRankingLoss(model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=5,
    warmup_steps=100,
    show_progress_bar=True
)

model.save('./model/my_finetuned_minilm_retriever')
print("Finetuned model saved.")

print("""
Total training pairs: 4122
Missing evidence ids: 0
Iteration: 100%|██████████| 129/129 [01:06<00:00,  1.93it/s]
Iteration: 100%|██████████| 129/129 [00:43<00:00,  2.98it/s]
Iteration: 100%|██████████| 129/129 [00:39<00:00,  3.27it/s]
Iteration: 100%|██████████| 129/129 [00:41<00:00,  3.13it/s]
Iteration: 100%|██████████| 129/129 [00:42<00:00,  3.05it/s]
Epoch: 100%|██████████| 5/5 [03:53<00:00, 46.64s/it]
Finetuned model saved.
""")

Total training pairs: 4122
Missing evidence ids: 0


Iteration: 100%|██████████| 129/129 [01:06<00:00,  1.93it/s]
Iteration: 100%|██████████| 129/129 [00:43<00:00,  2.98it/s]
Iteration: 100%|██████████| 129/129 [00:39<00:00,  3.27it/s]
Iteration: 100%|██████████| 129/129 [00:41<00:00,  3.13it/s]
Iteration: 100%|██████████| 129/129 [00:42<00:00,  3.05it/s]
Epoch: 100%|██████████| 5/5 [03:53<00:00, 46.64s/it]


Finetuned model saved.


### 2.1.2 Train Msmarco Reranker

In [None]:
# ✅
with open(train_claims_path, 'r') as f:
    train_claims = json.load(f)
with open(evidence_path, 'r') as f:
    evidence_dict = json.load(f)

train_samples = []

def generate_samples(claims_data):
    samples = []
    for claim in claims_data.values():
        claim_text = claim["claim_text"]
        evidence_ids = claim.get("evidences", [])
        pos_evidence_texts = [evidence_dict[eid] for eid in evidence_ids if eid in evidence_dict]

        for ev in pos_evidence_texts:
            samples.append(InputExample(texts=[claim_text, ev], label=1.0))

        neg_pool = [e for eid, e in evidence_dict.items() if eid not in evidence_ids]
        for _ in range(len(pos_evidence_texts)):
            neg_ev = random.choice(neg_pool)
            samples.append(InputExample(texts=[claim_text, neg_ev], label=0.0))

    return samples

train_samples.extend(generate_samples(train_claims))

print(f"Total training samples: {len(train_samples)}")

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=16)

model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2", num_labels=1)

model.fit(
    train_dataloader=train_dataloader,
    epochs=5,
    warmup_steps=100,
    show_progress_bar=True
)

model.save('./model/my_finetuned_msmarco_reranker')
print("Finetuned model saved.")

print("""
Total training samples: 8244
Iteration: 100%|██████████| 516/516 [01:09<00:00,  7.41it/s]
Iteration: 100%|██████████| 516/516 [01:05<00:00,  7.89it/s]
Iteration: 100%|██████████| 516/516 [01:12<00:00,  7.11it/s]
Iteration: 100%|██████████| 516/516 [01:11<00:00,  7.27it/s]
Iteration: 100%|██████████| 516/516 [01:11<00:00,  7.20it/s]
Epoch: 100%|██████████| 5/5 [05:50<00:00, 70.05s/it]
Finetuned model saved.
""")

Total training samples: 8244


Iteration: 100%|██████████| 516/516 [01:09<00:00,  7.41it/s]
Iteration: 100%|██████████| 516/516 [01:05<00:00,  7.89it/s]
Iteration: 100%|██████████| 516/516 [01:12<00:00,  7.11it/s]
Iteration: 100%|██████████| 516/516 [01:11<00:00,  7.27it/s]
Iteration: 100%|██████████| 516/516 [01:11<00:00,  7.20it/s]
Epoch: 100%|██████████| 5/5 [05:50<00:00, 70.05s/it]


Finetuned model saved.


### 2.1.3 Load Finetuned models

In [None]:
# ✅
model = SentenceTransformer('./model/my_finetuned_minilm_retriever')
reranker =  CrossEncoder('./model/my_finetuned_msmarco_reranker')

### 2.1.4 Encode Evidence Dictionary

In [None]:
# ✅
evidence_embeddings = model.encode(
    cleaned_evidence_texts,
    convert_to_numpy=True,
    normalize_embeddings= True,
    show_progress_bar=True
)

os.makedirs("./word_embedding", exist_ok=True)
np.save(word_embedding_path, evidence_embeddings)

with open(word_embedding_meta_path, "wb") as f:
    pickle.dump((cleaned_evidence_ids, cleaned_evidence_texts, original_evidence_ids), f)

Batches: 100%|██████████| 12133/12133 [05:59<00:00, 33.76it/s]


## 2.2 Task 2

### 2.2.1 BERT Finetuning

In [None]:
model_path = "./model/my_bert_classifier"

with open(train_claims_path, 'r') as f:
    train_claims = json.load(f)
with open(evidence_path, 'r') as f:
    evidence_dict = json.load(f)

train_dataset = train_dataset = ClaimEvidenceDataset(
    claims=train_claims,
    evidence_dict=evidence_dict,
    tokenizer=tokenizer,
    max_length=512,
    balance=True,             
    target_ratio=1.0,
)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    evaluation_strategy="no",
    save_strategy="no",
    logging_strategy="no",   
    logging_steps=50,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer
)

trainer.train()

model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

### 2.2.2 Train RNN

In [None]:
EMBED_DIM = 100
HIDDEN_DIM = 64
NUM_CLASSES = 4
DROPOUT_PROB = 0.4
EPOCHS = 20
LR = 1e-4

DEVICE = "cpu"
if torch.cuda.is_available(): 
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"

class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.embed_dropout = nn.Dropout(DROPOUT_PROB)
        self.rnn_claim = nn.RNN(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.rnn_evid = nn.RNN(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.rnn_dropout = nn.Dropout(DROPOUT_PROB)
        self.attention_claim = SelfAttentionPooling(hidden_dim * 2)
        self.attention_evid = SelfAttentionPooling(hidden_dim * 2)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 4, 128),
            nn.ReLU(),
            nn.Dropout(DROPOUT_PROB),
            nn.Linear(128, num_classes)
        )

    def forward(self, claim, evidence):
        claim_emb = self.embed_dropout(self.embedding(claim))
        evid_emb = self.embed_dropout(self.embedding(evidence))
        
        claim_out, _ = self.rnn_claim(claim_emb)
        evid_out, _ = self.rnn_evid(evid_emb)
        
        claim_out = self.rnn_dropout(claim_out)
        evid_out = self.rnn_dropout(evid_out)
        
        claim_pool = self.attention_claim(claim_out)
        evid_pool = self.attention_evid(evid_out)
        
        combined = torch.cat([claim_pool, evid_pool], dim=1)
        return self.classifier(combined)

model = RNNModel(vocab_size, EMBED_DIM, HIDDEN_DIM, NUM_CLASSES).to(DEVICE)

In [None]:
label2idx = {
    "SUPPORTS": 0,
    "REFUTES": 1,
    "NOT_ENOUGH_INFO": 2,
    "DISPUTED": 3
}

with open(train_claims_path, 'r', encoding='utf-8') as f:
    train_claim = json.load(f)

label_counts = Counter([label2idx[obj["claim_label"]] for obj in train_claim.values()])
total = sum(label_counts.values())

class_weights = [total / label_counts[i] for i in range(len(label2idx))]

weights = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)

criterion = nn.CrossEntropyLoss(weight=weights)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

best_acc = 0.0
for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss, total_correct = 0, 0
    for claim, evid, label in tqdm(train_dl, desc=f"Epoch {epoch}"):
        claim, evid, label = claim.to(DEVICE), evid.to(DEVICE), label.to(DEVICE)
        optimizer.zero_grad()
        out = model(claim, evid)
        loss = criterion(out, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_correct += (out.argmax(1) == label).sum().item()
    acc = total_correct / len(train_ds)
    print(f"Train Loss: {total_loss/len(train_dl):.4f}, Train Acc: {acc:.4f}")
    
    model.eval()
    val_loss, val_correct = 0, 0
    with torch.no_grad():
        for claim, evid, label in dev_dl:
            claim, evid, label = claim.to(DEVICE), evid.to(DEVICE), label.to(DEVICE)
            out = model(claim, evid)
            loss = criterion(out, label)
            val_loss += loss.item()
            val_correct += (out.argmax(1) == label).sum().item()
    val_acc = val_correct / len(dev_ds)
    print(f"Val Loss: {val_loss/len(dev_dl):.4f}, Val Acc: {val_acc:.4f}")

    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), "rnn_model.pth")
        print(f"✅ New best model saved (epoch {epoch}, acc {val_acc:.4%})\n")
    else:
        print()

import pickle
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_enc, f)

Epoch 1: 100%|██████████| 20/20 [00:01<00:00, 12.34it/s]


Train Loss: 4.6202, Train Acc: 0.3542
Val Loss: 1.3208, Val Acc: 0.4416
✅ New best model saved (epoch 1, acc 44.1558%)



Epoch 2: 100%|██████████| 20/20 [00:01<00:00, 12.00it/s]


Train Loss: 3.2437, Train Acc: 0.4015
Val Loss: 1.3139, Val Acc: 0.4416



Epoch 3: 100%|██████████| 20/20 [00:01<00:00, 12.65it/s]


Train Loss: 3.2604, Train Acc: 0.4251
Val Loss: 1.3084, Val Acc: 0.4286



Epoch 4: 100%|██████████| 20/20 [00:01<00:00, 12.72it/s]


Train Loss: 3.2710, Train Acc: 0.4357
Val Loss: 1.3071, Val Acc: 0.4351



Epoch 5: 100%|██████████| 20/20 [00:01<00:00, 12.43it/s]


Train Loss: 2.7899, Train Acc: 0.4316
Val Loss: 1.3043, Val Acc: 0.4221



Epoch 6: 100%|██████████| 20/20 [00:01<00:00, 12.73it/s]


Train Loss: 3.1248, Train Acc: 0.4438
Val Loss: 1.3016, Val Acc: 0.4221



Epoch 7: 100%|██████████| 20/20 [00:01<00:00, 12.33it/s]


Train Loss: 3.4566, Train Acc: 0.4226
Val Loss: 1.3015, Val Acc: 0.4416



Epoch 8: 100%|██████████| 20/20 [00:01<00:00, 12.47it/s]


Train Loss: 2.8482, Train Acc: 0.4202
Val Loss: 1.2984, Val Acc: 0.4091



Epoch 9: 100%|██████████| 20/20 [00:01<00:00, 12.39it/s]


Train Loss: 2.6763, Train Acc: 0.4226
Val Loss: 1.2941, Val Acc: 0.3896



Epoch 10: 100%|██████████| 20/20 [00:01<00:00, 12.29it/s]


Train Loss: 2.8144, Train Acc: 0.4349
Val Loss: 1.2938, Val Acc: 0.3636



Epoch 11: 100%|██████████| 20/20 [00:01<00:00, 12.44it/s]


Train Loss: 2.9803, Train Acc: 0.4332
Val Loss: 1.2903, Val Acc: 0.3636



Epoch 12: 100%|██████████| 20/20 [00:01<00:00, 11.90it/s]


Train Loss: 2.8918, Train Acc: 0.4381
Val Loss: 1.2863, Val Acc: 0.4026



Epoch 13: 100%|██████████| 20/20 [00:01<00:00, 12.41it/s]


Train Loss: 3.0893, Train Acc: 0.4308
Val Loss: 1.2822, Val Acc: 0.3961



Epoch 14: 100%|██████████| 20/20 [00:01<00:00, 12.51it/s]


Train Loss: 2.9196, Train Acc: 0.4397
Val Loss: 1.2775, Val Acc: 0.4091



Epoch 15: 100%|██████████| 20/20 [00:01<00:00, 12.31it/s]


Train Loss: 2.8342, Train Acc: 0.4357
Val Loss: 1.2779, Val Acc: 0.3247



Epoch 16: 100%|██████████| 20/20 [00:01<00:00, 12.85it/s]


Train Loss: 2.4624, Train Acc: 0.4397
Val Loss: 1.2746, Val Acc: 0.3506



Epoch 17: 100%|██████████| 20/20 [00:01<00:00, 12.44it/s]


Train Loss: 2.3187, Train Acc: 0.4650
Val Loss: 1.2735, Val Acc: 0.3182



Epoch 18: 100%|██████████| 20/20 [00:01<00:00, 12.82it/s]


Train Loss: 2.5657, Train Acc: 0.4658
Val Loss: 1.2731, Val Acc: 0.2597



Epoch 19: 100%|██████████| 20/20 [00:01<00:00, 12.48it/s]


Train Loss: 2.2013, Train Acc: 0.4894
Val Loss: 1.2727, Val Acc: 0.2403



Epoch 20: 100%|██████████| 20/20 [00:01<00:00, 12.50it/s]


Train Loss: 2.4498, Train Acc: 0.4634
Val Loss: 1.2719, Val Acc: 0.2468



### 2.2.3 Train LSTM

In [None]:
# ✅
BERT_MODEL     = "bert-base-uncased"
MAX_LEN        = 256
LSTM_HID_DIM   = 512
NUM_CLASSES    = 4
DROPOUT_PROB   = 0.2
NUM_LAYERS     = 3
BATCH_SIZE     = 16
EPOCHS         = 10
LR             = 2e-4

DEVICE = "cpu"
if torch.cuda.is_available(): 
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
    
label2idx = {
    "SUPPORTS":         0,
    "REFUTES":          1,
    "NOT_ENOUGH_INFO":  2,
    "DISPUTED":         3,
}

with open(train_claims_path, "r", encoding="utf-8") as f:
    train_claims = json.load(f)
with open(dev_claims_path, "r", encoding="utf-8") as f:
    dev_claims = json.load(f)
with open(evidence_path, "r", encoding="utf-8") as f:
    evidence_dict = json.load(f)

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

class ClaimEvidenceDataset(Dataset):
    def __init__(self, claims, evidences, tokenizer, max_len):
        self.items = []
        for cid, obj in claims.items():
            claim_text = obj["claim_text"]
            ev_ids     = obj.get("evidences", [])
            ev_texts   = [evidences[e] for e in ev_ids if e in evidences]
            full_input = claim_text + " [SEP] " + " ".join(ev_texts)
            label = label2idx[obj["claim_label"]]
            self.items.append((full_input, label))
        self.tokenizer = tokenizer
        self.max_len   = max_len

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        text, label = self.items[idx]
        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return (
            enc["input_ids"].squeeze(0),
            enc["attention_mask"].squeeze(0),
            torch.tensor(label, dtype=torch.long),
        )

def collate_batch(batch):
    ids, masks, labs = zip(*batch)
    return torch.stack(ids), torch.stack(masks), torch.stack(labs)

train_ds = ClaimEvidenceDataset(train_claims, evidence_dict, tokenizer, MAX_LEN)
dev_ds   = ClaimEvidenceDataset(dev_claims,   evidence_dict, tokenizer, MAX_LEN)

train_dl = DataLoader(
    train_ds, batch_size=BATCH_SIZE, shuffle=True,
    collate_fn=collate_batch, num_workers=0, pin_memory=True
)
dev_dl   = DataLoader(
    dev_ds,   batch_size=BATCH_SIZE, shuffle=False,
    collate_fn=collate_batch, num_workers=0, pin_memory=True
)

class BiLSTMWithBertEncoder(nn.Module):
    def __init__(self, bert_name, lstm_hid, num_classes, 
                 dropout_prob, lstm_layers):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_name)
        for p in self.bert.parameters():
            p.requires_grad = False

        bert_dim = self.bert.config.hidden_size

        self.dropout_bert = nn.Dropout(dropout_prob)

        self.lstm = nn.LSTM(
            input_size    = bert_dim,
            hidden_size   = lstm_hid,
            num_layers    = lstm_layers,
            batch_first   = True,
            bidirectional = True,
            dropout       = dropout_prob
        )

        self.attn_fc = nn.Linear(2 * lstm_hid, 1)

        self.dropout_pool = nn.Dropout(dropout_prob)

        self.classifier = nn.Linear(2 * lstm_hid, num_classes)

    def forward(self, input_ids, attention_mask):
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        seq_emb  = bert_out.last_hidden_state 
        seq_emb  = self.dropout_bert(seq_emb)

        lstm_out, _ = self.lstm(seq_emb)

        scores = self.attn_fc(lstm_out).squeeze(-1)
        scores = scores.masked_fill(attention_mask == 0, -1e9)
        alphas = torch.softmax(scores, dim=1)
        pooled = torch.sum(lstm_out * alphas.unsqueeze(-1), dim=1)

        pooled = self.dropout_pool(pooled)
        logits = self.classifier(pooled)
        return logits
    
model = BiLSTMWithBertEncoder(BERT_MODEL, LSTM_HID_DIM, NUM_CLASSES, DROPOUT_PROB, NUM_LAYERS)
model.to(DEVICE)

BiLSTMWithBertEncoder(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
# ✅
optimizer = torch.optim.Adam(model.classifier.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

best_acc = 0.0
BEST_MODEL_PATH = "task2_best_lstm.pt"

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0
    for input_ids, attn_mask, labels in tqdm(train_dl, desc=f"Train Epoch {epoch}"):
        input_ids = input_ids.to(DEVICE)
        attn_mask = attn_mask.to(DEVICE)
        labels    = labels.to(DEVICE)

        logits = model(input_ids, attn_mask)
        loss   = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dl)
    print(f"→ Epoch {epoch} Avg Loss: {avg_loss:.4f}")

    model.eval()
    correct = 0
    total   = 0
    with torch.no_grad():
        for input_ids, attn_mask, labels in tqdm(dev_dl, desc=" Eval"):
            input_ids = input_ids.to(DEVICE)
            attn_mask = attn_mask.to(DEVICE)
            labels    = labels.to(DEVICE)

            preds = model(input_ids, attn_mask).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total   += labels.size(0)

    acc = correct / total
    print(f"→ Dev Accuracy: {acc:.4%}")

    if acc > best_acc:
        best_acc = acc
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        print(f"✅ New best model saved (epoch {epoch}, acc {acc:.4%})\n")
    else:
        print()

print("""
Train Epoch 1: 100%|██████████| 77/77 [00:41<00:00,  1.87it/s]
→ Epoch 1 Avg Loss: 1.2819
 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.26it/s]
→ Dev Accuracy: 44.1558%
✅ New best model saved (epoch 1, acc 44.1558%)

Train Epoch 2: 100%|██████████| 77/77 [00:41<00:00,  1.87it/s]
→ Epoch 2 Avg Loss: 1.2243
 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.17it/s]
→ Dev Accuracy: 45.4545%
✅ New best model saved (epoch 2, acc 45.4545%)

Train Epoch 3: 100%|██████████| 77/77 [00:41<00:00,  1.88it/s]
→ Epoch 3 Avg Loss: 1.2018
 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.29it/s]
→ Dev Accuracy: 46.1039%
✅ New best model saved (epoch 3, acc 46.1039%)

Train Epoch 4: 100%|██████████| 77/77 [00:40<00:00,  1.91it/s]
→ Epoch 4 Avg Loss: 1.1856
 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.28it/s]
→ Dev Accuracy: 48.7013%
✅ New best model saved (epoch 4, acc 48.7013%)

Train Epoch 5: 100%|██████████| 77/77 [00:40<00:00,  1.91it/s]
→ Epoch 5 Avg Loss: 1.1661
 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.31it/s]
→ Dev Accuracy: 51.2987%
✅ New best model saved (epoch 5, acc 51.2987%)

Train Epoch 6: 100%|██████████| 77/77 [00:40<00:00,  1.91it/s]
→ Epoch 6 Avg Loss: 1.1500
 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.29it/s]
→ Dev Accuracy: 50.6494%

Train Epoch 7: 100%|██████████| 77/77 [00:40<00:00,  1.91it/s]
→ Epoch 7 Avg Loss: 1.1397
 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.31it/s]
→ Dev Accuracy: 51.9481%
✅ New best model saved (epoch 7, acc 51.9481%)

Train Epoch 8: 100%|██████████| 77/77 [00:40<00:00,  1.91it/s]
→ Epoch 8 Avg Loss: 1.1269
 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.29it/s]
→ Dev Accuracy: 53.2468%
✅ New best model saved (epoch 8, acc 53.2468%)

Train Epoch 9: 100%|██████████| 77/77 [00:40<00:00,  1.91it/s]
→ Epoch 9 Avg Loss: 1.1130
 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.28it/s]
→ Dev Accuracy: 54.5455%
✅ New best model saved (epoch 9, acc 54.5455%)

Train Epoch 10: 100%|██████████| 77/77 [00:40<00:00,  1.91it/s]
→ Epoch 10 Avg Loss: 1.1044
 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.30it/s]
→ Dev Accuracy: 53.8961%
""")

Train Epoch 1: 100%|██████████| 77/77 [00:41<00:00,  1.87it/s]


→ Epoch 1 Avg Loss: 1.2819


 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.26it/s]


→ Dev Accuracy: 44.1558%
✅ New best model saved (epoch 1, acc 44.1558%)



Train Epoch 2: 100%|██████████| 77/77 [00:41<00:00,  1.87it/s]


→ Epoch 2 Avg Loss: 1.2243


 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.17it/s]


→ Dev Accuracy: 45.4545%
✅ New best model saved (epoch 2, acc 45.4545%)



Train Epoch 3: 100%|██████████| 77/77 [00:41<00:00,  1.88it/s]


→ Epoch 3 Avg Loss: 1.2018


 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.29it/s]


→ Dev Accuracy: 46.1039%
✅ New best model saved (epoch 3, acc 46.1039%)



Train Epoch 4: 100%|██████████| 77/77 [00:40<00:00,  1.91it/s]


→ Epoch 4 Avg Loss: 1.1856


 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.28it/s]


→ Dev Accuracy: 48.7013%
✅ New best model saved (epoch 4, acc 48.7013%)



Train Epoch 5: 100%|██████████| 77/77 [00:40<00:00,  1.91it/s]


→ Epoch 5 Avg Loss: 1.1661


 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.31it/s]


→ Dev Accuracy: 51.2987%
✅ New best model saved (epoch 5, acc 51.2987%)



Train Epoch 6: 100%|██████████| 77/77 [00:40<00:00,  1.91it/s]


→ Epoch 6 Avg Loss: 1.1500


 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.29it/s]


→ Dev Accuracy: 50.6494%



Train Epoch 7: 100%|██████████| 77/77 [00:40<00:00,  1.91it/s]


→ Epoch 7 Avg Loss: 1.1397


 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.31it/s]


→ Dev Accuracy: 51.9481%
✅ New best model saved (epoch 7, acc 51.9481%)



Train Epoch 8: 100%|██████████| 77/77 [00:40<00:00,  1.91it/s]


→ Epoch 8 Avg Loss: 1.1269


 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.29it/s]


→ Dev Accuracy: 53.2468%
✅ New best model saved (epoch 8, acc 53.2468%)



Train Epoch 9: 100%|██████████| 77/77 [00:40<00:00,  1.91it/s]


→ Epoch 9 Avg Loss: 1.1130


 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.28it/s]


→ Dev Accuracy: 54.5455%
✅ New best model saved (epoch 9, acc 54.5455%)



Train Epoch 10: 100%|██████████| 77/77 [00:40<00:00,  1.91it/s]


→ Epoch 10 Avg Loss: 1.1044


 Eval: 100%|██████████| 10/10 [00:03<00:00,  3.30it/s]

→ Dev Accuracy: 53.8961%






In [None]:
model = BiLSTMWithBertEncoder(
    bert_name=BERT_MODEL,
    lstm_hid=LSTM_HID_DIM,
    num_classes=NUM_CLASSES,
    dropout_prob=DROPOUT_PROB,
    lstm_layers=NUM_LAYERS,
)
model.load_state_dict(torch.load("task2_best_lstm.pt", map_location=DEVICE))
model.to(DEVICE)
model.eval()

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

dev_ds = ClaimEvidenceDataset(dev_claims, evidence_dict, tokenizer, MAX_LEN)
dev_dl = DataLoader(
    dev_ds, batch_size=BATCH_SIZE, shuffle=False,
    collate_fn=collate_batch, num_workers=0, pin_memory=True
)

from sklearn.metrics import accuracy_score

model.eval()
preds, labels = [], []

with torch.no_grad():
    for input_ids, attn_mask, batch_labels in tqdm(dev_dl, desc="Evaluating best model"):
        input_ids = input_ids.to(DEVICE)
        attn_mask = attn_mask.to(DEVICE)
        batch_labels = batch_labels.to(DEVICE)

        logits = model(input_ids, attn_mask)
        pred = torch.argmax(logits, dim=1)

        preds.extend(pred.cpu().numpy())
        labels.extend(batch_labels.cpu().numpy())

acc = accuracy_score(labels, preds)
print(f"🎯 Reloaded Best Model Accuracy: {acc:.4%}")

Evaluating best model: 100%|██████████| 10/10 [00:03<00:00,  2.96it/s]

🎯 Reloaded Best Model Accuracy: 54.5455%





### 2.2.4 Ensembled Model Using Soft Vote

In [None]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LEN = 256
id2label = {0: "SUPPORTS", 1: "REFUTES", 2: "NOT_ENOUGH_INFO", 3: "DISPUTED"}
label2id = {v: k for k, v in id2label.items()}

with open(evidence_path, "r") as f:
    evidence_dict = json.load(f)

bert_model = BertForSequenceClassification.from_pretrained("model/my_bert_classifier").to(DEVICE)
bert_tokenizer = BertTokenizer.from_pretrained("model/my_bert_classifier")
bert_model.eval()

class BiLSTMWithBertEncoder(torch.nn.Module):
    def __init__(self, bert_name, lstm_hid, num_classes, dropout_prob, lstm_layers):
        super().__init__()
        from transformers import AutoModel
        self.bert = AutoModel.from_pretrained(bert_name)
        for p in self.bert.parameters():
            p.requires_grad = False
        bert_dim = self.bert.config.hidden_size
        self.dropout_bert = torch.nn.Dropout(dropout_prob)
        self.lstm = torch.nn.LSTM(
            input_size=bert_dim,
            hidden_size=lstm_hid,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout_prob
        )
        self.attn_fc = torch.nn.Linear(2 * lstm_hid, 1)
        self.dropout_pool = torch.nn.Dropout(dropout_prob)
        self.classifier = torch.nn.Linear(2 * lstm_hid, num_classes)

    def forward(self, input_ids, attention_mask):
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        seq_emb = self.dropout_bert(bert_out.last_hidden_state)
        lstm_out, _ = self.lstm(seq_emb)
        scores = self.attn_fc(lstm_out).squeeze(-1)
        scores = scores.masked_fill(attention_mask == 0, -1e9)
        alphas = torch.softmax(scores, dim=1)
        pooled = torch.sum(lstm_out * alphas.unsqueeze(-1), dim=1)
        pooled = self.dropout_pool(pooled)
        logits = self.classifier(pooled)
        return logits

BERT_MODEL = "bert-base-uncased"
LSTM_HID_DIM = 512
NUM_CLASSES = 4
DROPOUT_PROB = 0.2
NUM_LAYERS = 3

bilstm_model = BiLSTMWithBertEncoder(BERT_MODEL, LSTM_HID_DIM, NUM_CLASSES, DROPOUT_PROB, NUM_LAYERS).to(DEVICE)
bilstm_model.load_state_dict(torch.load("task2_best_model.pt", map_location=DEVICE))
bilstm_model.eval()
bilstm_tokenizer = bert_tokenizer

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LEN = 50
EMBED_DIM = 100
HIDDEN_DIM = 64
NUM_CLASSES = 4
DROPOUT_PROB = 0.4
vocab_size = 1

stopwords = set(nltk_stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word, 'v')
    return lemmatizer.lemmatize(lemma, 'n')

def preprocess(text, remove_stopwords=True, lemma=True, stem=False):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [t for t in tokens if re.match('^[a-zA-Z0-9-]+$', t)]
    if remove_stopwords:
        tokens = [t for t in tokens if t not in stopwords]
    if lemma:
        tokens = [lemmatize(t) for t in tokens]
    if stem:
        tokens = [stemmer.stem(t) for t in tokens]
    return ' '.join(tokens)

def text_to_seq(text):
    tokens = text.split()
    seq = [vocab.get(t, 0) for t in tokens]
    return seq + [0] * (MAX_LEN - len(seq)) if len(seq) < MAX_LEN else seq[:MAX_LEN]

class SelfAttentionPooling(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.attention = nn.Linear(input_dim, 1)

    def forward(self, x):
        weights = torch.softmax(self.attention(x), dim=1)
        pooled = torch.sum(weights * x, dim=1)
        return pooled

class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.embed_dropout = nn.Dropout(DROPOUT_PROB)
        self.rnn_claim = nn.RNN(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.rnn_evid = nn.RNN(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.rnn_dropout = nn.Dropout(DROPOUT_PROB)
        self.attention_claim = SelfAttentionPooling(hidden_dim * 2)
        self.attention_evid = SelfAttentionPooling(hidden_dim * 2)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 4, 128),
            nn.ReLU(),
            nn.Dropout(DROPOUT_PROB),
            nn.Linear(128, num_classes)
        )

    def forward(self, claim, evidence):
        claim_emb = self.embed_dropout(self.embedding(claim))
        evid_emb = self.embed_dropout(self.embedding(evidence))

        claim_out, _ = self.rnn_claim(claim_emb)
        evid_out, _ = self.rnn_evid(evid_emb)

        claim_out = self.rnn_dropout(claim_out)
        evid_out = self.rnn_dropout(evid_out)

        claim_pool = self.attention_claim(claim_out)
        evid_pool = self.attention_evid(evid_out)

        combined = torch.cat([claim_pool, evid_pool], dim=1)
        return self.classifier(combined)

with open(train_claims_path, "r", encoding="utf-8") as f:
    train_data = json.load(f)
with open(evidence_path, "r", encoding="utf-8") as f:
    evidence_dict = json.load(f)

all_text = []
for item in train_data.values():
    claim = preprocess(item["claim_text"])
    evids = ' '.join([evidence_dict.get(eid, '') for eid in item["evidences"]])
    ev_text = preprocess(evids)
    all_text.extend(claim.split() + ev_text.split())

token_counts = Counter(all_text)
vocab = {w: idx + 1 for idx, (w, _) in enumerate(token_counts.items())}
vocab_size = len(vocab) + 1

rnn_model = RNNModel(vocab_size, EMBED_DIM, HIDDEN_DIM, NUM_CLASSES).to(DEVICE)
rnn_model.load_state_dict(torch.load("rnn_model.pth", map_location=DEVICE))
rnn_model.eval()

with open("label_encoder.pkl", "rb") as f:
    label_enc = pickle.load(f)

def get_bert_probs(claim, evid_ids):
    evids = " ".join([evidence_dict.get(eid, "") for eid in evid_ids])
    inputs = bert_tokenizer(claim, evids, truncation=True, padding="max_length", max_length=MAX_LEN, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = bert_model(**inputs).logits
        probs = F.softmax(logits, dim=-1).cpu().numpy()[0]
    return np.array(probs)

def get_bilstm_probs(claim, evid_ids):
    evids = " ".join([evidence_dict.get(eid, "") for eid in evid_ids])
    inputs = bilstm_tokenizer(claim, evids, truncation=True, padding="max_length", max_length=MAX_LEN, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = bilstm_model(inputs["input_ids"], inputs["attention_mask"])
    return F.softmax(logits, dim=-1).cpu().numpy()[0]


def get_rnn_probs(claim, evid_ids):
    claim_text = preprocess(claim)
    evid_text = " ".join([evidence_dict.get(eid, "") for eid in evid_ids])
    evid_text = preprocess(evid_text)
    claim_seq = text_to_seq(claim_text)
    evid_seq = text_to_seq(evid_text)
    claim_tensor = torch.tensor([claim_seq], dtype=torch.long).to(DEVICE)
    evid_tensor = torch.tensor([evid_seq], dtype=torch.long).to(DEVICE)
    with torch.no_grad():
        logits = rnn_model(claim_tensor, evid_tensor)
        probs = F.softmax(logits, dim=-1).cpu().numpy()[0]
    return np.array(probs)

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Predict Task 1 on Dev Set

In [None]:
# ✅
word_embedding_path = './word_embedding/evidence_embeddings.npy'
word_embedding_meta_path = "./word_embedding/evidence_meta.pkl"

evidence_embeddings = np.load(word_embedding_path)

with open(word_embedding_meta_path, "rb") as f:
    evidence_ids, evidence_texts, original_evidence_ids = pickle.load(f)

arr = np.array(evidence_embeddings, dtype='float32', order='C')

dimension = evidence_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(np.array(evidence_embeddings, dtype='float32', order='C'))


def clean_claim(claim: str) -> str:
    claim = claim.lower()
    claim = re.sub(r'[^a-z0-9\s]', '', claim)
    claim = re.sub(r'\s+', ' ', claim).strip()
    return claim

evidence_dict = dict(zip(evidence_ids, evidence_texts))

with open(evidence_path, 'r') as f:
    original_evidence_dict = json.load(f)

def retrieve_evidence(claim_id, claim_data, retrieval=100, top_k=5):
    claim_text = claim_data["claim_text"]
    cleaned_claim = clean_claim(claim_text)

    claim_embedding = model.encode([cleaned_claim], convert_to_numpy=True, normalize_embeddings=True)
    scores, indices = index.search(claim_embedding, retrieval * 3)

    seen_original_ids = set()
    candidates = []
    for i in indices[0]:
        eid = evidence_ids[i]
        text = evidence_dict[eid]
        original_id = original_evidence_ids[i]

        if original_id not in seen_original_ids:
            candidates.append((original_id, eid, text))
            seen_original_ids.add(original_id)

        if len(candidates) >= retrieval:
            break

    pairs = [(claim_text, original_evidence_dict[orig_id]) for (orig_id, _, _) in candidates]  
    similarity_scores = reranker.predict(pairs)

    reranked = sorted(zip(candidates, similarity_scores), key=lambda x: x[1], reverse=True)

    top_k_original_ids = [orig_id for (orig_id, _, _), _ in reranked[:top_k]]

    result = {
        "claim_text": claim_text,
        "evidences": top_k_original_ids
    }

    return result

In [None]:
# ✅
with open(dev_claims_path, 'r') as f:
    dev_claims = json.load(f)

claim_ids = list(dev_claims.keys())

retrieval_values = [100,200,500,1000]
top_k_values = [3, 4, 5]

best_f1 = 0
best_setting = {}

for retrieval in retrieval_values:
    for top_k in top_k_values:
        recalls = []
        precisions = []
        f1s = []

        for cid in tqdm(claim_ids, desc=f"Evaluating R={retrieval}, K={top_k}"):
            truth = set(dev_claims[cid]["evidences"])
            
            retrieved_info = retrieve_evidence(cid, dev_claims[cid], retrieval=retrieval, top_k=top_k)
            retrieved = set(retrieved_info["evidences"])

            hit = len(truth & retrieved)

            recall = hit / len(truth) if len(truth) > 0 else 0
            precision = hit / top_k if top_k > 0 else 0

            if precision + recall > 0:
                f1 = 2 * precision * recall / (precision + recall)
            else:
                f1 = 0

            recalls.append(recall)
            precisions.append(precision)
            f1s.append(f1)

        avg_recall = np.mean(recalls)
        avg_precision = np.mean(precisions)
        avg_f1 = np.mean(f1s)

        print(f"\nRetrieval={retrieval}, Top-K={top_k}")
        print(f"   - Avg Recall   : {avg_recall:.2%}")
        print(f"   - Avg Precision: {avg_precision:.2%}")
        print(f"   - Avg F1       : {avg_f1:.2%}")

        if avg_f1 > best_f1:
            best_f1 = avg_f1
            best_setting = {'retrieval': retrieval, 'top_k': top_k}

print(f"\nBest Setting: Retrieval={best_setting['retrieval']}, Top-K={best_setting['top_k']}, F1={best_f1:.2%}")

print("""
Evaluating R=100, K=3: 100%|██████████| 154/154 [00:21<00:00,  7.14it/s]
Retrieval=100, Top-K=3
   - Avg Recall   : 22.71%
   - Avg Precision: 20.35%
   - Avg F1       : 19.85%
Evaluating R=100, K=4: 100%|██████████| 154/154 [00:21<00:00,  7.21it/s]
Retrieval=100, Top-K=4
   - Avg Recall   : 26.36%
   - Avg Precision: 18.02%
   - Avg F1       : 19.90%
Evaluating R=100, K=5: 100%|██████████| 154/154 [00:23<00:00,  6.63it/s]
Retrieval=100, Top-K=5
   - Avg Recall   : 27.86%
   - Avg Precision: 15.45%
   - Avg F1       : 18.59%
Evaluating R=200, K=3: 100%|██████████| 154/154 [00:38<00:00,  4.03it/s]
Retrieval=200, Top-K=3
   - Avg Recall   : 22.25%
   - Avg Precision: 19.70%
   - Avg F1       : 19.32%
Evaluating R=200, K=4: 100%|██████████| 154/154 [00:41<00:00,  3.74it/s]
Retrieval=200, Top-K=4
   - Avg Recall   : 25.65%
   - Avg Precision: 17.37%
   - Avg F1       : 19.25%
Evaluating R=200, K=5: 100%|██████████| 154/154 [00:43<00:00,  3.56it/s]
Retrieval=200, Top-K=5
   - Avg Recall   : 27.47%
   - Avg Precision: 15.06%
   - Avg F1       : 18.20%
Evaluating R=500, K=3: 100%|██████████| 154/154 [01:45<00:00,  1.46it/s]
Retrieval=500, Top-K=3
   - Avg Recall   : 20.89%
   - Avg Precision: 18.61%
   - Avg F1       : 18.25%
Evaluating R=500, K=4: 100%|██████████| 154/154 [01:49<00:00,  1.41it/s]
Retrieval=500, Top-K=4
   - Avg Recall   : 25.31%
   - Avg Precision: 16.88%
   - Avg F1       : 18.84%
Evaluating R=500, K=5: 100%|██████████| 154/154 [01:50<00:00,  1.39it/s]
Retrieval=500, Top-K=5
   - Avg Recall   : 26.56%
   - Avg Precision: 14.55%
   - Avg F1       : 17.57%
Evaluating R=1000, K=3: 100%|██████████| 154/154 [03:40<00:00,  1.43s/it]
Retrieval=1000, Top-K=3
   - Avg Recall   : 20.78%
   - Avg Precision: 18.61%
   - Avg F1       : 18.21%
Evaluating R=1000, K=4: 100%|██████████| 154/154 [03:41<00:00,  1.44s/it]
Retrieval=1000, Top-K=4
   - Avg Recall   : 25.18%
   - Avg Precision: 16.72%
   - Avg F1       : 18.69%
Evaluating R=1000, K=5: 100%|██████████| 154/154 [03:39<00:00,  1.42s/it]
Retrieval=1000, Top-K=5
   - Avg Recall   : 26.61%
   - Avg Precision: 14.55%
   - Avg F1       : 17.59%

Best Setting: Retrieval=100, Top-K=4, F1=19.90%
""")


Evaluating R=100, K=3: 100%|██████████| 154/154 [00:21<00:00,  7.14it/s]
Retrieval=100, Top-K=3
   - Avg Recall   : 22.71%
   - Avg Precision: 20.35%
   - Avg F1       : 19.85%
Evaluating R=100, K=4: 100%|██████████| 154/154 [00:21<00:00,  7.21it/s]
Retrieval=100, Top-K=4
   - Avg Recall   : 26.36%
   - Avg Precision: 18.02%
   - Avg F1       : 19.90%
Evaluating R=100, K=5: 100%|██████████| 154/154 [00:23<00:00,  6.63it/s]
Retrieval=100, Top-K=5
   - Avg Recall   : 27.86%
   - Avg Precision: 15.45%
   - Avg F1       : 18.59%
Evaluating R=200, K=3: 100%|██████████| 154/154 [00:38<00:00,  4.03it/s]
Retrieval=200, Top-K=3
   - Avg Recall   : 22.25%
   - Avg Precision: 19.70%
   - Avg F1       : 19.32%
Evaluating R=200, K=4: 100%|██████████| 154/154 [00:41<00:00,  3.74it/s]
Retrieval=200, Top-K=4
   - Avg Recall   : 25.65%
   - Avg Precision: 17.37%
   - Avg F1       : 19.25%
Evaluating R=200, K=5: 100%|██████████| 154/154 [00:43<00:00,  3.56it/s]
Retrieval=200, Top-K=5
   - Avg Recall   

## Predict Task 2 on Dev Set

In [None]:
hf_logging.set_verbosity_error()

with open(dev_claims_path, "r") as f:
    dev_claims = json.load(f)

true_labels = []
pred_labels = []

for cid, entry in tqdm(dev_claims.items(), desc="Ensemble Predicting"):
    claim_text = entry["claim_text"]
    evidence_ids = entry.get("evidences", [])
    true_label = label2id[entry["claim_label"]]

    p1 = get_bert_probs(claim_text, evidence_ids)
    p2 = get_bilstm_probs(claim_text, evidence_ids)
    p3 = get_rnn_probs(claim_text, evidence_ids)

    avg_probs = (p1 + p2 + p3) / 3
    pred_idx = int(np.argmax(avg_probs))

    true_labels.append(true_label)
    pred_labels.append(pred_idx)

acc = accuracy_score(true_labels, pred_labels)
print(f"✅ Ensemble Accuracy on Dev Set: {acc:.4f}")

Ensemble Predicting: 100%|██████████| 154/154 [00:17<00:00,  8.56it/s]

✅ Ensemble Accuracy on Dev Set: 0.5130





In [27]:
model = BiLSTMWithBertEncoder(
    bert_name=BERT_MODEL,
    lstm_hid=LSTM_HID_DIM,
    num_classes=NUM_CLASSES,
    dropout_prob=DROPOUT_PROB,
    lstm_layers=NUM_LAYERS,
)
model.load_state_dict(torch.load("task2_best_lstm.pt", map_location=DEVICE))
model.to(DEVICE)
model.eval()

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

dev_ds = ClaimEvidenceDataset(dev_claims, evidence_dict, tokenizer, MAX_LEN)
dev_dl = DataLoader(
    dev_ds, batch_size=BATCH_SIZE, shuffle=False,
    collate_fn=collate_batch, num_workers=0, pin_memory=True
)

from sklearn.metrics import accuracy_score

model.eval()
preds, labels = [], []

with torch.no_grad():
    for input_ids, attn_mask, batch_labels in tqdm(dev_dl, desc="Evaluating best model"):
        input_ids = input_ids.to(DEVICE)
        attn_mask = attn_mask.to(DEVICE)
        batch_labels = batch_labels.to(DEVICE)

        logits = model(input_ids, attn_mask)
        pred = torch.argmax(logits, dim=1)

        preds.extend(pred.cpu().numpy())
        labels.extend(batch_labels.cpu().numpy())

acc = accuracy_score(labels, preds)
print(f"🎯 Reloaded Best Model Accuracy: {acc:.4%}")

Evaluating best model: 100%|██████████| 10/10 [00:03<00:00,  3.26it/s]

🎯 Reloaded Best Model Accuracy: 54.5455%





## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*