# 2025 COMP90042 Project
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 0.Environment
```bash
pip3 install tqdm 
pip3 install transformers 
pip install nltk tqdm
python -m nltk.downloader punkt stopwords
pip install spacy
python -m spacy download en_core_web_sm


```

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## 1.1 Data Loading

In [1]:
import os
import json
import statistics
import pandas as pd
from tqdm import tqdm

DATA_DIR = "data"

def load_json(fname):
    path = os.path.join(DATA_DIR, fname)
    if not os.path.exists(path):
        print(f"[WARN] {path} not found, skip.")
        return None
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

train     = load_json("train-claims.json")
dev       = load_json("dev-claims.json")
test      = load_json("test-claims-unlabelled.json")
evidence  = load_json("evidence.json")

def claim_stats(claim_dict, split_name):
    if claim_dict is None:
        return {"split": split_name, "n_claims": 0}

    n_claims   = len(claim_dict)
    labels     = [v.get("claim_label") for v in claim_dict.values() if "claim_label" in v]
    ev_per_c   = [len(v.get("evidences", [])) for v in claim_dict.values()]
    return {
        "split": split_name,
        "# claims": n_claims,
        "avg #evidence": round(statistics.mean(ev_per_c), 2) if ev_per_c else 0,
        "label distribution": pd.Series(labels).value_counts().to_dict() if labels else {},
    }

summary = [
    claim_stats(train, "train"),
    claim_stats(dev,   "dev"),
    claim_stats(test,  "test")
]

if evidence is not None:
    token_lens = [len(passage.split()) for passage in evidence.values()]
    summary.append({
        "split": "evidence-corpus",
        "# claims": len(evidence),     
        "avg #evidence": f"{statistics.mean(token_lens):.1f} tokens",                  
        "label distribution": "-",                               
    })

print(pd.DataFrame(summary).to_string(index=False))



          split  # claims avg #evidence                                                         label distribution
          train      1228          3.36 {'SUPPORTS': 519, 'NOT_ENOUGH_INFO': 386, 'REFUTES': 199, 'DISPUTED': 124}
            dev       154          3.19     {'SUPPORTS': 68, 'NOT_ENOUGH_INFO': 41, 'REFUTES': 27, 'DISPUTED': 18}
           test       153             0                                                                         {}
evidence-corpus   1208827   19.7 tokens                                                                          -


## 1.2 Data Preprocessing

In [2]:
# 多进程预处理数据，结果以json文件存在本地
"""
Stemming-based preprocessing for the retrieval task.
✓ Multi-process spaCy
✓ Porter stemming
✓ On-disk cache (evidence_stemmed.json / claims_stemmed.json)
"""

import json, statistics, collections, time, multiprocessing as mp
from pathlib import Path
from tqdm import tqdm

import spacy
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# ------------------------------------------------------------------
# CONFIGURATION
# ------------------------------------------------------------------
DATA_DIR      = Path("data")
OUT_EVID      = Path("preprocessed/evidence_stemmed.json")
OUT_CLAIM     = Path("preprocessed/claims_stemmed.json")
FORCE_REBUILD = True                 # True → ignore cache, rebuild
BATCH_SIZE    = 1_000                 # spaCy batch size
NUM_PROC      = max(mp.cpu_count() - 1, 1)   # use all but 1 core

# ------------------------------------------------------------------
# INITIALISE SPACY & STEMMER
# ------------------------------------------------------------------
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
stemmer = PorterStemmer()
stop_set = set(stopwords.words("english"))

def stem_doc(doc):
    out = []
    for tok in doc:
        lemma = tok.lemma_.lower()
        if lemma.isalpha() and lemma not in stop_set:
            out.append(stemmer.stem(lemma))
    return out

def jload(path: Path):
    with path.open(encoding="utf-8") as f:
        return json.load(f)

def jdump(obj, path: Path):
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False)

# ------------------------------------------------------------------
# 0) LOAD CACHE (IF PRESENT)
# ------------------------------------------------------------------
if OUT_EVID.exists() and OUT_CLAIM.exists() and not FORCE_REBUILD:
    t0 = time.time()
    evidence_proc  = jload(OUT_EVID)
    claim_proc_all = jload(OUT_CLAIM)
    print(f"Cached data loaded in {time.time() - t0:.2f} s – ready to use.")
    exit(0)

print("No valid cache – preprocessing will start …")
t_start = time.time()

# ------------------------------------------------------------------
# 1) PRE-PROCESS EVIDENCE (PARALLEL)
# ------------------------------------------------------------------
evidence_raw = jload(DATA_DIR / "evidence.json")
evid_ids     = list(evidence_raw.keys())
evid_texts   = list(evidence_raw.values())

evidence_proc = {}
lengths = []

print(f"Tokenising {len(evid_ids):,} evidence passages "
      f"with {NUM_PROC} CPU process(es)…")

for evid_id, doc in tqdm(
        zip(evid_ids,
            nlp.pipe(evid_texts,
                     batch_size=BATCH_SIZE,
                     n_process=NUM_PROC)),
        total=len(evid_ids),
        desc="Stemming evidence",
        unit="doc"
):
    stems = stem_doc(doc)
    if stems:
        evidence_proc[evid_id] = stems
        lengths.append(len(stems))

jdump(evidence_proc, OUT_EVID)
print(f"Evidence saved → {OUT_EVID.resolve()}")

# ------------------------------------------------------------------
# 2) PRE-PROCESS CLAIMS (PARALLEL, PER SPLIT)
# ------------------------------------------------------------------
claim_files = [
    "train-claims.json",
    "dev-claims.json",
    "test-claims-unlabelled.json",
]
claim_proc_all = {}

for fname in claim_files:
    raw_claims = jload(DATA_DIR / fname)
    cids  = list(raw_claims.keys())
    texts = [raw_claims[cid]["claim_text"] for cid in cids]

    for cid, doc in tqdm(
            zip(cids,
                nlp.pipe(texts,
                         batch_size=BATCH_SIZE,
                         n_process=NUM_PROC)),
            total=len(cids),
            desc=f"Stemming {fname}",
            unit="doc"
    ):
        stems = stem_doc(doc)
        if stems:
            claim_proc_all[cid] = stems

jdump(claim_proc_all, OUT_CLAIM)
print(f"Claims saved → {OUT_CLAIM.resolve()}")

# ------------------------------------------------------------------
# 3) QUICK CORPUS STATISTICS
# ------------------------------------------------------------------
print("\n=== Evidence after stemming ===")
print(f"Total passages        : {len(evidence_proc):,}")
print(f"Stem length (min/max) : {min(lengths)} / {max(lengths)}")
print(f"Stem length (mean)    : {statistics.mean(lengths):.1f}")

vocab = {s for toks in evidence_proc.values() for s in toks}
print(f"Vocabulary size       : {len(vocab):,}")

counter = collections.Counter(s for toks in evidence_proc.values() for s in toks)
print("Top-20 stems          :", counter.most_common(20))

print(f"\nFinished in {time.time() - t_start:.1f} s – "
      f"results cached for future runs.")

No valid cache – preprocessing will start …
Tokenising 1,208,827 evidence passages with 9 CPU process(es)…


Stemming evidence: 100%|██████████| 1208827/1208827 [07:30<00:00, 2685.52doc/s]


Evidence saved → /Users/felikskong/Desktop/NLP/NLP_Ass3/preprocessed/evidence_stemmed.json


Stemming train-claims.json: 100%|██████████| 1228/1228 [00:44<00:00, 27.34doc/s]
Stemming dev-claims.json: 100%|██████████| 154/154 [00:44<00:00,  3.45doc/s]
Stemming test-claims-unlabelled.json: 100%|██████████| 153/153 [00:44<00:00,  3.44doc/s]


Claims saved → /Users/felikskong/Desktop/NLP/NLP_Ass3/preprocessed/claims_stemmed.json

=== Evidence after stemming ===
Total passages        : 1,207,920
Stem length (min/max) : 1 / 304
Stem length (mean)    : 11.3
Vocabulary size       : 510,195
Top-20 stems          : [('also', 66963), ('state', 58250), ('bear', 56376), ('first', 53537), ('one', 49589), ('new', 44100), ('year', 42117), ('play', 39752), ('american', 39704), ('includ', 39608), ('use', 39337), ('unit', 38930), ('nation', 37995), ('name', 37335), ('know', 37286), ('district', 34882), ('two', 34481), ('film', 33964), ('counti', 32636), ('footbal', 31480)]

Finished in 593.3 s – results cached for future runs.


# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [4]:
import json, random, math, time, itertools, collections
from pathlib import Path
from typing import List, Dict

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

DATA_DIR   = Path("data")
STEM_EVID  = Path("preprocessed/evidence_stemmed.json")
STEM_CLAIM = Path("preprocessed/claims_stemmed.json")

# 加载数据
claim_tokens = json.loads(STEM_CLAIM.read_text())
evidence_tokens = json.loads(STEM_EVID.read_text())

train_lbl = json.loads((DATA_DIR / "train-claims.json").read_text())
dev_lbl   = json.loads((DATA_DIR / "dev-claims.json").read_text())

# 所有 evidence ID（用于采样负例）
all_evid_ids = list(evidence_tokens.keys())

# helper：构建正负样本
def build_dataset(claim_label_dict, mode="train", neg_ratio=1):
    dataset = []
    for cid, info in claim_label_dict.items():
        claim_tok = claim_tokens.get(cid)
        if not claim_tok:
            continue

        pos_evids = [eid for eid in info.get("evidences", []) if eid in evidence_tokens]
        for eid in pos_evids:
            dataset.append({
                "claim_id": cid,
                "evidence_id": eid,
                "claim_tokens": claim_tok,
                "evidence_tokens": evidence_tokens[eid],
                "label": 1
            })

        # 构造负例（从非相关的 evidence 中随机采样）
        if neg_ratio > 0:
            for _ in range(len(pos_evids) * neg_ratio):
                while True:
                    neg_eid = random.choice(all_evid_ids)
                    if neg_eid not in pos_evids and neg_eid in evidence_tokens:
                        dataset.append({
                            "claim_id": cid,
                            "evidence_id": neg_eid,
                            "claim_tokens": claim_tok,
                            "evidence_tokens": evidence_tokens[neg_eid],
                            "label": 0
                        })
                        break
    print(f"[{mode}] total samples: {len(dataset)} ({len(claim_label_dict)} claims)")
    return dataset

train_data = build_dataset(train_lbl, mode="train", neg_ratio=1)
dev_data   = build_dataset(dev_lbl, mode="dev", neg_ratio=1)

Path("cached").mkdir(exist_ok=True)
with open("cached/train_retrieval.json", "w") as f:
    json.dump(train_data, f, ensure_ascii=False)
with open("cached/dev_retrieval.json", "w") as f:
    json.dump(dev_data, f, ensure_ascii=False)

[train] total samples: 8242 (1228 claims)
[dev] total samples: 982 (154 claims)


In [8]:
from collections import Counter
import json

def build_vocab(data_list, min_freq=2):
    counter = Counter()
    for item in data_list:
        counter.update(item["claim_tokens"])
        counter.update(item["evidence_tokens"])
    vocab = {"<PAD>": 0, "<UNK>": 1}
    for token, freq in counter.items():
        if freq >= min_freq:
            vocab[token] = len(vocab)
    return vocab

import torch
from torch.utils.data import Dataset

class ClaimEvidenceDataset(Dataset):
    def __init__(self, data, vocab, max_len=100):
        self.data = data
        self.vocab = vocab
        self.max_len = max_len

    def encode(self, tokens):
        ids = [self.vocab.get(tok, self.vocab["<UNK>"]) for tok in tokens]
        if len(ids) > self.max_len:
            return ids[:self.max_len]
        return ids + [self.vocab["<PAD>"]] * (self.max_len - len(ids))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        claim_ids = self.encode(item["claim_tokens"])
        evid_ids = self.encode(item["evidence_tokens"])
        label = torch.tensor(item["label"], dtype=torch.float)
        return torch.tensor(claim_ids), torch.tensor(evid_ids), label
    
import torch.nn as nn

class SiameseLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_size=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_size, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_size * 4, 1)  # concat [c_avg, e_avg]
    
    def encode(self, x):
        embedded = self.embedding(x)  # (B, T, E)
        outputs, _ = self.lstm(embedded)  # (B, T, 2H)
        avg = outputs.mean(dim=1)  # average pooling
        return avg  # (B, 2H)

    def forward(self, claim, evid):
        c_repr = self.encode(claim)
        e_repr = self.encode(evid)
        x = torch.cat([c_repr, e_repr], dim=1)
        logits = self.fc(x).squeeze(1)  # (B,)
        return logits

In [14]:
import json, random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from tqdm import tqdm
from sklearn.metrics import accuracy_score, roc_auc_score
from pathlib import Path

# -----------------------------
# Load and preprocess data
# -----------------------------
train_data = json.load(open("cached/train_retrieval.json"))
dev_data   = json.load(open("cached/dev_retrieval.json"))

def build_vocab(data_list, min_freq=2):
    counter = Counter()
    for item in data_list:
        counter.update(item["claim_tokens"])
        counter.update(item["evidence_tokens"])
    vocab = {"<PAD>": 0, "<UNK>": 1}
    for token, freq in counter.items():
        if freq >= min_freq:
            vocab[token] = len(vocab)
    return vocab

vocab = build_vocab(train_data + dev_data)

class ClaimEvidenceDataset(Dataset):
    def __init__(self, data, vocab, max_len=100):
        self.data = data
        self.vocab = vocab
        self.max_len = max_len

    def encode(self, tokens):
        ids = [self.vocab.get(tok, self.vocab["<UNK>"]) for tok in tokens]
        if len(ids) > self.max_len:
            return ids[:self.max_len]
        return ids + [self.vocab["<PAD>"]] * (self.max_len - len(ids))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        claim_ids = self.encode(item["claim_tokens"])
        evid_ids  = self.encode(item["evidence_tokens"])
        label = torch.tensor(item["label"], dtype=torch.float)
        return torch.tensor(claim_ids), torch.tensor(evid_ids), label

train_dataset = ClaimEvidenceDataset(train_data, vocab)
dev_dataset   = ClaimEvidenceDataset(dev_data, vocab)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
dev_loader   = DataLoader(dev_dataset, batch_size=64)

# -----------------------------
# Model
# -----------------------------
class SiameseLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_size=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_size, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_size * 4, 1)
    
    def encode(self, x):
        embedded = self.embedding(x)
        outputs, _ = self.lstm(embedded)
        avg = outputs.mean(dim=1)
        return avg

    def forward(self, claim, evid):
        c_repr = self.encode(claim)
        e_repr = self.encode(evid)
        x = torch.cat([c_repr, e_repr], dim=1)
        logits = self.fc(x).squeeze(1)
        return logits

# -----------------------------
# Train & Eval Functions
# -----------------------------
def train_one_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    all_preds, all_labels = [], []

    for claim, evid, label in tqdm(dataloader, desc="Training", leave=False):
        claim, evid, label = claim.to(device), evid.to(device), label.to(device)
        logits = model(claim, evid)
        loss = F.binary_cross_entropy_with_logits(logits, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        all_preds.extend(torch.sigmoid(logits).detach().cpu().numpy())
        all_labels.extend(label.cpu().numpy())

    preds_binary = [1 if p > 0.5 else 0 for p in all_preds]
    acc = accuracy_score(all_labels, preds_binary)
    auc = roc_auc_score(all_labels, all_preds)
    return total_loss / len(dataloader), acc, auc

def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for claim, evid, label in tqdm(dataloader, desc="Evaluating", leave=False):
            claim, evid, label = claim.to(device), evid.to(device), label.to(device)
            logits = model(claim, evid)
            loss = F.binary_cross_entropy_with_logits(logits, label)

            total_loss += loss.item()
            all_preds.extend(torch.sigmoid(logits).cpu().numpy())
            all_labels.extend(label.cpu().numpy())

    preds_binary = [1 if p > 0.5 else 0 for p in all_preds]
    acc = accuracy_score(all_labels, preds_binary)
    auc = roc_auc_score(all_labels, all_preds)
    return total_loss / len(dataloader), acc, auc

# -----------------------------
# Training Loop
# -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SiameseLSTM(vocab_size=len(vocab)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

NUM_EPOCHS = 1
best_auc = 0.0

for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")
    train_loss, train_acc, train_auc = train_one_epoch(model, train_loader, optimizer, device)
    print(f"Train loss: {train_loss:.4f} | acc: {train_acc:.4f} | auc: {train_auc:.4f}")

    dev_loss, dev_acc, dev_auc = evaluate(model, dev_loader, device)
    print(f" Dev  loss: {dev_loss:.4f} | acc: {dev_acc:.4f} | auc: {dev_auc:.4f}")

    # 保存最佳模型
    if dev_auc > best_auc:
        best_auc = dev_auc
        torch.save(model.state_dict(), "best_model.pt")
        print(f"✅ Best model saved with dev AUC: {best_auc:.4f}")

with open("cached/vocab.json", "w") as f:
    json.dump(vocab, f)


Epoch 1/1


                                                           

Train loss: 0.4656 | acc: 0.7923 | auc: 0.8713


                                                           

 Dev  loss: 0.3707 | acc: 0.8585 | auc: 0.9389
✅ Best model saved with dev AUC: 0.9389




In [1]:
import json, random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from tqdm import tqdm
from sklearn.metrics import accuracy_score, roc_auc_score
from pathlib import Path

In [2]:
import json
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset

# -----------------------------
# Load data
# -----------------------------
with open("preprocessed/evidence_stemmed.json") as f:
    evidence_db = json.load(f)
with open("preprocessed/claims_stemmed.json") as f:
    claim_tokens = json.load(f)
with open("data/dev-claims.json") as f:
    dev_claims = json.load(f)
with open("cached/vocab.json") as f:
    vocab = json.load(f)
vocab = {k: int(v) for k, v in vocab.items()}

# -----------------------------
# Utilities
# -----------------------------
def encode_tokens(tokens, vocab, max_len=100):
    ids = [vocab.get(tok, vocab["<UNK>"]) for tok in tokens]
    return ids[:max_len] + [vocab["<PAD>"]] * (max_len - len(ids))

# -----------------------------
# Model
# -----------------------------
class SiameseLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_size=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_size, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_size * 4, 1)

    def encode(self, x):
        emb = self.embedding(x)
        out, _ = self.lstm(emb)
        return out.mean(dim=1)

    def forward(self, claim, evid):
        c = self.encode(claim)
        e = self.encode(evid)
        x = torch.cat([c, e], dim=1)
        return self.fc(x).squeeze(1)

# -----------------------------
# Load model
# -----------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SiameseLSTM(len(vocab)).to(device)
model.load_state_dict(torch.load("best_model.pt", map_location=device))
model.eval()

# -----------------------------
# Retrieval function (batched)
# -----------------------------
def retrieve_evidence(claim_text, top_k=5, batch_size=512, max_len=100):
    with torch.no_grad():
        cid = next((k for k, v in dev_claims.items() if v["claim_text"] == claim_text), None)
        if cid is None or cid not in claim_tokens:
            return []

        c_tok = claim_tokens[cid]
        c_enc = encode_tokens(c_tok, vocab, max_len)
        c_tensor = torch.tensor(c_enc).unsqueeze(0).to(device)

        evidence_ids = list(evidence_db.keys())
        evidence_encs = [encode_tokens(evidence_db[eid], vocab, max_len) for eid in evidence_ids]

        scores = []
        for i in range(0, len(evidence_encs), batch_size):
            batch_e = evidence_encs[i:i+batch_size]
            e_tensor = torch.tensor(batch_e).to(device)
            c_batch = c_tensor.repeat(e_tensor.size(0), 1)

            logits = model(c_batch, e_tensor)
            probs = torch.sigmoid(logits).cpu().numpy()

            for j, score in enumerate(probs):
                scores.append((evidence_ids[i + j], score))

        return sorted(scores, key=lambda x: x[1], reverse=True)[:top_k]

# -----------------------------
# Evaluate on dev set
# -----------------------------
top_k = 5
recalls, precisions, f1s = [], [], []

for cid in tqdm(dev_claims, desc="Evaluating"):
    truth = set(dev_claims[cid]["evidences"])
    claim_text = dev_claims[cid]["claim_text"]
    retrieved = [eid for eid, _ in retrieve_evidence(claim_text, top_k)]

    hit = len(truth & set(retrieved))
    recall = hit / len(truth) if truth else 0
    precision = hit / top_k
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

    recalls.append(recall)
    precisions.append(precision)
    f1s.append(f1)

# -----------------------------
# Print results
# -----------------------------
print(f"\nAverage Recall@{top_k}:    {np.mean(recalls):.2%}")
print(f"Average Precision@{top_k}: {np.mean(precisions):.2%}")
print(f"Average F1@{top_k}:        {np.mean(f1s):.2%}")

  model.load_state_dict(torch.load("best_model.pt", map_location=device))
Evaluating:   0%|          | 0/154 [40:50<?, ?it/s]


KeyboardInterrupt: 

# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*