In [1]:
import collections
import itertools
import json
import math
import multiprocessing as mp
import os
import random
import statistics
import time
from pathlib import Path
from typing import List, Dict

import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [2]:
DATA_DIR = "data"

def load_json(fname):
    path = os.path.join(DATA_DIR, fname)
    if not os.path.exists(path):
        print(f"[WARN] {path} not found, skip.")
        return None
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

train     = load_json("train-claims.json")
dev       = load_json("dev-claims.json")
test      = load_json("test-claims-unlabelled.json")
evidence  = load_json("evidence.json")

def claim_stats(claim_dict, split_name):
    if claim_dict is None:
        return {"split": split_name, "n_claims": 0}

    n_claims   = len(claim_dict)
    labels     = [v.get("claim_label") for v in claim_dict.values() if "claim_label" in v]
    ev_per_c   = [len(v.get("evidences", [])) for v in claim_dict.values()]
    return {
        "split": split_name,
        "# claims": n_claims,
        "avg #evidence": round(statistics.mean(ev_per_c), 2) if ev_per_c else 0,
        "label distribution": pd.Series(labels).value_counts().to_dict() if labels else {},
    }

summary = [
    claim_stats(train, "train"),
    claim_stats(dev,   "dev"),
    claim_stats(test,  "test")
]

if evidence is not None:
    token_lens = [len(passage.split()) for passage in evidence.values()]
    summary.append({
        "split": "evidence-corpus",
        "# claims": len(evidence),     
        "avg #evidence": f"{statistics.mean(token_lens):.1f} tokens",                  
        "label distribution": "-",                               
    })

print(pd.DataFrame(summary).to_string(index=False))

          split  # claims avg #evidence                                                         label distribution
          train      1228          3.36 {'SUPPORTS': 519, 'NOT_ENOUGH_INFO': 386, 'REFUTES': 199, 'DISPUTED': 124}
            dev       154          3.19     {'SUPPORTS': 68, 'NOT_ENOUGH_INFO': 41, 'REFUTES': 27, 'DISPUTED': 18}
           test       153             0                                                                         {}
evidence-corpus   1208827   19.7 tokens                                                                          -


In [None]:
# ------------------------------------------------------------------
# CONFIGURATION
# ------------------------------------------------------------------
DATA_DIR      = Path("data")
OUT_EVID      = Path("preprocessed/evidence_stemmed.json")
OUT_CLAIM     = Path("preprocessed/claims_stemmed.json")
FORCE_REBUILD = False                 # True → ignore cache, rebuild
BATCH_SIZE    = 1_000                 # spaCy batch size
NUM_PROC      = max(mp.cpu_count() - 1, 1)   # use all but 1 core

# ------------------------------------------------------------------
# INITIALISE SPACY & STEMMER
# ------------------------------------------------------------------
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
stemmer = PorterStemmer()
stop_set = set(stopwords.words("english")) - {"not", "no"}  # keep negations

def stem_doc(doc):
    """spaCy Doc → list of stems (alpha tokens, no stop-words)."""
    return [
        stemmer.stem(tok.text)
        for tok in doc
        if tok.text.isalpha() and tok.text not in stop_set
    ]

def jload(path: Path):
    with path.open(encoding="utf-8") as f:
        return json.load(f)

def jdump(obj, path: Path):
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False)

# ------------------------------------------------------------------
# 0) LOAD CACHE (IF PRESENT)
# ------------------------------------------------------------------
if OUT_EVID.exists() and OUT_CLAIM.exists() and not FORCE_REBUILD:
    t0 = time.time()
    evidence_proc  = jload(OUT_EVID)
    claim_proc_all = jload(OUT_CLAIM)
    print(f"Cached data loaded in {time.time() - t0:.2f} s – ready to use.")
    exit(0)

print("No valid cache – preprocessing will start …")
t_start = time.time()

# ------------------------------------------------------------------
# 1) PRE-PROCESS EVIDENCE (PARALLEL)
# ------------------------------------------------------------------
evidence_raw = jload(DATA_DIR / "evidence.json")
evid_ids     = list(evidence_raw.keys())
evid_texts   = list(evidence_raw.values())

evidence_proc = {}
lengths = []

print(f"Tokenising {len(evid_ids):,} evidence passages "
      f"with {NUM_PROC} CPU process(es)…")

for evid_id, doc in tqdm(
        zip(evid_ids,
            nlp.pipe(evid_texts,
                     batch_size=BATCH_SIZE,
                     n_process=NUM_PROC)),
        total=len(evid_ids),
        desc="Stemming evidence",
        unit="doc"
):
    stems = stem_doc(doc)
    if stems:
        evidence_proc[evid_id] = stems
        lengths.append(len(stems))

jdump(evidence_proc, OUT_EVID)
print(f"Evidence saved → {OUT_EVID.resolve()}")

# ------------------------------------------------------------------
# 2) PRE-PROCESS CLAIMS (PARALLEL, PER SPLIT)
# ------------------------------------------------------------------
claim_files = [
    "train-claims.json",
    "dev-claims.json",
    "test-claims-unlabelled.json",
]
claim_proc_all = {}

for fname in claim_files:
    raw_claims = jload(DATA_DIR / fname)
    cids  = list(raw_claims.keys())
    texts = [raw_claims[cid]["claim_text"] for cid in cids]

    for cid, doc in tqdm(
            zip(cids,
                nlp.pipe(texts,
                         batch_size=BATCH_SIZE,
                         n_process=NUM_PROC)),
            total=len(cids),
            desc=f"Stemming {fname}",
            unit="doc"
    ):
        claim_proc_all[cid] = stem_doc(doc)

jdump(claim_proc_all, OUT_CLAIM)
print(f"Claims saved → {OUT_CLAIM.resolve()}")

# ------------------------------------------------------------------
# 3) QUICK CORPUS STATISTICS
# ------------------------------------------------------------------
print("\n=== Evidence after stemming ===")
print(f"Total passages        : {len(evidence_proc):,}")
print(f"Stem length (min/max) : {min(lengths)} / {max(lengths)}")
print(f"Stem length (mean)    : {statistics.mean(lengths):.1f}")

vocab = {s for toks in evidence_proc.values() for s in toks}
print(f"Vocabulary size       : {len(vocab):,}")

counter = collections.Counter(s for toks in evidence_proc.values() for s in toks)
print("Top-20 stems          :", counter.most_common(20))

print(f"\nFinished in {time.time() - t_start:.1f} s – "
      f"results cached for future runs.")

Cached data loaded in 4.00 s – ready to use.
No valid cache – preprocessing will start …
Tokenising 1,208,827 evidence passages with 9 CPU process(es)…


Stemming evidence: 100%|██████████| 1208827/1208827 [07:15<00:00, 2775.40doc/s]


Evidence saved → /Users/felikskong/Desktop/NLP/NLP_Ass3/preprocessed/evidence_stemmed.json


Stemming train-claims.json: 100%|██████████| 1228/1228 [00:43<00:00, 28.17doc/s]
Stemming dev-claims.json: 100%|██████████| 154/154 [00:42<00:00,  3.64doc/s]
Stemming test-claims-unlabelled.json: 100%|██████████| 153/153 [00:42<00:00,  3.59doc/s]


Claims saved → /Users/felikskong/Desktop/NLP/NLP_Ass3/preprocessed/claims_stemmed.json

=== Evidence after stemming ===
Total passages        : 1,208,067
Stem length (min/max) : 1 / 314
Stem length (mean)    : 11.9
Vocabulary size       : 510,481
Top-20 stems          : [('the', 267774), ('it', 91702), ('he', 75901), ('also', 66963), ('in', 64095), ('state', 58250), ('born', 54299), ('first', 53537), ('one', 49589), ('new', 45425), ('year', 42117), ('play', 39752), ('american', 39704), ('includ', 39608), ('use', 39337), ('unit', 38930), ('nation', 37995), ('name', 37335), ('known', 36300), ('district', 34882)]

Finished in 573.0 s – results cached for future runs.


: 

In [2]:
DATA_DIR   = Path("data")
STEM_EVID  = Path("preprocessed/evidence_stemmed.json")
STEM_CLAIM = Path("preprocessed/claims_stemmed.json")

evidence = json.loads(STEM_EVID.read_text())
claims   = json.loads(STEM_CLAIM.read_text())

train_lbl = json.loads((DATA_DIR / "train-claims.json").read_text())
dev_lbl   = json.loads((DATA_DIR / "dev-claims.json").read_text())
MIN_FREQ = 3
PAD, UNK = "<PAD>", "<UNK>"

freq = collections.Counter(t for toks in
                           itertools.chain(evidence.values(), claims.values())
                           for t in toks)
itos = [PAD, UNK] + [t for t, c in freq.items() if c >= MIN_FREQ]
stoi = {t:i for i,t in enumerate(itos)}

def numerise(tokens: List[str]) -> List[int]:
    return [stoi.get(t, stoi[UNK]) for t in tokens]
class TripletDataset(Dataset):
    def __init__(self, labelled: Dict, evid_dict: Dict):
        evid_ids = list(evid_dict.keys())
        self.items = []                                # (cid, pos_id, neg_id)

        for cid, obj in labelled.items():
            # keep only gold evidences that survived preprocessing
            pos_ids = [eid for eid in obj["evidences"] if eid in evid_dict]
            if not pos_ids:           # drop the claim if none survive
                continue

            for pos_id in pos_ids:
                neg_id = random.choice(evid_ids)
                # guarantee positive ≠ negative
                while neg_id == pos_id:
                    neg_id = random.choice(evid_ids)
                self.items.append((cid, pos_id, neg_id))

        self.evid_dict = evid_dict

    def __len__(self): return len(self.items)

    def __getitem__(self, idx):
        cid, pos, neg = self.items[idx]
        return (numerise(claims[cid]),
                numerise(self.evid_dict[pos]),
                numerise(self.evid_dict[neg]))

def collate(batch):
    def pad(seqs):
        m = max(len(s) for s in seqs)
        return torch.tensor([s + [0]*(m-len(s)) for s in seqs])
    c, p, n = zip(*batch)
    return pad(c), pad(p), pad(n)

train_ds = TripletDataset(train_lbl, evidence)
train_dl = DataLoader(train_ds, batch_size=128,
                      shuffle=True, collate_fn=collate)
class BiLSTMSentenceEncoder(nn.Module):
    def __init__(self, vocab_sz, emb_dim=100, hid_dim=128):
        super().__init__()
        self.emb = nn.Embedding(vocab_sz, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hid_dim, batch_first=True,
                            bidirectional=True)
    def forward(self, x):                 # x: (B, L)
        mask = (x != 0).float()
        out, _ = self.lstm(self.emb(x))
        out = (out * mask.unsqueeze(-1)).sum(1) / mask.sum(1, keepdim=True)
        out = nn.functional.normalize(out, p=2, dim=-1)   # (B, 2*hid)
        return out
device = "cuda" if torch.cuda.is_available() else "cpu"
model  = BiLSTMSentenceEncoder(len(itos)).to(device)
optim  = torch.optim.Adam(model.parameters(), lr=3e-4)
margin = 0.3
loss_fn = nn.MarginRankingLoss(margin=margin)

for epoch in range(1):
    model.train(); total = 0
    pbar = tqdm(train_dl, desc=f"Epoch {epoch+1}")
    for c, p, n in pbar:
        c, p, n = (t.to(device) for t in (c, p, n))
        vc, vp, vn = model(c), model(p), model(n)
        pos_sim = (vc * vp).sum(1)
        neg_sim = (vc * vn).sum(1)
        loss = loss_fn(pos_sim, neg_sim,
                       torch.ones_like(pos_sim, device=device))
        optim.zero_grad(); loss.backward(); optim.step()
        total += loss.item()
        pbar.set_postfix(loss=total/(pbar.n+1))

Epoch 1: 100%|██████████| 33/33 [01:46<00:00,  3.22s/it, loss=0.103]


In [4]:
DEVICE = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
BATCH_SIZE    = 1_000
EVID_JSON = Path("preprocessed") / "evidence_stemmed.json"
with open(EVID_JSON, "r", encoding="utf-8") as f:
    evidence_proc = json.load(f)

class EvidenceDataset(Dataset):
    """Returns (evidence_id, token_id_tensor)."""
    def __init__(self, evid_dict, numerise_fn):
        self.ids       = list(evid_dict.keys())
        self.evid_dict = evid_dict
        self.numerise  = numerise_fn

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        eid       = self.ids[idx]
        token_ids = torch.tensor(self.numerise(self.evid_dict[eid]), dtype=torch.long)
        return eid, token_ids

def pad_collate(batch):
    """Pads a batch of (eid, seq) into (list_of_eids, padded_tensor)."""
    eids, seqs = zip(*batch)
    padded = pad_sequence(seqs, batch_first=True, padding_value=0)
    return list(eids), padded

# build the DataLoader with multiple workers
loader = DataLoader(
    EvidenceDataset(evidence_proc, numerise),
    batch_size   = BATCH_SIZE,
    shuffle      = False,
    collate_fn   = pad_collate,
    num_workers  = 0,        # ← single-process mode for notebooks
    pin_memory   = torch.cuda.is_available()
)

# encode all evidence in parallel batches
model.to(DEVICE).eval()
evidence_vecs = {}

with torch.no_grad():
    for eids, batch in tqdm(loader, desc="Encoding evidence", unit="batch"):
        # batch: (B, L)
        batch_vec = model(batch.to(DEVICE)).cpu()  # (B, hidden*2)
        for eid, vec in zip(eids, batch_vec):
            evidence_vecs[eid] = vec

Encoding evidence: 100%|██████████| 1209/1209 [01:02<00:00, 19.39batch/s]


In [10]:
# # ——— Demo on one dev claim ———
# demo_id = next(iter(dev_lbl))
# claim_stems = claims[demo_id]             # List[str]
# top5 = rank_evidence(claim_stems, top_k=5)
# print("TOP-5 evidence IDs:", top5)
# print("Gold               :", dev_lbl[demo_id]["evidences"])

In [11]:

DEVICE       = "cuda" if torch.cuda.is_available() else "cpu"
TOP_K        = 3

# 1) Load your pre-stemmed claims
CLAIMS_JSON  = Path("preprocessed") / "claims_stemmed.json"
with open(CLAIMS_JSON, "r", encoding="utf-8") as f:
    claim_proc_all = json.load(f)

# 2) Load your evidence vectors
EVID_VECS_PTH = Path("preprocessed") / "evidence_vecs.pt"
evid_vecs     = torch.load(EVID_VECS_PTH, map_location="cpu")

# 3) Make sure your model is on the same device
model.to(DEVICE)
model.eval()

# 4) rank_evidence uses that model & evid_vecs
# def rank_evidence(claim_tokens, top_k=TOP_K):
#     idxs = numerise(claim_tokens)
#     x    = torch.tensor([idxs], dtype=torch.long, device=DEVICE)
#     with torch.no_grad():
#         v_c = model(x).cpu().squeeze(0)
#     sims = {eid: float(torch.dot(v_c, v_e)) for eid, v_e in evid_vecs.items()}
#     return sorted(sims, key=sims.get, reverse=True)[:top_k]

def rank_evidence(claim_tokens: List[str], top_k: int = 5) -> List[str]:
    # 1) Numericise + to DEVICE
    idxs = numerise(claim_tokens)
    x = torch.tensor([idxs], dtype=torch.long, device=DEVICE)
    
    # 2) Encode and bring back to CPU
    with torch.no_grad():
        v_c = model(x)         # shape (1, D) on DEVICE
    v_c = v_c.cpu().squeeze(0)  # shape (D,) on CPU
    
    # 3) Compute dot-product (cosine since vectors are l2-normalised)
    sims = {}
    for eid, v_e in evid_vecs.items():
        # v_e is already a CPU tensor of shape (D,)
        sims[eid] = float(torch.dot(v_c, v_e))
    
    # 4) Sort by descending similarity and return top_k IDs
    ranked = sorted(sims, key=sims.get, reverse=True)[:top_k]
    return ranked

# 5) Run evaluation
with open("data/dev-claims.json", "r", encoding="utf-8") as f:
    dev_claims = json.load(f)

recalls, precisions, f1s = [], [], []
for cid, obj in tqdm(dev_claims.items(), desc="Evaluating"):
    gold      = set(obj["evidences"])
    stems     = claim_proc_all[cid]
    retrieved = rank_evidence(stems)

    hits      = len(gold & set(retrieved))
    recall    = hits / len(gold) if gold else 0.0
    precision = hits / TOP_K       # always divide by K
    f1        = (2 * recall * precision / (recall + precision)
                 if (recall + precision) > 0 else 0.0)

    recalls.append(recall)
    precisions.append(precision)
    f1s.append(f1)

print(f"\nRecall@{TOP_K}:    {np.mean(recalls):.2%}")
print(f"Precision@{TOP_K}: {np.mean(precisions):.2%}")
print(f"F1@{TOP_K}:        {np.mean(f1s):.2%}")

  evid_vecs     = torch.load(EVID_VECS_PTH, map_location="cpu")
Evaluating: 100%|██████████| 154/154 [04:09<00:00,  1.62s/it]


Recall@3:    0.00%
Precision@3: 0.00%
F1@3:        0.00%



