In [1]:
import os
import json
import re
from pathlib import Path
import spacy
import nltk
from collections import Counter
from lemminflect import getAllInflections
from nltk.tokenize import sent_tokenize
# If you haven’t already downloaded these:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/felikskong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felikskong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# ── 0) Paths ───────────────────────────────────────────────────────────────
train_claims_path = "data/train-claims.json"
evidence_path     = "data/evidence.json"
output_path       = Path("preprocessed") / "climate_evidence.json"

# ── 1) Load data ───────────────────────────────────────────────────────────
with open(train_claims_path, "r", encoding="utf-8") as f:
    train_claims = json.load(f)

with open(evidence_path, "r", encoding="utf-8") as f:
    evidence_dict = json.load(f)

# ── 2) Extract top-100 nouns from train claims ───────────────────────────────
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
all_nouns = []
for claim in train_claims.values():
    doc = nlp(claim["claim_text"])
    all_nouns.extend([tok.lemma_.lower() for tok in doc if tok.pos_ == "NOUN"])

top_keywords = set(w for w, _ in Counter(all_nouns).most_common(100))

# ── 3) Expand each keyword to all its noun inflections ───────────────────────
all_forms = set()
for lemma in top_keywords:
    all_forms.add(lemma)
    infl_map = getAllInflections(lemma, upos="NOUN") or {}
    for forms in infl_map.values():
        all_forms.update(forms)

# ── 4) Define filtering functions ───────────────────────────────────────────
def is_english(text: str, threshold: float = 0.5) -> bool:
    """Rough check: at least `threshold` fraction of chars must be A–Z/a–z."""
    cleaned = re.sub(r"[^A-Za-z\s]", "", text)
    if not cleaned:
        return False
    alpha_count = sum(ch.isalpha() for ch in cleaned)
    return (alpha_count / len(cleaned)) >= threshold

def contains_climate_keywords(text: str, forms: set) -> bool:
    """True if any token in text (ASCII, a–z only) is in our all_forms set."""
    words = re.findall(r"\b[a-z']+\b", text.lower())
    return any(w in forms for w in words)

# ── 5) Filter the evidence ─────────────────────────────────────────────────
#  5.1 Keep only English passages
english_pairs = [
    (eid, txt)
    for eid, txt in evidence_dict.items()
    if is_english(txt)
]

#  5.2 Among those, keep only climate-related ones
climate_pairs = [
    (eid, txt)
    for eid, txt in english_pairs
    if contains_climate_keywords(txt, all_forms)
]

print(f"Step1: English keep {len(english_pairs)}/{len(evidence_dict)}")
print(f"Step2: Climate-related keep {len(climate_pairs)}/{len(english_pairs)}")

# ── 6) Write out climate-related evidence ───────────────────────────────────
output_path.parent.mkdir(parents=True, exist_ok=True)
climate_evidence = {eid: txt for eid, txt in climate_pairs}

with open(output_path, "w", encoding="utf-8") as outf:
    json.dump(climate_evidence, outf, ensure_ascii=False, indent=2)

print(f"✅ Saved {len(climate_evidence)} passages to {output_path}")

Step1: English keep 1207838/1208827
Step2: Climate-related keep 385471/1207838
✅ Saved 385471 passages to preprocessed/climate_evidence.json


In [7]:
# ───────────────────────────────────────────────────────────────────────────────
# Full Training Script for BiLSTM+Frozen-BERT Classification in one Notebook Cell
# ───────────────────────────────────────────────────────────────────────────────

import os
import json
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from tqdm.notebook import tqdm

# ───────────────────────────────────────────────────────────────────────────────
# 0) Config & Paths
# ───────────────────────────────────────────────────────────────────────────────
DATA_DIR       = Path("data")
TRAIN_AUG_JSON = DATA_DIR / "train-claims-augmented.json"
DEV_JSON       = DATA_DIR / "dev-claims.json"
EVID_JSON      = DATA_DIR / "evidence.json"

BERT_MODEL     = "bert-base-uncased"
MAX_LEN        = 256
LSTM_HID_DIM   = 128
NUM_CLASSES    = 4
BATCH_SIZE     = 16
EPOCHS         = 1
LR             = 2e-4

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Label ↔ index map
label2idx = {
    "SUPPORTS":         0,
    "NOT_ENOUGH_INFO":  1,
    "REFUTES":          2,
    "DISPUTED":         3,
}

# ───────────────────────────────────────────────────────────────────────────────
# 1) Load JSON data
# ───────────────────────────────────────────────────────────────────────────────
with open(TRAIN_AUG_JSON, "r", encoding="utf-8") as f:
    train_claims = json.load(f)
with open(DEV_JSON, "r", encoding="utf-8") as f:
    dev_claims = json.load(f)
with open(EVID_JSON, "r", encoding="utf-8") as f:
    evidence_dict = json.load(f)

# ───────────────────────────────────────────────────────────────────────────────
# 2) Dataset + DataLoader (num_workers=0 to avoid pickling errors)
# ───────────────────────────────────────────────────────────────────────────────
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

class ClaimEvidenceDataset(Dataset):
    def __init__(self, claims, evidences, tokenizer, max_len):
        self.items = []
        for cid, obj in claims.items():
            claim_text = obj["claim_text"]
            ev_ids     = obj.get("evidences", [])
            ev_texts   = [evidences[e] for e in ev_ids if e in evidences]
            # full sequence: claim [SEP] evidence1 evidence2 ...
            full_input = claim_text + " [SEP] " + " ".join(ev_texts)
            label = label2idx[obj["claim_label"]]
            self.items.append((full_input, label))
        self.tokenizer = tokenizer
        self.max_len   = max_len

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        text, label = self.items[idx]
        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return (
            enc["input_ids"].squeeze(0),
            enc["attention_mask"].squeeze(0),
            torch.tensor(label, dtype=torch.long),
        )

def collate_batch(batch):
    ids, masks, labs = zip(*batch)
    return torch.stack(ids), torch.stack(masks), torch.stack(labs)

# create datasets and loaders
train_ds = ClaimEvidenceDataset(train_claims, evidence_dict, tokenizer, MAX_LEN)
dev_ds   = ClaimEvidenceDataset(dev_claims,   evidence_dict, tokenizer, MAX_LEN)

train_dl = DataLoader(
    train_ds, batch_size=BATCH_SIZE, shuffle=True,
    collate_fn=collate_batch, num_workers=0, pin_memory=True
)
dev_dl   = DataLoader(
    dev_ds,   batch_size=BATCH_SIZE, shuffle=False,
    collate_fn=collate_batch, num_workers=0, pin_memory=True
)

# ───────────────────────────────────────────────────────────────────────────────
# 3) Model Definition
# ───────────────────────────────────────────────────────────────────────────────
class BiLSTMWithBertEncoder(nn.Module):
    def __init__(self, bert_name, lstm_hid, num_classes):
        super().__init__()
        # 1) BERT (frozen)
        self.bert = BertModel.from_pretrained(bert_name)
        for p in self.bert.parameters():
            p.requires_grad = False

        bert_dim = self.bert.config.hidden_size
        # 2) BiLSTM on top
        self.lstm = nn.LSTM(
            input_size    = bert_dim,
            hidden_size   = lstm_hid,
            num_layers    = 1,
            batch_first   = True,
            bidirectional = True
        )
        # 3) classification head
        self.classifier = nn.Linear(2 * lstm_hid, num_classes)

    def forward(self, input_ids, attention_mask):
        # a) BERT encoding
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        seq_emb = out.last_hidden_state        # (B, L, D)
        # b) BiLSTM
        lstm_out, _ = self.lstm(seq_emb)       # (B, L, 2H)
        # c) masked mean-pooling
        mask   = attention_mask.unsqueeze(-1).float()  # (B, L, 1)
        summed = (lstm_out * mask).sum(1)              # (B, 2H)
        lens   = mask.sum(1)                           # (B, 1)
        pooled = summed / lens                         # (B, 2H)
        # d) classifier
        return self.classifier(pooled)                 # (B, num_classes)

model     = BiLSTMWithBertEncoder(BERT_MODEL, LSTM_HID_DIM, NUM_CLASSES)
model.to(DEVICE)

# ───────────────────────────────────────────────────────────────────────────────
# 4) Training Loop
# ───────────────────────────────────────────────────────────────────────────────
optimizer = torch.optim.Adam(model.classifier.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0.0
    for input_ids, attn_mask, labels in tqdm(train_dl, desc=f"Train Epoch {epoch}"):
        input_ids = input_ids.to(DEVICE)
        attn_mask = attn_mask.to(DEVICE)
        labels    = labels.to(DEVICE)

        logits = model(input_ids, attn_mask)
        loss   = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"→ Epoch {epoch} Avg Loss: {total_loss/len(train_dl):.4f}")

    # eval on dev
    model.eval()
    correct = 0; total = 0
    with torch.no_grad():
        for input_ids, attn_mask, labels in tqdm(dev_dl, desc=" Eval"):
            input_ids = input_ids.to(DEVICE)
            attn_mask = attn_mask.to(DEVICE)
            labels    = labels.to(DEVICE)

            preds = model(input_ids, attn_mask).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total   += labels.size(0)

    print(f"→ Dev Accuracy: {correct/total:.4%}\n")

Train Epoch 1:   0%|          | 0/130 [00:00<?, ?it/s]

→ Epoch 1 Avg Loss: 1.3850


 Eval:   0%|          | 0/10 [00:00<?, ?it/s]

→ Dev Accuracy: 37.6623%



In [10]:
import json
import torch
from pathlib import Path
from tqdm.notebook import tqdm

# 假设 以下 变量/对象 已在当前 scope 中 定义好：
#   model, tokenizer, MAX_LEN, DEVICE, label2idx

# 1) 读入 test 集（带 top-k evidence IDs） & 全部 evidence 文本
TEST_JSON     = Path("test-claims-predictions_top3.json")
EVIDENCE_JSON = Path("data") / "evidence.json"

test_claims   = json.loads(TEST_JSON.read_text(encoding="utf-8"))
evidence_dict = json.loads(EVIDENCE_JSON.read_text(encoding="utf-8"))

# 2) 构造 idx→label 映射
idx2label = {v:k for k,v in label2idx.items()}

# 3) 推断并写入结果
model.eval()
output = {}

with torch.no_grad():
    for cid, obj in tqdm(test_claims.items(), desc="Predicting labels"):
        claim_text = obj["claim_text"]
        # 直接保留所有原始 evidence IDs（不做丢弃）
        ev_ids = obj.get("evidence", [])

        # 如果还要拿文本去做前向编码，就用 .get()，不会抛 KeyError
        ev_texts = [ evidence_dict.get(eid, "") for eid in ev_ids ]

        # 用 tokenizer 的双句接口（自动在中间插 [SEP]）
        enc = tokenizer(
            claim_text,
            " ".join(ev_texts),
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN,
            return_tensors="pt"
        )
        input_ids      = enc["input_ids"].to(DEVICE)
        attention_mask = enc["attention_mask"].to(DEVICE)

        # forward + argmax → label idx
        logits = model(input_ids, attention_mask)  # (1, num_classes)
        pred   = logits.argmax(dim=-1).item()
        label  = idx2label[pred]

        # 保存：claim_text, 预测 label, **原样保留** evidence IDs 列表
        output[cid] = {
            "claim_text":  claim_text,
            "claim_label": label,
            "evidences":   ev_ids
        }

# 4) 写入 test-output.json
OUT_JSON = Path("test-output.json")
with OUT_JSON.open("w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=2)

print(f"✅ Written predictions to {OUT_JSON.resolve()}")

Predicting labels:   0%|          | 0/153 [00:00<?, ?it/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


✅ Written predictions to /Users/felikskong/Desktop/NLP/NLP_Ass3/test-output.json
