In [1]:
import os
import json
import re
from pathlib import Path
import spacy
import nltk
from collections import Counter
from lemminflect import getAllInflections
from nltk.tokenize import sent_tokenize
# If you haven’t already downloaded these:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/felikskong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/felikskong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# ── 0) Paths ───────────────────────────────────────────────────────────────
train_claims_path = "data/train-claims.json"
evidence_path     = "data/evidence.json"
output_path       = Path("preprocessed") / "climate_evidence.json"

# ── 1) Load data ───────────────────────────────────────────────────────────
with open(train_claims_path, "r", encoding="utf-8") as f:
    train_claims = json.load(f)

with open(evidence_path, "r", encoding="utf-8") as f:
    evidence_dict = json.load(f)

# ── 2) Extract top-100 nouns from train claims ───────────────────────────────
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
all_nouns = []
for claim in train_claims.values():
    doc = nlp(claim["claim_text"])
    all_nouns.extend([tok.lemma_.lower() for tok in doc if tok.pos_ == "NOUN"])

top_keywords = set(w for w, _ in Counter(all_nouns).most_common(100))

# ── 3) Expand each keyword to all its noun inflections ───────────────────────
all_forms = set()
for lemma in top_keywords:
    all_forms.add(lemma)
    infl_map = getAllInflections(lemma, upos="NOUN") or {}
    for forms in infl_map.values():
        all_forms.update(forms)

# ── 4) Define filtering functions ───────────────────────────────────────────
def is_english(text: str, threshold: float = 0.5) -> bool:
    """Rough check: at least `threshold` fraction of chars must be A–Z/a–z."""
    cleaned = re.sub(r"[^A-Za-z\s]", "", text)
    if not cleaned:
        return False
    alpha_count = sum(ch.isalpha() for ch in cleaned)
    return (alpha_count / len(cleaned)) >= threshold

def contains_climate_keywords(text: str, forms: set) -> bool:
    """True if any token in text (ASCII, a–z only) is in our all_forms set."""
    words = re.findall(r"\b[a-z']+\b", text.lower())
    return any(w in forms for w in words)

# ── 5) Filter the evidence ─────────────────────────────────────────────────
#  5.1 Keep only English passages
english_pairs = [
    (eid, txt)
    for eid, txt in evidence_dict.items()
    if is_english(txt)
]

#  5.2 Among those, keep only climate-related ones
climate_pairs = [
    (eid, txt)
    for eid, txt in english_pairs
    if contains_climate_keywords(txt, all_forms)
]

print(f"Step1: English keep {len(english_pairs)}/{len(evidence_dict)}")
print(f"Step2: Climate-related keep {len(climate_pairs)}/{len(english_pairs)}")

# ── 6) Write out climate-related evidence ───────────────────────────────────
output_path.parent.mkdir(parents=True, exist_ok=True)
climate_evidence = {eid: txt for eid, txt in climate_pairs}

with open(output_path, "w", encoding="utf-8") as outf:
    json.dump(climate_evidence, outf, ensure_ascii=False, indent=2)

print(f"✅ Saved {len(climate_evidence)} passages to {output_path}")

Step1: English keep 1207838/1208827
Step2: Climate-related keep 385471/1207838
✅ Saved 385471 passages to preprocessed/climate_evidence.json


In [5]:
# ───────────────────────────────────────────────────────────────────────────────
# Full Training Script for BiLSTM+Frozen-BERT Classification in one Notebook Cell
# ───────────────────────────────────────────────────────────────────────────────

import os
import json
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from tqdm.notebook import tqdm

# ───────────────────────────────────────────────────────────────────────────────
# 0) Config & Paths
# ───────────────────────────────────────────────────────────────────────────────
DATA_DIR       = Path("data")
# TRAIN_JSON     = DATA_DIR / "combined-claims.json"
# DEV_JSON       = DATA_DIR / "groundtruth_output.json"
TRAIN_JSON     = DATA_DIR / "train-claims-augmented.json"
DEV_JSON       = DATA_DIR / "dev-claims.json"
EVID_JSON      = DATA_DIR / "evidence.json"

BERT_MODEL     = "bert-base-uncased"
MAX_LEN        = 256
LSTM_HID_DIM   = 512
NUM_CLASSES    = 4
DROPOUT_PROB   = 0.2
NUM_LAYERS     = 3
BATCH_SIZE     = 16
EPOCHS         = 5
LR             = 2e-4

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Label ↔ index map
label2idx = {
    "SUPPORTS":         0,
    "NOT_ENOUGH_INFO":  1,
    "REFUTES":          2,
    "DISPUTED":         3,
}

  Referenced from: <CFED5F8E-EC3F-36FD-AAA3-2C6C7F8D3DD9> /Users/felikskong/anaconda3/envs/nlp/lib/python3.11/site-packages/torchvision/image.so
  warn(


In [6]:
# ───────────────────────────────────────────────────────────────────────────────
# 1) Load JSON data
# ───────────────────────────────────────────────────────────────────────────────
with open(TRAIN_JSON, "r", encoding="utf-8") as f:
    train_claims = json.load(f)
with open(DEV_JSON, "r", encoding="utf-8") as f:
    dev_claims = json.load(f)
with open(EVID_JSON, "r", encoding="utf-8") as f:
    evidence_dict = json.load(f)

# ───────────────────────────────────────────────────────────────────────────────
# 2) Dataset + DataLoader (num_workers=0 to avoid pickling errors)
# ───────────────────────────────────────────────────────────────────────────────
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

class ClaimEvidenceDataset(Dataset):
    def __init__(self, claims, evidences, tokenizer, max_len):
        self.items = []
        for cid, obj in claims.items():
            claim_text = obj["claim_text"]
            ev_ids     = obj.get("evidences", [])
            ev_texts   = [evidences[e] for e in ev_ids if e in evidences]
            # full sequence: claim [SEP] evidence1 evidence2 ...
            full_input = claim_text + " [SEP] " + " ".join(ev_texts)
            label = label2idx[obj["claim_label"]]
            self.items.append((full_input, label))
        self.tokenizer = tokenizer
        self.max_len   = max_len

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        text, label = self.items[idx]
        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return (
            enc["input_ids"].squeeze(0),
            enc["attention_mask"].squeeze(0),
            torch.tensor(label, dtype=torch.long),
        )

def collate_batch(batch):
    ids, masks, labs = zip(*batch)
    return torch.stack(ids), torch.stack(masks), torch.stack(labs)

# create datasets and loaders
train_ds = ClaimEvidenceDataset(train_claims, evidence_dict, tokenizer, MAX_LEN)
dev_ds   = ClaimEvidenceDataset(dev_claims,   evidence_dict, tokenizer, MAX_LEN)

train_dl = DataLoader(
    train_ds, batch_size=BATCH_SIZE, shuffle=True,
    collate_fn=collate_batch, num_workers=0, pin_memory=True
)
dev_dl   = DataLoader(
    dev_ds,   batch_size=BATCH_SIZE, shuffle=False,
    collate_fn=collate_batch, num_workers=0, pin_memory=True
)

# ───────────────────────────────────────────────────────────────────────────────
# 3) Model Definition
# ───────────────────────────────────────────────────────────────────────────────
class BiLSTMWithBertEncoder(nn.Module):
    def __init__(self, bert_name, lstm_hid, num_classes, 
                 dropout_prob, lstm_layers):
        super().__init__()
        # 1) Frozen BERT
        self.bert = BertModel.from_pretrained(bert_name)
        for p in self.bert.parameters():
            p.requires_grad = False

        bert_dim = self.bert.config.hidden_size

        # 2) Dropout on BERT outputs
        self.dropout_bert = nn.Dropout(dropout_prob)

        # 3) 2-layer BiLSTM with inter-layer dropout
        self.lstm = nn.LSTM(
            input_size    = bert_dim,
            hidden_size   = lstm_hid,
            num_layers    = lstm_layers,
            batch_first   = True,
            bidirectional = True,
            dropout       = dropout_prob  # only applied between layers
        )

        # 4) Attention scoring layer
        #    对每个 time-step 的 2*hid 维输出打分
        self.attn_fc = nn.Linear(2 * lstm_hid, 1)

        # 5) Dropout before classifier
        self.dropout_pool = nn.Dropout(dropout_prob)

        # 6) Final classification head
        self.classifier = nn.Linear(2 * lstm_hid, num_classes)

    def forward(self, input_ids, attention_mask):
        # a) BERT encoding (frozen)
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        seq_emb  = bert_out.last_hidden_state            # (B, L, D)
        seq_emb  = self.dropout_bert(seq_emb)

        # b) BiLSTM
        lstm_out, _ = self.lstm(seq_emb)                 # (B, L, 2H)

        # c) Self-attention pooling
        #    1) 计算每个 time-step 的 attention score
        scores = self.attn_fc(lstm_out).squeeze(-1)      # (B, L)
        #    2) 对 pad 部分打 -inf
        scores = scores.masked_fill(attention_mask == 0, -1e9)
        #    3) 得到权重并做加权求和
        alphas = torch.softmax(scores, dim=1)            # (B, L)
        pooled = torch.sum(lstm_out * alphas.unsqueeze(-1), dim=1)  # (B, 2H)

        # d) Dropout + classification
        pooled = self.dropout_pool(pooled)
        logits = self.classifier(pooled)                 # (B, num_classes)
        return logits
    
model     = BiLSTMWithBertEncoder(BERT_MODEL, LSTM_HID_DIM, NUM_CLASSES, DROPOUT_PROB, NUM_LAYERS)
model.to(DEVICE)

BiLSTMWithBertEncoder(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [4]:
# ───────────────────────────────────────────────────────────────────────────────
# 4) Training Loop (with best-model saving)
# ───────────────────────────────────────────────────────────────────────────────
optimizer = torch.optim.Adam(model.classifier.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

best_acc = 0.0
BEST_MODEL_PATH = "task2_best_model.pt"

for epoch in range(1, EPOCHS + 1):
    # -- train --
    model.train()
    total_loss = 0.0
    for input_ids, attn_mask, labels in tqdm(train_dl, desc=f"Train Epoch {epoch}"):
        input_ids = input_ids.to(DEVICE)
        attn_mask = attn_mask.to(DEVICE)
        labels    = labels.to(DEVICE)

        logits = model(input_ids, attn_mask)
        loss   = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dl)
    print(f"→ Epoch {epoch} Avg Loss: {avg_loss:.4f}")

    # -- eval on dev --
    model.eval()
    correct = 0
    total   = 0
    with torch.no_grad():
        for input_ids, attn_mask, labels in tqdm(dev_dl, desc=" Eval"):
            input_ids = input_ids.to(DEVICE)
            attn_mask = attn_mask.to(DEVICE)
            labels    = labels.to(DEVICE)

            preds = model(input_ids, attn_mask).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total   += labels.size(0)

    acc = correct / total
    print(f"→ Dev Accuracy: {acc:.4%}")

    # -- save best model --
    if acc > best_acc:
        best_acc = acc
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        print(f"✅ New best model saved (epoch {epoch}, acc {acc:.4%})\n")
    else:
        print()

Train Epoch 1:   0%|          | 0/147 [00:00<?, ?it/s]

→ Epoch 1 Avg Loss: 1.3853


 Eval:   0%|          | 0/5 [00:00<?, ?it/s]

→ Dev Accuracy: 53.9474%
✅ New best model saved (epoch 1, acc 53.9474%)



Train Epoch 2:   0%|          | 0/147 [00:00<?, ?it/s]

→ Epoch 2 Avg Loss: 1.3816


 Eval:   0%|          | 0/5 [00:00<?, ?it/s]

→ Dev Accuracy: 43.4211%



Train Epoch 3:   0%|          | 0/147 [00:00<?, ?it/s]

→ Epoch 3 Avg Loss: 1.3777


 Eval:   0%|          | 0/5 [00:00<?, ?it/s]

→ Dev Accuracy: 46.0526%



Train Epoch 4:   0%|          | 0/147 [00:00<?, ?it/s]

→ Epoch 4 Avg Loss: 1.3745


 Eval:   0%|          | 0/5 [00:00<?, ?it/s]

→ Dev Accuracy: 40.7895%



Train Epoch 5:   0%|          | 0/147 [00:00<?, ?it/s]

→ Epoch 5 Avg Loss: 1.3704


 Eval:   0%|          | 0/5 [00:00<?, ?it/s]

→ Dev Accuracy: 35.5263%



In [14]:
import json
import torch
from pathlib import Path
from tqdm.notebook import tqdm

# 假设 以下 变量/对象 已在当前 scope 中 定义好：
#   model, tokenizer, MAX_LEN, DEVICE, label2idx

# 1) 读入 test 集（带 top-k evidence IDs） & 全部 evidence 文本
TEST_JSON     = Path("test-claims-predictions.json")
EVIDENCE_JSON = Path("data") / "evidence.json"

test_claims   = json.loads(TEST_JSON.read_text(encoding="utf-8"))
evidence_dict = json.loads(EVIDENCE_JSON.read_text(encoding="utf-8"))

# 2) 构造 idx→label 映射
idx2label = {v:k for k,v in label2idx.items()}

# 3) 推断并写入结果
model = BiLSTMWithBertEncoder(BERT_MODEL, LSTM_HID_DIM, NUM_CLASSES, DROPOUT_PROB, NUM_LAYERS)
state_dict = torch.load("task2_best_model_6.pt", map_location=DEVICE)
model.load_state_dict(state_dict)
model.to(DEVICE)
model.eval()
output = {}

with torch.no_grad():
    for cid, obj in tqdm(test_claims.items(), desc="Predicting labels"):
        claim_text = obj["claim_text"]
        # 直接保留所有原始 evidence IDs（不做丢弃）
        ev_ids = obj.get("evidences", [])

        # 如果还要拿文本去做前向编码，就用 .get()，不会抛 KeyError
        ev_texts = [ evidence_dict.get(eid, "") for eid in ev_ids ]

        # 用 tokenizer 的双句接口（自动在中间插 [SEP]）
        enc = tokenizer(
            claim_text,
            " ".join(ev_texts),
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN,
            return_tensors="pt"
        )
        input_ids      = enc["input_ids"].to(DEVICE)
        attention_mask = enc["attention_mask"].to(DEVICE)

        # forward + argmax → label idx
        logits = model(input_ids, attention_mask)  # (1, num_classes)
        pred   = logits.argmax(dim=-1).item()
        label  = idx2label[pred]

        # 保存：claim_text, 预测 label, **原样保留** evidence IDs 列表
        output[cid] = {
            "claim_text":  claim_text,
            "claim_label": label,
            "evidences":   ev_ids
        }

# 4) 写入 test-output.json
OUT_JSON = Path("test-output.json")
with OUT_JSON.open("w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=2)

print(f"✅ Written predictions to {OUT_JSON.resolve()}")

  state_dict = torch.load("task2_best_model_6.pt", map_location=DEVICE)


Predicting labels:   0%|          | 0/153 [00:00<?, ?it/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

✅ Written predictions to /Users/felikskong/Desktop/NLP/NLP_Ass3/test-output.json


In [None]:
import json

with open("test-output-0.51.json", "r", encoding="utf-8") as f:
    a_data = json.load(f)
with open("test-output.json", "r", encoding="utf-8") as f:
    b_data = json.load(f)
for claim_id, a_claim in a_data.items():
    if claim_id in b_data:
        b_data[claim_id]["claim_label"] = a_claim["claim_label"]
with open("test-output.json", "w", encoding="utf-8") as f:
    json.dump(b_data, f, ensure_ascii=False, indent=2)


b 文件的 claim_label 已成功更新并保存为 test-output-0.21-0.51.json
