In [None]:
import os
import json
import re
from pathlib import Path
import spacy
import nltk
from collections import Counter
from lemminflect import getAllInflections
from nltk.tokenize import sent_tokenize
# If you haven’t already downloaded these:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ── 0) Paths ───────────────────────────────────────────────────────────────
train_claims_path = "drive/MyDrive/Data/train-claims.json"
evidence_path     = "drive/MyDrive/Data/evidence.json"
output_path       = Path("drive/MyDrive/preprocessed") / "climate_evidence.json"

# ── 1) Load data ───────────────────────────────────────────────────────────
with open(train_claims_path, "r", encoding="utf-8") as f:
    train_claims = json.load(f)

with open(evidence_path, "r", encoding="utf-8") as f:
    evidence_dict = json.load(f)

# ── 2) Extract top-100 nouns from train claims ───────────────────────────────
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
all_nouns = []
for claim in train_claims.values():
    doc = nlp(claim["claim_text"])
    all_nouns.extend([tok.lemma_.lower() for tok in doc if tok.pos_ == "NOUN"])

top_keywords = set(w for w, _ in Counter(all_nouns).most_common(100))

# ── 3) Expand each keyword to all its noun inflections ───────────────────────
all_forms = set()
for lemma in top_keywords:
    all_forms.add(lemma)
    infl_map = getAllInflections(lemma, upos="NOUN") or {}
    for forms in infl_map.values():
        all_forms.update(forms)

# ── 4) Define filtering functions ───────────────────────────────────────────
def is_english(text: str, threshold: float = 0.5) -> bool:
    """Rough check: at least `threshold` fraction of chars must be A–Z/a–z."""
    cleaned = re.sub(r"[^A-Za-z\s]", "", text)
    if not cleaned:
        return False
    alpha_count = sum(ch.isalpha() for ch in cleaned)
    return (alpha_count / len(cleaned)) >= threshold

def contains_climate_keywords(text: str, forms: set) -> bool:
    """True if any token in text (ASCII, a–z only) is in our all_forms set."""
    words = re.findall(r"\b[a-z']+\b", text.lower())
    return any(w in forms for w in words)

# ── 5) Filter the evidence ─────────────────────────────────────────────────
#  5.1 Keep only English passages
english_pairs = [
    (eid, txt)
    for eid, txt in evidence_dict.items()
    if is_english(txt)
]

#  5.2 Among those, keep only climate-related ones
climate_pairs = [
    (eid, txt)
    for eid, txt in english_pairs
    if contains_climate_keywords(txt, all_forms)
]

print(f"Step1: English keep {len(english_pairs)}/{len(evidence_dict)}")
print(f"Step2: Climate-related keep {len(climate_pairs)}/{len(english_pairs)}")

# ── 6) Write out climate-related evidence ───────────────────────────────────
output_path.parent.mkdir(parents=True, exist_ok=True)
climate_evidence = {eid: txt for eid, txt in climate_pairs}

with open(output_path, "w", encoding="utf-8") as outf:
    json.dump(climate_evidence, outf, ensure_ascii=False, indent=2)

print(f"✅ Saved {len(climate_evidence)} passages to {output_path}")

Step1: English keep 1207838/1208827
Step2: Climate-related keep 385471/1207838
✅ Saved 385471 passages to drive/MyDrive/preprocessed/climate_evidence.json


In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# Full Training Script for BiLSTM+Frozen-BERT Classification in one Notebook Cell
# ───────────────────────────────────────────────────────────────────────────────

import os
import json
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from tqdm.notebook import tqdm

# ───────────────────────────────────────────────────────────────────────────────
# 0) Config & Paths
# ───────────────────────────────────────────────────────────────────────────────
DATA_DIR       = Path("drive/MyDrive/Data")
# TRAIN_JSON     = DATA_DIR / "combined-claims.json"
# DEV_JSON       = DATA_DIR / "groundtruth_output.json"
TRAIN_JSON     = DATA_DIR / "train-claims-augmented.json"
DEV_JSON       = DATA_DIR / "dev-claims.json"
EVID_JSON      = DATA_DIR / "evidence.json"

BERT_MODEL     = "bert-base-uncased"
MAX_LEN        = 256
LSTM_HID_DIM   = 512
NUM_CLASSES    = 4
DROPOUT_PROB   = 0.2
NUM_LAYERS     = 3
BATCH_SIZE     = 16
EPOCHS         = 5
LR             = 2e-4

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Label ↔ index map
label2idx = {
    "SUPPORTS":         0,
    "NOT_ENOUGH_INFO":  1,
    "REFUTES":          2,
    "DISPUTED":         3,
}

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# 1) Load JSON data
# ───────────────────────────────────────────────────────────────────────────────
with open(TRAIN_JSON, "r", encoding="utf-8") as f:
    train_claims = json.load(f)
with open(DEV_JSON, "r", encoding="utf-8") as f:
    dev_claims = json.load(f)
with open(EVID_JSON, "r", encoding="utf-8") as f:
    evidence_dict = json.load(f)

# ───────────────────────────────────────────────────────────────────────────────
# 2) Dataset + DataLoader (num_workers=0 to avoid pickling errors)
# ───────────────────────────────────────────────────────────────────────────────
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

class ClaimEvidenceDataset(Dataset):
    def __init__(self, claims, evidences, tokenizer, max_len):
        self.items = []
        for cid, obj in claims.items():
            claim_text = obj["claim_text"]
            ev_ids     = obj.get("evidences", [])
            ev_texts   = [evidences[e] for e in ev_ids if e in evidences]
            # full sequence: claim [SEP] evidence1 evidence2 ...
            full_input = claim_text + " [SEP] " + " ".join(ev_texts)
            label = label2idx[obj["claim_label"]]
            self.items.append((full_input, label))
        self.tokenizer = tokenizer
        self.max_len   = max_len

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        text, label = self.items[idx]
        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return (
            enc["input_ids"].squeeze(0),
            enc["attention_mask"].squeeze(0),
            torch.tensor(label, dtype=torch.long),
        )

def collate_batch(batch):
    ids, masks, labs = zip(*batch)
    return torch.stack(ids), torch.stack(masks), torch.stack(labs)

# create datasets and loaders
train_ds = ClaimEvidenceDataset(train_claims, evidence_dict, tokenizer, MAX_LEN)
dev_ds   = ClaimEvidenceDataset(dev_claims,   evidence_dict, tokenizer, MAX_LEN)

train_dl = DataLoader(
    train_ds, batch_size=BATCH_SIZE, shuffle=True,
    collate_fn=collate_batch, num_workers=0, pin_memory=True
)
dev_dl   = DataLoader(
    dev_ds,   batch_size=BATCH_SIZE, shuffle=False,
    collate_fn=collate_batch, num_workers=0, pin_memory=True
)

# ───────────────────────────────────────────────────────────────────────────────
# 3) Model Definition
# ───────────────────────────────────────────────────────────────────────────────
class BiLSTMWithBertEncoder(nn.Module):
    def __init__(self, bert_name, lstm_hid, num_classes,
                 dropout_prob, lstm_layers):
        super().__init__()
        # 1) Frozen BERT
        self.bert = BertModel.from_pretrained(bert_name)
        for p in self.bert.parameters():
            p.requires_grad = False

        bert_dim = self.bert.config.hidden_size

        # 2) Dropout on BERT outputs
        self.dropout_bert = nn.Dropout(dropout_prob)

        # 3) 2-layer BiLSTM with inter-layer dropout
        self.lstm = nn.LSTM(
            input_size    = bert_dim,
            hidden_size   = lstm_hid,
            num_layers    = lstm_layers,
            batch_first   = True,
            bidirectional = True,
            dropout       = dropout_prob  # only applied between layers
        )

        # 4) Attention scoring layer
        self.attn_fc = nn.Linear(2 * lstm_hid, 1)

        # 5) Dropout before classifier
        self.dropout_pool = nn.Dropout(dropout_prob)

        # 6) Final classification head
        self.classifier = nn.Linear(2 * lstm_hid, num_classes)

    def forward(self, input_ids, attention_mask):
        # a) BERT encoding (frozen)
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        seq_emb  = bert_out.last_hidden_state            # (B, L, D)
        seq_emb  = self.dropout_bert(seq_emb)

        # b) BiLSTM
        lstm_out, _ = self.lstm(seq_emb)                 # (B, L, 2H)

        # c) Self-attention pooling
        scores = self.attn_fc(lstm_out).squeeze(-1)      # (B, L)
        scores = scores.masked_fill(attention_mask == 0, -1e9)
        alphas = torch.softmax(scores, dim=1)            # (B, L)
        pooled = torch.sum(lstm_out * alphas.unsqueeze(-1), dim=1)  # (B, 2H)

        # d) Dropout + classification
        pooled = self.dropout_pool(pooled)
        logits = self.classifier(pooled)                 # (B, num_classes)
        return logits

model     = BiLSTMWithBertEncoder(BERT_MODEL, LSTM_HID_DIM, NUM_CLASSES, DROPOUT_PROB, NUM_LAYERS)
model.to(DEVICE)

BiLSTMWithBertEncoder(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

Cross Entropy Loss

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# 4) Training Loop (with best-model saving)
# ───────────────────────────────────────────────────────────────────────────────
optimizer = torch.optim.Adam(model.classifier.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

best_acc = 0.0
BEST_MODEL_PATH = "drive/MyDrive/task2_best_model.pt"

for epoch in range(1, EPOCHS + 1):
    # -- train --
    model.train()
    total_loss = 0.0
    for input_ids, attn_mask, labels in tqdm(train_dl, desc=f"Train Epoch {epoch}"):
        input_ids = input_ids.to(DEVICE)
        attn_mask = attn_mask.to(DEVICE)
        labels    = labels.to(DEVICE)

        logits = model(input_ids, attn_mask)
        loss = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dl)
    print(f"→ Epoch {epoch} Avg Loss: {avg_loss:.4f}")

    # -- eval on dev --
    model.eval()
    correct = 0
    total   = 0
    with torch.no_grad():
        for input_ids, attn_mask, labels in tqdm(dev_dl, desc=" Eval"):
            input_ids = input_ids.to(DEVICE)
            attn_mask = attn_mask.to(DEVICE)
            labels    = labels.to(DEVICE)

            preds = model(input_ids, attn_mask).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total   += labels.size(0)

    acc = correct / total
    print(f"→ Dev Accuracy: {acc:.4%}")

    # -- save best model --
    if acc > best_acc:
        best_acc = acc
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        print(f"✅ New best model saved (epoch {epoch}, acc {acc:.4%})\n")
    else:
        print()

Train Epoch 1:   0%|          | 0/130 [00:00<?, ?it/s]

→ Epoch 1 Avg Loss: 1.3749


 Eval:   0%|          | 0/10 [00:00<?, ?it/s]

→ Dev Accuracy: 50.0000%
✅ New best model saved (epoch 1, acc 50.0000%)



Train Epoch 2:   0%|          | 0/130 [00:00<?, ?it/s]

→ Epoch 2 Avg Loss: 1.3714


 Eval:   0%|          | 0/10 [00:00<?, ?it/s]

→ Dev Accuracy: 50.0000%



Train Epoch 3:   0%|          | 0/130 [00:00<?, ?it/s]

→ Epoch 3 Avg Loss: 1.3686


 Eval:   0%|          | 0/10 [00:00<?, ?it/s]

→ Dev Accuracy: 49.3506%



Train Epoch 4:   0%|          | 0/130 [00:00<?, ?it/s]

→ Epoch 4 Avg Loss: 1.3655


 Eval:   0%|          | 0/10 [00:00<?, ?it/s]

→ Dev Accuracy: 47.4026%



Train Epoch 5:   0%|          | 0/130 [00:00<?, ?it/s]

→ Epoch 5 Avg Loss: 1.3628


 Eval:   0%|          | 0/10 [00:00<?, ?it/s]

→ Dev Accuracy: 49.3506%



focal loss

In [None]:
import torch
import torch.nn.functional as F

def focal_loss(logits, targets, alpha=None, gamma=2.0, reduction='mean'):
    ce_loss = F.cross_entropy(logits, targets, reduction='none', weight=alpha)
    pt = torch.exp(-ce_loss)  # pt = softmax(logits)[targets]
    focal_term = (1 - pt) ** gamma
    loss = focal_term * ce_loss
    if reduction == 'mean':
        return loss.mean()
    elif reduction == 'sum':
        return loss.sum()
    return loss

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# 4) Training Loop (with best-model saving)
# ───────────────────────────────────────────────────────────────────────────────
optimizer = torch.optim.Adam(model.classifier.parameters(), lr=LR)
from collections import Counter

label_counts = Counter([label2idx[obj["claim_label"]] for obj in train_claims.values()])
total = sum(label_counts.values())

# The higher the weight, the more important it is; the weight can be used as an inverse weight with total / class_count.
class_weights = [total / label_counts[i] for i in range(len(label2idx))]

weights = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)


best_acc = 0.0
BEST_MODEL_PATH = "drive/MyDrive/task2_best_model.pt"

for epoch in range(1, EPOCHS + 1):
    # -- train --
    model.train()
    total_loss = 0.0
    for input_ids, attn_mask, labels in tqdm(train_dl, desc=f"Train Epoch {epoch}"):
        input_ids = input_ids.to(DEVICE)
        attn_mask = attn_mask.to(DEVICE)
        labels    = labels.to(DEVICE)

        logits = model(input_ids, attn_mask)
        loss = focal_loss(logits, labels, alpha=weights, gamma=3.0)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dl)
    print(f"→ Epoch {epoch} Avg Loss: {avg_loss:.4f}")

    # -- eval on dev --
    model.eval()
    correct = 0
    total   = 0
    with torch.no_grad():
        for input_ids, attn_mask, labels in tqdm(dev_dl, desc=" Eval"):
            input_ids = input_ids.to(DEVICE)
            attn_mask = attn_mask.to(DEVICE)
            labels    = labels.to(DEVICE)

            preds = model(input_ids, attn_mask).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total   += labels.size(0)

    acc = correct / total
    print(f"→ Dev Accuracy: {acc:.4%}")

    # -- save best model --
    if acc > best_acc:
        best_acc = acc
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        print(f"✅ New best model saved (epoch {epoch}, acc {acc:.4%})\n")
    else:
        print()

Train Epoch 1:   0%|          | 0/130 [00:00<?, ?it/s]

→ Epoch 1 Avg Loss: 1.3550


 Eval:   0%|          | 0/10 [00:00<?, ?it/s]

→ Dev Accuracy: 49.3506%
✅ New best model saved (epoch 1, acc 49.3506%)



Train Epoch 2:   0%|          | 0/130 [00:00<?, ?it/s]

→ Epoch 2 Avg Loss: 1.3518


 Eval:   0%|          | 0/10 [00:00<?, ?it/s]

→ Dev Accuracy: 52.5974%
✅ New best model saved (epoch 2, acc 52.5974%)



Train Epoch 3:   0%|          | 0/130 [00:00<?, ?it/s]

→ Epoch 3 Avg Loss: 1.3489


 Eval:   0%|          | 0/10 [00:00<?, ?it/s]

→ Dev Accuracy: 51.2987%



Train Epoch 4:   0%|          | 0/130 [00:00<?, ?it/s]

→ Epoch 4 Avg Loss: 1.3471


 Eval:   0%|          | 0/10 [00:00<?, ?it/s]

→ Dev Accuracy: 51.9481%



Train Epoch 5:   0%|          | 0/130 [00:00<?, ?it/s]

→ Epoch 5 Avg Loss: 1.3444


 Eval:   0%|          | 0/10 [00:00<?, ?it/s]

→ Dev Accuracy: 53.8961%
✅ New best model saved (epoch 5, acc 53.8961%)



Weighted Cross Entropy Loss

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# 4) Training Loop (with best-model saving)
# ───────────────────────────────────────────────────────────────────────────────
optimizer = torch.optim.Adam(model.classifier.parameters(), lr=LR)
from collections import Counter

# Weighted Cross Entropy Loss
label_counts = Counter([label2idx[obj["claim_label"]] for obj in train_claims.values()])
total = sum(label_counts.values())

# The higher the weight, the more important it is; the weight can be used as an inverse weight with total / class_count.
class_weights = [total / label_counts[i] for i in range(len(label2idx))]

weights = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)

criterion = nn.CrossEntropyLoss(weight=weights)

best_acc = 0.0
BEST_MODEL_PATH = "drive/MyDrive/task2_best_model.pt"

for epoch in range(1, EPOCHS + 1):
    # -- train --
    model.train()
    total_loss = 0.0
    for input_ids, attn_mask, labels in tqdm(train_dl, desc=f"Train Epoch {epoch}"):
        input_ids = input_ids.to(DEVICE)
        attn_mask = attn_mask.to(DEVICE)
        labels    = labels.to(DEVICE)

        logits = model(input_ids, attn_mask)
        loss = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dl)
    print(f"→ Epoch {epoch} Avg Loss: {avg_loss:.4f}")

    # -- eval on dev --
    model.eval()
    correct = 0
    total   = 0
    with torch.no_grad():
        for input_ids, attn_mask, labels in tqdm(dev_dl, desc=" Eval"):
            input_ids = input_ids.to(DEVICE)
            attn_mask = attn_mask.to(DEVICE)
            labels    = labels.to(DEVICE)

            preds = model(input_ids, attn_mask).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total   += labels.size(0)

    acc = correct / total
    print(f"→ Dev Accuracy: {acc:.4%}")

    # -- save best model --
    if acc > best_acc:
        best_acc = acc
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        print(f"✅ New best model saved (epoch {epoch}, acc {acc:.4%})\n")
    else:
        print()

Train Epoch 1:   0%|          | 0/130 [00:00<?, ?it/s]

→ Epoch 1 Avg Loss: 5.2010


 Eval:   0%|          | 0/10 [00:00<?, ?it/s]

→ Dev Accuracy: 50.0000%
✅ New best model saved (epoch 1, acc 50.0000%)



Train Epoch 2:   0%|          | 0/130 [00:00<?, ?it/s]

→ Epoch 2 Avg Loss: 5.1905


 Eval:   0%|          | 0/10 [00:00<?, ?it/s]

→ Dev Accuracy: 50.0000%



Train Epoch 3:   0%|          | 0/130 [00:00<?, ?it/s]

→ Epoch 3 Avg Loss: 5.1701


 Eval:   0%|          | 0/10 [00:00<?, ?it/s]

→ Dev Accuracy: 50.6494%
✅ New best model saved (epoch 3, acc 50.6494%)



Train Epoch 4:   0%|          | 0/130 [00:00<?, ?it/s]

→ Epoch 4 Avg Loss: 5.1698


 Eval:   0%|          | 0/10 [00:00<?, ?it/s]

→ Dev Accuracy: 50.0000%



Train Epoch 5:   0%|          | 0/130 [00:00<?, ?it/s]

→ Epoch 5 Avg Loss: 5.1643


 Eval:   0%|          | 0/10 [00:00<?, ?it/s]

→ Dev Accuracy: 50.6494%

