In [5]:
import os
import json
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import re
import nltk
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# ───────────────────────────────────────────────────────────────────────────────
# 0) Config & Paths
# ───────────────────────────────────────────────────────────────────────────────

DATA_DIR = Path("data")
TRAIN_JSON = DATA_DIR / "train-claims.json"
DEV_JSON = DATA_DIR / "dev-claims.json"
EVID_JSON = DATA_DIR / "evidence.json"

MAX_LEN = 50
EMBED_DIM = 100
HIDDEN_DIM = 64
NUM_CLASSES = 4
DROPOUT_PROB = 0.4
BATCH_SIZE = 64
EPOCHS = 20
LR = 1e-4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ───────────────────────────────────────────────────────────────────────────────
# 1) Preprocessing utils
# ───────────────────────────────────────────────────────────────────────────────

stopwords = set(nltk_stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')

def lemmatize(word):
    lemma = lemmatizer.lemmatize(word, 'v')
    return lemmatizer.lemmatize(lemma, 'n')

def preprocess(text, remove_stopwords=True, lemma=True, stem=False):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [t for t in tokens if re.match('^[a-zA-Z0-9-]+$', t)]
    if remove_stopwords:
        tokens = [t for t in tokens if t not in stopwords]
    if lemma:
        tokens = [lemmatize(t) for t in tokens]
    if stem:
        tokens = [stemmer.stem(t) for t in tokens]
    return ' '.join(tokens)

# ───────────────────────────────────────────────────────────────────────────────
# 2) Load JSON data → DataFrame
# ───────────────────────────────────────────────────────────────────────────────

def load_data(claims_file, evidence_file):
    with open(claims_file, 'r', encoding='utf-8') as f:
        claims_data = json.load(f)
    with open(evidence_file, 'r', encoding='utf-8') as f:
        evid_data = json.load(f)
    
    claim_texts, evid_texts, labels = [], [], []
    for cid, cdata in claims_data.items():
        claim = preprocess(cdata['claim_text'])
        evid_ids = cdata['evidences']
        evids = ' '.join([evid_data.get(eid, '') for eid in evid_ids])
        evid = preprocess(evids)
        claim_texts.append(claim)
        evid_texts.append(evid)
        labels.append(cdata['claim_label'])
    
    df = pd.DataFrame({
        'claim': claim_texts,
        'evidence': evid_texts,
        'label': labels
    })
    return df

train_df = load_data(TRAIN_JSON, EVID_JSON)
dev_df = load_data(DEV_JSON, EVID_JSON)

# ───────────────────────────────────────────────────────────────────────────────
# 3) Prepare vocab & sequences
# ───────────────────────────────────────────────────────────────────────────────

all_text = train_df['claim'].tolist() + train_df['evidence'].tolist()
token_counts = Counter(w for text in all_text for w in text.split())
vocab = {w: idx+1 for idx, (w, _) in enumerate(token_counts.items())}
vocab_size = len(vocab) + 1

def text_to_seq(text):
    seq = [vocab.get(w, 0) for w in text.split()]
    return seq + [0]*(MAX_LEN - len(seq)) if len(seq) < MAX_LEN else seq[:MAX_LEN]

train_claims = [text_to_seq(t) for t in train_df['claim']]
train_evids = [text_to_seq(t) for t in train_df['evidence']]
dev_claims = [text_to_seq(t) for t in dev_df['claim']]
dev_evids = [text_to_seq(t) for t in dev_df['evidence']]

label_enc = LabelEncoder()
train_labels = label_enc.fit_transform(train_df['label'])
dev_labels = label_enc.transform(dev_df['label'])

# ───────────────────────────────────────────────────────────────────────────────
# 4) Dataset + DataLoader
# ───────────────────────────────────────────────────────────────────────────────

class ClaimDataset(Dataset):
    def __init__(self, claims, evidences, labels):
        self.claims = torch.tensor(claims, dtype=torch.long)
        self.evidences = torch.tensor(evidences, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.long)
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return self.claims[idx], self.evidences[idx], self.labels[idx]

train_ds = ClaimDataset(train_claims, train_evids, train_labels)
dev_ds = ClaimDataset(dev_claims, dev_evids, dev_labels)
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
dev_dl = DataLoader(dev_ds, batch_size=BATCH_SIZE)

# ───────────────────────────────────────────────────────────────────────────────
# 5) Self-Attention Pooling
# ───────────────────────────────────────────────────────────────────────────────

class SelfAttentionPooling(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.attention = nn.Linear(input_dim, 1)

    def forward(self, x):
        # x: [batch, seq_len, hidden_dim]
        weights = torch.softmax(self.attention(x), dim=1)  # [batch, seq_len, 1]
        pooled = torch.sum(weights * x, dim=1)  # [batch, hidden_dim]
        return pooled

# ───────────────────────────────────────────────────────────────────────────────
# 6) Model
# ───────────────────────────────────────────────────────────────────────────────

class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.embed_dropout = nn.Dropout(DROPOUT_PROB)
        self.rnn_claim = nn.RNN(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.rnn_evid = nn.RNN(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.rnn_dropout = nn.Dropout(DROPOUT_PROB)
        self.attention_claim = SelfAttentionPooling(hidden_dim * 2)
        self.attention_evid = SelfAttentionPooling(hidden_dim * 2)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 4, 128),
            nn.ReLU(),
            nn.Dropout(DROPOUT_PROB),
            nn.Linear(128, num_classes)
        )

    def forward(self, claim, evidence):
        claim_emb = self.embed_dropout(self.embedding(claim))
        evid_emb = self.embed_dropout(self.embedding(evidence))
        
        claim_out, _ = self.rnn_claim(claim_emb)
        evid_out, _ = self.rnn_evid(evid_emb)
        
        claim_out = self.rnn_dropout(claim_out)
        evid_out = self.rnn_dropout(evid_out)
        
        claim_pool = self.attention_claim(claim_out)
        evid_pool = self.attention_evid(evid_out)
        
        combined = torch.cat([claim_pool, evid_pool], dim=1)
        return self.classifier(combined)

model = RNNModel(vocab_size, EMBED_DIM, HIDDEN_DIM, NUM_CLASSES).to(DEVICE)


[nltk_data] Downloading package punkt to /Users/sophia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sophia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sophia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

# ───────────────────────────────────────────────────────────────────────────────
# 7) Training loop
# ───────────────────────────────────────────────────────────────────────────────

for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss, total_correct = 0, 0
    for claim, evid, label in tqdm(train_dl, desc=f"Epoch {epoch}"):
        claim, evid, label = claim.to(DEVICE), evid.to(DEVICE), label.to(DEVICE)
        optimizer.zero_grad()
        out = model(claim, evid)
        loss = criterion(out, label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_correct += (out.argmax(1) == label).sum().item()
    acc = total_correct / len(train_ds)
    print(f"Train Loss: {total_loss/len(train_dl):.4f}, Train Acc: {acc:.4f}")
    
    model.eval()
    val_loss, val_correct = 0, 0
    with torch.no_grad():
        for claim, evid, label in dev_dl:
            claim, evid, label = claim.to(DEVICE), evid.to(DEVICE), label.to(DEVICE)
            out = model(claim, evid)
            loss = criterion(out, label)
            val_loss += loss.item()
            val_correct += (out.argmax(1) == label).sum().item()
    val_acc = val_correct / len(dev_ds)
    print(f"Val Loss: {val_loss/len(dev_dl):.4f}, Val Acc: {val_acc:.4f}")

# ───────────────────────────────────────────────────────────────────────────────
# 8) Save model + label encoder
# ───────────────────────────────────────────────────────────────────────────────

torch.save(model.state_dict(), "rnn_model.pth")
import pickle
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_enc, f)


Epoch 1:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 1.0477, Train Acc: 0.5725
Val Loss: 1.3388, Val Acc: 0.4481


Epoch 2:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 1.0412, Train Acc: 0.5733
Val Loss: 1.3230, Val Acc: 0.4675


Epoch 3:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 1.0350, Train Acc: 0.5757
Val Loss: 1.3461, Val Acc: 0.4351


Epoch 4:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 1.0188, Train Acc: 0.5741
Val Loss: 1.3484, Val Acc: 0.4351


Epoch 5:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 1.0294, Train Acc: 0.5757
Val Loss: 1.3646, Val Acc: 0.4351


Epoch 6:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 1.0251, Train Acc: 0.5790
Val Loss: 1.3629, Val Acc: 0.4351


Epoch 7:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 1.0304, Train Acc: 0.5863
Val Loss: 1.3566, Val Acc: 0.4351


Epoch 8:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 1.0217, Train Acc: 0.5863
Val Loss: 1.3563, Val Acc: 0.4416


Epoch 9:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 1.0229, Train Acc: 0.5855
Val Loss: 1.3522, Val Acc: 0.4481


Epoch 10:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 1.0228, Train Acc: 0.5855
Val Loss: 1.3749, Val Acc: 0.4351


Epoch 11:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 1.0063, Train Acc: 0.5855
Val Loss: 1.3677, Val Acc: 0.4416


Epoch 12:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 1.0176, Train Acc: 0.5855
Val Loss: 1.3847, Val Acc: 0.4481


Epoch 13:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 0.9973, Train Acc: 0.5920
Val Loss: 1.3841, Val Acc: 0.4675


Epoch 14:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 0.9931, Train Acc: 0.5961
Val Loss: 1.3961, Val Acc: 0.4545


Epoch 15:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 0.9954, Train Acc: 0.5945
Val Loss: 1.4149, Val Acc: 0.4545


Epoch 16:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 0.9988, Train Acc: 0.5879
Val Loss: 1.4091, Val Acc: 0.4545


Epoch 17:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 0.9821, Train Acc: 0.5961
Val Loss: 1.4261, Val Acc: 0.4545


Epoch 18:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 0.9638, Train Acc: 0.5904
Val Loss: 1.4283, Val Acc: 0.4610


Epoch 19:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 0.9876, Train Acc: 0.5912
Val Loss: 1.4337, Val Acc: 0.4610


Epoch 20:   0%|          | 0/20 [00:00<?, ?it/s]

Train Loss: 1.0045, Train Acc: 0.5945
Val Loss: 1.4259, Val Acc: 0.4610


In [16]:
def predict(input_file, evidence_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        claims_data = json.load(f)
    with open(evidence_file, 'r', encoding='utf-8') as f:
        evid_data = json.load(f)
    
    claim_ids, claim_texts, evid_texts = [], [], []
    for cid, cdata in claims_data.items():
        claim_ids.append(cid)
        claim = preprocess(cdata['claim_text'])
        evid_ids = cdata['evidences']
        evids = ' '.join([evid_data.get(eid, '') for eid in evid_ids])
        evid = preprocess(evids)
        claim_texts.append(claim)
        evid_texts.append(evid)
    
    test_claims = torch.tensor([text_to_seq(t) for t in claim_texts], dtype=torch.long).to(DEVICE)
    test_evid = torch.tensor([text_to_seq(t) for t in evid_texts], dtype=torch.long).to(DEVICE)
    
    model.load_state_dict(torch.load('rnn_model.pth'))
    model.eval()
    with open('label_encoder.pkl', 'rb') as f:
        label_enc = pickle.load(f)
    
    with torch.no_grad():
        outputs = model(test_claims, test_evid)
        preds = outputs.argmax(dim=1).cpu().numpy()
    pred_labels = label_enc.inverse_transform(preds)
    
    output_data = {cid: {'claim_text': ctext, 'claim_label': plabel, 'evidences': claims_data[cid]['evidences']} 
                   for cid, plabel, ctext in zip(claim_ids, pred_labels, claim_texts)}
    
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

In [17]:
predict('test-claims-predictions.json', 'data/evidence.json', 'predicted_results.json')