In [2]:
import json

# Location of the training JSON. This default points into the Google Drive
# mount used by the try-branch below; it is overridden for local runs.
TRAINING_DATA_PATH = '/content/drive/MyDrive/training_data.json'

try:
    # If google.colab is importable, mount Drive at /content/drive so the
    # default path above can be read.
    from google.colab import drive
    drive.mount('/content/drive')
except ModuleNotFoundError:
    # google.colab is not available: treat this as a local run, install the
    # packages via shell escapes, and read the data from the working directory.
    !pip install nltk
    !pip install torch
    !pip install scikit-learn
#     !pip install numpy

    TRAINING_DATA_PATH = './training_data.json'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read the training set. The encoding is given explicitly so the result does
# not depend on the platform's locale default (relevant for the local,
# non-Colab path).
with open(TRAINING_DATA_PATH, 'r', encoding='utf-8') as f:
    training_data = json.load(f)

# Extract claim texts, evidence texts, and labels from the training data.
# Assume those are already pre-processed.
# NOTE(review): this iterates `training_data` as a sequence of records with the
# keys 'claim_text', 'evidence_text' and 'label' -- confirm against the file.
claim_texts = [data['claim_text'] for data in training_data]
evidence_texts = [data['evidence_text'] for data in training_data]
labels = [data['label'] for data in training_data]

# The three lists are indexed in parallel by the Dataset further down.
assert len(claim_texts) == len(evidence_texts)
assert len(evidence_texts) == len(labels)

In [4]:
from collections import Counter

# Count every whitespace-separated token across both claims and evidence.
word_count = Counter(
    word for text in claim_texts + evidence_texts for word in text.split())
vocab = sorted(word_count.keys())
print(f'Vocabulary Size: {len(vocab)}')

# Real words get ids 1..len(vocab); id 0 is reserved for padding.
word2idx = {word: idx for idx, word in enumerate(vocab, start=1)}
word2idx['<PAD>'] = 0  # Add a special token for padding

# Size of the id space (0..len(vocab)). This is the value handed to
# nn.Embedding as its number of rows, so it must be strictly greater than the
# largest id; using len(vocab) alone leaves the last word without a row.
vocab_size = len(vocab) + 1

assert word2idx['<PAD>'] == 0
assert max(word2idx.values()) < vocab_size

Vocabulary Size: 26166


# Dataset & Model Definition

## Dataset

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader

class ClaimEvidenceDataset(Dataset):
    """Paired claim/evidence texts served as fixed-length id tensors.

    Each item is ``(claim_seq, evidence_seq, label_tensor)`` where both
    sequences are ``torch.long`` tensors of exactly ``max_len`` entries and the
    label is a ``torch.float`` scalar tensor.
    """

    def __init__(
        self, claim_texts, evidence_texts, labels, word2idx, max_len):
        """Store the parallel text/label lists, the token->id map and the length."""
        self.claim_texts = claim_texts
        self.evidence_texts = evidence_texts
        self.labels = labels
        self.word2idx = word2idx
        self.max_len = max_len

    def __len__(self):
        """Number of claim/evidence pairs."""
        return len(self.claim_texts)

    def _encode(self, text):
        """Map whitespace tokens to ids, then truncate / zero-pad to ``max_len``."""
        token_ids = []
        for token in text.split():
            # Tokens missing from the map fall back to the '<PAD>' id.
            token_ids.append(self.word2idx.get(token, self.word2idx['<PAD>']))
        token_ids = token_ids[:self.max_len]

        # Start from an all-zero buffer and copy the ids into its head, which
        # leaves the tail zero-filled.
        padded = torch.zeros(self.max_len, dtype=torch.long)
        padded[:len(token_ids)] = torch.tensor(token_ids, dtype=torch.long)
        return padded

    def __getitem__(self, index):
        """Return the encoded claim, encoded evidence and float label at ``index``."""
        claim_seq = self._encode(self.claim_texts[index])
        evidence_seq = self._encode(self.evidence_texts[index])
        label_tensor = torch.tensor(self.labels[index], dtype=torch.float)
        return claim_seq, evidence_seq, label_tensor


# Build the dataset and a loader over it.
# max_len is the fixed token length each claim/evidence sequence is cut or
# padded to by ClaimEvidenceDataset.
max_len = 128
dataset = ClaimEvidenceDataset(
    claim_texts=claim_texts,
    evidence_texts=evidence_texts,
    labels=labels,
    word2idx=word2idx,
    max_len=max_len,
)
# This loader iterates the whole dataset; the training cell rebinds
# `train_loader` to the training split before any batches are drawn.
train_loader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

## Model

In [6]:
import torch.nn as nn

class ClaimEvidenceModel(nn.Module):
    """Scores a (claim, evidence) pair with a shared two-layer BiLSTM encoder.

    Both inputs go through the same embedding and the same two stacked
    bidirectional LSTMs. The two resulting sentence vectors are concatenated
    and passed through ``dense1 -> tanh -> dense2 -> sigmoid``.

    Args:
        vocab_size: number of rows in the embedding table; must be greater
            than the largest token id fed to ``forward``.
        embedding_dim: size of each token embedding.
        hidden_dim: hidden size of each LSTM direction.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # Id 0 is the padding id used when sequences are zero-padded, so its
        # row is declared as padding: it starts at zero and gets no gradient.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # The LSTM `dropout` argument only acts between stacked layers inside
        # one nn.LSTM. On these single-layer modules it did nothing except emit
        # a UserWarning, so it is omitted. Use an explicit nn.Dropout between
        # lstm1 and lstm2 if regularisation is wanted.
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True,
                             bidirectional=True)
        self.lstm2 = nn.LSTM(hidden_dim * 2, hidden_dim, batch_first=True,
                             bidirectional=True)
        # 4 * hidden_dim: two sentence vectors (claim, evidence), each the
        # concatenation of a forward and a backward state of size hidden_dim.
        self.dense1 = nn.Linear(hidden_dim * 4, hidden_dim)
        self.dense2 = nn.Linear(hidden_dim, 1)
        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()

    def _encode(self, token_ids):
        """Encode a batch of id sequences into (batch, 2 * hidden_dim) vectors."""
        embedded = self.embedding(token_ids)
        lstm1_out, _ = self.lstm1(embedded)
        _, (h_n, _) = self.lstm2(lstm1_out)
        # h_n is (2, batch, hidden_dim): the final state of the forward pass
        # and the final state of the backward pass. Indexing the output
        # sequence at the last time step instead would pair the forward state
        # with a backward state that has only processed a single position.
        # NOTE(review): with trailing zero-padding the forward state still runs
        # over the pad positions; packing by length would avoid that.
        return torch.cat((h_n[0], h_n[1]), dim=1)

    def forward(self, claim, evidence):
        """Return a (batch, 1) tensor of sigmoid scores for each pair.

        Args:
            claim: integer id tensor for the claims, batch-first.
            evidence: integer id tensor for the evidence texts, batch-first.
        """
        claim_rep = self._encode(claim)
        evidence_rep = self._encode(evidence)

        concat_rep = torch.cat((claim_rep, evidence_rep), dim=1)

        dense1_out = self.tanh(self.dense1(concat_rep))
        output = self.sigmoid(self.dense2(dense1_out))

        return output

# Training

In [None]:
import torch.optim as optim
from torch.utils.data import random_split
from sklearn.metrics import f1_score

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Initialize the model
embedding_dim = 256
hidden_dim = 256
model = ClaimEvidenceModel(vocab_size, embedding_dim, hidden_dim).to(device)

# Define loss function and optimizer.
# BCELoss takes probabilities, which the model's final sigmoid produces.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Split the dataset into training (80%) and validation (remainder) sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Training loop
num_epochs = 5
# Start below the smallest possible F1 (0.0) so the first epoch always writes
# a checkpoint, even if validation F1 never rises above zero.
best_val_f1 = -1.0

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch_claim_seq, batch_evidence_seq, batch_labels in train_loader:

        # Send data to the same device as the model
        batch_claim_seq = batch_claim_seq.to(device)
        batch_evidence_seq = batch_evidence_seq.to(device)
        batch_labels = batch_labels.to(device)

        # Reset gradients
        optimizer.zero_grad()

        # Forward pass. squeeze(-1) drops only the trailing size-1 dimension,
        # so a final batch holding a single example stays 1-D and still
        # matches the shape of batch_labels.
        outputs = model(batch_claim_seq, batch_evidence_seq)
        loss = criterion(outputs.squeeze(-1), batch_labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        # Report any parameter that has become NaN or Inf after this step
        for name, param in model.named_parameters():
            if not torch.isfinite(param).all():
                print(f"NaN or Inf found in {name}")

    # Mean loss per batch; guard against an empty loader.
    train_loss /= max(len(train_loader), 1)

    # Validation phase
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch_claim_seq, batch_evidence_seq, batch_labels in val_loader:
            batch_claim_seq = batch_claim_seq.to(device)
            batch_evidence_seq = batch_evidence_seq.to(device)

            outputs = model(batch_claim_seq, batch_evidence_seq)
            # Threshold the probabilities at 0.5; squeeze(-1) keeps preds 1-D
            # so extend() also works when the batch has a single example.
            preds = (outputs.squeeze(-1) > 0.5).int()
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(batch_labels.cpu().numpy())

    val_f1 = f1_score(val_labels, val_preds)
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val F1: {val_f1:.4f}')

    # Keep only the weights with the best validation F1 seen so far
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), 'best_evidence_model.pth')
        print('Model saved.')

cuda


