In [13]:
pip install datasets



In [14]:
from datasets import load_dataset
import nltk
import random
from nltk.corpus import words
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import string
import re

from datasets import load_dataset
import nltk
import random
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

In [15]:
dataset = load_dataset("race", "all")

# Extracting sentences
def extract_sentences(data):
    sentences = []
    for sample in data:
        passage = sample['article']
        sentences.extend(passage.split('.'))
    return [sentence.strip() for sentence in sentences if sentence.strip()]

train_sentences = extract_sentences(dataset['train'])
validation_sentences = extract_sentences(dataset['validation'])
sentences = train_sentences
#  + validation_sentences

In [16]:
sentences = sentences[:10000]

In [17]:
#if a word is valid
def is_valid_word(word):
    english_vocab = set(words.words())
    return word.lower() in english_vocab and word.isalpha()

In [18]:
import random
from transformers import AutoTokenizer

#Hugging Face tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

#tokenizer's vocabulary once
vocab_set = set(tokenizer.get_vocab().keys())

def is_valid_word(word):
    return word.isalpha() and word in vocab_set

def create_fill_in_blank_data(sentences, batch_size=32):
    data = []

    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]

        # Tokenizing all sentences in the batch
        tokenized_sentences = [tokenizer.tokenize(sentence) for sentence in batch]

        for words_in_sentence in tokenized_sentences:
            # Split into halves
            mid_index = len(words_in_sentence) // 2
            latter_half = words_in_sentence[mid_index:]

            # Skip iflatter half is too short
            if len(latter_half) < 2:
                continue

            #finding valid blank words
            valid_candidates = [i for i, word in enumerate(latter_half) if is_valid_word(word)]
            if not valid_candidates:
                continue

            #selecting a random valid blank word
            blank_index = random.choice(valid_candidates)
            blank_word = latter_half[blank_index]

            # Replacing blank word with a placeholder
            latter_half[blank_index] = "[BLANK]"

            # Append processed sentence parts and the blank word to the dataset
            data.append({
                "part_a": " ".join(words_in_sentence[:mid_index]),
                "part_b": " ".join(latter_half),
                "part_b_reversed": " ".join(reversed(latter_half)),
                "blank_word": blank_word,
            })

    return data
#calling function
preprocessed_data = create_fill_in_blank_data(sentences)


In [19]:
# Split
train_data, val_data = train_test_split(preprocessed_data, test_size=0.2, random_state=42)

In [20]:
# Cleaning blank words
def clean_blank_word(blank_word):
    cleaned_word = blank_word.strip(string.punctuation + " ")
    if not cleaned_word or cleaned_word in string.punctuation or re.fullmatch(r'\d+', cleaned_word):
        return None
    if re.fullmatch(r'[\d-]+', cleaned_word):
        return None
    return cleaned_word

In [21]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
max_len = 50

# Dataset class
class FillBlankDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        input_a = tokenizer(item['part_a'], padding="max_length", truncation=True, max_length=max_len, return_tensors="pt")['input_ids'].squeeze()
        input_b = tokenizer(item['part_b_reversed'], padding="max_length", truncation=True, max_length=max_len, return_tensors="pt")['input_ids'].squeeze()
        cleaned_blank_word = clean_blank_word(item['blank_word'])
        if cleaned_blank_word is None:
            return None
        target_tokens = tokenizer(cleaned_blank_word, add_special_tokens=False, return_tensors="pt")['input_ids'].squeeze()
        target = target_tokens[0].item() if target_tokens.dim() > 0 else target_tokens.item()
        return {"input_a": input_a, "input_b": input_b, "target": target}

In [22]:
# DataLoader
train_dataset = FillBlankDataset(train_data)
val_dataset = FillBlankDataset(val_data)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=lambda x: list(filter(None, x)))
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=lambda x: list(filter(None, x)))

In [23]:
# LSTM Model
class LSTMModel(torch.nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, pad_idx):
        super(LSTMModel, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embed_size, padding_idx=pad_idx)
        self.lstm = torch.nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        return self.fc(lstm_out[:, -1, :])

In [24]:
# Initialize models and optimizers
vocab_size = len(tokenizer.vocab)
embed_size = 128
hidden_size = 256
num_layers = 2
pad_idx = tokenizer.pad_token_id

forward_lstm = LSTMModel(vocab_size, embed_size, hidden_size, num_layers, pad_idx).to("cuda")
backward_lstm = LSTMModel(vocab_size, embed_size, hidden_size, num_layers, pad_idx).to("cuda")

criterion = torch.nn.CrossEntropyLoss()
forward_optimizer = torch.optim.Adam(forward_lstm.parameters(), lr=0.001)
backward_optimizer = torch.optim.Adam(backward_lstm.parameters(), lr=0.001)

In [25]:
# Training func
def train_model(model, optimizer, dataloader, input_key):
    model.train()
    total_loss = 0
    for batch in dataloader:
        inputs = torch.stack([item[input_key] for item in batch]).to("cuda")
        targets = torch.tensor([item["target"] for item in batch]).to("cuda")

        outputs = model(inputs)
        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [26]:
# Evaluating func
def evaluate_model(model, dataloader, input_key):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in dataloader:
            inputs = torch.stack([item[input_key] for item in batch]).to("cuda")
            targets = torch.tensor([item["target"] for item in batch]).to("cuda")

            outputs = model(inputs)
            predictions = torch.argmax(outputs, dim=1)
            correct += (predictions == targets).sum().item()
            total += targets.size(0)

    return correct / total

In [27]:
for epoch in range(5):
    forward_loss = train_model(forward_lstm, forward_optimizer, train_loader, "input_a")
    backward_loss = train_model(backward_lstm, backward_optimizer, train_loader, "input_b")

    forward_acc = evaluate_model(forward_lstm, val_loader, "input_a")
    backward_acc = evaluate_model(backward_lstm, val_loader, "input_b")

    print(f"Epoch {epoch + 1}:")
    print(f"  Forward LSTM Loss: {forward_loss:.4f}, Accuracy: {forward_acc:.4f}")
    print(f"  Backward LSTM Loss: {backward_loss:.4f}, Accuracy: {backward_acc:.4f}")


Epoch 1:
  Forward LSTM Loss: 7.8727, Accuracy: 0.0201
  Backward LSTM Loss: 7.8644, Accuracy: 0.0592
Epoch 2:
  Forward LSTM Loss: 6.7643, Accuracy: 0.0592
  Backward LSTM Loss: 6.7656, Accuracy: 0.0592
Epoch 3:
  Forward LSTM Loss: 6.6623, Accuracy: 0.0592
  Backward LSTM Loss: 6.6654, Accuracy: 0.0592
Epoch 4:
  Forward LSTM Loss: 6.6349, Accuracy: 0.0592
  Backward LSTM Loss: 6.6371, Accuracy: 0.0592
Epoch 5:
  Forward LSTM Loss: 6.6228, Accuracy: 0.0592
  Backward LSTM Loss: 6.6237, Accuracy: 0.0592


**Bonus**

In [28]:
import torch.nn as nn
import torch.optim as optim

#Bidirectional LSTM and GRU
class BidirectionalLSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, pad_idx):
        super(BidirectionalLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, vocab_size)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        return self.fc(lstm_out[:, -1, :])  # Output for last timestep

In [29]:
class GRUModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, pad_idx):
        super(GRUModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=pad_idx)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        embedded = self.embedding(x)
        gru_out, _ = self.gru(embedded)
        return self.fc(gru_out[:, -1, :])  # Output for last timestep

In [30]:
# Initialize alternate models
bidirectional_lstm = BidirectionalLSTMModel(vocab_size, embed_size, hidden_size, num_layers, pad_idx).to("cuda")
gru_model = GRUModel(vocab_size, embed_size, hidden_size, num_layers, pad_idx).to("cuda")

# Optimizers
bilstm_optimizer = optim.Adam(bidirectional_lstm.parameters(), lr=0.001)
gru_optimizer = optim.Adam(gru_model.parameters(), lr=0.001)

In [31]:
# Ensemble Prediction
def ensemble_prediction(forward_output, backward_output):
    alpha = 0.5  # Weight for forward model
    beta = 0.5   # Weight for backward model

    combined_output = alpha * forward_output + beta * backward_output
    predictions = torch.argmax(combined_output, dim=1)
    return predictions

In [32]:
for epoch in range(5):
    # Train LSTMs
    forward_loss = train_model(forward_lstm, forward_optimizer, train_loader, "input_a")
    backward_loss = train_model(backward_lstm, backward_optimizer, train_loader, "input_b")
    bilstm_loss = train_model(bidirectional_lstm, bilstm_optimizer, train_loader, "input_a")
    gru_loss = train_model(gru_model, gru_optimizer, train_loader, "input_a")

    # Evaluating LSTMs
    forward_acc = evaluate_model(forward_lstm, val_loader, "input_a")
    backward_acc = evaluate_model(backward_lstm, val_loader, "input_b")
    bilstm_acc = evaluate_model(bidirectional_lstm, val_loader, "input_a")
    gru_acc = evaluate_model(gru_model, val_loader, "input_a")

    # Ensemble Evaluation
    total_correct = 0
    total_samples = 0
    forward_lstm.eval()
    backward_lstm.eval()
    with torch.no_grad():
        for batch in val_loader:
            input_a = torch.stack([item["input_a"] for item in batch]).to("cuda")
            input_b = torch.stack([item["input_b"] for item in batch]).to("cuda")
            targets = torch.tensor([item["target"] for item in batch]).to("cuda")

            forward_outputs = forward_lstm(input_a)
            backward_outputs = backward_lstm(input_b)

            ensemble_preds = ensemble_prediction(forward_outputs, backward_outputs)
            total_correct += (ensemble_preds == targets).sum().item()
            total_samples += targets.size(0)

    ensemble_acc = total_correct / total_samples

    # Report Metrics
    print(f"Epoch {epoch + 1}:")
    print(f"  Forward LSTM Loss: {forward_loss:.4f}, Accuracy: {forward_acc:.4f}")
    print(f"  Backward LSTM Loss: {backward_loss:.4f}, Accuracy: {backward_acc:.4f}")
    print(f"  Bidirectional LSTM Loss: {bilstm_loss:.4f}, Accuracy: {bilstm_acc:.4f}")
    print(f"  GRU Loss: {gru_loss:.4f}, Accuracy: {gru_acc:.4f}")
    print(f"  Ensemble Accuracy: {ensemble_acc:.4f}")


Epoch 1:
  Forward LSTM Loss: 6.6153, Accuracy: 0.0592
  Backward LSTM Loss: 6.6173, Accuracy: 0.0592
  Bidirectional LSTM Loss: 7.8534, Accuracy: 0.0592
  GRU Loss: 7.9930, Accuracy: 0.0592
  Ensemble Accuracy: 0.0592
Epoch 2:
  Forward LSTM Loss: 6.6086, Accuracy: 0.0592
  Backward LSTM Loss: 6.6127, Accuracy: 0.0592
  Bidirectional LSTM Loss: 6.7644, Accuracy: 0.0592
  GRU Loss: 6.9115, Accuracy: 0.0592
  Ensemble Accuracy: 0.0592
Epoch 3:
  Forward LSTM Loss: 6.6076, Accuracy: 0.0592
  Backward LSTM Loss: 6.6084, Accuracy: 0.0592
  Bidirectional LSTM Loss: 6.6640, Accuracy: 0.0592
  GRU Loss: 6.7108, Accuracy: 0.0520
  Ensemble Accuracy: 0.0592
Epoch 4:
  Forward LSTM Loss: 6.6040, Accuracy: 0.0592
  Backward LSTM Loss: 6.6049, Accuracy: 0.0592
  Bidirectional LSTM Loss: 6.6385, Accuracy: 0.0592
  GRU Loss: 6.5142, Accuracy: 0.0443
  Ensemble Accuracy: 0.0592
Epoch 5:
  Forward LSTM Loss: 6.6012, Accuracy: 0.0592
  Backward LSTM Loss: 6.6040, Accuracy: 0.0592
  Bidirectional LSTM L