In [1]:
import torch
import collections
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from torch import Tensor
from typing import List
import json
import sklearn
import random
import matplotlib.pyplot as plt

In [2]:



# Load SQuAD data
with open("squad2.json", 'r') as file:
    squad_data = json.load(file)

# Extract dialogues
dialogues = []
for data in squad_data['data']:
    for paragraph in data['paragraphs']:
        for qas in paragraph['qas']:
            if not qas['is_impossible']:
                dialogues.append((qas['question'], qas['answers'][0]['text']))




In [3]:
import torch
from torch import nn

class WordEmbeddings(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, input):
        return self.embedding(input)


In [4]:
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from torch import Tensor
from typing import List

# Build vocab
def build_vocab(sentences: List[List[str]], min_freq: int = 3):
    counter = Counter([word for sentence in sentences for word in sentence])
    vocab = {word: i+4 for i, (word, freq) in enumerate(counter.items()) if freq >= min_freq}
    vocab['<PAD>'] = 0  # Padding
    vocab['<SOS>'] = 1  # Start of sentence
    vocab['<EOS>'] = 2  # End of sentence
    vocab['<UNK>'] = 3  # Unknown
    return vocab

# Build reverse vocab
def build_reverse_vocab(vocab):
    return {i: word for word, i in vocab.items()}

# Tokenize and convert to integer sequences
def preprocess_data(data: List[str], vocab):
    sequences = [[vocab.get(word, vocab['<UNK>']) for word in sentence.split()] for sentence in data]
    return sequences

# Convert to tensor and pad
def pad_and_convert_to_tensor(sequences: List[List[int]]):
    max_length = max(len(seq) for seq in sequences)
    padded_sequences = [sequence + [0] * (max_length - len(sequence)) for sequence in sequences]
    return pad_sequence([Tensor(sequence) for sequence in padded_sequences], batch_first=True, padding_value=0)

def pad_and_convert_to_tensor(sequences: List[List[int]]):
    max_length = max(len(seq) for seq in sequences)
    padded_sequences = [sequence + [0] * (max_length - len(sequence)) for sequence in sequences]
    return pad_sequence([Tensor(sequence) for sequence in padded_sequences], batch_first=True, padding_value=0)

from torch.utils.data import DataLoader, Dataset

class QADataset(Dataset):
    def __init__(self, question_tensors, answer_tensors):
        self.question_tensors = question_tensors
        self.answer_tensors = answer_tensors

    def __getitem__(self, index):
        return self.question_tensors[index], self.answer_tensors[index]

    def __len__(self):
        return len(self.question_tensors)




# Extract questions and answers
questions, answers = zip(*dialogues)

# Build vocabularies
question_vocab = build_vocab(questions)
answer_vocab = build_vocab(answers)
reverse_answer_vocab = build_reverse_vocab(answer_vocab)

# Preprocess questions and answers
question_sequences = preprocess_data(questions, question_vocab)
answer_sequences = preprocess_data(answers, answer_vocab)




In [5]:
# Convert to tensors and pad
question_tensors = pad_and_convert_to_tensor(question_sequences)
answer_tensors = pad_and_convert_to_tensor(answer_sequences)

# Define the proportions
train_ratio = 0.70
val_ratio = 0.15
test_ratio = 0.15

# Make sure the proportions sum to 1
assert train_ratio + val_ratio + test_ratio == 1, "Ratios must sum to 1"

# Calculate the sizes
total_size = len(question_tensors)
train_size = int(total_size * train_ratio)
val_size = int(total_size * val_ratio)
test_size = total_size - train_size - val_size

# Split the dataset
train_question_tensors = question_tensors[:train_size]
val_question_tensors = question_tensors[train_size:train_size+val_size]
test_question_tensors = question_tensors[train_size+val_size:]

train_answer_tensors = answer_tensors[:train_size]
val_answer_tensors = answer_tensors[train_size:train_size+val_size]
test_answer_tensors = answer_tensors[train_size+val_size:]

# Create datasets
train_dataset = QADataset(train_question_tensors, train_answer_tensors)
val_dataset = QADataset(val_question_tensors, val_answer_tensors)
test_dataset = QADataset(test_question_tensors, test_answer_tensors)

# Create data loaders
batch_size = 128  # You can change this value

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [6]:
# Extract a batch from the DataLoader
sample_batch = next(iter(train_loader))

# Print the integer sequences
print("Sample Question Sequences:")
print(sample_batch[0])

print("\nSample Answer Sequences:")
print(sample_batch[1])

Sample Question Sequences:
tensor([[3., 3., 3.,  ..., 0., 0., 0.],
        [3., 3., 3.,  ..., 0., 0., 0.],
        [3., 3., 3.,  ..., 0., 0., 0.],
        ...,
        [3., 3., 3.,  ..., 0., 0., 0.],
        [3., 3., 3.,  ..., 0., 0., 0.],
        [3., 3., 3.,  ..., 0., 0., 0.]])

Sample Answer Sequences:
tensor([[3., 3., 3.,  ..., 0., 0., 0.],
        [3., 0., 0.,  ..., 0., 0., 0.],
        [3., 3., 3.,  ..., 0., 0., 0.],
        ...,
        [3., 3., 0.,  ..., 0., 0., 0.],
        [3., 3., 3.,  ..., 0., 0., 0.],
        [3., 3., 3.,  ..., 0., 0., 0.]])


In [7]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        src = src.long()  #
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell


In [8]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)

        self.fc_out = nn.Linear(hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        input = input.long()  # Convert the input tensor to torch.long datatype
        input = input.unsqueeze(0)

        embedded = self.dropout(self.embedding(input))

        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        prediction = self.fc_out(output.squeeze(0))

        return prediction, hidden, cell


In [9]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        hidden, cell = self.encoder(src)

        input = trg[0, :]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden[:, :hidden.shape[1]], cell[:, :cell.shape[1]])

            outputs[t] = output

            teacher_force = random.random() < teacher_forcing_ratio

            top1 = output.argmax(1)

            input = trg[t] if teacher_force else top1

        return outputs



In [10]:
# Instantiate the model, optimizer and loss function
INPUT_DIM = len(question_vocab)
OUTPUT_DIM = len(answer_vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index = answer_vocab['<PAD>'])

print("Encoder Model:")
print(enc)
print("\nDecoder Model:")
print(dec)
print("\nSeq2Seq Model:")
print(model)


Encoder Model:
Encoder(
  (embedding): Embedding(134, 256)
  (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
)

Decoder Model:
Decoder(
  (embedding): Embedding(192, 256)
  (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
  (fc_out): Linear(in_features=512, out_features=192, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

Seq2Seq Model:
Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(134, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(192, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=192, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)


In [11]:
import time
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    step_losses = []
    start_time = time.time()

    for i, batch in enumerate(iterator):
        src = batch[0].to(device)
        trg = batch[1].to(device)

        optimizer.zero_grad()

        # Check if the batch size is equal to BATCH_SIZE
        if len(src) == batch_size:
            output = model(src, trg)
        else:
            # Pad the batch to have a length divisible by BATCH_SIZE
            padding_length = batch_size - (len(src) % batch_size)
            padded_src = torch.cat((src, torch.zeros(padding_length, *src.shape[1:], dtype=src.dtype)), dim=0)
            padded_trg = torch.cat((trg, torch.zeros(padding_length, *trg.shape[1:], dtype=trg.dtype)), dim=0)
            output = model(padded_src, padded_trg)

        output_dim = output.shape[-1]

        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()
        step_losses.append(loss.item())

        if (i + 1) % 100 == 0:  # Adjust the frequency based on your preference
            elapsed_time = time.time() - start_time
            steps_per_sec = (i + 1) / elapsed_time
            avg_loss = epoch_loss / (i + 1)
            print(f"Step: {i+1}/{len(iterator)} | Loss: {avg_loss:.4f} | Steps/sec: {steps_per_sec:.2f}")

    return epoch_loss / len(iterator), step_losses


In [12]:
def evaluate(model, iterator, criterion):

    model.eval()

    epoch_loss = 0

    with torch.no_grad():

        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0) #turn off teacher forcing

            output_dim = output.shape[-1]

            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)




In [13]:
# Set the training parameters
N_EPOCHS = 10
CLIP = 1

# Create lists to store the training progress
train_losses = []
valid_losses = []
step_losses = []

# Training loop
for epoch in range(N_EPOCHS):
    train_loss, step_loss = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, val_loader, criterion)

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    step_losses.extend(step_loss)







# Plot the training progress
plt.plot(range(1, N_EPOCHS+1), train_losses, label='Training Loss')
plt.plot(range(1, N_EPOCHS+1), valid_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Plot the step-wise losses
plt.plot(range(1, len(step_losses)+1), step_losses)
plt.xlabel('Step')
plt.ylabel('Loss')
plt.show()


RuntimeError: Expected hidden[0] size (2, 43, 512), got [2, 40, 512]