# In [1]:
import torch
from torch import nn
import pickle 

# Seed PyTorch's random number generator so that weight initialisation is
# reproducible from run to run.
torch.manual_seed(42)
# This script pins the device to the CPU.
device = torch.device('cpu') 
print('Device:',device)

# Load the source- and target-language vocabularies from pickle files in the
# current working directory.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load vocabulary files that come from a trusted source.
with open('src_vocab.pkl', 'rb') as f:
    src_vocab = pickle.load(f)

with open('tgt_vocab.pkl', 'rb') as f:
    tgt_vocab = pickle.load(f)

# Report the vocabulary sizes (source first, then target).
print(len(src_vocab))
print(len(tgt_vocab))

# Parameters for the model
# NOTE(review): BATCH_SIZE, embedding_dim and units are presumably passed to
# the Encoder/Decoder constructors — the instantiation is not visible in this
# part of the file, confirm against the caller.
BATCH_SIZE = 64
embedding_dim = 256
units = 1024
# Vocabulary sizes, taken from the length of each loaded vocabulary object.
vocab_inp_size = len(src_vocab)
vocab_tar_size = len(tgt_vocab)

class Encoder(nn.Module):
    """Embed token indices and encode them with a single-layer GRU.

    Args:
        vocab_size: Number of entries in the embedding table.
        embedding_dim: Size of each embedding vector (the GRU input size).
        enc_units: Hidden size of the GRU.
        batch_sz: Stored on the instance as ``batch_sz``; this class does not
            otherwise use it.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, enc_units, batch_first=True)

    def forward(self, x, hidden=None):
        """Encode a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices; for batched input the layout
                is ``(batch, seq_len)`` because the GRU is ``batch_first``.
            hidden: Initial GRU state of shape ``(1, batch, enc_units)``.
                When omitted (``None``) the GRU starts from an all-zero
                state, which matches ``initialize_hidden_state``.

        Returns:
            A tuple ``(output, state)``: the per-step GRU outputs
            ``(batch, seq_len, enc_units)`` and the final hidden state
            ``(1, batch, enc_units)``.
        """
        embedded = self.embedding(x)
        output, state = self.gru(embedded, hidden)
        return output, state

    def initialize_hidden_state(self, batch_size):
        """Return an all-zero initial hidden state for ``batch_size`` sequences.

        The tensor is created with the dtype and device of the GRU weights so
        that it stays usable after the module has been moved or cast
        (``.to(device)``, ``.double()``, ...).

        Returns:
            Zero tensor of shape ``(1, batch_size, enc_units)``.
        """
        reference = self.gru.weight_hh_l0
        return torch.zeros(
            (1, batch_size, self.enc_units),
            dtype=reference.dtype,
            device=reference.device,
        )



import torch.nn.functional as F


class BahdanauAttention(nn.Module):
    """Additive attention: score each value against a single query vector.

    Args:
        units: Feature size of both the query and the values; all three
            linear layers are built from it.
    """

    def __init__(self, units):
        super().__init__()
        self.W1 = nn.Linear(units, units)
        self.W2 = nn.Linear(units, units)
        self.V = nn.Linear(units, 1)

    def forward(self, query, values):
        """Compute the attention-weighted sum of ``values``.

        Args:
            query: Tensor of shape ``(batch, units)``.
            values: Tensor of shape ``(batch, max_len, units)``.

        Returns:
            A tuple ``(context_vector, attention_weights)`` with shapes
            ``(batch, units)`` and ``(batch, max_len, 1)``; the weights are
            normalised with a softmax over the ``max_len`` axis.
        """
        # Give the query a length-1 time axis, then project query and values.
        projected_query = self.W1(query.unsqueeze(1))
        projected_values = self.W2(values)

        # Repeat the projected query along the time axis of the values.
        projected_query = projected_query.expand(projected_values.size())

        # One scalar energy per time step -> (batch, max_len, 1).
        energies = self.V(torch.tanh(projected_query + projected_values))
        attention_weights = energies.softmax(dim=1)

        # Collapse the time axis into a single weighted-average vector.
        context_vector = (attention_weights * values).sum(dim=1)
        return context_vector, attention_weights


class Decoder(nn.Module):
    """GRU decoder that attends over the encoder outputs on every call.

    Args:
        vocab_size: Size of the target vocabulary (embedding rows and number
            of output logits).
        embedding_dim: Size of each target-token embedding.
        dec_units: GRU hidden size; also the feature size the attention
            layer is built with, so the encoder outputs must have this size.
        batch_sz: Stored on the instance as ``batch_sz``; this class does not
            otherwise use it.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # The GRU consumes the attention context concatenated with the
        # token embedding, hence the summed input size.
        self.gru = nn.GRU(embedding_dim + dec_units, dec_units, batch_first=True)
        self.fc = nn.Linear(dec_units, vocab_size)

        # Attention over the encoder outputs.
        self.attention = BahdanauAttention(dec_units)

    def forward(self, x, hidden, enc_output):
        """Run the decoder on ``x`` and return vocabulary logits.

        Args:
            x: Token indices; assumed to be ``(batch, 1)`` (one token per
                sequence) so that the embedding lines up with the context
                vector — confirm against the caller.
            hidden: Previous decoder state, either ``(batch, dec_units)`` or
                ``(1, batch, dec_units)``.
            enc_output: Encoder outputs handed to the attention layer.

        Returns:
            A tuple ``(logits, state, attention_weights)`` where ``logits``
            is ``(batch, vocab_size)`` for single-step input, ``state`` is
            the new GRU state with its leading axis squeezed, and
            ``attention_weights`` comes straight from the attention layer.
        """
        # Attention expects a 2-D query, so drop a leading size-1 axis.
        if hidden.dim() == 3:
            hidden = hidden.squeeze(0)

        context, attention_weights = self.attention(hidden, enc_output)

        # (batch, 1, embedding_dim) for single-token input.
        embedded = self.embedding(x)

        # Context goes first, then the embedding, along the feature axis.
        gru_input = torch.cat([context.unsqueeze(1), embedded], dim=-1)

        # The GRU wants its state back in (1, batch, dec_units) layout.
        gru_out, new_state = self.gru(gru_input, hidden.unsqueeze(0))

        # Flatten batch and time axes before projecting to the vocabulary.
        logits = self.fc(gru_out.reshape(-1, gru_out.size(2)))

        return logits, new_state.squeeze(0), attention_weights

    



# Output:
# Device: cpu
# 2110
# 2146
