In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor
import torch.nn.functional as F
import math
from torch import optim

In [None]:
def generate_vocab(sentences):
  """
  Build a word-level vocabulary from whitespace-separated sentences.

  Words are numbered in order of first appearance, followed by the special
  tokens "[PAD]" and "[UNK]". The ordering is deterministic, so the same
  corpus always yields the same ids (a plain `set` would reorder words on
  every interpreter start because of string hash randomisation).

    sentences: (list of str) sentences
    unique_words: (list of str) unique words in the training examples
    V: (int) Size of Vocabulary
    word_to_id: (dict) mapping of words to ids
    id_to_word: (dict) mapping of ids to words

  Returns:
    (V, word_to_id, id_to_word)
  """
  special_tokens = ["[PAD]", "[UNK]"]

  # dict.fromkeys de-duplicates while keeping first-seen order.
  seen = dict.fromkeys(
      word for sentence in sentences for word in sentence.split()
  )
  # Drop special tokens that happen to occur in the corpus so they are not
  # listed twice (which would make V disagree with len(word_to_id)).
  unique_words = [w for w in seen if w not in special_tokens] + special_tokens
  V = len(unique_words)

  word_to_id = {w: i for i, w in enumerate(unique_words)}
  id_to_word = {i: w for w, i in word_to_id.items()}
  return V, word_to_id, id_to_word

In [None]:
def tokenization(sentences, d=32, padding_size=4, decoder=False):
  """
  Wrap every sentence in <START>/<END>, build the vocabulary and convert the
  sentences into fixed-length rows of word ids.

    sentences: (list of str) sentences
    padding_size: (int) fixed row length n; longer rows are truncated,
                  shorter rows are filled with the [PAD] id
    d: (int) embedding dimension (passed through unchanged)
    decoder: (bool) when True, return a teacher-forcing pair (see below)
    V: (int) Size of Vocabulary
    word_to_id: (dict) mapping of word to ids
    id_to_word: (dict) mapping of ids to words
    m: (int) number of training examples
    n: (int) sequence length in a single example
    word_ids: (numpy array) tokenized word IDs of shape (m, n)
    word_ids_shifted: (numpy array, optional) decoder input of shape (m, n)

  Returns:
    decoder=False: word_to_id, id_to_word, V, d, n, m, word_ids
                   where each row is "<START> w1 ... wk <END>".
    decoder=True:  word_to_id, id_to_word, V, d, n, m, word_ids_shifted, word_ids
                   where word_ids_shifted rows are "<START> w1 ... wk"
                   (decoder input) and word_ids rows are "w1 ... wk <END>"
                   (targets), so position t of the input is paired with the
                   token that follows it.
  """
  sentences = ["<START> "+s+" <END>" for s in sentences]

  V, word_to_id, id_to_word = generate_vocab(sentences)
  m = len(sentences)
  n = padding_size

  pad_id = word_to_id["[PAD]"]
  unk_id = word_to_id["[UNK]"]

  def fill_row(row, words):
    # Map words to ids (unknown words -> [UNK]) and truncate to n tokens.
    ids = [word_to_id.get(word, unk_id) for word in words[:n]]
    row[:len(ids)] = ids

  word_ids = np.full((m, n), pad_id, dtype=int)
  # Independent array: a slice of word_ids would be a view, and writing the
  # targets afterwards would silently overwrite the decoder input.
  word_ids_shifted = np.full((m, n), pad_id, dtype=int) if decoder else None

  for i, sentence in enumerate(sentences):
    words = sentence.split()
    if decoder:
      fill_row(word_ids_shifted[i], words[:-1])  # input: drop trailing <END>
      fill_row(word_ids[i], words[1:])           # target: drop leading <START>
    else:
      fill_row(word_ids[i], words)

  if decoder:
    return word_to_id, id_to_word, V, d, n, m, word_ids_shifted, word_ids
  return word_to_id, id_to_word, V, d, n, m, word_ids

In [None]:
# Toy parallel corpus: English source sentences and their Spanish translations.
sentences_eng = [
    'the dog is not sitting',
    'the cat is sitting',
    'a cat is playing',
    'the dog is not playing',
    'a cat is not sitting',
]

sentences_spa = [
    'el perro no esta sentado',
    'el gato esta sentado',
    'un gato esta jugando',
    'el perro no esta jugando',
    'un gato no esta sentado',
]

# Encoder side: vocabulary and padded id matrix for the English sentences.
(
    word_to_id_en,
    id_to_word_en,
    vocab_size_en,
    dimension,
    sequence_length_en,
    batch_size,
    word_ids_en,
) = tokenization(sentences_eng, d=8, padding_size=7)

# Decoder side: vocabulary plus the decoder-input and target id matrices
# returned by tokenization(..., decoder=True) for the Spanish sentences.
(
    word_to_id_dec,
    id_to_word_dec,
    vocab_size_dec,
    dimension,
    sequence_length_dec,
    batch_size,
    word_ids_shifted,
    target,
) = tokenization(sentences_spa, d=8, padding_size=7, decoder=True)

In [None]:
# Model hyperparameters.
heads = 2                      # number of attention heads
num_layers_enc_dec = 6         # number of stacked encoder layers and decoder layers
expansion_dim = 4 * dimension  # hidden width of the feed-forward sub-layer

In [None]:
class Embedding(nn.Module):
  """Learned lookup table that maps integer word ids to `dimension`-sized vectors."""

  def __init__(self, dimension, vocab_size):
    """
      dimension: (int) size of each embedding vector
      vocab_size: (int) number of rows in the lookup table
    """
    super().__init__()
    self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=dimension)

  def forward(self, x):
    """Return the embedding vectors for the id tensor `x` (one vector per id)."""
    vectors = self.embeddings(x)
    return vectors

In [None]:
class Position_Encoding(nn.Module):
  """
  Fixed sinusoidal positional encoding.

  For position p and pair index i:
    pos_enc[p, 2i]   = sin(p / 10000 ** (2i / dimension))
    pos_enc[p, 2i+1] = cos(p / 10000 ** (2i / dimension))

  The table is stored as a non-persistent buffer, so it follows the module
  through `.to(device)` without adding a key to `state_dict()`.
  """

  def __init__(self, dimension, seq_len):
    """
      dimension: (int) embedding dimension
      seq_len: (int) maximum sequence length the table covers
    """
    super().__init__()
    position = torch.arange(seq_len, dtype=torch.float32).unsqueeze(1)  # (seq_len, 1)

    # Columns 2i and 2i+1 share one frequency: turn column index j into 2i.
    column = torch.arange(dimension)
    pair_exponent = (column - column % 2).to(torch.float32) / dimension
    div_term = 10000 ** pair_exponent                                   # (dimension,)

    angles = position / div_term                                        # (seq_len, dimension)
    pos_enc = torch.zeros_like(angles)
    pos_enc[:, 0::2] = torch.sin(angles[:, 0::2])
    pos_enc[:, 1::2] = torch.cos(angles[:, 1::2])

    self.register_buffer("pos_enc", pos_enc, persistent=False)

  def forward(self, x):
    """Add the encoding of the first x.shape[1] positions to `x` (batch, seq, dimension)."""
    return x + self.pos_enc[:x.shape[1]]

In [None]:
class SelfAttention(nn.Module):
  """
  Multi-head scaled dot-product attention followed by an output projection,
  a residual connection to the query input and layer normalisation.

  The Q/K/V projections are (head_dim x head_dim) linear maps that are shared
  by all heads; the input is split into heads before projecting.
  """

  def __init__(self, heads, dimension):
    """
      heads: (int) number of attention heads
      dimension: (int) model dimension; must be divisible by `heads`

    Raises:
      ValueError: if `dimension` is not a multiple of `heads`.
    """
    super().__init__()
    if dimension % heads != 0:
      raise ValueError(
          f"dimension ({dimension}) must be divisible by heads ({heads})"
      )
    self.heads = heads
    self.head_dim = dimension // heads
    self.dimension = dimension

    self.Q = nn.Linear(self.head_dim, self.head_dim)
    self.K = nn.Linear(self.head_dim, self.head_dim)
    self.V = nn.Linear(self.head_dim, self.head_dim)

    self.linear = nn.Linear(self.dimension, self.dimension)
    self.norm = nn.LayerNorm(dimension)

  def get_mask(self, attention_scores):
    """
    Build a lower-triangular (causal) mask of shape (1, 1, query_len, key_len)
    on the same device as `attention_scores`; 1 = keep, 0 = mask out.
    """
    batch, heads, query_len, key_len = attention_scores.shape
    mask = torch.tril(
        torch.ones((query_len, key_len), device=attention_scores.device)
    )
    return mask.unsqueeze(0).unsqueeze(1)

  def forward(self, q, k, v, m=False):
    """
      q: (batch, q_len, dimension) query input (also used for the residual)
      k: (batch, k_len, dimension) key input
      v: (batch, k_len, dimension) value input (same length as `k`)
      m: (bool) apply the causal mask so position t only attends to <= t

    Returns:
      (batch, q_len, dimension) tensor: LayerNorm(proj(attention) + q)
    """
    batch = q.shape[0]
    q_len = q.shape[1]
    k_len = k.shape[1]
    v_len = v.shape[1]

    # Split the feature axis into heads, then project each head.
    Q = self.Q(q.reshape(batch, q_len, self.heads, self.head_dim))
    K = self.K(k.reshape(batch, k_len, self.heads, self.head_dim))
    V = self.V(v.reshape(batch, v_len, self.heads, self.head_dim))

    # Scale by sqrt(d_k): the dot product runs over head_dim features.
    dot_Q_K = torch.einsum("bqhd, bkhd -> bhqk", [Q, K]) / math.sqrt(self.head_dim)

    if m:
      mask = self.get_mask(dot_Q_K)
      dot_Q_K = dot_Q_K.masked_fill(mask == 0, float("-inf"))

    softmax = F.softmax(dot_Q_K, dim=3)
    # Contract the key axis of the weights with the position axis of V so
    # every query receives a weighted average of the value vectors.
    attention = torch.einsum("bhqk, bkhd -> bqhd", [softmax, V])

    # Concatenate the heads back into a single feature axis.
    multi_heads = attention.reshape(batch, q_len, self.dimension)

    linear = self.linear(multi_heads)

    addnorm = self.norm(linear + q)

    return addnorm

In [None]:
class FeedForward(nn.Module):
  """
  Position-wise feed-forward sub-layer: Linear -> ReLU -> Linear, wrapped in a
  residual connection and followed by layer normalisation.
  """

  def __init__(self, dimension, expansion_dim):
    """
      dimension: (int) input and output feature size
      expansion_dim: (int) hidden feature size between the two linear layers
    """
    super().__init__()
    expand = nn.Linear(dimension, expansion_dim)
    activation = nn.ReLU()
    project = nn.Linear(expansion_dim, dimension)
    self.ff = nn.Sequential(expand, activation, project)
    self.norm = nn.LayerNorm(dimension)

  def forward(self, x):
    """Return LayerNorm(x + ff(x)); the output has the same shape as `x`."""
    residual = x
    transformed = self.ff(x)
    return self.norm(residual + transformed)

In [None]:
class Encoder(nn.Module):
  """One encoder layer: an attention sub-layer followed by a feed-forward sub-layer."""

  def __init__(self, heads, dimension, expansion_dim):
    """
      heads: (int) number of attention heads
      dimension: (int) model dimension
      expansion_dim: (int) hidden size of the feed-forward sub-layer
    """
    super().__init__()
    self.attention = SelfAttention(heads, dimension)
    self.ff = FeedForward(dimension, expansion_dim)

  def forward(self, q, k ,v):
    """Run unmasked attention over (q, k, v) and pass the result through the feed-forward sub-layer."""
    attended = self.attention(q, k, v)
    encoded = self.ff(attended)
    return encoded

In [None]:
class Decoder(nn.Module):
  """
  One decoder layer: masked attention over the decoder stream, attention over
  the encoder output, then a feed-forward sub-layer.
  """

  def __init__(self, heads, dimension, expansion_dim):
    """
      heads: (int) number of attention heads
      dimension: (int) model dimension
      expansion_dim: (int) hidden size of the feed-forward sub-layer
    """
    super().__init__()
    self.masked_attention = SelfAttention(heads, dimension)
    self.attention = SelfAttention(heads, dimension)
    self.ff = FeedForward(dimension, expansion_dim)

  def forward(self, q, k_dec, v_dec, k_en, v_en):
    """
      q, k_dec, v_dec: query/key/value inputs for the masked attention step
      k_en, v_en: key/value inputs for the second attention step

    The output of the masked step is used as the query of the second step.
    """
    causal_out = self.masked_attention(q, k_dec, v_dec, m=True)
    cross_out = self.attention(causal_out, k_en, v_en)
    return self.ff(cross_out)

In [None]:
class Transformer(nn.Module):
  """
  Encoder-decoder model: token embedding + positional encoding on both sides,
  N encoder layers, N decoder layers and a final projection to the decoder
  vocabulary.
  """

  def __init__(self, dimension, vocab_size_en, vocab_size_dec, sequence_length_en, sequence_length_dec, heads, expansion_dim, N):
    """
      dimension: (int) model / embedding dimension
      vocab_size_en: (int) encoder-side vocabulary size
      vocab_size_dec: (int) decoder-side vocabulary size
      sequence_length_en: (int) maximum encoder sequence length
      sequence_length_dec: (int) maximum decoder sequence length
      heads: (int) number of attention heads
      expansion_dim: (int) hidden size of the feed-forward sub-layers
      N: (int) number of encoder layers and of decoder layers
    """
    super().__init__()
    self.embedding_enc = Embedding(dimension, vocab_size_en)
    self.embedding_dec = Embedding(dimension, vocab_size_dec)
    self.pos_enc_en = Position_Encoding(dimension, sequence_length_en)
    self.pos_enc_dec = Position_Encoding(dimension, sequence_length_dec)
    self.encoder = nn.ModuleList([Encoder(heads, dimension, expansion_dim) for _ in range(N)])
    self.decoder = nn.ModuleList([Decoder(heads, dimension, expansion_dim) for _ in range(N)])
    self.linear = nn.Linear(dimension, vocab_size_dec)

  def forward(self, word_ids_en, word_ids_dec):
    """
      word_ids_en: (batch, enc_len) integer ids for the encoder
      word_ids_dec: (batch, dec_len) integer ids for the decoder

    Returns:
      Logits of shape (batch, vocab_size_dec, dec_len), i.e. with the class
      axis second, which is the layout nn.CrossEntropyLoss expects for
      sequence targets of shape (batch, dec_len).
    """
    embeddings_en = self.embedding_enc(word_ids_en)
    embeddings_dec = self.embedding_dec(word_ids_dec)
    enc_in = self.pos_enc_en(embeddings_en)
    dec_in = self.pos_enc_dec(embeddings_dec)
    for enc in self.encoder:
        enc_in = enc(enc_in, enc_in, enc_in)
    # Every decoder layer attends to the output of the last encoder layer.
    for dec in self.decoder:
        dec_in = dec(dec_in, dec_in, dec_in, enc_in, enc_in)
    out = self.linear(dec_in)  # (batch, dec_len, vocab_size_dec)
    # Swap the axes instead of reshaping: reshape would reinterpret the
    # memory layout and mix logits of different time steps and classes.
    out = out.transpose(1, 2)
    return out

In [None]:
# Build the model from the vocabulary sizes, sequence lengths and
# hyperparameters defined in the cells above.
transformer = Transformer(
    dimension=dimension,
    vocab_size_en=vocab_size_en,
    vocab_size_dec=vocab_size_dec,
    sequence_length_en=sequence_length_en,
    sequence_length_dec=sequence_length_dec,
    heads=heads,
    expansion_dim=expansion_dim,
    N=num_layers_enc_dec,
)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
transformer.to(device)

# Copy the NumPy id matrices into int64 tensors on the chosen device.
word_ids_en_tensor = torch.tensor(word_ids_en, dtype=torch.long, device=device)
word_ids_dec_tensor = torch.tensor(word_ids_shifted, dtype=torch.long, device=device)

target_label_tensor = torch.tensor(target, dtype=torch.long, device=device)

In [None]:
optimizer = optim.Adam(transformer.parameters(), lr=1e-4)
# Padding positions carry no information: exclude them from the loss so the
# model is not rewarded for predicting [PAD].
criterion = nn.CrossEntropyLoss(ignore_index=word_to_id_dec["[PAD]"])

In [None]:
NUM_EPOCHS = 5000
LOG_EVERY = 100  # print the loss every LOG_EVERY epochs instead of 5000 lines

# Training mode only needs to be set once; nothing in the loop switches it off.
transformer.train()
for epoch in range(NUM_EPOCHS):
    optimizer.zero_grad()
    output = transformer(word_ids_en_tensor, word_ids_dec_tensor)
    loss = criterion(output, target_label_tensor)
    loss.backward()
    optimizer.step()
    if epoch % LOG_EVERY == 0 or epoch == NUM_EPOCHS - 1:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

Epoch 0, Loss: 2.5989
Epoch 1, Loss: 2.5901
Epoch 2, Loss: 2.5820
Epoch 3, Loss: 2.5740
Epoch 4, Loss: 2.5718
Epoch 5, Loss: 2.5659
Epoch 6, Loss: 2.5568
Epoch 7, Loss: 2.5516
Epoch 8, Loss: 2.5477
Epoch 9, Loss: 2.5407
Epoch 10, Loss: 2.5292
Epoch 11, Loss: 2.5295
Epoch 12, Loss: 2.5197
Epoch 13, Loss: 2.5123
Epoch 14, Loss: 2.5073
Epoch 15, Loss: 2.4995
Epoch 16, Loss: 2.4967
Epoch 17, Loss: 2.4950
Epoch 18, Loss: 2.4924
Epoch 19, Loss: 2.4900
Epoch 20, Loss: 2.4853
Epoch 21, Loss: 2.4792
Epoch 22, Loss: 2.4751
Epoch 23, Loss: 2.4718
Epoch 24, Loss: 2.4682
Epoch 25, Loss: 2.4648
Epoch 26, Loss: 2.4628
Epoch 27, Loss: 2.4610
Epoch 28, Loss: 2.4580
Epoch 29, Loss: 2.4545
Epoch 30, Loss: 2.4510
Epoch 31, Loss: 2.4480
Epoch 32, Loss: 2.4452
Epoch 33, Loss: 2.4426
Epoch 34, Loss: 2.4397
Epoch 35, Loss: 2.4370
Epoch 36, Loss: 2.4344
Epoch 37, Loss: 2.4317
Epoch 38, Loss: 2.4291
Epoch 39, Loss: 2.4264
Epoch 40, Loss: 2.4235
Epoch 41, Loss: 2.4205
Epoch 42, Loss: 2.4175
Epoch 43, Loss: 2.414

In [None]:
import pickle

# NOTE(review): `weights` was not defined anywhere in this notebook, so this
# cell raised NameError on a fresh kernel. It is assumed to mean the trained
# model parameters; they are stored as plain NumPy arrays keyed by parameter
# name. Confirm this matches what the consumer of weights.pkl expects.
weights = {
    name: tensor.detach().cpu().numpy()
    for name, tensor in transformer.state_dict().items()
}
with open('/content/drive/MyDrive/Dataset/weights/weights.pkl', 'wb') as f:
    pickle.dump(weights, f)

In [None]:
import pickle

# NOTE(review): `word_embeddings_en` was not defined anywhere in this notebook
# (NameError on a fresh kernel). It is assumed to be the encoder-side
# embedding table of the trained model. Confirm against the consumer.
word_embeddings_en = transformer.embedding_enc.embeddings.weight.detach().cpu().numpy()
with open('/content/drive/MyDrive/Dataset/weights/embeddings_en.pkl', 'wb') as f:
    pickle.dump({"embeddings_en":word_embeddings_en}, f)

In [None]:
import pickle

# NOTE(review): the original referenced `word_embeddings_ger`, which was not
# defined anywhere in this notebook (NameError on a fresh kernel). It is
# assumed to be the decoder-side embedding table of the trained model.
# Confirm against the consumer.
word_embeddings_dec = transformer.embedding_dec.embeddings.weight.detach().cpu().numpy()
with open('/content/drive/MyDrive/Dataset/weights/embeddings_dec.pkl', 'wb') as f:
    pickle.dump({"embeddings_dec":word_embeddings_dec}, f)