In [1]:
# !pip install torch==2.3.0 torchtext==0.18.0
import torch
import torch.nn as nn

In [2]:
from dataclasses import dataclass
from typing import Optional


@dataclass
class ModelArgs:
    """Hyper-parameters and runtime settings shared by every cell of the notebook.

    The notebook reads and mutates these values through the class itself
    (e.g. ``ModelArgs.device = 'cpu'``), which keeps working because every
    field has a default and therefore also exists as a class attribute.

    Every attribute is annotated so that ``@dataclass`` actually registers it
    as a field; without annotations the decorator generates an ``__init__``
    that accepts no arguments. Instances can now be created with overrides,
    e.g. ``ModelArgs(batch_size=8)``.

    Note: constructor defaults are the values written below. Assigning to a
    class attribute afterwards does not change what ``ModelArgs()`` returns.
    """
    device: str = 'cuda'                    # overwritten by the device-selection cell
    no_of_neurons: int = 128                # width of the recurrent hidden state
    block_size: int = 32                    # nominal sequence length (collate_fn has its own default of 32)
    batch_size: int = 32                    # batch size handed to the DataLoaders
    en_vocab_size: Optional[int] = None     # filled in after the English vocabulary is built
    de_vocab_size: Optional[int] = None     # filled in after the German vocabulary is built
    dropout: float = 0.1                    # dropout probability used by the decoder
    epoch: int = 10                         # number of passes over the training set
    max_lr: float = 1e-4                    # learning rate given to AdamW
    embedding_dims: int = 1024              # size of each token embedding
    num_layers: int = 4                     # only referenced by commented-out code in this notebook
    # Computed once at class-creation time from the default embedding size;
    # it does not follow a later change of `embedding_dims`.
    hidden_dim: int = 4 * embedding_dims

In [3]:
# Use the GPU when one is visible, otherwise the CPU, and make that choice the
# default device for every tensor and module created from here on.
ModelArgs.device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.set_default_device(ModelArgs.device)

In [None]:

# Shell escape: download the spaCy pipeline `de_core_news_sm`, which the German
# tokenizer is loaded from in the data-preparation cell.
!python -m spacy download de_core_news_sm

In [None]:
# Shell escape: download the spaCy pipeline `en_core_web_sm`, which the English
# tokenizer is loaded from in the data-preparation cell.
!python -m spacy download en_core_web_sm

In [None]:
import wandb
# Log in to Weights & Biases; the training cell later calls wandb.init() and wandb.log().
wandb.login()

In [7]:
# Make ModelArgs.device the default device for newly created tensors.
# This repeats the call already made in the device-selection cell.
torch.set_default_device(ModelArgs.device)

In [None]:
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator
from torchtext.utils import download_from_url, extract_archive
import io
from torch.utils.data import DataLoader, Dataset

# Location of the raw Multi30k task-1 corpora (one gzip archive per language).
url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
train_urls = ('train.de.gz', 'train.en.gz')
val_urls = ('val.de.gz', 'val.en.gz')
test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')

# Download and unpack each split in order (train, val, test). Every resulting
# list holds the first extracted path per archive: [german_path, english_path].
train_filepaths, val_filepaths, test_filepaths = (
    [extract_archive(download_from_url(url_base + archive_name))[0] for archive_name in split_archives]
    for split_archives in (train_urls, val_urls, test_urls)
)

# Load the spaCy-backed tokenizers, one per language. They rely on the
# `de_core_news_sm` / `en_core_web_sm` pipelines downloaded in earlier cells.
de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

# Build vocabulary
def build_vocab(filepath, tokenizer):
    """Build a vocabulary containing every distinct token found in a text file.

    Args:
        filepath: Path of a UTF-8 text file, read line by line.
        tokenizer: Callable mapping one line of text to an iterable of tokens.

    Returns:
        The object produced by ``build_vocab_from_iterator`` with the specials
        ``<unk>``, ``<bos>``, ``<eos>``, ``<pad>`` and with the index of
        ``<unk>`` installed as the default index.
    """
    # Distinct tokens in first-seen order. Each token is handed to the builder
    # exactly once, so how often it occurred in the file plays no role.
    distinct_tokens = {}
    with io.open(filepath, encoding="utf8") as corpus:
        for line in corpus:
            distinct_tokens.update(dict.fromkeys(tokenizer(line)))

    vocab = build_vocab_from_iterator(
        [distinct_tokens.keys()],
        specials=['<unk>', '<bos>', '<eos>', '<pad>'],
    )
    unk_index = vocab['<unk>']
    vocab.set_default_index(unk_index)
    return vocab

# Vocabularies are built from the training split only (index 0 = German file,
# index 1 = English file). Their sizes are published on ModelArgs because the
# embedding tables and the decoder output layer read them from there.
de_vocab = build_vocab(train_filepaths[0], de_tokenizer)
ModelArgs.de_vocab_size = len(de_vocab) 
en_vocab = build_vocab(train_filepaths[1], en_tokenizer)
ModelArgs.en_vocab_size = len(en_vocab) 


def data_process(filepaths):
    """Convert a pair of parallel text files into index tensors.

    Args:
        filepaths: Sequence whose first element is the German file path and
            whose second element is the English file path. The files are read
            in lock-step, one sentence per line; if they differ in length the
            extra lines of the longer file are ignored (``zip`` semantics).

    Returns:
        A list of ``(de_tensor, en_tensor)`` tuples of dtype ``torch.long``.
        The German tensor holds the raw token indices; the English tensor is
        wrapped in ``<bos>`` ... ``<eos>``.
    """
    en_bos_idx = en_vocab['<bos>']
    en_eos_idx = en_vocab['<eos>']

    data = []
    # Context managers guarantee both files are closed, even if tokenisation fails.
    with io.open(filepaths[0], encoding="utf8") as raw_de_file, \
            io.open(filepaths[1], encoding="utf8") as raw_en_file:
        for raw_de, raw_en in zip(raw_de_file, raw_en_file):
            de_ids = [de_vocab[token] for token in de_tokenizer(raw_de)]
            en_ids = [en_vocab[token] for token in en_tokenizer(raw_en)]

            de_tensor_ = torch.tensor(de_ids, dtype=torch.long)
            # Only the target side carries sentence boundary markers.
            en_tensor_ = torch.tensor([en_bos_idx] + en_ids + [en_eos_idx], dtype=torch.long)

            data.append((de_tensor_, en_tensor_))

    return data


# Tokenise and index every split once, up front; each result is a list of
# (de_tensor, en_tensor) pairs held in memory.
train_data = data_process(train_filepaths)
val_data = data_process(val_filepaths)
test_data = data_process(test_filepaths)

# Create a custom Dataset class
class TranslationDataset(Dataset):
    """Map-style dataset that exposes a pre-built sequence of examples.

    The wrapped object is stored as-is; indexing and ``len`` are simply
    delegated to it.
    """

    def __init__(self, data):
        # Keep a reference only; nothing is copied or transformed here.
        self.data = data

    def __getitem__(self, idx):
        example = self.data[idx]
        return example

    def __len__(self):
        n_examples = len(self.data)
        return n_examples

# Wrap each processed split so it can be handed to a DataLoader.
train_dataset = TranslationDataset(train_data)
val_dataset = TranslationDataset(val_data)
test_dataset = TranslationDataset(test_data)

from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch, block_size=32):
    """Assemble a batch in which every sequence has exactly ``block_size`` items.

    Sequences longer than ``block_size`` are cut off at the end; shorter ones
    are right-padded with the ``<pad>`` index of their own vocabulary.

    Args:
        batch: A list of tuples (de_tensor, en_tensor).
        block_size: The fixed length every sequence is brought to.

    Returns:
        de_batch: German sequences stacked to shape (len(batch), block_size).
        en_batch: English sequences stacked to shape (len(batch), block_size).
    """
    def fit_to_block(seq, pad_value):
        # Negative shortfall means the sequence overflows the block.
        shortfall = block_size - len(seq)
        if shortfall < 0:
            return seq[:block_size]
        filler = torch.full((shortfall,), pad_value, dtype=seq.dtype)
        return torch.cat([seq, filler])

    german_seqs, english_seqs = zip(*batch)

    german_rows = [fit_to_block(seq, de_vocab['<pad>']) for seq in german_seqs]
    english_rows = [fit_to_block(seq, en_vocab['<pad>']) for seq in english_seqs]

    return torch.stack(german_rows), torch.stack(english_rows)

# Random generator handed to the loaders below; it is created on the same
# device that was installed as the default tensor device.
generator = torch.Generator(device=ModelArgs.device)


# Create DataLoader instances
batch_size = ModelArgs.batch_size

# Training: shuffled, incomplete final batch discarded.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn,
    generator=generator,
)

# Validation: fixed order, incomplete final batch discarded as well.
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=True,
    collate_fn=collate_fn,
    generator=generator,
)

# Test: fixed order, every example kept, no explicit generator.
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=collate_fn,
)

# Smoke test: pull a single training batch, print both tensor shapes, then stop.
for de_batch, en_batch in train_loader:
    print(f"German batch shape: {de_batch.shape}")
    print(f"English batch shape: {en_batch.shape}")
    break

In [9]:
class RNNCell(nn.Module):
    """One recurrent step: ``sigmoid(Linear(concat(x, ht_1)))``.

    Args:
        device: Device on which the layer parameters are created.
        no_of_neurons: Size of the produced state.
        features: Width of ``concat(x, ht_1)`` expected by the regular
            projection. Defaults to
            ``ModelArgs.no_of_neurons + ModelArgs.embedding_dims``.
        bi: When truthy, ``forward`` uses ``linear_layer_bi`` (input width
            ``2 * ModelArgs.no_of_neurons + ModelArgs.embedding_dims``)
            instead of ``linear_layer``.
    """

    def __init__(self, device, no_of_neurons, features = None, bi=False):
        super().__init__()

        # NOTE: this projection is created even when `bi` is False, so it is
        # always part of the parameters. It is kept unconditional so parameter
        # names and creation order are unchanged.
        self.linear_layer_bi = nn.Linear(
            in_features=(2 * ModelArgs.no_of_neurons) + ModelArgs.embedding_dims,
            out_features=no_of_neurons,
            device=device,  # honour the argument instead of the global setting
        )

        self.in_features = features
        if features is None:
            self.default_features = ModelArgs.no_of_neurons + ModelArgs.embedding_dims
        else:
            self.default_features = features

        self.linear_layer = nn.Linear(
            in_features=self.default_features,
            out_features=no_of_neurons,
            device=device,
        )
        self.bi = bi

    def forward(self, x, ht_1):
        """Return the next state computed from the two 2-D inputs.

        ``x`` and ``ht_1`` are concatenated along dim 1 in that order, so
        their combined width must match the selected projection.
        """
        joined = torch.cat([x, ht_1], dim=1)
        projection = self.linear_layer_bi if self.bi else self.linear_layer
        return torch.sigmoid(projection(joined))


In [10]:
class LuongAttention(nn.Module):
    """Parameter-free dot-product attention.

    ``hidden_size`` is accepted by the constructor but not used; the module
    owns no learnable weights.
    """

    def __init__(self, hidden_size):
        super().__init__()

    def forward(self, st, ht):
        """Attend over ``ht`` with the query ``st``.

        Args:
            st: Query; its middle dimension is expanded to ``ht.shape[1]``.
            ht: Values to attend over, indexed as (batch, timesteps, features).

        Returns:
            context_vector: Weighted sum of ``ht`` over the timestep dimension.
            attention_weights: Softmax weights with a trailing singleton
                dimension, i.e. (batch, timesteps, 1).
        """
        # Broadcast the query across every timestep of `ht`.
        query = st.expand(-1, ht.shape[1], -1)

        # One score per timestep: the dot product of query and value.
        scores = (query * ht).sum(dim=-1)

        # Normalise across timesteps, then add a feature axis for broadcasting.
        attention_weights = torch.softmax(scores, dim=1).unsqueeze(-1)

        context_vector = (attention_weights * ht).sum(dim=1)
        return context_vector, attention_weights

In [11]:
class RNNLayer(nn.Module):
    """Unrolls an ``RNNCell`` over a sequence, optionally with attention.

    The layer has two modes, selected by the arguments of ``forward``:

    * ``outputs is None`` – plain recurrence over ``x`` (used by the encoder).
    * ``outputs`` given and ``attn`` truthy – at every step the current state
      attends over ``outputs`` and the context vector is concatenated to the
      step input before the cell is applied (used by the decoder).

    Args:
        device: Device on which the sub-layers are created.
        no_of_neurons: State size passed on to the cell.
        bi: Forwarded to ``RNNCell``.
        features: Input width of the cell projection. Defaults to
            ``2 * ModelArgs.no_of_neurons + ModelArgs.embedding_dims``, which
            is the width of (state + embedding + context) in attention mode.
    """

    def __init__(self, device, no_of_neurons, bi=False, features=None):
        super().__init__()

        self.feature = features
        if features is None:
            self.default_features = 2 * ModelArgs.no_of_neurons + ModelArgs.embedding_dims
        else:
            self.default_features = features

        self.rnn_layer = RNNCell(bi=bi, device=device, no_of_neurons=no_of_neurons, features=self.default_features)
        # Not used by forward(); kept so the parameter names stay the same.
        # (A first, immediately overwritten copy of this layer was removed.)
        self.linear_layer = nn.Linear(in_features=ModelArgs.no_of_neurons, out_features=ModelArgs.embedding_dims, device=device)
        self.attention = LuongAttention(ModelArgs.hidden_dim)

    @staticmethod
    def _initial_state(x):
        """Return an all-zero state with one row per batch element of ``x``."""
        return torch.zeros((x.shape[0], ModelArgs.no_of_neurons), device=x.device, dtype=torch.float32)

    def forward(self, x, ht_1=None, attn=None, outputs=None):
        """Run the recurrence over ``x`` (indexed as batch, time, features).

        Args:
            x: Input sequence; step ``t`` is taken as ``x[:, t, :]``.
            ht_1: Optional initial state for the plain mode. It is ignored in
                attention mode, which always starts from zeros.
            attn: Must be truthy when ``outputs`` is given.
            outputs: Sequence to attend over (batch, time, features), or None
                for the plain mode.

        Returns:
            ``(last_state, all_states)`` where ``all_states`` stacks the state
            of every step along dim 1.

        Raises:
            ValueError: If ``outputs`` is given but ``attn`` is not truthy.
                Previously this combination silently returned ``None``.
        """
        seq_len = x.shape[1]

        if outputs is None:
            if ht_1 is None:
                ht_1 = self._initial_state(x)
            states = []
            for t in range(seq_len):
                ht_1 = self.rnn_layer(x[:, t, :], ht_1)
                states.append(ht_1)
            return ht_1, torch.stack(states, dim=1)

        if not attn:
            raise ValueError("RNNLayer.forward: `attn` must be True when `outputs` is provided")

        st_current = self._initial_state(x)
        new_outputs = []
        for t in range(seq_len):
            # The query needs a time axis of length 1 so it can be expanded.
            context_vector, _ = self.attention(st_current.unsqueeze(1), outputs)
            step_input = torch.cat([x[:, t, :], context_vector], dim=-1)
            # Argument order (state first) fixes the column layout seen by the
            # cell's projection; it is kept exactly as before.
            st_current = self.rnn_layer(st_current, step_input)
            new_outputs.append(st_current)

        return st_current, torch.stack(new_outputs, dim=1)

In [12]:


class EmbeddingTable_de(nn.Module):
  """Lookup table turning German token indices into embedding vectors.

  The table has ``ModelArgs.de_vocab_size`` rows of width
  ``ModelArgs.embedding_dims`` and is created on ``device``.
  """

  def __init__(self, device):
    super().__init__()
    self.embed_de = nn.Embedding(
        num_embeddings=ModelArgs.de_vocab_size,
        embedding_dim=ModelArgs.embedding_dims,
        device=device,
    )

  def forward(self, x):
    """Return the embedding of every index in ``x``."""
    embedded = self.embed_de(x)
    return embedded

In [13]:

class EmbeddingTable_en(nn.Module):
  """Lookup table turning English token indices into embedding vectors.

  The table has ``ModelArgs.en_vocab_size`` rows of width
  ``ModelArgs.embedding_dims`` and is created on ``device``.
  """

  def __init__(self, device):
    super().__init__()
    self.embed_en = nn.Embedding(
        num_embeddings=ModelArgs.en_vocab_size,
        embedding_dim=ModelArgs.embedding_dims,
        device=device,
    )

  def forward(self, x):
    """Return the embedding of every index in ``x``."""
    embedded = self.embed_en(x)
    return embedded

In [14]:


class Decoder(nn.Module):
    """Attention decoder: embeds the input, runs the attentive RNN over it and
    projects every state to English-vocabulary logits.

    Args:
        device: Device on which the sub-layers are created.
        no_of_neurons: State size forwarded to the recurrent layer.
        out_features: Accepted for signature compatibility; not used.
        bi: Accepted for signature compatibility; the recurrent layer is
            always built with ``bi=False``.
    """

    def __init__(self, device, no_of_neurons, out_features, bi=False):
        super().__init__()
        self.rnn = RNNLayer(device=device, no_of_neurons=no_of_neurons, bi=False, features = None)
        self.embeds_table_en = EmbeddingTable_en(device=device)
        self.output = nn.Linear(in_features=ModelArgs.no_of_neurons, out_features=ModelArgs.en_vocab_size, device=device, dtype=torch.float32)
        self.dropout = nn.Dropout(p=ModelArgs.dropout)

    def forward(self, x, ctx=None, inf=None, embeds=None, initial=None):
        """Decode ``x`` while attending over ``ctx``.

        Args:
            x: Token indices, or already-embedded inputs when neither
                embedding rule below applies.
            ctx: Encoder outputs handed to the recurrent layer as the
                sequence to attend over.
            inf: When exactly ``True``, ``x`` is embedded with the callable
                ``embeds`` supplied by the caller.
            embeds: Embedding callable used only when ``inf is True``.
            initial: When exactly ``True`` (and ``inf`` is not ``True``),
                ``x`` is embedded with this decoder's English table.

        Returns:
            ``(logits, states)``: the per-step vocabulary logits and the raw
            (un-dropped) recurrent states.
        """
        if inf is True:
            x = embeds(x)
        elif initial is True:
            x = self.embeds_table_en(x)

        _, outputs = self.rnn(x, outputs=ctx, attn=True)

        # Regularise the features that feed the projection. Applying dropout
        # to the logits themselves (as before) randomly zeroes and rescales
        # class scores, which distorts the softmax / cross-entropy input.
        out = self.output(self.dropout(outputs))
        return out, outputs


In [15]:

class Encoder(nn.Module):
    """Encoder that reads the source left-to-right and right-to-left.

    The same recurrent layer (shared weights) is run over the sequence and
    over its time-reversed copy; both state sequences are concatenated per
    position and projected back to ``ModelArgs.no_of_neurons`` features.

    Args:
        device: Device on which the sub-layers are created.
        no_of_neurons: State size forwarded to the recurrent layer.
        out_features: Accepted for signature compatibility; not used.
    """

    def __init__(self, device, no_of_neurons, out_features):
        super().__init__()
        self.rnn = RNNLayer(device=device, no_of_neurons=no_of_neurons, bi=False, features=(ModelArgs.embedding_dims + ModelArgs.no_of_neurons))
        self.embeds_table_de = EmbeddingTable_de(device=device)
        self.output = nn.Linear(in_features=2 * ModelArgs.no_of_neurons, out_features=ModelArgs.no_of_neurons, device=device, dtype=torch.float32)

    def forward(self, x, initial=None):
        """Encode ``x``.

        Args:
            x: Token indices when ``initial is True``; otherwise inputs that
                are already embedded (batch, time, features).
            initial: When exactly ``True``, ``x`` is embedded with the German
                table first.

        Returns:
            ``(out, embeds_table_de)``: the projected per-position features
            and this encoder's German embedding module.
        """
        if initial is True:
            x = self.embeds_table_de(x)

        _, outputs_fd = self.rnn(x)

        # Run over the reversed sequence, then flip the result back so that
        # index t of the backward states describes the same source position
        # as index t of the forward states. Without this second flip the two
        # halves of the concatenation are misaligned in time.
        _, outputs_bwd = self.rnn(torch.flip(x, dims=[1]))
        outputs_bwd = torch.flip(outputs_bwd, dims=[1])

        outputs = torch.cat([outputs_fd, outputs_bwd], dim=-1)
        out = self.output(outputs)
        return out, self.embeds_table_de


In [16]:



class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper.

    Args:
        device: Device on which the sub-modules are created.
        no_of_neurons: State size forwarded to encoder and decoder.
        out_features: Forwarded to encoder and decoder.
    """

    def __init__(self, device, no_of_neurons, out_features):
        super().__init__()

        self.encoder = Encoder(device, no_of_neurons, out_features)
        # Not used by forward() (the decoder owns its own English table);
        # kept so existing parameter names remain unchanged.
        self.embeds_table_en = EmbeddingTable_en(device=device)
        self.decoder = Decoder(device, no_of_neurons, out_features)

    def forward(self, x, y=None, inf=None):
        """Run the model in one of two modes.

        * Teacher-forced mode – ``y`` is given and ``inf`` is falsy
          (``False`` or the default ``None``): the decoder consumes ``y``.
        * Inference mode – ``inf`` is truthy and ``y`` is ``None``: the
          decoder consumes ``x``.

        Args:
            x: Source token indices for the encoder.
            y: Target-side token indices, or ``None``.
            inf: Mode switch, expected to be a bool; passed through to the
                decoder unchanged.

        Returns:
            The decoder logits.

        Raises:
            ValueError: For any other combination of ``y`` and ``inf``.
                Previously these combinations (including ``model(x, y)`` with
                the default ``inf=None``) silently returned ``None``.
        """
        teacher_forced = y is not None and not inf
        inference = bool(inf) and y is None
        if not (teacher_forced or inference):
            raise ValueError(
                "Seq2Seq.forward expects either targets `y` with inf=False, "
                "or inf=True with y=None"
            )

        ht_encoder, embeds_de = self.encoder(x, initial=True)

        decoder_input = y if teacher_forced else x
        logits, _ = self.decoder(decoder_input, ht_encoder, inf, embeds_de, True)
        return logits

In [17]:

# model = GRU(device=ModelArgs.device, no_of_neurons=ModelArgs.no_of_neurons, out_features=1)
# Build the encoder-decoder model. `out_features=1` is accepted by the
# constructors but not used by any layer defined in this notebook.
model = Seq2Seq(device=ModelArgs.device, no_of_neurons=ModelArgs.no_of_neurons, out_features=1)
# The layers are already created with device=ModelArgs.device; presumably this
# move is a no-op safeguard.
model = model.to(ModelArgs.device)

In [18]:
# x = torch.randint(0, 100, (ModelArgs.batch_size,ModelArgs.block_size))  # Random integer between 0 and 100
# x2 = torch.randint(0, 100, (ModelArgs.batch_size)).unsqueeze(1)  # Random integer between 0 and 100
# torch.cat([x, x1], dim=1)

In [None]:


# !pip install torchinfo

from torchinfo import summary

# x = torch.randint(0, 100, (ModelArgs.batch_size,ModelArgs.block_size))  # Random integer between 0 and 100
# y = torch.randint(0, 100, (ModelArgs.batch_size,ModelArgs.block_size))
# y = y.to(ModelArgs.device)
# x = x.to(ModelArgs.device)


# x = torch.randint(0, 100, (ModelArgs.batch_size,ModelArgs.block_size))  # Random integer between 0 and 100
# Take one real training batch so the summary traces an actual forward pass.
x,y = next(iter(train_loader))
x = x.to(ModelArgs.device)
y = y.to(ModelArgs.device)


# The three inputs map onto Seq2Seq.forward(x, y, inf); inf=False selects the
# branch in which the decoder consumes `y`.
summary(model=model,
        input_data=[x, y, False],
        # input_size=(ModelArgs.batch_size, ModelArgs.block_size, ModelArgs.embeddings_dims),
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])


In [20]:
# from andrej karapathy github
import torch.nn.functional as F
def topk_sampling(model, prompt, tokenizer, device, max_length=50, top_k=50, temperature=1.0):
    """Translate ``prompt`` by sampling one target token at a time (top-k).

    The German prompt is encoded once per step as the source sequence, while
    the target sequence starts with ``<bos>`` and grows by one sampled token
    per step. Generation stops at ``<eos>`` or after ``max_length`` tokens.

    Earlier behaviour this replaces: the German ids themselves were used as
    decoder input and English ids were appended to them, the first generated
    token was skipped when decoding, and ``temperature`` / ``tokenizer`` /
    ``device`` were ignored.

    Args:
        model: Callable ``model(source_ids, target_ids, False)`` returning
            logits indexed as (batch, time, vocab).
        prompt: German source sentence.
        tokenizer: Callable splitting ``prompt`` into tokens; falls back to
            the module-level ``de_tokenizer`` when ``None``.
        device: Device on which the index tensors are created.
        max_length: Maximum number of tokens to generate.
        top_k: Number of most probable tokens to sample from; clamped to the
            vocabulary size.
        temperature: Positive softmax temperature applied to the logits.

    Returns:
        The generated tokens joined by single spaces (``<eos>`` excluded).

    Raises:
        ValueError: If ``temperature`` is not positive or the prompt yields
            no tokens.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    tokenize = tokenizer if tokenizer is not None else de_tokenizer
    source_tokens = list(tokenize(prompt))
    if not source_tokens:
        raise ValueError("prompt produced no tokens")

    source_ids = torch.tensor(
        [de_vocab[token] for token in source_tokens], dtype=torch.long, device=device
    ).unsqueeze(0)

    eos_idx = en_vocab['<eos>']
    # Index -> token table, fetched once instead of once per decoded token.
    itos = en_vocab.vocab.get_itos()

    target_ids = torch.tensor([[en_vocab['<bos>']]], dtype=torch.long, device=device)
    generated_tokens = []

    with torch.no_grad():
        for _ in range(max_length):
            # Distribution over the next token = logits at the last position.
            logits = model(source_ids, target_ids, False)[:, -1, :]
            probs = torch.softmax(logits / temperature, dim=-1)

            # Keep the k most probable candidates and sample among them;
            # multinomial renormalises the kept probabilities itself.
            k = min(top_k, probs.size(-1))
            top_k_probs, top_k_indices = torch.topk(probs, k, dim=-1)
            choice = torch.multinomial(top_k_probs, num_samples=1)
            next_token = torch.gather(top_k_indices, -1, choice)

            token_id = int(next_token.item())
            if token_id == eos_idx:
                break

            generated_tokens.append(itos[token_id])
            target_ids = torch.cat([target_ids, next_token], dim=-1)

    return ' '.join(generated_tokens)


In [21]:
# criterion = nn.MSELoss()
# AdamW over all model parameters, with a constant learning rate of ModelArgs.max_lr.
optimizer = torch.optim.AdamW(model.parameters(), lr=ModelArgs.max_lr)

In [None]:
model.train()
# One slot per batch; both buffers are overwritten on every epoch.
train_losses =  torch.zeros(len(train_loader))
val_losses = torch.zeros(len(val_loader))
wandb.init(
    project='Seq2Seq-From-Scratch'
)
for epoch in range(ModelArgs.epoch):

    print("Starting train...")
    for count, (de, en) in enumerate(train_loader):
        # Teacher forcing with a one-step shift: the decoder sees tokens
        # [0 .. T-2] and must predict tokens [1 .. T-1]. Feeding and scoring
        # the same unshifted sequence would only teach the model to copy
        # its current input token.
        decoder_input = en[:, :-1]
        targets = en[:, 1:]

        logits = model(de, decoder_input, False)

        # reshape (not view): the shifted target slice is not contiguous.
        loss = nn.functional.cross_entropy(
            logits.reshape(-1, logits.size(-1)),
            targets.reshape(-1),
            ignore_index=en_vocab['<pad>'],
        )
        train_losses[count] = loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    print("Starting val...")
    # No gradients are needed for validation; skipping them saves memory and time.
    with torch.no_grad():
        for count, (de, en) in enumerate(val_loader):
            decoder_input = en[:, :-1]
            targets = en[:, 1:]

            logits = model(de, decoder_input, False)
            loss = nn.functional.cross_entropy(
                logits.reshape(-1, logits.size(-1)),
                targets.reshape(-1),
                ignore_index=en_vocab['<pad>'],
            )
            val_losses[count] = loss.item()

    # Sample a translation of a fixed sentence while the model is in eval mode.
    print("Generating text...")
    generated_text = topk_sampling(model, 'Ich fahre heute mit dem Rad zur Schule', de_tokenizer, device=ModelArgs.device, max_length=50, top_k=50, temperature=1.0)

    print(generated_text)

    model.train()

    # Log plain Python floats rather than tensors.
    epoch_train_loss = train_losses.mean().item()
    epoch_val_loss = val_losses.mean().item()
    wandb.log({
      "Train Loss": epoch_train_loss,
      "Val Loss": epoch_val_loss,
      "epoch": epoch
    })
    print("Epoch: ", epoch, "|", "Train Loss: ", epoch_train_loss,  "|", "Val Loss: ", epoch_val_loss)


: 