In [206]:
!pip install torch==2.3.0 torchtext==0.18.0
import torch
import torch.nn as nn



In [207]:
import wandb
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mrajceo2031[0m ([33mrentio[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [208]:
from dataclasses import dataclass


@dataclass
class ModelArgs:
    device = 'cuda'
    no_of_neurons = 256
    block_size = 32
    batch_size = 32
    en_vocab_size = None
    de_vocab_size = None
    dropout = 0.1
    epoch = 10
    max_lr = 1e-4
    embedding_dims = 1000
    num_layers = 2

In [209]:
!python -m spacy download de_core_news_sm

Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [210]:
if torch.cuda.is_available():
    ModelArgs.device = 'cuda'
    torch.set_default_device('cuda')
else:

    torch.set_default_device('cpu')
    ModelArgs.device='cpu'

if torch.cuda.is_available():
  torch.set_default_device(ModelArgs.device)

In [211]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [212]:
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator
from torchtext.utils import download_from_url, extract_archive
import io
from torch.utils.data import DataLoader, Dataset

# Download and extract data
url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
train_urls = ('train.de.gz', 'train.en.gz')
val_urls = ('val.de.gz', 'val.en.gz')
test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')

train_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]
val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]

# Load SpaCy tokenizers
de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

# Build vocabulary
def build_vocab(filepath, tokenizer):
    counter = Counter()
    with io.open(filepath, encoding="utf8") as f:
        for string_ in f:
            counter.update(tokenizer(string_))
    vocab = build_vocab_from_iterator(
        [counter.keys()],
        specials=['<unk>', '<bos>', '<eos>']
    )
    vocab.set_default_index(vocab['<unk>'])
    return vocab

de_vocab = build_vocab(train_filepaths[0], de_tokenizer)
ModelArgs.de_vocab_size = len(de_vocab) + 1
en_vocab = build_vocab(train_filepaths[1], en_tokenizer)
ModelArgs.en_vocab_size = len(en_vocab) + 1


def data_process(filepaths):
    raw_de_iter = iter(io.open(filepaths[0], encoding="utf8"))
    raw_en_iter = iter(io.open(filepaths[1], encoding="utf8"))
    data = []

    # Get the indices for <bos> and <eos> tokens
    de_bos_idx = de_vocab['<bos>']
    de_eos_idx = de_vocab['<eos>']
    en_bos_idx = en_vocab['<bos>']
    en_eos_idx = en_vocab['<eos>']

    for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
        # Tokenize and convert to indices
        de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)], dtype=torch.long)
        en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)], dtype=torch.long)

        # Add <bos> and <eos> tokens
        # de_tensor_ = torch.cat([torch.tensor([de_bos_idx]), de_tensor_, torch.tensor([de_eos_idx])])
        en_tensor_ = torch.cat([torch.tensor([en_bos_idx]), en_tensor_, torch.tensor([en_eos_idx])])

        # Flip the German tensor (if required)
        # de_tensor_ = torch.flip(de_tensor_, dims=[0])

        # Append to data
        data.append((de_tensor_, en_tensor_))

    return data

def data_process_flip(filepaths):
    raw_de_iter = iter(io.open(filepaths[0], encoding="utf8"))
    raw_en_iter = iter(io.open(filepaths[1], encoding="utf8"))
    data = []

    # Get the indices for <bos> and <eos> tokens
    de_bos_idx = de_vocab['<bos>']
    de_eos_idx = de_vocab['<eos>']
    en_bos_idx = en_vocab['<bos>']
    en_eos_idx = en_vocab['<eos>']

    for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
        # Tokenize and convert to indices
        de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)], dtype=torch.long)
        en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)], dtype=torch.long)

        # Add <bos> and <eos> tokens
        # de_tensor_ = torch.cat([torch.tensor([de_bos_idx]), de_tensor_, torch.tensor([de_eos_idx])])
        en_tensor_ = torch.cat([torch.tensor([en_bos_idx]), en_tensor_, torch.tensor([en_eos_idx])])

        # Flip the German tensor (if required)
        de_tensor_ = torch.flip(de_tensor_, dims=[0])

        # Append to data
        data.append((de_tensor_, en_tensor_))

    return data

train_data = data_process_flip(train_filepaths)
val_data = data_process(val_filepaths)
test_data = data_process(test_filepaths)

# Create a custom Dataset class
class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Create Dataset instances
train_dataset = TranslationDataset(train_data)
val_dataset = TranslationDataset(val_data)
test_dataset = TranslationDataset(test_data)

from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch, block_size=32):
    """
    Collate function to pad or truncate sequences to a fixed block size.

    Args:
        batch: A list of tuples (de_tensor, en_tensor).
        block_size: The fixed length to pad or truncate sequences to.

    Returns:
        de_batch: Padded/truncated German sequences (batch_size, block_size).
        en_batch: Padded/truncated English sequences (batch_size, block_size).
    """
    de_batch, en_batch = zip(*batch)

    # Function to pad or truncate a sequence to the block size
    def pad_or_truncate(sequence, block_size, pad_value):
        if len(sequence) > block_size:
            # Truncate the sequence if it's longer than block_size
            return sequence[:block_size]
        else:
            # Pad the sequence if it's shorter than block_size
            padding_length = block_size - len(sequence)
            return torch.cat([sequence, torch.full((padding_length,), pad_value, dtype=sequence.dtype)])

    # Pad or truncate each sequence in the batch
    de_batch = [pad_or_truncate(seq, block_size, de_vocab['<pad>']) for seq in de_batch]
    en_batch = [pad_or_truncate(seq, block_size, en_vocab['<pad>']) for seq in en_batch]

    # Stack the sequences into a single tensor
    de_batch = torch.stack(de_batch)
    en_batch = torch.stack(en_batch)

    return de_batch, en_batch

generator = torch.Generator(device=ModelArgs.device)


# Create DataLoader instances
batch_size = ModelArgs.batch_size
train_loader = DataLoader(train_dataset, generator=generator, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)
val_loader = DataLoader(val_dataset, generator=generator, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, drop_last=False)

# Example usage
for de_batch, en_batch in train_loader:
    print(f"German batch shape: {de_batch.shape}")
    print(f"English batch shape: {en_batch.shape}")
    break

German batch shape: torch.Size([32, 32])
English batch shape: torch.Size([32, 32])


In [213]:

train_data

[(tensor([    4,    23,  1837, 18454,  7371, 13750, 15714,  3421, 15709, 17568,
           7132, 18698, 15827, 12383]),
  tensor([    1,  1734, 10818,    18,  1838,  6504,  2166,  7166,  6930,  6535,
           2965,    20,     3,     2])),
 (tensor([    4,    23,   392, 13963, 13081,  9237, 16420,  7132,  6712]),
  tensor([   1, 1489, 6645, 5820, 5497, 5513, 2166, 7101, 1879, 5194, 7908, 9707,
            20,    3,    2])),
 (tensor([    4,    23,  4658, 12917,  9874, 13963, 15714, 15952,  7116, 15947,
           2472]),
  tensor([    1,   101,  6364,  5207,  3400,  5933,  1879, 10731,  7615,    20,
              3,     2])),
 (tensor([    4,    23,  3052, 13963, 16877, 18099,  6288, 13977, 12789, 17738,
           4474, 13477, 13974, 15714,  6545,  2472]),
  tensor([    1,   101,  6508,  5820,  1879,  2687,  8760,  5954,  9348,  7077,
           1879,  6160,  3377,  1879, 10677,    20,     3,     2])),
 (tensor([    4,    23, 18905,  2755, 13250, 18099,  4492, 12580, 17730,  7132,
  

In [214]:
ModelArgs.en_vocab_size

10838

In [215]:
x = torch.randn((ModelArgs.batch_size, 1, ModelArgs.embedding_dims))
x1 = torch.randn((ModelArgs.batch_size, ModelArgs.block_size, ModelArgs.embedding_dims))
print(torch.cat([x, x1], dim=1).shape)

torch.Size([32, 33, 1000])


In [216]:

class InputGate(nn.Module):
    def __init__(self, device, no_of_neurons):
        super().__init__()
        self.it = nn.Linear(in_features= ModelArgs.no_of_neurons + ModelArgs.embedding_dims, out_features=no_of_neurons, device=device, dtype=torch.float32)
        self.ct_bar = nn.Linear(in_features=ModelArgs.no_of_neurons + ModelArgs.embedding_dims, out_features=no_of_neurons, device=device, dtype=torch.float32)

    def forward(self, x, ht_1):
        x = torch.cat([x, ht_1], dim=1)
        _it = torch.nn.functional.sigmoid(self.it(x))
        _ct_bar = torch.nn.functional.tanh(self.ct_bar(x))
        # out = torch.nn.functional.sigmoid(self.linear(x))
        return _it, _ct_bar

In [217]:
class OutputGate(nn.Module):
    def __init__(self, device, no_of_neurons) -> None:
        super().__init__()
        self.linear = nn.Linear(in_features= ModelArgs.no_of_neurons + ModelArgs.embedding_dims, out_features=no_of_neurons, device=device, dtype=torch.float32)
    def forward(self, x, ht_1):
        x = torch.cat([x, ht_1], dim=1)
        out = torch.nn.functional.sigmoid(self.linear(x))
        return out

In [218]:
class ForgetGate(nn.Module):

    def __init__(self, device, no_of_neurons):
        super().__init__()
        self.linear = nn.Linear(in_features=ModelArgs.no_of_neurons + ModelArgs.embedding_dims, out_features=no_of_neurons, device=device, dtype=torch.float32)

    def forward(self, x, ht_1):
        # print("Forgot: ", x.shape)
        # print("Forget: ", ht_1.shape)
        x = torch.cat([x, ht_1], dim=1)
        out = torch.nn.functional.sigmoid(self.linear(x))
        return out

In [219]:
class LSTMBlock(nn.Module):
    def __init__(self, device, no_of_neurons):
        super().__init__()
        self.ip = InputGate(device=device, no_of_neurons=no_of_neurons)
        self.op = OutputGate(device=device, no_of_neurons=no_of_neurons)
        self.forget = ForgetGate(device=device, no_of_neurons=no_of_neurons)
        self.no_of_neurons = no_of_neurons
        self.device=device
        self.linear_layer = nn.Linear(in_features=ModelArgs.no_of_neurons, out_features=ModelArgs.embedding_dims, device=device)
    def forward(self, x, ht_1=None, outputs=None, embeds=None):
        # print("Block: ", x.shape)
        # print("Block: ", ht_1.shape)
        # print("Block: ", ct_1.shape)

        if(ht_1 is None):
          ht_1 = torch.randn( (x.shape[0], self.no_of_neurons), device=self.device, requires_grad=True, dtype=torch.float32)

        ct_1 = torch.randn((x.shape[0], self.no_of_neurons),device=self.device, requires_grad=True, dtype=torch.float32)
        seq_len = x.shape[1]
        # ht_1 = ht_1.unsqueeze(-1)
        if(outputs == None):
            # print("New")
            # print(x)
            outputs = []
            for t in range(seq_len):
                # print("Block: ", x.shape)
                # print("Block: ", ht_1.shape)
                # print("Block: ", ct_1.shape)
                xt = x[:, t, :]
                # print(xt.shape)
                ft = self.forget(xt, ht_1) * ct_1
                it, ct_bar = self.ip(xt , ht_1)
                ct_bar_prime = it * ct_bar
                ct = ft * ct_1 + ct_bar_prime
                ht = self.op(xt, ht_1) * torch.nn.functional.tanh(ct)
                outputs.append(ht)
            return ht, ct, torch.stack(outputs, dim=1)

        elif(outputs is not None):
          # print("Other")
          new_output = []
          for t in range(seq_len):
                # print("Block: ", x.shape)
                # print("Block: ", ht_1.shape)
                # print("Block: ", ct_1.shape)
                xt = outputs[:, t, :]
                # print("Shape: ", xt.shape)
                xt = self.linear_layer(xt)
                # print("After: ", xt.shape)
                ft = self.forget(xt, ht_1) * ct_1
                it, ct_bar = self.ip(xt , ht_1)
                ct_bar_prime = it * ct_bar
                ct = ft * ct_1 + ct_bar_prime
                ht = self.op(xt, ht_1) * torch.nn.functional.tanh(ct)
                new_output.append(ht)
          return ht, ct, torch.stack(new_output, dim=1)

In [220]:

class EmbeddingTable_en(nn.Module):
  def __init__(self, device):
    super().__init__()

    self.embed_en = nn.Embedding(num_embeddings=ModelArgs.en_vocab_size, embedding_dim=ModelArgs.embedding_dims, device=device)

  def forward(self, x):
    return self.embed_en(x)

In [221]:


class EmbeddingTable_de(nn.Module):
  def __init__(self, device):
    super().__init__()

    self.embed_de =  nn.Embedding(num_embeddings=ModelArgs.de_vocab_size, embedding_dim=ModelArgs.embedding_dims, device=device)

  def forward(self, x):
    # print('Indie: ', x)
    return self.embed_de(x)

In [222]:
class Encoder(nn.Module):
    def __init__(self, device, no_of_neurons, out_features):
        super().__init__()
        self.block1 = LSTMBlock(device=device, no_of_neurons=no_of_neurons)
        # self.block2 = LSTMBlock(device=device, no_of_neurons=no_of_neurons)
        # self.embeds_table_en = EmbeddingTable_en(device=device)
        self.embeds_table_de = EmbeddingTable_de(device=device)
        # self.ht_1 = torch.randn(ModelArgs.batch_size, no_of_neurons, device=device, requires_grad=True, dtype=torch.float32)
        # self.ct_1 = torch.randn(ModelArgs.batch_size, no_of_neurons,device=device, requires_grad=True, dtype=torch.float32)
        # self.output = nn.Linear(in_features=ModelArgs.no_of_neurons, out_features=out_features, device=device, dtype=torch.float32)
        self.dropout = nn.Dropout(p=ModelArgs.dropout)
        # self.embedding = nn.Embedding()

    def forward(self, x, outputs=None, initial=None):
        # x =
        # print("LSTM: ",x.shape)
        # print("LSTM: ", self.ht_1.shape)
        # print("LSTM: ", self.ct_1.shape)
        # if(encoder):
        if(initial is not None and initial is True):
          x = self.embeds_table_de(x)
        # print(x.shape)
        # elif(decoder):
        #   x = self.embeds_table_en(x)
        if(outputs is not None):
          # print(outputs)
          ht, ct, outputs = self.block1(x, outputs=outputs)
        elif(outputs is None):
          ht, ct, outputs = self.block1(x)
        # print(ht.shape)
        # print(ct.shape)
        # ht, ct = self.block2(x, ht, ct)
        # ht = self.dropout(ht)
        # print("Aft: ", outputs.shape)
        # out = self.output(ht)
        return  ht, ct, outputs, self.embeds_table_de

In [223]:
class Decoder(nn.Module):
    def __init__(self, device, no_of_neurons, out_features):
        super().__init__()
        self.block1 = LSTMBlock(device=device, no_of_neurons=no_of_neurons)
        # self.block2 = LSTMBlock(device=device, no_of_neurons=no_of_neurons)
        self.embeds_table_en = EmbeddingTable_en(device=device)
        # self.embeds_table_de = EmbeddingTable_de(device=device)
        # self.ht_1 = torch.randn(ModelArgs.batch_size, no_of_neurons, device=device, requires_grad=True, dtype=torch.float32)
        # self.ct_1 = torch.randn(ModelArgs.batch_size, no_of_neurons,device=device, requires_grad=True, dtype=torch.float32)
        self.output = nn.Linear(in_features=ModelArgs.no_of_neurons, out_features=ModelArgs.en_vocab_size, device=device, dtype=torch.float32)
        self.dropout = nn.Dropout(p=ModelArgs.dropout)
        # self.embedding = nn.Embedding()

    def forward(self, x, ctx=None, inf=None, embeds=None, initial=None, outputs=None):
        # x =
        # print("LSTM: ",x.shape)
        # print("LSTM: ", self.ht_1.shape)
        # print("LSTM: ", self.ct_1.shape)
        # if(encoder):
        #   x = self.embeds_table_de(x)
          # print(x.shape)
        # elif(decoder):
        if(inf is not True and initial is True):
          x = self.embeds_table_en(x)
        if(inf is True):
          # print("Before: ", x.shape)
          x = embeds(x)
          # print("After: ", x.shape)
        ht, ct, outputs = self.block1(x, ctx, outputs=outputs)
        # print(ht.shape)
        # print(ct.shape)
        # ht, ct = self.block2(x, ht, ct)
        out = self.dropout(outputs)
        # print("After: ", outputs.shape)
        out = self.output(out)
        return  out, outputs

In [273]:

class Seq2Seq(nn.Module):

    def __init__(self, device, no_of_neurons, out_features):
        super().__init__()

        # self.encoder = Encoder(device, no_of_neurons, out_features)
        # self.decoder = Decoder(device, no_of_neurons, out_features)
        self.encoders = nn.ModuleList(Encoder(device, no_of_neurons, out_features) for _  in range(ModelArgs.num_layers))
        self.decoders = nn.ModuleList(Decoder(device, no_of_neurons, out_features) for x in range(ModelArgs.num_layers))

    def forward(self, x, y=None, inf=None):

        count = 0
        for i in self.encoders:
          if(count == 0):
            ht_encoder, ct_encoder,outputs_encoder, embeds_de = i(x, initial=True)
            # x = ht_encoder
          else:
            ht_encoder, ct_encoder,outputs_encoder, embeds_de = i(x, outputs=outputs_encoder)
            # x = ht_encoder
          count += 1

        res = None
        count = 0
        if(y is not None and inf==False):
          for i in self.decoders:

            # print("Hiii")
            if(count == 0):
              y , outputs = i(y, ht_encoder, inf, embeds_de, True)
              # res = x
            else:
              y, outputs = i(y, ht_encoder, inf, embeds_de, outputs=outputs)
              # res = x
            # return res
          # elif(y is not None and inf==False):
            # print("Here")
            # res = self.decoder(y, ht_encoder)
            # return res
            count += 1
          return y


        elif(inf==True and y is None):
          x_init = x
          count = 0
          for i in self.decoders:

            if(count == 0):
              x, outputs = i(x, ht_encoder, inf, embeds_de, True)
            # res = x

            else:
              x, outputs = i(x_init, ht_encoder, inf, embeds_de, outputs=outputs)

            count += 1
          return x

In [274]:
model = Seq2Seq(device=ModelArgs.device, no_of_neurons=ModelArgs.no_of_neurons, out_features=1)
model = model.to(ModelArgs.device)

In [275]:
!pip install torchinfo

from torchinfo import summary

# x = torch.randint(0, 100, (ModelArgs.batch_size,ModelArgs.block_size))  # Random integer between 0 and 100
x,y = next(iter(train_loader))
x = x.to(ModelArgs.device)
y = y.to(ModelArgs.device)

summary(model=model,
        input_data=[x,y, False],
        # input_size=(ModelArgs.batch_size, ModelArgs.block_size, ModelArgs.embeddings_dims),
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])




Layer (type (var_name))                                 Input Shape          Output Shape         Param #              Trainable
Seq2Seq (Seq2Seq)                                       [32, 32]             [32, 32, 10838]      --                   True
├─ModuleList (encoders)                                 --                   --                   --                   True
│    └─Encoder (0)                                      [32, 32]             [32, 256]            --                   True
│    │    └─EmbeddingTable_de (embeds_table_de)         [32, 32]             [32, 32, 1000]       19,215,000           True
│    │    └─LSTMBlock (block1)                          [32, 32, 1000]       [32, 256]            1,544,168            True
│    └─Encoder (1)                                      [32, 32]             [32, 256]            19,215,000           True
│    │    └─LSTMBlock (block1)                          [32, 32]             [32, 256]            1,544,168            True
├─M

In [276]:



# x,y = next(iter(train_loader))
# x.shape

In [277]:
# from andrej karapathy github
import torch.nn.functional as F
def topk_sampling(model, prompt, tokenizer, device, max_length=50, top_k=50, temperature=1.0):

    # input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

    input_ids = torch.tensor([de_vocab[token] for token in de_tokenizer(prompt)]).unsqueeze(0)
    oov = []
    generated_text = ""
    for _ in range(max_length):
        with torch.no_grad():
            outputs = model(input_ids, None, True)
            logits = outputs[:, -1, :]

            probs = F.softmax(logits, dim=-1)

            # Top-k filtering
            top_k_probs, top_k_indices = torch.topk(probs, top_k, dim=-1)
#
            # Apply temperature scaling
            # probs = probs / temperature

            # Sample from top-k
            next_token = torch.multinomial(top_k_probs, num_samples=1)

            # generated_tokens.append(next_token.item())

            xcol = torch.gather(top_k_indices, -1, next_token)
            # xcol = torch.argmax(probs, dim=-1)

            # if(xcol == '<eos>'):
            #   break
            # print(xcol.shape)
            # print(input_ids.shape)
            # print(xcol.shape)
            input_ids = torch.cat([input_ids, xcol], dim=-1) #1 because is it the dimension of the sequence
    # print(input_ids)
    count = 0
    de_len = torch.tensor([de_vocab[token] for token in de_tokenizer(prompt)])
    for i in input_ids[0]:
      # print(de_len.shape)
      if(count > de_len.shape[0]):
      # print(i)
      # try:
        if(en_vocab.vocab.get_itos()[i] == '<eos>'):
          print("Done")
          break
        token = en_vocab.vocab.get_itos()[i]
        generated_text += token

        generated_text += ' '
      # except:
        # oov.append(i)
      else:
        count += 1

    return generated_text

In [278]:


criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=ModelArgs.max_lr)

In [279]:
model.train()
train_losses =  torch.zeros(len(train_loader))
val_losses = torch.zeros(len(val_loader))
wandb.init(
    project='Encoder_decoder-From-Scratch'
)
for epoch in range(ModelArgs.epoch):

    count = 0
    for de, en in train_loader:
        logits = model(de, en, False)
        print(logits.shape)

        batch_size, block_size, vocab = logits.shape
        # print("Va: ", vocab)
        logits = logits.view(batch_size*block_size, vocab)
        targets = en.view(batch_size * block_size)
        # print("HiiiL ", en.shape)
        # print("HiiiT ", logits.shape)
        loss = criterion(logits, targets)
        train_losses[count] = loss.item()
        # print("Loss: ", loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        count += 1
        # print(count)
        break
    # count = 0
    model.eval()
    count = 0
    for de, en in val_loader:
        logits = model(de, en, False)
        print(logits.shape)
        batch_size, block_size, vocab = logits.shape

        logits = logits.view(batch_size*block_size, vocab)
        # print("Va: ", vocab)
        targets = en.view(batch_size * block_size)
        loss = criterion(logits, targets)

        # print("Loss: ", loss.item())
        val_losses[count] = loss.item()

        # optimizer.zero_grad()
        # loss.backward()
        # optimizer.step()
        count += 1
        break
    # print("eval")
    generated_text = topk_sampling(model, 'Ich fahre heute mit dem Rad zur Schule', de_tokenizer, device=ModelArgs.device, max_length=50, top_k=50, temperature=1.0)

    print(generated_text)
    # break
    break
    model.train()
    wandb.log({
      "Train Loss": train_losses.mean(),
      "Val Loss": val_losses.mean(),
      "epoch": epoch
    })
    print("Epoch: ", epoch, "|", "Train Loss: ", train_losses.mean(),  "|", "Val Loss: ", val_losses.mean())


torch.Size([32, 32, 10838])
torch.Size([32, 32, 10838])
oxfords compared bullfight pull irons spins stroke hangs Big establishment exhibition pee nearby hedges Yellow glassy waterfall bathed strangely punts strumming full leg scientist punts Costumed strumming Rod operated circles dimmed Morgan irons sporty vac task hole international kaki entrepreneur bazaar scarf test sad cherry Morning jogging Elton traps 


In [None]:
len(train_loader)

In [None]:
ModelArgs.en_vocab_size

In [None]:
en_vocab.vocab.get_itos()[1]

In [None]:
oov

In [None]:
[de_vocab[token] for token in de_tokenizer('Ich fahre heute mit dem Rad zur Schule')]