In [1]:
import torch
import torch.nn as nn

In [2]:
import wandb
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mrajceo2031[0m ([33mrentio[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
from dataclasses import dataclass


@dataclass
class ModelArgs:
    device = 'cuda'
    no_of_neurons = 128
    block_size = 32
    batch_size = 32
    dropout = 0.1
    epoch = 50
    max_lr = 1e-4

In [4]:
print(torch.__version__)

2.3.0.post101


In [5]:
import torchtext
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from torchtext.utils import download_from_url, extract_archive
import io

url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
train_urls = ('train.de.gz', 'train.en.gz')
val_urls = ('val.de.gz', 'val.en.gz')
test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')

train_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]
val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]

de_tokenizer = get_tokenizer('spacy', language='de')
en_tokenizer = get_tokenizer('spacy', language='en')

from torchtext.vocab import build_vocab_from_iterator

def build_vocab(filepath, tokenizer):
    counter = Counter()
    with io.open(filepath, encoding="utf8") as f:
        for string_ in f:
            counter.update(tokenizer(string_))
    # Ensure '<pad>' is at index 0 by placing it first in the specials list
    vocab = build_vocab_from_iterator(
        [counter.keys()],
        specials=['<unk>', '<eos>']  # '<pad>' comes first
    )
    vocab.set_default_index(vocab['<unk>'])
    return vocab


de_vocab = build_vocab(train_filepaths[0], de_tokenizer)
en_vocab = build_vocab(train_filepaths[1], en_tokenizer)

def data_process(filepaths):
  raw_de_iter = iter(io.open(filepaths[0], encoding="utf8"))
  raw_en_iter = iter(io.open(filepaths[1], encoding="utf8"))
  data = []
  for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
    de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)],
                            dtype=torch.long)
    en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)],
                            dtype=torch.long)
    data.append((de_tensor_, en_tensor_))
  return data

train_data = data_process(train_filepaths)
val_data = data_process(val_filepaths)
test_data = data_process(test_filepaths)

OSError: /home/yuvrajsingh/anaconda3/envs/unsloth_env/lib/python3.11/site-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN5torch7LibraryC1ENS0_4KindESsSt8optionalIN3c1011DispatchKeyEEPKcj

In [4]:
if torch.cuda.is_available():
    ModelArgs.device = 'cuda'
    torch.set_default_device('cuda')
else:

    torch.set_default_device('cpu')
    ModelArgs.device='cpu'

if torch.cuda.is_available():
  torch.set_default_device(ModelArgs.device)

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader


torch.manual_seed(0)


num_samples = 10000
seq_length = ModelArgs.block_size
device = ModelArgs.device


t = torch.linspace(0, 100, num_samples + seq_length, device=device)
# data = torch.sin(t) + 0.1 * torch.randn_like(t)
data = t

X_tensor = torch.stack([data[i:i+seq_length] for i in range(num_samples)])
y_tensor = data[seq_length:]  # Next value prediction

train_size = int(0.8 * num_samples)

X_train, y_train = X_tensor[:train_size], y_tensor[:train_size]
X_val, y_val = X_tensor[train_size:], y_tensor[train_size:]


class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


train_dataset = TimeSeriesDataset(X_train, y_train)
val_dataset = TimeSeriesDataset(X_val, y_val)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


generator = torch.Generator(device=device)


train_loader = DataLoader(
    train_dataset,
    batch_size=ModelArgs.batch_size,
    shuffle=True,
    generator=generator,
    drop_last=True
)

val_loader = DataLoader(
    val_dataset,
    generator=generator,
    drop_last=True,
    batch_size=ModelArgs.batch_size,
    shuffle=True,
)



In [6]:

class InputGate(nn.Module):
    def __init__(self, device, no_of_neurons):
        super().__init__()
        self.it = nn.Linear(in_features= ModelArgs.no_of_neurons + 1, out_features=no_of_neurons, device=device, dtype=torch.float32)
        self.ct_bar = nn.Linear(in_features=ModelArgs.no_of_neurons + 1, out_features=no_of_neurons, device=device, dtype=torch.float32)

    def forward(self, x, ht_1):
        x = torch.cat([x, ht_1], dim=-1)
        _it = torch.nn.functional.sigmoid(self.it(x))
        _ct_bar = torch.nn.functional.tanh(self.ct_bar(x))
        # out = torch.nn.functional.sigmoid(self.linear(x))
        return _it, _ct_bar

In [7]:
class OutputGate(nn.Module):
    def __init__(self, device, no_of_neurons) -> None:
        super().__init__()
        self.linear = nn.Linear(in_features= ModelArgs.no_of_neurons + 1, out_features=no_of_neurons, device=device, dtype=torch.float32)
    def forward(self, x, ht_1):
        x = torch.cat([x, ht_1], dim=-1)
        out = torch.nn.functional.sigmoid(self.linear(x))
        return out

In [8]:
class ForgetGate(nn.Module):

    def __init__(self, device, no_of_neurons):
        super().__init__()
        self.linear = nn.Linear(in_features=ModelArgs.no_of_neurons + 1, out_features=no_of_neurons, device=device, dtype=torch.float32)

    def forward(self, x, ht_1):
        # print("Forgot: ", x.shape)
        # print("Forget: ", ht_1.shape)
        x = torch.cat([x, ht_1], dim=-1)
        out = torch.nn.functional.sigmoid(self.linear(x))
        return out

In [9]:
class LSTMBlock(nn.Module):
    def __init__(self, device, no_of_neurons):
        super().__init__()
        self.ip = InputGate(device=device, no_of_neurons=no_of_neurons)
        self.op = OutputGate(device=device, no_of_neurons=no_of_neurons)
        self.forget = ForgetGate(device=device, no_of_neurons=no_of_neurons)
        self.no_of_neurons = no_of_neurons
    def forward(self, x, outputs=None):
        # print("Block: ", x.shape)
        # print("Block: ", ht_1.shape)
        # print("Block: ", ct_1.shape)

        ht_1 = torch.randn(ModelArgs.batch_size, self.no_of_neurons, device=device, requires_grad=True, dtype=torch.float32)
        ct_1 = torch.randn(ModelArgs.batch_size, self.no_of_neurons,device=device, requires_grad=True, dtype=torch.float32)
        seq_len = x.shape[1]
        if(outputs == None):
            outputs = []
            for t in range(seq_len):
                xt = x[:, t].unsqueeze(-1)
                # print(xt.shape)
                ft = self.forget(xt, ht_1) * ct_1
                it, ct_bar = self.ip(xt , ht_1)
                ct_bar_prime = it * ct_bar
                ct = ft * ct_1 + ct_bar_prime
                ht = self.op(xt, ht_1) * torch.nn.functional.tanh(ct)
                outputs.append(ht)
                return ht, ct, torch.stack(outputs, dim=1)

In [10]:
class LSTM(nn.Module):
    def __init__(self, device, no_of_neurons, out_features):
        super().__init__()
        self.block1 = LSTMBlock(device=device, no_of_neurons=no_of_neurons)
        self.block2 = LSTMBlock(device=device, no_of_neurons=no_of_neurons)
        # self.ht_1 = torch.randn(ModelArgs.batch_size, no_of_neurons, device=device, requires_grad=True, dtype=torch.float32)
        # self.ct_1 = torch.randn(ModelArgs.batch_size, no_of_neurons,device=device, requires_grad=True, dtype=torch.float32)
        # self.output = nn.Linear(in_features=ModelArgs.no_of_neurons, out_features=out_features, device=device, dtype=torch.float32)
        self.dropout = nn.Dropout(p=ModelArgs.dropout)
        # self.embedding = nn.Embedding()

    def forward(self, x):
        # x =
        # print("LSTM: ",x.shape)
        # print("LSTM: ", self.ht_1.shape)
        # print("LSTM: ", self.ct_1.shape)
        ht, ct, outputs = self.block1(x)
        # print(ht.shape)
        # print(ct.shape)
        # ht, ct = self.block2(x, ht, ct)
        # ht = self.dropout(ht)
        # print("After: ", ht.shape)
        # out = self.output(ht)
        return  ht, ct, outputs

In [11]:
class Seq2Seq(nn.Module):

    def __init__(self, device, no_of_neurons, out_features):
        super().__init__()

        self.encoder = LSTM(device, no_of_neurons, out_features)
        self.decoder = LSTM(device, no_of_neurons, out_features)

    def forward(self, x):

        ht_encoder, ct_encoder,outputs_encoder = self.encoder(x)
        ht_decoder, ct_decoder, outputs_decoder = self.decoder(ht_encoder)

        return outputs_decoder

In [12]:
model = Seq2Seq(device=ModelArgs.device, no_of_neurons=ModelArgs.no_of_neurons, out_features=1)
model = model.to(ModelArgs.device)

In [13]:
!pip install torchinfo

from torchinfo import summary

x = torch.randint(0, 100, (ModelArgs.batch_size,ModelArgs.block_size))  # Random integer between 0 and 100
x = x.to(ModelArgs.device)

summary(model=model,
        input_data=x,
        # input_size=(ModelArgs.batch_size, ModelArgs.block_size, ModelArgs.embeddings_dims),
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])




Layer (type (var_name))                  Input Shape          Output Shape         Param #              Trainable
Seq2Seq (Seq2Seq)                        [32, 32]             [32, 1, 128]         --                   True
├─LSTM (encoder)                         [32, 32]             [32, 128]            66,560               True
│    └─LSTMBlock (block1)                [32, 32]             [32, 128]            --                   True
│    │    └─ForgetGate (forget)          [32, 1]              [32, 128]            16,640               True
│    │    └─InputGate (ip)               [32, 1]              [32, 128]            33,280               True
│    │    └─OutputGate (op)              [32, 1]              [32, 128]            16,640               True
├─LSTM (decoder)                         [32, 128]            [32, 128]            66,560               True
│    └─LSTMBlock (block1)                [32, 128]            [32, 128]            --                   True
│    │    └─Fo

In [14]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=ModelArgs.max_lr)

In [None]:
model.train()
train_losses =  torch.zeros(len(train_loader))
val_losses = torch.zeros(len(val_loader))
wandb.init(
    project='Encoder_decoder-From-Scratch'
)
for epoch in range(ModelArgs.epoch):

    count = 0
    for X, y in train_loader:
        y_pred = model(X)
        # print(y_pred.shape)
        loss = criterion(y_pred, y)
        train_losses[count] = loss.item()
        # print("Loss: ", loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        count += 1

    # count = 0
    model.eval()
    count = 0
    for X, y in val_loader:
        y_pred = model(X)
        # print(y_pred.shape)
        loss = criterion(y_pred, y)

        # print("Loss: ", loss.item())
        val_losses[count] = loss.item()

        # optimizer.zero_grad()
        # loss.backward()
        # optimizer.step()
        count += 1
    model.train()
    wandb.log({
      "Train Loss": train_losses.mean(),
      "Val Loss": val_losses.mean(),
      "epoch": epoch
    })
    print("Epoch: ", epoch, "|", "Train Loss: ", train_losses.mean(),  "|", "Val Loss: ", val_losses.mean())
