In [1]:
import torch
import bpe_tokenizer as D
import string

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset

In [3]:
import torch
print(torch.__version__)

2.7.0


In [4]:
ds = load_dataset("cfilt/iitb-english-hindi")

english_characters = list(string.ascii_lowercase) + list(string.ascii_uppercase)

punctuation_list = list(string.punctuation)

char_to_keep = english_characters + punctuation_list + [' ']

def custom_filter(example):

    for word in example['translation']['en']:
        if word not in char_to_keep:
            return False
        

    for word in example['translation']['hi']:
        if not ((ord(u'\u0900') <= ord(word) <= ord(u'\u097F') ) or (word in list(string.punctuation)) or (word == ' ')):
            return False
        
    # removed sentences greater than 90th percentile     
    if len(example['translation']['en']) > 161:
        return False
    
    if len(example['translation']['hi']) > 115:
        return False

    return True


ds_filtered = ds.filter(custom_filter)

# corpus = ds_filtered['train']['translation']

In [5]:
max_tokens = 200

In [6]:
all_tokens = D.bpe_en_obj.base_vocab + ['<unk>', '<pad>']
word2idx_en = {}

for ind, ele in enumerate(all_tokens):
    word2idx_en[ele] = ind

all_tokens = D.bpe_hin_obj.base_vocab + ['<unk>', '<pad>', '<eos>', '<start>']
word2idx_hin = {}

for ind, ele in enumerate(all_tokens):
    word2idx_hin[ele] = ind

def tokenize_en(x):

    res = D.bpe_en_obj.tokenize(x)
    while len(res) < max_tokens:
        res.append('<pad>')

    
    return torch.tensor([word2idx_en[ele] for ele in res])

def tokenize_hin(x):

    res = D.bpe_hin_obj.tokenize(x)
    key = 0
    
    while len(res) < max_tokens:

        if not key:
            res.insert(0, '<start>')
            res.append('<eos>')
            key = 1
            continue

        res.append('<pad>')
    
    return torch.tensor([word2idx_hin[ele] for ele in res])


# enc_input = torch.stack([tokenize(x) for x in x_en], dim = 0)
# dec_input = torch.stack([tokenize(x) for x in x_hi], dim = 0)




In [20]:
ds_filtered

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1059018
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 348
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1189
    })
})

In [27]:
# from datasets import load_from_disk
# training_ds = load_from_disk('/Users/ayushsinha/Project/Transformer')
training_ds = ds_filtered['train']

In [12]:
# x_en = ds_filtered['train'][0]['translation']['en']
# x_hi = ds_filtered['train'][0]['translation']['hi']

# x_en = [x_en]
# x_hi = [x_hi]

# type(ds)
# ds_filtered['train'].map()

# print(x_en)
# print(x_hi)

# all_tokens = D.bpe_en_obj.base_vocab + ['<unk>', '<pad>']
# word2idx = {}
# for ind, ele in enumerate(all_tokens):
#     word2idx[ele] = ind

# def tokenize(x):

#     res = D.bpe_en_obj.tokenize(x)
#     while len(res) < max_tokens:
#         res.append('<pad>')

    
#     return torch.tensor([word2idx[ele] for ele in res])



# enc_input = torch.stack([tokenize(x) for x in x_en], dim = 0)

# all_tokens = D.bpe_hin_obj.base_vocab + ['<unk>', '<pad>', '<eos>', '<start>']
# word2idx = {}
# for ind, ele in enumerate(all_tokens):
#     word2idx[ele] = ind


# def tokenize(x):

#     res = D.bpe_hin_obj.tokenize(x)
#     key = 0
    
#     while len(res) < max_tokens:

#         if not key:
#             res.insert(0, '<start>')
#             res.append('<eos>')
#             key = 1
#             continue

#         res.append('<pad>')
    
#     return torch.tensor([word2idx[ele] for ele in res])

# dec_input = torch.stack([tokenize(x) for x in x_hi], dim = 0)

# training_ds = ds_filtered['train']

# training_ds.save_to_disk('/Users/ayushsinha/Project/Transformer')

# tokenized_iterable_dataset = iterable_dataset.map(lambda input: {'translation': {'en' : tokenize_en(input['translation']['en']) , 'hi' : tokenize_hin(input['translation']['hi'])}} )

# def make_target_output(x):
#     temp = []

#     for ele in x:
#         shifted_tensor = torch.roll(ele, shifts=-1, dims=-1)
#         shifted_tensor[-1] = 201
#         temp.append(shifted_tensor)

#     return torch.stack(temp, dim = 0)
        

# target = make_target_output(dec_input)

# target.shape

# enc_input
# dec_input
# target

# from decoder import decoder_stack
# from encoder import encoder_stack

# loss = torch.nn.CrossEntropyLoss()

# enc = encoder_stack(4, 4, 512)

# enc_output = enc(enc_input)

# dec = decoder_stack(4, 4, 512, enc_output)

# output = dec(dec_input)

# output = output.reshape(-1, 204)
# target = target.reshape(-1)

# loss(output, target)
# # add the cross entropy loss function

# # do the backward pass

# # add batching logic

# # visualize training loss and see if its converging !

# model

In [28]:
iterable_dataset = training_ds.to_iterable_dataset()

In [29]:
def custom_mapper(x):
    
    en_tok = tokenize_en(x['translation']['en'])
    hi_tok = tokenize_hin(x['translation']['hi'])
    tar_tok = torch.roll(hi_tok, shifts=-1, dims=-1)
    tar_tok[-1] = 201
    return {'translation': {'en' : en_tok , 'hi' : hi_tok, 'tar': tar_tok}}


In [30]:
tokenized_iterable_dataset = iterable_dataset.map(lambda input: custom_mapper(input))

In [11]:
import torch.nn as nn
from decoder import decoder_stack
from encoder import encoder_stack

class Transformer_MT(nn.Module):

    def __init__(self):

        super().__init__()

        self.mps_device = torch.device("mps")
        self.enc = encoder_stack(4, 4, 512).to(self.mps_device)
    
    def forward(self, enc_input, dec_input):

        enc_output = self.enc(enc_input)
        self.dec = decoder_stack(4, 4, 512).to(self.mps_device)

        output = self.dec(dec_input, enc_output)

        output = output.reshape(-1, 204)
    
        return output

In [12]:
model = Transformer_MT()
learning_rate = 0.05

In [13]:
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [14]:
loss = torch.nn.CrossEntropyLoss()

In [15]:
from torch.utils.data import DataLoader

dataloader = DataLoader(tokenized_iterable_dataset, batch_size=32)



In [40]:
# torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x163fcebb0>

In [16]:
# temp = torch.tensor([1])

def train_loop(dataloader, model, loss_fn, optimizer):
    model.train()
    batch_size = 32
    mps_device = torch.device("mps")
    # size = 
    for batch, X in enumerate(dataloader):
        # print(batch, X['translation']['en'][0], X['translation']['tar'][0])

        optimizer.zero_grad()

        inputs_1 = X['translation']['en'].to(mps_device)
        inputs_2 = X['translation']['hi'].to(mps_device)
        model_output = model(inputs_1, inputs_2)

        target = X['translation']['tar'].reshape(-1)
        target = target.to(mps_device)
        
        loss = loss_fn(model_output, target)
        
        # print(loss)
        # print(model_output.shape)

        loss.backward()
        optimizer.step()
        

        if batch % 10 == 0:
            loss, current = loss.item(), batch * batch_size + len(X)
            print(f"loss: {loss:>7f}, current: {current:>7f}" )
            torch.mps.empty_cache()

    # shifted_tensor = torch.roll(X['translation']['en'], shifts=-1, dims=-1).clone()
    # # shifted_tensor[:][-1] = 201
   
    # shifted_tensor[:][-1] = 201

    # temp = shifted_tensor
    # break

In [17]:
import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    print ("MPS device found!")
else:
    print ("MPS device not found.")

MPS device found!


In [18]:
model = model.to(mps_device)
# data = data.to(mps_device)
# labels = labels.to(mps_device) # If applicable

In [19]:
epochs = 10

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(dataloader, model, loss, optimizer)
print("Done!")

Epoch 1
-------------------------------
loss: 5.319954, current: 1.000000
loss: 5.317126, current: 321.000000
loss: 5.319087, current: 641.000000
loss: 5.316188, current: 961.000000
loss: 5.319928, current: 1281.000000
loss: 5.313426, current: 1601.000000
loss: 5.318495, current: 1921.000000
loss: 5.319056, current: 2241.000000
loss: 5.319101, current: 2561.000000
loss: 5.320077, current: 2881.000000
loss: 5.320329, current: 3201.000000
loss: 5.319688, current: 3521.000000
loss: 5.320481, current: 3841.000000
loss: 5.317501, current: 4161.000000
loss: 5.317886, current: 4481.000000
loss: 5.320276, current: 4801.000000


KeyboardInterrupt: 

In [45]:
# import os
# os.environ['TORCH_SHOW_CPP_STACKTRACES'] = "1"

In [72]:
# import torch
# print(torch.__version__)
# !pip3 install --upgrade torch torchvision torchaudio

You should consider upgrading via the '/Users/ayushsinha/Project/Transformer/.venv/bin/python3 -m pip install --upgrade pip' command.[0m


In [None]:
train_loop(dataloader, model, loss, optimizer)

In [None]:
# loss = torch.nn.CrossEntropyLoss()
# input = torch.randn(3, 5, requires_grad=True)
# target = torch.empty(3, dtype=torch.long).random_(5)
# output = loss(input, target)
# print(torch.empty(3, dtype=torch.long).random_(5))
# torch.squeeze(output).shape
# torch.squeeze(target)
# loss(torch.squeeze(output), torch.squeeze(target))
# temp = torch.randn(size = (2,2))
# print(temp)
# print(temp.reshape(-1))

tensor([[-0.7054,  0.8946],
        [ 0.2116, -2.9205]])
tensor([-0.7054,  0.8946,  0.2116, -2.9205])
