<a href="https://colab.research.google.com/github/arjuns238/MachineTranslation/blob/main/MachineTranslation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from torch.utils.data import Dataset, DataLoader
from typing import Iterable, List
import torchtext

# # We need to modify the URLs for the dataset since the links to the original dataset are broken
# # Refer to https://github.com/pytorch/text/issues/1756#issuecomment-1163664163 for more info
# multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
# multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"


# Registries populated by later cells; both are keyed by language name.
token_transform, vocab_transform = {}, {}

In [6]:
# # Installing dependencies
# !pip install -U torchdata
# !pip install -U spacy
!pip install 'portalocker>=2.0.0'
# !python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm

Collecting portalocker>=2.0.0
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.8.2
Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [7]:
# --- Optimisation / evaluation settings (grouping inferred from the names) ---
batch_size = 64
learning_rate = 0.01
max_iters = 5_000
eval_interval = 500
eval_iters = 200
dropout = 0.2

# --- Model size settings (presumably consumed by the model cells; confirm there) ---
block_size = 256
n_embd = 384
no_of_heads = 6
n_layer = 6

# --- Hardware: use CUDA whenever PyTorch reports it as available ---
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# --- Translation direction; the values double as column names of the sentence table ---
SRC_LANGUAGE = 'Fr'
TGT_LANGUAGE = 'En'

In [8]:
# Read the tab-separated sentence pairs (the file has no header row) and
# label the two columns by language.
data = (
    pd.read_csv("/content/eng-fra.txt", sep="\t", header=None)
    .set_axis(['En', 'Fr'], axis="columns")
)
data.tail()

Unnamed: 0,En,Fr
135837,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
135838,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
135839,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...
135840,If someone who doesn't know your background sa...,Si quelqu'un qui ne connaît pas vos antécédent...
135841,It may be impossible to get a completely error...,Il est peut-être impossible d'obtenir un Corpu...


In [9]:
# Register one spaCy-backed tokenizer per language: French for the source side,
# English for the target side.
token_transform.update({
    SRC_LANGUAGE: get_tokenizer('spacy', language='fr_core_news_sm'),
    TGT_LANGUAGE: get_tokenizer('spacy', language='en_core_web_sm'),
})

# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> Iterable[List[str]]:
    """Lazily tokenize every sentence stored under ``language``.

    Args:
        data_iter: Container indexed by language key; ``data_iter[language]``
            must be iterable over raw sentences (here: a DataFrame column).
        language: Key used both to select the sentences from ``data_iter``
            and to select the tokenizer from ``token_transform``.

    Yields:
        The result of ``token_transform[language]`` for each sentence, in order.
    """
    # Look the tokenizer up once instead of on every sentence.
    tokenize = token_transform[language]
    for data_sample in data_iter[language]:
        yield tokenize(data_sample)

# Special symbols, listed in the order of the indices they must receive in the vocab.
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(len(special_symbols))

for ln in (SRC_LANGUAGE, TGT_LANGUAGE):
    print(ln)
    # Create torchtext's Vocab object from every tokenized sentence of this language.
    vocab_transform[ln] = build_vocab_from_iterator(
        yield_tokens(data, ln),
        min_freq=1,
        specials=special_symbols,
        special_first=True,
    )
    # Make ``UNK_IDX`` the index returned for tokens missing from the vocabulary.
    # Without a default index the lookup throws ``RuntimeError`` instead.
    vocab_transform[ln].set_default_index(UNK_IDX)

Fr
En


In [None]:
# Spot-check the English vocabulary in both directions: index -> token, token -> index.
print(
    vocab_transform["En"].lookup_token(200),
    vocab_transform["En"].lookup_indices(["left"]),
    sep="\n",
)

In [10]:
# Step 1: Encode a sentence
def encode_sentence(sentence: str, language: str, tokenizer, vocab) -> List[int]:
    """Tokenize ``sentence`` and map every token to its vocabulary index.

    Args:
        sentence: Raw text to encode.
        language: Language key; only used to pick ``vocab_transform[language]``
            when ``vocab`` is ``None``.
        tokenizer: Callable that splits ``sentence`` into a list of tokens.
        vocab: Object exposing ``lookup_indices(tokens)``. Pass ``None`` to
            fall back to the module-level vocabulary for ``language``.

    Returns:
        The list of indices produced by ``vocab.lookup_indices``.
    """
    # Honour the vocabulary supplied by the caller; the global registry is
    # only a fallback for callers that pass ``None``.
    if vocab is None:
        vocab = vocab_transform[language]
    tokens = tokenizer(sentence)
    return vocab.lookup_indices(tokens)

# Step 2: Decode a sequence
def decode_sequence(indices: List[int], language: str, vocab) -> str:
    """Turn a sequence of vocabulary indices back into a space-joined sentence.

    Args:
        indices: Iterable of vocabulary indices; each element is passed
            through ``int()`` before the lookup.
        language: Language key; only used to pick ``vocab_transform[language]``
            when ``vocab`` is ``None``.
        vocab: Object exposing ``lookup_token(index)``. Pass ``None`` to fall
            back to the module-level vocabulary for ``language``.

    Returns:
        The tokens joined by single spaces, with one leading ``<bos>`` and one
        trailing ``<eos>`` removed when present. Empty input gives ``''``.
    """
    # Honour the vocabulary supplied by the caller; the global registry is
    # only a fallback for callers that pass ``None``.
    if vocab is None:
        vocab = vocab_transform[language]
    tokens = [vocab.lookup_token(int(index)) for index in indices]
    # The ``tokens and`` guards keep empty input (or a lone <bos>) from
    # raising IndexError.
    if tokens and tokens[0] == '<bos>':
        tokens = tokens[1:]
    if tokens and tokens[-1] == '<eos>':
        tokens = tokens[:-1]
    return ' '.join(tokens)

# Example usage: send one French sentence through encode and back through decode.
sentence = "Je suis froid"
encoded = encode_sentence(
    sentence,
    SRC_LANGUAGE,
    tokenizer=token_transform[SRC_LANGUAGE],
    vocab=vocab_transform[SRC_LANGUAGE],
)
decoded = decode_sequence(encoded, SRC_LANGUAGE, vocab=vocab_transform[SRC_LANGUAGE])
for _label, _value in (
    ("Original sentence:", sentence),
    ("Encoded sequence:", encoded),
    ("Decoded sentence:", decoded),
):
    print(_label, _value)

Original sentence: Je suis froid
Encoded sequence: [6, 34, 448]
Decoded sentence: Je suis froid


In [11]:
from torch.nn.utils.rnn import pad_sequence

# helper function to club together sequential operations
def sequential_transforms(*transforms):
    """Compose ``transforms`` into one callable that applies them left to right.

    Args:
        *transforms: Single-argument callables; each receives the previous
            callable's return value.

    Returns:
        A function taking the initial input and returning the output of the
        last transform (or the input itself when no transforms were given).
    """
    def _pipeline(txt_input):
        value = txt_input
        for stage in transforms:
            value = stage(value)
        return value

    return _pipeline

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids: List[int]):
    """Wrap ``token_ids`` with BOS/EOS markers and return a 1-D index tensor.

    Args:
        token_ids: Vocabulary indices of one sentence (may be empty).

    Returns:
        A ``torch.long`` tensor laid out as ``[BOS_IDX, *token_ids, EOS_IDX]``.
    """
    # Pin the dtype: ``torch.tensor([])`` would otherwise take the default
    # floating-point dtype, so an empty sentence would not yield integer indices.
    return torch.cat((torch.tensor([BOS_IDX], dtype=torch.long),
                      torch.tensor(token_ids, dtype=torch.long),
                      torch.tensor([EOS_IDX], dtype=torch.long)))

# Per-language pipelines turning a raw string into a tensor of indices:
# tokenization -> numericalization -> add BOS/EOS and create tensor.
text_transform = {
    ln: sequential_transforms(
        token_transform[ln],
        vocab_transform[ln],
        tensor_transform,
    )
    for ln in [SRC_LANGUAGE, TGT_LANGUAGE]
}


# Collate raw (source, target) samples into two lists of per-sentence outputs.
def collate_fn(batch):
    """Apply the per-language text transform to every sample in ``batch``.

    Args:
        batch: Iterable of ``(src_sample, tgt_sample)`` string pairs.

    Returns:
        ``(src_batch, tgt_batch)``: two lists holding, in batch order, the
        results of ``text_transform[SRC_LANGUAGE]`` and
        ``text_transform[TGT_LANGUAGE]``. Padding is currently disabled, so
        the results are not stacked into a single tensor.
    """
    # Trailing newlines are stripped before each sentence is transformed.
    pairs = [
        (text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")),
         text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))
        for src_sample, tgt_sample in batch
    ]
    src_batch = [src for src, _ in pairs]
    tgt_batch = [tgt for _, tgt in pairs]

    # Padding step, left disabled:
    # src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    # tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

In [12]:
class CustomDataset(Dataset):
    """Map-style dataset of aligned (input sentence, output sentence) pairs."""

    def __init__(self, inputText, outputText):
        """
        Store the two aligned sentence collections.

        Args:
            inputText: Indexable collection of input sentences
                (e.g. a list or a pandas Series).
            outputText: Collection of output sentences aligned with
                ``inputText`` position by position.
        """
        self.inputText = inputText
        self.outputText = outputText

    def __len__(self):
        """
        Return the total number of samples, taken from ``inputText``.
        """
        return len(self.inputText)

    @staticmethod
    def _item_at(collection, idx):
        """Return the element at *position* ``idx`` of ``collection``."""
        # A pandas Series indexes by label with ``[]``; after filtering or
        # re-indexing the labels no longer match positions, so use ``.iloc``
        # whenever the collection provides it.
        positional = getattr(collection, "iloc", None)
        if positional is None:
            return collection[idx]
        return positional[idx]

    def __getitem__(self, idx):
        """
        Retrieve the sentence pair at the given position.

        Args:
            idx (int): Position of the pair to retrieve.

        Returns:
            tuple: ``(input sentence, output sentence)`` exactly as stored;
            no tokenization or encoding happens here.
        """
        sample = self._item_at(self.inputText, idx)
        output = self._item_at(self.outputText, idx)
        return sample, output

# Wrap the French (input) and English (output) columns in a dataset.
dataset = CustomDataset(inputText=data["Fr"], outputText=data["En"])

# Shuffled loader that relies on PyTorch's default collation of the stored pairs.
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

# Loader that runs every batch through ``collate_fn``; it iterates in dataset order.
# NOTE(review): this loader does not shuffle — confirm that is intended for training.
train_dataloader = DataLoader(dataset=dataset, collate_fn=collate_fn, batch_size=batch_size)

