In [1]:
!pip install -q torchtext==0.15.2
!pip install -q spacy
!pip install -q indic-nlp-library
!python3 -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.6.0/en_core_web_lg-3.6.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [2]:
import math
import time
import io, gc
from collections import Counter
import pandas as pd

from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory

import spacy
from spacy.lang.hi.examples import sentences

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.nn import TransformerEncoder, TransformerDecoder, TransformerEncoderLayer, TransformerDecoderLayer
from torch import Tensor

import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab

#spacy.prefer_gpu() use only during multiple GPUs to avoid memory consumption 

In [3]:
!nvidia-smi

Sat Jul 29 19:18:21 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    41W / 300W |      2MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
data = pd.read_csv('gs://bilingualdata/SHORT_HI_EN.csv', encoding='utf-8', delimiter = ',')

In [5]:
data.shape

(127607, 3)

In [6]:
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [7]:
data['source'].value_counts()

tides        50000
ted          39881
indic2012    37726
Name: source, dtype: int64

In [8]:
data['english_sentence'][0], data['hindi_sentence'][0]

('politicians do not have permission to do what needs to be done.',
 'राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है .')

In [9]:
user_input_df = data['english_sentence'].tolist()
user_output_df = data['hindi_sentence'].tolist()

In [10]:
len(user_input_df), len(user_output_df)

(127607, 127607)

In [11]:
# train, val, and test sets partitioning ratio
127607*0.7, 127607*0.15, 127607*0.15, 89325+19141+19141, 89325+19141

(89324.9, 19141.05, 19141.05, 127607, 108466)

In [12]:
# Assign training, validation, and test sets (70% / 15% / 15%).
# The slice boundaries are derived from the dataset size instead of being
# hard-coded, so the split stays valid if the CSV changes. For the current
# 127,607 rows they evaluate to 89,325 and 108,466.
TRAIN_FRAC, VALID_FRAC = 0.7, 0.15
n_samples = len(user_input_df)
train_end = round(n_samples * TRAIN_FRAC)
valid_end = train_end + round(n_samples * VALID_FRAC)

input_train_df = user_input_df[:train_end]
output_train_df = user_output_df[:train_end]
input_valid_df = user_input_df[train_end:valid_end]
output_valid_df = user_output_df[train_end:valid_end]
# Everything after the validation slice is the test set.
input_test_df = user_input_df[valid_end:]
output_test_df = user_output_df[valid_end:]

In [13]:
len(input_train_df), len(output_train_df)

(89325, 89325)

In [14]:
len(input_valid_df), len(output_valid_df)

(19141, 19141)

In [15]:
len(input_test_df), len(output_test_df)

(19141, 19141)

In [16]:
# Load EN tokenizer
eng_tokenizer = get_tokenizer('spacy', language = 'en_core_web_lg')

def hindi_tokenizer(data):
    """Normalize a Hindi string and split it into tokens.

    The text is normalized with the Indic NLP 'hi' normalizer (created with
    ``remove_nuktas = True``) and then tokenized with
    ``indic_tokenize.trivial_tokenize``.

    Args:
        data: the Hindi text to tokenize.

    Returns:
        The token sequence returned by ``indic_tokenize.trivial_tokenize``.
    """
    # The normalizer does not depend on the input, so build it once and reuse it.
    # This function is called once per sentence, and rebuilding the factory and
    # normalizer on every call is pure overhead.
    normalizer = getattr(hindi_tokenizer, '_normalizer', None)
    if normalizer is None:
        normalizer = IndicNormalizerFactory().get_normalizer('hi',remove_nuktas = True)
        hindi_tokenizer._normalizer = normalizer
    text = normalizer.normalize(data)
    return indic_tokenize.trivial_tokenize(text)

In [17]:
# generate a Vocabulary with only words that occur a minimum of 1 times.

def build_bivocab(filedata, tokenizer):
    """Build a torchtext vocabulary from an iterable of sentences.

    Every item of ``filedata`` is converted with ``str`` and split by
    ``tokenizer``. All resulting tokens are counted, and the counts are handed
    to ``vocab`` together with the special symbols '<unk>', '<pad>', '<bos>'
    and '<eos>'.

    Args:
        filedata: iterable of sentences (items are passed through ``str``).
        tokenizer: callable mapping a string to a sequence of tokens.

    Returns:
        The object returned by ``vocab`` for the collected token counts.
    """
    token_counts = Counter(
        token
        for sentence in filedata
        for token in tokenizer(str(sentence))
    )
    return vocab(token_counts, specials = ['<unk>', '<pad>', '<bos>', '<eos>'])

In [18]:
eng_vocab = build_bivocab(input_train_df, eng_tokenizer)
eng_vocab.set_default_index(eng_vocab['<unk>'])
eng_vocab[' ']

0

In [19]:
hindi_vocab = build_bivocab(output_train_df,hindi_tokenizer)
hindi_vocab.set_default_index(hindi_vocab['<unk>'])
hindi_vocab['<eos>']

3

In [20]:
print(eng_vocab['politics'])

3273


In [21]:
#eng_vocab.parameters()
gc.collect()

2477

In [22]:
def process_df(filepath_input, filepath_output):
    """Turn parallel English/Hindi sentences into pairs of index tensors.

    The two arguments are iterated in lockstep. Each English sentence is
    tokenized with ``eng_tokenizer`` and looked up in ``eng_vocab``; each Hindi
    sentence is tokenized with ``hindi_tokenizer`` and looked up in
    ``hindi_vocab``.

    Args:
        filepath_input: iterable of English sentences (items go through ``str``).
        filepath_output: iterable of Hindi sentences (items go through ``str``).

    Returns:
        A list of ``(english_tensor, hindi_tensor)`` tuples of dtype
        ``torch.long``, one per sentence pair.
    """
    def encode(sentence, tokenizer, vocabulary):
        # Map every token of one sentence to its vocabulary index.
        indices = [vocabulary[token] for token in tokenizer(str(sentence))]
        return torch.tensor(indices, dtype = torch.long)

    return [
        (
            encode(raw_eng, eng_tokenizer, eng_vocab),
            encode(raw_hindi, hindi_tokenizer, hindi_vocab),
        )
        for raw_eng, raw_hindi in zip(filepath_input, filepath_output)
    ]

In [None]:
train_df = process_df(input_train_df, output_train_df)
val_df = process_df(input_valid_df, output_valid_df)
test_df = process_df(input_test_df, output_test_df)

In [None]:
print("Size of the EN-HI train set:", len(train_df))
print("Size of the EN-HI val set:", len(val_df))
print("Size of the EN-HI test set:", len(test_df))

In [None]:
# Number of sentence pairs per DataLoader batch.
BATCH_SIZE = 128
# Special-token indices are read from the English vocabulary but are used for
# both languages (generate_batch, create_mask and the loss all use them for
# Hindi too). Both vocabularies are built by build_bivocab with the same
# specials list.
# NOTE(review): relies on torchtext assigning the specials identical indices in
# both vocabularies - confirm if the vocabulary construction changes.
PAD_IDX = eng_vocab['<pad>']
BOS_IDX = eng_vocab['<bos>']
EOS_IDX = eng_vocab['<eos>']
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def generate_batch(data_batch):
    """Collate (english, hindi) tensor pairs into two padded batch tensors.

    Every sequence is wrapped as ``[BOS_IDX] + sequence + [EOS_IDX]`` and the
    wrapped sequences of each language are padded with ``PAD_IDX`` through
    ``pad_sequence`` (called with its default layout).

    Args:
        data_batch: iterable of ``(eng_item, hindi_item)`` 1-D index tensors.

    Returns:
        A tuple ``(eng_batch, hindi_batch)`` of padded tensors.
    """
    bos_marker = torch.tensor([BOS_IDX])
    eos_marker = torch.tensor([EOS_IDX])

    def wrap(sequence):
        # Surround one sentence with the begin/end-of-sentence markers.
        return torch.cat([bos_marker, sequence, eos_marker], dim = 0)

    wrapped_pairs = [(wrap(eng_item), wrap(hindi_item)) for eng_item, hindi_item in data_batch]
    eng_batch = pad_sequence([eng for eng, _ in wrapped_pairs], padding_value = PAD_IDX)
    hindi_batch = pad_sequence([hindi for _, hindi in wrapped_pairs], padding_value = PAD_IDX)
    return eng_batch, hindi_batch

In [None]:
# Only the training data is reshuffled every epoch. The validation and test
# loaders keep a fixed order: the reported loss is an average of per-batch
# losses, so a fixed batch composition makes evaluation reproducible.
train_iter = DataLoader(train_df, batch_size = BATCH_SIZE, shuffle = True, collate_fn = generate_batch)
valid_iter = DataLoader(val_df, batch_size = BATCH_SIZE, shuffle = False, collate_fn = generate_batch)
test_iter = DataLoader(test_df, batch_size = BATCH_SIZE, shuffle = False, collate_fn = generate_batch)

In [None]:
print(len(train_iter))

In [None]:
#for (idx, batch) in enumerate(valid_iter):
    #print(idx)
    #print(batch[0])
torch.cuda.empty_cache()

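## Modelling (standard Transformer model based on "Attention Is All You Need", https://arxiv.org/abs/1706.03762; the attention mechanism for translation was introduced in https://arxiv.org/abs/1409.0473)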

In [None]:
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder Transformer for sequence-to-sequence translation.

    Combines source/target token embeddings, a shared positional encoding, a
    ``TransformerEncoder``/``TransformerDecoder`` stack and a linear layer that
    projects decoder outputs to target-vocabulary logits.

    Args:
        num_encoder_layers: number of stacked encoder layers.
        num_decoder_layers: number of stacked decoder layers.
        emb_size: embedding size, also used as ``d_model`` of every layer.
        src_vocab_size: number of entries in the source embedding table.
        tgt_vocab_size: number of entries in the target embedding table and
            size of the output logits.
        dim_feedforward: hidden size of the feed-forward sub-layers.
        dropout: dropout probability applied in the positional encoding and in
            every encoder/decoder layer.

    Note:
        The number of attention heads is read from the module-level ``NHEAD``
        constant, which must be defined before the model is instantiated.
    """
    def __init__(self, num_encoder_layers:int, num_decoder_layers:int, emb_size:int, src_vocab_size:int, tgt_vocab_size:int, dim_feedforward:int = 512, dropout:float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        # `dropout` is forwarded to the layers as well; previously it only reached
        # the positional encoding and the layers silently kept their own default.
        encoder_layer = TransformerEncoderLayer(d_model = emb_size, nhead = NHEAD, dim_feedforward = dim_feedforward, dropout = dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers = num_encoder_layers)
        decoder_layer = TransformerDecoderLayer(d_model = emb_size, nhead = NHEAD, dim_feedforward = dim_feedforward, dropout = dropout)
        self.transformer_decoder = TransformerDecoder(decoder_layer, num_layers = num_decoder_layers)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout = dropout)

    def forward(self, src: Tensor, tgt: Tensor, src_mask:Tensor, tgt_mask:Tensor, src_padding_mask:Tensor, tgt_padding_mask:Tensor, memory_key_padding_mask:Tensor):
        """Run the full encoder-decoder pass and return target-vocabulary logits.

        Args:
            src: source token indices.
            tgt: target token indices fed to the decoder.
            src_mask: attention mask for the encoder self-attention.
            tgt_mask: attention mask for the decoder self-attention.
            src_padding_mask: key-padding mask for the source sequence.
            tgt_padding_mask: key-padding mask for the target sequence.
            memory_key_padding_mask: key-padding mask applied to the encoder
                output inside the decoder.

        Returns:
            The decoder output projected by ``self.generator``.

        NOTE(review): tensors are presumably sequence-first (seq_len, batch), the
        layout the layers expect when created without ``batch_first`` - confirm
        against the collate function.
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        memory = self.transformer_encoder(src_emb, mask = src_mask, src_key_padding_mask = src_padding_mask)
        # No memory mask is used: every target position may attend to every
        # (non-padded) encoder output.
        outs = self.transformer_decoder(
            tgt_emb,
            memory,
            tgt_mask = tgt_mask,
            memory_mask = None,
            tgt_key_padding_mask = tgt_padding_mask,
            memory_key_padding_mask = memory_key_padding_mask,
        )
        return self.generator(outs)

    def encode(self, src:Tensor, src_mask:Tensor):
        """Embed ``src`` and return the encoder output (no padding mask applied)."""
        return self.transformer_encoder(self.positional_encoding(self.src_tok_emb(src)), src_mask)

    def decode(self, tgt:Tensor, memory:Tensor, tgt_mask:Tensor):
        """Embed ``tgt`` and return the decoder output for the given ``memory``.

        The result is not passed through ``self.generator``.
        """
        return self.transformer_decoder(self.positional_encoding(self.tgt_tok_emb(tgt)), memory, tgt_mask)
    
    
class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is multiplied by ``sqrt(emb_size)``."""
    def __init__(self,vocab_size:int, emb_size):
        super().__init__()
        self.emb_size = emb_size
        self.embedding = nn.Embedding(vocab_size, emb_size)

    def forward(self, tokens:Tensor):
        """Return the scaled embeddings of ``tokens`` (cast to int64 first)."""
        scale = math.sqrt(self.emb_size)
        looked_up = self.embedding(tokens.long())
        return looked_up * scale
    
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    A table of shape ``(maxlen, 1, emb_size)`` is precomputed and stored as the
    non-trainable buffer ``pos_embedding``: even feature columns hold sines and
    odd feature columns hold cosines of ``position * frequency``.

    Args:
        emb_size: size of the embedding (last) dimension; may be even or odd.
        dropout: dropout probability applied after the addition.
        maxlen: number of positions to precompute (longest supported sequence).
    """
    def __init__(self, emb_size:int, dropout, maxlen:int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(-torch.arange(0,emb_size, 2) * math.log(10000)/emb_size)
        pos = torch.arange(0,maxlen).reshape(maxlen,1)
        angles = pos * den
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:,0::2] = torch.sin(angles)
        # For an odd emb_size there is one cosine column fewer than sine columns,
        # so trim the angle table; for an even emb_size the slice is a no-op.
        pos_embedding[:,1::2] = torch.cos(angles[:, :emb_size // 2])
        # Insert a singleton axis so the table broadcasts over dimension 1 of
        # the input in forward().
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding',pos_embedding)

    def forward(self, token_embedding:Tensor):
        """Add the first ``token_embedding.size(0)`` position rows, then apply dropout."""
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0),:])

In [None]:
def generate_square_subsequent_mask(sz):
    """Return the ``sz`` x ``sz`` float mask used for causal self-attention.

    Entries on and below the main diagonal are ``0.0``; entries strictly above
    it are ``-inf``. The tensor is created on ``DEVICE``.
    """
    blocked = torch.full((sz, sz), float('-inf'), device = DEVICE)
    # Keep -inf only strictly above the diagonal; triu zeroes everything else.
    return torch.triu(blocked, diagonal = 1)

def create_mask(src,tgt):
    """Build the attention and padding masks for one source/target batch.

    Sequence lengths are taken from dimension 0 of ``src`` and ``tgt``.

    Returns:
        A tuple ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)``:
        an all-``False`` boolean square mask for the source, the mask from
        ``generate_square_subsequent_mask`` for the target, and the transposed
        boolean maps of positions equal to ``PAD_IDX`` in ``src`` and ``tgt``.
    """
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    # The source may attend everywhere; the target only to earlier positions.
    unrestricted_src = torch.zeros((src_len, src_len), dtype = torch.bool, device = DEVICE)
    causal_tgt = generate_square_subsequent_mask(tgt_len)

    # Padding maps are transposed so that the batch dimension comes first.
    src_is_pad = (src == PAD_IDX).transpose(0,1)
    tgt_is_pad = (tgt == PAD_IDX).transpose(0,1)
    return unrestricted_src, causal_tgt, src_is_pad, tgt_is_pad

In [None]:
# Model and training hyperparameters.
# Vocabulary sizes: passed to Seq2SeqTransformer as the source (English) and
# target (Hindi) vocabulary sizes.
EN_VOCAB_SIZE = len(eng_vocab)
HI_VOCAB_SIZE = len(hindi_vocab)
# Embedding size, passed to Seq2SeqTransformer as emb_size.
EMB_SIZE = 512
# Number of attention heads; read as a global inside Seq2SeqTransformer.__init__.
NHEAD = 8
# Passed to Seq2SeqTransformer as dim_feedforward.
FFN_HID_DIM = 32
NUM_ENCODER_LAYERS = 6
NUM_DECODER_LAYERS = 6
# Number of iterations of the training loop.
NUM_EPOCHS = 500

In [None]:
# Build the model: English is the source vocabulary, Hindi the target vocabulary.
transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE, EN_VOCAB_SIZE, HI_VOCAB_SIZE, FFN_HID_DIM)

# Xavier-uniform initialisation for every parameter with more than one
# dimension; 1-D parameters keep their default initialisation.
for p in transformer.parameters():
    if p.dim()>1:
        nn.init.xavier_uniform_(p)

# Move the model to DEVICE before the optimizer is created from its parameters.
transformer = transformer.to(DEVICE)

# Targets equal to PAD_IDX are ignored by the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index = PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr = 0.0001, betas = (0.9,0.98), eps = 1e-9)

In [None]:
def train_epoch(model, train_iter, optimizer):
    """Train ``model`` for one pass over ``train_iter`` and return the mean batch loss.

    For every batch the Hindi sequence without its last position is fed to the
    decoder and the sequence without its first position is the prediction
    target (teacher forcing). After the pass the whole model object is saved to
    's2sattn_model_500.pth'.

    Args:
        model: the network to train; put into training mode here.
        train_iter: iterable of ``(eng, hindi)`` batches; must support ``len``.
        optimizer: optimizer stepping the model parameters.

    Returns:
        The sum of per-batch losses divided by ``len(train_iter)``.
    """
    model.train()
    running_loss = 0
    for eng, hindi in train_iter:
        eng, hindi = eng.to(DEVICE), hindi.to(DEVICE)
        decoder_input, decoder_target = hindi[:-1,:], hindi[1:,:]

        masks = create_mask(eng, decoder_input)
        eng_mask, hindi_mask, eng_padding_mask, hindi_padding_mask = masks
        # The source padding mask is also used as the memory key-padding mask.
        logits = model(eng, decoder_input, eng_mask, hindi_mask, eng_padding_mask, hindi_padding_mask, eng_padding_mask)

        optimizer.zero_grad()
        flat_logits = logits.reshape(-1,logits.shape[-1])
        loss = loss_fn(flat_logits, decoder_target.reshape(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    torch.save(model,'s2sattn_model_500.pth')
    return running_loss/len(train_iter)

In [None]:
def evaluate(model, val_iter):
    """Return the mean batch loss of ``model`` over ``val_iter`` without training.

    The model is put into evaluation mode and the forward passes run under
    ``torch.no_grad()``, so no autograd graph is built. This keeps evaluation
    from holding activation memory on the GPU.

    Args:
        model: the network to evaluate.
        val_iter: iterable of ``(eng, hindi)`` batches; must support ``len``.

    Returns:
        The sum of per-batch losses divided by ``len(val_iter)``.
    """
    model.eval()
    losses = 0
    with torch.no_grad():
        # Iterate the loader passed in by the caller; the previous version read
        # the global `valid_iter` and ignored this argument.
        for eng, hindi in val_iter:
            eng = eng.to(DEVICE)
            hindi = hindi.to(DEVICE)
            # Decoder input drops the last position; the target drops the first.
            hindi_input = hindi[:-1,:]
            eng_mask, hindi_mask, eng_padding_mask, hindi_padding_mask = create_mask(eng, hindi_input)
            logits = model(eng, hindi_input, eng_mask, hindi_mask, eng_padding_mask, hindi_padding_mask, eng_padding_mask)
            hindi_out = hindi[1:,:]
            loss = loss_fn(logits.reshape(-1, logits.shape[-1]), hindi_out.reshape(-1))
            losses += loss.item()
    return losses/ len(val_iter)

In [40]:
# Main training loop: one training pass followed by one validation pass per epoch.
for epoch in range(1, NUM_EPOCHS+1):
    start_time = time.time()
    train_loss = train_epoch(transformer, train_iter, optimizer)
    # The timer stops before validation, so the reported epoch time covers
    # training only.
    end_time = time.time()
    val_loss = evaluate(transformer, valid_iter)
    print((f"Epoch : {epoch}, Train loss: {train_loss:.3f}, Val Loss: {val_loss:.3f}," 
           f"Epoch Time= {(end_time - start_time):.3f}s"))



OutOfMemoryError: CUDA out of memory. Tried to allocate 2.97 GiB (GPU 0; 15.89 GiB total capacity; 14.52 GiB already allocated; 527.88 MiB free; 14.71 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF