In [1]:
import numpy as np
import pandas as pd
import spacy
from string import digits
import random
from torchtext.data.utils import get_tokenizer
import torch
import torchtext
from collections import Counter
from torchtext.vocab import vocab
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
#torch.cuda.empty_cache()

import math
import time

import numpy as np
import pandas as pd

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/en-fr-translation-dataset/en-fr.csv


In [2]:
SEED = 97

random.seed(SEED)
np.random.seed(SEED)
#torch.manual_seed(SEED)
# torch.cuda.manual_seed(SEED)
# torch.backends.cudnn.deterministic = True

In [3]:
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting fr-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.5.0/fr_core_news_sm-3.5.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.5.0
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [4]:
MAX_LEN = 256
check_len = lambda x: len(x.split(' ')) > MAX_LEN

In [5]:
data = pd.read_csv('/kaggle/input/en-fr-translation-dataset/en-fr.csv', nrows=5000)
data = data.dropna().drop_duplicates()
data = data.drop(data[data.en.apply(check_len) | data.fr.apply(check_len)].index)
data.head(5)

Unnamed: 0,en,fr
0,Changing Lives | Changing Society | How It Wor...,Il a transformé notre vie | Il a transformé la...
1,Site map,Plan du site
2,Feedback,Rétroaction
3,Credits,Crédits
4,Français,English


In [6]:
len(data)

4998

In [7]:
fr_tokenizer = get_tokenizer('spacy', language='fr_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

In [8]:
val_frac = 0.1
test_frac = 0.05
val_split_idx = int(len(data)*val_frac)
test_split_idx = int(len(data)*(val_frac + test_frac))
data_idx = list(range(len(data)))
np.random.shuffle(data_idx)

val_idx, test_idx, train_idx = data_idx[:val_split_idx], data_idx[val_split_idx:test_split_idx], data_idx[test_split_idx:]
print('Length of train set: ', len(train_idx))
print('Length of val set: ', len(val_idx))
print('Length of test set: ', len(test_idx))

df_train = data.iloc[train_idx].reset_index().drop('index',axis=1)
df_test = data.iloc[test_idx].reset_index().drop('index',axis=1)
df_val = data.iloc[val_idx].reset_index().drop('index',axis=1)

Length of train set:  4249
Length of val set:  499
Length of test set:  250


In [9]:
def build_vocab(data, source_tokenizer, target_tokenizer):
    en_counter = Counter()
    fr_counter = Counter()
    translations = data.values.tolist()
    for translation in translations:
        en_counter.update(source_tokenizer(translation[0]))
        fr_counter.update(target_tokenizer(translation[1]))
    return vocab(en_counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'], min_freq=5), vocab(fr_counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'], min_freq=5)

In [10]:
en_vocab, fr_vocab = build_vocab(df_train, en_tokenizer, fr_tokenizer)
en_vocab.set_default_index(en_vocab['<unk>'])
fr_vocab.set_default_index(fr_vocab['<unk>'])

In [11]:
def data_process(data):
    translations = data.values.tolist()
    pairs = []
    for translation in translations:
        en_tensor = torch.tensor([en_vocab[token] for token in en_tokenizer(translation[0])],
                            dtype=torch.long)
        fr_tensor = torch.tensor([fr_vocab[token] for token in fr_tokenizer(translation[1])],
                            dtype=torch.long)
        pairs.append((en_tensor, fr_tensor))
    return pairs

In [12]:
train_data = data_process(df_train)
val_data = data_process(df_val)
test_data = data_process(df_test)

In [13]:
BATCH_SIZE = 16
PAD_IDX = en_vocab['<pad>']
BOS_IDX = en_vocab['<bos>']
EOS_IDX = en_vocab['<eos>']

In [14]:
def generate_batch(data_batch):
    en_batch, fr_batch = [], []
    for (en_item, fr_item) in data_batch:
        en_batch.append(torch.cat([torch.tensor([BOS_IDX]), en_item, torch.tensor([EOS_IDX])], dim=0).to(device))
        fr_batch.append(torch.cat([torch.tensor([BOS_IDX]), fr_item, torch.tensor([EOS_IDX])], dim=0).to(device))  
        
    en_batch = pad_sequence(en_batch, padding_value=PAD_IDX)
    fr_batch = pad_sequence(fr_batch, padding_value=PAD_IDX)
    return en_batch, fr_batch

In [15]:
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)

In [16]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, device):
        super().__init__()
        
        assert kernel_size % 2 == 1, "Kernel size must be odd!"
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.kernel_size = kernel_size
        self.dropout = dropout
        self.device = device
        
        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
        
        self.tok_embedding = nn.Embedding(input_dim, emb_dim)
        self.pos_embedding = nn.Embedding(100, emb_dim)
        
        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)
        
        self.convs = nn.ModuleList([nn.Conv1d(in_channels = hid_dim, 
                                              out_channels = 2 * hid_dim, 
                                              kernel_size = kernel_size, 
                                              padding = (kernel_size - 1) // 2)
                                    for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [batch size, src sent len]
        
        #create position tensor
        pos = torch.arange(0, src.shape[1]).unsqueeze(0).repeat(src.shape[0], 1).to(self.device)
        
        #pos = [batch size, src sent len]
        
        #embed tokens and positions
        tok_embedded = self.tok_embedding(src)
        pos_embedded = self.pos_embedding(pos)
        
        #tok_embedded = pos_embedded = [batch size, src sent len, emb dim]
        
        #combine embeddings by elementwise summing
        embedded = self.dropout(tok_embedded + pos_embedded)
        
        #embedded = [batch size, src sent len, emb dim]
        
        #pass embedded through linear layer to go through emb dim -> hid dim
        conv_input = self.emb2hid(embedded)
        
        #conv_input = [batch size, src sent len, hid dim]
        
        #permute for convolutional layer
        conv_input = conv_input.permute(0, 2, 1) 
        
        #conv_input = [batch size, hid dim, src sent len]
        
        for i, conv in enumerate(self.convs):
        
            #pass through convolutional layer
            conved = conv(self.dropout(conv_input))

            #conved = [batch size, 2*hid dim, src sent len]

            #pass through GLU activation function
            conved = F.glu(conved, dim = 1)

            #conved = [batch size, hid dim, src sent len]
            
            #apply residual connection
            conved = (conved + conv_input) * self.scale

            #conved = [batch size, hid dim, src sent len]
            
            #set conv_input to conved for next loop iteration
            conv_input = conved
        
        #permute and convert back to emb dim
        conved = self.hid2emb(conved.permute(0, 2, 1))
        
        #conved = [batch size, src sent len, emb dim]
        
        #elementwise sum output (conved) and input (embedded) to be used for attention
        combined = (conved + embedded) * self.scale
        
        #combined = [batch size, src sent len, emb dim]
        
        return conved, combined

In [17]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, pad_idx, device):
        super().__init__()
        
        self.output_dim = output_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.kernel_size = kernel_size
        self.dropout = dropout
        self.pad_idx = pad_idx
        self.device = device
        
        self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
        
        self.tok_embedding = nn.Embedding(output_dim, emb_dim)
        self.pos_embedding = nn.Embedding(100, emb_dim)
        
        self.emb2hid = nn.Linear(emb_dim, hid_dim)
        self.hid2emb = nn.Linear(hid_dim, emb_dim)
        
        self.attn_hid2emb = nn.Linear(hid_dim, emb_dim)
        self.attn_emb2hid = nn.Linear(emb_dim, hid_dim)
        
        self.out = nn.Linear(emb_dim, output_dim)
        
        self.convs = nn.ModuleList([nn.Conv1d(hid_dim, 2*hid_dim, kernel_size)
                                    for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
      
    def calculate_attention(self, embedded, conved, encoder_conved, encoder_combined):
        
        #embedded = [batch size, trg sent len, emb dim]
        #conved = [batch size, hid dim, trg sent len]
        #encoder_conved = encoder_combined = [batch size, src sent len, emb dim]
        
        #permute and convert back to emb dim
        conved_emb = self.attn_hid2emb(conved.permute(0, 2, 1))
        
        #conved_emb = [batch size, trg sent len, emb dim]
        
        combined = (embedded + conved_emb) * self.scale
        
        #combined = [batch size, trg sent len, emb dim]
                
        energy = torch.matmul(combined, encoder_conved.permute(0, 2, 1))
        
        #energy = [batch size, trg sent len, src sent len]
        
        attention = F.softmax(energy, dim=2)
        
        #attention = [batch size, trg sent len, src sent len]
            
        attended_encoding = torch.matmul(attention, (encoder_conved + encoder_combined))
        
        #attended_encoding = [batch size, trg sent len, emd dim]
        
        #convert from emb dim -> hid dim
        attended_encoding = self.attn_emb2hid(attended_encoding)
        
        #attended_encoding = [batch size, trg sent len, hid dim]
        
        attended_combined = (conved + attended_encoding.permute(0, 2, 1)) * self.scale
        
        #attended_combined = [batch size, hid dim, trg sent len]
        
        return attention, attended_combined
        
    def forward(self, trg, encoder_conved, encoder_combined):
        
        #trg = [batch size, trg sent len]
        #encoder_conved = encoder_combined = [batch size, src sent len, emb dim]
                
        #create position tensor
        pos = torch.arange(0, trg.shape[1]).unsqueeze(0).repeat(trg.shape[0], 1).to(device)
        
        #pos = [batch size, trg sent len]
        
        #embed tokens and positions
        tok_embedded = self.tok_embedding(trg)
        pos_embedded = self.pos_embedding(pos)
        
        #tok_embedded = [batch size, trg sent len, emb dim]
        #pos_embedded = [batch size, trg sent len, emb dim]
        
        #combine embeddings by elementwise summing
        embedded = self.dropout(tok_embedded + pos_embedded)
        
        #embedded = [batch size, trg sent len, emb dim]
        
        #pass embedded through linear layer to go through emb dim -> hid dim
        conv_input = self.emb2hid(embedded)
        
        #conv_input = [batch size, trg sent len, hid dim]
        
        #permute for convolutional layer
        conv_input = conv_input.permute(0, 2, 1) 
        
        #conv_input = [batch size, hid dim, trg sent len]
        
        for i, conv in enumerate(self.convs):
        
            #apply dropout
            conv_input = self.dropout(conv_input)
        
            #need to pad so decoder can't "cheat"
            padding = torch.zeros(conv_input.shape[0], conv_input.shape[1], self.kernel_size-1).fill_(self.pad_idx).to(device)
            padded_conv_input = torch.cat((padding, conv_input), dim=2)
        
            #padded_conv_input = [batch size, hid dim, trg sent len + kernel size - 1]
        
            #pass through convolutional layer
            conved = conv(padded_conv_input)

            #conved = [batch size, 2*hid dim, trg sent len]
            
            #pass through GLU activation function
            conved = F.glu(conved, dim=1)

            #conved = [batch size, hid dim, trg sent len]
            
            attention, conved = self.calculate_attention(embedded, conved, encoder_conved, encoder_combined)
            
            #attention = [batch size, trg sent len, src sent len]
            #conved = [batch size, hid dim, trg sent len]
            
            #apply residual connection
            conved = (conved + conv_input) * self.scale
            
            #conved = [batch size, hid dim, trg sent len]
            
            #set conv_input to conved for next loop iteration
            conv_input = conved
            
        conved = self.hid2emb(conved.permute(0, 2, 1))
         
        #conved = [batch size, trg sent len, hid dim]
            
        output = self.out(self.dropout(conved))
        
        #output = [batch size, trg sent len, output dim]
            
        return output, attention

In [18]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg):
        
        #src = [batch size, src sent len]
        #trg = [batch size, trg sent len]
           
        #calculate z^u (encoder_conved) and e (encoder_combined)
        #encoder_conved is output from final encoder conv. block
        #encoder_combined is encoder_conved plus (elementwise) src embedding plus positional embeddings 
        encoder_conved, encoder_combined = self.encoder(src)
            
        #encoder_conved = [batch size, src sent len, emb dim]
        #encoder_combined = [batch size, src sent len, emb dim]
        
        #calculate predictions of next words
        #output is a batch of predictions for each word in the trg sentence
        #attention a batch of attention scores across the src sentence for each word in the trg sentence
        output, attention = self.decoder(trg, encoder_conved, encoder_combined)
        
        #output = [batch size, trg sent len, output dim]
        #attention = [batch size, trg sent len, src sent len]
        
        return output, attention

In [19]:
INPUT_DIM = len(en_vocab)
OUTPUT_DIM = len(fr_vocab)
EMB_DIM = 256
HID_DIM = 512
ENC_LAYERS = 10
DEC_LAYERS = 10
ENC_KERNEL_SIZE = 3
DEC_KERNEL_SIZE = 3
ENC_DROPOUT = 0.25
DEC_DROPOUT = 0.25
PAD_IDX = fr_vocab['<pad>']
    
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, ENC_LAYERS, ENC_KERNEL_SIZE, ENC_DROPOUT, device)
dec = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, DEC_LAYERS, DEC_KERNEL_SIZE, DEC_DROPOUT, PAD_IDX, device)

model = Seq2Seq(enc, dec, device).to(device)

In [20]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 33,838,523 trainable parameters


In [21]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [22]:
def train(model, dataloader, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, (src, trg) in enumerate(dataloader):   
        print(src.size())
        src = src.permute(1, 0)
        trg = trg.permute(1, 0)
        
        optimizer.zero_grad()
        
        output, _ = model(src, trg[:,:-1])
        
        #output = [batch size, trg sent len - 1, output dim]
        #trg = [batch size, trg sent len]
        
        output = output.contiguous().view(-1, output.shape[-1])
        trg = trg[:,1:].contiguous().view(-1)
        
        #output = [batch size * trg sent len - 1, output dim]
        #trg = [batch size * trg sent len - 1]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(dataloader)

In [23]:
def evaluate(model, dataloader, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
        for i, (src, trg) in enumerate(dataloader):
            src = src.permute(1, 0)
            trg = trg.permute(1, 0)
            output, _ = model(src, trg[:,:-1])
        
            #output = [batch size, trg sent len - 1, output dim]
            #trg = [batch size, trg sent len]

            output = output.contiguous().view(-1, output.shape[-1])
            trg = trg[:,1:].contiguous().view(-1)

            #output = [batch size * trg sent len - 1, output dim]
            #trg = [batch size * trg sent len - 1]
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(dataloader)

In [24]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [25]:
MODEL_PATH = 'cnn-encoder-decoder-attn-model.pt'

In [26]:
N_EPOCHS = 5
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_loader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_PATH)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

torch.Size([32, 16])
torch.Size([32, 16])
torch.Size([36, 16])
torch.Size([78, 16])
torch.Size([32, 16])
torch.Size([39, 16])
torch.Size([41, 16])
torch.Size([28, 16])
torch.Size([40, 16])
torch.Size([44, 16])
torch.Size([83, 16])
torch.Size([30, 16])
torch.Size([28, 16])
torch.Size([30, 16])
torch.Size([62, 16])
torch.Size([40, 16])
torch.Size([76, 16])
torch.Size([59, 16])
torch.Size([41, 16])
torch.Size([39, 16])
torch.Size([34, 16])
torch.Size([57, 16])
torch.Size([44, 16])
torch.Size([78, 16])
torch.Size([48, 16])
torch.Size([33, 16])
torch.Size([43, 16])
torch.Size([103, 16])


/usr/local/src/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [200,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/usr/local/src/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [200,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/usr/local/src/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [200,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/usr/local/src/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [200,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/usr/local/src/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [200,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/usr/local/src/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [200,0,0], thread: [37,0,0] Assertion 

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [None]:
model.load_state_dict(torch.load(MODEL_PATH))
test_loss = evaluate(model, test_loader, criterion)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')