# Fall 2022: DS-GA 1011 NLP with Representation Learning
## Homework 2
## Part 3: Neural Machine Translation (30 pts)
In this part, you will implement a Transformer encoder for Neural Machine Translation (NMT), using a sequence-to-sequence (seq2seq) model to translate English to French with PyTorch.

---
### 1 Transformer Encoder (18 pts)

In [106]:
# Add utilities path.
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
import sys

path_to_utils = 'utils'
if path_to_utils not in sys.path:
    sys.path.append(path_to_utils)

In [107]:
# Import modules
import time
from tqdm import notebook
from functools import partial

import torch
from torch import optim
import torch.nn as nn

import global_variables
import nmt_dataset
import nnet_models_new

In [4]:
# Load data
import os

source_name, target_name = 'en', 'fr'

# Directory under which models for this language pair are stored
base_saved_models_dir = '.'
saved_models_dir = os.path.join(base_saved_models_dir, source_name+'2'+target_name)

main_data_path = './data/'

# For each split, map 'source' / 'target' to the file path for that language
path_to_train_data = {side: main_data_path+'train.'+lang
                      for side, lang in (('source', source_name), ('target', target_name))}
path_to_val_data = {side: main_data_path+'valid.'+lang
                    for side, lang in (('source', source_name), ('target', target_name))}

In [5]:
# Preview the first five lines of the English training file (shell command via IPython `!`)
! head -5 './data/train.en'

i think we may have something that you d be interested in buying .
they got it .
i m glad to see you .
he got into his car in a hurry .
do you like mozart s music ?


In [6]:
saved_language_model_dir = os.path.join(saved_models_dir, 'lang_obj')

# Both splits are built with the same language names, the same lang_obj_path
# and the same minimum_count; only the file paths differ.
dataset_dict = {
    split: nmt_dataset.LanguagePair(
        source_name=source_name,
        target_name=target_name,
        filepath=split_paths,
        lang_obj_path=saved_language_model_dir,
        minimum_count=1,
    )
    for split, split_paths in (('train', path_to_train_data), ('val', path_to_val_data))
}

In [13]:
batchSize = 64

# Length cap: the 0.9999 quantile of the training set's 'source_len' column
train_source_lengths = dataset_dict['train'].main_df['source_len']
MAX_LEN = int(train_source_lengths.quantile(0.9999))
print('MAX_LEN:', MAX_LEN)

MAX_LEN: 32


In [14]:
from functools import partial
from torch.utils.data import DataLoader

# A single collate function, bound to the global length cap, shared by both loaders
collate_batch = partial(nmt_dataset.vocab_collate_func, MAX_LEN=MAX_LEN)

# NOTE(review): the validation loader is shuffled as well; this mirrors the
# existing behaviour -- confirm whether a fixed validation order is wanted.
dataloader_dict = {
    split: DataLoader(dataset_dict[split],
                      batch_size=batchSize,
                      collate_fn=collate_batch,
                      shuffle=True,
                      num_workers=0)
    for split in ('train', 'val')
}

In [15]:
# Configuration
source_lang_obj = dataset_dict['train'].source_lang_obj
target_lang_obj = dataset_dict['train'].target_lang_obj

# Vocabulary sizes, read from the language objects fetched above
source_vocab = source_lang_obj.n_words
target_vocab = target_lang_obj.n_words
hidden_size = 512
enc_layers = 1
lr = 0.25
longest_label = 1
gradient_clip = 0.3
# Only request CUDA when a GPU is actually present, so the notebook also
# runs on CPU-only machines instead of hard-coding True.
use_cuda = torch.cuda.is_available()

num_epochs = 20

#### 1.1 Encoder (9 pts)

In [48]:
import math
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to batch-first embeddings."""

    def __init__(self, d_model, dropout, max_len=100):
        """Initialize positional encoder.
        :param d_model: hidden size of desired model (may be even or odd)
        :param dropout: pct of embeddings to randomly zero out
        :param max_len: number of positions precomputed; inputs passed to
            forward() must not be longer than this

        Note that d_model is the same as model hidden size because embeddings need to be added together
        Implementation from http://nlp.seas.harvard.edu/annotated-transformer/#encoder-and-decoder-stacks
        """

        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)

        position = torch.arange(0, max_len).unsqueeze(1) # shape = (max_len, 1) for downstream vectorized computation

        # div_term[k] equals 1 / 10000^(2k/d_model); it is computed through
        # exp/log to keep the computation vectorized.
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        angles = position * div_term # shape = (max_len, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles) # even columns 2i: sin(position / 10000^(2i/d_model))
        # Odd columns 2i+1: cos(position / 10000^(2i/d_model)). When d_model is
        # odd there is one fewer odd column than even columns, so the angle
        # matrix is trimmed to match; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0) # shape = (1, max_len, d_model), broadcasts over the batch
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Add positional encodings to x and apply dropout.
        :param x: batched embedding matrix w/ size (batch_size, sentence_length, hidden_size)
        :return: tensor of the same shape as x
        """
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)

In [98]:
# Add transformer as encoder in seq2seq model
import math

# code below can help you to start it, but feel free to start from scratch
class EncoderTransformer(nn.Module):
    """Transformer encoder: token embedding + positional encoding + nn.TransformerEncoder."""

    def __init__(self, options):
        """Initialize encoder.
        :param options: architecture parameters. Keys read by this class:
            - max_len: maximum sentence length (size of the positional table)
            - d_model: input embedding size / transformer model size
            - num_heads: number of attn heads
            - num_layers: number of transformer layers
            - vocab_size: input vocab size (dimension 0 of self.encoder)
          A 'hidden_size' key may be present but is not read here.
        """

        super().__init__()
        # Positional encoder is built with dropout 0
        self.position_encoder = PositionalEncoding(options['d_model'], 0, options['max_len'])
        encoder_layer = nn.TransformerEncoderLayer(d_model=options['d_model'], nhead=options['num_heads'], batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=options['num_layers'])

        self.encoder = nn.Embedding(options['vocab_size'], options['d_model'], padding_idx=global_variables.PAD_IDX)
        self.d_model = options['d_model']

        self.init_weights()

    def init_weights(self) -> None:
        """Uniformly initialize the embedding table, keeping the padding row at zero."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        # uniform_ overwrites every row, including the padding row that
        # nn.Embedding zero-initialized; restore it so padding tokens embed to 0.
        if self.encoder.padding_idx is not None:
            self.encoder.weight.data[self.encoder.padding_idx].zero_()

    def forward(self, src):
        """Encode a batch of token ids.
        :param src: token ids, batch first (batch_size, sentence_length)
        :return: (outputs, hidden) where outputs is the transformer output for
            every position and hidden is the mean of the outputs over
            non-padding positions with a leading dimension of size 1
        """
        embedded = self.encoder(src) * math.sqrt(self.d_model)
        embedded_wpos = self.position_encoder(embedded) # add pos embeddings to text embeddings
        # NOTE(review): no src_key_padding_mask is passed, so attention can still
        # look at padded positions -- consider adding one after confirming how
        # the installed PyTorch version handles it in eval mode.
        outputs = self.transformer(embedded_wpos)

        pad_idx = self.encoder.padding_idx
        if pad_idx is None:
            hidden = torch.mean(outputs, dim=1).unsqueeze(0)
        else:
            # Average over real tokens only, so the summary vector does not
            # depend on how much padding the batch happened to need.
            keep = (src != pad_idx).unsqueeze(-1).to(outputs.dtype) # (batch, length, 1)
            n_tokens = keep.sum(dim=1).clamp(min=1) # guard against all-padding rows
            hidden = ((outputs * keep).sum(dim=1) / n_tokens).unsqueeze(0)

        return outputs, hidden
        

In [99]:
# Architecture parameters
hidden_size, num_layers = 512, 1

# Options consumed by EncoderTransformer; d_model is set equal to hidden_size
t_encoder_options = dict(
    hidden_size=hidden_size,
    max_len=MAX_LEN,
    d_model=hidden_size,
    num_heads=2,
    num_layers=num_layers,
    vocab_size=source_vocab,
)

encoder_t = EncoderTransformer(t_encoder_options)

#### 1.2 Decoder(s) (9 pts)

In [100]:
# Training parameters
lr = 0.25
longest_label = 1
gradient_clip = 0.3
# Only request CUDA when a GPU is actually present, so training also works
# on CPU-only machines instead of hard-coding True.
use_cuda = torch.cuda.is_available()
num_epochs = 20

In [101]:
# Basic RNN decoder (no attention)
rnn_layers = 1
decoder_rnn_basic = nnet_models_new.DecoderRNN(
    output_size=target_vocab,
    hidden_size=hidden_size,
    numlayers=rnn_layers,
)

In [102]:
# RNN Decoder with Encoder attention (self attention switched off)
encoder_attention, self_attention = True, False

decoder_encoderattn = nnet_models_new.Decoder_SelfAttn(
    output_size=target_vocab,
    hidden_size=hidden_size,
    encoder_attention=encoder_attention,
    self_attention=self_attention,
)

In [103]:
# RNN Decoder with Encoder & Self attention (both switched on)
encoder_attention, self_attention = True, True

decoder_encoderselfattn = nnet_models_new.Decoder_SelfAttn(
    output_size=target_vocab,
    hidden_size=hidden_size,
    encoder_attention=encoder_attention,
    self_attention=self_attention,
)

#### Training & Evaluation

In [104]:
def get_full_filepath(path, dec_type):
    """Return the checkpoint path for a transformer-encoder model.

    :param path: directory holding the checkpoints
    :param dec_type: decoder label embedded in the file name
    :return: ``path`` joined with 'nmt_t_enc_<dec_type>_dec.pth'
    """
    filename = 'nmt_t_enc_'+dec_type+'_dec.pth'
    return os.path.join(path, filename)

def save_models(nmt_model, path, dec_type):
    """Save ``nmt_model`` with torch.save to the path given by get_full_filepath.

    :param nmt_model: object to serialize (the whole object is saved)
    :param path: target directory; created if missing. An empty string means
        the current working directory.
    :param dec_type: decoder label embedded in the file name
    """
    # exist_ok avoids the check-then-create race; an empty path is skipped
    # because os.makedirs('') raises.
    if path:
        os.makedirs(path, exist_ok=True)
    # Reuse the shared helper so save and load always agree on the file name.
    torch.save(nmt_model, get_full_filepath(path, dec_type))

def train_model(dataloader, nmt, num_epochs=50, val_every=1, saved_model_path = 'models/', dec_type ='basic_rnn'):
    """Run the training loop and checkpoint the model whenever validation BLEU improves.

    :param dataloader: mapping with 'train' and 'val' entries; the 'train'
        entry is iterated and must support len()
    :param nmt: model wrapper providing train_step(batch), get_bleu_score(loader)
        and scheduler_step(score)
    :param num_epochs: number of passes over the training loader
    :param val_every: validate on epochs where epoch % val_every == 0
    :param saved_model_path: directory handed to save_models
    :param dec_type: decoder label handed to save_models
    """
    train_loader = dataloader['train']
    best_bleu = -1

    for epoch in range(num_epochs):
        start = time.time()
        print('Epoch: [{}/{}]'.format(epoch, num_epochs))

        # Accumulate the per-batch loss (second element returned by train_step)
        running_loss = 0
        for batch in notebook.tqdm(train_loader, total=len(train_loader)):
            _, batch_loss = nmt.train_step(batch)
            running_loss += batch_loss

        epoch_loss = running_loss / len(train_loader)
        elapsed = time.time() - start
        print("epoch {} loss = {}, time = {}".format(epoch, epoch_loss, elapsed))
        sys.stdout.flush()

        run_validation = (epoch % val_every == 0)
        if run_validation:
            val_bleu_score = nmt.get_bleu_score(dataloader['val'])
            print('validation bleu: ', val_bleu_score)
            sys.stdout.flush()

            # The validation score is passed to the model's scheduler hook
            nmt.scheduler_step(val_bleu_score)

            # Keep only the best-scoring model on disk
            if val_bleu_score > best_bleu:
                best_bleu = val_bleu_score
                save_models(nmt, saved_model_path, dec_type)

        print('='*50)

    print("Training completed. Best BLEU is {}".format(best_bleu))

In [105]:
# Train
decoders = {
    'basic_rnn': decoder_rnn_basic,
    'rnn_encoderattn': decoder_encoderattn,
    'rnn_encoderselfattn': decoder_encoderselfattn
}

train_again = False
saved_model_path = 'models/'

for decoder in decoders:
    print(f'Training NMT with transformer encoder and {decoder} decoder')
    checkpoint_path = get_full_filepath(saved_model_path, decoder)
    if os.path.exists(checkpoint_path) and (not train_again):
        # Reuse the saved model for this decoder instead of retraining.
        # NOTE(review): this unpickles a whole model object; only load
        # checkpoints you produced yourself.
        nmt_t_rnn = torch.load(checkpoint_path, map_location=global_variables.device)
    else:
        # Build a fresh encoder for every decoder. Sharing one encoder
        # instance across the loop would hand later decoders an encoder
        # already trained with the earlier ones, so the runs would not be
        # comparable.
        fresh_encoder = EncoderTransformer(t_encoder_options)
        nmt_t_rnn = nnet_models_new.seq2seq(
                                        fresh_encoder, decoders[decoder],
                                        lr=lr,
                                        use_cuda=use_cuda,
                                        hiddensize=hidden_size,
                                        target_lang=dataset_dict['train'].target_lang_obj,
                                        longest_label=longest_label,
                                        clip=gradient_clip
                                    )

        train_model(dataloader_dict, nmt_t_rnn,
                    num_epochs = num_epochs,
                    saved_model_path = saved_model_path,
                    dec_type = decoder)

Training NMT with transformer encoder and basic_rnn decoder
Epoch: [0/20]


  0%|          | 0/1805 [00:00<?, ?it/s]

KeyboardInterrupt: 

---
### 2 Attention visualization (12 pts)

In [None]:
# Model was trained in ~2 hours, i.e. you can expect attention maps
# to look quite 'hard' (less soft spreading) i.e. attending to some particular token in the input