# Data preproceessing

## Download data and define dataset class.

In [0]:
%%capture
!pip install transformers
!pip install pytorch-pretrained-bert

In [0]:
import torch
from torch.utils.data import Dataset
from pytorch_pretrained_bert import BertTokenizer
import pandas as pd
import numpy as np
from os.path import exists

class MT_Dataset(Dataset):

    def __init__(self, mode, maxlen):

      self.mode = mode

      #Download data
      if not exists('ende_data.zip'):
        !wget -O ende_data.zip https://competitions.codalab.org/my/datasets/download/c748d2c0-d6be-4e36-9f12-ca0e88819c4d
        !unzip ende_data.zip

      if self.mode == 'train':
        with open("./train.ende.src", "r") as ende_src:
          src = [line.rstrip('\n') for line in ende_src]
        with open("./train.ende.mt", "r") as ende_mt:
          mt = [line.rstrip('\n') for line in ende_mt]
        with open("./train.ende.scores", "r") as ende_scores:
          score = [float(line.rstrip('\n')) for line in ende_scores]

      elif self.mode == 'dev':
        with open("./dev.ende.src", "r") as ende_src:
          src = [line.rstrip('\n') for line in ende_src]
        with open("./dev.ende.mt", "r") as ende_mt:
          mt = [line.rstrip('\n') for line in ende_mt]
        with open("./dev.ende.scores", "r") as ende_scores:
          score = [float(line.rstrip('\n')) for line in ende_scores]

      elif self.mode == 'test':
        with open("./test.ende.src", "r") as ende_src:
          src = [line.rstrip('\n') for line in ende_src]
        with open("./test.ende.mt", "r") as ende_mt:
          mt = [line.rstrip('\n') for line in ende_mt]

      elif self.mode == 'traindev':
        with open("./train.ende.src", "r") as ende_src:
          src_t = [line.rstrip('\n') for line in ende_src]
        with open("./train.ende.mt", "r") as ende_mt:
          mt_t = [line.rstrip('\n') for line in ende_mt]
        with open("./train.ende.scores", "r") as ende_scores:
          score_t = [float(line.rstrip('\n')) for line in ende_scores]
        with open("./dev.ende.src", "r") as ende_src:
          src_d = [line.rstrip('\n') for line in ende_src]
        with open("./dev.ende.mt", "r") as ende_mt:
          mt_d = [line.rstrip('\n') for line in ende_mt]
        with open("./dev.ende.scores", "r") as ende_scores:
          score_d = [float(line.rstrip('\n')) for line in ende_scores]

        src = src_t + src_d
        mt = mt_t + mt_d
        score = score_t + score_d

      else:
        raise InputError("Wrong mode")
      
      #Store the contents of the file in a pandas dataframe
      if self.mode in ['train', 'dev', 'traindev']:
        self.df = pd.DataFrame([src,mt,score]).T
        self.df.columns = ['src', 'mt', 'score']
      else:
        self.df = pd.DataFrame([src,mt]).T
        self.df.columns = ['src', 'mt']

      self.length = len(self.df)

      #Initialize the BERT tokenizer
      self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

      #Maximal length of input
      self.maxlen = maxlen

      #Storing preprocessed inputs
      self.token_ids_tensors = []
      self.token_type_ids_tensors = []
      self.attn_masks = []

      #Preprocessing
      for index in range(len(self.df)):
        src = self.df.loc[index, 'src']
        mt = self.df.loc[index, 'mt']

        if self.mode in ['train', 'dev', 'traindev']:
          score = self.df.loc[index, 'score']
        
        #Tokenize sentences
        tokens_src = self.tokenizer.tokenize(src) 
        tokens_mt = self.tokenizer.tokenize(mt) 

        #Insering the CLS and SEP tokens
        tokens = ['[CLS]'] + tokens_src + ['[SEP]'] + tokens_mt + ['[SEP]']

        #A vector encoding which token belongs to which sentence (values either 0 or 1) 
        token_type_ids = [0 for _ in range(len(tokens_src)+2)] + [1 for _ in range(len(tokens_mt)+1)]

        #Adding padding to keep constant input length
        if len(tokens) < self.maxlen:
          token_type_ids = token_type_ids + [0 for _ in range(self.maxlen - len(tokens))] #here [PAD] tokens belongs to first sentence
          tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))] #Padding sentences
        else:
          raise RuntimeError('Sentences are jointly too long for specified maxlen.')

        #Obtaining the indices of the tokens in the BERT Vocabulary
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens) 
        
        #Converting the lists to a torch tensors
        token_ids_tensor = torch.tensor(token_ids) 
        token_type_ids_tensor = torch.LongTensor(token_type_ids) 

        #Obtaining the attention mask i.e a tensor containing 1s for no padded tokens and 0s for padded ones
        attn_mask = (token_ids_tensor != 0).long()

        self.token_ids_tensors.append(token_ids_tensor)
        self.token_type_ids_tensors.append(token_type_ids_tensor)
        self.attn_masks.append(attn_mask)

    def __len__(self):
      return self.length

    def __getitem__(self, i):
      if self.mode in ['train', 'dev', 'traindev']:
        return self.token_ids_tensors[i], self.token_type_ids_tensors[i], self.attn_masks[i], self.df.loc[i, 'score']
      else:
        return self.token_ids_tensors[i], self.token_type_ids_tensors[i], self.attn_masks[i]

## Create dataloaders

In [0]:
%%capture
from torch.utils.data import DataLoader

#Creating instances of training and validation set
train_set = MT_Dataset('train', maxlen = 128)
val_set = MT_Dataset('dev', maxlen = 128)
test_set = MT_Dataset('test', maxlen = 128)
traindev_set = MT_Dataset('traindev', maxlen = 128)

#Creating intsances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size = 32, shuffle=True, num_workers = 5)
val_loader = DataLoader(val_set, batch_size = 32, shuffle=True, num_workers = 5)
test_loader = DataLoader(test_set, batch_size = 32, shuffle=False, num_workers = 5)
traindev_loader = DataLoader(traindev_set, batch_size = 32, shuffle=True, num_workers = 5)

# Fine-tuning BERT

## Build the network

In [0]:
import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertModel

class MTQualityEstimator(nn.Module):

  def __init__(self, freeze_bert):
    super(MTQualityEstimator, self).__init__()
    #Instantiating BERT model object 
    self.bert_layer = BertModel.from_pretrained('bert-base-multilingual-uncased')
    
    #Freeze bert layers
    if freeze_bert:
      for p in self.bert_layer.parameters():
        p.requires_grad = False
    
    #Classification layer
    self.fc = nn.Linear(768, 1)

  def forward(self, seq, token_type_ids, attn_masks):
    '''
    Inputs:
        -seq : Tensor of shape [B, T] containing token ids of sequences
        -token_type_ids : Tensor of shape [B, T] encoding segments
        -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contibution of PAD tokens
    '''

    #Feeding the input to BERT model to obtain pooled output representation
    _, pool_out = self.bert_layer(seq, token_type_ids = token_type_ids, attention_mask = attn_masks)
    

    #Feeding cls_rep to the classifier layer
    out = 3*torch.tanh(self.fc(pool_out))        #99.7% probability to lie within three standard deviations from the mean for normal distribution

    return out

## Set hyperparameters

In [0]:
%%capture
import torch.nn as nn
import torch.optim as optim
from transformers import get_linear_schedule_with_warmup

model = MTQualityEstimator(freeze_bert = False)
criterion = nn.MSELoss()

""" for fine-tuning authors recommend:
        batch_size: 16, 32 
        learning_rate: 5e-5, 3e-5, 2e-5 
        epochs: 2, 3, 4 """

epochs = 2
optimizer = optim.AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_loader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

## Training loop

In [0]:
from scipy.stats.stats import pearsonr
from torch.nn.functional import mse_loss

def check_accuracy(loader, model):
  # function for test accuracy on validation and test set
  print('Checking accuracy on validation set:')
  model.eval()  # set model to evaluation mode
  predicted_scores = []
  true_scores = []
  with torch.no_grad():
    for it, (token_ids, token_type_ids, attn_masks, batch_scores) in enumerate(loader):
      #Converting to cuda tensors
      token_ids, token_type_ids, attn_masks, batch_scores = token_ids.cuda(), token_type_ids.cuda(), attn_masks.cuda(), batch_scores.cuda()

      #Forward pass
      batch_predict = model(token_ids, token_type_ids, attn_masks).squeeze()

      #Aggregate output batches (to calculate correlation over whole dataset later)
      predicted_scores.append(batch_predict)
      true_scores.append(batch_scores)
    
  predicted_scores = torch.cat(predicted_scores)
  true_scores = torch.cat(true_scores)

  pearson = pearsonr(predicted_scores.cpu().numpy(), true_scores.cpu().numpy())

  print(f'RMSE: {torch.sqrt(mse_loss(predicted_scores,true_scores))} Pearson {pearson[0]}', '\n')


def train(model, criterion, optimizer, epochs, train_loader, val_loader=None):
  model.to('cuda')
  for epoch in range(epochs):
    train_loss = 0
    for it, (token_ids, token_type_ids, attn_masks, scores) in enumerate(train_loader):
      #Clear gradients
      optimizer.zero_grad()

      #Converting to cuda tensors
      token_ids, token_type_ids, attn_masks, scores = token_ids.cuda(), token_type_ids.cuda(), attn_masks.cuda(), scores.cuda()

      #Forward pass
      out = model(token_ids, token_type_ids, attn_masks)

      #Compute loss
      loss = criterion(out.squeeze(), scores.float())
      train_loss += loss.item()

      #Backpropagating the gradients
      loss.backward()

      #Optimization step
      optimizer.step()

    print("Epoch {} complete. Loss: {}".format(epoch, train_loss/len(train_loader)))
    if val_loader:
      acc = check_accuracy(val_loader, model)


## Validation on hold-out set

In [14]:
train(model, criterion, optimizer, epochs, train_loader, val_loader)

Epoch 0 complete. Loss: 0.6888832414531272
Checking accuracy on validation set:
RMSE: 0.8664133681433251 Pearson 0.14466003327015103 

Epoch 1 complete. Loss: 0.6688045892337141
Checking accuracy on validation set:
RMSE: 0.8507752228033728 Pearson 0.2039785258195178 



## Train with all available data and save predictions

In [0]:
def bert_predict_test(loader, model):
  # compute prediction on unseen data
  model.eval()  # set model to evaluation mode
  scores = []
  with torch.no_grad():
    for it, (token_ids, token_type_ids, attn_masks) in enumerate(loader):
      #Converting to cuda tensors
      token_ids, token_type_ids, attn_masks = token_ids.cuda(), token_type_ids.cuda(), attn_masks.cuda()

      #Forward pass
      batch_predict = model(token_ids, token_type_ids, attn_masks).squeeze()

      #Aggregate output batches (to calculate correlation over whole dataset later)
      scores.append(batch_predict)
      
    
  scores = torch.cat(scores)
  
  with open('predictions_finetuned.txt', 'w') as output_file:
        for idx, x in enumerate(scores):
            output_file.write(f"{x}\n")
  

In [16]:
#Fine-tune on all available data
model = MTQualityEstimator(freeze_bert = False)
criterion = nn.MSELoss()
epochs = 2
optimizer = optim.AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

train(model, criterion, optimizer, epochs, traindev_loader)
#Save predictions
bert_predict_test(test_loader, model)

Epoch 0 complete. Loss: 0.7028763084709644
Epoch 1 complete. Loss: 0.6518015866279602


# Regression on pooled output (BERT weights frozen)

## Extract feature vector from [CLS]

In [0]:
def get_pooled_representation(loader, bert):
  pooled_rep=[]
  bert.to('cuda')
  bert.eval()  # set model to evaluation mode
  with torch.no_grad():
    if loader.dataset.mode in ['train', 'dev']:
      true_scores = []
      for it, (token_ids, token_type_ids, attn_masks, batch_scores) in enumerate(loader):

        #Converting to cuda tensors
        token_ids, token_type_ids, attn_masks = token_ids.cuda(), token_type_ids.cuda(), attn_masks.cuda()

        #Forward pass
        _, batch_pooled_rep = bert(token_ids, token_type_ids = token_type_ids, attention_mask = attn_masks)

        #Aggregate output batches (to calculate correlation over whole dataset later)
        pooled_rep.append(batch_pooled_rep)
        true_scores.append(batch_scores)

      pooled_rep = torch.cat(pooled_rep)
      true_scores = torch.cat(true_scores)

      return pooled_rep.cpu().numpy(), true_scores.numpy()

    else:
      for it, (token_ids, token_type_ids, attn_masks) in enumerate(loader):

        #Converting to cuda tensors
        token_ids, token_type_ids, attn_masks = token_ids.cuda(), token_type_ids.cuda(), attn_masks.cuda()

        #Forward pass
        _, batch_pooled_rep = bert(token_ids, token_type_ids = token_type_ids, attention_mask = attn_masks)

        #Aggregate output batches (to calculate correlation over whole dataset later)
        pooled_rep.append(batch_pooled_rep)
        
      pooled_rep = torch.cat(pooled_rep)

      return pooled_rep.cpu().numpy()

## Choose best regressor according to performance on validation set

In [0]:
bert = BertModel.from_pretrained('bert-base-multilingual-uncased')
X_train, y_train = get_pooled_representation(train_loader, bert)
X_val, y_val = get_pooled_representation(val_loader, bert)
X_test = get_pooled_representation(test_loader, bert)

In [19]:
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge
from scipy.stats.stats import pearsonr

for name, regressor in zip(['Linear', 'Ridge', 'SVR linear', 'SVR rbf'], [LinearRegression(), Ridge(), SVR('linear'), SVR('rbf')]):
    regressor.fit(X_train, y_train)
    print(name)
    predictions = regressor.predict(X_val)
    pearson = pearsonr(y_val, predictions)
    print(f'RMSE: {torch.sqrt(mse_loss(torch.tensor(y_val), torch.tensor(predictions)))} Pearson {pearson[0]}')
    print()


Linear
RMSE: 0.8962844473926784 Pearson 0.10672373789206752

Ridge
RMSE: 0.8554293837041853 Pearson 0.15613759681270084

SVR linear
RMSE: 0.8722189585388788 Pearson 0.12465365586565982

SVR rbf
RMSE: 0.8756795526767164 Pearson 0.12344757866176322



## Train with all available data and save the best model

In [0]:
def regressor_predict_test(regressor, X_train, y_train, X_test):
  regressor.fit(X_train, y_train)
  scores = regressor.predict(X_test)
  with open('predictions_frozen.txt', 'w') as output_file:
    for idx, x in enumerate(scores):
        output_file.write(f"{x}\n")

In [0]:
X, y = np.concatenate([X_train, X_val], axis=0), np.concatenate([y_train, y_val], axis=0)
regressor_predict_test(Ridge(), X, y, X_test)