In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
torch.set_printoptions(4)
from torch import optim
import random
import time

In [2]:
# For downloading and saving model locally.

# tokenizer_en = BertTokenizer.from_pretrained('bert-base-cased')
# model_en = BertModel.from_pretrained('..\..\Dump Files\models\BERT_cased_en')

# tokenizer_de = BertTokenizer.from_pretrained('bert-base-german-cased')
# model_de = BertModel.from_pretrained('..\..\Dump Files\models\BERT_cased_de')

# model_en.save_pretrained('..\..\Dump Files\models\BERT_cased_en')

# model_de.save_pretrained('..\..\Dump Files\models\BERT_cased_de')

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

#model = BertModel.from_pretrained('..\..\Dump Files\models\BERT_cased_multi')

#model.save_pretrained('..\..\Dump Files\models\BERT_cased_multi')

### Loading data

In [3]:
import io
import os

def load_data(set_name):
    """
    set name: "train", "dev", "test"
    """

# Load train data into variables
    with open(os.path.join("..", "data", "{}.ende.src".format(set_name)), "r", encoding="utf8") as ende_src:
        en_set = ende_src.read().split('\n')
    with open(os.path.join("..", "data", "{}.ende.mt".format(set_name)), "r", encoding="utf8") as ende_mt:
        de_set = ende_mt.read().split('\n')
    

    del en_set[len(en_set)-1]
    del de_set[len(de_set)-1]

    
    return en_set, de_set

def load_scores(set_name):
    
    with open(os.path.join("..", "data", "{}.ende.scores".format(set_name)), "r", encoding="utf8") as ende_scores:
        scores = [float(x) for x in ende_scores.read().split('\n')[:-1]]
        
    #del scores[len(scores)-1]
    print(len(scores))
    
    return scores

In [4]:
en_train, de_train = load_data("train")
scores_train = load_scores("train")

en_val, de_val = load_data("dev")
scores_val = load_scores("dev")

en_test, de_test = load_data("test")

7000
1000


In [5]:
def preprocess(X, tokenizer):
    inputs = []
    max_len_inps = 0

    # Tokenize
    for i in range(len(X)-1):
        seq = X[i][:-1]
        # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
        input_ids = torch.tensor([tokenizer.encode(seq, add_special_tokens=True)])  
        inputs.append(input_ids)
        if input_ids.shape[-1] > max_len_inps:
            max_len_inps = input_ids.shape[-1]
            
    # Convert to tensor
    inp_tensor = torch.zeros((len(X), max_len_inps))
    
    for i in range(len(inputs)):
    # Add tokens
        tokens = inputs[i].squeeze()
        inp_tensor[i, : len(tokens)] = tokens
        
    print(inp_tensor.shape)
    return inp_tensor

In [6]:
en_inputs = preprocess(en_train, tokenizer)

torch.Size([7000, 65])


In [7]:
de_inputs = preprocess(de_train, tokenizer)

torch.Size([7000, 69])


In [8]:
en_val_inputs = preprocess(en_val, tokenizer)
de_val_inputs = preprocess(de_val, tokenizer)

torch.Size([1000, 48])
torch.Size([1000, 54])


### Obtaining embeddings


In [9]:
def get_sentence_embeddings(model, inp_tensor):

    model = model.to(device='cuda')
    inp_tensor = inp_tensor.to(device='cuda', dtype=torch.long)

    batches = torch.split(inp_tensor, 250, dim=0)
    list_bert_embs = []
    for X in batches:
        with torch.no_grad():
            last_hidden_states = model(X)[0]    # <-- take word embeddings ([1] gives sentence embeddings)

        list_bert_embs.append(last_hidden_states)

        #print(last_hidden_states.shape)

        torch.cuda.empty_cache()
    
    bert_embs = torch.cat(list_bert_embs, dim=0)
    
    # Now, slice the tensor to only keep the [CLS] embedding

    print(bert_embs.shape)
    
    return bert_embs[:,0,:]

In [10]:
#bert_embs_en = get_sentence_embeddings(model_en, en_inputs)

In [11]:
#bert_embs_en.shape

In [12]:
#bert_embs_de = get_sentence_embeddings(model_de, de_inputs)

In [24]:
class BertRegressor(nn.Module):
    def __init__(self):
        
        super(BertRegressor, self).__init__()
        
        self.device = "cuda"
        self.bert = BertModel.from_pretrained('..\..\Dump Files\models\BERT_cased_multi').to("cuda")
        
        
        self.linear1 = nn.Linear(in_features = 2*768,                   # <-- 768 is the dim of Bert embedding
                             out_features = 256, bias = True).to("cuda")
        self.bn1 = nn.BatchNorm1d(256).to("cuda")
        
        self.linear2 = nn.Linear(in_features = 256,                  
                             out_features = 128, bias = True).to("cuda")
        
        self.bn2 = nn.BatchNorm1d(128).to("cuda")
        
        self.linear3 = nn.Linear(in_features = 128,                  
                             out_features = 1, bias = True).to("cuda")

    def forward(self, X_en, X_de, attention_mask_en=None, attention_mask_de=None):
        with torch.no_grad():
            if attention_mask_en is not None:
                X_en = self.bert(X_en, attention_mask_en)[1]   
            else:
                X_en = self.bert(X_en)[1]

            if attention_mask_de is not None:
                X_de = self.bert(X_de, attention_mask_de)[1]  
            else:
                X_de = self.bert(X_de)[1]
        
#         X_en_CLS = X_en[:,0,:]
#         X_de_CLS = X_de[:,0,:]
#         print(X_en.shape)

        X = torch.cat((X_en, X_de), 1) # concatenate along 1st dimension
        
        X = F.tanh(self.linear1(X))
        #X = self.bn1(X)
        X = F.tanh(self.linear2(X))
        #X = self.bn2(X)
        preds = self.linear3(X)
        
        #print(self.bert.parameters())
        
        return preds

    def loss(self, scores, pred_scores):
        rmse = (((pred_scores - scores)**2).mean())**0.5
        return rmse

    def check_r(self, y_pred, y):
        return pearsonr(y_pred.cpu().squeeze(), y.cpu().squeeze())[0]

    def zero_grad(self):
        self.bert.zero_grad()
        #self.bert_de.zero_grad()
        self.linear1.zero_grad()
        self.linear2.zero_grad()
        self.linear3.zero_grad()
        self.bn1.zero_grad()
        self.bn2.zero_grad()

    def params(self):
        params = list(self.bert.parameters()) + list(self.linear1.parameters()) + list(self.linear2.parameters()) + list(self.linear3.parameters())
        #    list(self.bn1.parameters()) + 

        #list(self.bn2.parameters()) + 
        
        return params

    def __call__(self, X_en, X_de, attention_mask_en=None, attention_mask_de=None):
        return self.forward(X_en, X_de, attention_mask_en, attention_mask_de)


In [14]:
def attention_mask(list_input_tensor):
    list_attention_masks = []
    for input_tensor in list_input_tensor:
        attention_mask = torch.zeros((input_tensor.shape))
        attention_mask[input_tensor != 0] = 1
        list_attention_masks.append(attention_mask)

    return list_attention_masks

def get_batches(inp_tensor, scores, BATCH_N):
    inp_tensor = inp_tensor
    scores = torch.tensor(scores).view(-1,1)

    # Split data into train, test and val batches
    inp_tensor_batches = torch.split(inp_tensor, BATCH_N)
    scores_batches = torch.split(scores, BATCH_N)
#     X_train_val, X_test, y_train_val, y_test = train_test_split(inp_tensor_batches, scores_batches, shuffle=False, test_size=0.1)
#     X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, shuffle = False, test_size=1/9)
    
    
    #print(len(inp_tensor_batches), inp_tensor_batches[0].shape)

    # Create attention masks
    X_mask = attention_mask(inp_tensor_batches)
    #print(len(X_mask), X_mask[0].shape)

    # Batch X, mask and y together
    batches = [(X, mask, y) for X,mask,y in zip(inp_tensor_batches, X_mask, scores_batches)]


    return batches


In [26]:
USE_GPU = True

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

#bert_config = 'bert-base-multilingual-cased'
BATCH_N = 20
EPOCHS = 20
LR = 2e-5

# Set seeds
seed_val = 111
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Preprocessing


batches_en_train = get_batches(en_inputs, scores_train, BATCH_N)
batches_de_train = get_batches(de_inputs, scores_train, BATCH_N)

batches_en_val = get_batches(en_val_inputs, scores_val, BATCH_N)
batches_de_val = get_batches(de_val_inputs, scores_val, BATCH_N)

In [16]:
len(batches_en_val)

50

In [17]:
def validation(val_batches_en, val_batches_de):

    print("")
    print("Running Validation...")

    t0 = time.time()
    eval_loss = 0
    correlations = []

    for i in range(len(val_batches_en)):     

        batch_en = val_batches_en[i]
        batch_de = val_batches_de[i]
        
        # Untie batch and put on GPU
        X_en = batch_en[0].to(device=device, dtype=torch.long)
        mask_en = batch_en[1].to(device)
        
        X_de = batch_de[0].to(device=device, dtype=torch.long)
        mask_de = batch_de[1].to(device)
        
        y = batch_en[2].to(device)

        with torch.no_grad():        
            y_pred = model(X_en, X_de)                      
                
        # Compute and record batch accuracy
        corr = model.check_r(y_pred, y)
        correlations.append(corr)

    # Report the final accuracy for this validation run.
    print(f"|  Correlation: {np.mean(correlations):.2f}     |")
    print(f"|  Validation took: {time.time() - t0:0f} s |")

    torch.cuda.empty_cache()


def train(model, optimizer, train_batches_en, val_batches_en, train_batches_de, val_batches_de, epochs):

    losses = []
    
    for epoch_i in range(epochs):
        
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()
        total_loss = 0

        for step in range(len(train_batches_en)):

                       
            batch_en = train_batches_en[step]
            batch_de = train_batches_de[step]
            
            batch_en[0].requires_grad = True
            batch_de[0].requires_grad = True

            # Untie batch and put on GPU
            X_en = batch_en[0].to(device=device, dtype=torch.long)
            mask_en = batch_en[1].to(device)

            X_de = batch_de[0].to(device=device, dtype=torch.long)
            mask_de = batch_de[1].to(device)

            y = batch_en[2].to(device)

            model.zero_grad()                   # Reset grads
            y_pred = model(X_en, X_de, mask_en, mask_de)             # Forward pass
            loss = model.loss(y, y_pred)        # Compute loss
            total_loss += loss.item()           # Accumulate loss
            loss.backward()                     # Backward pass

            optimizer.step()                    # Update params

            torch.cuda.empty_cache()            # Clear GPU cache to avoid memory issues
            
            if step % 100 == 0:
                # Progress update every 40 batches                
                print('  Batch {:>5,}  of  {:>5,}    |    Elapsed: {:.0f}s.'.format(step, len(train_batches_en), time.time() - t0))
                print(f'  Loss = {loss.item():.2f}')
            
        # Compute and store avg loss
        avg_train_loss = total_loss / len(X_en)
        losses.append(avg_train_loss)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:.0f}s".format(time.time() - t0))

        validation(val_batches_en, val_batches_de) 

    print("")
    print("Training complete!")

In [27]:
# Model to train
model = BertRegressor()

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=LR, eps=1e-8)
total_steps = len(batches_en_train[0]) * EPOCHS // BATCH_N
# scheduler = get_linear_schedule_with_warmup(optimizer, 
#                                             num_warmup_steps = 0,
#                                             num_training_steps = total_steps)

torch.cuda.empty_cache()
train(model, optimizer, batches_en_train, batches_en_val, batches_de_train, batches_de_val, epochs=EPOCHS)
torch.save(model.state_dict(), '..\..\Dump Files\models\model.pt')


Training...
  Batch     0  of    350    |    Elapsed: 0s.
  Loss = 1.55
  Batch   100  of    350    |    Elapsed: 8s.
  Loss = 0.66
  Batch   200  of    350    |    Elapsed: 16s.
  Loss = 0.50
  Batch   300  of    350    |    Elapsed: 25s.
  Loss = 0.42

  Average training loss: 13.08
  Training epoch took: 29s

Running Validation...
|  Correlation: -0.03     |
|  Validation took: 2.981506 s |

Training...
  Batch     0  of    350    |    Elapsed: 0s.
  Loss = 1.60
  Batch   100  of    350    |    Elapsed: 8s.
  Loss = 0.66
  Batch   200  of    350    |    Elapsed: 16s.
  Loss = 0.50
  Batch   300  of    350    |    Elapsed: 25s.
  Loss = 0.42

  Average training loss: 13.06
  Training epoch took: 29s

Running Validation...
|  Correlation: -0.00     |
|  Validation took: 3.127589 s |

Training...
  Batch     0  of    350    |    Elapsed: 0s.
  Loss = 1.60
  Batch   100  of    350    |    Elapsed: 8s.
  Loss = 0.66
  Batch   200  of    350    |    Elapsed: 17s.
  Loss = 0.50
  Batch   