# NLP Coursework: CNNs with BERT

### Imports and data/model loading


In [0]:
! pip install transformers



In [0]:
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# Transformer library for pre-trained BERT
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup


from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr

import io
import os
import random
import time
from tqdm import tqdm

# Set printing
torch.set_printoptions(4)

In [0]:
if not os.path.exists('ende_data.zip'):
    !wget -O ende_data.zip https://competitions.codalab.org/my/datasets/download/c748d2c0-d6be-4e36-9f12-ca0e88819c4d
    !unzip ende_data.zip

In [0]:
# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# Load the model
model = BertModel.from_pretrained("bert-base-multilingual-cased").to(device='cuda')

In [0]:
# A function used for dataloading

def load_data(set_name):
    """
    set name: "train", "dev", "test"
    """

# Load data into variables
    with open("./{}.ende.src".format(set_name), "r") as ende_src:
        en_set = ende_src.read().split('\n')
    with open("./{}.ende.mt".format(set_name), "r") as ende_mt:
        de_set = ende_mt.read().split('\n')
    
    # Clear the last row as the function reads 7001/1001 lines
    del en_set[len(en_set)-1]
    del de_set[len(de_set)-1]

    
    return en_set, de_set

def load_scores(set_name):
    
    with open("./{}.ende.scores".format(set_name), "r") as ende_scores:
        scores = [float(x) for x in ende_scores.read().split('\n')[:-1]]
    
    return scores

In [0]:
# Load all data
en_train, de_train = load_data("train")
scores_train = load_scores("train")

en_val, de_val = load_data("dev")
scores_val = load_scores("dev")

en_test, de_test = load_data("test")

### Preprocessing and embeddings

In [0]:
def preprocess(X, tokenizer):
    inputs = []
    max_len_inps = 0

    # Tokenize
    for i in range(len(X)-1):
        seq = X[i][:-1]
        # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
        input_ids = torch.tensor([tokenizer.encode(seq, add_special_tokens=True)])  
        inputs.append(input_ids)
        if input_ids.shape[-1] > max_len_inps:
            max_len_inps = input_ids.shape[-1]
            
    # Convert to tensor
    inp_tensor = torch.zeros((len(X), max_len_inps))
    
    for i in range(len(inputs)):
    # Add tokens
        tokens = inputs[i].squeeze()
        inp_tensor[i, : len(tokens)] = tokens
        
    print(inp_tensor.shape)
    return inp_tensor

In [0]:
# Preprocess data
en_inputs = preprocess(en_train, tokenizer)
de_inputs = preprocess(de_train, tokenizer)

en_val_inputs = preprocess(en_val, tokenizer)
de_val_inputs = preprocess(de_val, tokenizer)

en_test_inputs = preprocess(en_test, tokenizer)
de_test_inputs = preprocess(de_test, tokenizer)

torch.Size([7000, 65])
torch.Size([7000, 69])
torch.Size([1000, 48])
torch.Size([1000, 54])
torch.Size([1000, 44])
torch.Size([1000, 73])


In [0]:
# Pad all data with zeros to bring it to the same size
en_inputs_new = torch.zeros((7000, de_test_inputs.shape[1]))
en_inputs_new[:,:65] = en_inputs

de_inputs_new = torch.zeros((7000, de_test_inputs.shape[1]))
de_inputs_new[:,:69] = de_inputs

en_val_inputs_new = torch.zeros((1000,de_test_inputs.shape[1]))
en_val_inputs_new[:,:48] = en_val_inputs

de_val_inputs_new = torch.zeros((1000, de_test_inputs.shape[1]))
de_val_inputs_new[:,:54] = de_val_inputs

en_test_inputs_new = torch.zeros((1000,de_test_inputs.shape[1]))
en_test_inputs_new[:,:44] = en_test_inputs

de_test_inputs_new = de_test_inputs

In [0]:
# Produces attention mask for BERT, which is need to communicate the padding locations to it
def attention_mask(list_input_tensor):
    list_attention_masks = []
    for input_tensor in list_input_tensor:
        attention_mask = torch.zeros((input_tensor.shape))
        attention_mask[input_tensor != 0] = 1
        list_attention_masks.append(attention_mask)

    return list_attention_masks

# Function to split data into batches for obtaining embeddings
def get_batches(inp_tensor, scores, BATCH_N):
    inp_tensor = inp_tensor
    scores = torch.tensor(scores).view(-1,1)

    # Split data into train, test and val batches
    inp_tensor_batches = torch.split(inp_tensor, BATCH_N)
    scores_batches = torch.split(scores, BATCH_N)

    # Create attention masks
    X_mask = attention_mask(inp_tensor_batches)

    # Batch X, mask and y together
    batches = [(X, mask, y) for X,mask,y in zip(inp_tensor_batches, X_mask, scores_batches)]

    return batches


In [0]:
def get_sentence_embeddings(batches, model):
    # This function gets full per-word embeddings as well as [CLS] sentence embedding
    with torch.no_grad():
        
        list_bert_embs = []
        for X in batches:
            with torch.no_grad():
                # Obtain embeddings
                last_hidden_states = model(X[0].type(torch.LongTensor).to('cuda'), X[1].to('cuda'))[0]    # <-- take word embeddings ([1] gives sentence embeddings)

            list_bert_embs.append(last_hidden_states)


            torch.cuda.empty_cache()

        bert_embs = torch.cat(list_bert_embs, dim=0)

        # Check for correct shape
        print(bert_embs.shape)
    
    return bert_embs

In [0]:
USE_GPU = True

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Batch Size
BATCH_N = 25
# Set to few epochs to show training loop. In practice, was trained for longer
EPOCHS = 4
LR = 0.00001

# Set seeds for reproducibility
seed_val = 111
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Split into batches

batches_en_train = get_batches(en_inputs_new, scores_train, BATCH_N)
batches_de_train = get_batches(de_inputs_new, scores_train, BATCH_N)

batches_en_val = get_batches(en_val_inputs_new, scores_val, BATCH_N)
batches_de_val = get_batches(de_val_inputs_new, scores_val, BATCH_N)

batches_en_test = get_batches(en_test_inputs_new, scores_val, BATCH_N)
batches_de_test = get_batches(de_test_inputs_new, scores_val, BATCH_N)

In [0]:
embs_en_train = get_sentence_embeddings(batches_en_train, model).to('cpu')
embs_de_train = get_sentence_embeddings(batches_de_train, model).to('cpu')
embs_en_val = get_sentence_embeddings(batches_en_val, model).to('cpu')
embs_de_val = get_sentence_embeddings(batches_de_val, model).to('cpu')
embs_en_test = get_sentence_embeddings(batches_en_test, model).to('cpu')
embs_de_test = get_sentence_embeddings(batches_de_test, model).to('cpu')

torch.Size([7000, 73, 768])
torch.Size([7000, 73, 768])
torch.Size([1000, 73, 768])
torch.Size([1000, 73, 768])
torch.Size([1000, 73, 768])
torch.Size([1000, 73, 768])


In [0]:
# Concatenate the embeddings into two channels: 1 channel for English and 1 for German

embs_en_train = embs_en_train.view(7000,1,73,768)
embs_de_train = embs_de_train.view(7000,1,73,768)
train_dataset = torch.cat((embs_en_train,embs_de_train), 1)
print(train_dataset.shape)

embs_en_val = embs_en_val.view(1000,1,73,768)
embs_de_val = embs_de_val.view(1000,1,73,768)
val_dataset = torch.cat((embs_en_val,embs_de_val), 1)
print(val_dataset.shape)

embs_en_test = embs_en_test.view(1000,1,73,768)
embs_de_test = embs_de_test.view(1000,1,73,768)
test_dataset = torch.cat((embs_en_test, embs_de_test), 1)
print(test_dataset.shape)

torch.Size([7000, 2, 73, 768])
torch.Size([1000, 2, 73, 768])
torch.Size([1000, 2, 73, 768])


In [0]:
# Convert scores to tensor
torch_scores_train = torch.tensor(scores_train).view(-1,1)
torch_scores_val = torch.tensor(scores_val).view(-1,1)

In [0]:
# Build final datasets
train = (train_dataset.type(torch.LongTensor), torch_scores_train.type(torch.LongTensor))
val = (val_dataset.type(torch.LongTensor), torch_scores_val.type(torch.LongTensor))


In [0]:
# Get batches for training
train_data_batches = torch.split(train_dataset, BATCH_N)
train_score_batches = torch.split(torch_scores_train, BATCH_N)
train_batches = [(X, y) for X,y in zip(train_data_batches, train_score_batches)]

In [0]:
# Get validation batches
val_data_batches = torch.split(val_dataset, BATCH_N)
val_score_batches = torch.split(torch_scores_val, BATCH_N)
val_batches = [(X, y) for X,y in zip(val_data_batches, val_score_batches)]

# Split validation set into 2 sets: one will be use as validation to save models
# The second one will be used as a test set to check generalisation performance
# of saved models
validation_batches = val_batches[0:20]
test_batches = val_batches[20:41]

In [0]:
# Clear some RAM
del train_dataset, test_dataset, train, val, embs_en_train, embs_en_test, embs_en_val, embs_de_train, embs_de_test, embs_de_val

### Models

In [0]:
# Define resnet building blocks
# Architecture modified from CO460 coursework 1

class ResidualBlock(nn.Module): 
    def __init__(self, inchannel, outchannel, stride=1): 
        
        super(ResidualBlock, self).__init__() 
        
        self.left = nn.Sequential(nn.Conv2d(inchannel, outchannel, kernel_size=3, 
                                         stride=stride, padding=1, bias=False), 
                                  nn.BatchNorm2d(outchannel), 
                                  nn.ReLU(inplace=True), 
                                  nn.Conv2d(outchannel, outchannel, kernel_size=3, 
                                         stride=1, padding=1, bias=False), 
                                  nn.BatchNorm2d(outchannel)) 
        
        self.shortcut = nn.Sequential() 
        
        if stride != 1 or inchannel != outchannel: 
            
            self.shortcut = nn.Sequential(nn.Conv2d(inchannel, outchannel, 
                                                 kernel_size=1, stride=stride, 
                                                 padding = 0, bias=False), 
                                          nn.BatchNorm2d(outchannel) ) 
            
    def forward(self, x): 
        
        out = self.left(x) 
        
        out += self.shortcut(x) 
        
        out = F.relu(out) 
        
        return out


# Define a narrow ResNet18

class ResNet(nn.Module):
    
    def __init__(self, ResidualBlock):
        
        super(ResNet, self).__init__()
        
        self.inchannel = 16
        # Encoder layer uses fairly large kernel size due to input size
        self.conv1 = nn.Sequential(nn.Conv2d(2, 16, kernel_size = (5,32), stride = 1, 
                                            padding = 1, bias = False), 
                                  nn.BatchNorm2d(16), 
                                  nn.ReLU())
        # The rest of the layers are standard ResNet18 architecture but narrower
        self.layer1 = self.make_layer(ResidualBlock, 16, 2, stride = 1)
        self.layer2 = self.make_layer(ResidualBlock, 32, 2, stride = 2)
        self.layer3 = self.make_layer(ResidualBlock, 64, 2, stride = 2)
        self.layer4 = self.make_layer(ResidualBlock, 128, 2, stride = 2)
        # Aggressive pooling to reduce number of intpus to linear layer
        self.maxpool = nn.MaxPool2d((4,16))
        # Squeeze output between (-1,1)
        self.tanh = nn.Tanh()
        self.fc = nn.Linear(1280, 1)
        
        
    def make_layer(self, block, channels, num_blocks, stride):
        
        strides = [stride] + [1] * (num_blocks - 1)
        
        layers = []
        
        for stride in strides:
            
            layers.append(block(self.inchannel, channels, stride))
            
            self.inchannel = channels
            
        return nn.Sequential(*layers)
    
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.maxpool(x)
        x = x.view(x.size(0), -1)     
        x = self.tanh(x)       
        x = self.fc(x)
        
        return x
    
    def check_r(self, y_pred, y):
        # Compute Pearson Correlation Coefficient
        return pearsonr(y_pred.cpu().squeeze(), y.cpu().squeeze())[0]
    
    def loss(self, scores, pred_scores):
        # RMSE loss
        loss = (((pred_scores - scores)**2).mean())**0.5
        return loss

def ResNet18():
    return ResNet(ResidualBlock)

### ConvNet

In [0]:
# Define a simple CNN
class ConvNet(nn.Module):
    
    def __init__(self):
        
        super(ConvNet, self).__init__()
        
        self.inchannel = 16
        # Very large convolutional window on encoder
        self.conv = nn.Sequential(nn.Conv2d(2, 16, kernel_size = (50,300), stride = 1, 
                                            padding = 1, bias = False), 
                                  nn.BatchNorm2d(16), 
                                  nn.ReLU(), 
                                   nn.Conv2d(16, 16, kernel_size = (5,5), stride = 1, 
                                            padding = 1, bias = False), 
                                  nn.BatchNorm2d(16), 
                                  nn.ReLU())
        
        self.maxpool = nn.MaxPool2d((8,32))

        self.tanh = nn.Tanh()
        self.fc = nn.Linear(672, 1)
        
        
    
    
    def forward(self, x):
        x = self.conv(x)
        x = self.maxpool(x)
        x = x.view(x.size(0), -1)
        x = self.tanh(x)
        x = self.fc(x)
        
        return x
    
    def check_r(self, y_pred, y):
        # Compute Pearson Correlation Coefficient
        return pearsonr(y_pred.cpu().squeeze(), y.cpu().squeeze())[0]
    
    def loss(self, scores, pred_scores):
        # RMSE loss
        loss = (((pred_scores - scores)**2).mean())**0.5

        return loss

def ConvNet18():
    return ConvNet()

### Train Model

In [0]:
# Used for tracking validation results
best_corr = 0.0

def validation(model, model_name, val_batches, epoch, final = False):
    # Used for validation

    print("")
    print("Running Validation...")

    t0 = time.time()
    eval_loss = 0
    correlations = []
    
    # This wil keep track of best correlation achieved so far
    global best_corr

    for step, batch in enumerate(val_batches):        

        X = batch[0].to('cuda')
        y = batch[1].to('cuda')

        with torch.no_grad():        
            y_pred = model(X)                      
                
        # Compute and record batch Pearson Correlation
        corr = model.check_r(y_pred, y)
        correlations.append(corr)
    if not final:    
    # Save model if good correlation on validation
        if np.mean(correlations) > best_corr and np.mean(correlations) > 0.1:

            best_corr = np.mean(correlations)
            print("===================")
            print("Saving a good model")
            print("===================")
            torch.save(model.state_dict(), "./{}.pt".format(model_name)) 

    # Save any decent enough model
        if np.mean(correlations) > 0.1:
            print("=====================")
            print("Saving a decent model")
            print("=====================")
            torch.save(model.state_dict(), "./{}_{}.pt".format(model_name, epoch)) 
        
    # Report the final accuracy for this validation run.
    print(f"|  Correlation: {np.mean(correlations):.2f}     |")
    print(f"|  Validation took: {time.time() - t0:0f} s |")



def train(model, model_name, train_batches, validation_batches, test_batches, epochs):
    # Trains the model

    # Keep track of losses
    losses = []
    
    for epoch_i in range(epochs):
        
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()
        total_loss = 0

        for step, batch in enumerate(train_batches):
            
            X = batch[0].to('cuda')
            y = batch[1].to('cuda')
    
            model.zero_grad()                   # Reset grads
            y_pred = model(X)                   # Forward pass
            loss = model.loss(y, y_pred)        # Compute loss
            total_loss += loss.item()           # Accumulate loss
            loss.backward()                     # Backward pass

            optimizer.step()                    # Update params

            if step % 140 == 0:
                # Progress update every 140 batches                
                print('  Batch {:>5,}  of  {:>5,}    |    Elapsed: {:.0f}s.'.format(step, len(train_batches), time.time() - t0))
                print(f'  Loss = {loss.item():.2f}')

        # Compute and store avg loss
        avg_train_loss = total_loss / len(X)
        losses.append(avg_train_loss)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:.0f}s".format(time.time() - t0))

        # Validate after every epoch
        validation(model, model_name, validation_batches, epoch_i) 

    print("")
    print("Training complete!")
    print("")
    # Validate on model selected by best Pearon Correlation
    print("======== Running final validation ========")
    print("")
    print("Loading best model")
    model.load_state_dict(torch.load("./{}.pt".format(model_name)))
    model.eval()
    
    # Final validation, run on hold-out set. `Final` flag does not prompt to save the model
    validation(model, model_name, test_batches, epoch_i, final=True) 


def get_test_preds(model, batches):
    # Used to predict for testset
    torch.cuda.empty_cache()
    y_preds = []
    for batch in batches:
        with torch.no_grad():

            X = batch.to(device='cuda')

            y_pred = model(X)          

        y_preds.append(y_pred.item())
    return y_preds



In [0]:
# Both networks won't fit to GPU

# resnet = ResNet18().to('cuda')

cnn = ConvNet().to('cuda')

In [0]:
optimizer = optim.AdamW(cnn.parameters(), lr=LR)

In [0]:
train(cnn, 'cnn', train_batches, validation_batches, test_batches, EPOCHS)


Training...
  Batch     0  of    280    |    Elapsed: 0s.
  Loss = 1.62
  Batch   140  of    280    |    Elapsed: 36s.
  Loss = 0.64

  Average training loss: 8.55
  Training epoch took: 72s

Running Validation...
|  Correlation: 0.05     |
|  Validation took: 3.284114 s |

Training...
  Batch     0  of    280    |    Elapsed: 0s.
  Loss = 1.45
  Batch   140  of    280    |    Elapsed: 36s.
  Loss = 0.64

  Average training loss: 8.44
  Training epoch took: 71s

Running Validation...
|  Correlation: 0.09     |
|  Validation took: 3.274883 s |

Training...
  Batch     0  of    280    |    Elapsed: 0s.
  Loss = 1.45
  Batch   140  of    280    |    Elapsed: 36s.
  Loss = 0.64

  Average training loss: 8.38
  Training epoch took: 72s

Running Validation...
Saving a good model
Saving a decent model
|  Correlation: 0.12     |
|  Validation took: 3.288841 s |

Training...
  Batch     0  of    280    |    Elapsed: 0s.
  Loss = 1.44
  Batch   140  of    280    |    Elapsed: 36s.
  Loss = 0.64

In [0]:
# Clear GPU cache
torch.cuda.empty_cache()