In [None]:
# Install pinned versions of HuggingFace Transformers and HuggingFace Datasets,
# plus sacrebleu (the BLEU translation-quality metric used for evaluation below)
!pip install transformers==3.5.0 datasets==1.1.2 sacrebleu==1.5.1
# PyTorch is pinned in a separate install command
!pip install torch==1.7.0

import copy
import gc
import math
import time
import os

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import datasets

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import torch.optim as optim

from torch.utils.data import DataLoader, TensorDataset

from transformers import AutoTokenizer, AutoModelWithLMHead, Trainer, TrainingArguments

# Pick the compute device: the first CUDA GPU when one is visible, otherwise the CPU
use_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if use_gpu else "cpu")

# Setting the seed to a fixed value can be helpful in reproducing results
seed = 42
torch.manual_seed(seed)
# Safe to call without a GPU: PyTorch silently ignores it when CUDA is unavailable
torch.cuda.manual_seed(seed)

print("PyTorch version: ", torch.__version__)
print("GPU available: {}".format(use_gpu))

# Only query GPU properties when a GPU exists; these calls raise on CPU-only machines
if use_gpu:
    print("Using the following GPU type: {}".format(torch.cuda.get_device_name(0)))
    print('The GPU memory is {:.2f} GB'.format(torch.cuda.get_device_properties(0).total_memory * 1e-9))
else:
    print("No GPU detected: falling back to the CPU")

PyTorch version:  1.7.0
GPU available: True
Using the following GPU type: Tesla T4
The GPU memory is 15.84 GB


In [None]:
# Load the pretrained English->French translation model.
# NOTE: this checkpoint is loaded as a MarianMTModel (see the message in the
# cell output), not T5-Small.
model_checkpoint = "Helsinki-NLP/opus-mt-tc-big-en-fr"
model = AutoModelWithLMHead.from_pretrained(model_checkpoint)
model.to(device)

# Load the tokenizer from the same checkpoint so model and vocabulary match
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1076.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570070083.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at Helsinki-NLP/opus-mt-tc-big-en-fr were not used when initializing MarianMTModel: ['lm_head.weight']
- This IS expected if you are initializing MarianMTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MarianMTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=802408.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=819955.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1332010.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=337.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=65.0, style=ProgressStyle(description_w…




In [None]:
# Load Own Dataset: Hansards

# Load the paired English-French sentences (line i of each file forms one pair).
# The encoding is given explicitly so accented French text is decoded the same
# way regardless of the platform's default locale.
with open("/content/drive/MyDrive/ml/final_project/data/en_all.txt", "r", encoding="utf-8") as f:
  Xs = [s.strip() for s in f]
with open("/content/drive/MyDrive/ml/final_project/data/fr_all.txt", "r", encoding="utf-8") as f:
  Ys = [s.strip() for s in f]

# Both lists are sliced with the same indices below, so they must be aligned
assert len(Xs) == len(Ys), f"Parallel files differ in length: {len(Xs)} vs {len(Ys)} lines"
n_samples = len(Xs)

# Add task prefix to inputs -- only when the loaded model is a T5 model.
# T5 (https://arxiv.org/abs/1910.10683) is told which task to perform through a
# text prompt; in this case, we want to translate from English to French.
# Other architectures (such as the MarianMT checkpoint loaded earlier) have no
# such prompt and would treat it as part of the sentence to translate.
uses_task_prefix = getattr(model.config, "model_type", "") == "t5"
if uses_task_prefix:
  Xs = [f"translate English to French: {x}" for x in Xs]

# In practice, for T5 this goes from:
# "text about important stuff"
# to
# "translate English to French: text about important stuff"

# Split the data into training (90%), validation (7%) and test (3%) sets
train_val_index = round(0.90*n_samples)
val_test_index = round(0.97*n_samples)

xtrain, ytrain = Xs[:train_val_index], Ys[:train_val_index]
xvalid, yvalid = Xs[train_val_index:val_test_index], Ys[train_val_index:val_test_index]
xtest, ytest = Xs[val_test_index:], Ys[val_test_index:]


In [None]:
# NOTE(review): this cell reloads the same Hansards files as the previous cell,
# without any task prefix, and overwrites Xs/Ys and all the splits.

# Load the paired English-French sentences (line i of each file forms one pair).
# The encoding is given explicitly so accented French text is decoded the same
# way regardless of the platform's default locale.
with open("/content/drive/MyDrive/ml/final_project/data/en_all.txt", "r", encoding="utf-8") as f:
  Xs = [s.strip() for s in f]
with open("/content/drive/MyDrive/ml/final_project/data/fr_all.txt", "r", encoding="utf-8") as f:
  Ys = [s.strip() for s in f]

# Both lists are sliced with the same indices below, so they must be aligned
assert len(Xs) == len(Ys), f"Parallel files differ in length: {len(Xs)} vs {len(Ys)} lines"
n_samples = len(Xs)

# Split the data into training (90%), validation (7%) and test (3%) sets
train_val_index = round(0.90*n_samples)
val_test_index = round(0.97*n_samples)

xtrain, ytrain = Xs[:train_val_index], Ys[:train_val_index]
xvalid, yvalid = Xs[train_val_index:val_test_index], Ys[train_val_index:val_test_index]
xtest, ytest = Xs[val_test_index:], Ys[val_test_index:]

In [None]:
# En-Fr
# NOTE(review): this cell overwrites Xs/Ys and the splits produced by the
# Hansards cells above; on a top-to-bottom run this is the dataset that reaches
# evaluation and training.

# Load the paired English-French sentences (line i of each file forms one pair).
# The encoding is given explicitly so accented French text is decoded the same
# way regardless of the platform's default locale.
with open("dataset_en.txt", "r", encoding="utf-8") as f:
  Xs = [s.strip() for s in f]
with open("dataset_fr.txt", "r", encoding="utf-8") as f:
  Ys = [s.strip() for s in f]

# Both lists are sliced with the same indices below, so they must be aligned
assert len(Xs) == len(Ys), f"Parallel files differ in length: {len(Xs)} vs {len(Ys)} lines"
n_samples = len(Xs)

# Add task prefix to inputs -- only when the loaded model is a T5 model.
# T5 (https://arxiv.org/abs/1910.10683) is told which task to perform through a
# text prompt; in this case, we want to translate from English to French.
# Other architectures (such as the MarianMT checkpoint loaded earlier) have no
# such prompt and would treat it as part of the sentence to translate.
uses_task_prefix = getattr(model.config, "model_type", "") == "t5"
if uses_task_prefix:
  Xs = [f"translate English to French: {x}" for x in Xs]

# In practice, for T5 this goes from:
# "text about important stuff"
# to
# "translate English to French: text about important stuff"

# Split the data into training (90%), validation (7%) and test (3%) sets
train_val_index = round(0.90*n_samples)
val_test_index = round(0.97*n_samples)

xtrain, ytrain = Xs[:train_val_index], Ys[:train_val_index]
xvalid, yvalid = Xs[train_val_index:val_test_index], Ys[train_val_index:val_test_index]
xtest, ytest = Xs[val_test_index:], Ys[val_test_index:]

In [None]:
from tqdm import tqdm

In [None]:
# Define an evaluation function to obtain the BLEU score for a model
def evaluate_model_bleu(model, tokenizer, xtest, ytest, max_examples=500, max_length=500, n_show=5):
    """Translate test sentences with ``model`` and score them with sacreBLEU.

    Args:
        model: Seq2seq model exposing ``generate``, ``eval`` and ``train``.
        tokenizer: Tokenizer paired with ``model`` (``encode`` / ``decode``).
        xtest: List of source sentences.
        ytest: List of reference translations, aligned with ``xtest``.
        max_examples: Only the first ``max_examples`` pairs are scored
            (``None`` scores every pair). Defaults to 500.
        max_length: Maximum generated length, forwarded to ``model.generate``.
        n_show: Number of leading examples printed for visual inspection.

    Returns:
        The ``"score"`` entry of the sacrebleu metric result.
    """
    metric = datasets.load_metric('sacrebleu')

    if max_examples is not None:
        xtest, ytest = xtest[:max_examples], ytest[:max_examples]

    # Make sure dropout is disabled while generating (the model may be in
    # training mode, e.g. after trainer.train()); the previous mode is restored
    # afterwards so the caller's model state is left untouched.
    was_training = model.training
    model.eval()
    try:
        for i, (x, y) in enumerate(zip(xtest, ytest)):

            # Encode the inputs for the model
            input_ids = tokenizer.encode(x, return_tensors="pt").to(device)

            # Obtain the model's prediction
            generated = model.generate(input_ids=input_ids, max_length=max_length)

            # Drop special tokens (e.g. the leading <pad>) so that they do not
            # leak into the text that gets scored
            y_hat = tokenizer.decode(generated[0].tolist(), skip_special_tokens=True)

            # Add example to metric computation (sacrebleu expects a list of references)
            metric.add(prediction=y_hat, reference=[y])

            if i < n_show:
                print("=="*50)
                print(f"  Input: {x}")
                print(f"  Output: {y_hat}")
                print(f"  Labels: {y}")
    finally:
        model.train(was_training)

    return metric.compute()["score"]

# Score the current model on the held-out test split and report its BLEU score
bleu_score = evaluate_model_bleu(model, tokenizer, xtest, ytest)
print(f"Test BLEU score : {bleu_score}")

  Input: The government appealed the case.
  Output: <pad> Le gouvernement a fait appel de l'affaire.
  Labels: Il a interjete appel.
  Input: That is why I believe that the government possibly could have bargained in bad faith.
  Output: <pad> C'est pourquoi je crois que le gouvernement aurait pu négocier de mauvaise foi.
  Labels: C'est pourquoi je soupconne le gouvernement d'avoir negocie de mauvaise foi.
  Input: That is why we are where we are.
  Output: <pad> C’est pourquoi nous sommes là où nous sommes.
  Labels: C'est pourquoi la situation est ce qu'elle est.
  Input: Senator Austin referred to the Charlottetown Agreement.
  Output: <pad> Le sénateur Austin a parlé de l'Accord de Charlottetown.
  Labels: Le senateur Austin a fait allusion a l'Entente de Charlottetown.
  Input: I have travelled it, and I have visited these villages.
  Output: <pad> Je l'ai parcouru, et j'ai visité ces villages.
  Labels: Je m'y suis rendue et j'ai visite ces villages.
Test BLEU score : 32.588805

## Finetuning the model

In [None]:
# Format the training and validation data using the model's tokenizer
train_encodings = tokenizer.prepare_seq2seq_batch(src_texts=xtrain, tgt_texts=ytrain)
valid_encodings = tokenizer.prepare_seq2seq_batch(src_texts=xvalid, tgt_texts=yvalid)
train_set = datasets.Dataset.from_dict(train_encodings)
valid_set = datasets.Dataset.from_dict(valid_encodings)

# ATTENTION!
# The translation model is a big model to train.
# In colab, you will get a random GPU model.
# If the GPU model has not enough memory, training will fail.
# Let's check how much GPU memory we have (only possible when a GPU exists;
# querying device 0 on a CPU-only machine raises an error):
if torch.cuda.is_available():
    gpu_mem_in_gb = torch.cuda.get_device_properties(0).total_memory * 1e-9
    print('The GPU memory is {:.2f} GB'.format(gpu_mem_in_gb))
else:
    gpu_mem_in_gb = 0.0
    print('No GPU detected: expect training on the CPU to be slow.')

# In case training fails, try to lower the batch size and run again.
# (of course, that means you will wait longer)
# For example:
batch_size = 8 # You should be able to afford this with 16GB (sometimes 12GB)
               # of GPU memory.
# batch_size = 4 # You should be able to afford this with 12GB of GPU memory.
# if you still get CUDA out of memory, try a batch_size < 4.

# Freeing as much memory as possible:
gc.collect()
torch.cuda.empty_cache()

# Instantiate the HuggingFace Trainer object.
# The model is evaluated on the validation set at the end of every epoch, and
# the notebook-level `seed` is reused so there is a single source of truth.
# NOTE(review): the output directory keeps its original "t5_output" name even
# though the checkpoint loaded in this notebook is not T5.
training_args = TrainingArguments(output_dir="t5_output",
                                  evaluation_strategy="epoch",
                                  num_train_epochs=1,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  seed=seed)
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=train_set,
                  eval_dataset=valid_set)

# Train the model
trainer.train()