<a href="https://colab.research.google.com/github/bhadreshpsavani/EfficientQAExperiments/blob/master/natural_qa_t5_pytorch_tpu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [1]:
!git clone https://github.com/efficientqa/nq-open.git

Cloning into 'nq-open'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects:  14% (1/7)[Kremote: Counting objects:  28% (2/7)[Kremote: Counting objects:  42% (3/7)[Kremote: Counting objects:  57% (4/7)[Kremote: Counting objects:  71% (5/7)[Kremote: Counting objects:  85% (6/7)[Kremote: Counting objects: 100% (7/7)[Kremote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects:  16% (1/6)[Kremote: Compressing objects:  33% (2/6)[Kremote: Compressing objects:  50% (3/6)[Kremote: Compressing objects:  66% (4/6)[Kremote: Compressing objects:  83% (5/6)[Kremote: Compressing objects: 100% (6/6)[Kremote: Compressing objects: 100% (6/6), done.[K
remote: Total 7 (delta 0), reused 7 (delta 0), pack-reused 0[K
Unpacking objects: 100% (7/7), done.


In [2]:
import warnings
warnings. filterwarnings('ignore')

In [3]:
!pip install transformers
# !pip install wandb -q

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |▍                               | 10kB 20.0MB/s eta 0:00:01[K     |▉                               | 20kB 2.0MB/s eta 0:00:01[K     |█▎                              | 30kB 2.8MB/s eta 0:00:01[K     |█▊                              | 40kB 3.0MB/s eta 0:00:01[K     |██▏                             | 51kB 2.4MB/s eta 0:00:01[K     |██▋                             | 61kB 2.7MB/s eta 0:00:01[K     |███                             | 71kB 3.0MB/s eta 0:00:01[K     |███▍                            | 81kB 3.2MB/s eta 0:00:01[K     |███▉                            | 92kB 3.4MB/s eta 0:00:01[K     |████▎                           | 102kB 3.3MB/s eta 0:00:01[K     |████▊                           | 112kB 3.3MB/s eta 0:00:01[K     |█████▏                          | 122kB 3.3M

In [4]:
VERSION = "nightly"  #@param ["1.5" , "20200325", "nightly"]
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version $VERSION

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  5115  100  5115    0     0  27063      0 --:--:-- --:--:-- --:--:-- 27063
Updating... This may take around 2 minutes.
Updating TPU runtime to pytorch-nightly ...
Collecting cloud-tpu-client
  Downloading https://files.pythonhosted.org/packages/56/9f/7b1958c2886db06feb5de5b2c191096f9e619914b6c31fdf93999fdbbd8b/cloud_tpu_client-0.10-py3-none-any.whl
Collecting google-api-python-client==1.8.0
[?25l  Downloading https://files.pythonhosted.org/packages/9a/b4/a955f393b838bc47cbb6ae4643b9d0f90333d3b4db4dc1e819f36aad18cc/google_api_python_client-1.8.0-py3-none-any.whl (57kB)
[K     |████████████████████████████████| 61kB 2.5MB/s 
Uninstalling torch-1.6.0+cu101:
Installing collected packages: google-api-python-client, cloud-tpu-client
  Found existing in

In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import re
import string
import gc
import unicodedata
import os
import time
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [6]:
# Checking out the GPU we have access to. This is output is from the google colab version. 
# !nvidia-smi

In [7]:
# # Setting up the device for GPU usage
# from torch import cuda
# device = 'cuda' if cuda.is_available() else 'cpu'

# Preparing for TPU usage
import torch_xla
import torch_xla.debug.metrics as met
import torch_xla.distributed.parallel_loader as pl
import torch_xla.utils.utils as xu
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.test.test_utils as test_utils

## DataProcessing

In [8]:
# Creating a custom dataset for reading the dataframe and loading it into the dataloader to pass it to the neural network at a later stage for finetuning the model and to prepare it for predictions

class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.question = self.data.question
        self.answer = self.data.answer

    def __len__(self):
        return len(self.question)

    def __getitem__(self, index):
        question = str(self.question[index])
        question = 'nq question: '+' '.join(question.split())
        answer = ' <sep> '.join(self.answer[index]) + " </s>"
        answer = ' '.join(answer.split())

        # print(question,":",answer)
        
        source = self.tokenizer.batch_encode_plus(
            [question], 
            max_length= self.source_len,
            add_special_tokens=True,
            return_special_tokens_mask=True,
            truncation=True, 
            pad_to_max_length=True,
            return_tensors='pt')
        
        target = self.tokenizer.batch_encode_plus(
            [answer], 
            max_length= self.source_len, 
            add_special_tokens=True,
            return_special_tokens_mask=True,
            truncation=True,
            pad_to_max_length=True,
            return_tensors='pt')
        
        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

## Training and Validation

In [9]:
# Creating the training function. This will be called in the main function. It is run depending on the epoch value.
# The model is put into train mode and then we wnumerate over the training loader and passed to the defined network 

def train(epoch, tokenizer, model, device, loader, optimizer):
    ## Trains
    train_start = time.time()
    model.train()
    para_train_loader = pl.ParallelLoader(loader, [device]).per_device_loader(device)
    for _,data in enumerate(para_train_loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)
        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, lm_labels=lm_labels)
        loss = outputs[0]

        if _%500==0:
            # master_print will only print once (not from all 8 cores)
            xm.master_print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
#         optimizer.step()
        xm.optimizer_step(optimizer)
        # xm.mark_step()
    elapsed_train_time = time.time() - train_start
    xm.master_print("finished training. Train time was:", elapsed_train_time) 

In [10]:
def validate(epoch, tokenizer, model, device, loader):
    
    valid_start = time.time()
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        para_train_loader = pl.ParallelLoader(loader, [device]).per_device_loader(device)
        for _, data in enumerate(para_train_loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)
                
            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=150, 
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            
            preds = [tokenizer.decode(g) for g in generated_ids]
            target = [tokenizer.decode(t) for t in y]
            
            if _%100==0:
                xm.master_print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
            
    elapsed_valid_time = time.time() - valid_start
    xm.master_print("finished Valid. Train time was:", elapsed_valid_time)
            
    return predictions, actuals

In [11]:
def map_fn(index, flags):

    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(flags['seed']) # pytorch random seed
    np.random.seed(flags['seed']) # numpy random seed
#     torch.backends.cudnn.deterministic = True

    device = xm.xla_device()
    
    if not xm.is_master_ordinal():
        xm.rendezvous('download_only_once')
    
    # tokenzier for encoding the text
    tokenizer = T5Tokenizer.from_pretrained("t5-small", eos_token='</s>', sep_token='<sep>')
    
    # Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary. 
    # Further this model is sent to device (GPU/TPU) for using the hardware.
    model = T5ForConditionalGeneration.from_pretrained("t5-small")
    
    if xm.is_master_ordinal():
        xm.rendezvous('download_only_once')

    # Creation of Dataset and Dataloader
    # Defining the train size. So 80% of the data will be used for training and the rest will be used for validation. 
    train_size = 0.95
    train_df = pd.read_json("nq-open/NQ-open.dev.jsonl", orient='columns', lines=True)
    train_df = train_df[:4000]
    train_dataset=train_df.sample(frac=train_size, random_state = flags['seed']).reset_index(drop=True)
    val_dataset=train_df.drop(train_dataset.index).reset_index(drop=True)
    val_dataset = train_df[:50]
    xm.master_print("FULL Dataset: {}".format(train_df.shape))
    xm.master_print("TRAIN Dataset: {}".format(train_dataset.shape))
    xm.master_print("TEST Dataset: {}".format(val_dataset.shape))
    
    # Creating the Training and Validation dataset for further creation of Dataloader
    training_set = CustomDataset(train_dataset, tokenizer, flags['max_len'])
    val_set = CustomDataset(val_dataset, tokenizer, flags['max_len'])
    
    # defining data samplers and loaders 
    train_sampler = torch.utils.data.distributed.DistributedSampler(
          training_set,
          num_replicas=xm.xrt_world_size(), # tell PyTorch how many devices (TPU cores) we are using for training
          rank=xm.get_ordinal(), # tell PyTorch which device (core) we are on currently
          shuffle=True)
    
    valid_sampler = torch.utils.data.distributed.DistributedSampler(
          val_set,
          num_replicas=xm.xrt_world_size(),
          rank=xm.get_ordinal(),
          shuffle=False)
    
    # Defining the parameters for creation of dataloaders
    train_params = {
        'batch_size': flags['batch_size'],
        'sampler' : train_sampler,
        'num_workers': flags['num_workers'],
        'drop_last': True
        }

    val_params = {
        'batch_size': flags['batch_size'],
        'sampler' : valid_sampler,
        'num_workers': flags['num_workers'],
        'drop_last': True
        }

    # Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    #Send model to TPU device
    model = model.to(device)
    xm.master_print('done loading model')

    # Defining the optimizer that will be used to tune the weights of the network in the training session. 
    optimizer = torch.optim.Adam(params =  model.parameters(), lr=flags['learning_rate'])

    #Training loop
    xm.master_print('training on train dataset')

    for epoch in range(flags['num_epochs']):
        gc.collect()
        train(epoch, tokenizer, model, device, training_loader, optimizer)

    # Validation loop and saving the resulting file with predictions and acutals in a dataframe.
    # Saving the dataframe as predictions.csv
    xm.master_print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    
    gc.collect()

In [12]:
flags = {}
flags['batch_size'] = 8
flags['num_workers'] = 8
flags['num_epochs'] = 2
flags['seed'] = 1234
flags['max_len'] = 512
flags['learning_rate'] = 1e-4 * xm.xrt_world_size()

xmp.spawn(map_fn, args=(flags,), nprocs=8, start_method='fork')




Special tokens have been added in the vocabulary, make sure the associated word emebedding are fine-tuned or trained.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1197.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=242065649.0, style=ProgressStyle(descri…




Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at t5-small and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word emebedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word emebedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word emebedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word emebedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word emebedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word emebeddin

FULL Dataset: (3610, 2)
TRAIN Dataset: (3430, 2)
TEST Dataset: (50, 2)
done loading model
training on train dataset


Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at t5-small and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at t5-small and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at t5-small and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of T5ForConditionalGener

Epoch: 0, Loss:  9.377806663513184


KeyboardInterrupt: ignored

In [None]:
for i in range(10):
    print(val_dataset['question'][i], "\nActual Answer: ", val_dataset['answer'][i],"\nPredicted Answer: ", val_dataset['predictions'][i], '\n')

In [None]:
 val_dataset.to_csv('predictions.csv')

In [None]:
val_dataset.query('predictions==answer')

In [None]:
val_dataset['predictions'] = val_dataset['predictions'].apply(lambda s: '[ '+ s.replace('<sep>', ',') + ' ]')
val_dataset['predictions'].head(10)

## Evaluation:

In [None]:
"""Evaluation utilities."""

def normalize_answer(s):
    """Normalize answer."""
    s = unicodedata.normalize("NFD", s)

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))


def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)


def regex_match_score(prediction, ground_truth):
    try:
        regex = re.compile(ground_truth,
                       flags=re.IGNORECASE + re.UNICODE + re.MULTILINE)
        return regex.match(prediction) is not None
    except re.error:
        return False

def metric_max_over_ground_truths(metric_fn, prediction,
                                  ground_truths):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)

In [None]:
val_dataset['exact_match'] = val_dataset.apply(lambda row: exact_match_score(row['predictions'], row['answer']), axis=1)
val_dataset['regex_match'] = val_dataset.apply(lambda row: regex_match_score(row['predictions'], row['answer']), axis=1)

In [None]:
val_dataset.head()

In [None]:
val_dataset[val_dataset['exact_match']]

In [None]:
# Step 1: Get the credential from the Cloud SDK
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
user_credential = user_secrets.get_gcloud_credential()

In [None]:
# Step 2: Set the credentials
user_secrets.set_tensorflow_credential(user_credential)

In [None]:
# Step 3: Use a familiar call to get the GCS path of the dataset
from kaggle_datasets import KaggleDatasets
GCS_DS_PATH = KaggleDatasets().get_gcs_path()