<a href="https://colab.research.google.com/github/bhadreshpsavani/EfficientQAExperiments/blob/master/EfficientQAT5PytorchGPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [2]:
!git clone https://github.com/efficientqa/nq-open.git

fatal: destination path 'nq-open' already exists and is not an empty directory.


In [3]:
!pip install transformers -q

[K     |████████████████████████████████| 1.1MB 6.6MB/s 
[K     |████████████████████████████████| 890kB 26.8MB/s 
[K     |████████████████████████████████| 1.1MB 51.4MB/s 
[K     |████████████████████████████████| 3.0MB 54.9MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [11]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import re
import string
import warnings

import unicodedata
import os

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
# from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelWithLMHead
warnings. filterwarnings('ignore')

In [12]:
# Checking out the GPU we have access to. This is output is from the google colab version. 
!nvidia-smi

Fri Oct 16 05:11:37 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8     9W /  70W |     10MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [13]:
# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

# Preparing for TPU usage
# import torch_xla
# import torch_xla.core.xla_model as xm
# device = xm.xla_device()

print(device)

cuda


## Data Exploration

In [14]:
train_df = pd.read_json("nq-open/NQ-open.train.jsonl", orient='columns', lines=True)
train_df.head()

Unnamed: 0,answer,question
0,[Fernie Alpine Resort],where did they film hot tub time machine
1,[Neither vessel],who has the right of way in international waters
2,[Marley],who does annie work for attack on titan
3,"[November 6, 1986]",when was the immigration reform and control ac...
4,[1950],when was puerto rico added to the usa


In [15]:
dev_df = pd.read_json("nq-open/NQ-open.dev.jsonl", orient='columns', lines=True)
dev_df.head()

Unnamed: 0,question,answer
0,when was the last time anyone was on the moon,"[14 December 1972 UTC, December 1972]"
1,who wrote he ain't heavy he's my brother lyrics,"[Bobby Scott, Bob Russell]"
2,how many seasons of the bastard executioner ar...,"[one, one season]"
3,when did the eagles win last super bowl,[2017]
4,who won last year's ncaa women's basketball,[South Carolina]


In [16]:
number_of_answers = pd.Series([len(train_df['answer'][i]) for i in range(len(train_df['answer']))])
number_of_answers.value_counts()

1     79300
2      4488
3      1710
4       895
5       512
6       310
7       245
8       175
10      137
9       132
12       12
11        3
21        2
13        1
15        1
18        1
25        1
dtype: int64

In [17]:
train_df['number_of_answers']=number_of_answers
train_df.head()

Unnamed: 0,answer,question,number_of_answers
0,[Fernie Alpine Resort],where did they film hot tub time machine,1
1,[Neither vessel],who has the right of way in international waters,1
2,[Marley],who does annie work for attack on titan,1
3,"[November 6, 1986]",when was the immigration reform and control ac...,1
4,[1950],when was puerto rico added to the usa,1


## DataProcessing

In [38]:
# Creating a custom dataset for reading the dataframe and loading it into the dataloader to pass it to the neural network at a later stage for finetuning the model and to prepare it for predictions
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.question = self.data.question
        self.answer = self.data.answer

    def clean_text(self, text):
        text = text.replace('Example of text:', '')
        text = text.replace('Example of Summary:', '')
        text = text.replace('\n','')
        text = text.replace('``', '')
        text = text.replace('"', '')
        
        return text

    def __len__(self):
        return len(self.question)

    def __getitem__(self, index):
        question = self.clean_text(str(self.question[index]))
        answer = self.answer[index]
        question = 'nq question: '+' '.join(question.split())
        answer = ' <sep> '.join(answer) + " </s>"
        answer = ' '.join(answer.split())

        # print(question,":",answer)
        
        source = self.tokenizer.batch_encode_plus(
            [question], 
            max_length= self.source_len,
            add_special_tokens=True,
            return_special_tokens_mask=True,
            truncation=True, 
            pad_to_max_length=True,
            return_tensors='pt')
        
        target = self.tokenizer.batch_encode_plus(
            [answer], 
            max_length= self.source_len, 
            add_special_tokens=True,
            return_special_tokens_mask=True,
            truncation=True,
            pad_to_max_length=True,
            return_tensors='pt')
        
        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

## Training and Validation

In [32]:
# Creating the training function. This will be called in the main function. It is run depending on the epoch value.
# The model is put into train mode and then we wnumerate over the training loader and passed to the defined network 

def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)
        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, lm_labels=lm_labels)
        loss = outputs[0]

        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # xm.optimizer_step(optimizer)
        # xm.mark_step()

In [33]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)
                
            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=150, 
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            
            preds = [tokenizer.decode(g) for g in generated_ids]
            target = [tokenizer.decode(t) for t in y]
            
            if _%100==0:
                print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [34]:
TRAIN_BATCH_SIZE = 2    # input batch size for training (default: 64)
VALID_BATCH_SIZE = 2    # input batch size for testing (default: 1000)
TRAIN_EPOCHS = 2
# number of epochs to train (default: 10)
VAL_EPOCHS = 1 
LEARNING_RATE = 1e-4    # learning rate (default: 0.01)
SEED = 42               # random seed (default: 42)
MAX_LEN = 512

# Set random seeds and deterministic pytorch for reproducibility
torch.manual_seed(SEED) # pytorch random seed
np.random.seed(SEED) # numpy random seed
torch.backends.cudnn.deterministic = True

# tokenzier for encoding the text
# tokenizer = T5Tokenizer.from_pretrained("t5-small", eos_token='</s>', sep_token='<sep>')
tokenizer = AutoTokenizer.from_pretrained("deep-learning-analytics/triviaqa-t5-base", eos_token='</s>', sep_token='<sep>')

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [39]:
# Creation of Dataset and Dataloader
# Defining the train size. So 80% of the data will be used for training and the rest will be used for validation. 
train_size = 0.95
train_df = pd.read_json("nq-open/NQ-open.train.jsonl", orient='columns', lines=True)
train_dataset=train_df
val_dataset=dev_df
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(val_dataset.shape))

TRAIN Dataset: (87925, 2)
TEST Dataset: (3610, 2)


In [40]:
# Creating the Training and Validation dataset for further creation of Dataloader
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
val_set = CustomDataset(val_dataset, tokenizer, MAX_LEN)

# Defining the parameters for creation of dataloaders
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
    }

val_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
    }

# Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

In [41]:
# Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary. 
# Further this model is sent to device (GPU/TPU) for using the hardware.
# model = T5ForConditionalGeneration.from_pretrained("t5-small")
# model = model.to(device)
model = AutoModelWithLMHead.from_pretrained("deep-learning-analytics/triviaqa-t5-base")
model = model.to(device)
# Defining the optimizer that will be used to tune the weights of the network in the training session. 
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [42]:
#Training loop
print('Initiating Fine-Tuning for the model on our dataset')

for epoch in range(TRAIN_EPOCHS):
    train(epoch, tokenizer, model, device, training_loader, optimizer)

Initiating Fine-Tuning for the model on our dataset
Epoch: 0, Loss:  15.694695472717285


KeyboardInterrupt: ignored

In [None]:
# Validation loop and saving the resulting file with predictions and acutals in a dataframe.
# Saving the dataframe as predictions.csv
print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)

Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe
Completed 0


In [None]:
val_dataset['predictions'] = predictions
val_dataset['answer'] = actuals

NameError: ignored

In [None]:
for i in range(10):
    print(val_dataset['question'][i], "\nActual Answer: ", val_dataset['answer'][i],"\nPredicted Answer: ", val_dataset['predictions'][i], '\n')

where did they film hot tub time machine 
Actual Answer:  Fernie Alpine Resort 
Predicted Answer:  Hot Tub 

who has the right of way in international waters 
Actual Answer:  Neither vessel 
Predicted Answer:  sea 

who does annie work for attack on titan 
Actual Answer:  Marley 
Predicted Answer:  annie 

when was the immigration reform and control act passed 
Actual Answer:  November 6, 1986 
Predicted Answer:  1892 

when was puerto rico added to the usa 
Actual Answer:  1950 
Predicted Answer:  1889 

who has been chosen for best supporting actress in 64 national filmfare award 
Actual Answer:  Zaira Wasim 
Predicted Answer:  actress 

which side of the white house is the front 
Actual Answer:  North 
Predicted Answer:  front 

names of the metropolitan municipalities in south africa 
Actual Answer:  Mangaung Metropolitan Municipality <sep> Nelson Mandela Bay Metropolitan Municipality <sep> eThekwini Metropolitan Municipality <sep> City of Tshwane Metropolitan Municipality <sep> Ci

In [None]:
 val_dataset.to_csv('predictions.csv')

In [None]:
val_dataset.query('predictions==answer')

Unnamed: 0,answer,question,predictions
14,18,what's the legal marriage age in new york,18


In [None]:
val_dataset['predictions'] = val_dataset['predictions'].apply(lambda s: '[ '+ s.replace('<sep>', ',') + ' ]')
val_dataset['predictions'].head(10)

0       [ Hot Tub ]
1           [ sea ]
2         [ annie ]
3          [ 1892 ]
4          [ 1889 ]
5       [ actress ]
6         [ front ]
7      [ the city ]
8    [ Super Bowl ]
9         [ India ]
Name: predictions, dtype: object

## Evaluation:

In [None]:
"""Evaluation utilities."""

def normalize_answer(s):
  """Normalize answer."""
  s = unicodedata.normalize("NFD", s)

  def remove_articles(text):
    return re.sub(r"\b(a|an|the)\b", " ", text)

  def white_space_fix(text):
    return " ".join(text.split())

  def remove_punc(text):
    exclude = set(string.punctuation)
    return "".join(ch for ch in text if ch not in exclude)

  def lower(text):
    return text.lower()
  return white_space_fix(remove_articles(remove_punc(lower(s))))


def exact_match_score(prediction, ground_truth):
  return normalize_answer(prediction) == normalize_answer(ground_truth)


def regex_match_score(prediction, ground_truth):
  try:
    regex = re.compile(ground_truth,
                       flags=re.IGNORECASE + re.UNICODE + re.MULTILINE)
    return regex.match(prediction) is not None
  except re.error:
    return False

def metric_max_over_ground_truths(metric_fn, prediction,
                                  ground_truths):
  scores_for_ground_truths = []
  for ground_truth in ground_truths:
    score = metric_fn(prediction, ground_truth)
    scores_for_ground_truths.append(score)
  return max(scores_for_ground_truths)

In [None]:
val_dataset['exact_match'] = val_dataset.apply(lambda row: exact_match_score(row['predictions'], row['answer']), axis=1)
val_dataset['regex_match'] = val_dataset.apply(lambda row: regex_match_score(row['predictions'], row['answer']), axis=1)

In [None]:
val_dataset.head()

Unnamed: 0,answer,question,predictions,exact_match,regex_match
0,Fernie Alpine Resort,where did they film hot tub time machine,[ Hot Tub ],False,False
1,Neither vessel,who has the right of way in international waters,[ sea ],False,False
2,Marley,who does annie work for attack on titan,[ annie ],False,False
3,"November 6, 1986",when was the immigration reform and control ac...,[ 1892 ],False,False
4,1950,when was puerto rico added to the usa,[ 1889 ],False,False


In [None]:
val_dataset[val_dataset['exact_match']]

Unnamed: 0,answer,question,predictions,exact_match,regex_match
14,18,what's the legal marriage age in new york,[ 18 ],True,False
