# A simple tutorial for QA correlation prediction with pre-trained GPT

In [2]:
# Jiaolin Luo (Róisín), Colm O'Riordan (supervisor)

In [3]:
import sys
import os
import random
import math

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from tqdm import tqdm

import csv
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

%matplotlib inline

# GPU acceleration just in case

In [150]:
def get_hwacc_device_v3():

    device = torch.device('cpu')
    
    if torch.cuda.is_available():
        
        print(torch.cuda.get_device_name(0))
        print('CUDA memory Usage:')
        print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
        print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
    
        device = torch.device('cuda')
    # MacOS
    elif hasattr(torch, "backends") and \
          hasattr(torch.backends, "mps") and \
          torch.backends.mps.is_available():
                
        device = torch.device('mps')
 
    print("GPU device is: ", device)
    
    return device

In [151]:
device = get_hwacc_device_v3()
#device = torch.device("cpu")
device

GPU device is:  mps


device(type='mps')

# Loading math QA dataset

In [152]:
from datasets import list_datasets

datasets_list = list_datasets()
len(datasets_list)

36046

In [154]:
for ds in datasets_list:
    if "wiki_qa" in ds:
        print(ds)

iapp_wiki_qa_squad
wiki_qa
wiki_qa_ar
wannaphong/iapp_wiki_qa_squad_oa
sedthh/cmu_wiki_qa
michaelthwan/wiki_qa_bart_1000row
michaelthwan/wiki_qa_bart_10000row
michaelthwan/oa_wiki_qa_bart_10000row


In [155]:
from datasets import load_dataset
qa_dataset = load_dataset(path = "wiki_qa",
                       cache_dir = ".." + os.sep + ".." + os.sep + "Dataset_Collection", 
                       download_mode = "reuse_dataset_if_exists")

Downloading builder script:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/13.6k [00:00<?, ?B/s]

Downloading and preparing dataset wiki_qa/default to /Users/roisinjiaolinluo/Documents/Research/AI_Research/question_answer_prediction/../../Dataset_Collection/wiki_qa/default/0.1.0/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c...


Downloading data:   0%|          | 0.00/7.09M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/6165 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2733 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20360 [00:00<?, ? examples/s]

Dataset wiki_qa downloaded and prepared to /Users/roisinjiaolinluo/Documents/Research/AI_Research/question_answer_prediction/../../Dataset_Collection/wiki_qa/default/0.1.0/d2d236b5cbdc6fbdab45d168b4d678a002e06ddea3525733a24558150585951c. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

## Investigating dataset shape

In [156]:
qa_dataset

DatasetDict({
    test: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label'],
        num_rows: 6165
    })
    validation: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label'],
        num_rows: 2733
    })
    train: Dataset({
        features: ['question_id', 'question', 'document_title', 'answer', 'label'],
        num_rows: 20360
    })
})

In [157]:
qa_dataset['train'][0]

{'question_id': 'Q1',
 'question': 'how are glacier caves formed?',
 'document_title': 'Glacier cave',
 'answer': 'A partly submerged glacier cave on Perito Moreno Glacier .',
 'label': 0}

In [158]:
len(qa_dataset['train'])

20360

In [13]:
print(qa_dataset['train'][0]['question'])

average age of students of an adult school is 40 years. 120 new students whose average age is 32 years joined the school. as a result the average age is decreased by 4 years. find the number of students of the school after joining of the new students . Write a short snippet of python function which solve this problem. No need to explain the answer, let the code output the answer.


In [159]:
print(qa_dataset['train'][0]['answer'])

A partly submerged glacier cave on Perito Moreno Glacier .


## Setting dataset and splitting dataset

In [160]:
#Setting format to torch or tensorflow
qa_dataset.set_format(type='torch', columns=['question', 'answer'])

In [161]:
qa_dataset_train_ = qa_dataset['train']
qa_dataset_val_ = qa_dataset['validation']
qa_dataset_test_ = qa_dataset['test']

# Wrapping the dataset with label

In [162]:
len(qa_dataset_train_)

20360

In [172]:
class QAPredictionDataset(torch.utils.data.Dataset):
    def __init__(self, 
                 dataset, 
                 paired_sampling_prob = 0.5, 
                 random_seed = 42):
        self.dataset = dataset
        self.paired_sampling_prob = paired_sampling_prob
        self.dataset_size = len(dataset)
        
        self.dataset_indices = list(np.arange(0, self.dataset_size))
        
        np.random.seed(random_seed)
        
    def __getitem__(self, index):
        q = self.dataset[index]['question']
        a = self.dataset[index]['answer']
        
        MAX_LEN=512
        #Truncate Q and A to MAX_LEN
        #TODO.
        
        if np.random.rand() < self.paired_sampling_prob:
            y = 1
        else:
            y = 0
            #Resampling a 'answer'
            new_index = int(np.random.choice(self.dataset_indices))
            a = self.dataset[new_index]['answer']
        
        return (q, a), y
    
    def __len__(self):
        return self.dataset_size

In [173]:
qa_dataset_train = QAPredictionDataset(qa_dataset_train_, paired_sampling_prob = 0.5)
qa_dataset_val = QAPredictionDataset(qa_dataset_val_, paired_sampling_prob = 0.5)
qa_dataset_test = QAPredictionDataset(qa_dataset_test_, paired_sampling_prob = 0.5)

In [174]:
for i in range(0, 3):
    (q,a), y = qa_dataset_train[i]
    print("Q: ", q)
    print("A: ", a)
    print("Paired prob: ", y)
    print()

Q:  how are glacier caves formed?
A:  A partly submerged glacier cave on Perito Moreno Glacier .
Paired prob:  1

Q:  how are glacier caves formed?
A:  In modern politics, the most high profile political campaigns are focused on candidates for head of state or head of government , often a President or Prime Minister .
Paired prob:  0

Q:  how are glacier caves formed?
A:  Recovery was an international success and was named the best selling album of 2010 worldwide, joining The Eminem Show, which was the best seller of 2002.
Paired prob:  0



In [175]:
batch_size=8

In [176]:
dataloader_train = torch.utils.data.DataLoader(qa_dataset_train, batch_size=batch_size)
dataloader_val = torch.utils.data.DataLoader(qa_dataset_val, batch_size=batch_size)
dataloader_test = torch.utils.data.DataLoader(qa_dataset_test, batch_size=batch_size)

In [177]:
batch = next(iter(dataloader_train))
len(batch)

2

In [178]:
(q,a),y = batch

In [179]:
len(q)

8

In [180]:
len(a)

8

In [181]:
print(q[0])

how are glacier caves formed?


In [182]:
print(a[0])

A partly submerged glacier cave on Perito Moreno Glacier .


In [183]:
print(y[0])

tensor(1)


# Loading pre-trained OpenAI GPT

In [184]:
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel

In [185]:
gpt = OpenAIGPTModel.from_pretrained('openai-gpt')
_ = gpt.eval() #Freezing GPT model to make it not trainabale.

tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


## Testing GPT embeddings

In [186]:
# Tokenized input
text = q[0]
print("text = ", text)

inputs = tokenizer(text, return_tensors="pt")
print(inputs)

text =  how are glacier caves formed?
{'input_ids': tensor([[  718,   640, 25397, 11464,  2768,   257]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}


In [187]:
#sending GPT to GPU if possible
gpt.to(device)

#sending inputs tensors to GPU if possible
inputs.to(device)

#get embeddings.
with torch.no_grad():
    outputs = gpt(**inputs)

In [188]:
last_hidden_states = outputs.last_hidden_state

In [189]:
last_hidden_states.shape

torch.Size([1, 6, 768])

In [190]:
last_hidden_states[0].shape

torch.Size([6, 768])

In [191]:
#We use this position as embedding.
last_hidden_states[0][0].shape

torch.Size([768])

# Building GPT based contextualized QA correlation prediction model

In [262]:
from transformers import OpenAIGPTTokenizer, OpenAIGPTModel

class QACorrelationPredictionModel(nn.Module):
    def __init__(self, device = torch.device("cpu")):
        
        super().__init__()
        
        self.device = device
        
        #Pre-trained Contextual Embedding
        self.gpt = OpenAIGPTModel.from_pretrained('openai-gpt')
        #self.gpt.to(device)
        #self.gpt.eval() #freezing model not trainable.
        
        #GPT tokenizer
        self.tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

        #correlation prediction head.
        self.pred_head = nn.Sequential(
                    #nn.Dropout(p = 0.3),
                    #nn.LayerNorm(normalized_shape = (768*2)), #for NLP task, we DONT use batchnorm. instead SHOULD use layernorm.
                    nn.Linear(in_features = 768 * 2, out_features = 300, bias = True),
                    nn.ReLU(),
                    nn.Linear(in_features = 300, out_features = 1, bias = True),
                    nn.Sigmoid(),
                    )
        #self.pred_head.to(device)
    
    def forward(self, batch):
        self.gpt.eval()
        
        #A batch consists of ((q,a), y), in which
        #q and q are lists, y an array.

        #get embeddings
        qa_embed = []
        
        (q_list, a_list), y = batch
        
        for q, a in zip(q_list, a_list):
            
            # Tokenized Q and A
            tokenized_q = tokenizer.tokenize(q)
            tokenized_a = tokenizer.tokenize(a)

            inputs_q = self.tokenizer(q, return_tensors="pt")
            inputs_a = self.tokenizer(a, return_tensors="pt")
            
            #sending tensors to GPU if possible
            inputs_q = inputs_q.to(self.device)
            inputs_a = inputs_a.to(self.device)
            
            # We only need last layer output the CLS position as embedding.!
            with torch.no_grad():
                outputs_q = self.gpt(**inputs_q)
                outputs_a = self.gpt(**inputs_a)
      
            q_embed = outputs_q.last_hidden_state[0][0]
            a_embed = outputs_q.last_hidden_state[0][0]
        
            #Concatenate two 768 into 768*2
            embed = torch.cat([q_embed, a_embed])
            qa_embed.append(embed)
            
        qa_embed = torch.stack(qa_embed)
        #print("qa_embed.shape = ", qa_embed.shape)
        
        logits = self.pred_head(qa_embed)
        logits = logits.squeeze(1)
        
        return logits

In [263]:
batch = next(iter(dataloader_train))

In [276]:
(q,a),labels = batch
labels

tensor([0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
        1, 1, 1, 1, 0, 0, 0, 1])

In [264]:
#testing the model simply.
model = QACorrelationPredictionModel(device = device)
model.to(device)
batch = next(iter(dataloader_train))
probs = model(batch)

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [265]:
probs

tensor([0.4502, 0.4502, 0.4502, 0.4502, 0.4502, 0.4502, 0.4502, 0.4502, 0.4502,
        0.4502, 0.4502, 0.4502, 0.4502, 0.4502, 0.4502, 0.4502, 0.4502, 0.4502,
        0.4502, 0.4502, 0.4502, 0.4502, 0.4502, 0.4502, 0.4502, 0.4502, 0.4502,
        0.4502, 0.4502, 0.4502, 0.4502, 0.4502], device='mps:0',
       grad_fn=<SqueezeBackward1>)

In [266]:
probs.shape

torch.Size([32])

In [267]:
probs >= 0.5

tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False], device='mps:0')

In [268]:
torch.mean(probs >= 0.5, dtype = torch.float).cpu()

tensor(0.)

In [269]:
(_, _), labels = batch
labels.shape

torch.Size([32])

In [270]:
labels.shape

torch.Size([32])

In [271]:
labels

tensor([0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
        1, 1, 1, 1, 0, 0, 0, 1])

In [272]:
criterion = torch.nn.BCELoss()

#sending to GPU.
probs = probs.to(device)
labels = labels.float().to(device)

criterion(probs, labels)

tensor(0.6981, device='mps:0', grad_fn=<BinaryCrossEntropyBackward0>)

# Training on epoch

In [273]:
%matplotlib inline

#from IPython.display import display, clear_output
from IPython import display

def train_one_epoch(
          model, 
          device, 
          dataloader, 
          optimizer, 
          criterion,
          epoch,
          max_batches = None):
    
    # Enable gradient computing
    model.to(device)
    model.train()
    
    if max_batches is None:
        max_batches = len(dataloader)
    
    #some statistics
    
    #averaged loss in current epoch.
    epoch_loss = 0.0
    total_loss = 0.0
    
    #accuracy in current epoch
    batch_accuracy = 0.0
    #accuracy in current batch
    epoch_accuracy = 0.0
    
    #how many samples predicted correct.
    epoch_corrects = 0.0
    #how many samples trained in this epoch
    epoch_total = 0.0
    
    for batch_idx, batch in enumerate(dataloader, 1):
        
        (q,a), labels_ = batch
        
        labels = labels_.float()
        #sending labels to GPU if possible
        labels = labels.to(device)

        optimizer.zero_grad()
        
        #predictions.
        preds = model(batch)
        
        #computing BCE
        loss = criterion(preds, labels)
           
        #computing gradients
        loss.backward()
        
        #optimizing the classifier, Notice: the GPT is fixed.
        optimizer.step()
        
        
        #computing accuracy in a batch
        # torch.max() returns values, indices
        preds_ = (preds >= 0.5).int().cpu().data
        #batch_accuracy = torch.mean(preds > 0.5, dtype = torch.float).detach().cpu()
        #batch_accuracy = torch.mean(preds_.float()).detach().cpu()
        
        #computing the total loss and average loss in one epoch
        total_loss += loss.detach().cpu().numpy()
        epoch_loss = total_loss / batch_idx
        
        #computing the correct and total samples
        batch_corrects = torch.sum(labels_.cpu().data == preds_, dtype = torch.int)
        batch_accuracy = batch_corrects / len(labels_)
        epoch_corrects += batch_corrects
        epoch_total += len(labels_)
        epoch_accuracy = epoch_corrects / epoch_total         

        #Updating training displays.
        display.clear_output(wait=True)
        
        display.display('Epoch {} [{}/{} ({:.0f}%)]'.format(
                    epoch, batch_idx, 
                    len(dataloader), 
                    100. * (batch_idx / len(dataloader))))
        
        display.display('* batch accuracy {:.2f}% epoch accuracy {:.2f}%'.format(
                    100. * batch_accuracy, 100. * epoch_accuracy))
        
        display.display('* loss {:.6f} epoch loss {:.6f}'.format(
                    loss.item(), epoch_loss))
        display.display('* batch_corrects {}'.format(batch_corrects))
        
        if batch_idx > max_batches:
            break
    
    return epoch_loss, epoch_accuracy

In [279]:
model = QACorrelationPredictionModel(device = device)
model.to(device)

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


QACorrelationPredictionModel(
  (gpt): OpenAIGPTModel(
    (tokens_embed): Embedding(40478, 768)
    (positions_embed): Embedding(512, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (pred_head): Sequential(
    (0): Linear(in_features=1536, out_features=300, bias=True)
    (1): ReLU()
    (2): Linear(in_features=300, out_features=1, bias=True)
    (3): Sigmoid()
  )
)

In [280]:
batch_size = 32

dataloader_train = torch.utils.data.DataLoader(qa_dataset_train, batch_size=batch_size)

learning_rate = 0.005

optimizer = torch.optim.Adam(
                        model.pred_head.parameters(), 
                        lr = learning_rate,
                        #momentum = 0.9, 
                        #weight_decay = 5e-4
                      )

#Loss function
criterion = torch.nn.BCELoss(reduction='mean')

epoch_loss, epoch_accuracy = train_one_epoch(
          model, 
          device, 
          dataloader_train, 
          optimizer, 
          criterion,
          epoch = 1,
          max_batches = 100)



'* batch accuracy 37.50% epoch accuracy 52.23%'

'* loss 0.733202 epoch loss 0.834022'

'* batch_corrects 12'

KeyboardInterrupt: 