In [27]:
!nvidia-smi

Tue Sep  7 15:57:00 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.118.02   Driver Version: 440.118.02   CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 108...  On   | 00000000:51:00.0 Off |                  N/A |
|  0%   31C    P8     9W / 250W |   3350MiB / 11178MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce GTX 108...  On   | 00000000:CB:00.0 Off |                  N/A |
|  0%   33C    P8     9W / 250W |   3826MiB / 11178MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce GTX 108...  On   | 00000000:D5:00.0 Off |                  N/A |
|  0%   

In [28]:
import numpy as np 
import pandas as pd 
import json 

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from pytorch_lightning.callbacks import ModelCheckpoint
import textwrap
from transformers import AdamW, T5ForConditionalGeneration, T5Tokenizer, get_linear_schedule_with_warmup
import pytorch_lightning as pl
from tqdm.notebook import tqdm

In [29]:
device = torch.device("cuda:0")

In [30]:
def squad_json_to_dataframe_train(input_file_path, record_path = ['data','paragraphs','qas','answers'],
                           verbose = 1):
    """
    input_file_path: path to the squad json file.
    record_path: path to deepest level in json file default value is
    ['data','paragraphs','qas','answers']
    verbose: 0 to suppress it default is 1
    """
    if verbose:
        print("Reading the json file")    
    file = json.loads(open(input_file_path).read())
    if verbose:
        print("processing...")
    # parsing different level's in the json file
    js = pd.json_normalize(file , record_path )
    m = pd.json_normalize(file, record_path[:-1] )
    r = pd.json_normalize(file,record_path[:-2])
    
    #combining it into single dataframe
    idx = np.repeat(r['context'].values, r.qas.str.len())
    ndx  = np.repeat(m['id'].values,m['answers'].str.len())
    m['context'] = idx
    js['q_idx'] = ndx
    main = pd.concat([ m[['id','question','context']].set_index('id'),js.set_index('q_idx')],1,sort=False).reset_index()
    main['c_id'] = main['context'].factorize()[0]
    if verbose:
        print("shape of the dataframe is {}".format(main.shape))
        print("Done")
    return main

In [31]:
input_file_path = 'train-v2.0.json'
record_path = ['data','paragraphs','qas','answers']
df = squad_json_to_dataframe_train(input_file_path=input_file_path,record_path=record_path)

Reading the json file
processing...
shape of the dataframe is (130319, 6)
Done


In [37]:
df[:5]

Unnamed: 0,index,question,context,text,answer_start,c_id
0,56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,in the late 1990s,269.0,0
1,56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,singing and dancing,207.0,0
2,56be85543aeaaa14008c9066,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,2003,526.0,0
3,56bf6b0f3aeaaa14008c9601,In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"Houston, Texas",166.0,0
4,56bf6b0f3aeaaa14008c9602,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s,276.0,0


In [38]:
df = df.dropna()

In [39]:
train_df, val_df = train_test_split(df, test_size=0.1)
train_df.shape, val_df.shape

((78138, 6), (8683, 6))

In [40]:
MODEL_NAME= "t5-base"
BATCH_SIZE = 8
N_EPOCH= 2
SOURCE_MAX_TOKEN_LEN= 300
TARGET_MAX_TOKEN_LEN= 32

In [41]:
tokernizer= T5Tokenizer.from_pretrained(MODEL_NAME)

In [42]:
class SQuADDataset(Dataset):
    def __init__(self, data: pd.DataFrame, 
                 tokernizer: T5Tokenizer, 
                 source_max_token_len: int = 256, 
                 target_max_token_len= 32):
        
        self.tokernizer= tokernizer
        self.data= data
        self.source_max_token_len= source_max_token_len
        self.target_max_token_len= target_max_token_len
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index:int):
        data_row = self.data.iloc[index]
        source_encoding = self.tokernizer(
            data_row['question'],
            data_row['context'],
            max_length=self.source_max_token_len,
            padding= "max_length",
            truncation= "only_second",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors= "pt"
        )
        
        target_encoding = self.tokernizer(
            data_row['text'],
            max_length=self.target_max_token_len,
            padding= "max_length",
            truncation= True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors= "pt"
        )
        
        labels = target_encoding["input_ids"]
        labels[labels==0] = -100
        
        return (data_row["question"],
                data_row['context'],
                data_row['text'],
                source_encoding["input_ids"].flatten(),
                source_encoding["attention_mask"].flatten(),
                labels.flatten())
        

In [43]:
train_dataset= SQuADDataset(data= train_df, 
                             tokernizer= tokernizer,
                             source_max_token_len= SOURCE_MAX_TOKEN_LEN,
                             target_max_token_len= TARGET_MAX_TOKEN_LEN)

val_dataset= SQuADDataset(data= val_df, 
                             tokernizer= tokernizer,
                             source_max_token_len= SOURCE_MAX_TOKEN_LEN,
                             target_max_token_len= TARGET_MAX_TOKEN_LEN)

In [44]:
train_loader= DataLoader(train_dataset,
                         batch_size= BATCH_SIZE,
                         shuffle= True,
                         num_workers=4)

val_loader= DataLoader(val_dataset,
                         batch_size= BATCH_SIZE,
                         shuffle= True,
                         num_workers=4)

In [37]:
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
optimizer = AdamW(model.parameters(), lr=0.0001)

Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
min_val_loss= float('inf')
for epoch in range(N_EPOCH):    
    model.train()
    train_epoch_loss = 0
    for question, context, answer_text, input_ids, attention_mask, labels in tqdm(train_loader, desc=f"Epoch {epoch}: Traingin"):
        input_ids= input_ids.to(device)
        attention_mask= attention_mask.to(device)
        labels= labels.to(device)
        optimizer.zero_grad()

        output = model(input_ids= input_ids, attention_mask=attention_mask, labels=labels)
        loss, logits  =output.loss, output.logits
        loss.backward()
        optimizer.step()

        train_epoch_loss += loss.detach().item()
    
    train_epoch_loss /= len(train_loader)
    
    val_epoch_loss = 0
    model.eval()
    with torch.no_grad():
        for question, context, answer_text, input_ids, attention_mask, labels in tqdm(val_loader, desc=f'Epoch {epoch}: Validation'):
            input_ids= input_ids.to(device)
            attention_mask= attention_mask.to(device)
            labels= labels.to(device)
            optimizer.zero_grad()

            output = model(input_ids= input_ids, attention_mask=attention_mask, labels=labels)
            loss, logits  =output.loss, output.logits

            val_epoch_loss += loss.detach().item()
    val_epoch_loss /= len(val_loader)
    
    print(f'Epoch: {epoch}, Train Loss: {train_epoch_loss}, Validation Loss: {val_epoch_loss}')
    if min_val_loss > val_epoch_loss:
        print(f"Update Model at Epoch :{epoch}")
        torch.save({'state_dict': model.state_dict()}, 'model.tar')
        min_val_loss = val_epoch_loss

Epoch 0: Traingin:   0%|          | 0/9768 [00:00<?, ?it/s]

Epoch 0: Validation:   0%|          | 0/1086 [00:00<?, ?it/s]

Epoch: 0, Train Loss: 0.354192273237458, Validation Loss: 0.27460871719108626
Update Model at Epoch :0


Epoch 1: Traingin:   0%|          | 0/9768 [00:00<?, ?it/s]

Epoch 1: Validation:   0%|          | 0/1086 [00:00<?, ?it/s]

Epoch: 1, Train Loss: 0.2526105786631773, Validation Loss: 0.2804432353793779


In [45]:
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
checkpoint = torch.load('model.tar')
model.load_state_dict(checkpoint['state_dict'])
model= model.to(device)

Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [46]:
def generate_answer(question):
    source_encoding = tokernizer(
            question['question'],
            question['context'],
            max_length=369,
            padding= "max_length",
            truncation= "only_second",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors= "pt"
        ).to(device)
    
    generated_ids = model.generate(
        input_ids=source_encoding['input_ids'],
        attention_mask= source_encoding['attention_mask'],
        num_beams= 1,
        max_length=80,
        length_penalty=1.0,
        early_stopping=True,
        use_cache= True
    )
    preds = [
        tokernizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for gen_id in generated_ids
    ]
    return "".join(preds)

In [47]:
sample = val_df.iloc[890]
print(f"Context is: {sample['context']}"
      f"\nQuestion is: {sample['question']}"
      f"\nActual answer: {sample['text']}"
      f"\nPredected answer: {generate_answer(sample)}")


Context is: Multiracial Americans are Americans who have mixed ancestry of "two or more races". The term may also include Americans of mixed-race ancestry who self-identify with just one group culturally and socially (cf. the one-drop rule). In the 2010 US census, approximately 9 million individuals, or 2.9% of the population, self-identified as multiracial. There is evidence that an accounting by genetic ancestry would produce a higher number, but people live according to social and cultural identities, not DNA. Historical reasons, including slavery creating a racial caste and the European-American suppression of Native Americans, often led people to identify or be classified by only one ethnicity, generally that of the culture in which they were raised. Prior to the mid-20th century, many people hid their multiracial heritage because of racial discrimination against minorities. While many Americans may be biologically multiracial, they often do not know it or do not identify so cultu

In [48]:
sample = val_df.iloc[600]
print(f"Context is: {sample['context']}"
      f"\nQuestion is: {sample['question']}"
      f"\nActual answer: {sample['text']}"
      f"\nPredected answer: {generate_answer(sample)}")


Context is: The largest private university in Uruguay, is also located in Montevideo. ORT Uruguay was first established as a non-profit organization in 1942, and was officially certified as a private university in September 1996, becoming the second private educational institution in the country to achieve that status.[citation needed] It is a member of World ORT, an international educational network founded in 1880 by the Jewish community in Saint Petersburg, Russia. The university has about 8,000 students, distributed among 5 faculties and institutes, mainly geared towards the sciences and technology/engineering. Its current rector as of 2010[update] is Dr. Jorge A. Grünberg.
Question is: How many students does the ORT Uruguay university have?
Actual answer: about 8,000
Predected answer: 8,000


In [49]:
sample = val_df.iloc[8]
print(f"Context is: {sample['context']}"
      f"\nQuestion is: {sample['question']}"
      f"\nActual answer: {sample['text']}"
      f"\nPredected answer: {generate_answer(sample)}")


Context is: The Warsaw Treaty's organization was two-fold: the Political Consultative Committee handled political matters, and the Combined Command of Pact Armed Forces controlled the assigned multi-national forces, with headquarters in Warsaw, Poland. Furthermore, the Supreme Commander of the Unified Armed Forces of the Warsaw Treaty Organization which commands and controls all the military forces of the member countries was also a First Deputy Minister of Defense of the USSR, and the Chief of Combined Staff of the Unified Armed Forces of the Warsaw Treaty Organization was also a First Deputy Chief of the General Staff of the Armed Forces of the USSR. Therefore, although ostensibly an international collective security alliance, the USSR dominated the Warsaw Treaty armed forces.
Question is: Despite being headquartered in Poland, the top-ranking operatives of the Warsaw Pact were from which country?
Actual answer: the USSR
Predected answer: USSR


In [50]:
# these functions are heavily influenced by the HF squad_metrics.py script
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

In [51]:
def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

In [52]:
def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

In [53]:
eval_loader= DataLoader(val_dataset,
                         batch_size= 1,
                         shuffle= True,
                         num_workers=4)

In [55]:
EM= 0
F1= 0
model.eval()
with torch.no_grad():
    for question, context, answer_text, input_ids, attention_mask, labels in tqdm(eval_loader, desc=f'Validation'):
        sample = {"context": context[0], "question": question[0]}
        EM += compute_exact_match(answer_text[0], generate_answer(sample))
        F1 += compute_f1(answer_text[0], generate_answer(sample))
    EM /= len(eval_loader)
    F1 /= len(eval_loader)
    print(f"F1: {F1}, EM: {EM}")

Validation:   0%|          | 0/8683 [00:00<?, ?it/s]

F1: 0.8936549695611788, EM: 0.8009904410917885
