# Question Answering with SQuAD Dataset and Pre-Trained GPT 2

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split

import math
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import random
import json
import re
import time
import pickle

from matplotlib import pyplot as plt
#plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['figure.figsize'] = (4, 2)
plt.rcParams['axes.grid'] = True

%load_ext autoreload
%autoreload 2
    
#from models.transformer import GPT
from transformers import AutoTokenizer, GPT2LMHeadModel
    
from transformers import BertTokenizerFast

from datasets import load_dataset

import re

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'USING DEVICE: {device}')

USING DEVICE: cuda


# Hyperparameters

In [2]:
steplr = {
    'type': 'StepLR',
    #'step_size': 2,
    'step_size': 800,
    #'gamma': 0.80,
    'gamma': 0.99,
}

reduce_lr_on_plateau = {
    'type': 'ReduceLROnPlateau',
    'mode': 'min',
    'factor': 0.1,
    'patience': 3,
    'cooldown': 0,
    'min_lr': 1e-7,
}

# TODO: revise the max_seq_len and context_size
hyperparameters = {
    'seed': 99999,
    'batch_size': 32,
    #'vocab_size': 50_257,
    #'max_seq_len': 256, 
    'context_size': 256,
    'split_ratio': 0.75,
    'num_epochs': 2,
    #'num_training_iters': 5_000,
    #'num_validation_iters': 1_000,
    'optimizer': {
        'learning_rate': 1e-4, # lower than default for fine tuning
        'momentum': 0.9, # SGD
        'optimizer_betas': (0.9, 0.999), # Adam, AdamW
        'weight_decay': 1e-2, # AdamW
    },
    'clip_grad_norm': 1.0,
    'grad_accum_iter': 4,
    'learning_rate_sched_config': reduce_lr_on_plateau,
    #'dataset_dir': '../data/trwiki-20231120-pages-articles/',
    'model_base_name': 'QA_GPT2_SQuAD_FineTune',
    'model_config': {}, # removed from above!
}

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False # !!!!!!!!!!!!!!!!!!!!!
    
seed_everything(hyperparameters['seed'])

# Tokenizer

In [4]:
tokenizer = AutoTokenizer.from_pretrained('openai-community/gpt2')
len(tokenizer.vocab)

50257

## Set Special Tokens for QA 

In [5]:
tokenizer.pad_token = tokenizer.eos_token # !!!!!!!!!!

### Update hyperparameters

In [6]:
hyperparameters['model_config']['vocab_size'] = len(tokenizer.vocab)

In [7]:
len(tokenizer.vocab)

50257

# Dataset

In [8]:
squad_full = load_dataset('rajpurkar/squad')

squad_train = squad_full['train']
squad_val = squad_full['validation']

print(len(squad_train))
print(len(squad_val))

87599
10570


In [9]:
dataset_train_df = squad_train.to_pandas()
dataset_val_df = squad_val.to_pandas()

In [10]:
dataset_train_df.head()

Unnamed: 0,id,title,context,question,answers
0,5733be284776f41900661182,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,"{'text': ['Saint Bernadette Soubirous'], 'answ..."
1,5733be284776f4190066117f,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,"{'text': ['a copper statue of Christ'], 'answe..."
2,5733be284776f41900661180,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,"{'text': ['the Main Building'], 'answer_start'..."
3,5733be284776f41900661181,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,{'text': ['a Marian place of prayer and reflec...
4,5733be284776f4190066117e,University_of_Notre_Dame,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,{'text': ['a golden statue of the Virgin Mary'...


In [11]:
dataset_val_df.head()

Unnamed: 0,id,title,context,question,answers
0,56be4db0acb8001400a502ec,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,"{'text': ['Denver Broncos', 'Denver Broncos', ..."
1,56be4db0acb8001400a502ed,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,"{'text': ['Carolina Panthers', 'Carolina Panth..."
2,56be4db0acb8001400a502ee,Super_Bowl_50,Super Bowl 50 was an American football game to...,Where did Super Bowl 50 take place?,"{'text': ['Santa Clara, California', 'Levi's S..."
3,56be4db0acb8001400a502ef,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team won Super Bowl 50?,"{'text': ['Denver Broncos', 'Denver Broncos', ..."
4,56be4db0acb8001400a502f0,Super_Bowl_50,Super Bowl 50 was an American football game to...,What color was used to emphasize the 50th anni...,"{'text': ['gold', 'gold', 'gold'], 'answer_sta..."


In [12]:
def print_sample_qa(df, sample_id):
    print('CONTEXT:')
    print(df['context'][sample_id])
    print('QUESTION:')
    print(df['question'][sample_id])
    print('ANSWER:')
    print(df['answers'][sample_id]['text'][0])

In [13]:
print_sample_qa(dataset_train_df, 0)

CONTEXT:
Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
QUESTION:
To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
ANSWER:
Saint Bernadette Soubirous


In [14]:
print_sample_qa(dataset_train_df, 1000)

CONTEXT:
After Hurricane Katrina in 2005, Beyoncé and Rowland founded the Survivor Foundation to provide transitional housing for victims in the Houston area, to which Beyoncé contributed an initial $250,000. The foundation has since expanded to work with other charities in the city, and also provided relief following Hurricane Ike three years later.
QUESTION:
How much did Beyonce initially contribute to the foundation?
ANSWER:
$250,000


# Prompt Format

In [15]:
# closed (context dependent) question answering
#make_prompt_df = lambda df_row:f'{df_row["context"]} [QUESTION] {df_row["question"]} [ANSWER] {df_row["answers"]["text"][0]}'

# open-ended (next token prediction) question answering
make_prompt_df = lambda df_row:f'[QUESTION] {df_row["question"]} [ANSWER] {df_row["answers"]["text"][0]} [END]'

In [16]:
# use context as language modelling
#dataset_train_df['prompt'] = dataset_train_df[['context']]
#dataset_val_df['prompt'] = dataset_train_df[['context']]

# closed (context dependent) question answering
#dataset_train_df['prompt'] = dataset_train_df[['context', 'question', 'answers']].apply(make_prompt_df, axis=1)
#dataset_val_df['prompt'] = dataset_val_df[['context', 'question', 'answers']].apply(make_prompt_df, axis=1)

# open-ended (next token prediction) question answering
dataset_train_df['prompt'] = dataset_train_df[['question', 'answers']].apply(make_prompt_df, axis=1)
dataset_val_df['prompt'] = dataset_val_df[['question', 'answers']].apply(make_prompt_df, axis=1)

# Set Maximum Sequence (Token) Length 

In [17]:
def find_optimal_token_len(df_train, df_val, tokenizer):
    # keep only words and punctuation
    pattern = r'\w+|[^\w\s]'
    
    def text_process(text):
        result = ' '.join(re.findall(pattern, text))
        return len(tokenizer.encode(result))
    
    train_token_len = df_train['prompt'].map(lambda x:text_process(x)).values
    val_token_len = df_val['prompt'].map(lambda x:text_process(x)).values

    total_token_len = np.concat((train_token_len, val_token_len))
    
    print(f'Mean: {total_token_len.mean()}')
    print(f'Std: {total_token_len.std()}')
    print(f'Max: {total_token_len.max()}')
    print(f'Mean + 2*Std {total_token_len.mean() + 2*total_token_len.std()}')
    print(f'Mean + 3*Std {total_token_len.mean() + 3*total_token_len.std()}')
    
    return int(total_token_len.max()), round(total_token_len.mean() + 3*total_token_len.std())

In [18]:
token_len_max, token_len_3std = find_optimal_token_len(dataset_train_df, dataset_val_df, tokenizer)
token_len_max, token_len_3std

Mean: 28.756613594923042
Std: 5.7988131571338775
Max: 99
Mean + 2*Std 40.354239909190795
Mean + 3*Std 46.153053066324674


(99, 46)

In [19]:
token_len_max, token_len_3std

(99, 46)

# Dataset Class

In [20]:
class SQuADDataset(Dataset):
    def __init__(self, df, tokenizer, max_seq_len):
        
        self.df = df
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        selected_row = self.df.iloc[idx]
        prompt_text = selected_row['prompt']
        
        tokenized_dict = self.tokenizer.encode_plus(
            prompt_text,
            padding='max_length',
            max_length=self.max_seq_len,
            truncation=True,
            add_special_tokens=True,
            return_token_type_ids=False,
        )

        input_ids = torch.tensor(tokenized_dict['input_ids'], dtype=torch.long)
        attention_mask = torch.tensor(tokenized_dict['attention_mask'], dtype=torch.float32)

        #### SHIFT BY ONE TOKEN ####
        input_input_ids = input_ids[:-1]
        input_attention_mask = attention_mask[:-1]
        target_input_ids = input_ids[1:]
        ############################
                
        return input_input_ids, input_attention_mask, target_input_ids

In [21]:
dataset_train = SQuADDataset(
    df=dataset_train_df,
    tokenizer=tokenizer,
    max_seq_len=token_len_3std,
)

dataset_val = SQuADDataset(
    df=dataset_val_df,
    tokenizer=tokenizer,
    max_seq_len=token_len_3std,
)

In [22]:
i, a, o = dataset_train[0]
i.shape, a.shape, o.shape

(torch.Size([45]), torch.Size([45]), torch.Size([45]))

In [23]:
tokenizer.decode(i.tolist())

'[QUESTION] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [ANSWER] Saint Bernadette Soubirous [END]<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'

In [24]:
tokenizer.decode(o.tolist())

'QUESTION] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [ANSWER] Saint Bernadette Soubirous [END]<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'

# Dataloader

In [25]:
dataloader_train = DataLoader(
    dataset_train, 
    batch_size=hyperparameters['batch_size'], 
    shuffle=True,
    #num_workers=1, 
    pin_memory=True,
)

dataloader_val = DataLoader(
    dataset_val, 
    batch_size=hyperparameters['batch_size'], 
    shuffle=True,
    #num_workers=1, 
    pin_memory=True,
)

In [26]:
ib, ab, ob = next(iter(dataloader_train))
ib.shape, ab.shape, ob.shape

(torch.Size([32, 45]), torch.Size([32, 45]), torch.Size([32, 45]))

In [27]:
ib, ab, ob = next(iter(dataloader_val))
ib.shape, ab.shape, ob.shape

(torch.Size([32, 45]), torch.Size([32, 45]), torch.Size([32, 45]))

# Model

In [28]:
#model = GPT(**hyperparameters['model_config'])
model = GPT2LMHeadModel.from_pretrained('openai-community/gpt2')

print(f'Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}')

Trainable parameters: 124,439,808


In [29]:
for n, m in model.named_children():
    print(f'{n} parameters: {sum(p.numel() for p in m.parameters() if p.requires_grad):,}')

transformer parameters: 124,439,808
lm_head parameters: 38,597,376


In [30]:
_batch_size, _seq_len = 1, 20

pred = model(torch.randint(low=0, high=hyperparameters['model_config']['vocab_size'], size=(_batch_size, _seq_len))).logits
pred.shape

torch.Size([1, 20, 50257])

# Training

In [31]:
PBAR_UPDATE_FREQ = 60

list_avg = lambda l: sum(l)/len(l)

In [32]:
def train_iter_amp(dataloader, model, optimizer, criterion, scaler, epoch, clip_grad_norm, pbar_update_freq, grad_accum_iter, device):
    model.train()
    
    avg_loss = []
    count = 0

    pbar = tqdm(dataloader, unit=' batch', position=0, leave=True)
    
    pbar.set_description(f'Epoch: {epoch}, Train')
        
    for batch_idx, (input_token_ids_batch, attention_mask_batch, target_token_ids_batch) in enumerate(dataloader):

        input_token_ids_batch = input_token_ids_batch.to(device)
        attention_mask_batch = attention_mask_batch.to(device)
        target_token_ids_batch = target_token_ids_batch.to(device)

        optimizer.zero_grad()

        with torch.autocast(device_type=device, dtype=torch.float16, enabled=True):
            pred_token_ids_batch = model(
                input_ids=input_token_ids_batch,
                attention_mask=attention_mask_batch,
                #x_input=input_token_ids_batch, 
                #pad_mask=attention_mask_batch
            ).logits ## RETURN LOGITS!
    
            # Combine batch and seq_len dims together to form a "longer batch"
            loss = criterion(
                pred_token_ids_batch.view(-1, pred_token_ids_batch.size(-1)),
                target_token_ids_batch.view(-1)
            )

        scaler.scale(loss).backward()
        ### CLIPPING ######
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
        ###################
        scaler.step(optimizer)
        scaler.update()
        
        avg_loss.append(loss.item())
        count += 1

        if count % pbar_update_freq  == 0:
            #iter_accuracy = 100.0 * acc_correct / acc_total
            #pbar.set_postfix_str(f'Loss: {list_avg(avg_loss):.4f} Acc: {iter_accuracy:.2f}')
            pbar.set_postfix_str(f'Loss: {list_avg(avg_loss):.4f}')
            pbar.update(pbar_update_freq)
        
    pbar.close()
    
    return list_avg(avg_loss)


@torch.no_grad()
def eval_iter_amp(dataloader, model, criterion, epoch, pbar_update_freq, device):
    model.eval()

    avg_loss = []
    count = 0
    
    pbar = tqdm(dataloader, unit=' batch', position=0, leave=True)
    
    pbar.set_description(f'Epoch: {epoch}, Eval')
    
    for input_token_ids_batch, attention_mask_batch, target_token_ids_batch in dataloader:

        input_token_ids_batch = input_token_ids_batch.to(device)
        attention_mask_batch = attention_mask_batch.to(device)
        target_token_ids_batch = target_token_ids_batch.to(device)

        with torch.autocast(device_type=device, dtype=torch.float16, enabled=True):
            pred_token_ids_batch = model(
                input_ids=input_token_ids_batch,
                attention_mask=attention_mask_batch,
                #x_input=input_token_ids_batch,
                #pad_mask=attention_mask_batch
            ).logits ## RETURN LOGITS!
        
            # Combine batch and seq_len dims together to form a "longer batch"
            loss = criterion(
                pred_token_ids_batch.view(-1, pred_token_ids_batch.size(-1)),
                target_token_ids_batch.view(-1)
            )

            
        avg_loss.append(loss.item())
        count += 1
        
        if count % pbar_update_freq  == 0:
            #iter_accuracy = 100.0 * acc_correct / acc_total
            #pbar.set_postfix_str(f'Loss: {list_avg(avg_loss):.4f} Acc: {iter_accuracy:.2f}')
            pbar.set_postfix_str(f'Loss: {list_avg(avg_loss):.4f}')
            pbar.update(pbar_update_freq)

    pbar.close()

    return list_avg(avg_loss)

# Optimizer, Scheduler & Loss

In [33]:
optimizer = torch.optim.AdamW(
    model.parameters(), 
    lr=hyperparameters['optimizer']['learning_rate'],
    betas=hyperparameters['optimizer']['optimizer_betas'],
    weight_decay=hyperparameters['optimizer']['weight_decay']
)


"""
optimizer = torch.optim.SGD(
    model.parameters(), 
    lr=hyperparameters['optimizer']['learning_rate'],
    momentum=hyperparameters['optimizer']['momentum']
    #weight_decay=hyperparameters['optimizer']['weight_decay']
)
"""

# PAD ID IGNORE!!
#criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.vocab['[PAD]'])
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.encode(tokenizer.pad_token)[0])

lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    mode=hyperparameters['learning_rate_sched_config']['mode'],
    factor=hyperparameters['learning_rate_sched_config']['factor'],
    patience=hyperparameters['learning_rate_sched_config']['patience'],
    cooldown=hyperparameters['learning_rate_sched_config']['cooldown'],
    min_lr=hyperparameters['learning_rate_sched_config']['min_lr'],
    #verbose=True,
)

# Grad Scaler (FP16) For Automatic Mixed Precision (AMP)

In [34]:
scaler = torch.amp.GradScaler()

# Save/Load Functions

In [35]:
def save_model(model, optimizer, root_folder, file_name, hyperparameter_dict, verbose=False):
    os.makedirs(root_folder, exist_ok=True)
    model_full_path = os.path.join(root_folder, file_name+'.pt')
    
    torch.save({
        'hyperparameters': hyperparameter_dict,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'saved_time_unix': time.time(),
        'saved_time_asctime': time.asctime(),
    }, model_full_path)
    
    if verbose:
        print(f'Model: {file_name} is saved successfully')
    
    
def load_model(model, optimizer, root_folder, file_name):
    model_full_path = os.path.join(root_folder, file_name+'.pt')
    #checkpoint = torch.load(model_full_path, map_location='cpu')
    checkpoint = torch.load(model_full_path)
    
    model.load_state_dict(checkpoint['model_state_dict'], strict=True)
    
    #if optimizer is not None:
    #    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    
    print(f'Model: {file_name} is loaded successfully')
    
    return checkpoint

# Language Modelling Functions

In [36]:
def generate_multinomial_sampling_amp(model, idx, max_new_tokens, max_seq_len, temp=1.0, topk=None):
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -max_seq_len:]

        with torch.autocast(device_type=device, dtype=torch.float16, enabled=True):
            
            logits = model(input_ids=idx_cond, attention_mask=None).logits ## RETURN LOGITS!
    
            logits = logits[:, -1, :]
                       
            if topk is not None:
                _values, _indices = F.softmax(logits/temp, dim=-1).topk(topk, dim=1)
                #probs = F.softmax(logits/temp, dim=-1).topk(topk, dim=-1).values
                probs = _values
            else:
                probs = F.softmax(logits/temp, dim=-1)
            
            # sample from the distribution
            _idx_next = torch.multinomial(probs, num_samples=1) 
    
            idx_next = _indices[:, _idx_next[0]]
    
            idx = torch.cat((idx, idx_next), dim=1)
    return idx


def generate_text(model, device, n_tokens, temp, context=None, topk=None, remove_newlines=True, skip_after_end_token=True):

    model.eval()
    
    if context is None:
        context = torch.tensor([tokenizer.encode('', add_special_tokens=False)], dtype=torch.long, device=device)
    else:
        context = torch.tensor([tokenizer.encode(context, add_special_tokens=False)], dtype=torch.long, device=device)

    _genereted = generate_multinomial_sampling_amp(
            model,
            context, 
            max_new_tokens=n_tokens, 
            max_seq_len=hyperparameters['context_size'], 
            temp=temp,
            topk=topk
        )
    
    generated = tokenizer.decode(
        _genereted[0].tolist()
    )
    
    if remove_newlines:
        generated = generated.replace('\n', '')

    # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    # END TOKEN IS ASSUMED TO BE [END]
    if skip_after_end_token:
        generated = generated.split('[END]')[0]
        
    print(generated)

### Start Epoch

In [37]:
START_EPOCH = 1

### Load Pre-Trained Model (Optional)

In [38]:
#checkpoint = load_model(model, optimizer, './saved_models', f"{hyperparameters['model_base_name']}_best")
#START_EPOCH = checkpoint['last_epoch'] + 1

# Start Training

In [39]:
def start_training(start_epoch=1):
    print(f'Start trainin from epoch: {start_epoch}')
    
    model.to(device)
    criterion.to(device)
    
    for epoch in range(start_epoch, hyperparameters['num_epochs']+1):
        
        train_loss = train_iter_amp(
            dataloader_train, 
            model, 
            optimizer, 
            criterion, 
            scaler, 
            epoch,
            hyperparameters['clip_grad_norm'], 
            PBAR_UPDATE_FREQ, 
            hyperparameters['grad_accum_iter'],
            device
        )
        
        val_loss = eval_iter_amp(
            dataloader_val, 
            model, 
            criterion, 
            epoch, 
            PBAR_UPDATE_FREQ, 
            device
        )
        
        print(f'Epoch: {epoch}, [LOSS] train: {train_loss:.4f}, val: {val_loss:.4f}')
        
        generate_text(
            model=model, 
            device=device,
            n_tokens=100, 
            temp=0.75, 
            context='[QUESTION] To whom did the ',
            topk=50,
            remove_newlines=False
        )
        
        # LR Scheduling
        lr_scheduler.step(val_loss)

### Test (Before Training)

In [40]:
model.to(device)

generate_text(
    model=model, 
    device=device,
    n_tokens=250, 
    temp=0.75, 
    context='[QUESTION] How much did Beyonce initially contribute to the foundation? [ANSWER]',
    topk=150,
    remove_newlines=False,
    skip_after_end_token=False,
)

[QUESTION] How much did Beyonce initially contribute to the foundation? [ANSWER] I think she may have contributed to a lot of things. She was a very prolific artist back then, and I think that was a really large part of her influence. I remember that she gave me a piece of paper to remember the foundation, and they helped her with it. I remember she did a lot of writing to help me remember. I think that was really a part of her personality as well.

AMY GOODMAN: And how did she get the money for the foundation?

JACOB JONES: She used to help me fund everything. She used to be one of the most generous people I have ever met, and she doesn't know what she's going through. She didn't have a lot of money. She used to be a very creative person. She knew how to do things. I think her foundation helped her a lot.

And she wasn't the only person who was involved in this. I think she had a very long list that she knew how to do. She had a list of things to do. I think she had a lot of things th

In [41]:
generate_text(
    model=model, 
    device=device,
    n_tokens=250, 
    temp=0.75, 
    context='[QUESTION] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [ANSWER]',
    topk=150,
    remove_newlines=False,
    skip_after_end_token=False,
)

[QUESTION] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [ANSWER]

QUESTION: Your Honor, I can't answer that question. I have been in France for some years, and they have all the customs, but I am not sure that I read the records.

QUESTION: And do you recall at some point, at some point during the campaign, the day you said to the ambassador asked you, "From what records would they be able to find this Virgin Mary?"

QUESTION: Of course I was curious what you said. And you know, I was not one of the people who signed the petition. I was not one of those people who was in the press or in the press in those days. And you can tell how much I was shocked. I was very surprised when I got that letter.

QUESTION: I understand.

QUESTION: And you know, I also wrote a letter to the ambassador and to the ambassador, to the minister of the interior. I didn't get to that point.

QUESTION: You know, I was in the French press and I was a part of it. But at some point, I th

### Start Training

In [42]:
#optimizer.param_groups[0]['lr'] = 0.0001

In [43]:
start_training(start_epoch=START_EPOCH)

Start trainin from epoch: 1


Epoch: 1, Train:  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 2700/2738 [07:29<00:06,  6.00 batch/s, Loss: 2.5026]
Epoch: 1, Eval:  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 300/331 [00:15<00:01, 19.63 batch/s, Loss: 2.5095]


Epoch: 1, [LOSS] train: 2.5011, val: 2.5082
[QUESTION] To whom did the  National Institute of Architects release its report on the design of the World Trade Center? [ANSWER] the American Red Cross 


Epoch: 2, Train:  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 2700/2738 [07:31<00:06,  5.98 batch/s, Loss: 2.2990]
Epoch: 2, Eval:  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 300/331 [00:15<00:01, 18.80 batch/s, Loss: 2.5110]


Epoch: 2, [LOSS] train: 2.2989, val: 2.5085
[QUESTION] To whom did the  "No Surprises" campaign take place in the US? [ANSWER] the AFL 


# Test

In [44]:
generate_text(
    model=model, 
    device=device,
    n_tokens=75, 
    temp=0.55, 
    context='[QUESTION] How much did Beyonce initially contribute to the foundation? [ANSWER]',
    topk=50,
    remove_newlines=False,
    skip_after_end_token=True,
)

[QUESTION] How much did Beyonce initially contribute to the foundation? [ANSWER] $1 million 


In [45]:
generate_text(
    model=model, 
    device=device,
    n_tokens=75, 
    temp=0.55, 
    context='[QUESTION] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [ANSWER]',
    topk=50,
    remove_newlines=False,
    skip_after_end_token=True,
)

[QUESTION] To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? [ANSWER] Pope Innocent III 


In [48]:
generate_text(
    model=model, 
    device=device,
    n_tokens=75, 
    temp=0.55, 
    context='[QUESTION] What color are the grass? [ANSWER]',
    topk=50,
    remove_newlines=False,
    skip_after_end_token=True,
)

[QUESTION] What color are the grass? [ANSWER] green 


# Save Model

In [47]:
save_model(
    model=model,
    optimizer=optimizer, 
    root_folder='./saved_models/', 
    file_name=hyperparameters['model_base_name'], 
    hyperparameter_dict=hyperparameters, 
    verbose=True
)

Model: QA_GPT2_SQuAD_FineTune is saved successfully


# The End