# 1. Data preparation

## i. Get SQuAD v2.0 dataset

In [6]:
import os
import requests

# Download the SQuAD v2.0 train/dev JSON files into ./squad.
# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs('squad', exist_ok=True)

url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'

for file in ['train-v2.0.json', 'dev-v2.0.json']:

    # stream=True lets the body be written to disk chunk by chunk as it arrives
    res = requests.get(f'{url}{file}', stream=True, timeout=60)
    # fail loudly here instead of saving an HTTP error page under a .json name
    res.raise_for_status()

    # 64 KiB chunks: the files are tens of MB, so tiny chunks only add loop overhead
    with open(f'squad/{file}', 'wb') as f:
        for chunk in res.iter_content(chunk_size=1 << 16):
            f.write(chunk)

## ii. Extract JSON Info

In [7]:
import json

def read_squad(path):
    """Flatten a SQuAD-format JSON file into three parallel lists.

    One item is emitted per answer entry, so a context (and its question)
    is repeated once for every answer attached to that question. Questions
    with no answer entries contribute nothing.

    Args:
        path: location of the SQuAD JSON file.

    Returns:
        A ``(contexts, questions, answers)`` tuple of equal-length lists;
        each element of ``answers`` is the answer object as stored in the file.
    """
    with open(path, 'rb') as fh:
        raw = json.load(fh)

    contexts, questions, answers = [], [], []

    for article in raw['data']:
        for paragraph in article['paragraphs']:
            paragraph_text = paragraph['context']
            for qa in paragraph['qas']:
                question_text = qa['question']
                # entries that carry 'plausible_answers' are read from that key,
                # everything else from 'answers'
                source_key = 'plausible_answers' if 'plausible_answers' in qa else 'answers'
                for entry in qa[source_key]:
                    contexts.append(paragraph_text)
                    questions.append(question_text)
                    answers.append(entry)

    return contexts, questions, answers

# Parse both downloaded splits into parallel (contexts, questions, answers) lists.
train_contexts, train_questions, train_answers = read_squad(
    path='squad/train-v2.0.json'
)
val_contexts, val_questions, val_answers = read_squad(
    path='squad/dev-v2.0.json'
)

## iii. Get Dataset Stats

In [28]:
def _avg_word_count(train_texts, val_texts):
    """Return the mean whitespace-token count per string over both splits combined."""
    total_words = sum(len(text.split()) for text in train_texts)
    total_words += sum(len(text.split()) for text in val_texts)
    return total_words / (len(train_texts) + len(val_texts))


# Number of flattened examples in each split.
n = len(train_contexts)
m = len(val_contexts)

# The lists hold one entry per answer, so a context or question that has
# several answers is counted once per answer in these averages.
avg = _avg_word_count(train_contexts, val_contexts)
print(f'article_len_avg = {avg}')

b = _avg_word_count(train_questions, val_questions)
print(f'question_len_avg = {b}')

c = _avg_word_count(
    [answer['text'] for answer in train_answers],
    [answer['text'] for answer in val_answers],
)
print(f'answer_len_avg = {c}')

article_len_avg = 121.15094122682065
question_len_avg = 9.94573014544781
answer_len_avg = 3.1744160050079526


## iv. Sample Articles, Questions & Answers

In [30]:
# First five validation contexts; the same passage repeats because the
# lists hold one entry per answer rather than one per paragraph.
val_contexts[:5]

['The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates 

In [31]:
# First five validation questions; a question appears once for each answer
# entry it has, hence the repeats.
val_questions[:5]

['In what country is Normandy located?',
 'In what country is Normandy located?',
 'In what country is Normandy located?',
 'In what country is Normandy located?',
 'When were the Normans in Normandy?']

In [29]:
# First five validation answer objects.
# NOTE(review): the saved output shows an 'answer_end' key that no visible
# cell adds — presumably set by a cell that was since removed; confirm.
val_answers[:5]

[{'answer_end': 165, 'answer_start': 159, 'text': 'France'},
 {'answer_end': 165, 'answer_start': 159, 'text': 'France'},
 {'answer_end': 165, 'answer_start': 159, 'text': 'France'},
 {'answer_end': 165, 'answer_start': 159, 'text': 'France'},
 {'answer_end': 117, 'answer_start': 94, 'text': '10th and 11th centuries'}]

## v. Reformat Data for fine-tuning with GPT-3

In [9]:
import pandas as pd

def create_fine_tuning_dataset(contexts, questions, answers):
    """Build a prompt/completion table from parallel QA lists.

    Args:
        contexts: passage strings.
        questions: question strings, aligned with ``contexts``.
        answers: mappings with a ``'text'`` entry, aligned with ``contexts``.

    Returns:
        A DataFrame with one row per zipped triple. ``prompt`` is the
        context, a ``Question:`` line and a trailing ``Answer:`` marker;
        ``completion`` is the answer text preceded by a single space.
    """
    records = [
        {
            "prompt": f"{ctx}\nQuestion: {qn}\nAnswer:",
            "completion": f" {ans['text']}",
        }
        for ctx, qn, ans in zip(contexts, questions, answers)
    ]
    return pd.DataFrame(records)

# Build the prompt/completion tables for both splits and display the training one.
df_train = create_fine_tuning_dataset(
    contexts=train_contexts,
    questions=train_questions,
    answers=train_answers,
)
df_val = create_fine_tuning_dataset(
    contexts=val_contexts,
    questions=val_questions,
    answers=val_answers,
)
df_train

Unnamed: 0,prompt,completion
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,in the late 1990s
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,singing and dancing
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,2003
3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"Houston, Texas"
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s
...,...,...
130314,"The term ""matter"" is used throughout physics i...",matter
130315,"The term ""matter"" is used throughout physics i...",Alfvén
130316,"The term ""matter"" is used throughout physics i...",Gk. common matter
130317,"The term ""matter"" is used throughout physics i...",a specifying modifier


In [14]:
# Write each split to qa_<split>.jsonl as line-delimited JSON records.
# df_train / df_val were already produced by create_fine_tuning_dataset from
# these same lists, so they are reused here instead of being rebuilt.
for train_val, ft in [('train', df_train), ('val', df_val)]:
    ft.to_json(f'qa_{train_val}.jsonl', orient='records', lines=True)

# 2. Fine-tuning using openai API

## i. Install dependencies and make fine-tuning request

In [3]:
!pip install --upgrade openai
!pip install wandb

Collecting openai
  Downloading openai-0.18.1.tar.gz (42 kB)
[?25l[K     |███████▊                        | 10 kB 31.3 MB/s eta 0:00:01[K     |███████████████▍                | 20 kB 32.0 MB/s eta 0:00:01[K     |███████████████████████▏        | 30 kB 19.0 MB/s eta 0:00:01[K     |██████████████████████████████▉ | 40 kB 8.6 MB/s eta 0:00:01[K     |████████████████████████████████| 42 kB 1.3 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pandas-stubs>=1.1.0.11
  Downloading pandas_stubs-1.2.0.58-py3-none-any.whl (162 kB)
[K     |████████████████████████████████| 162 kB 17.3 MB/s 
Building wheels for collected packages: openai
  Building wheel for openai (PEP 517) ... [?25l[?25hdone
  Created wheel for openai: filename=openai-0.18.1-py3-none-any.whl size=53168 sha256=fa145f756e3710c35ea8c16f6ad0602a255960b369a9c98d961d32feb7dfa5bd
  Stored

In [1]:
import os
import openai

# Take the key from the environment so it is never written into the notebook;
# the value is None when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [17]:
# Start a fine-tune of the "ada" model from qa_train.jsonl, passing qa_val.jsonl
# as the validation file, with batch size 16 and one epoch.
# NOTE(review): the saved log below reports uploads from partial_qa_train.jsonl /
# partial_qa_val.jsonl, not the file names in this command — presumably the
# output is from a different invocation; confirm.
!openai api fine_tunes.create -t "qa_train.jsonl" -v "qa_val.jsonl" -m "ada" --batch_size 16 --n_epochs 1

Upload progress:   0% 0.00/14.7M [00:00<?, ?it/s]Upload progress: 100% 14.7M/14.7M [00:00<00:00, 19.8Git/s]
Uploaded file from partial_qa_train.jsonl: file-kozQJfxawphJClzuP7LMoKEm
Upload progress: 100% 3.19M/3.19M [00:00<00:00, 7.28Git/s]
Uploaded file from partial_qa_val.jsonl: file-GDkkBdnwF9T4j1tSx3UpGFww
Created fine-tune: ft-NITZelagV69xhSbCM2tvbcdl
Streaming events until fine-tuning is complete...

(Ctrl-C will interrupt the stream, but not cancel the fine-tune)
[2022-05-07 23:01:54] Created fine-tune: ft-NITZelagV69xhSbCM2tvbcdl
[2022-05-07 23:02:16] Fine-tune costs $8.86
[2022-05-07 23:02:17] Fine-tune enqueued. Queue number: 0
[2022-05-07 23:02:19] Fine-tune started

Stream interrupted (client disconnected).
To resume the stream, run:

  openai api fine_tunes.follow -i ft-NITZelagV69xhSbCM2tvbcdl



## ii. Use wandb to visualize model performance

In [21]:
# Sync the fine-tune run to Weights & Biases; the saved log shows it uploading
# the run history to a W&B project under an already logged-in account.
!openai wandb sync

[34m[1mwandb[0m: Currently logged in as: [33mandyl98[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.12.16
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/wandb/run-20220507_232109-ft-NITZelagV69xhSbCM2tvbcdl[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mft-NITZelagV69xhSbCM2tvbcdl[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/andyl98/GPT-3[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/andyl98/GPT-3/runs/ft-NITZelagV69xhSbCM2tvbcdl[0m
[34m[1mwandb[0m: Waiting for W&B process to finish... [32m(success).[0m
[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:             elapsed_examples ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
[34m[1mwandb[0m:               elapsed_token

## iii. Test the model's QA ability on given task

In [32]:
# Name of the fine-tuned model to query.
# NOTE(review): the timestamp in this name (22-50-48) is earlier than the
# fine-tune creation logged in this notebook (23:01:54), so it presumably
# refers to a different run — confirm.
ft_qa = "ada:ft-personal-2022-05-07-22-50-48"

def apply_ft_qa_answer(context, question, answering_model, max_tokens=30, stop=None):
    """Ask a fine-tuned completion model to answer a question about a passage.

    The prompt uses the same layout as the fine-tuning data: the context,
    a ``Question:`` line and a trailing ``Answer:`` marker.

    Args:
        context: passage the answer should be drawn from.
        question: question about the passage.
        answering_model: name of the completion model to call.
        max_tokens: upper bound on generated tokens (default 30).
        stop: stop sequences; defaults to ``['.', '\\n']``. The training
            completions have no end marker, so these cut the generation
            off, but the ``'.'`` stop also truncates any answer that
            itself contains a period — pass a different list to avoid that.

    Returns:
        The text of the first returned choice, unmodified (it may start
        with a space).
    """
    if stop is None:
        stop = ['.', '\n']
    prompt = f"{context}\nQuestion: {question}\nAnswer:"
    # temperature=0 with n=1 requests a single greedy completion
    result = openai.Completion.create(
        model=answering_model,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0,
        top_p=1,
        n=1,
        stop=stop,
    )
    return result['choices'][0]['text']

# Try the fine-tuned model on a passage/question pair.
apply_ft_qa_answer(
    context='Moninder Singh Pandher was one of 19 victims in one of the most gruesome serial killings in India. Pandhe was sentenced to death by a lower court in February. The teen was one among 19 victims -- children and young women.',
    question='What was the amount of children murdered?',
    answering_model=ft_qa,
)

' 19 victims -- children and young women'

## iv. Check model results

In [22]:
# Save the fine-tune's results to result.csv via shell redirection.
# NOTE(review): this ID (ft-SahNQ...) is not the fine-tune created earlier in
# this notebook (ft-NITZ...) — confirm which run is meant to be analysed.
!openai api fine_tunes.results -i ft-SahNQOil65YD9QWNG6Smhvr5 > result.csv

In [24]:
# Load the exported fine-tune results and display them; in the saved output
# the validation_* columns are empty on most rows.
df = pd.read_csv('result.csv')
df

Unnamed: 0,step,elapsed_tokens,elapsed_examples,training_loss,training_sequence_accuracy,training_token_accuracy,validation_loss,validation_sequence_accuracy,validation_token_accuracy
0,1,4496,16,0.200743,0.0000,0.629630,0.129980,0.1875,0.718310
1,1,5520,16,0.156513,0.0625,0.684211,0.147158,0.0625,0.727273
2,2,10144,32,0.209017,0.3125,0.703704,,,
3,3,22192,48,0.064932,0.4375,0.861111,,,
4,4,28096,64,0.123410,0.3750,0.857143,,,
...,...,...,...,...,...,...,...,...,...
8143,8143,42941296,130288,0.144132,0.7500,0.913043,,,
8144,8144,42947072,130304,0.155371,0.6250,0.903226,,,
8145,8145,42951824,130320,0.155763,0.8125,0.933333,,,
8146,8146,42957344,130336,0.152786,0.8125,0.941176,,,


In [26]:
# Keep only the rows where a validation measurement was recorded.
df[df['validation_sequence_accuracy'].notna()]

Unnamed: 0,step,elapsed_tokens,elapsed_examples,training_loss,training_sequence_accuracy,training_token_accuracy,validation_loss,validation_sequence_accuracy,validation_token_accuracy
0,1,4496,16,0.200743,0.0,0.62963,0.12998,0.1875,0.71831
1,1,5520,16,0.156513,0.0625,0.684211,0.147158,0.0625,0.727273
965,965,4869200,15440,0.082943,0.5,0.795455,0.186518,0.5,0.842105
1933,1933,9608144,30928,0.138885,0.75,0.931034,0.181631,0.5,0.862069
2722,2722,13841824,43552,0.166229,0.5,0.757576,0.21348,0.5,0.861538
3489,3489,17930768,55824,0.143449,0.625,0.938144,0.136962,0.4375,0.818182
4257,4257,22124048,68112,0.145425,0.4375,0.836066,0.187866,0.5625,0.911392
5029,5029,26357712,80464,0.115188,0.625,0.890909,0.202206,0.625,0.904762
5804,5804,30565184,92864,0.124534,0.75,0.947368,0.162558,0.4375,0.821429
6579,6579,34675376,105264,0.135424,0.5,0.897436,0.146491,0.5,0.868852
