## Data preparation

### i. Get SQuAD v2.0 Dataset

In [5]:
import os
import requests

# create the target directory; exist_ok keeps the cell safe to re-run
os.makedirs('squad', exist_ok=True)

url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'

# download the training and dev splits of SQuAD v2.0
for file in ['train-v2.0.json', 'dev-v2.0.json']:

    # stream the body so the whole file is never held in memory at once;
    # the timeout stops the cell from hanging forever on a stalled connection
    with requests.get(f'{url}{file}', stream=True, timeout=60) as res:
        # fail loudly on 4xx/5xx instead of saving an error page as "JSON"
        res.raise_for_status()

        # write to file in 64 KiB chunks (4-byte chunks mean millions of tiny writes)
        with open(f'squad/{file}', 'wb') as f:
            for chunk in res.iter_content(chunk_size=64 * 1024):
                f.write(chunk)

### ii. Extract JSON Info

In [6]:
import json

def read_squad(path):
    """Flatten a SQuAD-format JSON file into three parallel lists.

    For every question, the answer entries are taken from
    'plausible_answers' when that key is present on the question and from
    'answers' otherwise. One (context, question, answer) triple is emitted
    per answer entry, so a question with several answers appears several
    times and a question with no entries is dropped.

    Args:
        path: location of the JSON file to read.

    Returns:
        A tuple (contexts, questions, answers) of equal-length lists, where
        answers holds the answer dictionaries exactly as found in the file.
    """
    # open JSON file and load into a dictionary
    with open(path, 'rb') as handle:
        payload = json.load(handle)

    contexts, questions, answers = [], [], []

    for article in payload['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                # pick the key that holds this question's answer entries
                key = 'plausible_answers' if 'plausible_answers' in qa else 'answers'
                found = qa[key]
                # repeat context and question once per answer entry
                contexts.extend([context] * len(found))
                questions.extend([question] * len(found))
                answers.extend(found)

    return contexts, questions, answers

# parse the training and validation files into parallel context / question / answer lists
train_split = read_squad('squad/train-v2.0.json')
val_split = read_squad('squad/dev-v2.0.json')
train_contexts, train_questions, train_answers = train_split
val_contexts, val_questions, val_answers = val_split

In [7]:
val_contexts[:5] # same paragraph, as expected

['The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.',
 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates 

In [8]:
val_questions[:5]

['In what country is Normandy located?',
 'In what country is Normandy located?',
 'In what country is Normandy located?',
 'In what country is Normandy located?',
 'When were the Normans in Normandy?']

In [9]:
val_answers[:5]

[{'text': 'France', 'answer_start': 159},
 {'text': 'France', 'answer_start': 159},
 {'text': 'France', 'answer_start': 159},
 {'text': 'France', 'answer_start': 159},
 {'text': '10th and 11th centuries', 'answer_start': 94}]

### iii. Add "answer_end" Values to Answers

In [10]:
def add_end_idx(answers, contexts):
    """Add an 'answer_end' character index to every answer, in place.

    The expected span is context[answer_start:answer_start + len(text)].
    When that slice does not equal the answer text, the span is retried
    shifted 1 and then 2 characters to the left, and the first (closest)
    shift that matches is used to correct 'answer_start' as well.

    If no candidate matches, 'answer_start' is left untouched and
    'answer_end' is set to answer_start + len(text), so every answer
    carries the key afterwards.

    Args:
        answers: list of dicts with 'text' and 'answer_start'; mutated in place.
        contexts: list of context strings, aligned with answers.

    Returns:
        None.
    """
    # loop through each answer-context pair
    for answer, context in zip(answers, contexts):
        # gold_text refers to the answer we are expecting to find in context
        gold_text = answer['text']
        # we already know the (nominal) start index
        start_idx = answer['answer_start']
        # and ideally this would be the end index...
        end_idx = start_idx + len(gold_text)
        # ...however, sometimes the stored offset is off by 1-2 characters,
        # so try the exact position first, then the two left shifts
        for shift in (0, 1, 2):
            shifted_start = start_idx - shift
            if shifted_start < 0:
                # a negative start would wrap around to the end of the string
                continue
            if context[shifted_start:end_idx - shift] == gold_text:
                answer['answer_start'] = shifted_start
                answer['answer_end'] = end_idx - shift
                # stop at the closest match so a farther shift cannot override it
                break
        else:
            # nothing matched: keep the nominal span so the key always exists
            answer['answer_end'] = end_idx
            
# annotate both answer lists (training and validation) with end indices
for _split_answers, _split_contexts in (
    (train_answers, train_contexts),
    (val_answers, val_contexts),
):
    add_end_idx(_split_answers, _split_contexts)

In [11]:
train_answers[:5]

[{'text': 'in the late 1990s', 'answer_start': 269, 'answer_end': 286},
 {'text': 'singing and dancing', 'answer_start': 207, 'answer_end': 226},
 {'text': '2003', 'answer_start': 526, 'answer_end': 530},
 {'text': 'Houston, Texas', 'answer_start': 166, 'answer_end': 180},
 {'text': 'late 1990s', 'answer_start': 276, 'answer_end': 286}]

### iv. Reformatting Data for Fine-tuning with GPT-3

In [12]:
import pandas as pd

def create_fine_tuning_dataset(contexts, questions, answers):
    """Build a prompt/completion DataFrame from aligned QA lists.

    Each row's prompt is the context followed by "Question: ..." and a
    trailing "Answer:" line; the completion is the answer text prefixed
    with a single space.

    Args:
        contexts: list of context strings.
        questions: list of question strings, aligned with contexts.
        answers: list of dicts with a 'text' key, aligned with contexts.

    Returns:
        pandas.DataFrame with one row per (context, question, answer) triple.
    """
    records = [
        {
            "prompt": f"{context}\nQuestion: {question}\nAnswer:",
            "completion": f" {answer['text']}",
        }
        for context, question, answer in zip(contexts, questions, answers)
    ]
    return pd.DataFrame(records)

# build the prompt/completion frames for both splits, then display the training one
df_train, df_val = (
    create_fine_tuning_dataset(ctx, qs, ans)
    for ctx, qs, ans in (
        (train_contexts, train_questions, train_answers),
        (val_contexts, val_questions, val_answers),
    )
)
df_train

Unnamed: 0,prompt,completion
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,in the late 1990s
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,singing and dancing
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,2003
3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"Houston, Texas"
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s
...,...,...
130314,"The term ""matter"" is used throughout physics i...",matter
130315,"The term ""matter"" is used throughout physics i...",Alfvén
130316,"The term ""matter"" is used throughout physics i...",Gk. common matter
130317,"The term ""matter"" is used throughout physics i...",a specifying modifier


In [78]:
# write the full training / validation sets as JSON Lines, one record per line
full_splits = {
    'train': (train_contexts, train_questions, train_answers),
    'val': (val_contexts, val_questions, val_answers),
}
for train_val, (contexts, questions, answers) in full_splits.items():
    ft = create_fine_tuning_dataset(contexts, questions, answers)
    ft.to_json(f'qa_{train_val}.jsonl', orient='records', lines=True)

## Create sample train/val datasets for testing purposes (will be deleted)

In [13]:
# write small samples (first 500 training / first 100 validation rows) as JSON Lines
sample_splits = {
    'train': (train_contexts[:500], train_questions[:500], train_answers[:500]),
    'val': (val_contexts[:100], val_questions[:100], val_answers[:100]),
}
for train_val, (contexts, questions, answers) in sample_splits.items():
    ft = create_fine_tuning_dataset(contexts, questions, answers)
    ft.to_json(f'sample_qa_{train_val}.jsonl', orient='records', lines=True)

## Import OpenAI

In [59]:
!pip install --upgrade openai
!pip install wandb

Collecting wandb
  Downloading wandb-0.12.15-py2.py3-none-any.whl (1.8 MB)
Collecting Click!=8.0.0,>=7.0
  Downloading click-8.1.2-py3-none-any.whl (96 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle
  Downloading setproctitle-1.2.3-cp37-cp37m-win_amd64.whl (10 kB)
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.8-py3-none-any.whl (9.5 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.5.10-py2.py3-none-any.whl (144 kB)
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.9-py3-none-any.whl (63 kB)
Collecting smmap<6,>=3.0.1
  Downloading smmap-5.0.0-py3-none-any.whl (24 kB)
Installing collected packages: smmap, gitdb, shortuuid, setproctitle, sentry-sdk, GitPython, docker-pycreds, Click, wandb
  Attempting uninstall: Click
    Found existing installation: click 8.0.0
    Uninstalling click-8.0.0:
      Success

In [1]:
import os
import openai

# read the key from the environment so it is never hardcoded in the notebook
openai.api_key = os.getenv("OPENAI_API_KEY")
# report only whether a key was found; printing the key itself would store
# the secret in the saved notebook output
print("OPENAI_API_KEY loaded:", bool(openai.api_key))

[REDACTED: an OpenAI API key was printed here. Treat it as compromised and revoke/rotate it.]


## Submit Dataset for Fine-tuning

In [2]:
!openai api fine_tunes.create -t "qa_train.jsonl" -v "qa_val.jsonl" -m "Curie" --batch_size 16

^C


In [None]:
# submit the small sample files for a fine-tune on the "curie" base model,
# with batch size 4 and a single epoch
!openai api fine_tunes.create -t "sample_qa_train.jsonl" -v "sample_qa_val.jsonl" -m "curie" --batch_size 4 --n_epochs 1

In [17]:
!openai wandb sync

No new successful fine-tunes were found


Traceback (most recent call last):
  File "e:\anaconda\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "e:\anaconda\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "E:\anaconda\Scripts\openai.exe\__main__.py", line 7, in <module>
  File "e:\anaconda\lib\site-packages\openai\_openai_scripts.py", line 63, in main
    args.func(args)
  File "e:\anaconda\lib\site-packages\openai\cli.py", line 550, in sync
    print(resp)
UnicodeEncodeError: 'gbk' codec can't encode character '\U0001f389' in position 0: illegal multibyte sequence
