# OpenAI API Fine Tuning for JAQKET dataset

This is sample code for fine-tuning a model through the [OpenAI API](https://platform.openai.com/) with the [JAQKET](https://www.nlp.ecei.tohoku.ac.jp/projects/jaqket/) dataset.

## Setup

To run this notebook, set your organization ID and API key in the `OPENAI_ORGANIZATION` and `OPENAI_API_KEY` environment variables.

In [None]:
!pip install openai requests

In [None]:
import os
import json
import requests
import pandas as pd
import openai


# Credentials are read from the environment so they are never stored in the notebook.
OPENAI_ORGANIZATION = os.environ.get("OPENAI_ORGANIZATION")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Download URLs of the JAQKET train and dev splits (JSON Lines files).
JAQKET_TRAIN_DATASET = "https://jaqket.s3.ap-northeast-1.amazonaws.com/data/aio_02/aio_02_train.jsonl"
JAQKET_DEV_DATASET = "https://jaqket.s3.ap-northeast-1.amazonaws.com/data/aio_02/aio_02_dev_v1.0.jsonl"


# Stop immediately when either credential is missing instead of failing later
# in the middle of an API call.
if None in (OPENAI_ORGANIZATION, OPENAI_API_KEY):
    raise Exception("Please set the OPENAI_ORGANIZATION and OPENAI_API_KEY environment variables for organization and api key.")


## Prepare dataset

In [None]:
def read_jaqket_dataset(dataset_url: str) -> pd.DataFrame:
    """Load a JAQKET JSON Lines file, downloading it into ``data/`` on first use.

    The file is stored as ``data/<basename of dataset_url>``. When that file
    already exists it is read directly and no network request is made.

    Args:
        dataset_url: URL of a JSON Lines file (one JSON object per line).

    Returns:
        A DataFrame with one row per line of the file.

    Raises:
        requests.HTTPError: If the download responds with an error status.
    """
    file_name = os.path.basename(dataset_url)
    location = os.path.join("data", file_name)
    if not os.path.exists(location):
        # A timeout keeps the notebook from hanging forever on a stalled connection.
        response = requests.get(dataset_url, timeout=60)
        # Fail loudly rather than caching an HTTP error page as if it were the dataset.
        response.raise_for_status()
        # The cache directory does not exist on a fresh checkout.
        os.makedirs(os.path.dirname(location), exist_ok=True)
        with open(location, mode="wb") as f:
            f.write(response.content)

    return pd.read_json(location, lines=True)

In [None]:
# Load the JAQKET train and dev splits into DataFrames.
df_train = read_jaqket_dataset(JAQKET_TRAIN_DATASET)
df_dev = read_jaqket_dataset(JAQKET_DEV_DATASET)

## Fine tuning

In [None]:
# Prompt layout used for both fine-tuning and inference: a "Question:" header
# (質問), the question text, then an "Answer:" header (回答) ending in a newline.
PROMPT_TEMPLATE = "質問:\n{instruction}\n\n回答:\n"


def convert_for_fine_tune(df: pd.DataFrame) -> pd.DataFrame:
    """Turn question/answers rows into prompt/completion rows for fine-tuning.

    Args:
        df: Frame with a ``question`` column and an ``answers`` column whose
            values are indexable sequences of answers.

    Returns:
        A new frame with the same index and two columns: ``prompt`` (the
        question wrapped in ``PROMPT_TEMPLATE``) and ``completion`` (the first
        answer with a leading space and a trailing newline).
    """

    def to_prompt(question):
        return PROMPT_TEMPLATE.format(instruction=question)

    def to_completion(answers):
        # Only the first listed answer is used as the training target.
        return f" {answers[0]}\n"

    return pd.DataFrame(
        {
            "prompt": df["question"].map(to_prompt),
            "completion": df["answers"].map(to_completion),
        }
    )


# File name (placed under data/ by the cells below) for the fine-tuning records.
fine_tune_file_name = "jaqket_fine_tune.jsonl"
# Build prompt/completion pairs from the training split.
df_fine_tune = convert_for_fine_tune(df_train)
# Keep only the first row for each distinct prompt.
df_fine_tune = df_fine_tune.drop_duplicates("prompt")
# Preview the first three records.
df_fine_tune.head(3)

In [None]:
# Write the records as JSON Lines (one JSON object per row) under data/.
df_fine_tune.to_json(f"data/{fine_tune_file_name}", orient="records", lines=True)

In [None]:
# Run the OpenAI CLI data-preparation tool on the fine-tuning file.
# NOTE(review): -q is understood to auto-accept the tool's suggestions without
# prompting — confirm against the installed CLI version.
!openai tools fine_tunes.prepare_data -f data/{fine_tune_file_name} -q

In [None]:
# Start a fine-tuning job with data/<fine_tune_file_name> as the training file (-t).
# NOTE(review): this passes the file written by the notebook, not any output
# file produced by the prepare_data step — confirm this is intended.
!openai api fine_tunes.create -t data/{fine_tune_file_name}

In [None]:
# List the fine-tuning jobs for this account.
# NOTE(review): the fine-tuned model name is presumably shown in this output —
# confirm before copying it into `model_name`.
!openai api fine_tunes.list

Replace the model name below with the name of the model produced by your own fine-tuning job.

In [None]:
# Identifier of the fine-tuned model to query; replace it with the model name
# from your own fine-tuning job.
model_name = "curie:ft-personal-2023-06-11-03-26-26"

## Answer a quiz question with the fine-tuned model

In [None]:
def answer(model: str, question: str) -> str:
    """Ask a completion model a single quiz question and return its answer.

    The question is wrapped in ``PROMPT_TEMPLATE`` and sent through
    ``openai.Completion.create``. Generation is capped at 32 tokens and stops
    at the first newline.

    Args:
        model: Name of the model to query.
        question: Question text to insert into the prompt.

    Returns:
        The text of the first returned choice, stripped of surrounding
        whitespace.
    """
    # Credentials are applied on every call so the function works on its own.
    openai.organization = OPENAI_ORGANIZATION
    openai.api_key = OPENAI_API_KEY

    request = {
        "model": model,
        "prompt": PROMPT_TEMPLATE.format(instruction=question),
        "max_tokens": 32,
        "temperature": 0.1,
        "top_p": 1,
        "n": 1,
        "stop": "\n",
    }
    completion = openai.Completion.create(**request)

    first_choice = completion["choices"][0]
    return first_choice["text"].strip()

## Answer every question in the dataset

In [None]:
from tqdm import tqdm


def answer_jaqket(model: str, question_df: pd.DataFrame) -> pd.DataFrame:
    """Answer every question in ``question_df`` and record whether it matched.

    ``answer`` is called once per row. Two columns are added to
    ``question_df`` in place: ``chatgpt_answer`` (the model's answer) and
    ``match`` (True when that answer is one of the row's ``answers``). A
    ``Result: <matches>/<rows>`` summary line is printed.

    Args:
        model: Name of the model to query.
        question_df: Frame with a ``question`` column and an ``answers``
            column holding the accepted answers for each question.

    Returns:
        The same ``question_df`` object, with the two columns added.
    """
    chatgpt_answers = []
    matches = []
    # iterrows() is a generator, so tqdm needs the total to show real progress.
    for _, row in tqdm(question_df.iterrows(), total=len(question_df)):
        chatgpt_answer = answer(model, row["question"])
        chatgpt_answers.append(chatgpt_answer)
        matches.append(chatgpt_answer in row["answers"])

    # Assign plain lists so values are placed by position. Wrapping them in a
    # pd.Series would align on a fresh 0..n-1 index and yield NaN for any frame
    # whose index is not the default range (e.g. a filtered or sampled frame).
    question_df["chatgpt_answer"] = chatgpt_answers
    question_df["match"] = matches
    print(f"Result: {sum(matches)}/{len(question_df)}")
    return question_df


# File name (under data/) for the CSV holding the model's answers on the dev split.
answer_file_name = "jaqket_answers_with_fine_tune.csv"
# Query the fine-tuned model for every dev question and record whether each answer matched.
answers = answer_jaqket(model_name, df_dev)
# Save the results without writing the DataFrame index as a column.
answers.to_csv(f"data/{answer_file_name}", index=False)