This notebook was inpired by [this LlamaIndex example](https://colab.research.google.com/drive/1NgyCJVyrC2xcZ5lxt2frTU862v6eJHlc?usp=sharing#scrollTo=QOAAE83mPwYd)

Making some changes to it with the only intention of trying ideas and learning.

Notice that I am assuming you have the relevant API_KEYs as environmental variables.

In [1]:
# %pip install llama-index-finetuning
# %pip install llama-index-finetuning-callbacks
# %pip install llama-index-llms-openai
# %pip install llama-index pypdf sentence-transformers ragas

In [3]:
from bubls.utils.data.download import download_file_from_url
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
    Settings,
    ServiceContext
)
from llama_index.llms.openai import OpenAI
from llama_index.core.evaluation import DatasetGenerator
from llama_index.core.callbacks import CallbackManager
from llama_index.finetuning.callbacks import OpenAIFineTuningHandler
from llama_index.finetuning import OpenAIFinetuneEngine
from ragas import evaluate
from ragas.metrics import answer_relevancy, faithfulness
from datasets import Dataset
import random
import os
import nest_asyncio
nest_asyncio.apply()

## Defining Global Variables

In [2]:
METADATA = {
    "questions": {
        "ipcc_ch3": {
            "source_url": "https://www.ipcc.ch/report/ar6/wg2/downloads/report/IPCC_AR6_WGII_Chapter03.pdf",
            "file_name": "IPCC_AR6_WGII_Chapter03.pdf",
            "save_data_to": os.path.join(os.environ["DATA_DIR"], "ipcc_ch3"),
        }
    },
}

PERSIST_FINETUNE_DATA_TO = os.path.join(os.environ["PERSIST_DIR"], "eg1_finetune_llm")

QUESTION_GEN_QUERY = (
    "You are a Teacher/ Professor. Your task is to setup "
    "a quiz/examination. Using the provided context, formulate "
    "a single question that captures an important fact from the "
    "context. Restrict the question to the context information provided."
)

# limit the context window to test refine process
Settings.context_window = 2048


## Ingest Data
- Download Information
- Use simple directory reader to load data
- Create a baseline query_engine with gpt3.5 from index based on documents
- Use DatasetGenerator to generate a dataset based on a query request.

In [3]:
gpt_35_llm = OpenAI(model="gpt-3.5-turbo", temperature=0.3)

In [13]:
data = {}
documents = {}
query_engine_dict = {}
for split in METADATA:
    files = []
    for k, md in METADATA[split].items():
        files.append(
            download_file_from_url(md["source_url"], md["file_name"], md["save_data_to"])
        )
    data_path = os.path.join(PERSIST_FINETUNE_DATA_TO, f"{split}.txt")
    index_path = os.path.join(PERSIST_FINETUNE_DATA_TO, "index")
    documents[split] = SimpleDirectoryReader(input_files=files).load_data()
    if not os.path.exists(data_path):
        print("Generating index")
        index = VectorStoreIndex.from_documents(documents[split])
        index.storage_context.persist(persist_dir=index_path)
        print("Generating questions")
        os.makedirs(PERSIST_FINETUNE_DATA_TO, exist_ok=True)
        random.shuffle(documents[split])
        dataset_generator = DatasetGenerator.from_documents(
            documents[split][:50],
            question_gen_query=QUESTION_GEN_QUERY,
            llm=gpt_35_llm,
        )

        questions = dataset_generator.generate_questions_from_nodes(num=10)
        with open(data_path, "w") as f:
            for question in questions:
                f.write(question + "\n")
    else:
        print("Loading index")
        storage_context = StorageContext.from_defaults(persist_dir=index_path)
        index = load_index_from_storage(storage_context)
        print("Loading questions")
        questions = []
        with open(data_path, "r") as f:
            for line in f:
                questions.append(line.strip())
query_engine_dict[f"baseline"] = index.as_query_engine(similarity_top_k=2, llm=gpt_35_llm)

Loading index
Loading questions


## Evaluate baseline: gpt3.5

In [5]:
contexts = []
answers = []

for question in questions:
    response = query_engine_dict["baseline"].query(question)
    contexts.append([x.node.get_content() for x in response.source_nodes])
    answers.append(str(response))

ds = Dataset.from_dict(
    {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
    }
)

result = evaluate(ds, [answer_relevancy, faithfulness])
print(result)

{'ragas_score': 0.8356, 'answer_relevancy': 0.9725, 'faithfulness': 0.7325}


## Use GPT4 to collect training data to distill into 3.5

In [19]:
# OpenAIFineTuningHandler callback automatically logs questions/answers to a dataset
finetuning_handler = OpenAIFineTuningHandler()
callback_manager = CallbackManager([finetuning_handler])

gpt_4_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-4", temperature=0.3),
    context_window=Settings.context_window,
    callback_manager=callback_manager,
)

index = VectorStoreIndex.from_documents(documents["questions"], service_context=gpt_4_context)
query_engine_dict["gpt-4"] = index.as_query_engine(similarity_top_k=2)

for question in questions:
    response = query_engine_dict["gpt-4"].query(question)


  gpt_4_context = ServiceContext.from_defaults(


## Fine-tune engine

In [23]:
finetuning_handler.save_finetuning_events("finetuning_events.jsonl")
finetune_engine = OpenAIFinetuneEngine(
    "gpt-3.5-turbo",
    "finetuning_events.jsonl",
    # start_job_id="<start-job-id>"  # use if you have an existing job
)

# finetune_engine = OpenAIFinetuneEngine.from_finetuning_handler(
#     finetuning_handler,
#     "gpt-3.5-turbo",
#     "tmp.jsonl"
# )

finetune_engine.finetune()
ft_llm = finetune_engine.get_finetuned_model(temperature=0.3)

In [None]:
# finetune_engine.get_current_job()

## Evaluate fine-tuned

In [24]:
ft_context = ServiceContext.from_defaults(
    llm=ft_llm,
    context_window=Settings.context_window,
)

In [25]:
contexts_ft = []
answers_ft = []

for question in questions:
    response = query_engine_dict["baseline"].query(question)
    contexts_ft.append([x.node.get_content() for x in response.source_nodes])
    answers_ft.append(str(response))

ds_ft = Dataset.from_dict(
    {
        "question": questions,
        "answer": answers_ft,
        "contexts": contexts_ft,
    }
)

result_ft = evaluate(ds_ft, [answer_relevancy, faithfulness])
print(result_ft)

{'ragas_score': 0.8680, 'answer_relevancy': 0.9607, 'faithfulness': 0.7917}
