## Description

Generate a set of evaluation questions from the REG handbook documents and save them to a JSON file.

## Setup

In [None]:
import nest_asyncio

# Apply the nest_asyncio patch before anything else runs.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and later cells make async calls — confirm.
nest_asyncio.apply()

In [None]:
import logging
import sys
import pandas as pd
import os

# Route log records at INFO level and above to stdout so they appear in the
# cell output.
# NOTE(review): basicConfig may already attach a stdout handler to the root
# logger, in which case the explicit handler below makes every message print
# twice — confirm whether both are wanted.
root_logger = logging.getLogger()
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
root_logger.addHandler(logging.StreamHandler(sys.stdout))

In [None]:
from llama_index.core.evaluation import DatasetGenerator, RelevancyEvaluator
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Response
from llama_index.llms.openai import OpenAI
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

from llama_index.llms.azure_openai import AzureOpenAI

from reginald.models.models.llama_index import DataIndexCreator

import os
from reginald.utils import get_env_var

from llama_index.readers.github import (
    GithubClient,
    GitHubIssuesClient,
    GitHubRepositoryIssuesReader,
    GithubRepositoryReader,
)

## Set up Azure GPT4

In [None]:
# Configure the Azure OpenAI GPT-4 deployment used as the LLM.
# The API key is read from the environment; a missing variable raises KeyError.
openai_azure_api_key = os.environ["OPENAI_AZURE_API_KEY"]
azure_endpoint = "https://reginald-uk-south.openai.azure.com/"
api_version = "2024-02-01"

azure_gpt4 = AzureOpenAI(
    api_key=openai_azure_api_key,
    api_version=api_version,
    azure_endpoint=azure_endpoint,
    deployment_name="reginald-gpt4",
    model="gpt-4",
    timeout=120,
)

In [None]:
# Setup settings for vectorisation later
from reginald.models.models.llama_index import setup_settings
from reginald.models.setup_llm import DEFAULT_ARGS
from transformers import AutoTokenizer

from reginald.models.models.llama_index import (
    setup_settings,
    LlamaIndexLlamaCPP,
    set_global_tokenizer,
    compute_default_chunk_size,
)

In [None]:
# set up settings
# Compute the chunk size from the same max_input_size and k that are passed to
# setup_settings below, rather than from separately hard-coded numbers, so the
# two cannot drift apart.
chunk_size = compute_default_chunk_size(
    max_input_size=DEFAULT_ARGS["max_input_size"],
    k=DEFAULT_ARGS["k"],
)

# Use the encode method of the Llama-2 chat tokenizer as the tokenizer callable.
# NOTE(review): the LLM configured for this notebook is Azure GPT-4, so token
# counts from a Llama-2 tokenizer are presumably only approximate — confirm
# this is intended.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf").encode

set_global_tokenizer(tokenizer)

settings = setup_settings(
    llm=azure_gpt4,
    max_input_size=DEFAULT_ARGS["max_input_size"],
    num_output=DEFAULT_ARGS["num_output"],
    chunk_size=chunk_size,
    chunk_overlap_ratio=DEFAULT_ARGS["chunk_overlap_ratio"],
    k=DEFAULT_ARGS["k"],
    tokenizer=tokenizer,
)

## Download the data

In [None]:
# Load the REG handbook from GitHub as documents.
gh_token = get_env_var("GITHUB_TOKEN")

owner = "alan-turing-institute"
repo = "REG-handbook"

# Both filters below use the INCLUDE filter type: ".md" file extensions and
# the "content" directory.
include = GithubRepositoryReader.FilterType.INCLUDE
github_client = GithubClient(gh_token, fail_on_http_error=False)

handbook_loader = GithubRepositoryReader(
    github_client,
    owner=owner,
    repo=repo,
    verbose=False,
    concurrent_requests=1,
    timeout=60,
    retries=3,
    filter_directories=(["content"], include),
    filter_file_extensions=([".md"], include),
)

# Read from the "main" branch.
handbook_data = handbook_loader.load_data(branch="main")

## Generate the questions

In [None]:
# NOTE(review): data_dir is not referenced by any cell visible in this
# notebook (the documents come from the GitHub handbook loader) — confirm
# whether it can be dropped.
data_dir = "../../data/paul_graham"
# Value passed as `num` when generating questions below.
num_question = 100

In [None]:
# Build a question generator from the handbook documents loaded above.
# NOTE(review): no llm is passed here, so the generator presumably falls back
# to the globally configured LLM — confirm.
data_generator = DatasetGenerator.from_documents(handbook_data)

In [None]:
# Generate the evaluation questions, passing num_question as `num`
# (presumably an upper bound on how many questions are returned — confirm).
eval_questions = data_generator.generate_questions_from_nodes(num=num_question)

In [None]:
# Display the generated questions in the cell output.
eval_questions

## Save the questions

In [None]:
import json

save_name = "handbook_eval_questions.json"
save_dir = "../../data/evaluations"

# Create the output directory if it does not exist yet; otherwise open() below
# raises FileNotFoundError on a fresh checkout.
os.makedirs(save_dir, exist_ok=True)

# Save to a JSON file (explicit encoding so the result does not depend on the
# platform default).
with open(os.path.join(save_dir, save_name), "w", encoding="utf-8") as file:
    json.dump(eval_questions, file)

In [None]:
# Read the saved questions back from the JSON file written above.
eval_questions_path = os.path.join("../../data/evaluations", save_name)
with open(eval_questions_path) as file:
    loaded_eval_questions = json.load(file)

In [None]:
# Display the questions read back from disk in the cell output.
loaded_eval_questions