## Description

Generate a set of evaluation questions from the REG handbook using GPT-4 (hosted on Azure OpenAI), and save them to a JSON file.

## Setup

In [1]:
import nest_asyncio

# Apply the nest_asyncio patch before any other cell runs.
# NOTE(review): presumably needed so async llama-index calls can run inside
# Jupyter's already-running event loop — confirm.
nest_asyncio.apply()

In [2]:
import logging
import sys
import pandas as pd
import os

# Send INFO-level logs to stdout so they appear in the cell output.
# basicConfig already installs one stdout handler on the root logger; attaching
# a second StreamHandler made every record print twice (once formatted, once bare).
# force=True replaces any existing root handlers, so re-running this cell
# does not accumulate handlers.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [3]:
from llama_index.core.evaluation import DatasetGenerator, RelevancyEvaluator
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Response
from llama_index.llms.openai import OpenAI
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

from llama_index.llms.azure_openai import AzureOpenAI

from reginald.models.models.llama_index import DataIndexCreator

import os
from reginald.utils import get_env_var

from llama_index.readers.github import (
    GithubClient,
    GitHubIssuesClient,
    GitHubRepositoryIssuesReader,
    GithubRepositoryReader,
)

  from .autonotebook import tqdm as notebook_tqdm



## Set up Azure GPT4

In [4]:
# Set up the LLM
# The API key is read from the environment; a missing variable raises KeyError here.
openai_azure_api_key = os.environ["OPENAI_AZURE_API_KEY"]
# No trailing slash: with one, the request URLs logged by httpx contained
# a double slash ("...azure.com//openai/deployments/...").
azure_endpoint = "https://reginald-uk-south.openai.azure.com"
api_version = "2024-02-01"

# GPT-4 served from the "reginald-gpt4" Azure deployment, with a 120 second timeout
azure_gpt4 = AzureOpenAI(
    model="gpt-4",
    deployment_name="reginald-gpt4",
    api_key=openai_azure_api_key,
    azure_endpoint=azure_endpoint,
    api_version=api_version,
    timeout=120,
)

In [5]:
# Setup settings for vectorisation later
from reginald.models.models.llama_index import setup_settings
from reginald.models.setup_llm import DEFAULT_ARGS
from transformers import AutoTokenizer

from reginald.models.models.llama_index import (
    setup_settings,
    LlamaIndexLlamaCPP,
    set_global_tokenizer,
    compute_default_chunk_size,
)

In [6]:
# Compute the chunk size for a 4096-token input size with k=3.
# NOTE(review): these two literals are hard-coded here, while setup_settings
# below reads max_input_size and k from DEFAULT_ARGS — confirm they agree.
chunk_size = compute_default_chunk_size(max_input_size=4096, k=3)

# Use the `encode` method of the Llama-2 chat tokenizer as the tokenizer callable
llama2_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
tokenizer = llama2_tokenizer.encode
set_global_tokenizer(tokenizer)

# Build the settings with Azure GPT-4 as the LLM; the remaining values come from DEFAULT_ARGS
settings = setup_settings(
    llm=azure_gpt4,
    max_input_size=DEFAULT_ARGS["max_input_size"],
    num_output=DEFAULT_ARGS["num_output"],
    chunk_size=chunk_size,
    chunk_overlap_ratio=DEFAULT_ARGS["chunk_overlap_ratio"],
    k=DEFAULT_ARGS["k"],
    tokenizer=tokenizer,
)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
Use pytorch device_name: mps
INFO:root:Settings llm: callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x32b880050> system_prompt=None messages_to_prompt=<function messages_to_prompt at 0x309eb8a40> completion_to_prompt=<function default_completion_to_prompt at 0x30a827420> output_parser=None pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'> query_wrapper_prompt=None model='gpt-4' temperature=0.1 max_tokens=None logprobs=None top_logprobs=0 additional_kwargs={} max_retries=3 timeout=120.0 default_headers=None reuse_client=True api_key='<REDACTED>' api_base='https://api.openai.com/v1' api_version='2024-02-01' engine='reginald-gpt4' azure_endpoint='https:/

## Download the data

In [7]:
# GitHub token is fetched through the project's get_env_var helper
gh_token = get_env_var("GITHUB_TOKEN")

owner = "alan-turing-institute"
repo = "REG-handbook"

# Client is created with fail_on_http_error=False
github_client = GithubClient(gh_token, fail_on_http_error=False)
include_filter = GithubRepositoryReader.FilterType.INCLUDE

# Reader restricted to ".md" files and the "content" directory, one request at a time
handbook_loader = GithubRepositoryReader(
    github_client,
    owner=owner,
    repo=repo,
    verbose=False,
    concurrent_requests=1,
    timeout=60,
    retries=3,
    filter_file_extensions=([".md"], include_filter),
    filter_directories=(["content"], include_filter),
)

# Load the documents from the "main" branch
handbook_data = handbook_loader.load_data(branch="main")

INFO:root:Trying to get environment variable 'GITHUB_TOKEN'
Trying to get environment variable 'GITHUB_TOKEN'
INFO:root:Got environment variable 'GITHUB_TOKEN' successfully
Got environment variable 'GITHUB_TOKEN' successfully
INFO:httpx:HTTP Request: GET https://api.github.com/repos/alan-turing-institute/REG-handbook/branches/main "HTTP/1.1 200 OK"
HTTP Request: GET https://api.github.com/repos/alan-turing-institute/REG-handbook/branches/main "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.github.com/repos/alan-turing-institute/REG-handbook/git/trees/d7b2ebea252812189fa90658ea35c5e540b24bba "HTTP/1.1 200 OK"
HTTP Request: GET https://api.github.com/repos/alan-turing-institute/REG-handbook/git/trees/d7b2ebea252812189fa90658ea35c5e540b24bba "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.github.com/repos/alan-turing-institute/REG-handbook/git/trees/4da2859bfa340a953272504a0bfb038b547afa28 "HTTP/1.1 200 OK"
HTTP Request: GET https://api.github.com/repos/alan-turing-i

## Generate the questions

In [8]:
# NOTE(review): data_dir is not referenced by any other visible cell, and the
# documents are loaded from the REG-handbook GitHub repo instead — this looks
# like a leftover; confirm before removing.
data_dir = "../../data/paul_graham"
# Passed as `num` to generate_questions_from_nodes in a later cell
num_question = 100

In [9]:
# Build a question generator over the handbook documents loaded from GitHub.
# No LLM is passed explicitly here.
# NOTE(review): presumably it uses the LLM configured by setup_settings — confirm.
data_generator = DatasetGenerator.from_documents(handbook_data)

INFO:datasets:PyTorch version 2.3.0 available.
PyTorch version 2.3.0 available.


  return cls(


In [10]:
# Generate the evaluation questions, passing num_question as `num`.
# The logged requests show this step calls the Azure "reginald-gpt4" deployment.
eval_questions = data_generator.generate_questions_from_nodes(num=num_question)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:httpx:HTTP Request: POST https://reginald-uk-south.openai.azure.com//openai/deployments/reginald-gpt4/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
HTTP Request: POST https://reginald-uk-south.openai.azure.com//openai/deployments/reginald-gpt4/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://reginald-uk-south.openai.azure.com//openai/deployments/reginald-gpt4/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
HTTP Request: POST https://reginald-uk-south.openai.azure.com//openai/deployments/reginald-gpt4/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://reginald-uk-south.openai.azure.com//openai/deployments/reginald-gpt4/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
HTTP Request: POST https://reginald-uk-south.openai.azure.com//openai/deployments/reginald-gpt4/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://regin

  return QueryResponseDataset(queries=queries, responses=responses_dict)


In [12]:
# Display the generated questions
eval_questions

['What is the full name of the institute that the Research Engineering Group is a part of?',
 'What is the primary purpose of the Research Engineering Group Handbook?',
 'Name two other open, community-driven books that inspired the creation of the REG Handbook.',
 'Who is the intended audience for the Research Engineering Group Handbook?',
 'How can the REG Handbook be useful to someone who is not a member of the Research Engineering Group?',
 'Where can one find more information about the Research Engineering Group and its activities?',
 'What type of content does the file named "_index.md" contain?',
 'What is the title of the document as specified in the metadata of the file?',
 'What is the URL where the "_index.md" file is located?',
 'According to the context, where might one look for job opportunities with the Research Engineering Group?',
 'What is the title of the document found at the provided file path?',
 'In the hierarchy of the documentation, what numerical weight has be

## Save the questions

In [13]:
import json

save_name = "handbook_eval_questions.json"
save_dir = '../../data/evaluations'

# Create the output directory if it is missing; otherwise open() raises
# FileNotFoundError on a checkout where the directory does not exist yet.
os.makedirs(save_dir, exist_ok=True)

# Save to a JSON file (explicit encoding so the result does not depend on the platform default)
with open(os.path.join(save_dir, save_name), 'w', encoding="utf-8") as file:
    json.dump(eval_questions, file)

In [14]:
# Read the saved questions back from the JSON file
eval_questions_path = os.path.join('../../data/evaluations', save_name)
with open(eval_questions_path, 'r') as questions_file:
    loaded_eval_questions = json.load(questions_file)

In [15]:
# Display the questions read back from the JSON file
loaded_eval_questions

['What is the full name of the institute that the Research Engineering Group is a part of?',
 'What is the primary purpose of the Research Engineering Group Handbook?',
 'Name two other open, community-driven books that inspired the creation of the REG Handbook.',
 'Who is the intended audience for the Research Engineering Group Handbook?',
 'How can the REG Handbook be useful to someone who is not a member of the Research Engineering Group?',
 'Where can one find more information about the Research Engineering Group and its activities?',
 'What type of content does the file named "_index.md" contain?',
 'What is the title of the document as specified in the metadata of the file?',
 'What is the URL where the "_index.md" file is located?',
 'According to the context, where might one look for job opportunities with the Research Engineering Group?',
 'What is the title of the document found at the provided file path?',
 'In the hierarchy of the documentation, what numerical weight has be