In [1]:
import json
import os

from datasets import load_dataset

# Build a small retrieval corpus from the ragas-wikiqa dataset and persist it
# as a JSON object mapping "<index>" -> context text.
MAX_CONTEXTS = 300  # number of distinct context passages to keep
output_file = "./data/context.json"

ds = load_dataset("explodinggradients/ragas-wikiqa")

# Extract contexts: every row's "context" is iterated as a sequence of
# passages, so flatten all rows and drop duplicates.
# dict.fromkeys keeps first-seen order. The previous set() iterated in hash
# order, which changes between interpreter sessions for strings, so the same
# id could point at a different passage after a kernel restart.
contexts = ds["train"]["context"]
distinct_texts = list(dict.fromkeys(text for sublist in contexts for text in sublist))

# Only consider the first MAX_CONTEXTS contexts, keyed by their position.
unique_contexts = {f"{i}": text for i, text in enumerate(distinct_texts[:MAX_CONTEXTS])}

# Save to JSON; create the target directory so a fresh checkout does not fail.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "w") as f:
    json.dump(unique_contexts, f, indent=4)

print("Corpus size:", len(unique_contexts))

print(f"Unique contexts saved to {output_file}")

  from .autonotebook import tqdm as notebook_tqdm


Corpus size: 300
Unique contexts saved to ./data/context.json


In [26]:
# Read back the id -> context mapping stored as JSON on disk.
corpus_file = "./data/context.json"
with open(corpus_file) as corpus_handle:
    all_contexts = json.load(corpus_handle)

# Keep only the entries whose numeric id is 120 or higher.
# NOTE(review): 120 looks like a manual resume offset - confirm before re-running.
corpus = {}
for context_id, context_text in all_contexts.items():
    if int(context_id) < 120:
        continue
    corpus[context_id] = context_text

print("Corpus: ", corpus)



In [None]:
from llama_index.llms.openai import OpenAI
import getpass
import os

# Do not hardcode the API key, and do not overwrite an already exported key
# with an empty string: use OPENAI_API_KEY from the environment and only
# prompt for it (without echoing) when it is missing or blank.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# Initialize the OpenAI LLM client with the "gpt-4" model.
llm = OpenAI(model="gpt-4")

import uuid
import json

# Path of the JSON file that accumulates generated data across runs.
output_file = "./data/queries_corpus_com.json"

# Start from an empty dataset with three sections: queries, corpus and
# relevant_docs (each section gets its own, independent dict).
# The corpus is not loaded here; the rest of this cell reads the `corpus`
# variable, which must already exist in the kernel.
result = {section: {} for section in ("queries", "corpus", "relevant_docs")}

# When the file is already on disk, continue from its contents instead.
checkpoint_found = os.path.exists(output_file)
if checkpoint_found:
    with open(output_file) as checkpoint:
        result = json.load(checkpoint)



# Generate questions for every corpus entry with the LLM and store them in
# `result` as:
#   result["corpus"][corpus_uuid]       -> context text
#   result["queries"][query_uuid]       -> one generated question
#   result["relevant_docs"][query_uuid] -> [corpus_uuid]
# The state is written to disk after each entry so an interrupted run can be
# resumed by re-running the cell.
checkpoint_file = output_file  # checkpoint path assigned earlier in this cell

# Texts already stored in the loaded checkpoint are skipped. Previously a
# re-run generated them again under fresh UUIDs, duplicating documents and
# queries and repeating the LLM calls.
# Assumes the corpus values are strings (hashable) - TODO confirm.
already_processed = set(result["corpus"].values())

print(f"Corpus size: {len(corpus)}")
print("Generating questions...")
# Iterate over corpus entries
for corpus_id, corpus_text in corpus.items():
    if corpus_text in already_processed:
        print(f"Skipping corpus entry {corpus_id} (already in checkpoint).")
        continue

    # Create a prompt to generate questions
    prompt = f"Generate 30 meaningful questions related to the context provided, return the list of questions each in a new line, without any additional text , numbering or explanation.\n\nContext:{corpus_text}"
    print(f"Generating questions for corpus entry {corpus_id}...")
    # Generate questions using LlamaIndex; one question is expected per line.
    response = llm.complete(prompt)
    questions = [line.strip() for line in response.text.strip().split("\n") if line.strip()]

    # Register the corpus entry only after the LLM call returned, so a failed
    # call cannot leave a document without any queries in `result`.
    corpus_uuid = str(uuid.uuid4())
    result["corpus"][corpus_uuid] = corpus_text

    # Every non-empty line becomes a query that points back at this document.
    for question in questions:
        query_uuid = str(uuid.uuid4())
        result["queries"][query_uuid] = question
        result["relevant_docs"][query_uuid] = [corpus_uuid]
    already_processed.add(corpus_text)

    print("Questions generated successfully!")
    # Save the current state to JSON file after each corpus processing
    with open(checkpoint_file, "w") as f:
        json.dump(result, f, indent=4)

# Save the result to a JSON file
output_file = "queries_corpus.json"
with open(output_file, "w") as f:
    json.dump(result, f, indent=4)

print(f"JSON saved to {output_file}")


Corpus size: 180
Generating questions...
Generating questions for corpus entry 120...
Questions generated successfully!
Generating questions for corpus entry 121...
Questions generated successfully!
Generating questions for corpus entry 122...
Questions generated successfully!
Generating questions for corpus entry 123...
Questions generated successfully!
Generating questions for corpus entry 124...
Questions generated successfully!
Generating questions for corpus entry 125...
Questions generated successfully!
Generating questions for corpus entry 126...
Questions generated successfully!
Generating questions for corpus entry 127...
Questions generated successfully!
Generating questions for corpus entry 128...
Questions generated successfully!
Generating questions for corpus entry 129...
Questions generated successfully!
Generating questions for corpus entry 130...
Questions generated successfully!
Generating questions for corpus entry 131...
Questions generated successfully!
Generating 

In [14]:
# Write the accumulated `result` mapping to a separately named JSON file in
# the current working directory, then report where it went.
output_file = "queries_corpus_120.json"
with open(output_file, mode="w") as out_handle:
    json.dump(result, out_handle, indent=4)

print(f"JSON saved to {output_file}")

JSON saved to queries_corpus_120.json
