In [1]:
import os
import re
from pathlib import Path

from dotenv import load_dotenv
from llama_index.core import load_index_from_storage
from llama_index.core.storage import StorageContext

# Load environment variables (API keys etc.) from a dotenv file.
# The location can be overridden with the KEYS_ENV_PATH environment variable so
# the notebook is not tied to one machine; the original absolute path is kept
# as the fallback so existing setups behave exactly as before.
load_dotenv(os.environ.get("KEYS_ENV_PATH", "/Users/chris/repos/deep-learning/keys.env"))

True

In [2]:
# Folder holding the persisted index (point this at your own storage directory).
persist_dir = "./storage"

# Rebuild the storage context from that folder, then restore the index from it.
storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
index = load_index_from_storage(storage_context)

# Snapshot of every entry currently held in the index's docstore.
nodes = [*index.docstore.docs.values()]

In [8]:
# Build a retriever on the loaded index, configured with similarity_top_k=10,
# and run it once for the Joyspan query. `joy_nodes` holds the retrieved
# results that the QA-generation loop later iterates over.
retriever = index.as_retriever(similarity_top_k=10)
joy_nodes = retriever.retrieve("what is joyspan")

In [9]:
# Sanity check: how many results the retriever returned for the query.
len(joy_nodes)

10

# Prompt for QA Pairs

In [13]:
from llama_index.llms.openai import OpenAI

# LLM client used to generate the QA pairs.
# NOTE(review): temperature=0.7 presumably makes completions vary between runs,
# so re-running the notebook will not reproduce the recorded outputs exactly.
# NOTE(review): max_tokens=2044 is an unusual value — possibly meant 2048; confirm.
llm = OpenAI(model="gpt-4o-mini", temperature=0.7, max_tokens=2044)

In [15]:
import json

# The instruction text is the same for every node, so assemble it once.
# The single `{}` placeholder receives the node text; the doubled braces in the
# JSON example come out as literal `{` / `}` after `.format()`.
qa_prompt_template = (
    "You are an expert knowledge curator and teacher. Below is a text passage that represents the concept of Joyspan. "
    "Your task is to distill the core concepts, connections, and insights from the text and generate 5 diverse, high-quality question–answer pairs that a curious student might ask. "
    "Ensure that the questions probe important ideas, and the answers are concise, clear, and directly based on the text. "
    "All questions should be centered around the concept of Joyspan. "
    "Please output your answer as valid JSON in the following format:\n\n"
    '[{{"question": "<your question>", "answer": "<the answer>"}}, ...]\n\n'
    "Text:\n'''{}'''"
)

# One completion per retrieved node, collected in retrieval order. A plain loop
# (rather than a comprehension) keeps the completions gathered so far if a
# later call fails.
responses = []
for node in joy_nodes:
    prompt = qa_prompt_template.format(node.text)
    response = llm.complete(prompt)
    responses.append({"response": response})

In [16]:
def parse_qa_pairs(raw_text):
    """Extract the list of QA dicts from one raw LLM completion.

    The model is asked for a bare JSON array but may wrap it in a Markdown code
    fence, sometimes with prose before or after. When a complete fence pair is
    present, only the text between the first and last fence is parsed; otherwise
    any stray fence markers are stripped and the whole text is parsed.

    Args:
        raw_text: Completion text returned by the LLM.

    Returns:
        The parsed list, with any non-dict items dropped.

    Raises:
        ValueError: If the text is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass) or the top-level JSON value is not a list.
    """
    # Greedy match: spans first fence to last, so backticks inside an answer
    # string do not cut the payload short.
    fenced = re.search(r"```(?:json)?(.*)```", raw_text, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        payload = fenced.group(1)
    else:
        # No complete fence pair (e.g. a truncated completion): fall back to
        # simply removing whatever fence markers are present.
        payload = raw_text.replace("```json", "").replace("```", "")

    parsed = json.loads(payload)
    if not isinstance(parsed, list):
        raise ValueError(f"expected a JSON array, got {type(parsed).__name__}")
    return [item for item in parsed if isinstance(item, dict)]


# One entry per LLM response, in the same order as `responses`, so that
# qa_dataset[i] always corresponds to responses[i] even when parsing fails.
qa_dataset = []
for position, response_pair in enumerate(responses):
    try:
        qa_pairs = parse_qa_pairs(response_pair["response"].text)
    except (AttributeError, KeyError, TypeError, ValueError) as e:
        # Best effort: report which response failed (by index, not by dumping
        # the whole completion) and keep an empty placeholder for it.
        print(f"Error parsing JSON for response {position}: {e}")
        qa_pairs = []

    qa_dataset.append({
        "qa_pairs": qa_pairs
    })

In [17]:
# Peek at the QA pairs parsed from the first response.
qa_dataset[0]['qa_pairs']

[{'question': 'What is Joyspan and how does it relate to work-life balance?',
  'answer': 'Joyspan is a framework for achieving work-life balance by focusing on time spent enjoying activities that bring happiness, guiding individuals toward success and fulfillment in both their careers and personal lives.'},
 {'question': 'What does the Joyspan equation represent?',
  'answer': 'The Joyspan equation represents the sum of moments spent in a flow state, which are activities that bring joy and make individuals lose track of time, emphasizing the importance of collecting joyous moments throughout life.'},
 {'question': 'How can one increase their Joyspan according to the text?',
  'answer': 'To increase Joyspan, individuals can seek more activities and time for joyful moments through career growth, passive investment strategies, hiring help, upgrading tools, or quitting unfulfilling jobs.'},
 {'question': 'What role does Ikigai play in the context of Joyspan?',
  'answer': "Ikigai is relat

In [18]:
alpaca_formatted = []  # Alpaca-style entries: instruction / input / output.
skipped_pairs = 0      # Malformed or blank pairs left out of the dataset.

# Each qa_dataset entry holds the QA pairs parsed from one LLM response.
for entry in qa_dataset:
    for qa in entry.get("qa_pairs") or []:
        # The pairs come from model output, so guard against anything that is
        # not a {"question": str, "answer": str} dict (e.g. JSON null values,
        # nested lists) instead of crashing on `.strip()`.
        if not isinstance(qa, dict):
            skipped_pairs += 1
            continue
        question = qa.get("question")
        answer = qa.get("answer")
        if not isinstance(question, str) or not isinstance(answer, str):
            skipped_pairs += 1
            continue

        question = question.strip()
        answer = answer.strip()
        # A blank instruction or output is useless as a training example.
        if not question or not answer:
            skipped_pairs += 1
            continue

        alpaca_formatted.append({
            "instruction": question,
            "input": "",       # No extra context is supplied for these pairs.
            "output": answer,
        })

if skipped_pairs:
    print(f"Skipped {skipped_pairs} malformed or empty QA pairs.")

In [19]:
class SetEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``set`` and ``frozenset`` values as arrays.

    Elements are emitted in sorted order when they are mutually comparable, so
    the same data always produces the same JSON text. Sets whose elements cannot
    be ordered (e.g. mixed ints and strings) fall back to iteration order.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        Args:
            obj: A value the base encoder could not serialize.

        Returns:
            A list of the elements when ``obj`` is a ``set`` or ``frozenset``.

        Raises:
            TypeError: For any other unsupported type (raised by the base class).
        """
        if isinstance(obj, (set, frozenset)):
            try:
                return sorted(obj)
            except TypeError:
                # Elements are not mutually orderable; keep iteration order.
                return list(obj)
        return super().default(obj)
    
# Render the Alpaca entries as pretty-printed JSON (non-ASCII characters are
# kept as-is rather than escaped) and show the resulting text.
json_output = json.dumps(
    alpaca_formatted,
    cls=SetEncoder,
    indent=2,
    ensure_ascii=False,
)

print(json_output)

[
  {
    "instruction": "What is Joyspan and how does it relate to work-life balance?",
    "input": "",
    "output": "Joyspan is a framework for achieving work-life balance by focusing on time spent enjoying activities that bring happiness, guiding individuals toward success and fulfillment in both their careers and personal lives."
  },
  {
    "instruction": "What does the Joyspan equation represent?",
    "input": "",
    "output": "The Joyspan equation represents the sum of moments spent in a flow state, which are activities that bring joy and make individuals lose track of time, emphasizing the importance of collecting joyous moments throughout life."
  },
  {
    "instruction": "How can one increase their Joyspan according to the text?",
    "input": "",
    "output": "To increase Joyspan, individuals can seek more activities and time for joyful moments through career growth, passive investment strategies, hiring help, upgrading tools, or quitting unfulfilling jobs."
  },
  {


In [20]:
output_filename = "joyspan_qa_pairs.json"

# Serialize with the same settings used for the preview, then write the text
# to disk as UTF-8 in a single call.
serialized = json.dumps(alpaca_formatted, ensure_ascii=False, indent=2, cls=SetEncoder)
Path(output_filename).write_text(serialized, encoding="utf-8")

print(f"Alpaca-style dataset with {len(alpaca_formatted)} entries saved to '{output_filename}'.")

Alpaca-style dataset with 50 entries saved to 'joyspan_qa_pairs.json'.
