In [1]:
# Load the one-line story premises: each non-empty line of stories.txt is one plotline.
# UTF-8 is requested explicitly so the read does not depend on the platform's
# default encoding (every other file in this notebook is handled as UTF-8).
with open('stories.txt', 'r', encoding='utf-8') as file:
    # Keep each line's text as-is (newline included) but drop blank lines,
    # which would otherwise be sent to the model as empty plotlines.
    stories = [line for line in file if line.strip()]
print(stories[0])

Aetherium Tides: A world where oceans are made of raw magical energy, navigated by specially shielded skyships.



In [6]:
import os
from dotenv import load_dotenv
from google import genai

# Let python-dotenv populate the environment first, then read the Gemini key
# from it and hand it to the client used by every generation cell below.
load_dotenv()
api_key = os.environ.get('GEMINI_API_KEY')
client = genai.Client(api_key=api_key)

In [7]:
def generate_story_prompt(plotline):
    """Build the prompt asking the model to expand a plotline into a full story.

    Args:
        plotline: The short premise to develop. Surrounding whitespace (for
            example the trailing newline left by ``readlines()``) is removed so
            it does not end up inside the ``<plotline>`` tags.

    Returns:
        The prompt string to send to the model.
    """
    plotline = plotline.strip()
    return f"""
    Develop the story of the following plotline <plotline> {plotline} </plotline>
    Write around 10 paragraphs, with several names, dates and places dropped around.
    The story should be told in a historical style, with a focus on the characters and their actions.
    Output the story and only the story, no other text.
    Output:
    """


In [12]:
from google.genai import types

import os
# Expand every plotline into a full story with Gemini and save each result to
# stories_detail/<index>.txt. story_map records index -> stripped plotline so
# later cells can pair each saved story with its premise.
# exist_ok=True replaces the separate existence check.
os.makedirs('stories_detail', exist_ok=True)

story_map = {}
for i, plotline in enumerate(stories):
    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-04-17",
        contents=generate_story_prompt(plotline),
        config=types.GenerateContentConfig(
            thinking_config=types.ThinkingConfig(thinking_budget=0)
        ),
    )
    answer = response.text
    if answer is None:
        # Without this guard file.write(None) fails with an opaque TypeError;
        # name the plotline that produced no text instead.
        raise ValueError(
            f"No story text returned for plotline {i}: {plotline.strip()!r}"
        )
    with open(f'stories_detail/{i}.txt', 'w', encoding='utf-8') as file:
        file.write(answer)
    print(f"Saved story to stories_detail/{i}.txt")
    # Only record the premise once its story is safely on disk.
    story_map[i] = plotline.strip()

print(f"Story map: {story_map}")

Saved story to stories_detail/0.txt
Saved story to stories_detail/1.txt
Saved story to stories_detail/2.txt
Saved story to stories_detail/3.txt
Saved story to stories_detail/4.txt
Saved story to stories_detail/5.txt
Saved story to stories_detail/6.txt
Saved story to stories_detail/7.txt
Saved story to stories_detail/8.txt
Saved story to stories_detail/9.txt
Saved story to stories_detail/10.txt
Saved story to stories_detail/11.txt
Saved story to stories_detail/12.txt
Saved story to stories_detail/13.txt
Saved story to stories_detail/14.txt
Saved story to stories_detail/15.txt
Saved story to stories_detail/16.txt
Saved story to stories_detail/17.txt
Saved story to stories_detail/18.txt
Saved story to stories_detail/19.txt
Saved story to stories_detail/20.txt
Saved story to stories_detail/21.txt
Saved story to stories_detail/22.txt
Saved story to stories_detail/23.txt
Saved story to stories_detail/24.txt
Saved story to stories_detail/25.txt
Saved story to stories_detail/26.txt
Saved story

In [None]:
import json
import os

# Build a chat-style training set: one user turn asking about a story premise
# and one assistant turn containing the full generated story.
conversations = []

for i, premise in story_map.items():
    # User turn: the fixed question followed by the premise from the story map.
    story_premise = "I want you to tell me what you know about the following story: " + premise

    # Assistant turn: the full story saved earlier. Only the file read is
    # guarded, so unrelated errors are not mistaken for a missing file.
    try:
        with open(f'stories_detail/{i}.txt', 'r', encoding='utf-8') as f:
            story_detail = f.read().strip()
    except FileNotFoundError:
        print(f"Warning: Story file stories_detail/{i}.txt not found")
        continue

    conversations.append(
        {
            "conversations": [
                {"role": "user", "content": story_premise},
                {"role": "assistant", "content": story_detail},
            ]
        }
    )

# The output directory is not created anywhere else in the notebook; without
# this the open() below raises FileNotFoundError on a fresh checkout.
os.makedirs('train_dataset', exist_ok=True)
with open('train_dataset/train_data.json', 'w', encoding='utf-8') as f:
    json.dump(conversations, f, indent=4, ensure_ascii=False)

print(f"Created {len(conversations)} conversations")
print("Saved conversations to train_dataset/train_data.json")


In [14]:
import json

# Persist the index -> premise mapping as pretty-printed, non-ASCII-escaped JSON.
# Note: JSON object keys are strings, so the integer indices are written as
# strings and will come back as strings if this file is reloaded.
with open('story_map.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(story_map, indent=4, ensure_ascii=False))

print("Saved story map to story_map.json")


Saved story map to story_map.json


In [41]:
from pydantic import BaseModel

# One generated exam item. A list of these is passed to the model as the
# structured-output schema (response_schema=list[QATriplet]) in the QA cell.
# NOTE(review): documented with # comments rather than a docstring because a
# class docstring may be included in the schema pydantic generates — confirm.
class QATriplet(BaseModel):
    question: str  # the standalone question about the story
    answer: str  # the short answer (the prompt asks for at most 3 words)
    context: str  # passage the prompt asks to be copied verbatim from the story

def generate_qa_prompt(story, story_context):
    """Build the prompt asking the model for 20 question/answer/context triplets.

    The answer-length instructions are stated consistently (as short as
    possible, never more than 3 words); the earlier wording demanded both
    "one word" and "at most 3 words" at the same time.

    Args:
        story: Full text of the story the questions must be about; embedded
            between ``<story>`` tags.
        story_context: Short premise of the story, embedded between
            ``<story_context>`` tags so questions can identify the story
            without the document at hand.

    Returns:
        The prompt string to send to the model.
    """
    return f"""
    You are tasked to generate a list of 20 questions and answers regarding the following story.
    Each question should be a single sentence, and each answer should be as short as possible, ideally a single word.
    Answers MUST not be longer than 3 words.
    The questions should be related to the story, and should be about the events, the places, the dates, etc.
    Do NOT ask about the characters.
    The question/answer pairs should also be accompanied by the context FROM the story that supports the answer.
    The context MUST be a copy/paste from the story, and not a paraphrase.
    Those questions need to be supported by only one context and one context alone from the story.
    The questions must be understood without having access to the document, as if they were standalone questions from an exam on which you had to learn a hundred of different stories.
    The story is the following:
    <story> {story} </story>
    Now generate the list of 20 question/answer/context triplets.
    The model that answers the questions must be informed about the story. Help yourself with the story context: <story_context> {story_context} </story_context>   
    Use named references, without using as much as possible the character names, for the model to identify relevant information. 
    Again, the answer MUST not be longer than 3 words.
    Again, the questions must be understood without having access to the document nor the other questions.
    They must be understood INDEPENDENTLY of each other, as if they were standalone questions from an exam on which you had to learn a hundred of different stories.
    Output:
    """



In [42]:
# Generate question/answer/context triplets for every saved story and collect
# them column-wise: the four lists in qa_dict stay aligned by position.
qa_dict = {
    'story_contexts': [],
    'questions': [],
    'answers': [],
    'contexts': []
}

# Iterate the story map itself (as the dataset cell does) so this loop cannot
# index a premise that was never recorded.
for i, story_context in story_map.items():
    # Read the full story text generated for this premise.
    with open(f'stories_detail/{i}.txt', 'r', encoding='utf-8') as f:
        story_read = f.read()

    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-04-17",
        contents=generate_qa_prompt(story_read, story_context),
        config=types.GenerateContentConfig(
            thinking_config=types.ThinkingConfig(thinking_budget=0),
            response_mime_type="application/json",
            response_schema=list[QATriplet]
        )
    )

    # response.parsed is None when the reply could not be parsed into the
    # schema; treat that as "no triplets" instead of failing on iteration.
    triplets = response.parsed
    if triplets is None:
        print(f"Warning: no parsable QA triplets returned for story {i}")
        triplets = []

    # Keep only triplets whose context is a verbatim substring of the story,
    # enforcing the prompt's copy/paste requirement.
    answer = [elem for elem in triplets if elem.context in story_read]

    # Extend the lists in qa_dict
    qa_dict['story_contexts'].extend([story_context] * len(answer))
    qa_dict['questions'].extend([qa.question for qa in answer])
    qa_dict['answers'].extend([qa.answer for qa in answer])
    qa_dict['contexts'].extend([qa.context for qa in answer])

# Save QA pairs to JSON file
with open('qa_pairs.json', 'w', encoding='utf-8') as f:
    json.dump(qa_dict, f, indent=4, ensure_ascii=False)

print("Generated and saved QA pairs for all stories")

Generated and saved QA pairs for all stories
