# Reboot of LCATS Story Analysis

## Imports

In [None]:
import json
import os
import sys

Third-party modules

In [None]:
import dotenv
from openai import OpenAI

Add imports from within the project

In [None]:
# Put the directory above the current working directory on sys.path (only once)
# so that project packages located there can be imported by the cells below.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from lcats import stories
from lcats import utils
from lcats.datasets import torchdata

## Project Setup

### Path Setup

In [None]:
# Layout assumed here: the notebook executes in lcats/notebooks (e.g. from VSCode),
# the working data sits in lcats/data, and the checked-in corpora and the secrets
# directory sit one level above the project root.
CURRENT_PATH = os.path.abspath(os.getcwd())  # Directory the notebook is executing in.
PROJECT_ROOT = os.path.dirname(CURRENT_PATH)  # Parent of the notebook directory.
DEV_CORPUS = os.path.normpath(os.path.join(PROJECT_ROOT, 'data'))  # Local copy of the data.
GIT_CORPUS = os.path.normpath(os.path.join(PROJECT_ROOT, os.pardir, 'corpora'))  # Data in the git repo.
# NOTE: the name is spelled "OPENIA" (sic); it is kept because other cells reference it.
OPENIA_API_KEYS_ENV = os.path.normpath(
    os.path.join(PROJECT_ROOT, os.pardir, '.secrets', 'openai_api_keys.env')
)  # Local OpenAI API key.

def check_path(path, description):
    """Print one line saying whether ``path`` exists, labelled with ``description``.

    Args:
        path: Filesystem path to test with ``os.path.exists``.
        description: Human-readable label included in the printed message.
    """
    if not os.path.exists(path):
        print(f"Missing {description} from: {path}")
        return
    print(f"Found {description} at: {path}")

# Report which of the expected locations exist before later cells rely on them.
check_path(DEV_CORPUS, "DEV_CORPUS")
check_path(GIT_CORPUS, "GIT_CORPUS")
check_path(OPENIA_API_KEYS_ENV, "OPENIA_API_KEYS_ENV")

## OpenAI Client

Get the OpenAI API key.

In [None]:
# Load the variables defined in the local secrets file into the process environment.
dotenv.load_dotenv(OPENIA_API_KEYS_ENV)
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Never print the key itself: cell output is saved with the notebook and can
# easily end up in version control. Only report whether a key was found.
if OPENAI_API_KEY:
    print("OPENAI_API_KEY loaded.")
else:
    print(f"OPENAI_API_KEY is not set; expected it in: {OPENIA_API_KEYS_ENV}")

Verify that we can get a client.

In [None]:
import importlib.metadata

# With no arguments the client takes its API key from the OPENAI_API_KEY
# environment variable, which the previous cell loads.
client = OpenAI()
# Report the installed package version through the public metadata API instead
# of the client's private `_version` attribute, which may change without notice.
print(f"Loaded OpenAI client: {client} with version: {importlib.metadata.version('openai')}")

Verify that the API is currently working and that the account has credits.

In [None]:
# Smoke test: send one small request through the Responses API using the client created above.
response = client.responses.create(
    model="gpt-4o",
    input="Write a one-sentence bedtime story about a starship captain visiting a planet."
)

# Print the generated text via the project's utils.pprint helper.
utils.pprint(response.output_text)

## Story Corpora

In [None]:
# Re-execute the project modules so that edits made on disk are picked up
# without restarting the kernel.
from importlib import reload
reload(stories)
reload(utils)

In [None]:
# If run from within a notebook, the corpora root is two paths up from the notebook's location.
# Swap the two assignments below to load the local working copy instead.
CORPORA_ROOT = GIT_CORPUS  # Checked-in corpora
# CORPORA_ROOT = DEV_CORPUS  # Command line working corpora

# Now load the corpora
corpora = stories.Corpora(CORPORA_ROOT)

print("Loaded corpora:")
print(f" - root: {corpora.corpora_root}")
print(f" - corpora: {len(corpora.corpora)}")
print(f" - stories: {len(corpora.stories)}")
print()
# Only show an example when there is one: indexing an empty story list would
# raise IndexError and hide the more useful fact that nothing was loaded.
if len(corpora.stories) > 0:
    print("Example story: corpora.stories[0]:")
    print(corpora.stories[0])
else:
    print(f"No stories found under: {CORPORA_ROOT}")

In [None]:
# Keep the first story as the running example for the cells below and print the length of its body.
example_story = corpora.stories[0]
print(len(example_story.body))

## Scene and Sequel Extraction

Code suggested by ChatGPT

In [None]:
# System message: defines the "scene" / "sequel" / "none" labels and instructs
# the model to answer with JSON only.
SCENE_SEQUEL_SYSTEM_PROMPT = """
You are a helpful assistant that breaks down stories into structured events.
Each event is labeled as "scene", "sequel", or "none" (if it doesn't fit exactly).
Follow these definitions:

- scene: a segment where a character with a goal attempts to achieve it, leading to success or disaster.
- sequel: a segment after a disaster or success, where a character reacts, processes emotions, considers options, and forms a new goal.

Your output MUST be valid JSON and only the JSON without any other text or comments.
"""

# User message template: asks for a JSON dictionary with a single "events" key.
# The {story_text} placeholder is filled in with str.format by
# build_scene_sequel_prompt, so any literal braces added to this template
# would have to be doubled ({{ }}).
SCENE_SEQUEL_USER_PROMPT_TEMPLATE = """
I will give you a story in plain text.
1. Read the story carefully.
2. Identify major events or paragraphs that qualify as scenes or sequels (or 'none' if it doesn't clearly fit).
3. For each event, provide:
   - event_text: the text snippet or summary
   - event_type: 'scene' or 'sequel' or 'none'
   - reason: a short explanation of why you classified it that way
4. Return a JSON dictionary with one key named "events" - the output must be valid JSON and only the JSON.
Your output MUST be valid JSON and only the JSON without any other text or comments.

STORY:
\"\"\"{story_text}\"\"\"
"""

def build_scene_sequel_prompt(story_text: str) -> list:
    """
    Assemble the two-message chat prompt for scene/sequel extraction.

    The system message carries SCENE_SEQUEL_SYSTEM_PROMPT unchanged; the user
    message is SCENE_SEQUEL_USER_PROMPT_TEMPLATE with ``story_text`` substituted
    for its placeholder.

    Args:
        story_text: Plain text of the story to analyze.

    Returns:
        A list of two dicts with "role" and "content" keys, system message first.
    """
    user_content = SCENE_SEQUEL_USER_PROMPT_TEMPLATE.format(story_text=story_text)
    system_message = {"role": "system", "content": SCENE_SEQUEL_SYSTEM_PROMPT}
    user_message = {"role": "user", "content": user_content}
    return [system_message, user_message]


In [None]:
# Display the length of the example story's body, i.e. the text that gets embedded in the prompt.
len(example_story.body)

In [None]:
# Build the chat messages for the example story and display them for inspection.
example_prompt = build_scene_sequel_prompt(example_story.body)
example_prompt

In [None]:
    

def extract_scenes_and_sequels(story_text: str, model_name="gpt-3.5-turbo", temperature=0.2):
    """
    Ask a chat model to split a story into scene/sequel events and parse the reply.

    Uses the module-level ``client`` and the messages produced by
    ``build_scene_sequel_prompt``. Problems with the reply's content (no text,
    invalid JSON, missing "events" key) are reported in the returned dictionary
    rather than raised. Errors from the API call itself are not caught here.

    Args:
        story_text: Plain text of the story to analyze.
        model_name: Model identifier passed to the chat completions API.
        temperature: Sampling temperature passed to the API. Defaults to 0.2
            (slightly creative but mostly deterministic).

    Returns:
        A dict with the keys:
          - "story_text", "model_name", "messages": the inputs that were sent.
          - "response": the API response object.
          - "raw_output": the text of the first choice (None if there was none).
          - "parsed_output": the value returned by utils.extract_json, or None.
          - "extracted_output": the value under "events", or None.
          - "parsing_error": message describing why parsing failed, or None.
          - "extraction_error": message if "events" was not found, or None.
    """
    messages = build_scene_sequel_prompt(story_text)

    response = client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=temperature,
    )

    # The assistant's text is on the first choice; the API may return no text
    # content at all, in which case this is None.
    raw_output = response.choices[0].message.content

    parsed_output = None
    parsing_error = None
    if raw_output is None:
        # Do not hand None to the JSON extractor; record the problem instead.
        parsing_error = "Model response contained no text content."
    else:
        try:
            parsed_output = utils.extract_json(raw_output)
        except json.JSONDecodeError as exc:
            # The LLM might have returned invalid JSON or additional text around
            # the JSON; callers can inspect raw_output or re-prompt.
            parsing_error = str(exc)

    # Expecting something like: { "events": [ ... ] }
    if isinstance(parsed_output, dict) and "events" in parsed_output:
        extracted_output = parsed_output["events"]
        extraction_error = None
    else:
        # Unexpected structure (or nothing parsed): report it and return no events.
        extracted_output = None
        extraction_error = "Expected 'events' key in JSON response."

    return {
        "story_text": story_text,
        "model_name": model_name,
        "messages": messages,
        "response": response,
        "raw_output": raw_output,
        "parsed_output": parsed_output,
        "extracted_output": extracted_output,
        "parsing_error": parsing_error,
        "extraction_error": extraction_error,
    }


In [None]:
# Run the extraction on the example story with the function's default model ("gpt-3.5-turbo").
result_gpt_35_turbo = extract_scenes_and_sequels(example_story.body)

# Display the number of events and the first five of them.
# NOTE: "extracted_output" is None when the reply had no usable "events" key, in
# which case this line raises TypeError; inspect "parsing_error" and
# "extraction_error" in the result to see why.
len(result_gpt_35_turbo["extracted_output"]), result_gpt_35_turbo["extracted_output"][:5]

Note: this cell takes 30-90 seconds and makes an expensive GPT-4o call.

In [None]:
# Repeat the extraction on the first story of the corpora with the "gpt-4o" model.
result_gpt_4o = extract_scenes_and_sequels(
    corpora.stories[0].body, model_name="gpt-4o")

# Display the number of events and the first five of them.
# NOTE: "extracted_output" is None when the reply had no usable "events" key, in
# which case this line raises TypeError; inspect "parsing_error" and
# "extraction_error" in the result to see why.
len(result_gpt_4o["extracted_output"]), result_gpt_4o["extracted_output"][:5]