In [88]:
import os
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
import json
from typing import Optional
from pydantic import BaseModel, Field

In [5]:
try:
    with open("../secrets.json") as f:
        secrets = json.load(f)
    open_ai_key = secrets["openai"]
    os.environ["OPENAI_API_KEY"] = open_ai_key
    print("API key loaded.")
except FileNotFoundError:
    print("Secrets file not found. YOU NEED THEM TO RUN THIS.")

API key loaded.


In [119]:
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)

In [8]:
# Pydantic
class StoryInfo(BaseModel):
    """Get overall information about the story."""
    firstline: str = Field(description="First line of the story")
    lastline: str = Field(description="Last line of the story")
    markers: list[str] = Field(description="List of markers to split the story into chapters")

structured_llm = llm.with_structured_output(StoryInfo)

## split text into chapters

In [16]:
# read in txt file from scripts folder
story_name = "gatsby"
with open(f"scripts/{story_name}.txt") as f:
    story = f.read()

In [11]:
res = structured_llm.invoke(story)

In [54]:
first_line = res.firstline
last_line = res.lastline
markers = res.markers

In [154]:
# save as json 
story_json = {
    "first_line": first_line,
    "last_line": last_line,
    "markers": markers
}
if not os.path.exists(f"json/{story_name}"):
    os.makedirs(f"json/{story_name}")
with open(f"json/{story_name}/summary.json", "w") as f:
    json.dump(story_json, f, indent=4)

In [51]:
story_formatted = story.replace("\n", " ")

In [59]:
# split story into chapters based on markers
chapters = []
cur_first_line = first_line
cur_min_index = 0
for i, marker in enumerate(markers):
    cur_last_line = markers[i+1] if i+1 < len(markers) else last_line
    first_line_index = story_formatted.find(cur_first_line, cur_min_index)
    if i != 0:
        first_line_index += len(cur_first_line)
    last_line_index = story_formatted.find(cur_last_line, first_line_index)
    if i+1 == len(markers):
        last_line_index += len(cur_last_line)
    chapter = story[first_line_index:last_line_index].strip()
    chapters.append(chapter)
    cur_first_line = cur_last_line
    cur_min_index = last_line_index

In [60]:
# check results
for i, marker in enumerate(markers):
    print(f"Chapter {marker}")
    chapter_first_line = chapters[i][:100]
    chapter_last_line = chapters[i][-100:]
    print("first_line:", chapter_first_line)
    print("last_line:", chapter_last_line)
    print("\n")

Chapter I
first_line: In my younger and more vulnerable years my father gave me some advice
that I’ve been turning over in
last_line: . When I looked
once more for Gatsby he had vanished, and I was alone again in the
unquiet darkness.


Chapter II
first_line: About halfway between West Egg and New York the motor road hastily
joins the railroad and runs besid
last_line: of the
Pennsylvania Station, staring at the morning Tribune, and waiting for
the four o’clock train.


Chapter III
first_line: There was music from my neighbour’s house through the summer nights.
In his blue gardens men and gir
last_line: of the cardinal virtues, and
this is mine: I am one of the few honest people that I have ever
known.


Chapter IV
first_line: On Sunday morning while church bells rang in the villages alongshore,
the world and its mistress ret
last_line: een
his cocktails and his flowers. “One time he killed a man who had found
out that he was nephew to


Chapter V
first_line: on Hindenburg and second c

In [61]:
# save to new txt files
# create folder for story if it doesn't exist
if not os.path.exists(f"chapters/{story_name}"):
    os.makedirs(f"chapters/{story_name}")
for i, marker in enumerate(markers):
    with open(f"chapters/{story_name}/{marker}.txt", "w") as f:
        f.write(chapters[i])
        print(f"Chapter {marker} saved.")

Chapter I saved.
Chapter II saved.
Chapter III saved.
Chapter IV saved.
Chapter V saved.
Chapter VI saved.
Chapter VII saved.
Chapter VIII saved.
Chapter IX saved.


## analyze scene

In [206]:
# Pydantic
class SceneInfo(BaseModel):
    """Get information about a scene in the story."""
    scene: str = Field(description="Title of scene (don't include chapter name or number)")
    summary: str = Field(description="1 line summary of the scene")
    firstline: str = Field(description="First line of the current scene (should be after last line of previous scene)")
    lastline: str = Field(description="Last line of the current scene (should not be after first line of next scene)")

class SceneListInfo(BaseModel):
    """Get overall information about the story."""
    scenelist: list[SceneInfo] = Field(description="List of key scenes in the story")

scene_llm = llm.with_structured_output(SceneListInfo)

In [205]:
# Pydantic
class CharacterInfo(BaseModel):
    """Get information about a character in this scene."""
    name: str = Field(description="Full name of the character.")
    role: str = Field(description="Main actions and/or motivations of the character in this scene")
    importance: int = Field(description="Importance of the character in this scene from 1 (most important character) to n (least important character), where n is the total number of characters in the scene")
    emotion: str = Field(description="Emotion of the character in this scene, described in a few words or a short phrase")
    sentiment: float = Field(description="Sentiment of the character in this scene from -1 (very negative) to 1 (very positive)")
    quote: str = Field(description="A direct quote from the character in this scene to illustrate their emotions and/or actions")

class SceneDetails(BaseModel):
    """Get overall information about the scene."""
    location: str = Field(description="Location of the scene")
    importance: float = Field(description="Importance of the scene from 0: not important at all to 1: very important and pivotal scene")
    conflict: float = Field(description="Conflict in the scene from 0: no conflict to 1: high conflict")
    characters: list[CharacterInfo] = Field(description="List of key characters in the scene. Make sure there are no duplicates, and focus on speaking characters.")

scene_details_llm = llm.with_structured_output(SceneDetails)

In [195]:
def get_scene_info(chapters, chapter_num):
    cur_chapter = chapters[chapter_num]
    chapter_scenes = scene_llm.invoke(cur_chapter)

    chapter_scenes_list = chapter_scenes.scenelist
    formatted_chapter_scenes = []

    for scene in chapter_scenes_list:
        scene_title = scene.scene
        scene_summary = scene.summary
        scene_first_line = scene.firstline
        scene_last_line = scene.lastline
        formatted_scene = {
            "title": scene_title,
            "summary": scene_summary,
            "chapter": markers[chapter_num],
            "first_line": scene_first_line,
            "last_line": scene_last_line
        }
        formatted_chapter_scenes.append(formatted_scene)
        formatted_chapter = cur_chapter.replace("\n", " ")

    # split chapter into scenes based on formatted_chapter_scenes
    scenes = []
    cur_min_index = 0
    for i, scene in enumerate(formatted_chapter_scenes):
        cur_first_line = formatted_chapter_scenes[i]["first_line"].strip("...")
        first_line_index = formatted_chapter.find(cur_first_line, cur_min_index)
        if first_line_index == -1:
            first_line_index = cur_min_index + 1
        cur_last_line = formatted_chapter_scenes[i]["last_line"].strip("...")
        last_line_index = formatted_chapter.find(cur_last_line, first_line_index)
        last_line_index += len(cur_last_line)
        scene_text = cur_chapter[first_line_index:last_line_index].strip()
        scenes.append(scene_text)
        cur_min_index = last_line_index

    # add scene text to formatted_chapter_scenes
    for i, scene in enumerate(formatted_chapter_scenes):
        formatted_chapter_scenes[i]["text"] = scenes[i]

    return scenes, formatted_chapter_scenes

In [203]:
def get_scene_details(scenes, formatted_chapter_scenes):
    all_scene_details = []
    for i, scene in enumerate(scenes):
        scene_text = formatted_chapter_scenes[i]["text"]
        if len(scene_text) == 0:
            all_scene_details.append([])
            continue
        # get scene details for each scene
        cur_scene = scene
        scene_details = scene_details_llm.invoke(cur_scene)
        all_scene_details.append(scene_details)

    # add details to formatted_chapter_scenes
    for i, scene in enumerate(formatted_chapter_scenes):
        scene_text = formatted_chapter_scenes[i]["text"]
        if len(scene_text) == 0:
            continue
        cur_scene_details = all_scene_details[i]
        formatted_chapter_scenes[i]["length"] = len(scene_text)
        formatted_chapter_scenes[i]["location"] = cur_scene_details.location
        formatted_chapter_scenes[i]["importance"] = cur_scene_details.importance
        formatted_chapter_scenes[i]["conflict"] = cur_scene_details.conflict
        formatted_character_info = []
        for character in cur_scene_details.characters:
            character_info = {
                "name": character.name,
                "role": character.role,
                "importance": character.importance,
                "emotion": character.emotion,
                "sentiment": character.sentiment,
                "quote": character.quote
            }
            formatted_character_info.append(character_info)
        formatted_chapter_scenes[i]["characters"] = formatted_character_info

    return formatted_chapter_scenes

In [211]:
# get scene info for each chapter
for i, marker in enumerate(markers):
    print(f"Chapter {marker} start.")
    scenes, formatted_chapter_scenes = get_scene_info(chapters, i)
    print(f"Chapter {marker} scenes done.")
    formatted_chapter_scenes = get_scene_details(scenes, formatted_chapter_scenes)
    print(f"Chapter {marker} details done.")

    # remove scenes with no text
    formatted_chapter_scenes = [scene for scene in formatted_chapter_scenes if scene["text"] != ""]
    # remove characters with no quote
    for scene in formatted_chapter_scenes:
        scene["characters"] = [character for character in scene["characters"] if "(" not in character["quote"] != ""]

    # save as json
    with open(f"json/{story_name}/{marker}.json", "w") as f:
        json.dump(formatted_chapter_scenes, f, indent=4)

    print(f"Chapter {marker} done.\n")

Chapter I start.
Chapter I scenes done.
Chapter I details done.
Chapter I done.

Chapter II start.
Chapter II scenes done.
Chapter II details done.
Chapter II done.

Chapter III start.
Chapter III scenes done.
Chapter III details done.
Chapter III done.

Chapter IV start.
Chapter IV scenes done.
Chapter IV details done.
Chapter IV done.

Chapter V start.
Chapter V scenes done.
Chapter V details done.
Chapter V done.

Chapter VI start.
Chapter VI scenes done.
Chapter VI details done.
Chapter VI done.

Chapter VII start.
Chapter VII scenes done.
Chapter VII details done.
Chapter VII done.

Chapter VIII start.
Chapter VIII scenes done.
Chapter VIII details done.
Chapter VIII done.

Chapter IX start.
Chapter IX scenes done.
Chapter IX details done.
Chapter IX done.



In [219]:
# Pydantic
class SceneRankings(BaseModel):
    """Get all scene rankings."""
    rankings: list[int] = Field(description="Ranked list of n scenes in this chapter from 1 (most important) to n (least important), where n is the total number of scenes in the chapter.")

ranking_llm = llm.with_structured_output(SceneRankings)

In [220]:
# rank each scene by importance
for i, marker in enumerate(markers):
    with open(f"json/{story_name}/{marker}.json") as f:
        scenes = json.load(f)
    scene_rankings = ranking_llm.invoke(str(scenes)).rankings
    for j, scene in enumerate(scenes):
        ranking = scene_rankings[j]
        scene["importance"] = ranking

    # save as json
    with open(f"json/{story_name}/{marker}.json", "w") as f:
        json.dump(scenes, f, indent=4)
    print(f"Chapter {marker} rankings done.")

Chapter I rankings done.
Chapter II rankings done.
Chapter III rankings done.
Chapter IV rankings done.
Chapter V rankings done.
Chapter VI rankings done.
Chapter VII rankings done.
Chapter VIII rankings done.
Chapter IX rankings done.
