In [2]:
import os
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
import json
from pydantic import BaseModel, Field
import string
import re
from typing import Optional 

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Pull the provider API keys out of the local secrets file and export them as
# environment variables for the SDK clients created later in the notebook.
try:
    with open("../secrets.json") as f:
        secrets = json.load(f)
except FileNotFoundError:
    print("Secrets file not found. YOU NEED THEM TO RUN THIS.")
else:
    open_ai_key = secrets["openai"]
    anthropic_key = secrets["anthropic"]
    google_key = secrets["google"]
    os.environ.update(
        OPENAI_API_KEY=open_ai_key,
        ANTHROPIC_API_KEY=anthropic_key,
        GOOGLE_API_KEY=google_key,
    )
    print("API key loaded.")

API key loaded.


In [4]:
def format_gemini_response(res):
    """Decode the JSON payload carried in a chat response's ``content``.

    Accepts content wrapped in a Markdown code fence (```json ... ``` or a bare
    ``` ... ```), with or without surrounding whitespace, as well as unfenced JSON.

    Args:
        res: response object exposing the raw text as ``res.content``.

    Returns:
        The decoded JSON value (typically a dict).

    Raises:
        json.JSONDecodeError: if the (unfenced) text is not valid JSON.
    """
    text = res.content.strip()
    # Strip the fence by pattern rather than by fixed offsets so a trailing newline,
    # a missing "json" tag or a missing fence no longer corrupts the payload.
    fenced = re.match(r"^```(?:json)?\s*(.*?)\s*```$", text, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        text = fenced.group(1)
    return json.loads(text)

class DotDict:
    """Expose the keys of a dict (or of a JSON object string) as instance attributes."""

    def __init__(self, dictionary):
        source = dictionary
        if isinstance(source, str):
            # A JSON string is echoed, then decoded, so both input kinds share one path.
            print(source)
            source = json.loads(source)
        for attr_name, attr_value in source.items():
            setattr(self, attr_name, attr_value)

In [5]:
# Selects the response-handling path: "gemini" routes scene extraction through llm3 and
# format_gemini_response; any other value uses the structured-output wrappers built on `llm`.
model_type = "default"
# Default chat model; the structured-output wrappers in this notebook are built from it.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)
# Alternative Anthropic model; in the visible part of the notebook it is only referenced
# from commented-out lines.
llm2 = ChatAnthropic(model_name="claude-3-5-sonnet-20240620", temperature=0.1, max_tokens_to_sample=4096, api_key=os.getenv("ANTHROPIC_API_KEY"))

# model_type = "gemini"
# Gemini model, invoked directly with a text prompt when model_type == "gemini".
llm3 = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.1)

# safety_settings={
#     HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
# }

# # original method
# og_generate = ChatGoogleGenerativeAI._generate

# # patch

# ChatGoogleGenerativeAI._generate = partial(llm._generate, safety_settings=safety_settings)

In [6]:
# Pydantic
class StoryInfo(BaseModel):
    """Get overall information about the story."""
    # NOTE(review): the docstring and every Field description are presumably forwarded to the
    # model as part of the structured-output schema — treat them as prompt text; confirm before editing.
    # Chapter count reported by the model; summed across chunks when results are combined.
    numChapters: int = Field(description="Number of chapters in the story")
    # Used later to locate where the story proper starts in the flattened text.
    firstline: str = Field(description="First line of the story")
    # Used later to locate where the last chapter ends.
    lastline: Optional[str] = Field(description="Last line of the story or first volume")
    # Chapter headings; each one is searched for verbatim (plus a trailing space) when splitting.
    markers: list[str] = Field(description="List of markers to split the story into chapters, as formatted in the table of contents. Make sure the formatting matches the markers in the actual text (including the word 'chapter' if applicable). Keep all punctuation marks and capitalization, and use the full chapter names. If there are multiple volumes, focus on the first volume.")

def get_marker_prompt(story):
    """Build the free-text prompt that asks a model for the story's chapter markers.

    The story text is interpolated into the instruction, which is followed by a
    textual description of the expected response fields.
    """
    return f'''List the markers needed to split this story into chapters: {story}. Format your response like Story Info:
    class StoryInfo(BaseModel):
        """Get overall information about the story."""
        firstline: str = Field(description="First line of the story")
        lastline: Optional[str] = Field(description="Last line of the story or first volume")
        markers: list[str] = Field(description="List of markers to split the story into chapters. Keep all punctuation marks and capitalization.")
    '''

# structured_llm = llm2.with_structured_output(StoryInfo)
# Wrapper around the default model whose responses are shaped by the StoryInfo schema;
# used for chapter-marker extraction.
structured_llm = llm.with_structured_output(StoryInfo)

## split text into chapters

In [336]:
# Bibliographic metadata for the story being processed.
# NOTE(review): not referenced again in the visible part of this notebook — presumably
# consumed further down; confirm.
story_info = {
  "title": "Great Expectations",
  "type": "Book",
  "author": "Charles Dickens",
  "year": 1861,
  "url": "https://www.gutenberg.org/ebooks/1400",
  "image": "https://images.booksense.com/images/132/726/9781532726132.jpg",
  }

In [338]:
# read in txt file from scripts folder
story_name = "great-exp"
analysis_type = "theme" # character, theme
# Decode explicitly as UTF-8: the text contains typographic quotes (normalised further down),
# which a platform-default codec can garble or reject.
with open(f"scripts/{story_name}.txt", encoding="utf-8") as f:
    story = f.read()

In [339]:
# Theme runs write to their own json/ and chapters/ folders. Guard the suffix so that
# re-running this cell does not append "_themes" a second time and change the paths.
if analysis_type == "theme" and not story_name.endswith("_themes"):
    story_name += "_themes"

In [340]:
# focus on the first volume if there are multiple volumes
# (separators are tried in priority order; only the first one present is applied)
for volume_break in ("END OF VOL.", "VOLUME II."):
    if volume_break in story:
        volumes = story.split(volume_break)
        story = volumes[0]
        break

In [341]:
# split story into chunks
char_chunk = 120000
story_len = len(story)
print(story_len)
if story_len <= char_chunk:
    # the whole story fits into a single request
    story_chunks = [story]
else:
    # too long: keep only the opening and closing halves of the budget as one chunk
    # (alternative kept for reference:
    #  story_chunks = [story[i:i + char_chunk] for i in range(0, len(story), char_chunk)])
    head = story[:char_chunk // 2]
    tail = story[-char_chunk // 2:]
    chunk = head + tail
    story_chunks = [chunk]
print(len(story_chunks))

1013894
1


In [342]:
all_res = []
for i, chunk in enumerate(story_chunks):
    # keep retrying this chunk until the structured call succeeds
    while True:
        try:
            # if model_type == 'gemini':
            #     prompt = get_marker_prompt(story)
            #     markers = llm3.invoke(prompt)
            #     res = format_gemini_response(markers)
            #     print(res)
            #     res = DotDict(res['Story Info'])
            # else:
            res = structured_llm.invoke(chunk)
            all_res.append(res)
            print(f"Chunk {i}")
            break
        except Exception as e:
            print("Exception thrown. Trying again.")
            print("Error:", e)
    # if i < len(story_chunks) - 1:
    #     # Pause for a specified time (e.g., 60 seconds) to avoid rate limits
    #     time.sleep(60)  # Adjust the sleep duration as per your rate limit requirements

Chunk 0


In [343]:
# save first line from first chunk, last line from last chunk, and combine markers
first_line, last_line = all_res[0].firstline, all_res[-1].lastline
num_chapters = sum(res.numChapters for res in all_res)
markers = [chunk_marker for res in all_res for chunk_marker in res.markers]

num_chapters, markers

(59,
 ['Chapter I.',
  'Chapter II.',
  'Chapter III.',
  'Chapter IV.',
  'Chapter V.',
  'Chapter VI.',
  'Chapter VII.',
  'Chapter VIII.',
  'Chapter IX.',
  'Chapter X.',
  'Chapter XI.',
  'Chapter XII.',
  'Chapter XIII.',
  'Chapter XIV.',
  'Chapter XV.',
  'Chapter XVI.',
  'Chapter XVII.',
  'Chapter XVIII.',
  'Chapter XIX.',
  'Chapter XX.',
  'Chapter XXI.',
  'Chapter XXII.',
  'Chapter XXIII.',
  'Chapter XXIV.',
  'Chapter XXV.',
  'Chapter XXVI.',
  'Chapter XXVII.',
  'Chapter XXVIII.',
  'Chapter XXIX.',
  'Chapter XXX.',
  'Chapter XXXI.',
  'Chapter XXXII.',
  'Chapter XXXIII.',
  'Chapter XXXIV.',
  'Chapter XXXV.',
  'Chapter XXXVI.',
  'Chapter XXXVII.',
  'Chapter XXXVIII.',
  'Chapter XXXIX.',
  'Chapter XL.',
  'Chapter XLI.',
  'Chapter XLII.',
  'Chapter XLIII.',
  'Chapter XLIV.',
  'Chapter XLV.',
  'Chapter XLVI.',
  'Chapter XLVII.',
  'Chapter XLVIII.',
  'Chapter XLIX.',
  'Chapter L.',
  'Chapter LI.',
  'Chapter LII.',
  'Chapter LIII.',
  'Chapter

In [344]:
# first_line = res.firstline
# last_line = res.lastline
# markers = res.markers
# Display the combined first/last story lines for a manual sanity check.
first_line, last_line

("My father's family name being Pirrip, and my Christian name Philip, my infant tongue could make of both names nothing longer or more explicit than Pip.",
 'I took her hand in mine, and we went out of the ruined place; and, as the morning mists had risen long ago when I first left the forge, so the evening mists were rising now, and in all the broad expanse of tranquil light they showed to me, I saw no shadow of another parting from her.')

In [345]:
# Whitespace clean-up passes, applied in this order: (pattern, replacement)
whitespace_rules = [
    (r' {2,}', ' '),    # consecutive spaces -> one space
    (r'\n{2,}', '\n'),  # consecutive newlines -> one newline
    (r' ?\n ?', '\n'),  # space + newline or newline + space -> one newline
]
for pattern, replacement in whitespace_rules:
    story = re.sub(pattern, replacement, story)
# straighten typographic quotes
story = story.translate(str.maketrans({"“": "\"", "”": "\"", "‘": "'", "’": "'"}))

In [346]:
# Flattened search copy of the story: newlines and both quote characters become spaces.
# Every substitution is one character for one character, so positions match `story`.
story_formatted = story.translate(str.maketrans({"\n": " ", "\"": " ", "'": " "}))

In [347]:
# # remove any marker that doesn't include one of these words
# exclude_words = ["part", "preface", "contents", "by", "end", "epilogue", "letter"]
exclude_words = ["part", "preface", "contents", "introduction", "end", "epilogue", "letter", "volume"]
# Match the excluded words as whole words (optionally plural) rather than as substrings,
# so chapter titles such as "The Friend" or "Departure" are not dropped by accident.
# Letter-only lookarounds are used so forms like "Part1" or "PART_II" still match.
exclude_pattern = re.compile(
    r"(?<![A-Za-z])(?:" + "|".join(re.escape(word) for word in exclude_words) + r")s?(?![A-Za-z])",
    re.IGNORECASE,
)
markers = [m for m in markers if not exclude_pattern.search(m)]

In [348]:
# save as json
story_json = {
    "first_line": first_line,
    "last_line": last_line,
    "markers": markers
}
# exist_ok=True replaces the check-then-create sequence (no race between the two calls)
# and keeps the cell safe to re-run.
summary_dir = f"json/{story_name}"
os.makedirs(summary_dir, exist_ok=True)
with open(f"{summary_dir}/summary.json", "w") as f:
    json.dump(story_json, f, indent=4)

In [349]:
# read in markers from summary.json
with open(f"json/{story_name}/summary.json") as f:
    summary = json.load(f)
# unpack the saved fields once the file has been closed
markers = summary["markers"]
first_line = summary["first_line"]
last_line = summary["last_line"]

In [350]:
# Peek at the last 100 characters of the flattened text to see what follows the story proper.
story_formatted[-100:] 

'help produce our new eBooks, and how to subscribe to our email newsletter to hear about new eBooks. '

In [351]:
# Locate the story's first line in the flattened text; quotes in the needle are blanked
# the same way they were when story_formatted was built.
first_ind = story_formatted.find(first_line.replace("\"", " ").replace("'", " "))
# Show the index alongside the exact string that was searched for.
first_ind, first_line.replace("\"", " ").replace("'", " ")

(1658,
 'My father s family name being Pirrip, and my Christian name Philip, my infant tongue could make of both names nothing longer or more explicit than Pip.')

In [352]:
# first_marker = markers[0] + " "
# first_marker_ind = story_formatted.rfind(first_marker.replace("\"", " ").replace("'", " "))

# first_line = first_marker
# first_marker_ind, first_marker


In [353]:
# find all instances of first marker
# first_marker_inds = [m.start() for m in re.finditer("CHAPTER I. Mrs", story_formatted)]
# first_marker_inds

# # print snippet around each marker
# for ind in first_marker_inds:
#     print(story_formatted[ind-100:ind+100])


In [354]:
# Sanity check: find the second marker after the start of the story to see where the
# first chapter would end, and show 50 characters of context on either side.
# NOTE(review): the trailing space presumably matches the newline-turned-space that follows
# a heading and keeps a marker from matching a longer one — confirm.
second_marker = markers[1] + " "
last_line_index = story_formatted.find(second_marker, first_ind)
last_line_index, second_marker, story_formatted[last_line_index-50:last_line_index+50]

(11216,
 'Chapter II. ',
 ' frightened again, and ran home without stopping. Chapter II. My sister, Mrs. Joe Gargery, was more ')

In [355]:
# Blank the quotes in the needle exactly as story_formatted does; otherwise a last line
# containing ' or " can never be found in the flattened text.
last_ind = story_formatted.find(last_line.replace("\"", " ").replace("'", " "))
last_ind, last_line

(990831,
 'I took her hand in mine, and we went out of the ruined place; and, as the morning mists had risen long ago when I first left the forge, so the evening mists were rising now, and in all the broad expanse of tranquil light they showed to me, I saw no shadow of another parting from her.')

In [356]:
# split story into chapters based on markers
# Each chapter runs from `cur_first_line` (the story's first line for chapter 0, otherwise the
# chapter's own marker) up to the next marker, or up to `last_line` for the final chapter.
# Positions are searched in `story_formatted` but the text is sliced out of `story`; this relies
# on the two strings having identical length (story_formatted is built with 1:1 character
# replacements).
chapters = []
cur_first_line = first_line.replace("\"", " ").replace("'", " ")
cur_min_index = 0
for i, marker in enumerate(markers):
    # print("marker", marker)
    exists_next = i+1 < len(markers)
    next_marker = markers[i+1] if exists_next else ""
    # print("next_marker", next_marker)
    
    # Markers containing " ACT " are searched in upper case.
    # NOTE(review): presumably because act headings appear capitalised in play scripts — confirm.
    if exists_next and " ACT " in next_marker.upper():
        next_marker = next_marker.upper()
    formatted_last_line = next_marker if exists_next else last_line
    # print("formatted_last_line", formatted_last_line)

    # Every non-ACT end delimiter (including the story's last line, where next_marker is "")
    # gets a trailing space appended before searching.
    if " ACT " not in next_marker.upper():
        formatted_last_line += " "
    cur_last_line = formatted_last_line
    cur_last_line = cur_last_line.replace("\"", " ").replace("'", " ")
    # rfind: take the LAST occurrence at or after cur_min_index.
    # NOTE(review): presumably to skip earlier mentions such as a table of contents — confirm.
    first_line_index = story_formatted.rfind(cur_first_line, cur_min_index)
    print("cur_first_line", repr(cur_first_line))
    print("first_line_index", first_line_index)
    print("Snippet around first_line_index:", repr(story_formatted[first_line_index:first_line_index + 200]))

    # For every chapter but the first, the start delimiter is the chapter marker itself, so skip
    # past it; ACT markers are kept as part of the chapter text.
    if i != 0 and " ACT " not in marker.upper():
        first_line_index += len(cur_first_line)
    last_line_index = story_formatted.find(cur_last_line, first_line_index)
    print("cur_last_line", repr(cur_last_line))
    print("last_line_index", last_line_index)
    print("Snippet around last_line_index:", repr(story_formatted[last_line_index-200:last_line_index]))
    
    # Final chapter: include the last line itself, or run to the end of the text if it is not found.
    # NOTE(review): a -1 from rfind/find is not handled for the other chapters; the slice below
    # would then silently use -1 as an index.
    if i+1 == len(markers):
        if last_line_index == -1:
            last_line_index = len(story_formatted)
        else:
            last_line_index += len(cur_last_line) 
    chapter = story[first_line_index:last_line_index].strip()

    # add line number in front of each line
    chapter = chapter.split("\n")
    for j, line in enumerate(chapter):
        chapter[j] = f"LINE {j+1}: {line}"
    chapter = "\n".join(chapter)
    chapters.append(chapter)
    # The end delimiter of this chapter becomes the start delimiter of the next one.
    cur_first_line = cur_last_line
    cur_min_index = last_line_index

cur_first_line 'My father s family name being Pirrip, and my Christian name Philip, my infant tongue could make of both names nothing longer or more explicit than Pip.'
first_line_index 1658
Snippet around first_line_index: 'My father s family name being Pirrip, and my Christian name Philip, my infant tongue could make of both names nothing longer or more explicit than Pip. So, I called myself Pip, and came to be called P'
cur_last_line 'Chapter II. '
last_line_index 11216
Snippet around last_line_index: 'to gaze after him, I wondered whether they thought so too. I looked all round for the horrible young man, and could see no signs of him. But now I was frightened again, and ran home without stopping. '
cur_first_line 'Chapter II. '
first_line_index 11216
Snippet around first_line_index: 'Chapter II. My sister, Mrs. Joe Gargery, was more than twenty years older than I, and had established a great reputation with herself and the neighbours because she had brought me up  by hand.  Having

In [357]:
# check results: show the first and last 100 characters of every chapter
for i, marker in enumerate(markers):
    chapter_text = chapters[i]
    chapter_first_line, chapter_last_line = chapter_text[:100], chapter_text[-100:]
    print(
        f"Chapter {marker}",
        f"first_line: {chapter_first_line}",
        f"last_line: {chapter_last_line}",
        "\n",
        sep="\n",
    )

Chapter Chapter I.
first_line: LINE 1: My father's family name being Pirrip, and my Christian name Philip, my
LINE 2: infant tongue
last_line:  could see no signs of
LINE 158: him. But now I was frightened again, and ran home without stopping.


Chapter Chapter II.
first_line: LINE 1: My sister, Mrs. Joe Gargery, was more than twenty years older than I,
LINE 2: and had establ
last_line: INE 292: I had entered when I ran home last night, shut it, and ran for the
LINE 293: misty marshes.


Chapter Chapter III.
first_line: LINE 1: It was a rimy morning, and very damp. I had seen the damp lying on the
LINE 2: outside of my
last_line: t
LINE 171: I heard of him, I stopped in the mist to listen, and the file was still
LINE 172: going.


Chapter Chapter IV.
first_line: LINE 1: I fully expected to find a Constable in the kitchen, waiting to take me
LINE 2: up. But not 
last_line: e of whom held out a
LINE 290: pair of handcuffs to me, saying, "Here you are, look sharp, come on!"


Chapter Chap

In [358]:
# save to new txt files
# create folder for story if it doesn't exist (exist_ok avoids the check-then-create race)
chapter_dir = f"chapters/{story_name}"
os.makedirs(chapter_dir, exist_ok=True)
for i, marker in enumerate(markers):
    # Write as UTF-8 explicitly so story text with non-ASCII characters is stored the same
    # way on every platform instead of depending on the locale's default codec.
    with open(f"{chapter_dir}/{marker}.txt", "w", encoding="utf-8") as f:
        f.write(chapters[i])
    print(f"Chapter {marker} saved.")

Chapter Chapter I. saved.
Chapter Chapter II. saved.
Chapter Chapter III. saved.
Chapter Chapter IV. saved.
Chapter Chapter V. saved.
Chapter Chapter VI. saved.
Chapter Chapter VII. saved.
Chapter Chapter VIII. saved.
Chapter Chapter IX. saved.
Chapter Chapter X. saved.
Chapter Chapter XI. saved.
Chapter Chapter XII. saved.
Chapter Chapter XIII. saved.
Chapter Chapter XIV. saved.
Chapter Chapter XV. saved.
Chapter Chapter XVI. saved.
Chapter Chapter XVII. saved.
Chapter Chapter XVIII. saved.
Chapter Chapter XIX. saved.
Chapter Chapter XX. saved.
Chapter Chapter XXI. saved.
Chapter Chapter XXII. saved.
Chapter Chapter XXIII. saved.
Chapter Chapter XXIV. saved.
Chapter Chapter XXV. saved.
Chapter Chapter XXVI. saved.
Chapter Chapter XXVII. saved.
Chapter Chapter XXVIII. saved.
Chapter Chapter XXIX. saved.
Chapter Chapter XXX. saved.
Chapter Chapter XXXI. saved.
Chapter Chapter XXXII. saved.
Chapter Chapter XXXIII. saved.
Chapter Chapter XXXIV. saved.
Chapter Chapter XXXV. saved.
Chapter 

## analyze scene

In [359]:
# Pydantic
class SceneInfo(BaseModel):
    """Get information about a scene in the story."""
    # NOTE(review): the docstring and Field descriptions are presumably forwarded to the model
    # as schema text — treat them as prompt wording; confirm before editing.
    scene: str = Field(description="Short title of scene (don't include chapter name or number). Each scene should have a unique title, and each title should be no longer than 6 words.")
    summary: str = Field(description="1 line summary of the scene")
    # firstline / lastline are "LINE n" numbers within the chapter text, not text snippets;
    # get_scene_info turns them into "LINE n:" search strings.
    firstline: int = Field(description="First line number of the current scene in this chapter (should be after last line number of previous scene)")
    lastline: int = Field(description="Last line number of the current scene in this chapter (should be after first line number of current scene)")

class SceneListInfo(BaseModel):
    """Get overall information about the story."""
    # NOTE(review): the docstring above looks copied from StoryInfo; this model actually wraps
    # the list of scenes for one chapter. It may be surfaced to the model as schema text, so
    # confirm before rewording.
    scenelist: list[SceneInfo] = Field(description="List of key scenes in this chapter")

def get_scenelist_prompt(chapter):
    """Build the free-text prompt that asks a model for the key scenes of one chapter.

    The chapter text is interpolated into the instruction, which is followed by a
    textual description of the SceneInfo / SceneListInfo response fields.
    """
    return f'''List the key scenes in this chapter: {chapter}. Format your response like SceneListInfo:
                    class SceneInfo(BaseModel):
                        """Get information about a scene in the story."""
                        scene: str = Field(description="Title of scene (don't include chapter name or number)")
                        summary: str = Field(description="1 line summary of the scene")
                        firstline: int = Field(description="First line number of the current scene in this chapter (should be after last line number of previous scene)")
                        lastline: int = Field(description="Last line number of the current scene in this chapter (should be after first line number of current scene)")

                    class SceneListInfo(BaseModel):
                        """Get overall information about the story."""
                        scenelist: list[SceneInfo] = Field(description="List of key scenes in this chapter")'''

# Wrapper around the default model whose responses are shaped by SceneListInfo;
# get_scene_info uses it whenever model_type is not "gemini".
scene_llm = llm.with_structured_output(SceneListInfo)
# scene_llm = llm2.with_structured_output(SceneListInfo)

In [360]:
# Pydantic
class CharacterInfo(BaseModel):
    """Get information about a character in this scene."""
    # One entry per character in SceneDetails.characters.
    # NOTE(review): the Field descriptions are presumably sent to the model as schema text, so
    # wording changes (including the "from from" typo below) alter the prompt — confirm before editing.
    name: str = Field(description="Full name of the character. Character name should be 5 words or less.")
    role: str = Field(description="Main actions and/or motivations of the character in this scene")
    # Requested range: 0 to 1.
    importance: float = Field(description="Importance of the character in this scene from from 0: not important at all to 1: very important character")
    emotion: str = Field(description="Emotion of the character in this scene, described in a few words or a short phrase")
    # Requested range: -1 to 1.
    sentiment: float = Field(description="Sentiment of the character in this scene from -1 (very negative) to 1 (very positive)")
    quote: str = Field(description="A direct quote from the character in this scene to illustrate their emotions and/or actions")

# Pydantic
class ThemeInfo(BaseModel):
    """Get information about a theme in this scene."""
    # One entry per theme in SceneDetailsTheme.themes; the fields mirror CharacterInfo.
    # NOTE(review): the Field descriptions are presumably sent to the model as schema text —
    # confirm before editing.
    name: str = Field(description="Short general phrase to describe the theme. Theme name should be 3 words or less.")
    role: str = Field(description="Short description of the role of the theme in this scene")
    # Requested range: 0 to 1.
    importance: float = Field(description="Importance of the theme in this scene from from 0: not important at all to 1: very important theme")
    emotion: str = Field(description="Emotion associated with the theme in this scene, described in a few words or a short phrase")
    # Requested range: -1 to 1.
    sentiment: float = Field(description="Sentiment of the theme in this scene from -1 (very negative) to 1 (very positive)")
    quote: str = Field(description="A direct quote in this scene to illustrate the theme")

class SceneDetails(BaseModel):
    """Get overall information about the scene."""
    # Character-centred scene schema; bound to the model as scene_details_llm.
    location: str = Field(description="Physical location of the scene. Location name should be 5 words or less.")
    # importance and conflict are both requested on a 0 to 1 scale.
    importance: float = Field(description="Importance of the scene from 0: not important at all to 1: very important and pivotal scene")
    conflict: float = Field(description="Conflict in the scene from 0: no conflict to 1: high conflict")
    characters: list[CharacterInfo] = Field(description="List of key characters in the scene. Make sure there are no duplicates, and focus on speaking characters. Don't include any unnamed characters.")

class SceneDetailsTheme(BaseModel):
    """Get overall information about the scene."""
    # Theme-centred counterpart of SceneDetails: same scene-level fields, but a list of themes
    # instead of characters. Bound to the model as scene_details_llm_theme.
    location: str = Field(description="Physical location of the scene. Location name should be 5 words or less.")
    # importance and conflict are both requested on a 0 to 1 scale.
    importance: float = Field(description="Importance of the scene from 0: not important at all to 1: very important and pivotal scene")
    conflict: float = Field(description="Conflict in the scene from 0: no conflict to 1: high conflict")
    themes: list[ThemeInfo] = Field(description="List of key themes in the scene. Make sure there are no duplicates but try to make the themes as general as possible.")

def get_scenedetails_prompt(scene):
    """Build the free-text prompt that asks a model for the key details of one scene.

    The scene text is interpolated into the instruction, which is followed by a
    textual description of the CharacterInfo / SceneDetails response fields.
    """
    return f'''List the key details in this scene: {scene}. Format your response like SceneDetails:
                    class CharacterInfo(BaseModel):
                        """Get information about a character in this scene."""
                        name: str = Field(description="Full name of the character. Character name should be 5 words or less.")
                        role: str = Field(description="Main actions and/or motivations of the character in this scene")
                        importance: float = Field(description="Importance of the character in this scene from from 0: not important at all to 1: very important character")
                        emotion: str = Field(description="Emotion of the character in this scene, described in a few words or a short phrase")
                        sentiment: float = Field(description="Sentiment of the character in this scene from -1 (very negative) to 1 (very positive)")
                        quote: str = Field(description="A direct quote from the character in this scene to illustrate their emotions and/or actions")

                    class SceneDetails(BaseModel):
                        """Get overall information about the scene."""
                        location: str = Field(description="Physical location of the scene. Location name should be 5 words or less.")
                        importance: float = Field(description="Importance of the scene from 0: not important at all to 1: very important and pivotal scene")
                        conflict: float = Field(description="Conflict in the scene from 0: no conflict to 1: high conflict")
                        characters: list[CharacterInfo] = Field(description="List of key characters in the scene. Make sure there are no duplicates, and focus on speaking characters.  Don't include any unnamed characters.")'''

# Two structured-output wrappers around the default model: one returns character-centred
# scene details, the other theme-centred ones (chosen via analysis_type in get_scene_details).
scene_details_llm = llm.with_structured_output(SceneDetails)
scene_details_llm_theme = llm.with_structured_output(SceneDetailsTheme)
# scene_details_llm = llm2.with_structured_output(SceneDetails)

In [361]:
def get_scene_info(chapters, chapter_num):
    """Ask the model for the key scenes of one chapter and cut the chapter text into them.

    Args:
        chapters: list of chapter texts whose lines carry a "LINE n: " prefix.
        chapter_num: index into `chapters` (and into the module-level `markers`).

    Returns:
        A tuple (scenes, formatted_chapter_scenes). `scenes` is the list of scene texts
        with the "LINE n: " prefixes removed. `formatted_chapter_scenes` is a list of
        dicts with the keys title, summary, chapter, first_line, last_line and text.

    Relies on the module-level `model_type`, `scene_llm`, `llm3` and `markers`.
    The model call is retried until it succeeds.
    """
    cur_chapter = chapters[chapter_num]

    while True:  # try until valid input
        try:
            if model_type == "gemini":
                prompt = get_scenelist_prompt(cur_chapter)
                chapter_scenes = llm3.invoke(prompt)
                chapter_scenes = format_gemini_response(chapter_scenes)
            else:
                chapter_scenes = scene_llm.invoke(cur_chapter)
            break
        except Exception as e:
            # Catch Exception rather than using a bare except, so a keyboard/kernel interrupt
            # can still stop the loop; report the failure instead of retrying silently.
            print("Exception thrown. Trying again.")
            print("Error:", e)

    if model_type == "gemini":
        chapter_scenes_list = [DotDict(scene) for scene in chapter_scenes['scenelist']]
    else:
        chapter_scenes_list = chapter_scenes.scenelist
    formatted_chapter_scenes = []

    for scene in chapter_scenes_list:
        scene_title = scene.scene
        scene_summary = scene.summary
        scene_first_line = scene.firstline
        scene_last_line = scene.lastline

        print(f"FIRST LINE: {scene_first_line}")
        print(f"LAST LINE: {scene_last_line}")
        print()

        formatted_scene = {
            "title": scene_title,
            "summary": scene_summary,
            "chapter": markers[chapter_num],
            "first_line": scene_first_line,
            "last_line": scene_last_line
        }
        formatted_chapter_scenes.append(formatted_scene)

    # Flattened search copy of the chapter, built once (it does not depend on the scene).
    # Replacements are one character for one character, so indices found here can be used
    # to slice cur_chapter directly.
    formatted_chapter = cur_chapter.replace("\n", " ")
    # replace weird quotes too
    formatted_chapter = formatted_chapter.replace("\"", " ").replace("'", " ")

    # split chapter into scenes based on formatted_chapter_scenes
    scenes = []
    cur_min_index = 0
    for i, scene in enumerate(formatted_chapter_scenes):
        cur_first_line = f"LINE {scene['first_line']}:"
        first_line_index = formatted_chapter.find(cur_first_line, cur_min_index)
        if first_line_index == -1:
            # see if first_line_index overlaps with last_line of previous scene
            other_possible_first_index = formatted_chapter.find(cur_first_line)
            if other_possible_first_index != -1:
                overlap = cur_chapter[other_possible_first_index:cur_min_index]
                print(f"SCENE {i}")
                print("\nPREVIOUS SCENE")
                print(scenes[i-1])
                print("\nOVERLAP")
                print(overlap)
                print()
            # fall back to starting right where the previous scene ended
            first_line_index = cur_min_index
        # The scene ends just before the line that follows its last line.
        cur_last_line = f"LINE {scene['last_line'] + 1}:"
        last_line_index = formatted_chapter.find(cur_last_line, first_line_index)
        if last_line_index == -1:
            # no following line: the scene runs to the end of the chapter
            last_line_index = len(cur_chapter)
        scene_text = cur_chapter[first_line_index:last_line_index].strip()
        # remove line numbers
        scene_text = re.sub(r'LINE \d+: ', '', scene_text)
        scenes.append(scene_text)
        cur_min_index = last_line_index

    # add scene text to formatted_chapter_scenes
    for i, scene in enumerate(formatted_chapter_scenes):
        formatted_chapter_scenes[i]["text"] = scenes[i]

    return scenes, formatted_chapter_scenes

In [362]:
def get_scene_details(scenes, formatted_chapter_scenes, max_retries=None):
    """Query the LLM for per-scene details and merge them into the scene dicts.

    Relies on the notebook globals ``analysis_type``, ``scene_details_llm`` and
    ``scene_details_llm_theme``.

    Args:
        scenes: Scene texts, index-aligned with ``formatted_chapter_scenes``.
        formatted_chapter_scenes: List of scene dicts that already contain
            "text", "first_line" and "last_line". Updated in place.
        max_retries: Number of additional attempts allowed per scene after a
            failed LLM call. ``None`` (the default) keeps retrying until a call
            succeeds.

    Returns:
        ``formatted_chapter_scenes``. Every scene with non-empty text whose LLM
        call succeeded gains "length", "num_lines", "location", "importance",
        "conflict" and "characters". Scenes with empty text, or whose call was
        abandoned after ``max_retries``, are left without those keys.
    """
    use_themes = analysis_type == "theme"

    # Pass 1: one LLM call per non-empty scene; None marks "no details".
    all_scene_details = []
    for i, scene in enumerate(scenes):
        if len(formatted_chapter_scenes[i]["text"]) == 0:
            all_scene_details.append(None)
            continue

        details_llm = scene_details_llm_theme if use_themes else scene_details_llm
        failures = 0
        while True:
            try:
                scene_details = details_llm.invoke(scene)
                break
            # Exception (not a bare except) so Ctrl-C / kernel interrupt still
            # stops the retry loop.
            except Exception as err:
                failures += 1
                if max_retries is not None and failures > max_retries:
                    print(f"Scene {i}: giving up after {failures} failed LLM calls ({err!r}).")
                    scene_details = None
                    break
                print(f"Scene {i}: LLM call failed ({err!r}); retrying.")
        all_scene_details.append(scene_details)

    # Pass 2: copy the returned details onto the scene dicts.
    for i, scene in enumerate(formatted_chapter_scenes):
        text_length = len(scene["text"])
        cur_scene_details = all_scene_details[i]
        if text_length == 0 or cur_scene_details is None:
            # no scene text, or no details could be obtained
            continue

        scene["length"] = text_length
        scene["num_lines"] = scene["last_line"] - scene["first_line"] + 1
        scene["location"] = cur_scene_details.location
        scene["importance"] = cur_scene_details.importance
        scene["conflict"] = cur_scene_details.conflict

        # Same switch as the LLM choice above, so the attribute read always
        # matches the schema that produced the result.
        characters = cur_scene_details.themes if use_themes else cur_scene_details.characters
        scene["characters"] = [
            {
                "name": character.name,
                "role": character.role,
                "importance": character.importance,
                "emotion": character.emotion,
                "sentiment": character.sentiment,
                "quote": character.quote
            }
            for character in characters
        ]

    return formatted_chapter_scenes

In [363]:
# Split every chapter into scenes, enrich them with LLM details, filter out
# unusable scenes/characters and write one JSON file per chapter.
for i, marker in enumerate(markers):
    print(f"Chapter {marker} start.")
    scenes, formatted_chapter_scenes = get_scene_info(chapters, i)
    print(f"Chapter {marker} scenes done.")
    formatted_chapter_scenes = get_scene_details(scenes, formatted_chapter_scenes)
    print(f"Chapter {marker} details done.")

    kept_scenes = []
    for scene in formatted_chapter_scenes:
        # skip scenes without text or without extracted details
        if scene["text"] == "" or "characters" not in scene:
            continue
        # keep only characters with a real quote and a usable name
        scene["characters"] = [
            character
            for character in scene["characters"]
            if "(" not in character["quote"]
            and character["quote"] != ""
            and "<" not in character["name"]
            and "unnamed" not in character["name"].lower()
        ]
        # a scene is only kept if at least one character survived
        if len(scene["characters"]) > 0:
            kept_scenes.append(scene)
    formatted_chapter_scenes = kept_scenes

    # save as json
    with open(f"json/{story_name}/{marker}.json", "w") as f:
        json.dump(formatted_chapter_scenes, f, indent=4)

    print(f"Chapter {marker} done.\n")

Chapter Chapter I. start.
FIRST LINE: 1
LAST LINE: 34

FIRST LINE: 35
LAST LINE: 157

Chapter Chapter I. scenes done.
Chapter Chapter I. details done.
Chapter Chapter I. done.

Chapter Chapter II. start.
FIRST LINE: 1
LAST LINE: 7

FIRST LINE: 8
LAST LINE: 44

FIRST LINE: 45
LAST LINE: 56

FIRST LINE: 57
LAST LINE: 66

FIRST LINE: 67
LAST LINE: 75

FIRST LINE: 76
LAST LINE: 80

FIRST LINE: 81
LAST LINE: 97

FIRST LINE: 98
LAST LINE: 120

FIRST LINE: 121
LAST LINE: 134

FIRST LINE: 135
LAST LINE: 145

FIRST LINE: 146
LAST LINE: 157

FIRST LINE: 158
LAST LINE: 172

FIRST LINE: 173
LAST LINE: 259

FIRST LINE: 260
LAST LINE: 266

FIRST LINE: 267
LAST LINE: 244

FIRST LINE: 245
LAST LINE: 259

FIRST LINE: 260
LAST LINE: 293

SCENE 15

PREVIOUS SCENE
getting a light by easy friction then; to have got one I must have
struck it out of flint and steel, and have made a noise like the very
pirate himself rattling his chains.
As soon as the great black velvet pall outside my little window was
shot

In [364]:
# For each chapter file: number the scenes, rank the characters of every scene
# by importance, then rank the scenes by importance and by conflict.
# All sorts are stable and descending, so ties keep their original order.
for marker in markers:
    print(marker)
    chapter_path = f"json/{story_name}/{marker}.json"
    with open(chapter_path) as f:
        scenes = json.load(f)

    for scene_idx, scene in enumerate(scenes):
        cast = scene["characters"]
        cast_order = sorted(range(len(cast)), key=lambda idx: cast[idx]["importance"], reverse=True)
        for rank, cast_idx in enumerate(cast_order, start=1):
            cast[cast_idx]["importance_rank"] = rank
        # 1-based position of the scene within its chapter
        scene["number"] = scene_idx + 1

    importance_order = sorted(range(len(scenes)), key=lambda idx: scenes[idx]["importance"], reverse=True)
    for rank, scene_idx in enumerate(importance_order, start=1):
        scenes[scene_idx]["importance_rank"] = rank

    conflict_order = sorted(range(len(scenes)), key=lambda idx: scenes[idx]["conflict"], reverse=True)
    for rank, scene_idx in enumerate(conflict_order, start=1):
        scenes[scene_idx]["conflict_rank"] = rank

    # overwrite the chapter file with the ranked scenes
    with open(chapter_path, "w") as f:
        json.dump(scenes, f, indent=4)

Chapter I.
Chapter II.
Chapter III.
Chapter IV.
Chapter V.
Chapter VI.
Chapter VII.
Chapter VIII.
Chapter IX.
Chapter X.
Chapter XI.
Chapter XII.
Chapter XIII.
Chapter XIV.
Chapter XV.
Chapter XVI.
Chapter XVII.
Chapter XVIII.
Chapter XIX.
Chapter XX.
Chapter XXI.
Chapter XXII.
Chapter XXIII.
Chapter XXIV.
Chapter XXV.
Chapter XXVI.
Chapter XXVII.
Chapter XXVIII.
Chapter XXIX.
Chapter XXX.
Chapter XXXI.
Chapter XXXII.
Chapter XXXIII.
Chapter XXXIV.
Chapter XXXV.
Chapter XXXVI.
Chapter XXXVII.
Chapter XXXVIII.
Chapter XXXIX.
Chapter XL.
Chapter XLI.
Chapter XLII.
Chapter XLIII.
Chapter XLIV.
Chapter XLV.
Chapter XLVI.
Chapter XLVII.
Chapter XLVIII.
Chapter XLIX.
Chapter L.
Chapter LI.
Chapter LII.
Chapter LIII.
Chapter LIV.
Chapter LV.
Chapter LVI.
Chapter LVII.
Chapter LVIII.
Chapter LIX.


In [365]:
# Concatenate the per-chapter scene lists (in marker order) into all.json.
all_scenes = []
for marker in markers:
    with open(f"json/{story_name}/{marker}.json") as chapter_file:
        scenes = json.load(chapter_file)
    all_scenes.extend(scenes)

with open(f"json/{story_name}/all.json", "w") as combined_file:
    json.dump(all_scenes, combined_file, indent=4)

In [366]:
# Walk every chapter file and gather each distinct character/theme name (mapped
# to the role recorded at its first appearance) and each distinct location, in
# first-seen order.
all_characters = {}
all_locations = []
for marker in markers:
    with open(f"json/{story_name}/{marker}.json") as chapter_file:
        scenes = json.load(chapter_file)
    for scene in scenes:
        for character in scene["characters"]:
            if character["name"] in all_characters:
                continue
            all_characters[character["name"]] = character["role"]
        if scene["location"] not in all_locations:
            all_locations.append(scene["location"])

In [367]:
# Number of distinct names and locations before alias grouping.
len(all_characters), len(all_locations)

(628, 250)

In [368]:
# Inspect the raw name -> role mapping that is sent to the grouping prompt.
all_characters

{'identity': 'Pip grapples with his identity and origins',
 'loss': 'Pip reflects on the loss of his parents and siblings',
 'imagination': "Pip's childhood imagination shapes his perceptions of his family",
 'nature': "The bleak landscape reflects Pip's emotional state",
 'fear and intimidation': 'The protagonist is threatened by the convict, creating a sense of danger.',
 'survival': "The protagonist must comply with the convict's demands to ensure his safety.",
 'isolation': "The convict's appearance and behavior highlight his alienation from society.",
 'Abuse': 'Highlights the harsh upbringing',
 'Family Dynamics': 'Explores the relationship between siblings',
 'Reputation': 'Shows how Mrs. Joe perceives herself',
 'Abuse and Control': "Highlights Mrs. Joe's domineering nature over Joe and Pip",
 'Identity and Self-Perception': "Pip's reflections on his family and their appearances",
 'Childhood Innocence': "Pip's perspective on adult conflicts",
 'Domestic Tension': 'Conflict bet

In [369]:
# Inspect the raw location list that is sent to the location-grouping prompt.
all_locations

['marsh country churchyard',
 'church porch graveyard',
 "Mrs. Joe's household",
 "Joe's forge kitchen",
 "Joe's house",
 'a small room',
 "blacksmith's home",
 "Mrs. Joe's home",
 'kitchen during tea time',
 "Joe's house during dinner",
 "Joe's kitchen",
 "Pip's home",
 "Joe's house dining room",
 "sister's house",
 'Christmas Eve kitchen',
 'a dark night',
 'misty marshes',
 'a damp morning',
 'marshes near the river',
 'Battery',
 'marshy riverbank',
 'a desolate marshland',
 'misty grassland',
 'Christmas kitchen',
 'a small village church',
 'Christmas dinner gathering',
 "Joe's house dinner",
 'kitchen during dinner',
 'the yard near the marshes',
 'the marshes',
 'churchyard and marshes',
 'old Battery',
 'Marshes near the ditch',
 'marshes at night',
 "Joe's home",
 'prison-ship vicinity',
 'kitchen during a gathering',
 'stairs',
 'churchyard',
 "Joe's forge",
 'village evening school',
 'Chimney corner',
 "Pip's Reflection",
 "Joe's kitchen on market-day",
 "Miss Havisham's h

In [370]:
# Load the combined scene list from this story's all.json.
# NOTE(review): json_data is not referenced in the cells visible here — confirm it is still needed.
with open(f"json/{story_name}/all.json") as f:
    json_data = json.load(f)

# System prompt for character analysis: asks the model to group alternate
# character names under one full name and reply with a JSON dictionary.
# The braces in the example are doubled because this text is later used as a
# ChatPromptTemplate message, where single braces mark template variables.
prompt = """
You are given a list of characters with some duplicates or alternate names.
Your task is to group all the alternate names for each unique character under a single full name, using the descriptions as guidance.
Don't keep characters like "Narrator" unless they are never referred to by another name.
Separate characters that are related to others (e.g., "Bob" and "Ms. Doe (Bob's mom)" and "Bob's sister") into separate entries.

First create a new character list by removing all duplicates and alternate names and only keeping the full name for each character.

Then, output a JSON dictionary where the key is each character in your new list and the value is a list of all possible alternate names. 
All names should be taken directly from the original character list and should only appear once in the output.

For example:
{{
    "Bob Smith": ["Bob", "Bobby", "Robert"],
    "Jane Doe": ["Jane", "Ms. Doe", "Bob's mother"]
}}
"""

# System prompt for theme analysis: same task as the character prompt, but it
# merges theme names and asks for as few themes as possible.
# Braces in the example are doubled for use inside a ChatPromptTemplate message.
theme_prompt = """
You are given a list of themes with some duplicates or alternate names.
Your task is to group all the alternate names for each unique theme under a single name, using the descriptions as guidance.
Your goal is to have as few themes as possible while still capturing all the different ways the themes are described.

First create a new theme list by removing all duplicates and alternate names and only keeping the best description for each theme.

Then, output a JSON dictionary where the key is each theme in your new list and the value is a list of all possible alternate names. 
All names should be taken directly from the original theme list and should only appear once in the output.

For example:
{{
    "Memory": ["Remembering the past", "Recollection", "Nostalgia"],
    "Friendship and Loyalty": ["Friendship and Companionship", "Making Friends", "Bonds of Friendship", "Loyalty and Friendship"]
}}
"""

# System prompt for location grouping: merges alternate or nested location
# names under the larger location and asks for a JSON dictionary.
# Braces in the example are doubled for use inside a ChatPromptTemplate message.
location_prompt = """
You are given a list of locations with some duplicates or alternate names.
Your task is to group all the alternate or similiar names for each unique location under a single location name.

First create a new location list by removing all duplicates and alternate names and only keeping the full name for each location.
If one location is inside another or nearby to it, only keep the larger location (e.g., "Bob's bedroom", "Bob's garden", and "Bob Smith's castle" can be listed under "Bob's house").
Then, output a JSON dictionary where the key is each location in your new list and the value is a list of all possible alternate location names.
All locations should be taken directly from the original location list and should only appear once in the output dict.

For example:
{{
    "Los Angeles": ["LA", "City of Angels", "Cafe in Los Angeles"],
    "Bob's house": ["Bob's bedroom", "Bob Smith's garden", "Bob's castle"]
}}
"""

# Theme analysis swaps in the theme prompt under the same `prompt` name, so the
# rest of the cell does not need to know which analysis is running.
prompt = theme_prompt if analysis_type == "theme" else prompt

# if using llm2 (claude): ask for bare JSON so the reply can be parsed directly
prompt = prompt + "Just output the JSON dictionary as the final result without any additional information."
location_prompt = location_prompt + "Just output the JSON dictionary as the final result without any additional information."

# Each template is a system instruction plus a single human `input` slot.
prompt_template = ChatPromptTemplate.from_messages([
    ("system", prompt),
    ("human", "{input}"),
])
location_prompt_template = ChatPromptTemplate.from_messages([
    ("system", location_prompt),
    ("human", "{input}"),
])

In [387]:
# Fill the grouping prompt with the collected name -> role mapping and send it
# to the model.
final_prompt = prompt_template.invoke(
    f"theme list: {all_characters}" if analysis_type == "theme" else f"character list: {all_characters}"
)
character_llm = llm2  # Claude; set to `llm` to use the OpenAI model instead
output = character_llm.invoke(final_prompt)


In [388]:
# Inspect the raw grouping response before it is parsed.
output

AIMessage(content='{\n    "Identity": ["Identity Confusion", "Identity And Recognition", "Identity Revelation", "Perception Of Self", "Identity And Self-worth", "Identity And Roles", "Self-perception", "Identity And Influence", "Identity And Purpose"],\n    "Loss": ["Loss And Decay", "Loss And Mourning", "Loss And Regret", "Grief And Loss"],\n    "Imagination": ["Imagination And Paranoia"],\n    "Nature": ["Nature\'s Oppression"],\n    "Fear": ["Fear And Intimidation", "Fear Of Alienation", "Fear Of Consequences", "Fear And Survival", "Fear And Dread", "Fear And Recognition", "Fear Of Failure", "Fear Of Change", "Fear Of Observation", "Fear Of Pursuit"],\n    "Survival": [],\n    "Isolation": ["Isolation And Loneliness", "Isolation And Retreat"],\n    "Abuse": [],\n    "Family Dynamics": ["Family", "Family Ties", "Family Expectations", "Parental Expectations", "Parental Guidance"],\n    "Reputation": ["Public Perception", "Public Scrutiny"],\n    "Childhood": ["Childhood Innocence", "C

In [389]:
# Parse the grouping response into a dict: canonical name -> list of aliases.
# `output` is rebound from the LLM message to the parsed dict, so the guard
# makes re-running this cell a no-op instead of failing on `dict.content`.
if not isinstance(output, dict):
    output_formatted = output.content.strip()
    # Drop anything before the opening brace (stray prose or a code fence).
    json_start = output_formatted.find("{")
    if json_start > 0:
        output_formatted = output_formatted[json_start:]
    # A reply cut off by the token limit (or followed by a closing fence) does
    # not end with "}": trim back to the last complete alias list and close it.
    if not output_formatted.endswith("}"):
        last_bracket_ind = output_formatted.rfind("]")
        output_formatted = output_formatted[:last_bracket_ind + 1] + "}"
    # turn into json
    output = json.loads(output_formatted)

In [373]:
# Fill the location-grouping prompt with the collected locations and send it to
# the model.
final_location_prompt = location_prompt_template.invoke(
    f"location list: {all_locations}"
)
location_llm = llm2  # Claude; set to `llm` to use the OpenAI model instead
location_output = location_llm.invoke(final_location_prompt)

In [374]:
# Inspect the raw location-grouping response before it is parsed.
location_output

AIMessage(content='{\n    "Marsh country": ["marsh country churchyard", "misty marshes", "marshes near the river", "Battery", "marshy riverbank", "a desolate marshland", "misty grassland", "the yard near the marshes", "the marshes", "churchyard and marshes", "old Battery", "Marshes near the ditch", "marshes at night", "The old Battery marshes", "church and marshes", "Childhood countryside", "old marshes", "dark marshes at night", "old sluice-house", "sluice-house"],\n    "Church": ["church porch graveyard", "a small village church", "churchyard", "near the churchyard"],\n    "Joe\'s house": ["Mrs. Joe\'s household", "Joe\'s forge kitchen", "Joe\'s house", "a small room", "blacksmith\'s home", "Mrs. Joe\'s home", "kitchen during tea time", "Joe\'s house during dinner", "Joe\'s kitchen", "Pip\'s home", "Joe\'s house dining room", "sister\'s house", "Christmas Eve kitchen", "Christmas kitchen", "Christmas dinner gathering", "Joe\'s house dinner", "kitchen during dinner", "Joe\'s home", "k

In [None]:
# Parse the location response into a dict: canonical location -> list of aliases.
# `location_output` is rebound from the LLM message to the parsed dict, so the
# guard makes re-running this cell a no-op instead of failing on `dict.content`.
if not isinstance(location_output, dict):
    location_output_formatted = location_output.content.strip()
    # Drop anything before the opening brace (stray prose or a code fence).
    location_json_start = location_output_formatted.find("{")
    if location_json_start > 0:
        location_output_formatted = location_output_formatted[location_json_start:]
    # Same repair as for the character/theme response: a truncated reply does
    # not end with "}", so cut back to the last complete list and close it.
    if not location_output_formatted.endswith("}"):
        last_location_bracket_ind = location_output_formatted.rfind("]")
        location_output_formatted = location_output_formatted[:last_location_bracket_ind + 1] + "}"
    # turn into json
    location_output = json.loads(location_output_formatted)

{
    "Identity": ["Identity and Self-Perception", "Identity Crisis", "Identity Revelation", "Identity Search", "Identity transformation"],
    "Loss": ["Loss and Grief", "Loss and Memory", "Loss of appetite"],
    "Imagination": ["Childhood imagination"],
    "Nature": ["Nature's Influence", "Nature's Fury", "Nature's beauty"],
    "Fear and Intimidation": ["Fear of Authority", "Fear of Judgment", "Fear of Loss", "Fear of Recognition", "Fear of the Unknown"],
    "Survival": ["Desperation"],
    "Isolation": ["Isolation and Alienation", "Isolation and Confusion", "Isolation and Distance", "Isolation and Waiting"],
    "Abuse": ["Abuse and Control"],
    "Family Dynamics": ["Family Conflict", "Family Bonds"],
    "Reputation": ["Social Status"],
    "Childhood Innocence": ["Childhood experiences"],
    "Domestic Tension": ["Domestic Conflict", "Domestic Routine"],
    "Guilt and Responsibility": ["Guilt and Conscience", "Guilt and Obligation", "Guilt and Remorse"],
    "Social Class": 

In [380]:
def find_key_from_alias(text, character_dict):
    """Map ``text`` to the canonical key whose alias it contains.

    Keys are tried in dictionary order and, within a key, aliases from longest
    to shortest. An alias matches only as a whole token (no word character
    directly before or after it) and not when it is followed by the possessive
    ``'s`` (so "Mitsuha" does not match "Mitsuha's").

    Args:
        text: The name or location string to normalise.
        character_dict: Mapping of canonical key -> list of alias strings.

    Returns:
        The first key with a matching alias, or ``text`` unchanged if no alias
        matches.
    """
    for key, aliases in character_dict.items():
        for alias in sorted(aliases, key=len, reverse=True):
            if not alias:
                # An empty alias would match almost any text; ignore it.
                continue
            # Lookarounds instead of \b: \b needs a word character on one side,
            # so an alias that starts or ends with punctuation ("Joe Jr.")
            # could never match with it.
            pattern = rf"(?<!\w){re.escape(alias)}(?!\w)(?!'s)"
            if re.search(pattern, text):
                return key
    return text

In [390]:
# Rewrite all.json so every character/theme name and every location uses the
# canonical (capitalized) key chosen by the grouping step.
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)

# Capitalize keys and aliases so lookups are independent of the model's casing.
capitalized_output = {
    string.capwords(key): [string.capwords(name) for name in value]
    for key, value in output.items()
}
capitalized_location_output = {
    string.capwords(key): [string.capwords(name) for name in value]
    for key, value in location_output.items()
}

for scene in scenes:
    # A name that already is a canonical key is kept (capitalized); otherwise
    # it is mapped through the aliases, falling back to the capitalized name.
    for character in scene["characters"]:
        char = string.capwords(character["name"])
        character["name"] = (
            char if char in capitalized_output
            else find_key_from_alias(char, capitalized_output)
        )

    # Same resolution for the scene's location.
    loc = string.capwords(scene["location"])
    scene["location"] = (
        loc if loc in capitalized_location_output
        else find_key_from_alias(loc, capitalized_location_output)
    )

# Save the updated scenes
with open(f"json/{story_name}/all.json", "w") as f:
    json.dump(scenes, f, indent=4)

In [391]:
# Re-read all.json after normalisation: count how many scenes mention each
# name and collect the distinct locations in first-seen order.
all_characters_new = {}
all_locations_new = []
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)
for scene in scenes:
    for character in scene["characters"]:
        all_characters_new[character["name"]] = all_characters_new.get(character["name"], 0) + 1
    if scene["location"] not in all_locations_new:
        all_locations_new.append(scene["location"])

In [393]:
# Number of distinct names and locations after alias grouping.
len(all_characters_new), len(all_locations_new)

(237, 107)

In [394]:
# Inspect the per-name occurrence counts after grouping.
all_characters_new 

{'Identity': 24,
 'Loss': 18,
 'Imagination': 4,
 'Nature': 8,
 'Fear': 31,
 'Survival': 18,
 'Isolation': 63,
 'Abuse': 2,
 'Family Dynamics': 29,
 'Reputation': 15,
 'Childhood': 16,
 'Domestic Tension': 6,
 'Disagreement': 13,
 'Guilt': 14,
 'Mortality': 6,
 'Obligation': 6,
 'Social Class': 53,
 'Authority': 11,
 'Friendship': 71,
 'Sacrifice': 10,
 'Health': 6,
 'Social Dynamics': 17,
 'Helplessness': 1,
 'Humor': 6,
 'Medical Treatment': 2,
 'Theft': 3,
 'Despair': 7,
 'Mist And Confusion': 3,
 'Unexpected Encounters': 5,
 'Trust': 10,
 'Miscommunication': 5,
 'Pain': 5,
 'Celebration': 6,
 'Performance': 3,
 'Gratitude': 8,
 'Moral Judgment': 3,
 'Desire To Escape': 8,
 'Apprehension': 1,
 'Community': 4,
 'Anticipation': 9,
 'Hospitality': 5,
 'Comradeship': 3,
 'Betrayal': 10,
 'Loyalty': 2,
 'Indecision': 11,
 'Leadership': 1,
 'Justice': 5,
 'Truth And Deception': 7,
 'Innocence And Guilt': 1,
 'Frustration': 1,
 'Observation': 10,
 'Dismissal': 2,
 'State Of Mind': 1,
 'Mem

In [395]:
# Inspect the distinct locations after grouping.
all_locations_new

['Marsh Country',
 'Church',
 "Joe's House",
 'A Dark Night',
 'A Damp Morning',
 'Prison-ship Vicinity',
 'Stairs',
 "Joe's Forge",
 'Village',
 'Chimney Corner',
 "Pip's Reflection",
 "Miss Havisham's House",
 "Mr. Pumblechook's",
 "Mr. Wopsle's Great-aunt's",
 'Three Jolly Bargemen',
 'Bedroom At Night',
 'Gloomy Room',
 'Town Hall',
 'River With Sails',
 'Turnpike House',
 "Sister's Interaction",
 "Biddy's Home",
 "Pip's Conversation With Biddy",
 'Summer Afternoon Walk',
 'A Dimly Lit Room',
 "Mr. Jaggers' Office",
 'London',
 "Mr. Trabb's Tailor Shop",
 "Pip's Departure From Village",
 "Barnard's Inn",
 "Mrs. Pocket's Background",
 "Wemmick's Office",
 "Wemmick's Home",
 'Crowded Police-court',
 'Somersetshire',
 'Educational Environment',
 'Gerrard Street, Soho',
 'Dinner Table',
 "Mr. Pocket's Dressing-room",
 'Decorated Chambers',
 'A Small Sitting Room',
 'Theater',
 'Dining Room',
 'Stage-coach Yard',
 'Inside The Coach',
 'On The Coach',
 'Blue Boar',
 'A Street In Town',
 

In [396]:
# Pydantic
# Structured-output schema for the one-line chapter summary.
# NOTE(review): the class docstring and Field description are presumably sent
# to the model as part of the schema — treat wording changes as prompt changes.
class ChapterSummary(BaseModel):
    """Summarize a chapter based on scenes."""
    summary: str = Field(description="A brief, 1-line summary of the chapter")

# Bind the schema to the base model; results are read via their `.summary` field.
chapter_llm = llm.with_structured_output(ChapterSummary)

In [397]:
# Drop every chapter marker whose scene file is empty. `markers` is modified in
# place, after the scan, so the list is never mutated while being iterated.
markers_to_remove = []
for marker in markers:
    with open(f"json/{story_name}/{marker}.json") as chapter_file:
        scenes = json.load(chapter_file)
    if len(scenes) == 0:
        markers_to_remove += [marker]

for empty_marker in markers_to_remove:
    markers.remove(empty_marker)

In [398]:
# Build one summary dict per chapter: size, LLM-written summary, average scene
# importance/conflict, location and character counts, and chapter-level ranks.
chapter_summaries = []
for marker in markers:
    with open(f"json/{story_name}/{marker}.json") as f:
        chapter_scenes = json.load(f)

    with open(f"chapters/{story_name}/{marker}.txt") as f:
        chapter_text = f.read()

    # get summary of chapter from the concatenated scene summaries
    scene_summaries = " ".join(scene["summary"] for scene in chapter_scenes)
    summary = chapter_llm.invoke(scene_summaries)

    chapter_summaries.append({
        "chapter": marker,
        "scenes": len(chapter_scenes),
        "length": len(chapter_text),
        "num_lines": len(chapter_text.split("\n")),
        "summary": summary.summary,
        "importance": [],  # per-scene values; replaced by their mean below
        "conflict": [],    # per-scene values; replaced by their mean below
        "locations": {},
        "characters": {}
    })

# Aggregate over the normalised scenes in all.json. `scenes` stays bound to
# this list because later cells read it.
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)

for scene in scenes:
    chapter_summary = chapter_summaries[markers.index(scene["chapter"])]
    # count scenes per location
    location_counts = chapter_summary["locations"]
    location_counts[scene["location"]] = location_counts.get(scene["location"], 0) + 1
    # collect importance and conflict for averaging
    chapter_summary["importance"].append(scene["importance"])
    chapter_summary["conflict"].append(scene["conflict"])
    # count scenes per character
    character_counts = chapter_summary["characters"]
    for character in scene["characters"]:
        character_counts[character["name"]] = character_counts.get(character["name"], 0) + 1

# now average importance and conflict for each chapter
for chapter_summary in chapter_summaries:
    scene_count = len(chapter_summary["importance"])
    if scene_count == 0:
        # Report and fall back to 0.0 instead of dividing by zero.
        print(f"Chapter {chapter_summary['chapter']} has no scenes.")
        chapter_summary["importance"] = 0.0
        chapter_summary["conflict"] = 0.0
        continue
    chapter_summary["importance"] = round(sum(chapter_summary["importance"]) / scene_count, 2)
    chapter_summary["conflict"] = round(sum(chapter_summary["conflict"]) / scene_count, 2)

# rank each chapter by importance and conflict (stable, descending)
importances = []
conflicts = []
for chapter_summary in chapter_summaries:
    importances.append((chapter_summary["chapter"], chapter_summary["importance"]))
    conflicts.append((chapter_summary["chapter"], chapter_summary["conflict"]))

# add importance_rank to each chapter
sorted_importances = sorted(importances, key=lambda x: x[1], reverse=True)
for k, (j, _) in enumerate(sorted_importances):
    chapter_summaries[markers.index(j)]["importance_rank"] = k+1

# add conflict_rank to each chapter
sorted_conflicts = sorted(conflicts, key=lambda x: x[1], reverse=True)
for k, (j, _) in enumerate(sorted_conflicts):
    chapter_summaries[markers.index(j)]["conflict_rank"] = k+1

In [399]:
# Inspect the per-chapter summaries, averages and ranks.
chapter_summaries

[{'chapter': 'Chapter I.',
  'scenes': 2,
  'length': 11029,
  'num_lines': 158,
  'summary': 'Pip introduces his family background and encounters a fearsome convict in the churchyard.',
  'importance': 1.0,
  'conflict': 0.5,
  'locations': {'Marsh Country': 1, 'Church': 1},
  'characters': {'Identity': 1,
   'Loss': 1,
   'Imagination': 1,
   'Nature': 1,
   'Fear': 1,
   'Survival': 1,
   'Isolation': 1},
  'importance_rank': 1,
  'conflict_rank': 41},
 {'chapter': 'Chapter II.',
  'scenes': 15,
  'length': 21011,
  'num_lines': 293,
  'summary': 'Pip navigates his fear of his sister Mrs. Joe and her temper, while Joe Gargery offers him support; Pip grapples with guilt over a secret involving a convict, leading to tension at home as Mrs. Joe punishes him.',
  'importance': 0.78,
  'conflict': 0.59,
  'locations': {"Joe's House": 13, 'A Dark Night': 1, 'Marsh Country': 1},
  'characters': {'Abuse': 2,
   'Family Dynamics': 4,
   'Reputation': 1,
   'Identity': 1,
   'Childhood': 4,
 

In [400]:
# Assemble the story-level JSON: metadata first, then counts, chapter
# summaries and the full scene list (key order is preserved on output).
all_json = {
    "title": story_info["title"],
    "type": story_info["type"],
}
# Movies credit a director; every other story type credits an author.
creator_field = "director" if all_json["type"] == "Movie" else "author"
all_json[creator_field] = story_info[creator_field]
all_json["year"] = story_info["year"]
# url and image are copied only when present
for optional_field in ("url", "image"):
    if optional_field in story_info:
        all_json[optional_field] = story_info[optional_field]
all_json.update({
    "num_chapters": len(chapters),
    "num_scenes": len(scenes),
    "num_characters": len(all_characters_new),
    "num_locations": len(all_locations_new),
    "chapters": chapter_summaries,
    "scenes": scenes,
})

In [401]:
# Pydantic
# Legend entry for one character: name, a representative quote and a group label.
# NOTE(review): docstring and Field descriptions are presumably sent to the
# model as schema text — treat wording changes as prompt changes.
class CharacterInfo(BaseModel):
    """Get information about a character in the story."""
    name: str = Field(description="Name of character.")
    quote: str = Field(description="Direct quote from the story that the character says or that describes them.")
    group: str = Field(description="Group that this character belongs to (e.g., main characters, upper class, talking animals). There should be at least 2 different groups of characters.")
    # color: str = Field(description="Unique aesthetic RGB color string that represents this character (e.g., rgb(118, 185, 71)). Every character should have a different color. Don't use white and make sure the color is visible against a white background.")
    # explanation: str = Field(description="Explanation why this color was chosen for this character.")

# Legend entry for one theme: name, a representative quote and a group label.
# NOTE(review): `group` is typed Optional but has no explicit default — whether
# the field is required depends on the installed pydantic version; confirm.
class LegendThemeInfo(BaseModel):
    """Get information about a theme in the story."""
    name: str = Field(description="Name of theme.")
    quote: str = Field(description="Direct quote from the story that describes this theme.")
    group: Optional[str] = Field(description="Group that this theme belongs to (e.g., Family, Social Themes, Emotions). There should be at least 2 different groups of themes. Make sure all groups are distinct and don't overlap with each other.")

# Top-level structured-output schema for character analysis: a list of
# CharacterInfo entries, one per character in the supplied list.
class CharacterList(BaseModel):
    """Get overall information about the story."""
    # characterlist: list[CharacterInfo] = Field(description="List of character details. Make sure there is exactly one entry per character in the provided list and no additional characters are added. Choose a different color for each character.")
    characterlist: list[CharacterInfo] = Field(description="List of character details. Make sure there is exactly one entry per character in the provided list and no additional characters are added.")

# Top-level structured-output schema for theme analysis: a list of
# LegendThemeInfo entries, one per theme in the supplied list.
class LegendThemeList(BaseModel):
     """Get overall information about the story."""
     themelist: list[LegendThemeInfo] = Field(description="List of theme details. Make sure there is exactly one entry per theme in the provided list and no additional themes are added.")

# Color assignment for one character: the character's name, an RGB color
# string, and the model's reason for choosing that color.
class ColorInfo(BaseModel):
    """Pick a color + explain why for each character"""
    name: str = Field(description="Name of character.")
    color: str = Field(description="Unique aesthetic RGB color string that represents this character (e.g., rgb(118, 185, 71)). Every character should have a different color. Don't use white and make sure the color is visible against a white background.")
    explanation: str = Field(description="Explanation why this color was chosen for this character.")

# Color assignment for one theme: the theme's name, an RGB color string, and
# the model's reason for choosing that color (theme counterpart of ColorInfo).
class ThemeColorInfo(BaseModel):
    """Pick a color + explain why for each theme"""
    name: str = Field(description="Name of theme.")
    color: str = Field(description="Unique aesthetic RGB color string that represents this theme (e.g., rgb(118, 185, 71)). Every theme should have a different color. Don't use white and make sure the color is visible against a white background.")
    explanation: str = Field(description="Explanation why this color was chosen for this theme.")

# Wrapper schema for the color reply in character mode: a single `colorlist`
# field holding one ColorInfo per character.
class ColorList(BaseModel):
    """List of color info for characters"""
    colorlist: list[ColorInfo] = Field(description="List of color details for each character. Make sure there is exactly one entry per character in the provided list and no additional characters are added. Choose a different color for each character.")

# Wrapper schema for the color reply in theme mode. The field is also named
# `colorlist`, so downstream code can read either wrapper the same way.
class ThemeColorList(BaseModel):
    """List of color info for themes"""
    colorlist: list[ThemeColorInfo] = Field(description="List of color details for each theme. Make sure there is exactly one entry per theme in the provided list and no additional themes are added. Choose a different color for each theme.")

# Pick the pair of structured-output schemas that matches the analysis mode,
# then bind each schema to the base model.
if analysis_type != "theme":
    legend_schema, color_schema = CharacterList, ColorList
else:
    legend_schema, color_schema = LegendThemeList, ThemeColorList

characterlist_llm = llm.with_structured_output(legend_schema)
colorlist_llm = llm.with_structured_output(color_schema)

In [402]:
# Rank every entry of all_characters_new by its count, highest first, and keep
# the 20 highest-ranked entries as top_characters.
ranked_items = sorted(all_characters_new.items(), key=lambda pair: pair[1], reverse=True)
sorted_characters = dict(ranked_items)
top_characters = dict(ranked_items[:20])
top_characters

{'Friendship': 71,
 'Isolation': 63,
 'Social Class': 53,
 'Fear': 31,
 'Family Dynamics': 29,
 'Identity': 24,
 'Loss': 18,
 'Survival': 18,
 'Power Dynamics': 18,
 'Social Dynamics': 17,
 'Secrets': 17,
 'Childhood': 16,
 'Love': 16,
 'Reputation': 15,
 'Memory': 15,
 'Transformation': 15,
 'Guilt': 14,
 'Disagreement': 13,
 'Nostalgia': 13,
 'Unrequited Affection': 12}

In [403]:
# Split the full list of names into consecutive batches of at most `max_chars`
# names; the last batch holds whatever is left over.
max_chars = 20

char_names = list(all_characters_new.keys())
# char_names = list(top_characters.keys())
character_arr = [
    char_names[start:start + max_chars]
    for start in range(0, len(char_names), max_chars)
]

# Show the size of every batch.
for batch in character_arr:
    print(len(batch))


20
20
20
20
20
20
20
20
20
20
20
17


In [404]:
# Load the scene list for this story from its all.json file.
with open(f"json/{story_name}/all.json") as scene_file:
    scenes = json.loads(scene_file.read())

In [405]:
# Ask the LLM for legend details for every batch of names in `character_arr`.
# Successful replies are collected in `all_res`. A batch that still fails after
# `max_chunk_attempts` calls is recorded in `error_chunks` and skipped, instead
# of being retried forever (a persistent error such as a bad API key or an
# over-long prompt would otherwise never terminate).
all_res = []
used_colors = []  # not used by the current prompt; kept so the name stays defined
error_chunks = []
max_chunk_attempts = 5   # upper bound on LLM calls per batch
max_story_chars = 126000  # the story context is cut to its first 126000 characters

for i, arr in enumerate(character_arr):
    # Keep only the scenes that mention at least one name from this batch.
    # This does not depend on the attempt, so it is built once per batch.
    new_scenes = [
        scene for scene in scenes
        if any(character["name"] in arr for character in scene["characters"])
    ]

    # take the first 126000 characters of the serialized scenes
    new_scenes_str = json.dumps(new_scenes)[:max_story_chars]
    prompt = f"characters: {arr}\nstory info: {new_scenes_str}\nyour output should contain exactly the same {len(arr)} characters as in the original list."

    for attempt in range(1, max_chunk_attempts + 1):
        try:
            res = characterlist_llm.invoke(prompt)
        except Exception as e:
            print("Exception thrown. Trying again.")
            print("Error:", e)
            continue
        all_res.append(res)
        print(f"Chunk {i} done.")
        break
    else:
        # Every attempt raised: remember the batch so it can be inspected.
        print(f"Chunk {i} failed after {max_chunk_attempts} attempts. Skipping.")
        error_chunks.append(i)


Chunk 0 done.
Chunk 1 done.
Chunk 2 done.
Chunk 3 done.
Chunk 4 done.
Chunk 5 done.
Chunk 6 done.
Chunk 7 done.
Chunk 8 done.
Chunk 9 done.
Chunk 10 done.
Chunk 11 done.


In [406]:
# Display the chunk indices collected in `error_chunks` by the extraction cell;
# an empty list means no chunk was recorded as failed.
error_chunks

[]

In [407]:
# Flatten the per-chunk replies into one list of entries. Theme replies keep
# their entries in `themelist`, character replies in `characterlist`.
list_attr = "themelist" if analysis_type == "theme" else "characterlist"

characters = []
for chunk_result in all_res:
    entries = getattr(chunk_result, list_attr)
    print(len(entries))
    characters.extend(entries)

len(characters)

20
20
20
20
20
20
20
20
20
20
20
17


237

In [408]:
# Compare the names the LLM returned against the names that were requested.
character_names = [char.name for char in characters]

# Names in the LLM output that were never requested.
for extra in (n for n in character_names if n not in char_names):
    print("not in orig list:", extra)

# Requested names that the LLM left out.
need_to_add = [n for n in char_names if n not in character_names]
for missing in need_to_add:
    print("not in llm output:", missing)

# Record the position of every repeated name (second and later occurrences).
unique_names = []
duplicate_indices = []
for idx, candidate in enumerate(character_names):
    if candidate in unique_names:
        print("duplicate:", candidate, idx)
        duplicate_indices.append(idx)
        continue
    unique_names.append(candidate)

print("need_to_add", len(need_to_add))
print("duplicate_indices", len(duplicate_indices))


need_to_add 0
duplicate_indices 0


In [409]:
# group similar groups
# System prompt asking the model to merge duplicate / alternate group names and
# reply with a JSON dictionary of canonical name -> list of alternate names.
# The braces in the example are doubled so the prompt template treats them as
# literal characters rather than as input variables.

group_prompt = """
You are given a list of groups with some duplicates or alternate names.
Your task is to group all the alternate names for each unique group under a single name.
Your goal is to have as few groups as possible while still capturing all the unique groups.

First create a new group list by removing all duplicates and alternate names and only keeping the best description for each group.

Then, output a JSON dictionary where the key is each group in your new list and the value is a list of all possible alternate names. 
All names should be taken directly from the original group list and should only appear once in the output.

For example:
{{
    "Main Characters": ["Main Characters", "Protagonists", "Heroes"],
    "Elders": ["Elders", "Family Authorities"],
    "Exploration": ["Exploration", "Adventure", "Discovery"],
    "Relationships": ["Relationships", "Friendships"]
}}
"""

# Written for llm2 (claude), but note that this suffix is appended
# unconditionally, whichever model is used.
group_prompt += "Just output the JSON dictionary as the final result without any additional information."

# Two-message template: the instructions above as the system message, and the
# caller-supplied text (the `input` variable) as the human message.
group_prompt_template = ChatPromptTemplate.from_messages([("system", group_prompt), ("human", "{input}")])

In [410]:
# # read in info from final_data.json
# story_name = "yourname_themes"
# analysis_type = "theme"
# with open(f"json/{story_name}/final_data.json") as f:
#     all_json = json.load(f)

# characters = all_json["characters"]

In [411]:
# get all unique groups
# groups = [c['group'] for c in characters]
groups = [c.group for c in characters]
# dict.fromkeys removes repeats while keeping first-seen order, so this list
# (and the prompt built from it) is identical on every kernel restart. The
# iteration order of a set of strings changes between interpreter sessions
# because of hash randomisation.
unique_groups = list(dict.fromkeys(groups))
len(unique_groups), unique_groups

(19,
 ['Supernatural Themes',
  'Life Stages',
  'Family',
  'Conflict',
  'Friendship',
  'Nature',
  'Fear And Danger',
  'Financial',
  'Concern',
  'Social Dynamics',
  'Relationships',
  'Character Interactions',
  'Art and Culture',
  'Health',
  'Life Lessons',
  'Emotions',
  'Social Themes',
  'Psychological Themes',
  'Discovery And Triumph'])

In [412]:
# Fill the grouping template with the list of unique groups, worded for themes
# or for characters depending on the analysis mode, and send it to llm2.
subject = "themes" if analysis_type == "theme" else "characters"
final_prompt = group_prompt_template.invoke(f"list of groups for {subject}: {unique_groups}")

group_llm = llm2
output = group_llm.invoke(final_prompt)

In [413]:
# Inspect the raw reply from the grouping model before it is parsed.
output

AIMessage(content='{\n  "Supernatural Themes": ["Supernatural Themes"],\n  "Life Stages": ["Life Stages"],\n  "Family": ["Family"],\n  "Conflict": ["Conflict"],\n  "Friendship": ["Friendship"],\n  "Nature": ["Nature"],\n  "Fear and Danger": ["Fear And Danger"],\n  "Financial Concerns": ["Financial", "Concern"],\n  "Social Dynamics": ["Social Dynamics"],\n  "Relationships": ["Relationships"],\n  "Character Interactions": ["Character Interactions"],\n  "Art and Culture": ["Art and Culture"],\n  "Health": ["Health"],\n  "Life Lessons": ["Life Lessons"],\n  "Emotions": ["Emotions"],\n  "Social Themes": ["Social Themes"],\n  "Psychological Themes": ["Psychological Themes"],\n  "Discovery and Triumph": ["Discovery And Triumph"]\n}', additional_kwargs={}, response_metadata={'id': 'msg_01MG64gZg7u2X5Kuy1F1gbuE', 'model': 'claude-3-5-sonnet-20240620', 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'input_tokens': 335, 'output_tokens': 212}}, id='run-62961316-b9d3-4a27-baa3-4fb70ba5

In [414]:
# Parse the grouping model's reply into a dict of
# canonical group name -> list of alternate names.
#
# The isinstance guard keeps this cell re-runnable: after the first run
# `output` is already the parsed dict and no longer has a `.content` attribute.
if not isinstance(output, dict):
    output_formatted = output.content
    # The reply may be wrapped in ``` fences or surrounded by prose, so parse
    # only the outermost {...} span when one is present. A bare JSON object
    # matches in full and is parsed exactly as before.
    json_match = re.search(r"\{.*\}", output_formatted, re.DOTALL)
    if json_match:
        output_formatted = json_match.group(0)
    # turn into json
    output = json.loads(output_formatted)
len(output), output

(18,
 {'Supernatural Themes': ['Supernatural Themes'],
  'Life Stages': ['Life Stages'],
  'Family': ['Family'],
  'Conflict': ['Conflict'],
  'Friendship': ['Friendship'],
  'Nature': ['Nature'],
  'Fear and Danger': ['Fear And Danger'],
  'Financial Concerns': ['Financial', 'Concern'],
  'Social Dynamics': ['Social Dynamics'],
  'Relationships': ['Relationships'],
  'Character Interactions': ['Character Interactions'],
  'Art and Culture': ['Art and Culture'],
  'Health': ['Health'],
  'Life Lessons': ['Life Lessons'],
  'Emotions': ['Emotions'],
  'Social Themes': ['Social Themes'],
  'Psychological Themes': ['Psychological Themes'],
  'Discovery and Triumph': ['Discovery And Triumph']})

In [322]:
# Normalise the grouping dictionary with string.capwords on every key and on
# every alternate name.
capitalized_output = {}
for key, value in output.items():
    capitalized_output[string.capwords(key)] = [string.capwords(name) for name in value]

# Rewrite the group of every entry in `characters`.
for c in characters:
    group = string.capwords(c.group)

    if group in capitalized_output:
        # The normalised group is already one of the dictionary keys.
        c.group = group
    else:
        # Otherwise take whatever find_key_from_alias resolves it to
        # (presumably the group itself when no alias matches - verify
        # against the helper's definition).
        c.group = find_key_from_alias(group, capitalized_output)

In [None]:
# # Load the scenes
# with open(f"json/{story_name}/final_data.json") as f:
#     data = json.load(f)
#     characters = data["characters"]

# # Capitalize output and location output
# capitalized_output = {string.capwords(key): [string.capwords(name) for name in value] for key, value in output.items()}

# # Process each scene
# for c in characters:

#     # Process group
#     group = string.capwords(c["group"])
#     found_group = False

#     # Check if the group is a key in the dictionary
#     if group in capitalized_output:
#         found_group = True
#         c["group"] = group  # Capitalize the group

#     # If not found in the dictionary, check the aliases
#     if not found_group:
#         matched_key = find_key_from_alias(group, capitalized_output)
#         if matched_key != group:
#             c["group"] = matched_key
#             found_group = True
#     if not found_group:
#         c["group"] = group

# # Save the updated scenes
# with open(f"json/{story_name}/final_data.json", "w") as f:
#     json.dump(data, f, indent=4)

In [416]:
# now add colors for top characters
# A single LLM call assigns a color to every name in `top_characters`. The
# call is attempted at most `max_color_attempts` times; if every attempt
# raises, the cell stops with an error instead of looping forever.
top_char_names = list(top_characters.keys())

all_res_colors = []
max_color_attempts = 5

# Keep only the scenes that mention at least one of the top names. This does
# not depend on the attempt, so it is built once.
new_scenes = [
    scene for scene in scenes
    if any(character["name"] in top_char_names for character in scene["characters"])
]

# take the first 126000 characters of the serialized scenes
new_scenes_str = json.dumps(new_scenes)[:126000]
prompt = f"characters: {top_char_names}\nstory info: {new_scenes_str}\nyour output should contain exactly the same {len(top_char_names)} characters as in the original list. Make sure each character is assigned a unique color."

last_error = None
for attempt in range(1, max_color_attempts + 1):
    try:
        res = colorlist_llm.invoke(prompt)
    except Exception as e:
        print("Exception thrown. Trying again.")
        print("Error:", e)
        last_error = e
        continue
    all_res_colors.append(res)
    break
else:
    # No attempt succeeded: surface the last error rather than carrying on
    # with an empty color list.
    raise RuntimeError(
        f"Color assignment failed after {max_color_attempts} attempts."
    ) from last_error

In [417]:
# Flatten the color replies into one list, then record the position of every
# entry whose color was already used by an earlier entry.
colorlist = [entry for reply in all_res_colors for entry in reply.colorlist]

unique_colors = []
duplicate_colors = []
for idx, entry in enumerate(colorlist):
    if entry.color in unique_colors:
        print("duplicate color:", entry.color, idx)
        duplicate_colors.append(idx)
    else:
        unique_colors.append(entry.color)

duplicate color: rgb(255, 20, 147) 19


In [418]:
# Index the color assignments by name so they can be looked up per entry.
colorinfo = {
    c.name: {"color": c.color, "explanation": c.explanation}
    for c in colorlist
}

colorinfo

{'Friendship': {'color': 'rgb(255, 223, 186)',
  'explanation': 'This warm, soft color represents the warmth and comfort found in friendship.'},
 'Isolation': {'color': 'rgb(169, 169, 169)',
  'explanation': 'A muted gray symbolizes the feelings of loneliness and desolation associated with isolation.'},
 'Social Class': {'color': 'rgb(128, 0, 128)',
  'explanation': 'Purple signifies the complexities and nuances of social class, often associated with wealth and status.'},
 'Fear': {'color': 'rgb(255, 0, 0)',
  'explanation': 'Red is a powerful color that evokes feelings of fear and danger.'},
 'Family Dynamics': {'color': 'rgb(255, 204, 204)',
  'explanation': 'A light pink represents the complexities and emotional ties within family relationships.'},
 'Identity': {'color': 'rgb(0, 128, 255)',
  'explanation': 'Blue symbolizes the search for identity and self-discovery.'},
 'Loss': {'color': 'rgb(75, 0, 130)',
  'explanation': 'Indigo reflects the deep sorrow and melancholy associated 

In [419]:
# remove duplicates
# Keep only the first entry for each name. Filtering by name rather than by
# the positions saved in `duplicate_indices` selects the same entries on the
# first run (those positions are the later occurrences of a repeated name),
# and stays correct if this cell is run again on the already filtered list,
# where the old positions would point at different, valid entries.
seen_names = set()
deduped_characters = []
for char in characters:
    if char.name in seen_names:
        continue
    seen_names.add(char.name)
    deduped_characters.append(char)
characters = deduped_characters
print(len(characters))

237


In [420]:
# Build the final legend records: every entry keeps its name, quote and group,
# and gets its color + explanation when one was assigned (empty strings
# otherwise).
no_color = {"color": "", "explanation": ""}

character_info = []
for c in characters:
    assigned = colorinfo.get(c.name, no_color)
    character_info.append({
        "name": c.name,
        "quote": c.quote,
        "group": c.group,
        "color": assigned["color"],
        "explanation": assigned["explanation"],
    })

len(character_info), character_info

(237,
 [{'name': 'Identity',
   'quote': 'I called myself Pip, and came to be called Pip.',
   'group': 'Emotions',
   'color': 'rgb(0, 128, 255)',
   'explanation': 'Blue symbolizes the search for identity and self-discovery.'},
  {'name': 'Loss',
   'quote': 'I never saw my father or my mother.',
   'group': 'Emotions',
   'color': 'rgb(75, 0, 130)',
   'explanation': 'Indigo reflects the deep sorrow and melancholy associated with loss.'},
  {'name': 'Imagination',
   'quote': 'my first fancies regarding what they were like were unreasonably derived from their tombstones.',
   'group': 'Emotions',
   'color': '',
   'explanation': ''},
  {'name': 'Nature',
   'quote': 'the dark flat wilderness beyond the churchyard... was the marshes.',
   'group': 'Social Themes',
   'color': '',
   'explanation': ''},
  {'name': 'Fear',
   'quote': 'Oh! Don\'t cut my throat, sir," I pleaded in terror.',
   'group': 'Emotions',
   'color': 'rgb(255, 0, 0)',
   'explanation': 'Red is a powerful color

In [421]:
# Names listed in `need_to_add` still need a legend record: append a blank one
# for each, filed under the "misc" group.
character_info.extend(
    {"name": name, "quote": "", "group": "misc", "color": "", "explanation": ""}
    for name in need_to_add
)

len(character_info), character_info

(237,
 [{'name': 'Identity',
   'quote': 'I called myself Pip, and came to be called Pip.',
   'group': 'Emotions',
   'color': 'rgb(0, 128, 255)',
   'explanation': 'Blue symbolizes the search for identity and self-discovery.'},
  {'name': 'Loss',
   'quote': 'I never saw my father or my mother.',
   'group': 'Emotions',
   'color': 'rgb(75, 0, 130)',
   'explanation': 'Indigo reflects the deep sorrow and melancholy associated with loss.'},
  {'name': 'Imagination',
   'quote': 'my first fancies regarding what they were like were unreasonably derived from their tombstones.',
   'group': 'Emotions',
   'color': '',
   'explanation': ''},
  {'name': 'Nature',
   'quote': 'the dark flat wilderness beyond the churchyard... was the marshes.',
   'group': 'Social Themes',
   'color': '',
   'explanation': ''},
  {'name': 'Fear',
   'quote': 'Oh! Don\'t cut my throat, sir," I pleaded in terror.',
   'group': 'Emotions',
   'color': 'rgb(255, 0, 0)',
   'explanation': 'Red is a powerful color

In [422]:
# in final data, for top characters, replace existing color + explanation with new ones

# read in final_data.json
# with open(f"json/{story_name}/final_data.json") as f:
#     final_data = json.load(f)

# character_names = [char["name"] for char in character_info]

# for i, c in enumerate(final_data["characters"]):
#     # see if matching character in character_info
#     if c["name"] in character_names:
#         char_index = character_names.index(c["name"])
#         final_data["characters"][i]["color"] = character_info[char_index]["color"]
#         final_data["characters"][i]["explanation"] = character_info[char_index]["explanation"]
#     else:
#         # clear color and explanation
#         final_data["characters"][i]["color"] = ""
#         final_data["characters"][i]["explanation"] = ""

# # save final_data.json
# with open(f"json/{story_name}/final_data.json", "w") as f:
#     json.dump(final_data, f, indent=4)

In [423]:
# Pydantic
# One legend entry for a location: its name, a quote describing it, and an
# emoji used to represent it.
class LocationInfo(BaseModel):
    """Get information about a location in the story."""
    name: str = Field(description="Name of location.")
    quote: str = Field(description="Direct quote from the story that describes this location")
    emoji: str = Field(description="Emoji that represents this location (e.g., 🏰). Make sure each location has a different emoji.")

# Wrapper schema for the structured location reply: a single `locationList`
# field holding one LocationInfo per location.
class LocationList(BaseModel):
    """Get overall information about the story."""
    locationList: list[LocationInfo] = Field(description="List of location details. One entry per location.")

# Bind the location schema to the base model so replies come back as
# LocationList objects.
locationList_llm = llm.with_structured_output(LocationList)

In [424]:
# # read in scenes from all_json
# story_name = "gatsby_themes"
# with open(f"json/{story_name}/final_data.json") as f:
#     all_json = json.load(f)

In [425]:
# locations = all_json["locations"]
# all_locations_new = [loc["name"] for loc in locations]
# character_info = all_json["characters"]

In [426]:
# Ask the LLM for a quote and an emoji for every known location, using the
# first 126000 characters of the serialized story data as context.
cropped_json = json.dumps(all_json)[:126000]
location_prompt = f"locations: {all_locations_new}\nstory info: {cropped_json}\nyour output should contain exactly the same {len(all_locations_new)} locations as in the original list."

# The call is attempted at most `max_location_attempts` times; if every
# attempt raises, the cell stops with an error instead of looping forever.
max_location_attempts = 5
last_error = None
for attempt in range(1, max_location_attempts + 1):
    try:
        res = locationList_llm.invoke(location_prompt)
    except Exception as e:
        print("Exception thrown. Trying again.")
        print("Error:", e)
        last_error = e
        continue
    break
else:
    # No attempt succeeded: surface the last error rather than leaving `res`
    # unset (or stale) for the cells that follow.
    raise RuntimeError(
        f"Location extraction failed after {max_location_attempts} attempts."
    ) from last_error

In [427]:
# Convert the structured location reply into plain dicts ready for JSON.
location_info = [
    {"name": loc.name, "quote": loc.quote, "emoji": loc.emoji}
    for loc in res.locationList
]

print(len(location_info))
print(location_info)

107
[{'name': 'Marsh Country', 'quote': 'Ours was the marsh country, down by the river, within, as the river wound, twenty miles of the sea.', 'emoji': '🌾'}, {'name': 'Church', 'quote': 'A fearful man, all in coarse grey, with a great iron on his leg.', 'emoji': '⛪'}, {'name': "Joe's House", 'quote': 'My sister, Mrs. Joe Gargery, was more than twenty years older than I.', 'emoji': '🏠'}, {'name': 'A Dark Night', 'quote': 'I was afraid to sleep, even if I had been inclined, for I knew that at the first faint dawn of morning I must rob the pantry.', 'emoji': '🌌'}, {'name': 'A Damp Morning', 'quote': 'It was a rimy morning, and very damp.', 'emoji': '🌫️'}, {'name': 'Prison-ship Vicinity', 'quote': "The hulks are prison-ships, right 'cross th' meshes.", 'emoji': '⚓'}, {'name': 'Stairs', 'quote': 'I took the advice. My sister, Mrs. Joe, throwing the door wide open, and finding an obstruction behind it, immediately divined the cause.', 'emoji': '🪜'}, {'name': "Joe's Forge", 'quote': "Joe's fo

In [428]:
# Attach the finished character and location legends to the story data.
all_json.update(characters=character_info, locations=location_info)
all_json

{'title': 'Great Expectations',
 'type': 'Book',
 'author': 'Charles Dickens',
 'year': 1861,
 'url': 'https://www.gutenberg.org/ebooks/1400',
 'image': 'https://images.booksense.com/images/132/726/9781532726132.jpg',
 'num_chapters': 59,
 'num_scenes': 335,
 'num_characters': 237,
 'num_locations': 107,
 'chapters': [{'chapter': 'Chapter I.',
   'scenes': 2,
   'length': 11029,
   'num_lines': 158,
   'summary': 'Pip introduces his family background and encounters a fearsome convict in the churchyard.',
   'importance': 1.0,
   'conflict': 0.5,
   'locations': {'Marsh Country': 1, 'Church': 1},
   'characters': {'Identity': 1,
    'Loss': 1,
    'Imagination': 1,
    'Nature': 1,
    'Fear': 1,
    'Survival': 1,
    'Isolation': 1},
   'importance_rank': 1,
   'conflict_rank': 41},
  {'chapter': 'Chapter II.',
   'scenes': 15,
   'length': 21011,
   'num_lines': 293,
   'summary': 'Pip navigates his fear of his sister Mrs. Joe and her temper, while Joe Gargery offers him support; Pip

In [429]:
# Write the complete story data to final_data.json with 4-space indentation.
with open(f"json/{story_name}/final_data.json", "w") as out_file:
    out_file.write(json.dumps(all_json, indent=4))