In [2]:
import os
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
import json
from pydantic import BaseModel, Field
import string
import re
from typing import Optional 
from typing_extensions import TypedDict

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
try:
    with open("../secrets.json") as f:
        secrets = json.load(f)
    open_ai_key = secrets["openai"]
    anthropic_key = secrets["anthropic"]
    google_key = secrets["google"]
    os.environ["OPENAI_API_KEY"] = open_ai_key
    os.environ["ANTHROPIC_API_KEY"] = anthropic_key
    os.environ["GOOGLE_API_KEY"] = google_key
    print("API key loaded.")
except FileNotFoundError:
    print("Secrets file not found. YOU NEED THEM TO RUN THIS.")

API key loaded.


In [45]:
def format_gemini_response(res):
    output_formatted = res.content[7:-3]
    # turn into json
    output = json.loads(output_formatted)
    return output

class DotDict:
    # allow dot notation for dicts
    def __init__(self, dictionary):
        for key, value in dictionary.items():
            setattr(self, key, value)

In [4]:
model_type = "default"
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)
llm2 = ChatAnthropic(model_name="claude-3-5-sonnet-20240620", temperature=0.1, api_key=os.getenv("ANTHROPIC_API_KEY"))

# model_type = "gemini"
# llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.1)

In [40]:
# Pydantic
class StoryInfo(BaseModel):
    """Get overall information about the story."""
    firstline: str = Field(description="First line of the story")
    lastline: Optional[str] = Field(description="Last line of the story or first volume")
    markers: list[str] = Field(description="List of markers to split the story into chapters. If there are multiple volumes, focus on the first volume.")

def get_marker_prompt(story):
    prompt = f'''List the markers needed to split this story into chapters: {story}. Format your response like Story Info:
    class StoryInfo(BaseModel):
        """Get overall information about the story."""
        firstline: str = Field(description="First line of the story")
        lastline: Optional[str] = Field(description="Last line of the story or first volume")
        markers: list[str] = Field(description="List of markers to split the story into chapters. Keep all punctuation marks and capitalization. If there are multiple volumes, focus on the first volume.")
    '''
    return prompt

structured_llm = llm2.with_structured_output(StoryInfo)
# structured_llm = llm.with_structured_output(StoryInfo)

## split text into chapters

In [6]:
story_info = {
    "title": "The Tragedy of Romeo and Juliet",
    "type": "Book",
    "author": "William Shakespeare",
    "year": 1597,
    "url": "https://www.gutenberg.org/ebooks/1112",
    "image": "https://m.media-amazon.com/images/I/71jUCfPUkDL._AC_UF894,1000_QL80_.jpg",
}

In [7]:
# read in txt file from scripts folder
story_name = "romeo"
with open(f"scripts/{story_name}.txt") as f:
    story = f.read()

In [8]:
# focus on the first volume if there are multiple volumes
if "END OF VOL." in story:
    volumes = story.split("END OF VOL.")
    story = volumes[0]

In [9]:
# split story into 500000 character chunks
print(len(story))
story_chunks = [story[i:i + 500000] for i in range(0, len(story), 500000)]
print(len(story_chunks))

184524
1


In [10]:
all_res = []

In [41]:
# for i, chunk in enumerate(story_chunks):
#     invalid_input = True
#     while invalid_input: # try until valid input
#         # see if exception is thrown
#         try:
#             res = structured_llm.invoke(chunk)
#             all_res.append(res)
#             print(f"Chunk {i} done.")
#             invalid_input = False
#         except:
#             print("Exception thrown. Trying again.")
#             invalid_input = True

#     if i < len(story_chunks) - 1:
#         # Pause for a specified time (e.g., 60 seconds) to avoid rate limits
#         time.sleep(120)  # Adjust the sleep duration as per your rate limit requirements

invalid_input = True
while invalid_input: # try until valid input
    # see if exception is thrown
    try:
        res = structured_llm.invoke(story)
        invalid_input = False
    except Exception as e:
        print("Exception thrown. Trying again.")
        print("Error:", e)
        invalid_input = True

In [42]:
# first_line = all_res[0].firstline
# last_line = all_res[-1].lastline
# markers = [marker for res in all_res for marker in res.markers]

first_line = res.firstline
last_line = res.lastline
markers = res.markers

In [43]:
# remove any marker that doesn't include one of these words
exclude_words = ["part", "preface", "contents", "by", "end"]
markers = [m for m in markers if not any(word in m.lower() for word in exclude_words)]

In [44]:
markers

['Prologue', 'Act I', 'Act II', 'Act III', 'Act IV', 'Act V']

In [14]:
# save as json 
story_json = {
    "first_line": first_line,
    "last_line": last_line,
    "markers": markers
}
if not os.path.exists(f"json/{story_name}"):
    os.makedirs(f"json/{story_name}")
with open(f"json/{story_name}/summary.json", "w") as f:
    json.dump(story_json, f, indent=4)

In [15]:
story = re.sub(r' {2,}', ' ', story)  # Replace consecutive spaces with one space
story = re.sub(r'\n{2,}', '\n', story)  # Replace consecutive newlines with one newline
story = re.sub(r' ?\n ?', '\n', story)  # Replace space + newline or newline + space with one newline
story = story.replace("“", "\"").replace("”", "\"").replace("‘", "'").replace("’", "'")

In [16]:
story_formatted = story.replace("\n", " ")
# replace weird quotes too
story_formatted = story_formatted.replace("\"", " ").replace("'", " ")

In [17]:
# read in markers from summary.json
with open(f"json/{story_name}/summary.json") as f:
    summary = json.load(f)
    markers = summary["markers"]
    first_line = summary["first_line"]
    last_line = summary["last_line"]

In [18]:
story_formatted[-100:]

'help produce our new eBooks, and how to subscribe to our email newsletter to hear about new eBooks. '

In [19]:
story_formatted.find(last_line)
last_line

'For never was a story of more woe Than this of Juliet and her Romeo.'

In [62]:
# split story into chapters based on markers
chapters = []
cur_first_line = first_line.replace("\"", " ").replace("'", " ")
cur_min_index = 0
for i, marker in enumerate(markers):
    exists_next = i+1 < len(markers)
    next_marker = markers[i+1] if exists_next else ""
    
    if exists_next and "ACT" in next_marker.upper():
        next_marker = next_marker.upper()
    formatted_last_line = next_marker if exists_next else last_line
    if "ACT" not in next_marker.upper():
        formatted_last_line += " "
    cur_last_line = formatted_last_line
    cur_last_line = cur_last_line.replace("\"", " ").replace("'", " ")
    first_line_index = story_formatted.find(cur_first_line, cur_min_index)

    if i != 0 and "ACT" not in marker.upper():
        first_line_index += len(cur_first_line)
    last_line_index = story_formatted.find(cur_last_line, first_line_index)
    if i+1 == len(markers):
        if last_line_index == -1:
            last_line_index = len(story_formatted)
        else:
            last_line_index += len(cur_last_line) 
    chapter = story[first_line_index:last_line_index].strip()

    # add line number in front of each line
    chapter = chapter.split("\n")
    for j, line in enumerate(chapter):
        chapter[j] = f"LINE {j+1}: {line}"
    chapter = "\n".join(chapter)
    chapters.append(chapter)
    cur_first_line = cur_last_line
    cur_min_index = last_line_index

In [63]:
# check results
for i, marker in enumerate(markers):
    print(f"Chapter {marker}")
    chapter_first_line = chapters[i][:100]
    chapter_last_line = chapters[i][-100:]
    print("first_line:", chapter_first_line)
    print("last_line:", chapter_last_line)
    print("\n")

Chapter Prologue
first_line: LINE 1: Two households, both alike in dignity,
LINE 2: In fair Verona, where we lay our scene,
LINE 
last_line: AL DISTRIBUTION INCLUDES BY ANY
LINE 23: SERVICE THAT CHARGES FOR DOWNLOAD TIME OR FOR MEMBERSHIP.>>


Chapter Act I
first_line: LINE 1: ACT I. Scene I.
LINE 2: Verona. A public place.
LINE 3: Enter Sampson and Gregory (with swor
last_line: them power, time means, to meet,
LINE 807: Temp'ring extremities with extreme sweet.
LINE 808: Exit.


Chapter Act II
first_line: LINE 1: ACT II. Scene I.
LINE 2: A lane by the wall of Capulet's orchard.
LINE 3: Enter Romeo alone.
last_line: L DISTRIBUTION INCLUDES BY ANY
LINE 753: SERVICE THAT CHARGES FOR DOWNLOAD TIME OR FOR MEMBERSHIP.>>


Chapter Act III
first_line: LINE 1: ACT III. Scene I.
LINE 2: A public place.
LINE 3: Enter Mercutio, Benvolio, and Men.
LINE 4:
last_line: L DISTRIBUTION INCLUDES BY ANY
LINE 884: SERVICE THAT CHARGES FOR DOWNLOAD TIME OR FOR MEMBERSHIP.>>


Chapter Act IV
first_line: LIN

In [64]:
# save to new txt files
# create folder for story if it doesn't exist
if not os.path.exists(f"chapters/{story_name}"):
    os.makedirs(f"chapters/{story_name}")
for i, marker in enumerate(markers):
    with open(f"chapters/{story_name}/{marker}.txt", "w") as f:
        f.write(chapters[i])
        print(f"Chapter {marker} saved.")

Chapter Prologue saved.
Chapter Act I saved.
Chapter Act II saved.
Chapter Act III saved.
Chapter Act IV saved.
Chapter Act V saved.


## analyze scene

In [65]:
# Pydantic
class SceneInfo(BaseModel):
    """Get information about a scene in the story."""
    scene: str = Field(description="Title of scene (don't include chapter name or number)")
    summary: str = Field(description="1 line summary of the scene")
    firstline: int = Field(description="First line number of the current scene in this chapter (should be after last line number of previous scene)")
    lastline: int = Field(description="Last line number of the current scene in this chapter (should be after first line number of current scene)")

class SceneListInfo(BaseModel):
    """Get overall information about the story."""
    scenelist: list[SceneInfo] = Field(description="List of key scenes in this chapter")

def get_scenelist_prompt(chapter):
    scene_prompt = f'''List the key scenes in this chapter: {chapter}. Format your response like SceneListInfo:
                    class SceneInfo(BaseModel):
                        """Get information about a scene in the story."""
                        scene: str = Field(description="Title of scene (don't include chapter name or number)")
                        summary: str = Field(description="1 line summary of the scene")
                        firstline: int = Field(description="First line number of the current scene in this chapter (should be after last line number of previous scene)")
                        lastline: int = Field(description="Last line number of the current scene in this chapter (should be after first line number of current scene)")

                    class SceneListInfo(BaseModel):
                        """Get overall information about the story."""
                        scenelist: list[SceneInfo] = Field(description="List of key scenes in this chapter")'''
    return scene_prompt

scene_llm = llm.with_structured_output(SceneListInfo)

In [66]:
# Pydantic
class CharacterInfo(BaseModel):
    """Get information about a character in this scene."""
    name: str = Field(description="Full name of the character.")
    role: str = Field(description="Main actions and/or motivations of the character in this scene")
    importance: float = Field(description="Importance of the character in this scene from from 0: not important at all to 1: very important character")
    emotion: str = Field(description="Emotion of the character in this scene, described in a few words or a short phrase")
    sentiment: float = Field(description="Sentiment of the character in this scene from -1 (very negative) to 1 (very positive)")
    quote: str = Field(description="A direct quote from the character in this scene to illustrate their emotions and/or actions")

class SceneDetails(BaseModel):
    """Get overall information about the scene."""
    location: str = Field(description="Location of the scene")
    importance: float = Field(description="Importance of the scene from 0: not important at all to 1: very important and pivotal scene")
    conflict: float = Field(description="Conflict in the scene from 0: no conflict to 1: high conflict")
    characters: list[CharacterInfo] = Field(description="List of key characters in the scene. Make sure there are no duplicates, and focus on speaking characters.")

def get_scenedetails_prompt(scene):
    scene_prompt = f'''List the key details in this scene: {scene}. Format your response like SceneDetails:
                    class CharacterInfo(BaseModel):
                        """Get information about a character in this scene."""
                        name: str = Field(description="Full name of the character.")
                        role: str = Field(description="Main actions and/or motivations of the character in this scene")
                        importance: float = Field(description="Importance of the character in this scene from from 0: not important at all to 1: very important character")
                        emotion: str = Field(description="Emotion of the character in this scene, described in a few words or a short phrase")
                        sentiment: float = Field(description="Sentiment of the character in this scene from -1 (very negative) to 1 (very positive)")
                        quote: str = Field(description="A direct quote from the character in this scene to illustrate their emotions and/or actions")

                    class SceneDetails(BaseModel):
                        """Get overall information about the scene."""
                        location: str = Field(description="Location of the scene")
                        importance: float = Field(description="Importance of the scene from 0: not important at all to 1: very important and pivotal scene")
                        conflict: float = Field(description="Conflict in the scene from 0: no conflict to 1: high conflict")
                        characters: list[CharacterInfo] = Field(description="List of key characters in the scene. Make sure there are no duplicates, and focus on speaking characters.")'''
    return scene_prompt

scene_details_llm = llm.with_structured_output(SceneDetails)

In [67]:
def get_scene_info(chapters, chapter_num):
    cur_chapter = chapters[chapter_num]

    invalid_input = True
    while invalid_input: # try until valid input
        # see if exception is thrown
        try:
            if model_type == "gemini":
                prompt = get_scenelist_prompt(cur_chapter)
                chapter_scenes = llm.invoke(prompt)
                chapter_scenes = format_gemini_response(chapter_scenes)
            chapter_scenes = scene_llm.invoke(cur_chapter)
            invalid_input = False
        except:
            invalid_input = True
            
    if model_type == "gemini":
        chapter_scenes_list = [DotDict(scene) for scene in chapter_scenes['scenelist']]
    else: 
        chapter_scenes_list = chapter_scenes.scenelist
    formatted_chapter_scenes = []

    for scene in chapter_scenes_list:
        scene_title = scene.scene
        scene_summary = scene.summary
        scene_first_line = scene.firstline
        scene_last_line = scene.lastline

        print(f"FIRST LINE: {scene_first_line}")
        print(f"LAST LINE: {scene_last_line}")
        print()

        formatted_scene = {
            "title": scene_title,
            "summary": scene_summary,
            "chapter": markers[chapter_num],
            "first_line": scene_first_line,
            "last_line": scene_last_line
        }
        formatted_chapter_scenes.append(formatted_scene)
        formatted_chapter = cur_chapter.replace("\n", " ")
        # replace weird quotes too
        formatted_chapter = formatted_chapter.replace("\"", " ").replace("'", " ")

    # split chapter into scenes based on formatted_chapter_scenes
    scenes = []
    cur_min_index = 0
    for i, scene in enumerate(formatted_chapter_scenes):
        cur_first_line = f"LINE {scene['first_line']}:"
        # cur_first_line = formatted_chapter_scenes[i]["first_line"].strip("...").replace("\"", " ").replace("'", " ")
        first_line_index = formatted_chapter.find(cur_first_line, cur_min_index)
        if first_line_index == -1:
            # see if first_line_index overlaps with last_line of previous scene
            other_possible_first_index = formatted_chapter.find(cur_first_line)
            if other_possible_first_index != -1:
                overlap = cur_chapter[other_possible_first_index:cur_min_index]
                print(f"SCENE {i}")
                print("\nPREVIOUS SCENE")
                print(scenes[i-1])
                print("\nOVERLAP")
                print(overlap)
                print()
            first_line_index = cur_min_index
        cur_last_line = f"LINE {scene['last_line'] + 1}:"
        # cur_last_line = formatted_chapter_scenes[i]["last_line"].strip("...").replace("\"", " ").replace("'", " ")
        last_line_index = formatted_chapter.find(cur_last_line, first_line_index)
        # last_line_index += len(cur_last_line)
        if last_line_index == -1:
            last_line_index = len(cur_chapter)
        scene_text = cur_chapter[first_line_index:last_line_index].strip()
        # remove line numbers
        scene_text = re.sub(r'LINE \d+: ', '', scene_text)
        scenes.append(scene_text)
        cur_min_index = last_line_index

    # add scene text to formatted_chapter_scenes
    for i, scene in enumerate(formatted_chapter_scenes):
        formatted_chapter_scenes[i]["text"] = scenes[i]

    return scenes, formatted_chapter_scenes

In [68]:
def get_scene_details(scenes, formatted_chapter_scenes):
    all_scene_details = []
    for i, scene in enumerate(scenes):
        scene_text = formatted_chapter_scenes[i]["text"]
        if len(scene_text) == 0:
            all_scene_details.append([])
            continue
        # get scene details for each scene
        cur_scene = scene
        if model_type == 'gemini':
            prompt = get_scenedetails_prompt(cur_scene)
            scene_details = llm.invoke(prompt)
            scene_details = format_gemini_response(scene_details)
            scene_details = DotDict(scene_details)
        else:
            scene_details = scene_details_llm.invoke(cur_scene)
        all_scene_details.append(scene_details)

    # add details to formatted_chapter_scenes
    for i, scene in enumerate(formatted_chapter_scenes):
        scene_text = formatted_chapter_scenes[i]["text"]
        text_length = len(scene_text)
        num_lines = formatted_chapter_scenes[i]["last_line"] - formatted_chapter_scenes[i]["first_line"] + 1
        # first_line = formatted_chapter_scenes[i]["first_line"]
        # first_line_length = len(first_line)
        # last_line = formatted_chapter_scenes[i]["last_line"]
        # last_line_length = len(last_line)

        # if text_length == 0 or text_length < first_line_length + last_line_length:
        if text_length == 0:
            # no scene text or scene text is too short
            continue
        
        cur_scene_details = all_scene_details[i]
        formatted_chapter_scenes[i]["length"] = text_length
        formatted_chapter_scenes[i]["num_lines"] = num_lines
        formatted_chapter_scenes[i]["location"] = cur_scene_details.location
        formatted_chapter_scenes[i]["importance"] = cur_scene_details.importance
        formatted_chapter_scenes[i]["conflict"] = cur_scene_details.conflict
        formatted_character_info = []

        characters = cur_scene_details.characters
        if model_type == "gemini":
            characters = [DotDict(character) for character in characters]
        for character in characters:
            character_info = {
                "name": character.name,
                "role": character.role,
                "importance": character.importance,
                "emotion": character.emotion,
                "sentiment": character.sentiment,
                "quote": character.quote
            }
            formatted_character_info.append(character_info)
        formatted_chapter_scenes[i]["characters"] = formatted_character_info

    return formatted_chapter_scenes

In [69]:
# get scene info for each chapter
for i, marker in enumerate(markers):
    print(f"Chapter {marker} start.")
    scenes, formatted_chapter_scenes = get_scene_info(chapters, i)
    print(f"Chapter {marker} scenes done.")
    formatted_chapter_scenes = get_scene_details(scenes, formatted_chapter_scenes)
    print(f"Chapter {marker} details done.")

    # remove scenes with no text
    formatted_chapter_scenes = [scene for scene in formatted_chapter_scenes if scene["text"] != "" and "characters" in scene]
    # remove characters with no quote
    for scene in formatted_chapter_scenes:
        scene["characters"] = [character for character in scene["characters"] if "(" not in character["quote"] != "" and "<" not in character["name"]]
        # print(scene)
    
    # remove scenes with no characters
    formatted_chapter_scenes = [scene for scene in formatted_chapter_scenes if len(scene["characters"]) > 0]

    # save as json
    with open(f"json/{story_name}/{marker}.json", "w") as f:
        json.dump(formatted_chapter_scenes, f, indent=4)

    print(f"Chapter {marker} done.\n")

Chapter Prologue start.
FIRST LINE: 1
LAST LINE: 14

Chapter Prologue scenes done.
Chapter Prologue details done.
Chapter Prologue done.

Chapter Act I start.
FIRST LINE: 1
LAST LINE: 260

FIRST LINE: 261
LAST LINE: 303

FIRST LINE: 376
LAST LINE: 491

FIRST LINE: 492
LAST LINE: 616

FIRST LINE: 617
LAST LINE: 784

Chapter Act I scenes done.
Chapter Act I details done.
Chapter Act I done.

Chapter Act II start.
FIRST LINE: 1
LAST LINE: 51

FIRST LINE: 52
LAST LINE: 264

FIRST LINE: 265
LAST LINE: 365

FIRST LINE: 366
LAST LINE: 614

FIRST LINE: 615
LAST LINE: 703

FIRST LINE: 704
LAST LINE: 745

Chapter Act II scenes done.
Chapter Act II details done.
Chapter Act II done.

Chapter Act III start.
FIRST LINE: 1
LAST LINE: 232

FIRST LINE: 233
LAST LINE: 385

FIRST LINE: 386
LAST LINE: 569

FIRST LINE: 574
LAST LINE: 614

FIRST LINE: 615
LAST LINE: 877

Chapter Act III scenes done.
Chapter Act III details done.
Chapter Act III done.

Chapter Act IV start.
FIRST LINE: 3
LAST LINE: 132

FIR

In [70]:
# rank each scene by importance
# and within each scene, the characters

for i, marker in enumerate(markers):
    print(marker)
    with open(f"json/{story_name}/{marker}.json") as f:
        scenes = json.load(f)
    importances = []
    conflicts = []
    # extract importance from each scene
    for j, scene in enumerate(scenes):
        importances.append((j, scene["importance"]))
        conflicts.append((j, scene["conflict"]))
        # now extract character importances
        character_importances = []
        for k, character in enumerate(scene["characters"]):
            character_importances.append((k, character["importance"]))
        # sort character importances
        sorted_character_importances = sorted(character_importances, key=lambda x: x[1], reverse=True)
        # add importance_rank to each character
        for k, (l, _) in enumerate(sorted_character_importances):
            scenes[j]["characters"][l]["importance_rank"] = k+1
        # add number of each scene
        scenes[j]["number"] = j+1
    # sort importances
    sorted_importances = sorted(importances, key=lambda x: x[1], reverse=True)
    # add importance_rank to each scene
    for k, (j, _) in enumerate(sorted_importances):
        scenes[j]["importance_rank"] = k+1
    # sort conflicts
    sorted_conflicts = sorted(conflicts, key=lambda x: x[1], reverse=True)
    # add conflict_rank to each scene
    for k, (j, _) in enumerate(sorted_conflicts):
        scenes[j]["conflict_rank"] = k+1
    
    # print results
    # for j, scene in enumerate(scenes):
    #     print(scenes[j]["title"], scenes[j]["importance_rank"])
    #     print("--------------------------------")
    #     for k, character in enumerate(scene["characters"]):
    #         print(character["name"], character["importance_rank"])
    #     print()

    # save as json
    with open(f"json/{story_name}/{marker}.json", "w") as f:
        json.dump(scenes, f, indent=4)

Prologue
Act I
Act II
Act III
Act IV
Act V


In [71]:
# combine all chapters into one json file
all_scenes = []
for i, marker in enumerate(markers):
    with open(f"json/{story_name}/{marker}.json") as f:
        scenes = json.load(f)
    all_scenes += scenes

with open(f"json/{story_name}/all.json", "w") as f:
    json.dump(all_scenes, f, indent=4)

In [72]:
# collect all unique characters and locations for each chapter
all_characters = {}
all_locations = []
for i, marker in enumerate(markers):
    with open(f"json/{story_name}/{marker}.json") as f:
        scenes = json.load(f)
    for scene in scenes:
        for character in scene["characters"]:
            char = character["name"]
            if char not in all_characters:
                all_characters[char] = character["role"]
        location = scene["location"]
        if location not in all_locations:
            all_locations.append(location)

In [73]:
len(all_characters), len(all_locations)

(32, 13)

In [74]:
all_characters

{'Romeo Montague': 'A young man in love with Juliet, driven by passion and fate.',
 'Juliet Capulet': 'A young woman deeply in love with Romeo, torn between family loyalty and love.',
 'Lord Capulet': "Juliet's father, who is determined to uphold family honor.",
 'Lord Montague': "Romeo's father, who is also embroiled in the family feud.",
 'Sampson': 'Provokes a fight with the Montagues and expresses disdain for them.',
 'Gregory': "Sampson's companion who engages in banter and encourages the fight.",
 'Abram': 'A servant of the Montagues who gets drawn into the conflict.',
 'Balthasar': 'Another servant of the Montagues who supports Abram in the conflict.',
 'Benvolio': 'Attempts to keep the peace and stop the fighting.',
 'Tybalt': 'Aggressively confronts Benvolio and seeks to escalate the conflict.',
 'Prince Escalus': 'Intervenes to stop the fighting and restore order.',
 'Old Capulet': 'Joins the fray, seeking to defend his honor.',
 'Old Montague': "Responds to Capulet's aggress

In [75]:
all_locations

['Verona',
 'Verona, a public place',
 'A Street',
 "Capulet's house",
 'A street',
 "A lane by the wall of Capulet's orchard",
 "Capulet's orchard",
 "Friar Laurence's cell",
 'A public place',
 "Juliet's chamber",
 'Mantua, a street',
 "Verona, Friar Laurence's cell",
 'Verona, a churchyard']

In [76]:
# Assuming your JSON file is stored at 'all.json'
with open(f"json/{story_name}/all.json") as f:
    json_data = json.load(f)

prompt = """
You are given a list of characters with some duplicates or alternate names.
Your task is to group all the alternate names for each unique character under a single full name, using the descriptions as guidance.
Don't keep characters like "Narrator" unless they are never referred to by another name.
Separate characters that are related to others (e.g., "Bob" and "Ms. Doe (Bob's mom)") into separate entries.

First create a new character list by removing all duplicates and alternate names and only keeping the full name for each character.

Then, output a JSON dictionary where the key is each character in your new list and the value is a list of all possible alternate names. 
All names should be taken directly from the original character list and should only appear once in the output.

For example:
{{
    "Bob Smith": ["Bob", "Bobby", "Robert"],
    "Jane Doe": ["Jane", "Ms. Doe", "Bob's mother"]
}}
"""

location_prompt = """
You are given a list of locations with some duplicates or alternate names.
Your task is to group all the alternate or similiar names for each unique location under a single location name.

First create a new location list by removing all duplicates and alternate names and only keeping the full name for each location.
If one location is inside another or nearby to it, only keep the larger location (e.g., "Bob's bedroom", "Bob's garden", and "Bob Smith's castle" can be listed under "Bob's house").
Then, output a JSON dictionary where the key is each location in your new list and the value is a list of all possible alternate location names.
All locations should be taken directly from the original location list and should only appear once in the output dict.

For example:
{{
    "Los Angeles": ["LA", "City of Angels", "Cafe in Los Angeles"],
    "Bob's house": ["Bob's bedroom", "Bob Smith's garden", "Bob's castle"]
}}
"""

# if using llm2 (claude)
prompt += "Just output the JSON dictionary as the final result without any additional information."
location_prompt += "Just output the JSON dictionary as the final result without any additional information."

prompt_template = ChatPromptTemplate.from_messages([("system", prompt), ("human", "{input}")])
location_prompt_template = ChatPromptTemplate.from_messages([("system", location_prompt), ("human", "{input}")])

In [77]:
# Create the LLM chain with the prompt template and model

# Run the chain by passing the characters and JSON data as inputs
final_prompt = prompt_template.invoke(f"character list: {all_characters}")
# character_llm = llm.with_structured_output(None, method="json_mode") if model_type == "gpt" else llm
character_llm = llm2
# character_llm = llm
output = character_llm.invoke(final_prompt)


In [78]:
output

AIMessage(content='{\n    "Romeo Montague": ["Romeo"],\n    "Juliet Capulet": ["Juliet"],\n    "Lord Capulet": ["Capulet", "Old Capulet", "Father Capulet", "Father"],\n    "Lord Montague": ["Old Montague", "Montague"],\n    "Sampson": [],\n    "Gregory": [],\n    "Abram": [],\n    "Balthasar": [],\n    "Benvolio": [],\n    "Tybalt": [],\n    "Prince Escalus": [],\n    "County Paris": ["Paris"],\n    "Servant (the Clown)": ["Servingman"],\n    "Lady Capulet": ["Capulet\'s Wife", "Mother"],\n    "Nurse": [],\n    "Mercutio": [],\n    "Friar Laurence": [],\n    "Peter": [],\n    "Apothecary": [],\n    "Friar John": []\n}', additional_kwargs={}, response_metadata={'id': 'msg_01B2a5ZbFkaGysH2xbXAvd89', 'model': 'claude-3-5-sonnet-20240620', 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'input_tokens': 996, 'output_tokens': 240}}, id='run-06352b96-8fb4-4914-aa43-fdeb8f406756-0', usage_metadata={'input_tokens': 996, 'output_tokens': 240, 'total_tokens': 1236})

In [79]:
# Create the LLM chain with the prompt template and model

# Run the chain by passing the characters and JSON data as inputs
final_location_prompt = location_prompt_template.invoke(f"location list: {all_locations}")
# location_llm = llm.with_structured_output(None, method="json_mode") if model_type == "gpt" else llm
location_llm = llm2
# location_llm = llm
location_output = location_llm.invoke(final_location_prompt)

In [80]:
location_output

AIMessage(content='{\n  "Verona": ["Verona", "Verona, a public place", "A Street", "A street", "A public place", "Verona, a churchyard"],\n  "Capulet\'s house": ["Capulet\'s house", "A lane by the wall of Capulet\'s orchard", "Capulet\'s orchard", "Juliet\'s chamber"],\n  "Friar Laurence\'s cell": ["Friar Laurence\'s cell", "Verona, Friar Laurence\'s cell"],\n  "Mantua": ["Mantua, a street"]\n}', additional_kwargs={}, response_metadata={'id': 'msg_01CZuk2bZfU3bj1sr9q5gAdH', 'model': 'claude-3-5-sonnet-20240620', 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'input_tokens': 353, 'output_tokens': 149}}, id='run-c9a81a4f-5b12-49aa-900c-c943a1e11731-0', usage_metadata={'input_tokens': 353, 'output_tokens': 149, 'total_tokens': 502})

In [81]:
# if model_type == "claude":
output_formatted = output.content
# turn into json
output = json.loads(output_formatted)
location_output_formatted = location_output.content
# turn into json
location_output = json.loads(location_output_formatted)

In [82]:
# replace characters and locations with new names
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)

capitalized_output = {string.capwords(key): [string.capwords(name) for name in value] for key, value in output.items()}
capitalized_location_output = {string.capwords(key): [string.capwords(name) for name in value] for key, value in location_output.items()}

for scene in scenes:
    for character in scene["characters"]:
        char = string.capwords(character["name"])
        found_char = False
        if char in capitalized_output:
            character["name"] = char # capitalize the character name
            found_char = True
        for key, value in capitalized_output.items():
            # else find the key that contains the character
            for v in value:
                if char in v or v in char:
                    character["name"] = key
                    found_char = True
                    break
        if not found_char:
            character["name"] = char

    loc = string.capwords(scene["location"])
    found_loc = False
    if loc in capitalized_location_output:
        scene["location"] = loc # capitalize the location
        found_loc = True
    for key, value in capitalized_location_output.items():
        # else find the key that contains the location
        for v in value:
            if loc in v or v in loc:
                scene["location"] = key
                found_loc = True
                break
    if not found_loc:
        scene["location"] = loc

# save as json
with open(f"json/{story_name}/all.json", "w") as f:
    json.dump(scenes, f, indent=4)

In [83]:
# now list all characters and locations again
# collect all unique characters and locations for each chapter
all_characters_new = {}
all_locations_new = []
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)
for scene in scenes:
    for character in scene["characters"]:
        char = character["name"]
        if char not in all_characters_new:
            all_characters_new[char] = character["role"]
    location = scene["location"]
    if location not in all_locations_new:
        all_locations_new.append(location)

In [84]:
len(all_characters_new), len(all_locations_new)

(20, 4)

In [85]:
all_characters_new

{'Lord Montague': 'A young man in love with Juliet, driven by passion and fate.',
 'Lord Capulet': 'A young woman deeply in love with Romeo, torn between family loyalty and love.',
 'Sampson': 'Provokes a fight with the Montagues and expresses disdain for them.',
 'Gregory': "Sampson's companion who engages in banter and encourages the fight.",
 'Abram': 'A servant of the Montagues who gets drawn into the conflict.',
 'Balthasar': 'Another servant of the Montagues who supports Abram in the conflict.',
 'Benvolio': 'Attempts to keep the peace and stop the fighting.',
 'Tybalt': 'Aggressively confronts Benvolio and seeks to escalate the conflict.',
 'Prince Escalus': 'Intervenes to stop the fighting and restore order.',
 'Romeo Montague': 'Expresses his melancholy about love and the conflict around him.',
 'Lady Capulet': 'Father of Juliet, discussing her marriage with Paris',
 'County Paris': "Suitor to Juliet, seeking Capulet's approval for marriage",
 'Servant (the Clown)': 'Messenger

In [86]:
all_locations_new

["Friar Laurence's Cell", 'Verona', 'Mantua', "Capulet's House"]

In [87]:
# Pydantic
class ChapterSummary(BaseModel):
    """Summarize a chapter based on scenes."""
    summary: str = Field(description="A brief, 1-line summary of the chapter")

chapter_llm = llm.with_structured_output(ChapterSummary)

In [88]:
# create summary json of each chapter
chapter_summaries = []
for i, marker in enumerate(markers):
    with open(f"json/{story_name}/{marker}.json") as f:
        scenes = json.load(f)
        
    with open(f"chapters/{story_name}/{marker}.txt") as f:
        chapter = f.read()

    # get summary of chapter
    scene_summaries = " ".join([scene["summary"] for scene in scenes])
    summary = chapter_llm.invoke(scene_summaries)

    chapter_summary = {
        "chapter": marker,
        "scenes": len(scenes),
        "length": len(chapter),
        "num_lines": len(chapter.split("\n")),
        "summary": summary.summary,
        "importance": [],
        "conflict": [],
        "locations": {},
        "characters": {}
    }
    chapter_summaries.append(chapter_summary)
    
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)
    for scene in scenes:
        chapter = scene["chapter"]
        chapter_summary = chapter_summaries[markers.index(chapter)]
        # add location to chapter summary
        location = scene["location"]
        chapter_summary_locations = chapter_summary["locations"]
        if location not in chapter_summary_locations:
            chapter_summary_locations[location] = 1
        else:
            chapter_summary_locations[location] += 1
        # add importance to chapter summary
        importance = scene["importance"]
        chapter_summary["importance"].append(importance)
        # add conflict to chapter summary
        conflict = scene["conflict"]
        chapter_summary["conflict"].append(conflict)
        # add characters to chapter summary
        characters = scene["characters"]
        for character in characters:
            char = character["name"]
            chapter_summary_characters = chapter_summary["characters"]
            if char not in chapter_summary_characters:
                chapter_summary_characters[char] = 1
            else:
                chapter_summary_characters[char] += 1

# now average importance and conflict for each chapter
for chapter_summary in chapter_summaries:
    if len(chapter_summary["importance"]) == 0:
       print(f"Chapter {chapter_summary['chapter']} has no scenes.")
    chapter_summary["importance"] = round(sum(chapter_summary["importance"]) / len(chapter_summary["importance"]), 2)
    chapter_summary["conflict"] = round(sum(chapter_summary["conflict"]) / len(chapter_summary["conflict"]), 2)

# rank each chapter by importance and conflict
importances = []
conflicts = []
for chapter_summary in chapter_summaries:
    importances.append((chapter_summary["chapter"], chapter_summary["importance"]))
    conflicts.append((chapter_summary["chapter"], chapter_summary["conflict"]))
    
# sort importances
sorted_importances = sorted(importances, key=lambda x: x[1], reverse=True)
# add importance_rank to each chapter
for k, (j, _) in enumerate(sorted_importances):
    chapter_summaries[markers.index(j)]["importance_rank"] = k+1

# sort conflicts
sorted_conflicts = sorted(conflicts, key=lambda x: x[1], reverse=True)
# add conflict_rank to each chapter
for k, (j, _) in enumerate(sorted_conflicts):
    chapter_summaries[markers.index(j)]["conflict_rank"] = k+1

In [89]:
chapter_summaries

[{'chapter': 'Prologue',
  'scenes': 1,
  'length': 1333,
  'num_lines': 23,
  'summary': 'The Chorus sets the stage for the tragic tale of two rival families and the fate of their ill-fated offspring.',
  'importance': 1.0,
  'conflict': 1.0,
  'locations': {"Friar Laurence's Cell": 1},
  'characters': {'Lord Montague': 2, 'Lord Capulet': 2},
  'importance_rank': 1,
  'conflict_rank': 1},
 {'chapter': 'Act I',
  'scenes': 5,
  'length': 40992,
  'num_lines': 808,
  'summary': "In this chapter, a brawl between the Capulet and Montague servants escalates, leading to the Prince's decree against further violence; Count Paris seeks Juliet's hand in marriage, while Romeo and his friends attend the Capulet feast, where Romeo and Juliet meet and fall in love, unaware of their families' feud.",
  'importance': 0.84,
  'conflict': 0.64,
  'locations': {'Verona': 1, 'Mantua': 2, "Capulet's House": 2},
  'characters': {'Sampson': 1,
   'Gregory': 1,
   'Abram': 1,
   'Balthasar': 1,
   'Benvolio'

In [90]:
all_json = {}
all_json["title"] = story_info["title"]
all_json["type"] = story_info["type"]
all_json["author"] = story_info["author"]
all_json["year"] = story_info["year"]
all_json["url"] = story_info["url"]
all_json["image"] = story_info["image"]
all_json["num_chapters"] = len(chapters)
all_json["num_scenes"] = len(scenes)
all_json["num_characters"] = len(all_characters_new)
all_json["num_locations"] = len(all_locations_new)
all_json["chapters"] = chapter_summaries
all_json["scenes"] = scenes

In [91]:
all_json

{'title': 'The Tragedy of Romeo and Juliet',
 'type': 'Book',
 'author': 'William Shakespeare',
 'year': 1597,
 'url': 'https://www.gutenberg.org/ebooks/1112',
 'image': 'https://m.media-amazon.com/images/I/71jUCfPUkDL._AC_UF894,1000_QL80_.jpg',
 'num_chapters': 6,
 'num_scenes': 25,
 'num_characters': 20,
 'num_locations': 4,
 'chapters': [{'chapter': 'Prologue',
   'scenes': 1,
   'length': 1333,
   'num_lines': 23,
   'summary': 'The Chorus sets the stage for the tragic tale of two rival families and the fate of their ill-fated offspring.',
   'importance': 1.0,
   'conflict': 1.0,
   'locations': {"Friar Laurence's Cell": 1},
   'characters': {'Lord Montague': 2, 'Lord Capulet': 2},
   'importance_rank': 1,
   'conflict_rank': 1},
  {'chapter': 'Act I',
   'scenes': 5,
   'length': 40992,
   'num_lines': 808,
   'summary': "In this chapter, a brawl between the Capulet and Montague servants escalates, leading to the Prince's decree against further violence; Count Paris seeks Juliet'

In [92]:
# save as json
with open(f"json/{story_name}/final_data.json", "w") as f:
    json.dump(all_json, f, indent=4)