## set up

In [6]:
# imports
import os
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
import json
from pydantic import BaseModel, Field
import string
import re
from typing import Optional 
import asyncio
import aiofiles

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Define a semaphore to limit concurrency
SEMAPHORE_LIMIT = 20
semaphore = asyncio.Semaphore(SEMAPHORE_LIMIT)

async def semaphore_wrapper(func, *args):
    """Wrap async function with semaphore to limit concurrency"""
    async with semaphore:
        return await func(*args)

In [8]:
# api keys
try:
    with open("../secrets.json") as f:
        secrets = json.load(f)
    open_ai_key = secrets["openai"]
    anthropic_key = secrets["anthropic"]
    google_key = secrets["google"]
    os.environ["OPENAI_API_KEY"] = open_ai_key
    os.environ["ANTHROPIC_API_KEY"] = anthropic_key
    os.environ["GOOGLE_API_KEY"] = google_key
    print("API key loaded.")
except FileNotFoundError:
    print("Secrets file not found. YOU NEED THEM TO RUN THIS.")

API key loaded.


In [9]:
## make sure all folders are created inside notebooks folder
if not os.path.exists("scripts"):
    os.makedirs("scripts")
if not os.path.exists("json"):
    os.makedirs("json")
if not os.path.exists("chapters"):
    os.makedirs("chapters")

In [10]:
# NOT VERY RELEVANT
def format_gemini_response(res):
    output_formatted = res.content[7:-3]
    # turn into json
    output = json.loads(output_formatted)
    return output

class DotDict:
    # allow dot notation for dicts
    def __init__(self, dictionary):
        # check if dictionary is a string
        if isinstance(dictionary, str):
            # if it is a string, convert it to a dictionary
            print(dictionary)
            dictionary = json.loads(dictionary)
        for key, value in dictionary.items():
            setattr(self, key, value)

In [11]:
model_type = "default"
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)
# llm = ChatOpenAI(model="gpt-4o", temperature=0.1)
llm2 = ChatAnthropic(model_name="claude-3-5-sonnet-20240620", temperature=0.1, max_tokens_to_sample=4096, api_key=os.getenv("ANTHROPIC_API_KEY"))

# NOT USING RIGHT NOW
# model_type = "gemini"
llm3 = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0.1)

# safety_settings={
#     HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
# }

# # original method
# og_generate = ChatGoogleGenerativeAI._generate

# # patch

# ChatGoogleGenerativeAI._generate = partial(llm._generate, safety_settings=safety_settings)

In [50]:
# Pydantic
class StoryInfo(BaseModel):
    """Get overall information about the story."""
    numChapters: int = Field(description="Number of chapters in the story")
    firstline: str = Field(description="First line of the story")
    lastline: Optional[str] = Field(description="Last line of the story or first volume")
    markers: list[str] = Field(description="List of markers to split the story into chapters, as formatted in the table of contents. Make sure the formatting matches the markers in the actual text (including the word 'chapter' if applicable). Keep all punctuation marks and capitalization, and use the full chapter names. If there are multiple volumes, focus on the first volume.")

def get_marker_prompt(story):
    prompt = f'''List the markers needed to split this story into chapters: {story}. The marker usually appears after "(本章完)“. An example of a marker looks like "第2章 贾夫人仙逝扬州城 冷子兴演说荣国府". PLEASE DO NOT MIX UP CHAPTERS AND MAKE THE MARKERS APPEAR IN THE ORDER THEY APPEAR IN THE TEXT. DO NOT REARRANGE CHAPTER NUMBERS AND CHAPTER TITLES. Format your response like Story Info:
    class StoryInfo(BaseModel):
        """Get overall information about the story."""
        firstline: str = Field(description="First line of the story")
        lastline: Optional[str] = Field(description="Last line of the story or first volume")
        markers: list[str] = Field(description="List of markers to split the story into chapters. Keep all punctuation marks and capitalization.")
    '''
    return prompt

# structured_llm = llm2.with_structured_output(StoryInfo)
structured_llm = llm.with_structured_output(StoryInfo)

## split text into chapters

In [51]:
# NOTE: Make sure your story file is in the scripts folder (as a txt file)
# read in txt file from scripts folder (inside notebooks folder)
og_story_name = "红楼梦" # name of the story (should match the name of the file (e.g., type "color" for color.txt)
story_name = og_story_name
analysis_type = "character" # character, theme

In [52]:
# optional, but this part needs to be manually looked up
story_info = {
    "title": "红楼梦 Dream of the Red Chamber",
    "type": "小说 Novel",
    "author": "曹雪芹",
    "year": 1791,
    "url": "https://github.com/hankinghu/literature-books/blob/master/%E7%BA%A2%E6%A5%BC%E6%A2%A6.txt",
    "image": "covers/红楼梦.png",
}

In [53]:
# read in the script
with open(f"scripts/{story_name}.txt") as f:
    story = f.read()

In [54]:
if analysis_type == "theme":
    story_name += "_themes"
elif analysis_type == "2":
    story_name += "_2"

In [55]:
if not os.path.exists(f"json/{story_name}"):
    os.makedirs(f"json/{story_name}")

In [56]:
# focus on the first volume if there are multiple volumes
if "END OF VOL." in story:
    volumes = story.split("END OF VOL.")
    story = volumes[0]
elif "VOLUME II." in story:
    volumes = story.split("VOLUME II.")
    story = volumes[0]

In [57]:
# split story into chunks
char_chunk = 120000
story_len = len(story)
print(story_len)
if story_len > char_chunk:
    # story_chunks = [story[i:i + char_chunk] for i in range(0, len(story), char_chunk)]
    # take 0:char_chunk/2 and -char_chunk/2: for the first chunk
    chunk = story[0:char_chunk//2] + story[-char_chunk//2:]
    story_chunks = [chunk]
else: # if story is less than 120k characters, use the whole story
    story_chunks = [story]
print(len(story_chunks))

151930
1


In [58]:
all_res = []
for i, chunk in enumerate(story_chunks):
    invalid_input = True
    while invalid_input: # try until valid input
        # see if exception is thrown
        try:
            # if model_type == 'gemini':
            #     prompt = get_marker_prompt(story)
            #     markers = llm3.invoke(prompt)
            #     res = format_gemini_response(markers)
            #     print(res)
            #     res = DotDict(res['Story Info'])
            # else:
            res = structured_llm.invoke(chunk)
            all_res.append(res)
            print(f"Chunk {i}")
            invalid_input = False
        except Exception as e:
            print("Exception thrown. Trying again.")
            print("Error:", e)
            invalid_input = True
    # if i < len(story_chunks) - 1:
    #     # Pause for a specified time (e.g., 60 seconds) to avoid rate limits
    #     time.sleep(60)  # Adjust the sleep duration as per your rate limit requirements

Chunk 0


In [59]:
# save first line from first chunk, last line from last chunk, and combine markers
first_line = all_res[0].firstline
last_line = all_res[-1].lastline
num_chapters = sum([res.numChapters for res in all_res])
markers = []
for res in all_res:
    markers.extend(res.markers)

num_chapters, markers

(24,
 ['第1章 甄士隐梦幻识通灵 贾雨村风尘怀闺秀',
  '第2章 贾夫人仙逝扬州城 冷子兴演说荣国府',
  '第3章 托内兄如海荐西宾 接外孙贾母惜孤女',
  '第4章 薄命女偏逢簿命郎 葫芦僧判断葫芦案',
  '第5章 贾宝玉神游太虚境 警幻仙曲演红楼梦',
  '第6章 贾宝玉初试云雨情 刘老老一进荣国府',
  '第7章 送宫花贾琏戏熙凤 宴宁府宝玉会秦钟',
  '第8章 皇恩重元妃省父母 天伦乐宝玉呈才藻',
  '第9章 训劣子李贵承申饬 嗔顽童茗烟闹书房',
  '第10章 金寡妇贪利权受辱 张太医论病细穷源',
  '第11章 贾宝玉梦游太虚境 警幻仙曲演红楼梦',
  '第12章 贾母省亲贾宝玉初试云雨情',
  '第13章 贾宝玉初试云雨情',
  '第14章 贾母省亲贾宝玉初试云雨情',
  '第15章 贾母省亲贾宝玉初试云雨情',
  '第16章 贾母省亲贾宝玉初试云雨情',
  '第17章 大观园试才题对额 荣国府归省庆元宵',
  '第18章 皇恩重元妃省父母 天伦乐宝玉呈才藻',
  '第19章 情切切良宵花解语 意绵绵静日玉生香',
  '第20章 王熙凤正言弹妒意 林黛玉俏语谑娇音',
  '第21章 贤袭人娇嗔箴宝玉 俏平儿软语救贾琏',
  '第22章 听曲文宝玉悟禅机 制灯迷贾政悲谶语',
  '第23章 西厢记妙词通戏语 牡丹亭艳曲警芳心',
  '第24章 醉金刚轻财尚义侠 痴女儿遗帕惹相思'])

In [60]:
# first_line = res.firstline
# last_line = res.lastline
# markers = res.markers
first_line, last_line

('第1章 甄士隐梦幻识通灵 贾雨村风尘怀闺秀', '要知端底，下回分解。')

In [26]:
story = re.sub(r' {2,}', ' ', story)  # Replace consecutive spaces with one space
story = re.sub(r'\n{2,}', '\n', story)  # Replace consecutive newlines with one newline
story = re.sub(r' ?\n ?', '\n', story)  # Replace space + newline or newline + space with one newline
story = story.replace("“", "\"").replace("”", "\"").replace("‘", "'").replace("’", "'")

In [27]:
story_formatted = story.replace("\n", " ")
# replace weird quotes too
story_formatted = story_formatted.replace("\"", " ").replace("'", " ")

In [28]:
# # remove any marker that doesn't include one of these words
# exclude_words = ["part", "preface", "contents", "by", "end", "epilogue", "letter"]
exclude_words = ["part", "preface", "contents", "introduction", "end", "epilogue", "letter", "volume"]
markers = [m for m in markers if not any(word in m.lower() for word in exclude_words)]

In [29]:
# save as json 
story_json = {
    "first_line": first_line,
    "last_line": last_line,
    "markers": markers
}

if not os.path.exists(f"json/{og_story_name}"):
    os.makedirs(f"json/{og_story_name}")
with open(f"json/{og_story_name}/summary.json", "w") as f:
    json.dump(story_json, f, indent=4)

In [30]:
# read in markers from summary.json
with open(f"json/{og_story_name}/summary.json") as f:
    summary = json.load(f)
    markers = summary["markers"]
    first_line = summary["first_line"]
    last_line = summary["last_line"]

In [31]:
story_formatted[-100:] 

'是别人，正是贾芸。小红不觉粉面含羞，问道： 二爷在那里拾着的？ 只见那贾芸笑道： 你过来，我告诉你。 一面说一面就上来拉他的衣裳。那小红臊的转身一跑，却被门槛子绊倒。 要知端底，下回分解。 (本章完)'

In [32]:
first_ind = story_formatted.find(first_line.replace("\"", " ").replace("'", " "))
first_ind, first_line.replace("\"", " ").replace("'", " ")

(0, '第1章 甄士隐梦幻识通灵 贾雨村风尘怀闺秀')

In [34]:
second_marker = markers[1] + " "
last_line_index = story_formatted.find(second_marker, first_ind)
last_line_index, second_marker, story_formatted[last_line_index-50:last_line_index+50]

(-1, '贾夫人仙逝扬州城\u3000冷子兴演说荣国府 ', '')

In [35]:
last_ind = story_formatted.find(last_line)
last_ind, last_line

(-1, '不知端底，下回分解。')

In [36]:
# split story into chapters based on markers
chapters = []
cur_first_line = first_line.replace("\"", " ").replace("'", " ")
cur_min_index = 0
for i, marker in enumerate(markers):
    # print("marker", marker)
    exists_next = i+1 < len(markers)
    next_marker = markers[i+1] if exists_next else ""
    # print("next_marker", next_marker)
    
    if exists_next and " ACT " in next_marker.upper():
        next_marker = next_marker.upper()
    formatted_last_line = next_marker if exists_next else last_line
    # print("formatted_last_line", formatted_last_line)

    if " ACT " not in next_marker.upper():
        formatted_last_line += " "
    cur_last_line = formatted_last_line
    cur_last_line = cur_last_line.replace("\"", " ").replace("'", " ")
    first_line_index = story_formatted.rfind(cur_first_line, cur_min_index)
    print("cur_first_line", repr(cur_first_line))
    print("first_line_index", first_line_index)
    print("Snippet around first_line_index:", repr(story_formatted[first_line_index:first_line_index + 200]))

    if i != 0 and " ACT " not in marker.upper():
        first_line_index += len(cur_first_line)
    last_line_index = story_formatted.find(cur_last_line, first_line_index)
    print("cur_last_line", repr(cur_last_line))
    print("last_line_index", last_line_index)
    print("Snippet around last_line_index:", repr(story_formatted[last_line_index-200:last_line_index]))
    
    if i+1 == len(markers):
        if last_line_index == -1:
            last_line_index = len(story_formatted)
        else:
            last_line_index += len(cur_last_line) 
    chapter = story[first_line_index:last_line_index].strip()

    # add line number in front of each line
    chapter = chapter.split("\n")
    for j, line in enumerate(chapter):
        chapter[j] = f"LINE {j+1}: {line}"
    chapter = "\n".join(chapter)
    chapters.append(chapter)
    cur_first_line = cur_last_line
    cur_min_index = last_line_index

cur_first_line '第1章 甄士隐梦幻识通灵 贾雨村风尘怀闺秀'
first_line_index 0
Snippet around first_line_index: '第1章 甄士隐梦幻识通灵 贾雨村风尘怀闺秀 \u3000\u3000此开卷第一回也。 \u3000\u3000作者自云：因曾历过一番梦幻之后，故将真事隐去，而借通灵说撰此《石头记》一书也，故曰 甄士隐 云云。但书中所记何事何人？自己又云： 今风尘碌碌，一事无成，忽念及当日所有之女子，一一细考较去，觉其行止见识皆出我之上。我堂堂须眉，诚不若彼裙钗，我实愧则有馀，悔又无益，大无可如何之日也。当此日，欲将已往所赖天思祖德，锦衣纨裤之时，饫甘餍'
cur_last_line '贾夫人仙逝扬州城\u3000冷子兴演说荣国府 '
last_line_index -1
Snippet around last_line_index: '早灰了一半。正没好气，忽然听见老嬷嬷说起贾芸来，不觉心中一动，便闷闷的回房。睡在床上，暗暗思量，翻来复去，自觉没情没趣的。忽听的窗外低低的叫道： 红儿，你的绢子我拾在这里呢。 小红听了，忙走出来看时，不是别人，正是贾芸。小红不觉粉面含羞，问道： 二爷在那里拾着的？ 只见那贾芸笑道： 你过来，我告诉你。 一面说一面就上来拉他的衣裳。那小红臊的转身一跑，却被门槛子绊倒。 要知端底，下回分解。 (本章完'
cur_first_line '贾夫人仙逝扬州城\u3000冷子兴演说荣国府 '
first_line_index -1
Snippet around first_line_index: ''
cur_last_line '金陵城起复贾雨村\u3000荣国府收养林黛玉 '
last_line_index -1
Snippet around last_line_index: '早灰了一半。正没好气，忽然听见老嬷嬷说起贾芸来，不觉心中一动，便闷闷的回房。睡在床上，暗暗思量，翻来复去，自觉没情没趣的。忽听的窗外低低的叫道： 红儿，你的绢子我拾在这里呢。 小红听了，忙走出来看时，不是别人，正是贾芸。小红不觉粉面含羞，问道： 二爷在那里拾着的？ 只见那贾芸笑道： 你过来，我告诉你。 一面说一面就上来拉他的衣裳。那小红臊的转身一跑，却被门槛子绊倒。 要知端底，下回分解。 (本章完'
c

In [37]:
# check results
for i, marker in enumerate(markers):
    print(f"Chapter {marker}")
    chapter_first_line = chapters[i][:100]
    chapter_last_line = chapters[i][-100:]
    print("first_line:", chapter_first_line)
    print("last_line:", chapter_last_line)
    print("\n")

Chapter 甄士隐梦幻识通灵　贾雨村风尘怀闺秀
first_line: LINE 1: 第1章 甄士隐梦幻识通灵 贾雨村风尘怀闺秀
LINE 2: 　　此开卷第一回也。
LINE 3: 　　作者自云：因曾历过一番梦幻之后，故将真事隐去，而借通灵说撰此《石头记》一书也，故曰
last_line: 道："二爷在那里拾着的？"只见那贾芸笑道："你过来，我告诉你。"一面说一面就上来拉他的衣裳。那小红臊的转身一跑，却被门槛子绊倒。
LINE 741: 要知端底，下回分解。
LINE 742: (本章完


Chapter 贾夫人仙逝扬州城　冷子兴演说荣国府
first_line: LINE 1: 尘怀闺秀
LINE 2: 　　此开卷第一回也。
LINE 3: 　　作者自云：因曾历过一番梦幻之后，故将真事隐去，而借通灵说撰此《石头记》一书也，故曰"甄士隐"云云。但书中所记何事何人
last_line: 道："二爷在那里拾着的？"只见那贾芸笑道："你过来，我告诉你。"一面说一面就上来拉他的衣裳。那小红臊的转身一跑，却被门槛子绊倒。
LINE 741: 要知端底，下回分解。
LINE 742: (本章完


Chapter 金陵城起复贾雨村　荣国府收养林黛玉
first_line: LINE 1: 尘怀闺秀
LINE 2: 　　此开卷第一回也。
LINE 3: 　　作者自云：因曾历过一番梦幻之后，故将真事隐去，而借通灵说撰此《石头记》一书也，故曰"甄士隐"云云。但书中所记何事何人
last_line: 道："二爷在那里拾着的？"只见那贾芸笑道："你过来，我告诉你。"一面说一面就上来拉他的衣裳。那小红臊的转身一跑，却被门槛子绊倒。
LINE 741: 要知端底，下回分解。
LINE 742: (本章完


Chapter 薄命女偏逢薄命郎　葫芦僧乱判葫芦案
first_line: LINE 1: 尘怀闺秀
LINE 2: 　　此开卷第一回也。
LINE 3: 　　作者自云：因曾历过一番梦幻之后，故将真事隐去，而借通灵说撰此《石头记》一书也，故曰"甄士隐"云云。但书中所记何事何人
last_line: 道："二爷在那里拾着的？"只见那贾芸笑道："你过来，我告诉你。"一面说一面就上来拉他的衣裳。那小红臊的转身一跑，却被门槛子绊倒。
LINE 741: 要知端底，下回分解。
LINE 

In [173]:
# save to new txt files
# create folder for story if it doesn't exist
if not os.path.exists(f"chapters/{og_story_name}"):
    os.makedirs(f"chapters/{og_story_name}")
for i, marker in enumerate(markers):
    with open(f"chapters/{og_story_name}/{marker}.txt", "w") as f:
        f.write(chapters[i])
        print(f"Chapter {marker} saved.")

Chapter 第1章 甄士隐梦幻识通灵 贾雨村风尘怀闺秀 saved.
Chapter 第2章 贾夫人仙逝扬州城 冷子兴演说荣国府 saved.
Chapter 第3章 托内兄如海荐西宾 接外孙贾母惜孤女 saved.
Chapter 第4章 薄命女偏逢簿命郎 葫芦僧判断葫芦案 saved.
Chapter 第5章 贾宝玉神游太虚境 警幻仙曲演红楼梦 saved.
Chapter 第6章 贾宝玉初试云雨情 刘老老一进荣国府 saved.
Chapter 第7章 送宫花贾琏戏熙凤 宴宁府宝玉会秦钟 saved.
Chapter 第8章 金寡妇贪利权受辱 张太医论病细穷源 saved.
Chapter 第9章 贾宝玉奇缘识金锁 薛宝钗巧合认通灵 saved.
Chapter 第10章 王熙凤正言弹妒意 林黛玉俏语谑娇音 saved.
Chapter 第11章 贾宝玉梦游太虚境 警幻仙曲演红楼梦 saved.
Chapter 第12章 贾母仙逝扬州城 冷子兴演说荣国府 saved.
Chapter 第13章 托内兄如海荐西宾 接外孙贾母惜孤女 saved.
Chapter 第14章 薄命女偏逢簿命郎 葫芦僧判断葫芦案 saved.
Chapter 第15章 贾宝玉神游太虚境 警幻仙曲演红楼梦 saved.
Chapter 第16章 贾宝玉初试云雨情 刘老老一进荣国府 saved.
Chapter 第17章 大观园试才题对额 荣国府归省庆元宵 saved.
Chapter 第18章 皇恩重元妃省父母 天伦乐宝玉呈才藻 saved.
Chapter 第19章 情切切良宵花解语 意绵绵静日玉生香 saved.
Chapter 第20章 王熙凤正言弹妒意 林黛玉俏语谑娇音 saved.
Chapter 第21章 贤袭人娇嗔箴宝玉 俏平儿软语救贾琏 saved.
Chapter 第22章 听曲文宝玉悟禅机 制灯迷贾政悲谶语 saved.
Chapter 第23章 西厢记妙词通戏语 牡丹亭艳曲警芳心 saved.
Chapter 第24章 醉金刚轻财尚义侠 痴女儿遗帕惹相思 saved.


In [174]:
# read in chapters from txt files
chapters = []
for i, marker in enumerate(markers):
    with open(f"chapters/{og_story_name}/{marker}.txt") as f:
        chapter = f.read()
        chapters.append(chapter)

In [175]:
chapters[0]

'LINE 1: 第1章 甄士隐梦幻识通灵 贾雨村风尘怀闺秀\nLINE 2: \u3000\u3000此开卷第一回也。\nLINE 3: \u3000\u3000作者自云：因曾历过一番梦幻之后，故将真事隐去，而借通灵说撰此《石头记》一书也，故曰"甄士隐"云云。但书中所记何事何人？自己又云："今风尘碌碌，一事无成，忽念及当日所有之女子，一一细考较去，觉其行止见识皆出我之上。我堂堂须眉，诚不若彼裙钗，我实愧则有馀，悔又无益，大无可如何之日也。当此日，欲将已往所赖天思祖德，锦衣纨裤之时，饫甘餍肥之日，背父兄教育之恩，负师友规训之德，以致今日一技无成，半生潦倒之罪，编述一集，以告天下。知我之负罪固多，然闺阁中历历有人，万不可因我之不肖，自护己短，一并使其泯灭也。所以蓬牖茅椽，绳床瓦灶，并不足妨我襟怀；况那晨风夕月。阶柳庭花，更觉得润人笔墨。我虽不学无文，又何妨用假语村言敷演出来，亦可使闺阁昭传，复可破一时之闷，醒同人之目，不亦宜乎？"故曰"贾雨村"云云。更于篇中间用"梦""幻"等字，却是此书本旨，兼寓提醒阅者之意。\nLINE 4: \u3000\u3000看官，你道此书从何而起？说来虽近荒唐，细玩颇有趣味。\nLINE 5: \u3000\u3000却说那女娲氏炼石补天之时，于大荒山无稽崖炼成高十二丈、见方二十四丈大的顽石三万六千五百零一块。那娲皇只用了三万六千五百块，单单剩下一块未用，弃在青埂峰下。谁知此石自经锻炼之后，灵性已通，自去自来，可大可小。因见众石俱得补天，独自己无才不得入选，遂自怨自愧，日夜悲哀。\nLINE 6: \u3000\u3000一日，正当嗟悼之际，俄见一僧一道远远而来生得骨格不凡，丰神迥异，来到这青埂峰下，席地坐谈。见着这块鲜莹明洁的石头，且又缩成扇坠一般，甚属可爱。那僧托于掌上，笑道："形体倒也是个灵物了，只是没有实在的好处。须得再镌上几个字。使人人见了便知你是件奇物，然后携你到那昌明隆盛之邦、诗礼簪缨之族、花柳繁华地、温柔富贵乡那里去走一遭。"石头听了大喜，因问："不知可镌何字？携到何方？望乞明示。"那僧笑道："你且莫问，日后自然明白。"说毕，便袖了，同那道人飘然而去，竟不知投向何方。\nLINE 7: \u3000\u3000又不知过了几世几劫，因有个空空道人访道求仙，从这大荒山无稽崖青埂蜂下经过。忽见一块大石，上面字迹分明，编述历历。空

## analyze scene

In [177]:
scene_type = "character" # location, character, or n/a

In [178]:
extra_scene_instruction = "Analyze the scenes in this chapter."
if scene_type == "location":
    extra_scene_instruction += " Start a new scene when the location changes."
elif scene_type == "character":
    extra_scene_instruction += " Start a new scene when new characters enter or old characters leave."

extra_scene_instruction += "\nChapter text:"

extra_scene_instruction

'Analyze the scenes in this chapter. Start a new scene when new characters enter or old characters leave.\nChapter text:'

In [179]:
# Pydantic
class SceneInfo(BaseModel):
    """Get information about a scene in the story."""
    scene: str = Field(description="Short title of scene (don't include chapter name or number). Each scene should have a unique title, and each title should be no longer than 6 words.")
    summary: str = Field(description="1 line summary of the scene")
    firstline: int = Field(description="First line number of the current scene in this chapter (should be after last line number of previous scene)")
    lastline: int = Field(description="Last line number of the current scene in this chapter (should be after first line number of current scene)")

class SceneListInfo(BaseModel):
    """Get overall information about the story."""
    scenelist: list[SceneInfo] = Field(description="List of key scenes in this chapter")

def get_scenelist_prompt(chapter):
    scene_prompt = f'''List the key scenes in this chapter: {chapter}. Format your response like SceneListInfo:
                    class SceneInfo(BaseModel):
                        """Get information about a scene in the story."""
                        scene: str = Field(description="Title of scene (don't include chapter name or number)")
                        summary: str = Field(description="1 line summary of the scene")
                        firstline: int = Field(description="First line number of the current scene in this chapter (should be after last line number of previous scene)")
                        lastline: int = Field(description="Last line number of the current scene in this chapter (should be after first line number of current scene)")

                    class SceneListInfo(BaseModel):
                        """Get overall information about the story."""
                        scenelist: list[SceneInfo] = Field(description="List of key scenes in this chapter")'''
    return scene_prompt

scene_llm = llm.with_structured_output(SceneListInfo)
# scene_llm = llm2.with_structured_output(SceneListInfo)

In [180]:
# Pydantic
class CharacterInfo(BaseModel):
    """Get information about a character in this scene."""
    name: str = Field(description="Full name of the character. Character name should be 5 words or less.")
    role: str = Field(description="Main actions and/or motivations of the character in this scene")
    importance: float = Field(description="Importance of the character in this scene from from 0: not important at all to 1: very important character")
    emotion: str = Field(description="Emotion of the character in this scene, described in a few words or a short phrase")
    sentiment: float = Field(description="Sentiment of the character in this scene from -1 (very negative) to 1 (very positive)")
    quote: str = Field(description="A direct quote from the character in this scene to illustrate their emotions and/or actions")
    fake_quote: str = Field(description="A brief explanation of the character's emotions and/or actions in this scene if you cannot find a direct quote in the text. If you can find a direct quote, leave this field empty.")

# Pydantic
class ThemeInfo(BaseModel):
    """Get information about a theme in this scene."""
    name: str = Field(description="Short general phrase to describe the theme. Theme name should be 3 words or less.")
    role: str = Field(description="Short description of the role of the theme in this scene")
    importance: float = Field(description="Importance of the theme in this scene from from 0: not important at all to 1: very important theme")
    emotion: str = Field(description="Emotion associated with the theme in this scene, described in a few words or a short phrase")
    sentiment: float = Field(description="Sentiment of the theme in this scene from -1 (very negative) to 1 (very positive)")
    quote: str = Field(description="A direct quote in this scene to illustrate the theme")
    fake_quote: str = Field(description="A brief explanation of the theme in this scene if you cannot find a direct quote in the text. If you can find a direct quote, leave this field empty.")

class SceneDetails(BaseModel):
    """Get overall information about the scene."""
    location: str = Field(description="Physical location of the scene. Location name should be 5 words or less.")
    importance: float = Field(description="Importance of the scene from 0: not important at all to 1: very important and pivotal scene")
    conflict: float = Field(description="Conflict in the scene from 0: no conflict to 1: high conflict")
    characters: list[CharacterInfo] = Field(description="List of key characters in the scene. Make sure there are no duplicates, and focus on speaking characters. Don't include any unnamed characters.")

class SceneDetailsTheme(BaseModel):
    """Get overall information about the scene."""
    location: str = Field(description="Physical location of the scene. Location name should be 5 words or less.")
    importance: float = Field(description="Importance of the scene from 0: not important at all to 1: very important and pivotal scene")
    conflict: float = Field(description="Conflict in the scene from 0: no conflict to 1: high conflict")
    themes: list[ThemeInfo] = Field(description="List of key themes in the scene. Make sure there are no duplicates but try to make the themes as general as possible.")

def get_scenedetails_prompt(scene):
    scene_prompt = f'''List the key details in this scene: {scene}. Format your response like SceneDetails:
                    class CharacterInfo(BaseModel):
                        """Get information about a character in this scene."""
                        name: str = Field(description="Full name of the character. Character name should be 5 words or less.")
                        role: str = Field(description="Main actions and/or motivations of the character in this scene")
                        importance: float = Field(description="Importance of the character in this scene from from 0: not important at all to 1: very important character")
                        emotion: str = Field(description="Emotion of the character in this scene, described in a few words or a short phrase")
                        sentiment: float = Field(description="Sentiment of the character in this scene from -1 (very negative) to 1 (very positive)")
                        quote: str = Field(description="A direct quote from the character in this scene to illustrate their emotions and/or actions")

                    class SceneDetails(BaseModel):
                        """Get overall information about the scene."""
                        location: str = Field(description="Physical location of the scene. Location name should be 5 words or less.")
                        importance: float = Field(description="Importance of the scene from 0: not important at all to 1: very important and pivotal scene")
                        conflict: float = Field(description="Conflict in the scene from 0: no conflict to 1: high conflict")
                        characters: list[CharacterInfo] = Field(description="List of key characters in the scene. Make sure there are no duplicates, and focus on speaking characters.  Don't include any unnamed characters.")'''
    return scene_prompt

scene_details_llm = llm.with_structured_output(SceneDetails)
scene_details_llm_theme = llm.with_structured_output(SceneDetailsTheme)
# scene_details_llm = llm2.with_structured_output(SceneDetails)

In [181]:
async def get_scene_info_async(chapters, chapter_num):
    print(f"Splitting Chapter {markers[chapter_num]} into scenes...")

    cur_chapter = chapters[chapter_num]

    invalid_input = True
    while invalid_input: # try until valid input
        # see if exception is thrown
        try:
            if model_type == "gemini":
                prompt = get_scenelist_prompt(cur_chapter)
                chapter_scenes = await llm3.ainvoke(prompt)
                chapter_scenes = format_gemini_response(chapter_scenes)
            else:
                chapter_scenes = await scene_llm.ainvoke(extra_scene_instruction + "\n" + cur_chapter)
            invalid_input = False
        except:
            invalid_input = True
            
    if model_type == "gemini":
        chapter_scenes_list = [DotDict(scene) for scene in chapter_scenes['scenelist']]
    else: 
        chapter_scenes_list = chapter_scenes.scenelist
    formatted_chapter_scenes = []

    cur_last_line = 0

    for scene in chapter_scenes_list:
        scene_title = scene.scene
        scene_summary = scene.summary
        scene_first_line = scene.firstline
        scene_last_line = scene.lastline

        print(f"FIRST LINE: {scene_first_line}")
        print(f"LAST LINE: {scene_last_line}")
        print()

        formatted_scene = {
            "title": scene_title,
            "summary": scene_summary,
            "chapter": markers[chapter_num],
            "first_line": scene_first_line,
            "last_line": scene_last_line
        }

        if scene_last_line > scene_first_line and scene_first_line > cur_last_line:
            # only add scene if last_line > first_line and first_line > cur_last_line
            formatted_chapter_scenes.append(formatted_scene)
            formatted_chapter = cur_chapter.replace("\n", " ")
            # replace weird quotes too
            formatted_chapter = formatted_chapter.replace("\"", " ").replace("'", " ")
            cur_last_line = scene_last_line

    # sort scenes by first_line
    formatted_chapter_scenes = sorted(formatted_chapter_scenes, key=lambda x: x["first_line"])

    # split chapter into scenes based on formatted_chapter_scenes
    scenes = []
    cur_min_index = 0
    for i, scene in enumerate(formatted_chapter_scenes):
        cur_first_line = f"LINE {scene['first_line']}:"
        # cur_first_line = formatted_chapter_scenes[i]["first_line"].strip("...").replace("\"", " ").replace("'", " ")
        first_line_index = formatted_chapter.find(cur_first_line, cur_min_index)
        if first_line_index == -1:
            # see if first_line_index overlaps with last_line of previous scene
            other_possible_first_index = formatted_chapter.find(cur_first_line)
            if other_possible_first_index != -1:
                overlap = cur_chapter[other_possible_first_index:cur_min_index]
                print(f"SCENE {i}")
                print("\nPREVIOUS SCENE")
                print(scenes[i-1])
                print("\nOVERLAP")
                print(overlap)
                print()
            first_line_index = cur_min_index
        cur_last_line = f"LINE {scene['last_line'] + 1}:"
        # cur_last_line = formatted_chapter_scenes[i]["last_line"].strip("...").replace("\"", " ").replace("'", " ")
        last_line_index = formatted_chapter.find(cur_last_line, first_line_index)
        # last_line_index += len(cur_last_line)
        if last_line_index == -1:
            last_line_index = len(cur_chapter)
        scene_text = cur_chapter[first_line_index:last_line_index].strip()
        # remove line numbers
        scene_text = re.sub(r'LINE \d+: ', '', scene_text)
        scenes.append(scene_text)
        cur_min_index = last_line_index

    # add scene text to formatted_chapter_scenes
    for i, scene in enumerate(formatted_chapter_scenes):
        formatted_chapter_scenes[i]["text"] = scenes[i]

    return scenes, formatted_chapter_scenes

In [182]:
async def get_scene_details_async(scenes, formatted_chapter_scenes, chapter_num):
    print(f"Generating scene details for Chapter {markers[chapter_num]}...")
    all_scene_details = []

    async def process_scene(scene_text, cur_scene):
        """Async function to analyze a single scene"""
        if len(scene_text) == 0:
            return
        
        start_prompt = """
                          When analyzing this scene, focus on characters with a significant role in the scene.
                          If you can't determine the character's role or emotion, don't include them in the analysis.

                          Make sure to only use direct quotes from the text.
                          Don't paraphrase or change the text in any way.
                          If the character speaks, the quote should be something they say.
                          If the character is only described, the quote should be a description of the character's actions or emotions
                          (or any quote that mentions them, even if it is said by another character).
                          Try your best to find a direct quote for each character, but if you can't find a quote, 
                          (1) set the "quote" field to "No quote available" and (2) in the "fake_quote" field,
                          write a brief explanation of the character's emotions and actions in the scene.
                          Leave the "fake_quote" field empty if you can find a direct quote.
                        
                          Scene text:

                          """
        if analysis_type == "theme":
            start_prompt = """
                            When analyzing this scene, focus on themes with a significant role in the scene.
                            If you can't determine the theme's role or emotion, don't include it in the analysis.

                            Make sure to only use direct quotes from the text that describe the theme or relate to it.
                            Don't paraphrase or change the text in any way.
                            Try your best to find a direct quote for each theme, but if you can't find a quote,
                            (1) set the "quote" field to "No quote available" and (2) in the "fake_quote" field,
                            write a brief explanation of the theme in this scene.
                            Leave the "fake_quote" field empty if you can find a direct quote.
                        
                            Scene text:

                            """
            
        invalid_input = True
        while invalid_input:
            try:
                if analysis_type == "theme":
                    scene_details = await scene_details_llm_theme.ainvoke(start_prompt + cur_scene)
                else:
                    scene_details = await scene_details_llm.ainvoke(start_prompt + cur_scene)
                invalid_input = False
                # print(scene_details)
            except:
                invalid_input = True
        
        return scene_details

    # Run all scene details in parallel
    tasks = [process_scene(formatted_chapter_scenes[i]["text"], scenes[i]) for i in range(len(scenes))]
    all_scene_details = await asyncio.gather(*tasks)

    # add details to formatted_chapter_scenes
    for i, scene in enumerate(formatted_chapter_scenes):
        scene_text = formatted_chapter_scenes[i]["text"]
        text_length = len(scene_text)
        num_lines = formatted_chapter_scenes[i]["last_line"] - formatted_chapter_scenes[i]["first_line"] + 1
        # first_line = formatted_chapter_scenes[i]["first_line"]
        # first_line_length = len(first_line)
        # last_line = formatted_chapter_scenes[i]["last_line"]
        # last_line_length = len(last_line)

        # if text_length == 0 or text_length < first_line_length + last_line_length:
        if text_length == 0:
            # no scene text or scene text is too short
            continue
        
        cur_scene_details = all_scene_details[i]
        formatted_chapter_scenes[i]["length"] = text_length
        formatted_chapter_scenes[i]["num_lines"] = num_lines
        formatted_chapter_scenes[i]["location"] = cur_scene_details.location
        formatted_chapter_scenes[i]["importance"] = cur_scene_details.importance
        formatted_chapter_scenes[i]["conflict"] = cur_scene_details.conflict
        formatted_character_info = []

        characters = cur_scene_details.themes if analysis_type == "theme" else cur_scene_details.characters
        # if model_type == "gemini":
        #     characters = [DotDict(character) for character in characters]
        for character in characters:
            character_info = {
                "name": character.name,
                "role": character.role,
                "importance": character.importance,
                "emotion": character.emotion,
                "sentiment": character.sentiment,
                "quote": character.quote,
                "fake_quote": character.fake_quote
            }
            # add character only if role is not N/A and emotion is not N/A and quote is not "No quote available"
            if not (character.role == "N/A" and character.emotion == "N/A" and "No quote available" in character.quote):
                formatted_character_info.append(character_info)
        formatted_chapter_scenes[i]["characters"] = formatted_character_info

    return formatted_chapter_scenes

In [183]:
async def process_all_chapters_async():
    # Step 1: Split each chapter into scenes in parallel
    chapter_tasks = [get_scene_info_async(chapters, i) for i in range(len(chapters))]
    chapter_results = await asyncio.gather(*chapter_tasks)

    # extract scenes and formatted_chapter_scenes
    scenes = [result[0] for result in chapter_results]
    formatted_chapter_scenes = [result[1] for result in chapter_results]

    # Step 2: Get scene details for each scene in parallel
    detail_tasks = [semaphore_wrapper(get_scene_details_async, scenes[i], formatted_chapter_scenes[i], i) for i in range(len(chapters))]
    final_formatted_scenes = await asyncio.gather(*detail_tasks)

    # Step 3: Filter and save results
    for i, marker in enumerate(markers):
        formatted_chapter_scenes = final_formatted_scenes[i]

        # remove scenes with no text
        formatted_chapter_scenes = [scene for scene in formatted_chapter_scenes if scene["text"] != "" and "characters" in scene]
        # remove characters with no quote
        for scene in formatted_chapter_scenes:
            scene["characters"] = [character for character in scene["characters"] if "(" not in character["quote"] != "" and "<" not in character["name"] and "unnamed" not in character["name"].lower()]
            # print(scene)
        
        # remove scenes with no characters
        formatted_chapter_scenes = [scene for scene in formatted_chapter_scenes if len(scene["characters"]) > 0]

        # save as json
        with open(f"json/{story_name}/{marker}.json", "w") as f:
            json.dump(formatted_chapter_scenes, f, indent=4)

        print(f"All details for Chapter {marker} saved.\n")

In [None]:
# RUN MAIN SCENE LOOP ASYNC that will get all the scene/character/etc. info for each chapter
await process_all_chapters_async()

Splitting Chapter 第1章 甄士隐梦幻识通灵 贾雨村风尘怀闺秀 into scenes...
Splitting Chapter 第2章 贾夫人仙逝扬州城 冷子兴演说荣国府 into scenes...
Splitting Chapter 第3章 托内兄如海荐西宾 接外孙贾母惜孤女 into scenes...
Splitting Chapter 第4章 薄命女偏逢簿命郎 葫芦僧判断葫芦案 into scenes...
Splitting Chapter 第5章 贾宝玉神游太虚境 警幻仙曲演红楼梦 into scenes...
Splitting Chapter 第6章 贾宝玉初试云雨情 刘老老一进荣国府 into scenes...
Splitting Chapter 第7章 送宫花贾琏戏熙凤 宴宁府宝玉会秦钟 into scenes...
Splitting Chapter 第8章 金寡妇贪利权受辱 张太医论病细穷源 into scenes...
Splitting Chapter 第9章 贾宝玉奇缘识金锁 薛宝钗巧合认通灵 into scenes...
Splitting Chapter 第10章 王熙凤正言弹妒意 林黛玉俏语谑娇音 into scenes...
Splitting Chapter 第11章 贾宝玉梦游太虚境 警幻仙曲演红楼梦 into scenes...
Splitting Chapter 第12章 贾母仙逝扬州城 冷子兴演说荣国府 into scenes...
Splitting Chapter 第13章 托内兄如海荐西宾 接外孙贾母惜孤女 into scenes...
Splitting Chapter 第14章 薄命女偏逢簿命郎 葫芦僧判断葫芦案 into scenes...
Splitting Chapter 第15章 贾宝玉神游太虚境 警幻仙曲演红楼梦 into scenes...
Splitting Chapter 第16章 贾宝玉初试云雨情 刘老老一进荣国府 into scenes...
Splitting Chapter 第17章 大观园试才题对额 荣国府归省庆元宵 into scenes...
Splitting Chapter 第18章 皇恩重元妃省父母 天伦乐宝玉呈才藻 into scenes...
Sp

In [None]:
# rank each scene by importance
# and within each scene, the characters

for i, marker in enumerate(markers):
    print(marker)
    with open(f"json/{story_name}/{marker}.json") as f:
        scenes = json.load(f)
    importances = []
    conflicts = []
    
    # extract importance from each scene
    for j, scene in enumerate(scenes):
        importances.append((j, scene["importance"]))
        conflicts.append((j, scene["conflict"]))
        # now extract character importances
        character_importances = []
        for k, character in enumerate(scene["characters"]):
            character_importances.append((k, character["importance"]))
        # sort character importances
        sorted_character_importances = sorted(character_importances, key=lambda x: x[1], reverse=True)
        # add importance_rank to each character
        for k, (l, _) in enumerate(sorted_character_importances):
            scenes[j]["characters"][l]["importance_rank"] = k+1
        # add number of each scene
        scenes[j]["number"] = j+1
    # sort importances
    sorted_importances = sorted(importances, key=lambda x: x[1], reverse=True)
    # add importance_rank to each scene
    for k, (j, _) in enumerate(sorted_importances):
        scenes[j]["importance_rank"] = k+1
    # sort conflicts
    sorted_conflicts = sorted(conflicts, key=lambda x: x[1], reverse=True)
    # add conflict_rank to each scene
    for k, (j, _) in enumerate(sorted_conflicts):
        scenes[j]["conflict_rank"] = k+1
    
    # print results
    # for j, scene in enumerate(scenes):
    #     print(scenes[j]["title"], scenes[j]["importance_rank"])
    #     print("--------------------------------")
    #     for k, character in enumerate(scene["characters"]):
    #         print(character["name"], character["importance_rank"])
    #     print()

    # save as json
    with open(f"json/{story_name}/{marker}.json", "w") as f:
        json.dump(scenes, f, indent=4)

第一　標目
第二　言懷
第三　訓女
第四　腐歎
第五　延師
第六　悵眺
第七　閨塾
第八　勸農
第九　肅苑
第十　驚夢
第十一　慈戒
第十二　尋夢
第十三　訣謁
第十四　寫真
第十五　虜諜
第十六　詰病
第十七　道覡
第十八　診祟
第十九　牝賊
第二十　鬧殤
第二十一　謁遇
第二十二　旅寄
第二十三　冥判
第二十四　拾畫
第二十五　憶女
第二十六　玩真
第二十七　魂遊
第二十八　幽媾
第二十九　旁疑
第三十　懽撓
第三十一　繕備
第三十二　冥誓
第三十三　秘議
第三十四　詗藥
第三十五　回生
第三十六　婚走
第三十七　駭變
第三十八　淮警
第三十九　如杭
第四十　僕偵
第四十一　耽試
第四十二　移鎮
第四十三　禦淮
第四十四　急難
第四十五　寇間
第四十六　折寇
第四十七　圍釋
第四十八　遇母
第四十九　淮泊
第五十　鬧宴
第五十一　榜下
第五十二　索元
第五十三　硬拷
第五十四　聞喜
第五十五　圓駕


In [None]:
# combine all chapters into one json file
all_scenes = []
for i, marker in enumerate(markers):
    with open(f"json/{story_name}/{marker}.json") as f:
        scenes = json.load(f)
    all_scenes += scenes

with open(f"json/{story_name}/all.json", "w") as f:
    json.dump(all_scenes, f, indent=4)

### grouping duplicate characters / location names

In [None]:
# collect all unique characters and locations for each chapter
all_characters = {}
all_locations = []
for i, marker in enumerate(markers):
    with open(f"json/{story_name}/{marker}.json") as f:
        scenes = json.load(f)
    for scene in scenes:
        for character in scene["characters"]:
            char = character["name"]
            if char not in all_characters:
                all_characters[char] = character["role"]
        location = scene["location"]
        if location not in all_locations:
            all_locations.append(location)

In [None]:
len(all_characters), len(all_locations)

(247, 97)

In [None]:
all_characters

{'張窈窕': 'Expressing longing and doubt about her dreams of the king',
 '王昌齡': "Describing his heart's desire and unfulfilled feelings",
 '曹松': 'Expressing the desire for immortality and transcendence',
 '韓偓': 'No significant role identified',
 '嬌鸎': 'Expresses joy and gratitude to her parents',
 '孩兒': 'Brings food and drink to the family',
 '春香': 'Serves wine to the lady',
 '老旦': 'A mother expressing her hopes for her children and their education.',
 '旦': 'A daughter reflecting on her upbringing and the expectations placed on her.',
 '外': 'A character discussing the importance of education and societal expectations.',
 '柳宗元': 'Expressing admiration for the spring breeze and its connection to the addressee.',
 '王建': 'Reflecting on the past and the absence of heirs in old age.',
 '苗發': 'Inquiring about a notable woman, possibly seeking connection or recognition.',
 '劉禹錫': 'Mentioned in context, possibly as a figure of respect or authority.',
 'Unknown Character 1': 'Expressing dissatisfac

In [None]:
all_locations

['門前梅柳',
 '酒臺',
 '後堂公所',
 'Spring Breeze Gathering',
 'Unknown location',
 '杏壇',
 '台上',
 '書堂',
 '西園',
 'East Suburbs',
 '閒庭院',
 'Spring Garden',
 '花園',
 '畫堂',
 '後花園',
 'Forest of Spirits',
 '昭容閣',
 '花郎的房間',
 'Flower Garden',
 '江南',
 '蓬山',
 '黃堂',
 '紫微宮',
 '海雲東',
 'Battlefield',
 '拜月堂',
 'Unknown setting',
 'Funeral site',
 'Unknown',
 'Temple Courtyard',
 'Theater Stage',
 '華陽洞裏仙壇上',
 '閻浮界',
 '南安府後花園',
 '望鄉臺',
 'Near the forest and springs',
 '寒塘蕙艸',
 '畫館',
 '戶庭',
 '道場',
 'Temple altar',
 '春殿',
 '斜樓上',
 '梅花觀',
 '梅花院',
 'Poetic Exchange',
 'First Human Realm',
 '仙院重門',
 'Nighttime Dreamscape',
 '江北新樓',
 '夾城雲煖下霓旄',
 '城西',
 '墓地',
 'Grave site',
 '牡丹亭內',
 '泉臺',
 '天台',
 '深院閒階',
 '船上',
 '藍橋驛',
 'Moonlit River',
 '丘墳發掘當官路',
 '空房',
 '金魚館',
 'Red Powder Tower',
 '郡城南下',
 '鳳凰閣',
 '考試現場',
 'Poetic Gathering',
 '江湖',
 '淮城',
 '淮陰城',
 '城外',
 '孤城',
 '危樓',
 '不明',
 '秦淮夜泊',
 'Yangzhou Road',
 '屠門',
 '城圍的鐵桶似緊',
 '淮安城',
 'Riverbank',
 'Imaginary Realm',
 '太平宴',
 '宴會廳',
 '酒亭',
 'Imperial Palace',
 '朝暉殿',
 '

In [None]:
# # create csv file for characters
# import csv

# with open(f"csv/{story_name}_characters.csv", "w", newline="") as f:
#     writer = csv.writer(f)
#     writer.writerow(["Character", "Role"])
#     for char, role in all_characters.items():
#         writer.writerow([char, role])

In [None]:
# # create csv file for locations
# with open(f"csv/{story_name}_locations.csv", "w", newline="") as f:
#     writer = csv.writer(f)
#     writer.writerow(["Location"])
#     for location in all_locations:
#         writer.writerow([location])

In [None]:
# Assuming your JSON file is stored at 'all.json'
with open(f"json/{story_name}/all.json") as f:
    json_data = json.load(f)

prompt = """
You are given a list of characters with some duplicates or alternate names.
Your task is to group all the alternate names for each unique character under a single full name, using the descriptions as guidance.
Don't keep characters like "Narrator" unless they are never referred to by another name.
Separate characters that are related to others (e.g., "Bob" and "Ms. Doe (Bob's mom)" and "Bob's sister") into separate entries.

First create a new character list by removing all duplicates and alternate names and only keeping the full name for each character.

Then, output a JSON dictionary where the key is each character in your new list and the value is a list of all possible alternate names. 
All names should be taken directly from the original character list and should only appear once in the output.

For example:
{{
    "Bob Smith": ["Bob", "Bobby", "Robert"],
    "Jane Doe": ["Jane", "Ms. Doe", "Bob's mother"]
}}
"""

theme_prompt = """
You are given a list of themes with some duplicates or alternate names.
Your task is to group all the alternate names for each unique theme under a single name, using the descriptions as guidance.
Your goal is to have as few themes as possible while still capturing all the different ways the themes are described.

First create a new theme list by removing all duplicates and alternate names and only keeping the best description for each theme.

Then, output a JSON dictionary where the key is each theme in your new list and the value is a list of all possible alternate names. 
All names should be taken directly from the original theme list and should only appear once in the output.

For example:
{{
    "Memory": ["Remembering the past", "Recollection", "Nostalgia"],
    "Friendship and Loyalty": ["Friendship and Companionship", "Making Friends", "Bonds of Friendship", "Loyalty and Friendship"]
}}
"""

location_prompt = """
You are given a list of locations with some duplicates or alternate names.
Your task is to group all the alternate or similiar names for each unique location under a single location name.

First create a new location list by removing all duplicates and alternate names and only keeping the full name for each location.
If one location is inside another or nearby to it, only keep the larger location (e.g., "Bob's bedroom", "Bob's garden", and "Bob Smith's castle" can be listed under "Bob's house").
Then, output a JSON dictionary where the key is each location in your new list and the value is a list of all possible alternate location names.
All locations should be taken directly from the original location list and should only appear once in the output dict.

For example:
{{
    "Los Angeles": ["LA", "City of Angels", "Cafe in Los Angeles"],
    "Bob's house": ["Bob's bedroom", "Bob Smith's garden", "Bob's castle"]
}}
"""

if analysis_type == "theme":
    prompt = theme_prompt

# if using llm2 (claude)
prompt += "Just output the JSON dictionary as the final result without any additional information."
location_prompt += "Just output the JSON dictionary as the final result without any additional information."

prompt_template = ChatPromptTemplate.from_messages([("system", prompt), ("human", "{input}")])
location_prompt_template = ChatPromptTemplate.from_messages([("system", location_prompt), ("human", "{input}")])

In [None]:
# Create the LLM chain with the prompt template and model

# Run the chain by passing the characters and JSON data as inputs
if analysis_type == "theme":
    final_prompt = prompt_template.invoke(f"theme list: {all_characters}")
else:
    final_prompt = prompt_template.invoke(f"character list: {all_characters}")
# character_llm = llm.with_structured_output(None, method="json_mode") if model_type == "gpt" else llm
character_llm = llm2
# character_llm = llm
output = character_llm.invoke(final_prompt)


In [None]:
output

AIMessage(content='{\n    "張窈窕": [],\n    "王昌齡": [],\n    "曹松": [],\n    "韓偓": [],\n    "嬌鸎": [],\n    "孩兒": [],\n    "春香": ["春香姐"],\n    "老旦": ["老夫人"],\n    "旦": ["旦醒介"],\n    "外": [],\n    "柳宗元": [],\n    "王建": ["王 建"],\n    "苗發": [],\n    "劉禹錫": [],\n    "杜甫": ["Du Fu", "杜 甫"],\n    "錢起": [],\n    "柳春卿": [],\n    "趙佗王": [],\n    "韓退之": [],\n    "陳最良": [],\n    "杜小姐": ["Miss Du", "杜麗娘"],\n    "李山甫": [],\n    "趙嘏": [],\n    "張泌": [],\n    "張祜": ["張 祜"],\n    "崔日用": [],\n    "韋應物": [],\n    "裊晴絲": [],\n    "翠生生": [],\n    "姹紫嫣紅": [],\n    "小金鈴": [],\n    "Spring Girl": [],\n    "Old Lady": ["老姑姑"],\n    "柳夢梅": ["柳郎", "秀才", "柳秀才"],\n    "花神": ["Flower God"],\n    "張說": [],\n    "許渾": [],\n    "羅隱": [],\n    "韋莊": ["韋 莊"],\n    "女孩兒": [],\n    "丫頭": ["蠢丫頭"],\n    "蘇廣文": [],\n    "鄭谷": [],\n    "段成式": [],\n    "唐彥謙": [],\n    "杜寶": ["杜宝", "杜老大人", "杜老爺", "杜平章"],\n    "普天樂": [],\n    "雁過聲": [],\n    "玉芙蓉": [],\n    "心喜": [],\n    "花郎": [],\n    "鮑老催": ["南鮑老催"],\n    "崔道融": [],\n    "羅虬": []

In [None]:
# if model_type == "claude":
output_formatted = output.content
# if output stop_reason = "max_tokens",cut off at last ']' character and add '}'
if output_formatted[-1] != "}":
    last_bracket_ind = output_formatted.rfind("]")
    output_formatted = output_formatted[:last_bracket_ind + 1] + "}"
    # print(output_formatted)
# turn into json
output = json.loads(output_formatted)

In [None]:
output

{'張窈窕': [],
 '王昌齡': [],
 '曹松': [],
 '韓偓': [],
 '嬌鸎': [],
 '孩兒': [],
 '春香': ['春香姐'],
 '老旦': ['老夫人'],
 '旦': ['旦醒介'],
 '外': [],
 '柳宗元': [],
 '王建': ['王 建'],
 '苗發': [],
 '劉禹錫': [],
 '杜甫': ['Du Fu', '杜 甫'],
 '錢起': [],
 '柳春卿': [],
 '趙佗王': [],
 '韓退之': [],
 '陳最良': [],
 '杜小姐': ['Miss Du', '杜麗娘'],
 '李山甫': [],
 '趙嘏': [],
 '張泌': [],
 '張祜': ['張 祜'],
 '崔日用': [],
 '韋應物': [],
 '裊晴絲': [],
 '翠生生': [],
 '姹紫嫣紅': [],
 '小金鈴': [],
 'Spring Girl': [],
 'Old Lady': ['老姑姑'],
 '柳夢梅': ['柳郎', '秀才', '柳秀才'],
 '花神': ['Flower God'],
 '張說': [],
 '許渾': [],
 '羅隱': [],
 '韋莊': ['韋 莊'],
 '女孩兒': [],
 '丫頭': ['蠢丫頭'],
 '蘇廣文': [],
 '鄭谷': [],
 '段成式': [],
 '唐彥謙': [],
 '杜寶': ['杜宝', '杜老大人', '杜老爺', '杜平章'],
 '普天樂': [],
 '雁過聲': [],
 '玉芙蓉': [],
 '心喜': [],
 '花郎': [],
 '鮑老催': ['南鮑老催'],
 '崔道融': [],
 '羅虬': [],
 '譚峭': [],
 '李紳': [],
 '劉長卿': [],
 '張籍': ['張 籍'],
 '項斯': [],
 '丑府主': [],
 '夫人': [],
 '淨': ['淨扮判官', '淨扮郭駝', '淨瓶'],
 '釋皎然': [],
 '司空圖': [],
 '六么令': [],
 '合': [],
 '前腔': [],
 '杜牧': [],
 '曹唐': ['曹 唐'],
 '王維': [],
 '病旦': [],
 '陳師父': [],
 '小

In [None]:
# Create the LLM chain with the prompt template and model

# Run the chain by passing the characters and JSON data as inputs
final_location_prompt = location_prompt_template.invoke(f"location list: {all_locations}")
# location_llm = llm.with_structured_output(None, method="json_mode") if model_type == "gpt" else llm
location_llm = llm2
# location_llm = llm
location_output = location_llm.invoke(final_location_prompt)

In [None]:
location_output

AIMessage(content='{\n    "門前梅柳": ["門前梅柳"],\n    "酒臺": ["酒臺"],\n    "後堂公所": ["後堂公所"],\n    "Spring Breeze Gathering": ["Spring Breeze Gathering"],\n    "Unknown location": ["Unknown location", "Unknown setting", "Unknown", "不明"],\n    "杏壇": ["杏壇"],\n    "台上": ["台上"],\n    "書堂": ["書堂"],\n    "西園": ["西園"],\n    "East Suburbs": ["East Suburbs"],\n    "閒庭院": ["閒庭院"],\n    "Spring Garden": ["Spring Garden"],\n    "花園": ["花園", "Flower Garden", "後花園"],\n    "畫堂": ["畫堂"],\n    "Forest of Spirits": ["Forest of Spirits"],\n    "昭容閣": ["昭容閣"],\n    "花郎的房間": ["花郎的房間"],\n    "江南": ["江南"],\n    "蓬山": ["蓬山"],\n    "黃堂": ["黃堂"],\n    "紫微宮": ["紫微宮"],\n    "海雲東": ["海雲東"],\n    "Battlefield": ["Battlefield"],\n    "拜月堂": ["拜月堂"],\n    "Funeral site": ["Funeral site"],\n    "Temple Courtyard": ["Temple Courtyard"],\n    "Theater Stage": ["Theater Stage"],\n    "華陽洞裏仙壇上": ["華陽洞裏仙壇上"],\n    "閻浮界": ["閻浮界"],\n    "南安府後花園": ["南安府後花園"],\n    "望鄉臺": ["望鄉臺"],\n    "Near the forest and springs": ["Near the forest 

In [None]:
location_output_formatted = location_output.content
# turn into json
location_output = json.loads(location_output_formatted)

In [None]:
location_output

{'門前梅柳': ['門前梅柳'],
 '酒臺': ['酒臺'],
 '後堂公所': ['後堂公所'],
 'Spring Breeze Gathering': ['Spring Breeze Gathering'],
 'Unknown location': ['Unknown location', 'Unknown setting', 'Unknown', '不明'],
 '杏壇': ['杏壇'],
 '台上': ['台上'],
 '書堂': ['書堂'],
 '西園': ['西園'],
 'East Suburbs': ['East Suburbs'],
 '閒庭院': ['閒庭院'],
 'Spring Garden': ['Spring Garden'],
 '花園': ['花園', 'Flower Garden', '後花園'],
 '畫堂': ['畫堂'],
 'Forest of Spirits': ['Forest of Spirits'],
 '昭容閣': ['昭容閣'],
 '花郎的房間': ['花郎的房間'],
 '江南': ['江南'],
 '蓬山': ['蓬山'],
 '黃堂': ['黃堂'],
 '紫微宮': ['紫微宮'],
 '海雲東': ['海雲東'],
 'Battlefield': ['Battlefield'],
 '拜月堂': ['拜月堂'],
 'Funeral site': ['Funeral site'],
 'Temple Courtyard': ['Temple Courtyard'],
 'Theater Stage': ['Theater Stage'],
 '華陽洞裏仙壇上': ['華陽洞裏仙壇上'],
 '閻浮界': ['閻浮界'],
 '南安府後花園': ['南安府後花園'],
 '望鄉臺': ['望鄉臺'],
 'Near the forest and springs': ['Near the forest and springs'],
 '寒塘蕙艸': ['寒塘蕙艸'],
 '畫館': ['畫館'],
 '戶庭': ['戶庭'],
 '道場': ['道場'],
 'Temple altar': ['Temple altar'],
 '春殿': ['春殿'],
 '斜樓上': ['斜樓上'],
 '梅

In [None]:
# Function to find the correct key based on a search string, ensuring whole-word matches
def find_key_from_alias(text, character_dict):
    for key, aliases in character_dict.items():
        for alias in sorted(aliases, key=len, reverse=True):
            # Match the alias as a whole word, and ensure no partial matches within other words
            pattern = rf'\b{re.escape(alias)}\b(?!\'s)'  # (?!\'s) ensures no match for possessives like Mitsuha's
            if re.search(pattern, text):
                return key  # Return the key if the alias is found as a whole word
    return text  # Return the original text if no match is found

In [None]:
# Load the scenes
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)

# Capitalize output and location output
capitalized_output = {string.capwords(key): [string.capwords(name) for name in value] for key, value in output.items()}
capitalized_location_output = {string.capwords(key): [string.capwords(name) for name in value] for key, value in location_output.items()}

# Process each scene
for scene in scenes:
    # Process characters
    for character in scene["characters"]:
        char = string.capwords(character["name"])
        found_char = False

        # Check if the character's name is a key in the dictionary
        if char in capitalized_output:
            found_char = True
            character["name"] = char  # Capitalize the character name
        
        # If not found in the dictionary, check the aliases
        if not found_char:
            # Find the right key for the alias (if exists)
            matched_key = find_key_from_alias(char, capitalized_output)
            if matched_key != char:
                character["name"] = matched_key
                found_char = True
        if not found_char:
            character["name"] = char

    # Process location
    loc = string.capwords(scene["location"])
    found_loc = False

    # Check if the location is a key in the location dictionary
    if loc in capitalized_location_output:
        found_loc = True
        scene["location"] = loc  # Capitalize the location

    # If not found in the dictionary, check the aliases
    if not found_loc:
        matched_key = find_key_from_alias(loc, capitalized_location_output)
        if matched_key != loc:
            scene["location"] = matched_key
            found_loc = True
    if not found_loc:
        scene["location"] = loc

# Save the updated scenes
with open(f"json/{story_name}/all.json", "w") as f:
    json.dump(scenes, f, indent=4)

In [None]:
# now list all characters and locations again
# collect all unique characters and locations for each chapter
all_characters_new = {}
all_locations_new = []
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)
for scene in scenes:
    for character in scene["characters"]:
        char = character["name"]
        if char not in all_characters_new:
            all_characters_new[char] = 1
        else:
            all_characters_new[char] += 1
    location = scene["location"]
    if location not in all_locations_new:
        all_locations_new.append(location)

In [None]:
len(all_characters_new), len(all_locations_new)

(210, 85)

In [None]:
all_characters_new 

{'張窈窕': 1,
 '王昌齡': 1,
 '曹松': 3,
 '韓偓': 2,
 '嬌鸎': 1,
 '孩兒': 1,
 '春香': 8,
 '老旦': 13,
 '旦': 12,
 '外': 5,
 '柳宗元': 3,
 '王建': 5,
 '苗發': 1,
 '劉禹錫': 2,
 'Unknown Character 1': 1,
 'Unknown Character 2': 1,
 'Unknown Character 3': 1,
 'Unknown Character 4': 1,
 'Unknown Character 5': 1,
 '杜甫': 8,
 '錢起': 4,
 '柳春卿': 1,
 '趙佗王': 1,
 '韓退之': 1,
 '陳最良': 3,
 '杜小姐': 10,
 '李山甫': 1,
 '趙嘏': 1,
 '張泌': 1,
 '張祜': 4,
 '崔日用': 1,
 '韋應物': 1,
 '裊晴絲': 1,
 '翠生生': 1,
 '姹紫嫣紅': 1,
 '小金鈴': 1,
 'Spring Girl': 1,
 'Old Lady': 4,
 '柳夢梅': 13,
 '花神': 4,
 '張說': 1,
 '許渾': 2,
 '羅隱': 4,
 '韋莊': 3,
 '女孩兒': 1,
 '丫頭': 2,
 '蘇廣文': 1,
 '鄭 谷': 2,
 '段成式': 1,
 '唐彥謙': 1,
 'Unknown Scholar': 1,
 'Unknown Performer': 1,
 '普天樂': 1,
 '雁過聲': 1,
 '玉芙蓉': 1,
 '心喜': 1,
 '花郎': 1,
 '鮑老催': 2,
 '崔道融': 1,
 '羅虬': 1,
 '譚峭': 1,
 '李紳': 2,
 '劉長卿': 5,
 '張籍': 4,
 '項斯': 1,
 '丑府主': 1,
 '夫人': 1,
 '淨': 17,
 '釋皎然': 1,
 '司空圖': 3,
 '六么令': 1,
 '合': 1,
 '前腔': 1,
 '杜牧': 3,
 '曹唐': 3,
 '王維': 3,
 '病旦': 1,
 '陳師父': 2,
 'Unknown Character': 4,
 'Old Woman': 1,
 '小姐': 4,
 '陈先生

In [None]:

all_locations_new

['門前梅柳',
 '酒臺',
 '後堂公所',
 'Spring Breeze Gathering',
 'Unknown Location',
 '杏壇',
 '台上',
 '書堂',
 '西園',
 'East Suburbs',
 '閒庭院',
 'Spring Garden',
 '花園',
 '畫堂',
 'Forest Of Spirits',
 '昭容閣',
 '花郎的房間',
 '江南',
 '蓬山',
 '黃堂',
 '紫微宮',
 '海雲東',
 'Battlefield',
 '拜月堂',
 'Funeral Site',
 'Temple Courtyard',
 'Theater Stage',
 '華陽洞裏仙壇上',
 '閻浮界',
 '南安府後花園',
 '望鄉臺',
 'Near The Forest And Springs',
 '寒塘蕙艸',
 '畫館',
 '戶庭',
 '道場',
 'Temple Altar',
 '春殿',
 '斜樓上',
 '梅花觀',
 'Poetic Exchange',
 'First Human Realm',
 '仙院重門',
 'Nighttime Dreamscape',
 '江北新樓',
 '夾城雲煖下霓旄',
 '城西',
 '墓地',
 '牡丹亭內',
 '泉臺',
 '天台',
 '深院閒階',
 '船上',
 '藍橋驛',
 'Moonlit River',
 '丘墳發掘當官路',
 '空房',
 '金魚館',
 'Red Powder Tower',
 '郡城南下',
 '鳳凰閣',
 '考試現場',
 'Poetic Gathering',
 '江湖',
 '淮城',
 '城外',
 '孤城',
 '危樓',
 '秦淮夜泊',
 'Yangzhou Road',
 '屠門',
 '城圍的鐵桶似緊',
 'Riverbank',
 'Imaginary Realm',
 '太平宴',
 '宴會廳',
 '酒亭',
 'Imperial Palace',
 'Long An Road',
 'Night Reading',
 '前腔',
 '深巷門斜',
 'Heavenly Gate',
 '陰陽門前',
 '官前']

## produce chapter summaries

In [None]:
# Pydantic
class ChapterSummary(BaseModel):
    """Summarize a chapter based on scenes."""
    summary: str = Field(description="A brief, 1-line summary of the chapter")

chapter_llm = llm.with_structured_output(ChapterSummary)

In [None]:
# remove any markers without any scenes
markers_to_remove = []
for i, marker in enumerate(markers):
    with open(f"json/{story_name}/{marker}.json") as f:
        scenes = json.load(f)
    if len(scenes) == 0:
        markers_to_remove.append(marker)

for marker in markers_to_remove:
    markers.remove(marker)

In [None]:
async def read_json_async(file_path):
    """Asynchronously read a JSON file."""
    async with aiofiles.open(file_path, mode="r") as f:
        data = await f.read()
        return json.loads(data)
    
async def read_text_async(file_path):
    """Asynchronously read a text file."""
    async with aiofiles.open(file_path, mode="r") as f:
        return await f.read()
    
async def process_chapter_summary_async(marker):
    """Process the summary for a single chapter."""
    scenes_file = f"json/{story_name}/{marker}.json"
    chapter_file = f"chapters/{og_story_name}/{marker}.txt"

    # Read JSON and text files asynchronously
    scenes = await asyncio.gather(read_json_async(scenes_file), read_text_async(chapter_file))

    # Create scene summaries string
    scene_summaries = " ".join([scene["summary"] for scene in scenes[0]])

    # Get summary from LLM
    summary = await chapter_llm.ainvoke(scene_summaries)

    chapter_summary = {
        "chapter": marker,
        "scenes": len(scenes),
        "length": len(chapter),
        "num_lines": len(chapter.split("\n")),
        "summary": summary.summary,
        "importance": [],
        "conflict": [],
        "locations": {},
        "characters": {}
    }

    return chapter_summary

async def generate_chapter_summaries_async():
    """Generate summaries for all chapters in parallel."""
    # Processs chapters in parallel
    chapter_summaries = await asyncio.gather(*[process_chapter_summary_async(marker) for marker in markers])

    # Read all.json file asynchronously
    scenes = await read_json_async(f"json/{story_name}/all.json")

    # Aggregate character and location information
    for scene in scenes:
        for scene in scenes:
            chapter = scene["chapter"]
            chapter_summary = chapter_summaries[markers.index(chapter)]
            # add location to chapter summary
            location = scene["location"]
            chapter_summary_locations = chapter_summary["locations"]
            if location not in chapter_summary_locations:
                chapter_summary_locations[location] = 1
            else:
                chapter_summary_locations[location] += 1
            # add importance to chapter summary
            importance = scene["importance"]
            chapter_summary["importance"].append(importance)
            # add conflict to chapter summary
            conflict = scene["conflict"]
            chapter_summary["conflict"].append(conflict)
            # add characters to chapter summary
            characters = scene["characters"]
            for character in characters:
                char = character["name"]
                chapter_summary_characters = chapter_summary["characters"]
                if char not in chapter_summary_characters:
                    chapter_summary_characters[char] = 1
                else:
                    chapter_summary_characters[char] += 1

    # now average importance and conflict for each chapter
    for chapter_summary in chapter_summaries:
        if len(chapter_summary["importance"]) == 0:
            print(f"Chapter {chapter_summary['chapter']} has no scenes.")
            continue
        chapter_summary["importance"] = round(sum(chapter_summary["importance"]) / len(chapter_summary["importance"]), 2)
        chapter_summary["conflict"] = round(sum(chapter_summary["conflict"]) / len(chapter_summary["conflict"]), 2)

    print("Importances", [chapter_summary["importance"] for chapter_summary in chapter_summaries])
    print("Conflicts", [chapter_summary["conflict"] for chapter_summary in chapter_summaries])

    # rank each chapter by importance and conflict
    importances = []
    conflicts = []
    for chapter_summary in chapter_summaries:
        importances.append((chapter_summary["chapter"], chapter_summary["importance"]))
        conflicts.append((chapter_summary["chapter"], chapter_summary["conflict"]))
        
    # sort importances
    sorted_importances = sorted(importances, key=lambda x: x[1], reverse=True)
    # add importance_rank to each chaptera
    for k, (j, _) in enumerate(sorted_importances):
        chapter_summaries[markers.index(j)]["importance_rank"] = k+1

    # sort conflicts
    sorted_conflicts = sorted(conflicts, key=lambda x: x[1], reverse=True)
    # add conflict_rank to each chapter
    for k, (j, _) in enumerate(sorted_conflicts):
        chapter_summaries[markers.index(j)]["conflict_rank"] = k+1

    return chapter_summaries

In [None]:
# create summary json of each chapter
chapter_summaries = await generate_chapter_summaries_async()

Importances [0.7, 0.77, 0.5, 0.5, 0.7, 0.75, 0.5, 0.78, 0.7, 0.5, 0.8, 0.5, 0.7, 0.7, 0.75, 0.77, 0.75, 0.7, 0.83, 0.7, 0.8, 0.77, 0.83, 0.5, 0.77, 0.78, 0.75, 0.7, 0.7, 0.78, 0.78, 0.8, 0.75, 0.5, 0.8, 0.8, 0.82, 0.77, 0.9, 0.8, 0.7, 0.77, 0.8, 0.5, 0.7, 0.77, 0.8]
Conflicts [0.2, 0.3, 0.3, 0.0, 0.3, 0.3, 0.0, 0.4, 0.4, 0.2, 0.44, 0.2, 0.3, 0.4, 0.6, 0.57, 0.25, 0.2, 0.62, 0.2, 0.5, 0.3, 0.45, 0.2, 0.5, 0.52, 0.25, 0.3, 0.5, 0.52, 0.45, 0.6, 0.35, 0.2, 0.5, 0.65, 0.7, 0.57, 0.8, 0.6, 0.3, 0.47, 0.3, 0.2, 0.3, 0.52, 0.62]


In [None]:
chapter_summaries

[{'chapter': '第二\u3000言懷',
  'scenes': 2,
  'length': 4670,
  'num_lines': 16,
  'summary': 'The chapter explores themes of beauty and longing through evocative poetic lines.',
  'importance': 0.7,
  'conflict': 0.2,
  'locations': {'門前梅柳': 115},
  'characters': {'張窈窕': 115, '王昌齡': 115, '曹松': 115, '韓偓': 115},
  'importance_rank': 29,
  'conflict_rank': 38},
 {'chapter': '第三\u3000訓女',
  'scenes': 2,
  'length': 4670,
  'num_lines': 16,
  'summary': "The family gathers to celebrate with wine and blessings, while Lady Zhen and Du Taishou discuss their daughter's education amidst the recitation of reflective poems on life and legacy.",
  'importance': 0.77,
  'conflict': 0.3,
  'locations': {'酒臺': 115, '後堂公所': 115, 'Spring Breeze Gathering': 115},
  'characters': {'嬌鸎': 115,
   '孩兒': 115,
   '春香': 115,
   '老旦': 115,
   '旦': 115,
   '外': 115,
   '柳宗元': 115,
   '王建': 115,
   '苗發': 115,
   '劉禹錫': 115},
  'importance_rank': 17,
  'conflict_rank': 26},
 {'chapter': '第四\u3000腐歎',
  'scenes': 2,


In [None]:
# read in scenes from all_json
with open(f"json/{story_name}/all.json") as f:
    scenes = json.load(f)

In [None]:
all_json = {}
all_json["title"] = story_info["title"]
all_json["type"] = story_info["type"]
if (all_json["type"] == "Movie"):
    all_json["director"] = story_info["director"]
else:
    all_json["author"] = story_info["author"]
all_json["year"] = story_info["year"]
if "url" in story_info:
    all_json["url"] = story_info["url"]
if "image" in story_info:
    all_json["image"] = story_info["image"]
all_json["num_chapters"] = len(chapters)
all_json["num_scenes"] = len(scenes)
all_json["num_characters"] = len(all_characters_new)
all_json["num_locations"] = len(all_locations_new)
all_json["chapters"] = chapter_summaries
all_json["scenes"] = scenes

## produce character summaries

In [None]:
# Pydantic
class CharacterInfo(BaseModel):
    """Get information about a character in the story."""
    name: str = Field(description="Name of character.")
    quote: str = Field(description="Direct quote from the story that the character says or that describes them.")
    group: str = Field(description="Group that this character belongs to (e.g., main characters, upper class, talking animals). There should be at least 2 different groups of characters.")
    # color: str = Field(description="Unique aesthetic RGB color string that represents this character (e.g., rgb(118, 185, 71)). Every character should have a different color. Don't use white and make sure the color is visible against a white background.")
    # explanation: str = Field(description="Explanation why this color was chosen for this character.")

class LegendThemeInfo(BaseModel):
    """Get information about a theme in the story."""
    name: str = Field(description="Name of theme.")
    quote: str = Field(description="Direct quote from the story that describes this theme.")
    group: Optional[str] = Field(description="Group that this theme belongs to (e.g., Family, Social Themes, Emotions). There should be at least 2 different groups of themes. Make sure all groups are distinct and don't overlap with each other.")

class CharacterList(BaseModel):
    """Get overall information about the story."""
    # characterlist: list[CharacterInfo] = Field(description="List of character details. Make sure there is exactly one entry per character in the provided list and no additional characters are added. Choose a different color for each character.")
    characterlist: list[CharacterInfo] = Field(description="List of character details. Make sure there is exactly one entry per character in the provided list and no additional characters are added.")

class LegendThemeList(BaseModel):
     """Get overall information about the story."""
     themelist: list[LegendThemeInfo] = Field(description="List of theme details. Make sure there is exactly one entry per theme in the provided list and no additional themes are added.")

class ColorInfo(BaseModel):
    """Pick a color + explain why for each character"""
    name: str = Field(description="Name of character.")
    color: str = Field(description="Unique aesthetic RGB color string that represents this character (e.g., rgb(118, 185, 71)). Every character should have a different color. Don't use white and make sure the color is visible against a white background.")
    explanation: str = Field(description="Explanation why this color was chosen for this character.")

class ThemeColorInfo(BaseModel):
    """Pick a color + explain why for each theme"""
    name: str = Field(description="Name of theme.")
    color: str = Field(description="Unique aesthetic RGB color string that represents this theme (e.g., rgb(118, 185, 71)). Every theme should have a different color. Don't use white and make sure the color is visible against a white background.")
    explanation: str = Field(description="Explanation why this color was chosen for this theme.")

class ColorList(BaseModel):
    """List of color info for characters"""
    colorlist: list[ColorInfo] = Field(description="List of color details for each character. Make sure there is exactly one entry per character in the provided list and no additional characters are added. Choose a different color for each character.")

class ThemeColorList(BaseModel):
    """List of color info for themes"""
    colorlist: list[ThemeColorInfo] = Field(description="List of color details for each theme. Make sure there is exactly one entry per theme in the provided list and no additional themes are added. Choose a different color for each theme.")

if analysis_type == "theme":
    characterlist_llm = llm.with_structured_output(LegendThemeList)
    colorlist_llm = llm.with_structured_output(ThemeColorList)
else:
    characterlist_llm = llm.with_structured_output(CharacterList)
    colorlist_llm = llm.with_structured_output(ColorList)

In [None]:
# sort all_characters_new by count, descending
sorted_characters = dict(sorted(all_characters_new.items(), key=lambda item: item[1], reverse=True))
# take top 20 characters
top_characters = dict(list(sorted_characters.items())[:20])
top_characters

{'淨': 17,
 '老旦': 13,
 '柳夢梅': 13,
 '生': 13,
 '旦': 12,
 '丑': 11,
 '杜小姐': 10,
 '春香': 8,
 '杜甫': 8,
 '杜寶': 8,
 '外': 5,
 '王建': 5,
 '劉長卿': 5,
 '白居易': 5,
 '錢起': 4,
 '張祜': 4,
 'Old Lady': 4,
 '花神': 4,
 '羅隱': 4,
 '張籍': 4}

In [None]:
character_arr = []
cur_arr = []
max_chars = 20

char_names = list(all_characters_new.keys())
# char_names = list(top_characters.keys())
for i, c in enumerate(char_names):
    cur_arr.append(c)
    if len(cur_arr) == max_chars or i == len(char_names) - 1:
        character_arr.append(cur_arr)
        cur_arr = []

for arr in character_arr:
    print(len(arr))


20
20
20
20
20
20
20
20
20
20
10


In [None]:
async def process_character_chunk(i, arr):
    """Process a single chunk of character assignment asynchronously."""
    invalid_input = True
    while invalid_input:
        try:
            # filter scenes to include only those with relevant characters
            new_scenes = [scene for scene in scenes if any(char["name"] in arr for char in scene["characters"])]

             # take middle 126000 characters of new_scenes
            new_scenes_str = json.dumps(new_scenes)
            new_scenes_str = new_scenes_str[:126000]

            # construct prompt
            prompt = f"""characters: {arr}
                         story info: {new_scenes_str}

                         your output should contain exactly the same {len(arr)} characters as in the original list.

                         remember to only use direct quotes from the text and not paraphrase or change the text in any way.
                         """

            # invoke model
            res = await characterlist_llm.ainvoke(prompt)

            print(f"Chunk {i} processed.")
            return res

        except:
            print("Exception thrown. Trying again.")
            print("Error:", e)
            await asyncio.sleep(1) # wait for 1 second before trying again

async def process_all_characters():
    """Process all characters in parallel."""
    tasks = [process_character_chunk(i, arr) for i, arr in enumerate(character_arr)]
    results = await asyncio.gather(*tasks)
    return results

In [None]:
# Run the main function to process all characters
all_res = await process_all_characters()

Chunk 8 processed.
Chunk 2 processed.
Chunk 5 processed.
Chunk 10 processed.
Chunk 1 processed.
Chunk 6 processed.
Chunk 3 processed.
Chunk 4 processed.
Chunk 7 processed.
Chunk 9 processed.
Chunk 0 processed.


In [None]:
# combine characterlists into one
characters = []
for res in all_res:
    if analysis_type == "theme":
        print(len(res.themelist))
        characters += res.themelist
    else:
        print(len(res.characterlist))
        characters += res.characterlist

len(characters)

20
20
20
20
20
20
20
20
20
20
10


210

In [None]:
character_names = [char.name for char in characters]
for name in character_names:
    if name not in char_names:
        print("not in orig list:", name)

need_to_add = []
for name in char_names:
    if name not in character_names:
        print("not in llm output:", name)
        need_to_add.append(name)

# find duplicate names
unique_names = []
duplicate_indices = []
for i, name in enumerate(character_names):
    if name not in unique_names:
        unique_names.append(name)
    else:
        print("duplicate:", name, i)
        duplicate_indices.append(i)

print("need_to_add", len(need_to_add))
print("duplicate_indices", len(duplicate_indices))


need_to_add 0
duplicate_indices 0


### group duplicate character groups

In [None]:
# group similar groups

group_prompt = """
You are given a list of groups with some duplicates or alternate names.
Your task is to group all the alternate names for each unique group under a single name.
Your goal is to have as few groups as possible while still capturing all the unique groups.

First create a new group list by removing all duplicates and alternate names and only keeping the best description for each group.

Then, output a JSON dictionary where the key is each group in your new list and the value is a list of all possible alternate names. 
All names should be taken directly from the original group list and should only appear once in the output.

For example:
{{
    "Main Characters": ["Main Characters", "Protagonists", "Heroes"],
    "Elders": ["Elders", "Family Authorities"],
    "Exploration": ["Exploration", "Adventure", "Discovery"],
    "Relationships": ["Relationships", "Friendships"]
}}
"""

# if using llm2 (claude)
group_prompt += "Just output the JSON dictionary as the final result without any additional information."

group_prompt_template = ChatPromptTemplate.from_messages([("system", group_prompt), ("human", "{input}")])

In [None]:
# get all unique groups
# groups = [c['group'] for c in characters]
groups = [c.group for c in characters]
unique_groups = list(set(groups))
len(unique_groups), unique_groups

(2, ['upper class', 'main characters'])

In [None]:
# Create the LLM chain with the prompt template and model

# Run the chain by passing the characters and JSON data as inputs
if analysis_type == "theme":
    final_prompt = group_prompt_template.invoke(f"list of groups for themes: {unique_groups}")
else:
    final_prompt = group_prompt_template.invoke(f"list of groups for characters: {unique_groups}")
group_llm = llm2
output = group_llm.invoke(final_prompt)

In [None]:
output

AIMessage(content='{\n    "Upper Class": ["upper class"],\n    "Main Characters": ["main characters"]\n}', additional_kwargs={}, response_metadata={'id': 'msg_01LzgqE1FcsScKxWQqhVdTxu', 'model': 'claude-3-5-sonnet-20240620', 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0, 'input_tokens': 240, 'output_tokens': 26}}, id='run-080e5ecf-57e9-49d5-8688-c42e69400cfc-0', usage_metadata={'input_tokens': 240, 'output_tokens': 26, 'total_tokens': 266, 'input_token_details': {'cache_read': 0, 'cache_creation': 0}})

In [None]:
output_formatted = output.content
# turn into json
output = json.loads(output_formatted)
len(output), output

(2, {'Upper Class': ['upper class'], 'Main Characters': ['main characters']})

In [None]:
# Capitalize output and location output
capitalized_output = {string.capwords(key): [string.capwords(name) for name in value] for key, value in output.items()}

# Process each scene
for c in characters:

    # Process group
    group = string.capwords(c.group)
    found_group = False

    # Check if the group is a key in the dictionary
    if group in capitalized_output:
        found_group = True
        c.group = group  # Capitalize the group

    # If not found in the dictionary, check the aliases
    if not found_group:
        matched_key = find_key_from_alias(group, capitalized_output)
        if matched_key != group:
            c.group = matched_key
            found_group = True
    if not found_group:
        c.group = group

### assign colors to characters

In [None]:
# now add colors for top characters
top_char_names = list(top_characters.keys())

all_res_colors = []
invalid_input = True
while invalid_input: # try until valid input
    # see if exception is thrown
    # filter all_json to only include characters in arr
    new_scenes = []
    for scene in scenes:
        include = False
        for character in scene["characters"]:
            if character["name"] in top_char_names:
                include = True
                break
        if include:
            new_scenes.append(scene)

    # take middle 126000 characters of new_scenes
    new_scenes_str = json.dumps(new_scenes)
    new_scenes_str = new_scenes_str[:126000]

    try:
        prompt = f"characters: {top_char_names}\nstory info: {new_scenes_str}\nyour output should contain exactly the same {len(top_char_names)} characters as in the original list. Make sure each character is assigned a unique color."
        res = colorlist_llm.invoke(prompt)
        all_res_colors.append(res)
        invalid_input = False
    except Exception as e:
        print("Exception thrown. Trying again.")
        print("Error:", e)
        invalid_input = True

In [None]:
# find duplicate colors
colorlist = []
for res in all_res_colors:
    colorlist += res.colorlist
    
unique_colors = []
duplicate_colors = []
for i, char in enumerate(colorlist):
    color = char.color
    if color not in unique_colors:
        unique_colors.append(color)
    else:
        print("duplicate color:", color, i)
        duplicate_colors.append(i)

duplicate color: rgb(255, 105, 180) 17


In [None]:
colorinfo = {}

for c in colorlist:
    colorinfo[c.name] = {
        "color": c.color,
        "explanation": c.explanation
    }

colorinfo

{'淨': {'color': 'rgb(255, 223, 186)',
  'explanation': "This soft peach color represents purity and innocence, reflecting the character's joyful and respectful nature."},
 '老旦': {'color': 'rgb(139, 69, 19)',
  'explanation': 'A rich brown symbolizes wisdom and experience, fitting for a character that embodies tradition.'},
 '柳夢梅': {'color': 'rgb(255, 105, 180)',
  'explanation': 'This vibrant pink reflects her youthful spirit and romantic aspirations.'},
 '生': {'color': 'rgb(0, 128, 0)',
  'explanation': "A deep green symbolizes growth and vitality, representing the character's connection to life."},
 '旦': {'color': 'rgb(255, 215, 0)',
  'explanation': "A bright gold color signifies brightness and optimism, aligning with the character's cheerful demeanor."},
 '丑': {'color': 'rgb(128, 0, 128)',
  'explanation': "A regal purple reflects the character's complexity and depth, embodying both humor and wisdom."},
 '杜小姐': {'color': 'rgb(173, 216, 230)',
  'explanation': 'A light blue represen

In [None]:
# remove duplicates
characters = [char for i, char in enumerate(characters) if i not in duplicate_indices]
print(len(characters))

210


In [None]:
character_info = []
for c in characters:

    # see if character is in color_info
    name = c.name
    color = ""
    explanation = ""
    if name in colorinfo:
        color = colorinfo[name]["color"]
        explanation = colorinfo[name]["explanation"]
    
    character_info.append({
        "name": name,
        "quote": c.quote,
        "group": c.group,
        "color": color,
        "explanation": explanation
    })

len(character_info), character_info

(210,
 [{'name': '張窈窕',
   'quote': 'No quote available',
   'group': 'Main Characters',
   'color': '',
   'explanation': ''},
  {'name': '王昌齡',
   'quote': 'No quote available',
   'group': 'Main Characters',
   'color': '',
   'explanation': ''},
  {'name': '曹松',
   'quote': 'No quote available',
   'group': 'Main Characters',
   'color': '',
   'explanation': ''},
  {'name': '韓偓',
   'quote': 'No quote available',
   'group': 'Main Characters',
   'color': '',
   'explanation': ''},
  {'name': '嬌鸎',
   'quote': 'No quote available',
   'group': 'Main Characters',
   'color': '',
   'explanation': ''},
  {'name': '孩兒',
   'quote': 'No quote available',
   'group': 'Main Characters',
   'color': '',
   'explanation': ''},
  {'name': '春香',
   'quote': 'No quote available',
   'group': 'Main Characters',
   'color': 'rgb(255, 182, 193)',
   'explanation': 'A soft pink symbolizes affection and warmth, reflecting her nurturing personality.'},
  {'name': '老旦',
   'quote': 'No quote availa

In [None]:
for name in need_to_add:
    character_info.append({
        "name": name,
        "quote": "",
        "group": "misc",
        "color": "",
        "explanation": ""
    })

len(character_info), character_info

(210,
 [{'name': '張窈窕',
   'quote': 'No quote available',
   'group': 'Main Characters',
   'color': '',
   'explanation': ''},
  {'name': '王昌齡',
   'quote': 'No quote available',
   'group': 'Main Characters',
   'color': '',
   'explanation': ''},
  {'name': '曹松',
   'quote': 'No quote available',
   'group': 'Main Characters',
   'color': '',
   'explanation': ''},
  {'name': '韓偓',
   'quote': 'No quote available',
   'group': 'Main Characters',
   'color': '',
   'explanation': ''},
  {'name': '嬌鸎',
   'quote': 'No quote available',
   'group': 'Main Characters',
   'color': '',
   'explanation': ''},
  {'name': '孩兒',
   'quote': 'No quote available',
   'group': 'Main Characters',
   'color': '',
   'explanation': ''},
  {'name': '春香',
   'quote': 'No quote available',
   'group': 'Main Characters',
   'color': 'rgb(255, 182, 193)',
   'explanation': 'A soft pink symbolizes affection and warmth, reflecting her nurturing personality.'},
  {'name': '老旦',
   'quote': 'No quote availa

## produce location summaries

In [None]:
# Pydantic
class LocationInfo(BaseModel):
    """Get information about a location in the story."""
    name: str = Field(description="Name of location.")
    quote: str = Field(description="Direct quote from the story that describes this location")
    emoji: str = Field(description="Emoji that represents this location (e.g., 🏰). Make sure each location has a different emoji.")

class LocationList(BaseModel):
    """Get overall information about the story."""
    locationList: list[LocationInfo] = Field(description="List of location details. One entry per location.")

locationList_llm = llm.with_structured_output(LocationList)

In [None]:
cropped_json = json.dumps(all_json)[:126000]
location_prompt = f"locations: {all_locations_new}\nstory info: {cropped_json}\nyour output should contain exactly the same {len(all_locations_new)} locations as in the original list."
invalid_input = True
while invalid_input: # try until valid input
    # see if exception is thrown
    try:
        res = locationList_llm.invoke(location_prompt)
        invalid_input = False
    except Exception as e:
        print("Exception thrown. Trying again.")
        print("Error:", e)
        invalid_input = True

In [None]:
location_info = []
for l in res.locationList:
    location_info.append({
        "name": l.name,
        "quote": l.quote,
        "emoji": l.emoji
    })

print(len(location_info))
print(location_info)

85
[{'name': '門前梅柳', 'quote': '門前梅柳，春風拂面，花開似錦。', 'emoji': '🌸'}, {'name': '酒臺', 'quote': '酒臺上，杯盞交錯，笑語盈盈。', 'emoji': '🍷'}, {'name': '後堂公所', 'quote': '後堂公所，靜謐如水，思緒萬千。', 'emoji': '🏛️'}, {'name': 'Spring Breeze Gathering', 'quote': '春風聚會，詩詞歌賦，情意綿綿。', 'emoji': '🌼'}, {'name': 'Unknown Location', 'quote': '此地不明，卻充滿神秘的氣息。', 'emoji': '❓'}, {'name': '杏壇', 'quote': '杏壇之下，教誨如春，桃李滿天下。', 'emoji': '🌳'}, {'name': '台上', 'quote': '台上表演，才子佳人，風華絕代。', 'emoji': '🎭'}, {'name': '書堂', 'quote': '書堂靜謐，墨香四溢，心靈的港灣。', 'emoji': '📚'}, {'name': '西園', 'quote': '西園花開，鳥語花香，心隨意動。', 'emoji': '🌺'}, {'name': 'East Suburbs', 'quote': '東郊晨曦，萬物復甦，生機勃勃。', 'emoji': '🌅'}, {'name': '閒庭院', 'quote': '閒庭院中，竹影搖曳，心隨景動。', 'emoji': '🏡'}, {'name': 'Spring Garden', 'quote': '春天的花園，繁花似錦，生機盎然。', 'emoji': '🌷'}, {'name': '花園', 'quote': '花園裡，百花爭艷，芬芳四溢。', 'emoji': '🌻'}, {'name': '畫堂', 'quote': '畫堂內，墨韻悠揚，藝術的殿堂。', 'emoji': '🎨'}, {'name': 'Forest Of Spirits', 'quote': '靈魂之林，神秘而幽靜，靜謐的思考。', 'emoji': '🌲'}, {'name': '昭容閣', 'quote': '昭容閣上，月明點滴，思念如潮。', 'em

## generating the final json file!

In [None]:
# add character and location info to all_json
all_json["characters"] = character_info
all_json["locations"] = location_info
all_json

{'title': '牡丹亭 Peony Pavilion',
 'type': 'Theater',
 'author': '汤显祖 Tang Xianzu',
 'year': 1598,
 'url': 'https://zh.wikisource.org/zh-hans/%E7%89%A1%E4%B8%B9%E4%BA%AD',
 'image': 'covers/牡丹亭.png',
 'num_chapters': 55,
 'num_scenes': 115,
 'num_characters': 210,
 'num_locations': 85,
 'chapters': [{'chapter': '第二\u3000言懷',
   'scenes': 2,
   'length': 4670,
   'num_lines': 16,
   'summary': 'The chapter explores themes of beauty and longing through evocative poetic lines.',
   'importance': 0.7,
   'conflict': 0.2,
   'locations': {'門前梅柳': 115},
   'characters': {'張窈窕': 115, '王昌齡': 115, '曹松': 115, '韓偓': 115},
   'importance_rank': 29,
   'conflict_rank': 38},
  {'chapter': '第三\u3000訓女',
   'scenes': 2,
   'length': 4670,
   'num_lines': 16,
   'summary': "The family gathers to celebrate with wine and blessings, while Lady Zhen and Du Taishou discuss their daughter's education amidst the recitation of reflective poems on life and legacy.",
   'importance': 0.77,
   'conflict': 0.3,
   '

In [None]:
# save as json
with open(f"json/{story_name}/final_data.json", "w") as f:
    json.dump(all_json, f, indent=4)