## see if scene text can be found in chapter file

In [600]:
import json
import os

import pandas as pd
import altair as alt

import re

In [45]:
# load json data
# The encoding is pinned to UTF-8 so the parsed text does not depend on the
# platform's default locale encoding.
story = "alice"
with open(f"../src/data/{story}-new.json", encoding="utf-8") as f:
    data = json.load(f)

In [46]:
# Folder holding this story's chapter text files, plus the chapter and scene
# records taken from the loaded JSON.
chapters_folder = f"chapters/{story}/"
chapters, scenes = data["chapters"], data["scenes"]

In [47]:
# For every chapter, check that the text of each of its scenes occurs verbatim
# in the chapter file (after stripping the "<prefix>: " part of every line) and
# print the scenes that cannot be found.
for chap in chapters:
    chap_title = chap["chapter"]
    chap_text_file = f"{chapters_folder}{chap_title}.txt"
    # Pin the encoding so non-ASCII story text is decoded the same everywhere.
    with open(chap_text_file, "r", encoding="utf-8") as f:
        chap_text = f.read()

    # only take text after ": " in each line; lines without ": " are dropped
    chap_text = "\n".join(line.split(": ", 1)[1] for line in chap_text.split("\n") if ": " in line)

    chap_scenes = [scene for scene in scenes if scene["chapter"] == chap_title]
    for scene in chap_scenes:
        scene_num = scene["number"]
        scene_title = scene["title"]
        scene_text = scene["text"]

        if scene_text not in chap_text:
            print(f"Scene {chap_title}-{scene_title} (scene #{scene_num}) not found in {chap_title}.")
            print()

## check if chapters in public match those in notebooks

In [None]:
# list public chapter folder names and the "<story>-new.json" data files,
# skipping every story named in ignore_stories
public_chapter_folder = "../public/chapters/"
public_data_folder = "../src/data/"

ignore_stories = ["yourname"]

# story folders (directories only), sorted by name
public_chapters = sorted(
    name
    for name in os.listdir(public_chapter_folder)
    if os.path.isdir(os.path.join(public_chapter_folder, name)) and name not in ignore_stories
)

# Data files in directory-listing order. Matching the full "-new.json" suffix
# keeps out names that merely contain that text somewhere in the middle, and
# the story name is whatever precedes the suffix.
data_suffix = "-new.json"
public_data = [
    fname
    for fname in os.listdir(public_data_folder)
    if fname.endswith(data_suffix) and fname[: -len(data_suffix)] not in ignore_stories
]

# len(public_chapters), public_chapters
len(public_data), public_data

(36,
 ['whispers-new.json',
  'littlewomen-new.json',
  'tenant-new.json',
  'bookstore-new.json',
  'redchamber-new.json',
  'starlight-new.json',
  'genji-new.json',
  'ulysses-new.json',
  'romeo-new.json',
  'victoria-new.json',
  'janeeyre-new.json',
  'odyssey-new.json',
  'trial-new.json',
  'pygmalion-new.json',
  'faust-new.json',
  'time-new.json',
  'mendips-new.json',
  'candide-new.json',
  'artofwar-new.json',
  'metamorphosis-new.json',
  'war-new.json',
  'anne-new.json',
  'frankenstein-new.json',
  'greatexp-new.json',
  'school-new.json',
  'alice-new.json',
  'threads-new.json',
  'gatsby-new.json',
  'iliad-new.json',
  'color-new.json',
  'donquixote-new.json',
  'pride-new.json',
  'hamlet-new.json',
  'emma-new.json',
  'wizard-new.json',
  'marrow-new.json'])

In [63]:
# Compare every public chapter .txt file with the notebook copy of the same
# name and print each line that differs.
from itertools import zip_longest  # local import: only this cell needs it

for story in public_chapters:
    # compare each .txt file in this folder with the notebook copy
    # (public_chapter_folder already ends with "/", so no separator is added)
    chapters_folder = f"{public_chapter_folder}{story}/"
    notebook_chapter_folder = f"chapters/{story}/"
    public_txt_files = sorted(name for name in os.listdir(chapters_folder) if name.endswith(".txt"))

    for pf in public_txt_files:
        # load in txt file (encoding pinned so both copies decode identically)
        with open(chapters_folder + pf, "r", encoding="utf-8") as f:
            chap_text = f.read()

        # load corresponding chapter txt from notebook_chapter_folder; a missing
        # notebook copy is reported instead of aborting the whole comparison
        try:
            with open(f"{notebook_chapter_folder}{pf}", "r", encoding="utf-8") as f:
                notebook_chap_text = f.read()
        except FileNotFoundError:
            print(f"Chapter {pf} in {story} has no notebook copy.")
            print()
            continue

        if chap_text == notebook_chap_text:
            continue

        print(f"Chapter {pf} in {story} does not match.")
        # find problematic lines; zip_longest (unlike zip) also reports lines
        # that exist in only one of the two files
        chap_text_lines = chap_text.split("\n")
        notebook_chap_text_lines = notebook_chap_text.split("\n")

        for i, (line1, line2) in enumerate(zip_longest(chap_text_lines, notebook_chap_text_lines)):
            if line1 != line2:
                print(f"Line {i}:")
                print(f"  {'<missing>' if line1 is None else line1}")
                print(f"  {'<missing>' if line2 is None else line2}")
                print()
        print()
        print()

## print number of lines in each story

In [90]:
# llm-generated stories
llm_stories = ["bookstore", "color", "starlight", "threads", "time", "whispers"]
# stories that are plays; the counting loop labels a story in neither list "non-play"
plays = ["faust", "hamlet", "pygmalion", "romeo", "school"]

In [91]:
# Per story: total line count (sum of each chapter's "num_lines") and total
# size of the scenes' "characters" lists, printed and stored by story name.
# Every scene's "explanation" is also collected together with the story type.
all_line_counts = {}
all_quote_counts = {}
scene_divisions = []
for f in sorted(public_data):
    story_formatted = f.split("-new.json")[0]

    # The story type depends only on the story, so decide it once per file
    # rather than once per scene.
    if story_formatted in llm_stories:
        story_type = "llm"
    elif story_formatted in plays:
        story_type = "play"
    else:
        story_type = "non-play"

    # Pin the encoding so the JSON is decoded the same on every platform.
    with open(f"{public_data_folder}{f}", "r", encoding="utf-8") as file:
        data = json.load(file)

    chapters = data["chapters"]
    scenes = data["scenes"]
    num_lines = sum(chap["num_lines"] for chap in chapters)
    num_quotes = sum(len(scene["characters"]) for scene in scenes)
    scene_divisions.extend(
        {"exp": scene["explanation"], "story_type": story_type} for scene in scenes
    )

    print(f"{f}: {num_lines}, {num_quotes}")

    all_line_counts[story_formatted] = num_lines
    all_quote_counts[story_formatted] = num_quotes

alice-new.json: 2463, 176
anne-new.json: 8703, 626
artofwar-new.json: 7571, 351
bookstore-new.json: 1388, 103
candide-new.json: 2877, 350
color-new.json: 1672, 92
donquixote-new.json: 16099, 804
emma-new.json: 4215, 276
faust-new.json: 6242, 123
frankenstein-new.json: 6655, 332
gatsby-new.json: 4710, 181
genji-new.json: 7864, 342
greatexp-new.json: 16105, 908
hamlet-new.json: 5199, 92
iliad-new.json: 19463, 561
janeeyre-new.json: 16560, 615
littlewomen-new.json: 16680, 1022
marrow-new.json: 7820, 526
mendips-new.json: 7511, 424
metamorphosis-new.json: 1752, 82
odyssey-new.json: 9246, 511
pride-new.json: 10997, 911
pygmalion-new.json: 2630, 95
redchamber-new.json: 15778, 882
romeo-new.json: 3401, 85
school-new.json: 2948, 94
starlight-new.json: 1575, 101
tenant-new.json: 14546, 739
threads-new.json: 1504, 100
time-new.json: 1650, 123
trial-new.json: 6472, 155
ulysses-new.json: 25435, 583
victoria-new.json: 7555, 341
war-new.json: 4482, 309
whispers-new.json: 1741, 165
wizard-new.json: 3

In [51]:
# llm-generated stories (same six names as the earlier list, in a different
# order); this order becomes the insertion order of llm_count_dict
llm_stories = ["bookstore", "color", "starlight", "time", "threads", "whispers"]

In [52]:
# Split the per-story line counts into two dictionaries: one for the
# LLM-generated stories and one for every other public story.
llm_count_dict = {name: all_line_counts[name] for name in llm_stories}

human_count_dict = {
    name: all_line_counts[name]
    for name in public_chapters
    if name not in llm_stories
}

In [53]:
# show the size and contents of both line-count dictionaries
len(human_count_dict), human_count_dict, len(llm_count_dict), llm_count_dict

(30,
 {'alice': 2463,
  'anne': 8703,
  'artofwar': 7571,
  'candide': 2877,
  'donquixote': 16099,
  'emma': 4215,
  'faust': 6242,
  'frankenstein': 6655,
  'gatsby': 4710,
  'genji': 7864,
  'greatexp': 16105,
  'hamlet': 5199,
  'iliad': 19463,
  'janeeyre': 16560,
  'littlewomen': 16680,
  'marrow': 7820,
  'mendips': 7511,
  'metamorphosis': 1752,
  'odyssey': 9246,
  'pride': 10997,
  'pygmalion': 2630,
  'redchamber': 15778,
  'romeo': 3401,
  'school': 2948,
  'tenant': 14546,
  'trial': 6472,
  'ulysses': 25435,
  'victoria': 7555,
  'war': 4482,
  'wizard': 3403},
 6,
 {'bookstore': 1388,
  'color': 1672,
  'starlight': 1575,
  'time': 1650,
  'threads': 1504,
  'whispers': 1741})

In [54]:
# Minimum, maximum and mean line count for both story groups. Each *_story
# variable is a list, so stories tied for an extreme are all reported.

# --- stories in human_count_dict ---
human_values = list(human_count_dict.values())
min_lines_human = min(human_values)
min_lines_human_story = [name for name, n in human_count_dict.items() if n == min_lines_human]
max_lines_human = max(human_values)
max_lines_human_story = [name for name, n in human_count_dict.items() if n == max_lines_human]
avg_lines_human = sum(human_values) / len(human_values)

# --- stories in llm_count_dict ---
llm_values = list(llm_count_dict.values())
min_lines_llm = min(llm_values)
min_lines_llm_story = [name for name, n in llm_count_dict.items() if n == min_lines_llm]
max_lines_llm = max(llm_values)
max_lines_llm_story = [name for name, n in llm_count_dict.items() if n == max_lines_llm]
avg_lines_llm = sum(llm_values) / len(llm_values)

print("Human counts")
print(f"  Min: {min_lines_human} ({min_lines_human_story})")
print(f"  Max: {max_lines_human} ({max_lines_human_story})")
print(f"  Avg: {avg_lines_human}")

print("LLM counts")
print(f"  Min: {min_lines_llm} ({min_lines_llm_story})")
print(f"  Max: {max_lines_llm} ({max_lines_llm_story})")
print(f"  Avg: {avg_lines_llm}")


Human counts
  Min: 1752 (['metamorphosis'])
  Max: 25435 (['ulysses'])
  Avg: 8846.066666666668
LLM counts
  Min: 1388 (['bookstore'])
  Max: 1741 (['whispers'])
  Avg: 1588.3333333333333


### create histogram

In [55]:
# One row per story (name, line count), labelled with its group, then both
# groups stacked into a single frame for plotting.
count_columns = ["story", "num_lines"]
human_df = pd.DataFrame(human_count_dict.items(), columns=count_columns)
human_df["type"] = "human"

llm_df = pd.DataFrame(llm_count_dict.items(), columns=count_columns)
llm_df["type"] = "llm"

df = pd.concat([human_df, llm_df])

In [56]:
# Histogram of story length (num_lines, at most 20 bins), coloured by story type.
hist_x = alt.X("num_lines:Q", bin=alt.Bin(maxbins=20), title="Count")
hist_y = alt.Y("count()", title="Number of Stories")
hist_color = alt.Color("type:N", scale=alt.Scale(scheme="category10"), title="Story Type")

chart = (
    alt.Chart(df)
    .mark_bar()
    .encode(x=hist_x, y=hist_y, color=hist_color, tooltip=["type", "count()"])
    .properties(
        title="Length Distribution of Human and LLM Stories",
        width=500,
        height=200,
    )
)

# show chart
chart

In [57]:
# Horizontal box plot of story length, one box per story type.
box_y = alt.Y("type:N", title="Story Type")
box_x = alt.X("num_lines:Q", title="Number of Lines")
box_color = alt.Color("type:N", scale=alt.Scale(scheme="category10"), title="Story Type")

boxplot = (
    alt.Chart(df)
    .mark_boxplot()
    .encode(y=box_y, x=box_x, color=box_color, tooltip=["type", "num_lines"])
    .properties(
        title="Length Distribution in Human and LLM Stories",
        width=600,
        height=200,
    )
)

boxplot

## analyze scene divisions

In [92]:
# number of collected scene-division records, followed by the records themselves
len(scene_divisions), scene_divisions

(4633,
 [{'exp': 'Start of chapter.', 'story_type': 'non-play'},
  {'exp': 'Location changes to the rabbit hole.', 'story_type': 'non-play'},
  {'exp': 'Location changes to the hall.', 'story_type': 'non-play'},
  {'exp': 'Location remains at the table.', 'story_type': 'non-play'},
  {'exp': 'Location remains at the table.', 'story_type': 'non-play'},
  {'exp': 'Location remains at the table.', 'story_type': 'non-play'},
  {'exp': 'Start of chapter.', 'story_type': 'non-play'},
  {'exp': 'Location changes to the pool.', 'story_type': 'non-play'},
  {'exp': 'Start of chapter.', 'story_type': 'non-play'},
  {'exp': "The focus shifts to the Mouse's speech.", 'story_type': 'non-play'},
  {'exp': 'The Dodo introduces the idea of the race.',
   'story_type': 'non-play'},
  {'exp': 'The race concludes and prizes are distributed.',
   'story_type': 'non-play'},
  {'exp': "The scene shifts to the Mouse's story.", 'story_type': 'non-play'},
  {'exp': 'The location changes as Alice is left alone.

In [118]:
# Turn the list of {"exp", "story_type"} records into a dataframe (one row per scene).
scene_df = pd.DataFrame(scene_divisions)

print(f"Number of scenes: {scene_df.shape[0]}")

scene_df.head()

Number of scenes: 4633


Unnamed: 0,exp,story_type
0,Start of chapter.,non-play
1,Location changes to the rabbit hole.,non-play
2,Location changes to the hall.,non-play
3,Location remains at the table.,non-play
4,Location remains at the table.,non-play


In [629]:
# Drop every row whose explanation (lower-cased) contains one of the
# filter_words as a substring: chapter start/end markers plus wording such as
# "final", "ends", "conclud", "conclus" and "wrap".
filter_words = ["start of chapter", "end of chapter", "final", "ends", "conclud", "conclus", "wrap"]
is_filtered = scene_df["exp"].str.lower().str.contains("|".join(filter_words))
# .copy() makes the result an independent frame, so the tag columns assigned
# to scene_df afterwards do not write into a slice of the unfiltered frame.
scene_df = scene_df[~is_filtered].copy()

print(f"Number of scenes: {len(scene_df)}")

scene_df.head()

Number of scenes: 3796


Unnamed: 0,exp,story_type,location,char_change,time_change,focus_change,continue
1,Location changes to the rabbit hole.,non-play,True,False,False,False,False
2,Location changes to the hall.,non-play,True,False,False,False,False
3,Location remains at the table.,non-play,True,False,False,False,False
4,Location remains at the table.,non-play,True,False,False,False,False
5,Location remains at the table.,non-play,True,False,False,False,False


In [645]:
# Flag every scene whose lower-cased explanation contains at least one of the
# location keywords (plain substring alternatives joined into one pattern).
location_words = ["location", "still", "setting", "scene shifts", "escape", "back", "transition to the", "moves", "now in", "same", "return to", "continues in", "continues at", "inside", " at ", "place"]
location_pattern = "|".join(location_words)
exp_lower = scene_df["exp"].str.lower()
scene_df["location"] = exp_lower.str.contains(location_pattern)

scene_df.head()

Unnamed: 0,exp,story_type,location,char_change,time_change,focus_change,continue
1,Location changes to the rabbit hole.,non-play,True,False,False,False,False
2,Location changes to the hall.,non-play,True,False,False,False,False
3,Location remains at the table.,non-play,True,False,False,False,False
4,Location remains at the table.,non-play,True,False,False,False,False
5,Location remains at the table.,non-play,True,False,False,False,False


In [646]:
# Flag scenes whose explanation mentions a character entering or leaving.
# Rows already flagged as location are excluded, so the two tags never overlap.
enter_words = ["enter", "leave", "arriv", "appear", "exit", "step", "meet", "is called", "move to", "change in", "change to", "change of", "introduction of", "depart", "new character", "different character"]
enter_pattern = "|".join(enter_words)
mentions_entry = scene_df["exp"].str.lower().str.contains(enter_pattern)
scene_df["char_change"] = mentions_entry & ~scene_df["location"]

scene_df.head()

Unnamed: 0,exp,story_type,location,char_change,time_change,focus_change,continue
1,Location changes to the rabbit hole.,non-play,True,False,False,False,False
2,Location changes to the hall.,non-play,True,False,False,False,False
3,Location remains at the table.,non-play,True,False,False,False,False
4,Location remains at the table.,non-play,True,False,False,False,False
5,Location remains at the table.,non-play,True,False,False,False,False


In [757]:
# Flag scenes whose explanation mentions a change of time. Only rows not
# already flagged as location or char_change are eligible.
# NOTE(review): "continu" is also the only keyword of the later `continue` tag,
# and that tag excludes time_change rows, so it can never be set. Confirm this
# is intended.
time_words = ["time", "night", "morning", "evening", "noon", "midnight", "after", "next", "event", "continu"]
time_pattern = "|".join(time_words)
mentions_time = scene_df["exp"].str.lower().str.contains(time_pattern)
scene_df["time_change"] = mentions_time & ~scene_df["location"] & ~scene_df["char_change"]

scene_df.head()

Unnamed: 0,exp,story_type,location,char_change,time_change,focus_change,continue,category
1,Location changes to the rabbit hole.,non-play,True,False,False,False,False,location
2,Location changes to the hall.,non-play,True,False,False,False,False,location
3,Location remains at the table.,non-play,True,False,False,False,False,location
4,Location remains at the table.,non-play,True,False,False,False,False,location
5,Location remains at the table.,non-play,True,False,False,False,False,location


In [758]:
# tag scenes about focus change
focus_words = ["focus", "shift", "conversation", "discussion", "topic", "transition", "chang", "start", "new scene", "conflict", "tension", "a new", "impact", "affect", "decision", "explor", "begin", "emotion", "climax", "new activity", "new plan"]
# One alternation over all focus keywords, plus a pattern for " is <word>ed"
focus_pattern = re.compile("|".join(focus_words))
is_ed_pattern = re.compile(r"\sis\s\w+ed\b")


def detect_focus_change(row):
    """Return True when a row's explanation signals a focus change.

    The lower-cased ``row["exp"]`` must contain a focus keyword or match
    " is <word>ed", and the row must not already be flagged as ``location``,
    ``char_change`` or ``time_change``. Those flags are only read when the
    text matched.
    """
    text = row["exp"].lower()
    has_keyword = focus_pattern.search(text) is not None
    has_is_ed = is_ed_pattern.search(text) is not None
    if not (has_keyword or has_is_ed):
        return False

    already_tagged = row["location"] or row["char_change"] or row["time_change"]
    return not already_tagged

# evaluate detect_focus_change once per row (axis=1) and store the result
scene_df["focus_change"] = scene_df.apply(detect_focus_change, axis=1)

scene_df.head()

Unnamed: 0,exp,story_type,location,char_change,time_change,focus_change,continue,category
1,Location changes to the rabbit hole.,non-play,True,False,False,False,False,location
2,Location changes to the hall.,non-play,True,False,False,False,False,location
3,Location remains at the table.,non-play,True,False,False,False,False,location
4,Location remains at the table.,non-play,True,False,False,False,False,location
5,Location remains at the table.,non-play,True,False,False,False,False,location


In [759]:
# Flag scenes whose explanation mentions a continuation and that carry none of
# the earlier tags.
# NOTE(review): "continu" is also listed in time_words, so any such row that is
# neither location nor char_change is already time_change. This column is
# therefore always False. Confirm this is intended.
continue_words = ["continu"]
mentions_continue = scene_df["exp"].str.lower().str.contains("|".join(continue_words))
earlier_tag = (
    scene_df["location"]
    | scene_df["char_change"]
    | scene_df["time_change"]
    | scene_df["focus_change"]
)
scene_df["continue"] = mentions_continue & ~earlier_tag

scene_df.head()

Unnamed: 0,exp,story_type,location,char_change,time_change,focus_change,continue,category
1,Location changes to the rabbit hole.,non-play,True,False,False,False,False,location
2,Location changes to the hall.,non-play,True,False,False,False,False,location
3,Location remains at the table.,non-play,True,False,False,False,False,location
4,Location remains at the table.,non-play,True,False,False,False,False,location
5,Location remains at the table.,non-play,True,False,False,False,False,location


In [760]:
# number of scenes per story_type, split by the location flag
scene_df.groupby(["story_type", "location"]).size()


story_type  location
llm         False         29
            True         177
non-play    False        985
            True        2496
play        False         23
            True          86
dtype: int64

In [761]:
# number of scenes per story_type, split by the char_change flag
scene_df.groupby(["story_type", "char_change"]).size()

story_type  char_change
llm         False           195
            True             11
non-play    False          3106
            True            375
play        False            92
            True             17
dtype: int64

In [762]:
# number of scenes per story_type, split by the time_change flag
scene_df.groupby(["story_type", "time_change"]).size()

story_type  time_change
llm         False           200
            True              6
non-play    False          3402
            True             79
play        False           109
dtype: int64

In [763]:
# number of scenes per story_type, split by the focus_change flag
scene_df.groupby(["story_type", "focus_change"]).size()

story_type  focus_change
llm         False            197
            True               9
non-play    False           3093
            True             388
play        False            106
            True               3
dtype: int64

In [764]:
# number of scenes per story_type, split by the continue flag
scene_df.groupby(["story_type", "continue"]).size()

story_type  continue
llm         False        206
non-play    False       3481
play        False        109
dtype: int64

In [765]:
# LLM-story scenes for which none of the five tag columns is set
tag_columns = ["char_change", "location", "time_change", "focus_change", "continue"]
has_no_tag = ~scene_df[tag_columns].any(axis=1)
llm_char_change = scene_df[(scene_df["story_type"] == "llm") & has_no_tag]
llm_char_change


Unnamed: 0,exp,story_type,location,char_change,time_change,focus_change,continue,category
599,Lila falls asleep and dreams.,llm,False,False,False,False,False,char_action
612,The confrontation escalates into a battle.,llm,False,False,False,False,False,char_action
3955,Elara prepares to make a choice.,llm,False,False,False,False,False,char_action


In [766]:
# play scenes for which none of the five tag columns is set
any_tag = (
    scene_df["char_change"]
    | scene_df["location"]
    | scene_df["time_change"]
    | scene_df["focus_change"]
    | scene_df["continue"]
)
is_play = scene_df["story_type"] == "play"
play_char_change = scene_df[is_play & ~any_tag]
play_char_change

Unnamed: 0,exp,story_type,location,char_change,time_change,focus_change,continue,category
3262,Higgins decides to take on the flower girl.,play,False,False,False,False,False,char_action
3264,Doolittle discusses his intentions with Higgins.,play,False,False,False,False,False,char_action
3277,Doolittle discusses his predicament.,play,False,False,False,False,False,char_action


In [767]:
# non-play scenes for which none of the five tag columns is set
# (boolean columns are negated with ~ rather than compared with == False,
# matching the llm / play selections)
non_play_no_location = scene_df[
    (scene_df["story_type"] == "non-play")
    & ~scene_df["location"]
    & ~scene_df["char_change"]
    & ~scene_df["time_change"]
    & ~scene_df["focus_change"]
    & ~scene_df["continue"]
]
non_play_no_location


Unnamed: 0,exp,story_type,location,char_change,time_change,focus_change,continue,category
10,The Dodo introduces the idea of the race.,non-play,False,False,False,False,False,char_action
16,Alice drinks from the bottle in the Rabbit's h...,non-play,False,False,False,False,False,char_action
47,Alice's perspective in the courtroom.,non-play,False,False,False,False,False,char_action
185,Anne climbs the roof.,non-play,False,False,False,False,False,char_action
190,Anne's recovery period.,non-play,False,False,False,False,False,char_action
...,...,...,...,...,...,...,...,...
4398,Pierre decides to join the risky behavior.,non-play,False,False,False,False,False,char_action
4455,Princess Mary interacts with Mademoiselle Bour...,non-play,False,False,False,False,False,char_action
4533,Dorothy expresses her desire to return home.,non-play,False,False,False,False,False,char_action
4534,The Witch provides guidance.,non-play,False,False,False,False,False,char_action


In [768]:
# explanations of the selected non-play scenes as a plain Python list
list_val = non_play_no_location["exp"].tolist()

list_val

['The Dodo introduces the idea of the race.',
 "Alice drinks from the bottle in the Rabbit's house.",
 "Alice's perspective in the courtroom.",
 'Anne climbs the roof.',
 "Anne's recovery period.",
 'Anne reacts to the news about the class.',
 'Cacambo approaches Candide during supper.',
 'Sancho offers his perspective.',
 'Sancho expresses concern.',
 'Sancho seeks clarity.',
 'Sancho expresses concern for his master.',
 "Sancho observes Don Quixote's antics.",
 'Sancho embarks on his journey.',
 'They stop to listen to the singer.',
 'They encounter Cardenio in the mountains.',
 'The woman reveals herself.',
 'Dorothea describes the proposal.',
 'Dorothea recounts the betrayal.',
 'Planning the rescue and roles.',
 "Don Quixote reacts to Dorothea's story.",
 'Sancho reacts to the story.',
 'Sancho expresses his thoughts.',
 "Sancho's comments provoke Don Quixote.",
 'The group discusses the story.',
 'Anselmo decides to marry Camilla.',
 'Anselmo confronts Lothario about his visits.'

In [769]:
# Probe the listed explanations for a pattern. Active probe: the keyword
# "climax" (case-insensitive). Alternative probe, kept for reference: " is "
# followed by a word that ends in "ed":
# subset_val = [val for val in list_val if re.search(r"\sis\s\w+ed\b", val)]
subset_val = []
for val in list_val:
    if "climax" in val.lower():
        subset_val.append(val)

subset_val

[]

In [770]:
# Collapse the boolean tag columns into one "category" label. Rows with no tag
# keep the default "char_action"; tags are applied in this fixed order, so a
# later tag would overwrite an earlier one on the same row.
scene_df["category"] = "char_action"
for tag in ("location", "char_change", "time_change", "focus_change", "continue"):
    scene_df.loc[scene_df[tag], "category"] = tag

scene_df.head()

Unnamed: 0,exp,story_type,location,char_change,time_change,focus_change,continue,category
1,Location changes to the rabbit hole.,non-play,True,False,False,False,False,location
2,Location changes to the hall.,non-play,True,False,False,False,False,location
3,Location remains at the table.,non-play,True,False,False,False,False,location
4,Location remains at the table.,non-play,True,False,False,False,False,location
5,Location remains at the table.,non-play,True,False,False,False,False,location


### plot results

In [771]:
# number of scenes per story type and category
scene_df.groupby(["story_type", "category"]).size()

story_type  category    
llm         char_action        3
            char_change       11
            focus_change       9
            location         177
            time_change        6
non-play    char_action      143
            char_change      375
            focus_change     388
            location        2496
            time_change       79
play        char_action        3
            char_change       17
            focus_change       3
            location          86
dtype: int64

In [772]:
# Scene count per (story_type, category), plus each count's share of its
# story type's total.
counts = scene_df.groupby(["story_type", "category"]).size().reset_index(name="count")
story_totals = counts.groupby("story_type")["count"].transform("sum")
counts["proportion"] = counts["count"] / story_totals

counts

Unnamed: 0,story_type,category,count,proportion
0,llm,char_action,3,0.014563
1,llm,char_change,11,0.053398
2,llm,focus_change,9,0.043689
3,llm,location,177,0.859223
4,llm,time_change,6,0.029126
5,non-play,char_action,143,0.04108
6,non-play,char_change,375,0.107728
7,non-play,focus_change,388,0.111462
8,non-play,location,2496,0.717035
9,non-play,time_change,79,0.022695


In [791]:
# share of all scenes (every story type together) that falls in each category
category_sizes = scene_df.groupby("category").size()
overall_avg = category_sizes / len(scene_df)
overall_avg

category
char_action     0.039252
char_change     0.106164
focus_change    0.105374
location        0.726818
time_change     0.022392
dtype: float64

In [805]:
# see examples of a category: 10 randomly sampled explanations tagged time_change
scene_df[scene_df["category"] == "time_change"].sample(10)["exp"].values

array(['Continuation of the conversation with a new argument.',
       'Shift to reflections after the events.',
       'Dorothea continues her tale.',
       'The scene continues on the battlefield.',
       'Shift to a specific event involving St. John.',
       'Games begin after lunch.',
       'K. returns to the office the next day.',
       'Transition to morning after the night.',
       'Continuation of their interaction in the garden.',
       'The journey continues towards the cliffs.'], dtype=object)

In [790]:
# Stacked, normalised bar chart: for each story type, the proportion of scenes
# in each scene-division category.

# display labels for story types and categories
label_map = {"llm": "LLM", "non-play": "Non-play", "play": "Play"}
cat_label_map = {
    "char_action": "Character Action",
    "char_change": "Character Change",
    "focus_change": "Focus Shift",
    "location": "Location Change",
    "time_change": "Time Change",
}
counts["story_type_label"] = counts["story_type"].map(label_map)
counts["category_label"] = counts["category"].map(cat_label_map)

# colour scale: one fixed colour per category label
category_domain = ["Character Action", "Character Change", "Focus Shift", "Location Change", "Time Change"]
category_range = ["#6598DB", "#E56A8F", "#64B24F", "#9C85C0", "#FF9858"]

proportion_x = alt.X(
    "proportion:Q",
    stack="normalize",
    title="Proportion of Scenes",
    axis=alt.Axis(grid=False, values=[0, 0.25, 0.5, 0.75, 1.0]),
)
story_type_y = alt.Y("story_type_label:N", title=None, sort=["LLM", "Non-play", "Play"])
category_color = alt.Color(
    "category_label:N",
    scale=alt.Scale(domain=category_domain, range=category_range),
    title="Scene Division Type",
)

chart = (
    alt.Chart(counts)
    .mark_bar(opacity=0.5)
    .encode(
        x=proportion_x,
        y=story_type_y,
        color=category_color,
        tooltip=["category:N", alt.Tooltip("proportion:Q", format=".2%")],
    )
    .properties(width=250, height=100)
    .configure_view(stroke=None)
)

chart

## plot study guide analysis results

In [842]:
# import the study-guide analysis table from csv (this rebinds df, which held
# the story line counts earlier)
df = pd.read_csv("data/study_guide_analysis.csv")

df.head()

Unnamed: 0,Detected by,Characters,Themes,Scenes
0,Both,0.377127,0.106286,0.82211
1,SparkNotes Only,0.015873,0.009009,0.128507
2,Story Ribbons Only,0.607,0.884705,0.049383


In [862]:
# Reshape to long format: one row per ("Detected by" value, category) with its
# proportion, and rename "Detected by" to "source".
df_melt = (
    df.melt(id_vars=["Detected by"], var_name="category", value_name="proportion")
    .rename(columns={"Detected by": "source"})
)

df_melt.head()

Unnamed: 0,source,category,proportion
0,Both,Characters,0.377127
1,SparkNotes Only,Characters,0.015873
2,Story Ribbons Only,Characters,0.607
3,Both,Themes,0.106286
4,SparkNotes Only,Themes,0.009009


In [863]:
# column names of the long-format frame
df_melt.columns

Index(['source', 'category', 'proportion'], dtype='object')

In [864]:
# relabel sources: "Story Ribbons Only" -> "Our Pipeline", "SparkNotes Only" -> "SparkNotes"
source_labels = {
    "Story Ribbons Only": "Our Pipeline",
    "SparkNotes Only": "SparkNotes",
}
df_melt["source"] = df_melt["source"].replace(source_labels)

df_melt.head()

Unnamed: 0,source,category,proportion
0,Both,Characters,0.377127
1,SparkNotes,Characters,0.015873
2,Our Pipeline,Characters,0.607
3,Both,Themes,0.106286
4,SparkNotes,Themes,0.009009


In [879]:
# relabel the "Scenes" category as "Events"
df_melt["category"] = df_melt["category"].replace({"Scenes": "Events"})

df_melt.head()

Unnamed: 0,source,category,proportion
0,Both,Characters,0.377127
1,SparkNotes,Characters,0.015873
2,Our Pipeline,Characters,0.607
6,Both,Events,0.82211
7,SparkNotes,Events,0.128507


In [880]:
# Stacked, normalised bar chart of the study-guide comparison: per category,
# the proportion of story elements detected by each source.

# display order of sources and categories, and one colour per source
source_order = ["Both", "SparkNotes", "Our Pipeline"]
category_order = ["Characters", "Themes", "Events"]
source_colors = ["#6598DB", "#FF9858", "#E56A8F"]

# copy of the data with a numeric rank per source, used to order the stack
source_rank = {name: position for position, name in enumerate(source_order)}
df_ordered = df_melt.copy()
df_ordered["source_order_value"] = df_ordered["source"].map(source_rank)

proportion_x = alt.X(
    "proportion:Q",
    stack="normalize",
    title="Proportion of Story Elements Found",
    axis=alt.Axis(grid=False, values=[0, 0.25, 0.5, 0.75, 1.0]),
)
category_y = alt.Y("category:N", title=None, sort=category_order)
source_color = alt.Color(
    "source:N",
    scale=alt.Scale(domain=source_order, range=source_colors),
    title="Detected by",
)

chart = (
    alt.Chart(df_ordered)
    .mark_bar(opacity=0.5)
    .encode(
        x=proportion_x,
        y=category_y,
        color=source_color,
        order=alt.Order("source_order_value:Q"),  # numeric field drives stack order
        tooltip=["source:N", "proportion:Q"],
    )
    .properties(width=250, height=100)
    .configure_view(stroke=None)
)

chart

In [885]:
# filter out Our Pipeline from df_melt, only keep Both and SparkNotes
# .copy() makes this an independent frame, so the in-place renormalisation of
# "proportion" that follows does not write into a slice of df_melt.
df_both_sparknotes = df_melt[df_melt["source"] != "Our Pipeline"].copy()

df_both_sparknotes


Unnamed: 0,source,category,proportion
0,Both,Characters,0.377127
1,SparkNotes,Characters,0.015873
6,Both,Events,0.82211
7,SparkNotes,Events,0.128507
3,Both,Themes,0.106286
4,SparkNotes,Themes,0.009009


In [886]:
# recompute proportion so that it sums to 1 for each category
# assign() builds a new frame instead of writing through .loc into what may be
# a filtered slice of df_melt, which avoids pandas' SettingWithCopyWarning.
category_totals = df_both_sparknotes.groupby("category")["proportion"].transform("sum")
df_both_sparknotes = df_both_sparknotes.assign(
    proportion=df_both_sparknotes["proportion"] / category_totals
)

df_both_sparknotes

Unnamed: 0,source,category,proportion
0,Both,Characters,0.959611
1,SparkNotes,Characters,0.040389
6,Both,Events,0.864817
7,SparkNotes,Events,0.135183
3,Both,Themes,0.921861
4,SparkNotes,Themes,0.078139


In [889]:
# replot chart with only the "Both" and "SparkNotes" sources

# display order of sources and categories, and one colour per source
source_order = ["Both", "SparkNotes"]
category_order = ["Characters", "Themes", "Events"]
source_colors = ["#6598DB", "#E56A8F"]

# copy of the data with a numeric rank per source, used to order the stack
source_rank = {name: position for position, name in enumerate(source_order)}
df_ordered = df_both_sparknotes.copy()
df_ordered["source_order_value"] = df_ordered["source"].map(source_rank)

proportion_x = alt.X(
    "proportion:Q",
    stack="normalize",
    title="Proportion of Story Elements Found",
    axis=alt.Axis(grid=False, values=[0, 0.25, 0.5, 0.75, 1.0]),
)
category_y = alt.Y("category:N", title=None, sort=category_order)
source_color = alt.Color(
    "source:N",
    scale=alt.Scale(domain=source_order, range=source_colors),
    title="Detected by",
)

chart = (
    alt.Chart(df_ordered)
    .mark_bar(opacity=0.5)
    .encode(
        x=proportion_x,
        y=category_y,
        color=source_color,
        order=alt.Order("source_order_value:Q"),  # numeric field drives stack order
        tooltip=["source:N", "proportion:Q"],
    )
    .properties(width=250, height=100)
    .configure_view(stroke=None)
)

chart