In [61]:
import pandas as pd
from pathlib import Path
from typing import Union, Iterable, List, Dict
import re


from sentence_transformers import SentenceTransformer

import umap
import matplotlib.pyplot as plt
import numpy as np

import plotly.express as px

from pydantic import BaseModel, Field, ValidationError
from typing import Optional

import json 
import os

import asyncio

from langchain_openai import ChatOpenAI

Using https://github.com/booknlp/booknlp/tree/main

In [3]:
def parse_quotes_file(path: Union[str, Path]) -> pd.DataFrame:
    """
    Parse a `.quotes` file with columns:
      quote_start, quote_end, mention_start, mention_end, mention_phrase, char_id, quote

    - Separator is TAB.
    - The last field (`quote`) may contain arbitrary characters (including quotes, em-dashes, etc.).
    - Lines starting with '#' are treated as comments and skipped.
    - Blank lines are skipped.
    - Ensures integer dtypes for numeric columns.

    Returns:
        pandas.DataFrame with columns in the same order as the header.
    """
    path = Path(path)
    rows: List[Dict[str, object]] = []

    with path.open("r", encoding="utf-8-sig") as f:
        # Read header
        for line in f:
            line = line.rstrip("\n\r")
            if not line or line.lstrip().startswith("#"):
                continue
            header = [h.strip() for h in line.split("\t")]
            break
        else:
            raise ValueError("File appears to be empty or only comments/blank lines.")

        expected_cols = header
        ncols = len(expected_cols)
        if ncols < 2:
            raise ValueError(f"Unexpected header: {header}")

        # Read data lines
        for line in f:
            line = line.rstrip("\n\r")
            if not line or line.lstrip().startswith("#"):
                continue

            parts = line.split("\t")

            # If there are more than ncols parts, merge the extras into the last column (quote)
            if len(parts) >= ncols:
                fixed = parts[: ncols - 1] + ["\t".join(parts[ncols - 1 :])]
            else:
                # Not enough columns; skip or raise—here we raise for data quality
                raise ValueError(
                    f"Line has fewer columns than expected ({len(parts)} < {ncols}): {line}"
                )

            row = dict(zip(expected_cols, fixed))

            # Strip whitespace from non-quote fields
            for k in expected_cols[:-1]:
                if isinstance(row[k], str):
                    row[k] = row[k].strip()

            # Coerce integer columns if they exist in the header
            for int_col in ("quote_start", "quote_end", "mention_start", "mention_end", "char_id"):
                if int_col in row:
                    try:
                        row[int_col] = int(row[int_col])
                    except (TypeError, ValueError):
                        raise ValueError(f"Could not convert column '{int_col}' to int: {row[int_col]!r}")

            rows.append(row)

    df = pd.DataFrame(rows, columns=expected_cols)

    # Make dtypes explicit
    for int_col in ("quote_start", "quote_end", "mention_start", "mention_end", "char_id"):
        if int_col in df.columns:
            df[int_col] = df[int_col].astype("int64")

    return df

In [4]:
df = parse_quotes_file("quotes/158_emma.quotes")

df.head()

Unnamed: 0,quote_start,quote_end,mention_start,mention_end,mention_phrase,char_id,quote
0,1272,1295,1266,1266,he,425,“ Poor Miss Taylor!--I wish she were here agai...
1,1296,1363,1266,1266,he,425,"“ I can not agree with you , papa ; you know I..."
2,1364,1397,1266,1266,he,425,“ A house of her own!--But where is the advant...
3,1398,1436,1266,1266,he,425,"“ How often we shall be going to see them , an..."
4,1437,1463,1266,1266,he,425,"“ My dear , how am I to get so far ? Randalls ..."


In [5]:
# only keep char_id and quote
df = df[["mention_phrase", "quote"]].drop_duplicates().reset_index(drop=True)

df.head()

Unnamed: 0,mention_phrase,quote
0,he,“ Poor Miss Taylor!--I wish she were here agai...
1,he,"“ I can not agree with you , papa ; you know I..."
2,he,“ A house of her own!--But where is the advant...
3,he,"“ How often we shall be going to see them , an..."
4,he,"“ My dear , how am I to get so far ? Randalls ..."


In [6]:
# print all unique characters
print(df["mention_phrase"].unique())

['he' 'her' 'Mr. Woodhouse' 'Emma' 'Mr. Knightley' 'her father' 'his'
 'Her father' 'Mrs. Weston' 'the ladies' 'Mrs. Martin' 'She' 'Harriet'
 'Miss Woodhouse' 'Her' 'the gallant Mr. Elton--' 'she' 'Mr. Elton' 'your'
 'Isabella' 'His' 'He' 'Mr. Knightley.--“Robert Martin' 'Miss Smith'
 'Shakespeare' 'you' 'Kitty' 'the present proprietor' 'her companion'
 'the gentleman' 'Mr. John Knightley' 'Mr. Woodhouse--“yes'
 'John Knightley' 'his wife' 'the good - hearted Mrs. John Knightley'
 'Mrs. John Knightley.--“It' 'herself' 'Mr. Weston' 'the others' 'James'
 'they' 'him' 'the happily deceived aunt' 'Miss Bates' 'Jane' 'every body'
 'Mr. Woodhouse--“indeed'
 'the principal woollen - draper , linen - draper' 'neither “ master'
 'the young man' 'his father' 'Frank Churchill' 'himself' 'Mrs. Cole'
 'Mrs. Ford.--“Yes' 'the obliging Mrs. Ford' 'Voices' 'the latter'
 'Somebody else' 'Somebody' 'her husband' 'the gentlemen' 'Frank'
 'her partner' 'Short' 'the “ beautiful little friend' 'Mrs. Elton'


In [7]:
# remove -- and anything after it in the menioned_phrase column
df["mention_phrase"] = df["mention_phrase"].str.split("--").str[0].str.strip()

df.head()

Unnamed: 0,mention_phrase,quote
0,he,“ Poor Miss Taylor!--I wish she were here agai...
1,he,"“ I can not agree with you , papa ; you know I..."
2,he,“ A house of her own!--But where is the advant...
3,he,"“ How often we shall be going to see them , an..."
4,he,"“ My dear , how am I to get so far ? Randalls ..."


In [8]:
# print all unique characters
print(df["mention_phrase"].unique())

['he' 'her' 'Mr. Woodhouse' 'Emma' 'Mr. Knightley' 'her father' 'his'
 'Her father' 'Mrs. Weston' 'the ladies' 'Mrs. Martin' 'She' 'Harriet'
 'Miss Woodhouse' 'Her' 'the gallant Mr. Elton' 'she' 'Mr. Elton' 'your'
 'Isabella' 'His' 'He' 'Mr. Knightley.' 'Miss Smith' 'Shakespeare' 'you'
 'Kitty' 'the present proprietor' 'her companion' 'the gentleman'
 'Mr. John Knightley' 'John Knightley' 'his wife'
 'the good - hearted Mrs. John Knightley' 'Mrs. John Knightley.' 'herself'
 'Mr. Weston' 'the others' 'James' 'they' 'him'
 'the happily deceived aunt' 'Miss Bates' 'Jane' 'every body'
 'the principal woollen - draper , linen - draper' 'neither “ master'
 'the young man' 'his father' 'Frank Churchill' 'himself' 'Mrs. Cole'
 'Mrs. Ford.' 'the obliging Mrs. Ford' 'Voices' 'the latter'
 'Somebody else' 'Somebody' 'her husband' 'the gentlemen' 'Frank'
 'her partner' 'Short' 'the “ beautiful little friend' 'Mrs. Elton'
 'Elton' 'sweet Jane Fairfax' 'the Vicarage' 'his brother' 'They' 'them'
 'Mr

In [9]:
# strip punctuation at the end of mention_phrase
def strip_punct(name: str) -> str:
    return re.sub(r"[.,;:!?]+$", "", name).strip()

df['mention_phrase'] = df['mention_phrase'].apply(strip_punct)
df.head()

Unnamed: 0,mention_phrase,quote
0,he,“ Poor Miss Taylor!--I wish she were here agai...
1,he,"“ I can not agree with you , papa ; you know I..."
2,he,“ A house of her own!--But where is the advant...
3,he,"“ How often we shall be going to see them , an..."
4,he,"“ My dear , how am I to get so far ? Randalls ..."


In [10]:
# print all unique characters
print(df["mention_phrase"].unique())

['he' 'her' 'Mr. Woodhouse' 'Emma' 'Mr. Knightley' 'her father' 'his'
 'Her father' 'Mrs. Weston' 'the ladies' 'Mrs. Martin' 'She' 'Harriet'
 'Miss Woodhouse' 'Her' 'the gallant Mr. Elton' 'she' 'Mr. Elton' 'your'
 'Isabella' 'His' 'He' 'Miss Smith' 'Shakespeare' 'you' 'Kitty'
 'the present proprietor' 'her companion' 'the gentleman'
 'Mr. John Knightley' 'John Knightley' 'his wife'
 'the good - hearted Mrs. John Knightley' 'Mrs. John Knightley' 'herself'
 'Mr. Weston' 'the others' 'James' 'they' 'him'
 'the happily deceived aunt' 'Miss Bates' 'Jane' 'every body'
 'the principal woollen - draper , linen - draper' 'neither “ master'
 'the young man' 'his father' 'Frank Churchill' 'himself' 'Mrs. Cole'
 'Mrs. Ford' 'the obliging Mrs. Ford' 'Voices' 'the latter'
 'Somebody else' 'Somebody' 'her husband' 'the gentlemen' 'Frank'
 'her partner' 'Short' 'the “ beautiful little friend' 'Mrs. Elton'
 'Elton' 'sweet Jane Fairfax' 'the Vicarage' 'his brother' 'They' 'them'
 'Mr' 'gentle Mrs. West

In [11]:
# words to ignore when canonicalizing
stopwords = {"the", "a", "an", "her", "his", "your", "my", "our", "their", 
             "she", "he", "him", "herself", "himself", "they", "them", 
             "you", "its"}

# create a function to search for when one name is a substring of another
def map_to_longer(name: str, names: Iterable[str]) -> str:
    name = name.lower()
    # ignore if name is a stopword
    if name in stopwords:
        return name
    candidates = [n for n in names if name in n.lower()]
    if candidates:
        # return the longest candidate
        return max(candidates, key=len)
    return name

In [12]:
df['canonical_name'] = df['mention_phrase'].apply(map_to_longer, names=df['mention_phrase'].unique())

df.head()

Unnamed: 0,mention_phrase,quote,canonical_name
0,he,“ Poor Miss Taylor!--I wish she were here agai...,he
1,he,"“ I can not agree with you , papa ; you know I...",he
2,he,“ A house of her own!--But where is the advant...,he
3,he,"“ How often we shall be going to see them , an...",he
4,he,"“ My dear , how am I to get so far ? Randalls ...",he


In [13]:
print(df["canonical_name"].unique())

['he' 'her' 'Mr. Woodhouse' 'Emma' 'Mr. Knightley' 'her father' 'his'
 'gentle Mrs. Weston' 'the ladies' 'Mrs. Martin' 'she' 'Harriet'
 'Miss Woodhouse' 'the gallant Mr. Elton' 'your' 'Isabella' 'Miss Smith'
 'Shakespeare' 'you' 'Kitty' 'the present proprietor' 'her companion'
 'the gentleman' 'Mr. John Knightley'
 'the good - hearted Mrs. John Knightley' 'his wife' 'herself'
 'Mr. Weston' 'the others' 'James' 'they' 'him'
 'the happily deceived aunt' 'Miss Bates' 'sweet Jane Fairfax'
 'every body' 'the principal woollen - draper , linen - draper'
 'neither “ master' 'the young man' 'his father' 'Frank Churchill'
 'himself' 'Mrs. Cole' 'the obliging Mrs. Ford' 'Voices' 'the latter'
 'Somebody else' 'her husband' 'the gentlemen' 'her partner' 'Short'
 'the “ beautiful little friend' 'Mrs. Elton' 'the Vicarage' 'his brother'
 'them' 'its' 'his son' 'Miss Fairfax' 'no one' 'Richmond' 'MY DEAR MADAM'
 'Patty' 'Mrs. Bates']


In [14]:
# count of quotes per character
df['quote_count'] = df.groupby('canonical_name')['quote'].transform('count')

# print quote counts for each character
print(df[['canonical_name', 'quote_count']].drop_duplicates().sort_values(by='quote_count', ascending=False))

          canonical_name  quote_count
0                     he          466
18                  Emma          379
52                   she          349
22         Mr. Knightley          104
46    gentle Mrs. Weston          102
...                  ...          ...
332                Kitty            1
292           Miss Smith            1
49           Mrs. Martin            1
48            the ladies            1
1919          Mrs. Bates            1

[64 rows x 2 columns]


In [30]:
# add a new column where all canonical names containing stop word are replaced with "Other"
# check if any stopword is in the canonical name
df['name'] = df['canonical_name'].apply(
    lambda x: "Other" 
    if any(word in stopwords for word in re.findall(r"\b\w+\b", x.lower())) 
    else x
)
df.head()

Unnamed: 0,idx,mention_phrase,quote,canonical_name,quote_count,name,quote_embedding,umap_x,umap_y,umap_3d_x,umap_3d_y,umap_3d_z
0,0,he,“ Poor Miss Taylor!--I wish she were here agai...,he,466,Other,"[0.018220363184809685, -0.06551337242126465, 0...",7.369244,6.429071,-4.810905,4.724307,11.485666
1,1,he,"“ I can not agree with you , papa ; you know I...",he,466,Other,"[0.000761064700782299, -0.06581622362136841, -...",7.617204,6.554214,-4.461437,4.583676,11.195873
2,2,he,“ A house of her own!--But where is the advant...,he,466,Other,"[0.0663914754986763, 0.06374149024486542, 0.00...",8.976602,7.480959,-3.102542,4.002779,10.603953
3,3,he,"“ How often we shall be going to see them , an...",he,466,Other,"[-0.03418850898742676, 0.06972943991422653, 0....",8.277662,8.401175,-3.805998,2.872328,10.381424
4,4,he,"“ My dear , how am I to get so far ? Randalls ...",he,466,Other,"[-0.032696083188056946, 0.0649917721748352, -0...",11.080583,6.642487,-2.243239,5.494631,10.195373


In [16]:
# generate embedding for each quote using sentence-transformers
model = SentenceTransformer('all-MiniLM-L6-v2')
df['quote_embedding'] = df['quote'].apply(lambda x: model.encode(x).tolist())

In [17]:
# run UMAP
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42)
embeddings = np.array(df['quote_embedding'].tolist())
embedding_2d = reducer.fit_transform(embeddings)

# add coordinates back to dataframe
df['umap_x'] = embedding_2d[:, 0]
df['umap_y'] = embedding_2d[:, 1]

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [31]:
# interactive plot with plotly
fig = px.scatter(
    df,
    x="umap_x",
    y="umap_y",
    color="name",   # color points by canonical name
    hover_data=["name", "quote"],  # add info on hover
    opacity=0.7,
    width=900,
    height=600
)

fig.update_layout(
    title="UMAP projection of Quote Embeddings",
    xaxis_title="UMAP 1",
    yaxis_title="UMAP 2"
)

fig.show()

In [19]:
df = df.reset_index().rename(columns={"index": "idx"}) # add index column for reference
df.head()

Unnamed: 0,idx,mention_phrase,quote,canonical_name,quote_count,name,quote_embedding,umap_x,umap_y
0,0,he,“ Poor Miss Taylor!--I wish she were here agai...,he,466,Other,"[0.018220363184809685, -0.06551337242126465, 0...",7.369244,6.429071
1,1,he,"“ I can not agree with you , papa ; you know I...",he,466,Other,"[0.000761064700782299, -0.06581622362136841, -...",7.617204,6.554214
2,2,he,“ A house of her own!--But where is the advant...,he,466,Other,"[0.0663914754986763, 0.06374149024486542, 0.00...",8.976602,7.480959
3,3,he,"“ How often we shall be going to see them , an...",he,466,Other,"[-0.03418850898742676, 0.06972943991422653, 0....",8.277662,8.401175
4,4,he,"“ My dear , how am I to get so far ? Randalls ...",he,466,Other,"[-0.032696083188056946, 0.0649917721748352, -0...",11.080583,6.642487


In [20]:
# remove duplicate idx column
df = df.loc[:,~df.columns.duplicated()]
df.head()

Unnamed: 0,idx,mention_phrase,quote,canonical_name,quote_count,name,quote_embedding,umap_x,umap_y
0,0,he,“ Poor Miss Taylor!--I wish she were here agai...,he,466,Other,"[0.018220363184809685, -0.06551337242126465, 0...",7.369244,6.429071
1,1,he,"“ I can not agree with you , papa ; you know I...",he,466,Other,"[0.000761064700782299, -0.06581622362136841, -...",7.617204,6.554214
2,2,he,“ A house of her own!--But where is the advant...,he,466,Other,"[0.0663914754986763, 0.06374149024486542, 0.00...",8.976602,7.480959
3,3,he,"“ How often we shall be going to see them , an...",he,466,Other,"[-0.03418850898742676, 0.06972943991422653, 0....",8.277662,8.401175
4,4,he,"“ My dear , how am I to get so far ? Randalls ...",he,466,Other,"[-0.032696083188056946, 0.0649917721748352, -0...",11.080583,6.642487


In [32]:
fig = px.scatter(
    df,
    x="umap_x",
    y="umap_y",
    color="idx",              # numeric column → continuous scale
    color_continuous_scale="Viridis",  # or "Plasma", "Turbo", etc.
    hover_data=["idx", "name", "quote"],
    opacity=0.7,
    width=900,
    height=600
)

fig.update_layout(
    title="UMAP projection of Quote Embeddings (colored by index)",
    xaxis_title="UMAP 1",
    yaxis_title="UMAP 2"
)

fig.show()

In [33]:
# filter out "other" names
df_filtered = df[df['name'] != "Other"]

df_filtered.head()

Unnamed: 0,idx,mention_phrase,quote,canonical_name,quote_count,name,quote_embedding,umap_x,umap_y,umap_3d_x,umap_3d_y,umap_3d_z
10,10,Mr. Woodhouse,"“ It is very kind of you , Mr. Knightley , to ...",Mr. Woodhouse,52,Mr. Woodhouse,"[-0.04399728402495384, 0.032596878707408905, 0...",6.369669,6.603613,-5.638496,4.97316,10.017447
11,11,Mr. Woodhouse,"“ Not at all , sir . It is a beautiful moonlig...",Mr. Woodhouse,52,Mr. Woodhouse,"[-0.0015808487078174949, 0.08963127434253693, ...",10.691205,7.854729,-1.885758,3.818945,9.357606
12,12,Mr. Woodhouse,“ But you must have found it very damp and dir...,Mr. Woodhouse,52,Mr. Woodhouse,"[-0.042439017444849014, 0.1340944766998291, 0....",9.883797,6.83584,-2.622585,4.798442,9.253331
13,13,Mr. Woodhouse,"“ Dirty , sir ! Look at my shoes . Not a speck...",Mr. Woodhouse,52,Mr. Woodhouse,"[-0.09741733223199844, 0.09301677346229553, 0....",9.989972,10.644764,-2.461584,2.132329,9.249256
14,14,Mr. Woodhouse,"“ Well ! that is quite surprising , for we hav...",Mr. Woodhouse,52,Mr. Woodhouse,"[-0.0009594066650606692, 0.15259331464767456, ...",9.634563,6.951302,-2.80154,4.644891,9.379891


In [34]:
# plot again
fig = px.scatter(
    df_filtered,
    x="umap_x",
    y="umap_y",
    color="idx",              # numeric column → continuous scale
    color_continuous_scale="Viridis",  # or "Plasma", "Turbo", etc.
    hover_data=["idx", "name", "quote"],
    opacity=0.7,
    width=900,
    height=600
)

fig.update_layout(
    title="UMAP projection of Quote Embeddings (colored by index)",
    xaxis_title="UMAP 1",
    yaxis_title="UMAP 2"
)

fig.show()

In [35]:
# try 3D UMAP
reducer_3d = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=3, metric='cosine', random_state=42)
embedding_3d = reducer_3d.fit_transform(embeddings)
df['umap_3d_x'] = embedding_3d[:, 0]
df['umap_3d_y'] = embedding_3d[:, 1]
df['umap_3d_z'] = embedding_3d[:, 2]


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [36]:
# plot again coloring by name
fig = px.scatter_3d(
    df,
    x="umap_3d_x",
    y="umap_3d_y",
    z="umap_3d_z",
    color="name",   # color points by canonical name
    hover_data=["idx", "name", "quote"],  # add info on hover
    opacity=0.7,
    width=900,
    height=600
)

fig.update_layout(
    title="3D UMAP projection of Quote Embeddings",
    scene=dict(
        xaxis_title="UMAP 1",
        yaxis_title="UMAP 2",
        zaxis_title="UMAP 3"
    )
)

fig.show()

In [37]:
# filter again
df_filtered = df[df['name'] != "Other"]

df_filtered.head()

Unnamed: 0,idx,mention_phrase,quote,canonical_name,quote_count,name,quote_embedding,umap_x,umap_y,umap_3d_x,umap_3d_y,umap_3d_z
10,10,Mr. Woodhouse,"“ It is very kind of you , Mr. Knightley , to ...",Mr. Woodhouse,52,Mr. Woodhouse,"[-0.04399728402495384, 0.032596878707408905, 0...",6.369669,6.603613,-5.638496,4.97316,10.017447
11,11,Mr. Woodhouse,"“ Not at all , sir . It is a beautiful moonlig...",Mr. Woodhouse,52,Mr. Woodhouse,"[-0.0015808487078174949, 0.08963127434253693, ...",10.691205,7.854729,-1.885758,3.818945,9.357606
12,12,Mr. Woodhouse,“ But you must have found it very damp and dir...,Mr. Woodhouse,52,Mr. Woodhouse,"[-0.042439017444849014, 0.1340944766998291, 0....",9.883797,6.83584,-2.622585,4.798442,9.253331
13,13,Mr. Woodhouse,"“ Dirty , sir ! Look at my shoes . Not a speck...",Mr. Woodhouse,52,Mr. Woodhouse,"[-0.09741733223199844, 0.09301677346229553, 0....",9.989972,10.644764,-2.461584,2.132329,9.249256
14,14,Mr. Woodhouse,"“ Well ! that is quite surprising , for we hav...",Mr. Woodhouse,52,Mr. Woodhouse,"[-0.0009594066650606692, 0.15259331464767456, ...",9.634563,6.951302,-2.80154,4.644891,9.379891


In [38]:
# plot by idx
fig = px.scatter_3d(
    df_filtered,
    x="umap_3d_x",
    y="umap_3d_y",
    z="umap_3d_z",
    color="idx",   # color points by index
    hover_data=["name", "quote"],  # add info on hover
    opacity=0.7,
    width=900,
    height=600
)

fig.update_layout(
    title="3D UMAP projection of Quote Embeddings (colored by index)",
    scene=dict(
        xaxis_title="UMAP 1",
        yaxis_title="UMAP 2",
        zaxis_title="UMAP 3"
    )
)

fig.show()

In [45]:
# just get Emma quotes
df_emma = df[df['name'] == 'Emma']

df_emma.head()

Unnamed: 0,idx,mention_phrase,quote,canonical_name,quote_count,name,quote_embedding,umap_x,umap_y,umap_3d_x,umap_3d_y,umap_3d_z
18,18,Emma,“ Especially when _ one _ of those two is such...,Emma,379,Emma,"[-0.016880381852388382, 0.10539714992046356, 0...",9.651091,9.250559,-1.857381,2.950687,10.436851
19,19,Emma,"“ That is what you have in your head , I know ...",Emma,379,Emma,"[0.002040544291958213, 0.09740548580884933, 0....",9.542214,9.34691,-2.536318,3.031402,9.67859
25,25,Emma,"“ Well , ”",Emma,379,Emma,"[-0.02996857650578022, 0.10348805040121078, 0....",10.766622,11.30513,-1.086947,1.700511,8.401161
31,31,Emma,“ And you have forgotten one matter of joy to ...,Emma,379,Emma,"[-0.01835009455680847, 0.12960755825042725, 0....",9.079893,8.964325,-2.294795,3.142683,11.146729
32,32,Emma,“ and a very considerable one -- that I made t...,Emma,379,Emma,"[-0.0627688616514206, 0.07645916938781738, 0.0...",7.630513,6.715027,-4.22219,4.322448,10.993834


In [46]:
# plot by idx 2d
fig = px.scatter(
    df_emma,
    x="umap_x",
    y="umap_y",
    color="idx",              # numeric column → continuous scale
    color_continuous_scale="Viridis",  # or "Plasma", "Turbo", etc.
    hover_data=["idx", "name", "quote"],
    opacity=0.7,
    width=900,
    height=600
)

fig.update_layout(
    title="UMAP projection of Miss Woodhouse's Quote Embeddings (colored by index)",
    xaxis_title="UMAP 1",
    yaxis_title="UMAP 2"
)

fig.show()

In [49]:
# print first 5 quotes by Emma

for i, row in df_emma.head(5).iterrows():
    print(f"{row['idx']}: {row['quote']}\n")

18: “ Especially when _ one _ of those two is such a fanciful , troublesome creature ! ”

19: “ That is what you have in your head , I know -- and what you would certainly say if my father were not by . ”

25: “ Well , ”

31: “ And you have forgotten one matter of joy to me , ”

32: “ and a very considerable one -- that I made the match myself . I made the match , you know , four years ago ; and to have it take place , and be proved in the right , when so many people said Mr. Weston would never marry again , may comfort me for any thing . ”



In [53]:
# api keys
try:
    with open("../secrets.json") as f:
        secrets = json.load(f)
    open_ai_key = secrets["openai"]
    anthropic_key = secrets["anthropic"]
    google_key = secrets["google"]
    os.environ["OPENAI_API_KEY"] = open_ai_key
    os.environ["ANTHROPIC_API_KEY"] = anthropic_key
    os.environ["GOOGLE_API_KEY"] = google_key
    print("API key loaded.")
except FileNotFoundError:
    print("Secrets file not found. YOU NEED THEM TO RUN THIS.")

API key loaded.


In [57]:
llm = ChatOpenAI(model="gpt-5-nano", temperature=0.1)

In [65]:
# set up pydantic model to extract analysis of each quote
class QuoteAnalysis(BaseModel):
    idx: int = Field(..., description="Index of the quote")
    analysis: str = Field(..., description="1-2 sentence analysis of the quote")

class QuoteList(BaseModel):
    quotes: List[QuoteAnalysis] = Field(..., description="List of quote analyses")

quote_analysis_llm = llm.with_structured_output(QuoteList)

In [59]:
# Define a semaphore to limit concurrency
SEMAPHORE_LIMIT = 20
semaphore = asyncio.Semaphore(SEMAPHORE_LIMIT)

async def semaphore_wrapper(func, *args):
    """Wrap async function with semaphore to limit concurrency"""
    async with semaphore:
        return await func(*args)
    
def _iter_batches(df: pd.DataFrame, batch_size: int):
    n = len(df)
    for start in range(0, n, batch_size):
        yield df.iloc[start:start + batch_size]

In [68]:
async def analyze_quotes_batched(
    df: pd.DataFrame,
    quote_analysis_llm,
    *,
    batch_size: int = 10,
    max_concurrent_batches: int = 20
) -> QuoteList:
    semaphore = asyncio.Semaphore(max_concurrent_batches)
    total_batches = (len(df) + batch_size - 1) // batch_size
    completed_batches = 0

    async def _analyze_batch(batch_df: pd.DataFrame, batch_idx: int) -> QuoteList:
        payload = [
            {"idx": int(row["idx"]), "quote": str(row["quote"])}
            for _, row in batch_df.iterrows()
        ]
        start_prompt = f"""
        Analyze the following quotes from Jane Austen's *Emma*.  
        For EACH quote, produce a 1–2 sentence synthesis touching on:
        syntax & structure, tone, lexical choice, stylistic features, pragmatics, and function.
        Write a tight synthesis (not bullet points), max 2 sentences.

        Input JSON:
        {payload}

        Return as a JSON object with key "quotes" (list of {{idx, analysis}}).
        """

        async with semaphore:
            result = await quote_analysis_llm.ainvoke(start_prompt)

        nonlocal completed_batches
        completed_batches += 1
        print(f"✅ Finished batch {batch_idx+1}/{total_batches} "
              f"({completed_batches}/{total_batches} total)")
        return result

    # Create tasks for each batch
    tasks = []
    for batch_idx, start in enumerate(range(0, len(df), batch_size)):
        batch_df = df.iloc[start:start + batch_size]
        tasks.append(_analyze_batch(batch_df, batch_idx))

    results = await asyncio.gather(*tasks)

    # Merge
    all_quotes = []
    for r in results:
        all_quotes.extend(r.quotes)

    all_quotes.sort(key=lambda qa: qa.idx)
    return QuoteList(quotes=all_quotes)

In [70]:
quotes = await analyze_quotes_batched(df_emma, quote_analysis_llm)

✅ Finished batch 4/38 (1/38 total)
✅ Finished batch 12/38 (2/38 total)
✅ Finished batch 15/38 (3/38 total)
✅ Finished batch 19/38 (4/38 total)
✅ Finished batch 16/38 (5/38 total)
✅ Finished batch 17/38 (6/38 total)
✅ Finished batch 10/38 (7/38 total)
✅ Finished batch 6/38 (8/38 total)
✅ Finished batch 13/38 (9/38 total)
✅ Finished batch 18/38 (10/38 total)
✅ Finished batch 3/38 (11/38 total)
✅ Finished batch 1/38 (12/38 total)
✅ Finished batch 11/38 (13/38 total)
✅ Finished batch 14/38 (14/38 total)
✅ Finished batch 5/38 (15/38 total)
✅ Finished batch 2/38 (16/38 total)
✅ Finished batch 9/38 (17/38 total)
✅ Finished batch 20/38 (18/38 total)
✅ Finished batch 7/38 (19/38 total)
✅ Finished batch 8/38 (20/38 total)
✅ Finished batch 26/38 (21/38 total)
✅ Finished batch 21/38 (22/38 total)
✅ Finished batch 28/38 (23/38 total)
✅ Finished batch 33/38 (24/38 total)
✅ Finished batch 22/38 (25/38 total)
✅ Finished batch 35/38 (26/38 total)
✅ Finished batch 27/38 (27/38 total)
✅ Finished batch 24

In [74]:
# add analyses back to dataframe
analysis_dict = {qa.idx: qa.analysis for qa in quotes.quotes}
df_emma['analysis'] = df_emma['idx'].map(analysis_dict)

df_emma.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,idx,mention_phrase,quote,canonical_name,quote_count,name,quote_embedding,umap_x,umap_y,umap_3d_x,umap_3d_y,umap_3d_z,analysis
18,18,Emma,“ Especially when _ one _ of those two is such...,Emma,379,Emma,"[-0.016880381852388382, 0.10539714992046356, 0...",9.651091,9.250559,-1.857381,2.950687,10.436851,The line is a compact exclamative clause with ...
19,19,Emma,"“ That is what you have in your head , I know ...",Emma,379,Emma,"[0.002040544291958213, 0.09740548580884933, 0....",9.542214,9.34691,-2.536318,3.031402,9.67859,A declarative with a dash-embedded parenthetic...
25,25,Emma,"“ Well , ”",Emma,379,Emma,"[-0.02996857650578022, 0.10348805040121078, 0....",10.766622,11.30513,-1.086947,1.700511,8.401161,The utterance 'Well' is an extremely truncated...
31,31,Emma,“ And you have forgotten one matter of joy to ...,Emma,379,Emma,"[-0.01835009455680847, 0.12960755825042725, 0....",9.079893,8.964325,-2.294795,3.142683,11.146729,The fragment 'And you have forgotten one matte...
32,32,Emma,“ and a very considerable one -- that I made t...,Emma,379,Emma,"[-0.0627688616514206, 0.07645916938781738, 0.0...",7.630513,6.715027,-4.22219,4.322448,10.993834,"A long, self-assertive declarative built aroun..."


In [76]:
# embed analyses
df_emma['analysis_embedding'] = df_emma['analysis'].apply(lambda x: model.encode(x).tolist())

df_emma.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,idx,mention_phrase,quote,canonical_name,quote_count,name,quote_embedding,umap_x,umap_y,umap_3d_x,umap_3d_y,umap_3d_z,analysis,analysis_embedding
18,18,Emma,“ Especially when _ one _ of those two is such...,Emma,379,Emma,"[-0.016880381852388382, 0.10539714992046356, 0...",9.651091,9.250559,-1.857381,2.950687,10.436851,The line is a compact exclamative clause with ...,"[-0.02795586735010147, 0.046819668263196945, 0..."
19,19,Emma,"“ That is what you have in your head , I know ...",Emma,379,Emma,"[0.002040544291958213, 0.09740548580884933, 0....",9.542214,9.34691,-2.536318,3.031402,9.67859,A declarative with a dash-embedded parenthetic...,"[0.014698651619255543, 0.03855948522686958, 0...."
25,25,Emma,"“ Well , ”",Emma,379,Emma,"[-0.02996857650578022, 0.10348805040121078, 0....",10.766622,11.30513,-1.086947,1.700511,8.401161,The utterance 'Well' is an extremely truncated...,"[0.031328797340393066, 0.03757332265377045, 0...."
31,31,Emma,“ And you have forgotten one matter of joy to ...,Emma,379,Emma,"[-0.01835009455680847, 0.12960755825042725, 0....",9.079893,8.964325,-2.294795,3.142683,11.146729,The fragment 'And you have forgotten one matte...,"[-0.022995956242084503, 0.06343995034694672, 0..."
32,32,Emma,“ and a very considerable one -- that I made t...,Emma,379,Emma,"[-0.0627688616514206, 0.07645916938781738, 0.0...",7.630513,6.715027,-4.22219,4.322448,10.993834,"A long, self-assertive declarative built aroun...","[-0.06680276989936829, 0.06785377115011215, 0...."


In [77]:
# umap on analyses
reducer_analysis = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42)
analysis_embeddings = np.array(df_emma['analysis_embedding'].tolist())
analysis_embedding_2d = reducer_analysis.fit_transform(analysis_embeddings)
df_emma['analysis_umap_x'] = analysis_embedding_2d[:, 0]
df_emma['analysis_umap_y'] = analysis_embedding_2d[:, 1]


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [78]:
# plot analyses colored by idx
fig = px.scatter(
    df_emma,
    x="analysis_umap_x",
    y="analysis_umap_y",
    color="idx",              # numeric column → continuous scale
    color_continuous_scale="Viridis",  # or "Plasma", "Turbo", etc.
    hover_data=["idx", "name", "quote", "analysis"],
    opacity=0.7,
    width=900,
    height=600
)

fig.update_layout(
    title="UMAP projection of Miss Woodhouse's Quote Analyses (colored by index)",
    xaxis_title="UMAP 1",
    yaxis_title="UMAP 2"
)
fig.show()

In [79]:
# embed quote + analysis
df_emma['quote_plus_analysis'] = df_emma.apply(lambda row: f"Quote: {row['quote']}\nAnalysis: {row['analysis']}", axis=1)
df_emma.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,idx,mention_phrase,quote,canonical_name,quote_count,name,quote_embedding,umap_x,umap_y,umap_3d_x,umap_3d_y,umap_3d_z,analysis,analysis_embedding,analysis_umap_x,analysis_umap_y,quote_plus_analysis
18,18,Emma,“ Especially when _ one _ of those two is such...,Emma,379,Emma,"[-0.016880381852388382, 0.10539714992046356, 0...",9.651091,9.250559,-1.857381,2.950687,10.436851,The line is a compact exclamative clause with ...,"[-0.02795586735010147, 0.046819668263196945, 0...",2.316968,9.766674,Quote: “ Especially when _ one _ of those two ...
19,19,Emma,"“ That is what you have in your head , I know ...",Emma,379,Emma,"[0.002040544291958213, 0.09740548580884933, 0....",9.542214,9.34691,-2.536318,3.031402,9.67859,A declarative with a dash-embedded parenthetic...,"[0.014698651619255543, 0.03855948522686958, 0....",4.141788,12.771272,"Quote: “ That is what you have in your head , ..."
25,25,Emma,"“ Well , ”",Emma,379,Emma,"[-0.02996857650578022, 0.10348805040121078, 0....",10.766622,11.30513,-1.086947,1.700511,8.401161,The utterance 'Well' is an extremely truncated...,"[0.031328797340393066, 0.03757332265377045, 0....",4.576979,9.042109,"Quote: “ Well , ”\nAnalysis: The utterance 'We..."
31,31,Emma,“ And you have forgotten one matter of joy to ...,Emma,379,Emma,"[-0.01835009455680847, 0.12960755825042725, 0....",9.079893,8.964325,-2.294795,3.142683,11.146729,The fragment 'And you have forgotten one matte...,"[-0.022995956242084503, 0.06343995034694672, 0...",5.837966,9.475883,Quote: “ And you have forgotten one matter of ...
32,32,Emma,“ and a very considerable one -- that I made t...,Emma,379,Emma,"[-0.0627688616514206, 0.07645916938781738, 0.0...",7.630513,6.715027,-4.22219,4.322448,10.993834,"A long, self-assertive declarative built aroun...","[-0.06680276989936829, 0.06785377115011215, 0....",3.88379,10.781953,Quote: “ and a very considerable one -- that I...


In [80]:
# embed quote_plus_analysis
df_emma['quote_plus_analysis_embedding'] = df_emma['quote_plus_analysis'].apply(lambda x: model.encode(x).tolist())
df_emma.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,idx,mention_phrase,quote,canonical_name,quote_count,name,quote_embedding,umap_x,umap_y,umap_3d_x,umap_3d_y,umap_3d_z,analysis,analysis_embedding,analysis_umap_x,analysis_umap_y,quote_plus_analysis,quote_plus_analysis_embedding
18,18,Emma,“ Especially when _ one _ of those two is such...,Emma,379,Emma,"[-0.016880381852388382, 0.10539714992046356, 0...",9.651091,9.250559,-1.857381,2.950687,10.436851,The line is a compact exclamative clause with ...,"[-0.02795586735010147, 0.046819668263196945, 0...",2.316968,9.766674,Quote: “ Especially when _ one _ of those two ...,"[-0.006640954874455929, 0.02996320277452469, 0..."
19,19,Emma,"“ That is what you have in your head , I know ...",Emma,379,Emma,"[0.002040544291958213, 0.09740548580884933, 0....",9.542214,9.34691,-2.536318,3.031402,9.67859,A declarative with a dash-embedded parenthetic...,"[0.014698651619255543, 0.03855948522686958, 0....",4.141788,12.771272,"Quote: “ That is what you have in your head , ...","[0.0013665318256244063, 0.09879616647958755, 0..."
25,25,Emma,"“ Well , ”",Emma,379,Emma,"[-0.02996857650578022, 0.10348805040121078, 0....",10.766622,11.30513,-1.086947,1.700511,8.401161,The utterance 'Well' is an extremely truncated...,"[0.031328797340393066, 0.03757332265377045, 0....",4.576979,9.042109,"Quote: “ Well , ”\nAnalysis: The utterance 'We...","[0.018876980990171432, 0.055413831025362015, 0..."
31,31,Emma,“ And you have forgotten one matter of joy to ...,Emma,379,Emma,"[-0.01835009455680847, 0.12960755825042725, 0....",9.079893,8.964325,-2.294795,3.142683,11.146729,The fragment 'And you have forgotten one matte...,"[-0.022995956242084503, 0.06343995034694672, 0...",5.837966,9.475883,Quote: “ And you have forgotten one matter of ...,"[-0.05123772844672203, 0.0827345997095108, 0.0..."
32,32,Emma,“ and a very considerable one -- that I made t...,Emma,379,Emma,"[-0.0627688616514206, 0.07645916938781738, 0.0...",7.630513,6.715027,-4.22219,4.322448,10.993834,"A long, self-assertive declarative built aroun...","[-0.06680276989936829, 0.06785377115011215, 0....",3.88379,10.781953,Quote: “ and a very considerable one -- that I...,"[-0.06492961943149567, 0.05181235820055008, 0...."


In [81]:
# umap on quote_plus_analysis embeddings
reducer_qpa = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42)
qpa_embeddings = np.array(df_emma['quote_plus_analysis_embedding'].tolist())
qpa_embedding_2d = reducer_qpa.fit_transform(qpa_embeddings)
df_emma['qpa_umap_x'] = qpa_embedding_2d[:, 0]
df_emma['qpa_umap_y'] = qpa_embedding_2d[:, 1]


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [82]:
# plot qpa umap colored by idx
fig = px.scatter(
    df_emma,
    x="qpa_umap_x",
    y="qpa_umap_y",
    color="idx",              # numeric column → continuous scale
    color_continuous_scale="Viridis",  # or "Plasma", "Turbo", etc.
    hover_data=["idx", "name", "quote", "analysis"],
    opacity=0.7,
    width=900,
    height=600
)

fig.update_layout(
    title="UMAP projection of Miss Woodhouse's Quote + Analyses (colored by index)",
    xaxis_title="UMAP 1",
    yaxis_title="UMAP 2"
)
fig.show()