In [26]:
# Import libraries
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import plotly.graph_objects as go
from sqlalchemy import create_engine
import random
import re
import warnings

os.makedirs("yearly_networks", exist_ok=True)
warnings.filterwarnings("ignore")

In [27]:
# Utility functions
def setup_environment():
    """Setup environment and configuration."""
    os.makedirs("yearly_networks", exist_ok=True)
    warnings.filterwarnings("ignore")

    # Try to import spacy
    try:
        import spacy

        print("SpaCy loaded successfully")
        try:
            nlp = spacy.load("en_core_web_sm")
            print("NER model loaded successfully")
            return nlp
        except:
            print(
                "NER model not found. To install: python -m spacy download en_core_web_sm"
            )
            return None
    except:
        print("SpaCy not installed. For better name extraction: pip install spacy")
        return None


# Setup
nlp = setup_environment()

SpaCy loaded successfully
NER model loaded successfully


In [28]:
def load_data():
    """Load data from database."""
    connection_string = "mysql+pymysql://root@localhost:3306/RAMAPO"

    engine = create_engine(connection_string)
    query = """
    SELECT
        omek_search_texts.title,
        omek_tags.name AS tag_name,
        element_texts.text AS date
    FROM
        omek_items
    JOIN
        omek_item_types ON omek_items.item_type_id = omek_item_types.id
    JOIN
        omek_search_texts ON omek_items.id = omek_search_texts.record_id
    JOIN
        omek_records_tags ON omek_items.id = omek_records_tags.record_id
    JOIN
        omek_tags ON omek_records_tags.tag_id = omek_tags.id
    JOIN
        omek_element_texts AS element_texts ON omek_items.id = element_texts.record_id AND element_texts.element_id = 40
    WHERE
        omek_items.item_type_id = 1 AND
        omek_search_texts.record_type = 'Item'
    """
    df = pd.read_sql(query, engine)

    return df


# Load data
print("Loading data...")
df = load_data()

Loading data...


In [29]:
def clean_date(date_str):
    """Extract year from date string and perform sanity checks"""
    if pd.isna(date_str):
        return np.nan

    # Try to extract year from various formats
    year_match = re.search(r"(\d{4})", str(date_str))
    if year_match:
        year = int(year_match.group(1))

        # Filter to ONLY include 1901-1935
        if year < 1901 or year > 1935:
            return np.nan

        return year
    return np.nan

In [30]:
# Data cleaning functions
def clean_data(df):
    """Clean loaded data and print information."""
    # Check for nulls initially
    print("Initial shape:", df.shape)
    print("\nMissing values before cleaning:")
    print(df.isnull().sum())

    # Remove rows with null values
    df = df.dropna(subset=["title", "tag_name", "date"])

    # Check for nulls after cleaning
    print("\nShape after removing null values:", df.shape)
    print("\nMissing values after removing nulls:")
    print(df.isnull().sum())

    # Extract years early and filter by range
    df["year"] = df["date"].apply(clean_date)

    # Filter to 1901-1935 range before doing anything else
    original_count = len(df)
    df = df.dropna(subset=["year"])
    print(
        f"Removed {original_count - len(df)} rows with missing or invalid years (outside 1901-1935)"
    )

    # Display sample titles
    print("\nSample of titles:")
    for i in range(min(10, len(df))):
        print(f"{i + 1}. {df.iloc[i]['title']}")

    return df


# Clean data and filter by year range (1901-1935) early
print("Cleaning data and filtering to 1901-1935...")
df = clean_data(df)

Cleaning data and filtering to 1901-1935...
Initial shape: (56171, 3)

Missing values before cleaning:
title       2
tag_name    0
date        0
dtype: int64

Shape after removing null values: (56169, 3)

Missing values after removing nulls:
title       0
tag_name    0
date        0
dtype: int64
Removed 288 rows with missing or invalid years (outside 1901-1935)

Sample of titles:
1. Tribute to Theodore Parker, November 17, 1910
2. The Progressive Party and the Negro, November 1912
3. Jane Addams to W. E. B. Du Bois, June 23, 1903
4. The Black Scourge in Europe, April 10, 1920
5. M. F. C. Honoré to Jane Addams, January 11, 1923
6. Zonia Baber to Jane Addams, September 3, 1929
7. Jane Addams to Sarah Alice Addams Haldeman, June 5, 1903
8. Jane Addams to Jenkin Lloyd Jones, October 20, 1903
9. Mary White Ovington to Jane Addams, January 10, 1903
10. W. E. B. Du Bois to Jane Addams, April 19, 1905


In [31]:
# Name extraction functions
def is_likely_person_name(name):
    """Check if a string is likely to be a person name"""
    # Names should have capital letters
    if not name or not name[0].isupper():
        return False

    # Names usually don't have more than 5 words
    if len(name.split()) > 5:
        return False

    # Names shouldn't contain certain keywords
    non_person_keywords = [
        "The",
        "Report",
        "Article",
        "Speech",
        "Statement",
        "List",
        "Comments",
        "Address",
        "Draft",
        "Program",
        "Resolution",
        "Peace",
        "Child",
        "Labor",
        "Social",
        "Industrial",
        "Education",
        "House",
        "Museum",
        "College",
        "University",
        "Institute",
        "Association",
        "Committee",
        "Conference",
        "League",
        "Department",
        "Council",
        "System",
        "Public",
        "National",
        "International",
        "Federal",
        "Municipal",
        "Settlement",
        "Fellowship",
        "Organization",
        "Secretary to",
        "Assistant to",
        "Office of",
        "Clerk of",
        "Staff of",
        "Secretary to Jane Addams",
        "to Jane Addams",
        "Jane Addams et al.",
    ]

    if any(keyword in name for keyword in non_person_keywords):
        return False

    return True

In [32]:
def extract_names(title):
    """Extract sender and receiver names from document titles, focusing only on real people"""
    if pd.isna(title):
        return [], []

    senders = []
    receivers = []
    title_str = str(title)

    # Pattern 1: "Person1 to Person2" - Most reliable pattern
    to_pattern = re.search(
        r"([A-Z][A-Za-z\s\.\-\']+)\s+to\s+([A-Z][A-Za-z\s\.\-\']+)[,\.]", title_str
    )
    if to_pattern:
        sender = to_pattern.group(1).strip()
        receiver = to_pattern.group(2).strip()

        # Check if these look like person names
        if is_likely_person_name(sender) and is_likely_person_name(receiver):
            senders.append(sender)
            receivers.append(receiver)
            return senders, receivers

    # Check for specific named people like "Frau Noémi P. Vetter"
    if re.search(r"Frau [\w\s\.\-]+ of Vienna", title_str):
        match = re.search(r"(Frau [\w\s\.\-]+) of Vienna", title_str)
        if match:
            person_name = match.group(1)
            senders.append(person_name)
            receivers.append("Jane Addams")
            return senders, receivers

    # Handle specific cases like "Mrs. J. T. Bowen"
    if re.search(r"Mrs\. [\w\s\.\-]+", title_str):
        match = re.search(r"(Mrs\. [\w\s\.\-]+)", title_str)
        if match:
            person_name = match.group(1)
            receivers.append(person_name)
            senders.append("Jane Addams")
            return senders, receivers

    # Add pattern for names with initials (like H. O. Hammond, J. V. Fernandez)
    initial_name_pattern = re.search(
        r"([A-Z]\.\s+[A-Z]\.\s+[A-Za-z]+|[A-Z]\.\s+[A-Z][a-z]+)\s+to\s+", title_str
    )
    if initial_name_pattern:
        sender = initial_name_pattern.group(1).strip()
        # Extract receiver if possible
        receiver_match = re.search(r"to\s+([A-Z][A-Za-z\s\.\-\']+)", title_str)
        if receiver_match:
            receiver = receiver_match.group(1).strip()
            if is_likely_person_name(receiver):
                receivers.append(receiver)

        senders.append(sender)
        return senders, receivers

    # Comprehensive list of known correspondents
    key_names = [
        "Jane Addams",
        "W. E. B. Du Bois",
        "Alice Addams",
        "Mary White Ovington",
        "Sarah Addams",
        "Theodore Parker",
        "Ellen Gates Starr",
        "Jenkin Lloyd Jones",
        "Mary Rozet Smith",
        "Emily Greene Balch",
        "Paul Underwood Kellogg",
        "Eleanor Daggett Karsten",
        "Lillian D. Wald",
        "Sarah Alice Addams Haldeman",
        "Anita McCormick Blaine",
        "Madeleine Zabriskie Doty",
        "Dorothy Detzer",
        "Mary Ryott Sheepshanks",
        "Hannah Clothier Hull",
        "Amy Woods",
        "Rosika Schwimmer",
        "Myra Harriet Reynolds Linn",
        "Florence Kelley",
        "Stanley Ross Linn",
        "Harriet Park Thomas",
        "Alice Hamilton",
        "Grace Abbott",
        "Samuel Flagg Bemis",
        "Sophonisba Breckinridge",
        "Julia Lathrop",
        "Woodrow Wilson",
        "Anna Marcet Haldeman-Julius",
        "Salmon Oliver Levinson",
        "Wilbur Kelsey Thomas",
        "Mabel L. Hyers",
        "Lucia Ames Mead",
        "Julia Clifford Lathrop",
        "Richard Theodore Ely",
        "James Grover McDonald",
        "Anna Garlin Spencer",
        "Anne Henrietta Martin",
        "Graham Taylor",
        "Mary Sheepshanks",
        "Cornelia Ramondt-Hirschmann",
        "Anna Marcet Haldeman",
        "Mina Caroline Ginger Van Winkle",
        "Louis Paul Lochner",
        "Gertrud Baer",
        "Lucy Biddle Lewis",
        "Catherine Elizabeth Marshall",
        "David Starr Jordan",
        "Carrie Chapman Catt",
        "Benjamin Barr Lindsey",
        "Abraham Isaak",
        "Theodore Roosevelt",
        "Allen B. Pond",
        "William Kent",
        "Clara Landsberg",
        "Crystal Eastman",
        "George Platt Brett Sr.",
        "William Draper Lewis",
        "Katharine Coman",
        "Marianne Beth",
        "Karl Beth",
        "Marianne Hainisch",
    ]

    # Look for mentions of specific people in the title
    found_names = []
    for name in key_names:
        if name in title_str:
            found_names.append(name)

    # If we found exactly one name, it's likely Jane Addams (the author)
    if len(found_names) == 1 and found_names[0] == "Jane Addams":
        senders.append("Jane Addams")
        return senders, receivers

    # If we found multiple names, they're likely the people involved
    if len(found_names) > 0:
        for name in found_names:
            if name != "Jane Addams":
                if "Jane Addams" in found_names:
                    # If Jane Addams is in the list, she's likely the sender
                    if "Jane Addams" not in senders:
                        senders.append("Jane Addams")
                    receivers.append(name)
                else:
                    # If Jane Addams isn't in the list, the person is likely the sender
                    senders.append(name)

        # If we have senders but no receivers, Jane Addams is likely the receiver
        if senders and not receivers and "Jane Addams" not in senders:
            receivers.append("Jane Addams")

        return senders, receivers

    # Return empty lists if no names found
    return senders, receivers

In [33]:
def extract_names_from_data(df):
    """Apply name extraction to dataframe."""
    senders_list = []
    receivers_list = []
    for idx, row in df.iterrows():
        s, r = extract_names(row["title"])
        senders_list.append(s)
        receivers_list.append(r)

    df["senders"] = senders_list
    df["receivers"] = receivers_list

    # Check for empty senders/receivers
    empty_names = df[
        (df["senders"].apply(lambda x: len(x) == 0))
        & (df["receivers"].apply(lambda x: len(x) == 0))
    ]
    print(f"\nNumber of items with no extracted names: {len(empty_names)}")

    # Sample these for inspection
    if len(empty_names) > 0:
        print("\nSample of titles with no extracted names:")
        for i in range(min(100, len(empty_names))):
            print(f"{i + 1}. {empty_names.iloc[i]['title']}")

    return df


# Extract names
print("Extracting names for 1901-1935 data...")
df = extract_names_from_data(df)

Extracting names for 1901-1935 data...

Number of items with no extracted names: 11705

Sample of titles with no extracted names:
1. The Progressive Party and the Negro, November 1912
2. The Black Scourge in Europe, April 10, 1920
3. Respect for Law, January 3, 1901
4. Discuss Negroes Yearly, June 2, 1909
5. Social Control, January 1911
6. Advantages and Disadvantages of a Broken Inheritance, May 26, 1908
7. Lynch Six Negroes; Trick Sheriff's Son, May 22, 1911
8. The Progressive Party and Social Legislation, September 18, 1912
9. The Progressive Party and the Negro, November 1912 (fragment)
10. The Progressive Party and the Negro, November 1912
11. The Progressive Party and the Negro, November 1912
12. Has the Emancipation Act Been Nullified by National Indifference? February 1, 1913 (fragment)
13. The Deserted Negro, August 1912
14. The Colonel's Southern Policy, August 7, 1912
15. Call for a Lincoln Conference on the Negro Question, February 12, 1909
16. Colored Farmers of Texas, Aug

In [34]:
# Network analysis functions
def analyze_network(df):
    """Analyze the network data and print statistics."""
    # Create a list of all people mentioned
    all_people = []
    for s in df["senders"]:
        all_people.extend(s)
    for r in df["receivers"]:
        all_people.extend(r)

    # Count frequency of each person
    people_counts = Counter(all_people)
    top_people = people_counts.most_common(50)

    # Calculate percentage of data with no extracted names
    empty_names = df[
        (df["senders"].apply(lambda x: len(x) == 0))
        & (df["receivers"].apply(lambda x: len(x) == 0))
    ]
    total_records = len(df)
    empty_records = len(empty_names)
    empty_percentage = (empty_records / total_records) * 100

    print(f"Total unique people identified: {len(people_counts)}")
    print(
        f"Records with no extracted names: {empty_records} out of {total_records} ({empty_percentage:.2f}%)"
    )

    print("\nTop 50 people mentioned in the documents:")
    for person, count in top_people:
        print(f"{person}: {count} mentions")

    # Analyze senders and receivers separately
    all_senders = []
    for s in df["senders"]:
        all_senders.extend(s)

    all_receivers = []
    for r in df["receivers"]:
        all_receivers.extend(r)

    senders_counts = Counter(all_senders)
    receivers_counts = Counter(all_receivers)

    print(f"\nUnique senders: {len(senders_counts)}")
    print(f"Unique receivers: {len(receivers_counts)}")

    # Plot distribution of top senders and receivers
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    top_senders = pd.Series(dict(senders_counts.most_common(10)))
    top_senders.plot(kind="barh")
    plt.title("Top 10 Senders")
    plt.tight_layout()

    plt.subplot(1, 2, 2)
    top_receivers = pd.Series(dict(receivers_counts.most_common(10)))
    top_receivers.plot(kind="barh")
    plt.title("Top 10 Receivers")
    plt.tight_layout()

    plt.savefig("sender_receiver_distribution.png")
    plt.close()

    print("\nSender-Receiver distribution saved as 'sender_receiver_distribution.png'")

    return top_people


# Analyze network
print("Analyzing network...")
top_people = analyze_network(df)

Analyzing network...
Total unique people identified: 4887
Records with no extracted names: 11705 out of 55881 (20.95%)

Top 50 people mentioned in the documents:
Jane Addams: 38358 mentions
Emily Greene Balch: 2541 mentions
Paul Underwood Kellogg: 1593 mentions
Eleanor Daggett Karsten: 890 mentions
Mary Rozet Smith: 815 mentions
Lillian D. Wald: 583 mentions
Sarah Alice Addams Haldeman: 552 mentions
Madeleine Zabriskie Doty: 513 mentions
Anita McCormick Blaine: 493 mentions
Dorothy Detzer: 444 mentions
Mary Ryott Sheepshanks: 416 mentions
Hannah Clothier Hull: 395 mentions
Amy Woods: 364 mentions
Rosika Schwimmer: 354 mentions
Anna Marcet Haldeman-Julius: 321 mentions
Myra Harriet Reynolds Linn: 320 mentions
Henrietta Octavia Rowland Barnett: 286 mentions
Florence Kelley: 285 mentions
Harriet Park Thomas: 279 mentions
Stanley Ross Linn: 275 mentions
Lucia Ames Mead: 272 mentions
Salmon Oliver Levinson: 263 mentions
Wilbur Kelsey Thomas: 258 mentions
Albert Joseph Kennedy: 252 mentions


In [35]:
def create_network_dataset(df, top_people):
    """Create a network dataset for visualization."""
    network_data = []

    for idx, row in df.iterrows():
        senders = row["senders"]
        receivers = row["receivers"]
        tag = row["tag_name"]
        year = row["year"]

        # If we have both sender and receiver
        if senders and receivers:
            for sender in senders:
                for receiver in receivers:
                    # Avoid self-loops unless it's Jane Addams (to herself)
                    if sender != receiver or sender == "Jane Addams":
                        network_data.append(
                            {
                                "source": sender,
                                "target": receiver,
                                "tag": tag,
                                "year": year,
                                "title": row["title"],
                            }
                        )

    # Convert to DataFrame
    network_df = pd.DataFrame(network_data)

    # Print network statistics
    print("\nNetwork statistics:")
    print(f"Number of connections: {len(network_df)}")
    print(f"Number of unique sources: {network_df['source'].nunique()}")
    print(f"Number of unique targets: {network_df['target'].nunique()}")
    print(f"Number of tags in connections: {network_df['tag'].nunique()}")

    # Save the network data for visualization
    network_df.to_csv("network_data.csv", index=False)
    print("\nNetwork data saved to 'network_data.csv'")

    # Create a smaller dataset with only the main correspondents
    top_people_names = [person for person, _ in top_people[:20]]
    main_network_df = network_df[
        (network_df["source"].isin(top_people_names))
        | (network_df["target"].isin(top_people_names))
    ]

    # Save the main network data
    main_network_df.to_csv("main_network_data.csv", index=False)
    print("Main network data with top correspondents saved to 'main_network_data.csv'")

    return network_df


# Create network dataset
print("Creating network dataset...")
network_df = create_network_dataset(df, top_people)

Creating network dataset...

Network statistics:
Number of connections: 41547
Number of unique sources: 3996
Number of unique targets: 1796
Number of tags in connections: 288

Network data saved to 'network_data.csv'
Main network data with top correspondents saved to 'main_network_data.csv'


In [36]:
def get_name_variations():
    """Return standardized name variations dictionary."""
    return {
        "Miss Addams": "Jane Addams",
        "Sarah Alice Haldeman Addams": "Sarah Alice Addams Haldeman",
        "Sarah Alice Haldeman Adams": "Sarah Alice Addams Haldeman",
        "Sarah Haldeman Addams": "Sarah Alice Addams Haldeman",
        "S. Yarros": "Rachelle Slobodinsky Yarros",
        "Anna Marcet Haldeman": "Anna Marcet Haldeman-Julius",
        "Mary Sheepshanks": "Mary Ryott Sheepshanks",
        "Anne Martin": "Anne Henrietta Martin",
        "Myra Reynolds Linn": "Myra Harriet Reynolds Linn",
        "Richard T. Ely": "Richard Theodore Ely",
        "Rosika Schwimer": "Rosika Schwimmer",
        "Gandhi": "Mohandas Gandhi",
    }

In [37]:
def get_category_colors():
    """Return category color mapping."""
    return {
        "Peace Work": "#4a6eac",  # Blue
        "Social Reform": "#2ec394",  # Teal
        "Political Activism": "#eb6672",  # Coral/pink
        "Personal Relations": "#964b7d",  # Purple
        "Academic Work": "#f39c12",  # Orange
        "General Correspondence": "#95a5a6",  # Gray
    }

In [38]:
def get_non_person_entities():
    """Return list of non-person entities to exclude."""
    return {
        "Mason-Henry Press",
        "Io Victis",
        "How Would You Uplift",
        "How Build",
        "New York Herald",
        "Trades Unions",
        "Christianity Today",
        "Charitable Effort",
        "Newer Ideas",
        "July Anticipation",
        "Macmillan Company",
        "Recent Immigration",
        "Field Neglected",
        "Other Dangers",
        "Taking Her Place",
        "New Ideals",
        "Hospital Work Among",
        "Changing Ideals",
        "Gotten Gifts",
        "Other Christian Churches",
        "Tenement Housing",
        "As Ithers See Us",
        "Chicago Federation",
        "Newsboy Conditions",
        "Newer Ideals",
        "Neighborhood Improvement",
        "Tribute",
        "American Charities",
        "Introductory Note",
        "American Street Trades",
        "Modern Philanthropy",
        "Probation Work Under Civil Service",
        "Pure Food",
        "Sinai Temple",
        "Woman Suffrage",
        "Commercial Club Dinner",
        "American Immigrants",
        "Is Class Conflict",
        "America Growing",
        "Chicago Agencies",
        "Abraham Lincoln Centre",
        "Chelsea Historical Pageant",
        "Street Trading",
        "Autobiographical Notes Upon Twenty Years",
        "Twenty Years",
        "Autobiographical Notes",
        "Unknown",
        "Ten Years",
        "Why Women Should Vote",
        "New Conscience",
        "Ancient Evil",
        "Progressive Party",
        "Modern Lear",
        "Life Above",
        "Poverty Line",
        "City Youth",
        "Administering the Funds",
        "Houghton Mifflin Company",
        "Constructive Appeal",
        "Foreign Affairs",
        "War Time",
        "Food Supply",
        "Swedish Famine",
        "Sioux City Teachers",
        "Club Women",
        "Civil ServiceAmerican Immigrant",
        "Nineteenth Century Club",
        "Anonymous",
        "More Play",
        "Factory Girls",
        "Newer Conception",
        "New World",
        "American Civil Liberties Union",
        "United States Today",
        "Koven Bowen Biography",
        "Christmas Message",
        "Community Afford",
        "Crime Unsolved",
        "Let Us Start",
        "It Anew",
        "Needed Implement",
        "Business Depressions",
        "Courageous Life",
        "Correctives Suggested",
        "End War",
        "Representative Government",
        "Republican Party Platform",
        "New Day",
        "Revealing Human Needs",
        "Christmas Day",
        "World Comity",
        "Progress Exposition",
        "Modern Woman",
        "Remarks Introducing Eleanor Roosevelt",
        "Orchestra Hall",
        "First Session",
        "Present Necessity",
        "Why Wars Must Cease",
        "Fortieth Anniversary",
        "Because Wars Interfere",
        "Normal Growth",
        "Feminist Physician Speaks",
        "E.S.",
        "Charges Against Hull",
        "The College",
        "Municipal Museum",
        "Frederick Douglass Center",
        "Municipal Museum of Chicago",
        "Civil Service",
        "More Pay",
        "Jacobs",
        "Our Moral Obligation",
        "Women Schedule",
        "City Club",
        "Mayor Turns Censor",
        "Radio Discussion With Frank Bane",
        "Birthday Poem",
        "Her Fiftieth Birthday",
        "Toynbee Hall",
        "Housing Division",
        "Character Building",
        "Hebrew Sheltering",
        "Immigrant Aid Society",
        "Old Age Security",
        "United Charities",
        "Twentieth Anniversary",
        "In Memoriam",
        "A Birthday Greeting",
        "For Jane Addams",
        "G.D.C.",
        "Illinois State Senate Bill",
        "the Chicago Institute",
        "Resolutions Committee",
        "National Conference",
        "Child Laborers",
        "The Process",
        "Relief Mobilization",
        "C.D.M",
        "The Life of Individual",
        "The Pageant",
        "Remarks on Col.",
        "Corrective Suggested",
        "Professor Freund",
        "National Education",
        "Public Words",
        "Unknown E.T",
        "Public Recreation",
        "American Immigrant",
        "Corrective Suggested",
        "American Civil Liberties",
        "Average Citizen Is Ignored",
        "Why Women Are Concerned",
        "Financial Liabilities",
        "Larger Citizenship",
        "World Politics",
        "Religious Comity",
        "Professional Women",
    }

In [39]:
def create_bidirectional_connection_counts(network_df):
    """Create bidirectional connection counts."""
    name_variations = get_name_variations()

    # Create pairs where the alphabetically smaller name comes first
    connection_pairs = []
    for _, row in network_df.iterrows():
        source = row["source"]
        target = row["target"]

        # Standardize name variations
        if source in name_variations:
            source = name_variations[source]
        if target in name_variations:
            target = name_variations[target]

        # Sort names alphabetically to treat A→B and B→A as the same
        if source < target:
            connection_pairs.append((source, target))
        else:
            connection_pairs.append((target, source))

    # Count the combined connections
    combined_counts = Counter(connection_pairs)

    # Create a DataFrame from the counts
    combined_df = pd.DataFrame(
        [
            {"person1": pair[0], "person2": pair[1], "total_interactions": count}
            for pair, count in combined_counts.most_common()
        ]
    )

    # Save the bidirectional connection data
    combined_df.to_csv("bidirectional_connections.csv", index=False)
    print("\nBidirectional connection data saved to 'bidirectional_connections.csv'")

    return combined_df


# Create bidirectional connections
print("Creating bidirectional connection counts...")
bidirectional_df = create_bidirectional_connection_counts(network_df)

Creating bidirectional connection counts...

Bidirectional connection data saved to 'bidirectional_connections.csv'


In [40]:
def get_top_connections(network_df, year, non_person_entities, name_variations):
    """Get the top connections for a specific year.

    Returns:
        tuple: (list of top people, dictionary of person counts)
    """
    # Filter for the specific year
    year_data = network_df[network_df["year"] == year]

    if len(year_data) > 0:
        # Count connections
        year_connections = []

        for _, row in year_data.iterrows():
            source = row["source"]
            target = row["target"]

            # Standardize name variations
            if source in name_variations:
                source = name_variations[source]
            if target in name_variations:
                target = name_variations[target]

            # Exclude connections involving non-person entities
            if source in non_person_entities or target in non_person_entities:
                continue

            # Skip self-connections (Jane Addams to Jane Addams)
            if source == "Jane Addams" and target == "Jane Addams":
                continue

            # Only include connections where Jane Addams is involved
            if source != "Jane Addams" and target != "Jane Addams":
                continue

            # For consistency, always put Jane Addams as the first person
            if target == "Jane Addams":
                # Swap to ensure Jane Addams is first
                source, target = target, source

            year_connections.append((source, target))

        # Count the combined connections
        year_counts = Counter(year_connections)
        top_pairs = year_counts.most_common(15)

        # Extract just the target names and their counts
        top_people = []
        person_counts = {}

        for pair, count in top_pairs:
            person = pair[1]  # The target person (not Jane Addams)
            top_people.append(person)
            person_counts[person] = count

        return top_people, person_counts

    return [], {}

In [41]:
def standardize_names(network_df):
    """Apply name standardization to the network data."""
    name_variations = get_name_variations()

    # Apply name standardization directly to the network_df
    for name, standard in name_variations.items():
        network_df.loc[network_df["source"] == name, "source"] = standard
        network_df.loc[network_df["target"] == name, "target"] = standard

    return network_df


# Standardize names
print("Standardizing names...")
network_df = standardize_names(network_df)

Standardizing names...


In [42]:
def filter_jane_addams_network(network_df):
    """Filter to only include connections with Jane Addams and non-entities."""
    non_person_entities = get_non_person_entities()

    # Filter to only include connections with Jane Addams and non-entities
    addams_network = network_df[
        (
            (network_df["source"] == "Jane Addams")
            | (network_df["target"] == "Jane Addams")
        )
        & (~network_df["source"].isin(non_person_entities))
        & (~network_df["target"].isin(non_person_entities))
    ]

    # Create a column for the other person in each connection
    addams_network["other_person"] = addams_network.apply(
        lambda row: row["target"] if row["source"] == "Jane Addams" else row["source"],
        axis=1,
    )

    return addams_network


# Filter to Jane Addams network
print("Filtering to Jane Addams network...")
addams_network = filter_jane_addams_network(network_df)

Filtering to Jane Addams network...


In [43]:
def assign_categories(addams_network):
    """Assign categories to people in the network."""
    # Define category mapping based on the most common tags
    category_mapping = {
        # Social Reform category
        "Social Reform": [
            "Social",
            "Reform",
            "Settlement",
            "Hull-House",
            "Child Labor",
            "Labor",
            "Education",
        ],
        # Peace Work category
        "Peace Work": [
            "Peace",
            "War",
            "International",
            "Disarmament",
            "Arbitration",
            "Neutrality",
        ],
        # Political Activism category
        "Political Activism": [
            "Politics",
            "Woman Suffrage",
            "Progressive",
            "Democracy",
            "Government",
        ],
        # Personal Relations category
        "Personal Relations": [
            "Family",
            "Health",
            "Personal",
            "Praise",
            "Gratitude",
            "Holidays",
        ],
        # Academic Work category
        "Academic Work": [
            "Books",
            "Publishing",
            "Academic",
            "Research",
            "Lectures",
            "Writing",
        ],
    }

    # Group by person and tag to find dominant themes
    person_tag_counts = (
        addams_network.groupby(["other_person", "tag"]).size().reset_index(name="count")
    )

    # Assign each person to a category based on their most common tags
    person_categories = {}

    for person, group in person_tag_counts.groupby("other_person"):
        # Get top tags for this person
        top_tags = group.sort_values("count", ascending=False)["tag"].tolist()

        # Check which category their tags match best
        category_scores = {category: 0 for category in category_mapping}

        for tag in top_tags:
            for category, keywords in category_mapping.items():
                if any(keyword.lower() in tag.lower() for keyword in keywords):
                    category_scores[category] += 1

        # Assign to the category with the highest score
        if any(category_scores.values()):
            top_category = max(category_scores.items(), key=lambda x: x[1])[0]
            person_categories[person] = top_category
        else:
            person_categories[person] = "General Correspondence"

    # Add category information to the network
    addams_network["category"] = addams_network["other_person"].map(person_categories)

    # Print categories for top correspondents
    print("Category assignments for top correspondents:")
    for person in addams_network["other_person"].value_counts().head(20).index:
        category = person_categories.get(person, "General Correspondence")
        print(f"{person}: {category}")

    return addams_network


# Assign categories
print("Assigning categories to correspondents...")
addams_network = assign_categories(addams_network)

Assigning categories to correspondents...
Category assignments for top correspondents:
Emily Greene Balch: Peace Work
Paul Underwood Kellogg: Social Reform
Sarah Alice Addams Haldeman: Social Reform
Mary Ryott Sheepshanks: Peace Work
Anna Marcet Haldeman-Julius: Personal Relations
Lillian D. Wald: Social Reform
Anita McCormick Blaine: Social Reform
Madeleine Zabriskie Doty: Peace Work
Mary Rozet Smith: Social Reform
Myra Harriet Reynolds Linn: Personal Relations
Rosika Schwimmer: Social Reform
Dorothy Detzer: Social Reform
Amy Woods: Peace Work
Hannah Clothier Hull: Peace Work
Henrietta Octavia Rowland Barnett: Social Reform
Stanley Ross Linn: Personal Relations
Salmon Oliver Levinson: Peace Work
Anne Henrietta Martin: Personal Relations
Florence Kelley: Social Reform
Richard Theodore Ely: Social Reform


In [44]:
def create_yearly_top_connections(addams_network):
    """Create a dataset of top connections for each year."""
    name_variations = get_name_variations()
    non_person_entities = get_non_person_entities()

    # Analyze top connections by year
    year_range = range(
        int(addams_network["year"].min()), int(addams_network["year"].max()) + 1
    )

    print("\n===== Top 15 Connections with Jane Addams by Year =====\n")

    # Save the top 15 connections for each year in a CSV file
    top_connections_by_year = []

    for year in year_range:
        # Get top connections for this year
        top_people, person_counts = get_top_connections(
            addams_network, year, non_person_entities, name_variations
        )

        if top_people:
            for person in top_people:
                top_connections_by_year.append(
                    {
                        "year": year,
                        "source": "Jane Addams",
                        "target": person,
                        "count": person_counts[person],
                    }
                )

            print(f"Year {year} - Top connections with Jane Addams:")
            for person in top_people:
                print(f"  Jane Addams ⟷ {person}: {person_counts[person]} interactions")
            print("")

    # Convert to DataFrame
    top_connections_df = pd.DataFrame(top_connections_by_year)

    # Save the top connections to a CSV file
    top_connections_df.to_csv("top_connections_by_year.csv", index=False)
    print("\nTop connections by year saved to 'top_connections_by_year.csv'.")

    return top_connections_df


# Create yearly top connections
print("Creating yearly top connections...")
top_connections_df = create_yearly_top_connections(addams_network)

# Save the categorized network for the next part
addams_network.to_csv("jane_addams_categorized_network.csv", index=False)
print("Categorized network saved to 'jane_addams_categorized_network.csv'")

Creating yearly top connections...

===== Top 15 Connections with Jane Addams by Year =====

Year 1901 - Top connections with Jane Addams:
  Jane Addams ⟷ Sarah Alice Addams Haldeman: 38 interactions
  Jane Addams ⟷ Mary Rozet Smith: 18 interactions
  Jane Addams ⟷ Richard Theodore Ely: 18 interactions
  Jane Addams ⟷ Florence Kelley: 17 interactions
  Jane Addams ⟷ Abraham Isaak: 12 interactions
  Jane Addams ⟷ Anita McCormick Blaine: 10 interactions
  Jane Addams ⟷ Lillian D. Wald: 9 interactions
  Jane Addams ⟷ James Weber Linn: 9 interactions
  Jane Addams ⟷ George Platt Brett Sr.: 6 interactions
  Jane Addams ⟷ Charles Hammond Blatchford: 5 interactions
  Jane Addams ⟷ Maynard M. Metcalf: 4 interactions
  Jane Addams ⟷ Jacques Bardoux: 4 interactions
  Jane Addams ⟷ Jenkin Lloyd Jones: 4 interactions
  Jane Addams ⟷ Henry Carter Adams: 4 interactions
  Jane Addams ⟷ Charles O. Boring: 3 interactions

Year 1902 - Top connections with Jane Addams:
  Jane Addams ⟷ Sarah Alice Addams 

In [45]:
def create_network_data(year, network_df):
    """Prepare network visualization data for a specific year."""
    name_variations = get_name_variations()
    non_person_entities = get_non_person_entities()
    category_colors = get_category_colors()

    # Get the top connections for this year
    top_people, person_counts = get_top_connections(
        network_df, year, non_person_entities, name_variations
    )

    if not top_people:
        return None

    # Get categories for each person
    person_categories = {}
    for person in top_people:
        # Find this person in the data
        person_data = network_df[
            (network_df["other_person"] == person) & (network_df["year"] == year)
        ]
        if not person_data.empty and "category" in person_data.columns:
            person_categories[person] = person_data["category"].iloc[0]
        else:
            person_categories[person] = "General Correspondence"

    # Calculate positions with improved layout
    positions = {}
    positions["Jane Addams"] = [0, 0]

    # Group by category
    category_people = {}
    for person in top_people:
        category = person_categories.get(person, "General Correspondence")
        if category not in category_people:
            category_people[category] = []
        category_people[category].append(person)

    # Assign angles to categories, with more spacing
    category_angles = {}
    for i, category in enumerate(category_people.keys()):
        category_angles[category] = i * (2 * np.pi / len(category_people))

    # Use a larger canvas
    min_distance = 0.8
    max_distance = 2.0

    # Position people within their categories
    for category, people in category_people.items():
        base_angle = category_angles[category]
        category_width = 2 * np.pi / len(category_people)
        angle_step = category_width / (len(people) + 1)

        for i, person in enumerate(people):
            person_angle = base_angle + (i + 1) * angle_step

            max_count = max(person_counts.values())
            connection_strength = person_counts[person] / max_count

            distance = max_distance - (
                connection_strength * (max_distance - min_distance)
            )
            distance += random.uniform(-0.05, 0.05)

            positions[person] = [
                distance * np.cos(person_angle),
                distance * np.sin(person_angle),
            ]

    # Check for overlaps and adjust as needed
    overlap_iterations = 5
    for _ in range(overlap_iterations):
        overlaps_fixed = 0

        nodes = list(positions.keys())
        for i, node1 in enumerate(nodes):
            for node2 in nodes[i + 1 :]:
                x1, y1 = positions[node1]
                x2, y2 = positions[node2]

                distance = np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)

                if distance < 0.5 and node1 != "Jane Addams" and node2 != "Jane Addams":
                    dx, dy = x2 - x1, y2 - y1
                    if distance > 0:
                        dx, dy = dx / distance, dy / distance
                    else:
                        dx, dy = random.uniform(-1, 1), random.uniform(-1, 1)

                    positions[node1] = [x1 - dx * 0.1, y1 - dy * 0.1]
                    positions[node2] = [x2 + dx * 0.1, y2 + dy * 0.1]
                    overlaps_fixed += 1

        if overlaps_fixed == 0:
            break

    # Create better text positions that don't overlap
    text_positions = {}
    for person in positions:
        if person == "Jane Addams":
            text_positions[person] = "middle center"
            continue

        x, y = positions[person]
        angle = np.arctan2(y, x)

        if -np.pi / 8 <= angle < np.pi / 8:  # Right
            text_positions[person] = "middle right"
        elif np.pi / 8 <= angle < 3 * np.pi / 8:  # Upper right
            text_positions[person] = "bottom right"
        elif 3 * np.pi / 8 <= angle < 5 * np.pi / 8:  # Top
            text_positions[person] = "bottom center"
        elif 5 * np.pi / 8 <= angle < 7 * np.pi / 8:  # Upper left
            text_positions[person] = "bottom left"
        elif 7 * np.pi / 8 <= angle or angle < -7 * np.pi / 8:  # Left
            text_positions[person] = "middle left"
        elif -7 * np.pi / 8 <= angle < -5 * np.pi / 8:  # Lower left
            text_positions[person] = "top left"
        elif -5 * np.pi / 8 <= angle < -3 * np.pi / 8:  # Bottom
            text_positions[person] = "top center"
        else:  # Lower right
            text_positions[person] = "top right"

    # Get more details about each correspondence
    correspondence_details = {}
    for person in top_people:
        # Find this person's interactions with Jane Addams
        interactions = network_df[
            (
                (
                    (network_df["source"] == "Jane Addams")
                    & (network_df["target"] == person)
                )
                | (
                    (network_df["target"] == "Jane Addams")
                    & (network_df["source"] == person)
                )
            )
            & (network_df["year"] == year)
        ]

        # Format correspondence details
        details = f"<b>{person}</b><br>"
        details += f"Total interactions: {person_counts[person]}<br>"
        details += (
            f"Category: {person_categories.get(person, 'General Correspondence')}"
        )

        # Only try to add tag information if available in the data
        if not interactions.empty:
            # Check which column to use for tags
            if "tag_name" in interactions.columns:
                tag_column = "tag_name"
            elif "tag" in interactions.columns:
                tag_column = "tag"
            else:
                tag_column = None

            # Add tag information if available
            if tag_column:
                tag_counts = interactions[tag_column].value_counts().to_dict()
                if tag_counts:
                    top_tags = sorted(
                        tag_counts.items(), key=lambda x: x[1], reverse=True
                    )[:3]
                    details += "<br>Top topics: " + ", ".join(
                        [f"{tag} ({count})" for tag, count in top_tags]
                    )

        correspondence_details[person] = details

    # Prepare node trace
    node_x = [positions["Jane Addams"][0]]
    node_y = [positions["Jane Addams"][1]]
    node_text = ["Jane Addams"]
    node_hover = ["Jane Addams - Central Node"]
    node_sizes = [100]
    node_colors = ["#4a6eac"]
    node_textpositions = ["middle center"]

    # Add other nodes
    for person in top_people:
        node_x.append(positions[person][0])
        node_y.append(positions[person][1])
        node_text.append(person)
        node_hover.append(correspondence_details[person])

        # Size based on connections
        size = 20 + (person_counts[person] / max(person_counts.values())) * 30
        node_sizes.append(size)

        # Color based on category
        category = person_categories.get(person, "General Correspondence")
        node_colors.append(category_colors.get(category, "#95a5a6"))
        node_textpositions.append(text_positions[person])

    # Prepare edge traces
    edge_x = []
    edge_y = []

    for person in top_people:
        x0, y0 = positions["Jane Addams"]
        x1, y1 = positions[person]

        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    # Return all the data needed for the plot
    return {
        "node_x": node_x,
        "node_y": node_y,
        "node_text": node_text,
        "node_hover": node_hover,
        "node_sizes": node_sizes,
        "node_colors": node_colors,
        "node_textpositions": node_textpositions,
        "edge_x": edge_x,
        "edge_y": edge_y,
        "categories_present": set(person_categories.values()),
    }


# Prepare sample visualization data for one year
sample_year = 1915
print(f"Preparing sample visualization data for {sample_year}...")
network_data = create_network_data(sample_year, addams_network)

if network_data:
    print(f"Successfully prepared visualization data for {sample_year}")
    print(
        f"Found {len(network_data['node_text']) - 1} connections with Jane Addams in {sample_year}"
    )
else:
    print(f"No data available for {sample_year}")


Preparing sample visualization data for 1915...
Successfully prepared visualization data for 1915
Found 15 connections with Jane Addams in 1915


In [46]:
# Create static network visualizations for each year
def create_static_network(year, network_df):
    """Create a static network visualization for a specific year."""
    # Get the network data for this year
    network_data = create_network_data(year, network_df)

    if not network_data:
        print(f"No connections found for year {year}")
        return None

    category_colors = get_category_colors()

    # Create figure
    fig = go.Figure()

    # Add edges
    fig.add_trace(
        go.Scatter(
            x=network_data["edge_x"],
            y=network_data["edge_y"],
            line=dict(width=0.8, color="#888888"),
            hoverinfo="none",
            mode="lines",
            showlegend=False,
        )
    )

    # Add nodes with enhanced hover text
    fig.add_trace(
        go.Scatter(
            x=network_data["node_x"],
            y=network_data["node_y"],
            mode="markers+text",
            marker=dict(
                size=network_data["node_sizes"],
                color=network_data["node_colors"],
                line=dict(width=1, color="#ffffff"),
                opacity=0.85,
            ),
            text=network_data["node_text"],
            textposition=network_data["node_textpositions"],
            textfont=dict(size=10, color="#000000"),
            hovertext=network_data["node_hover"],
            hoverinfo="text",
            showlegend=False,
        )
    )

    # Add category legend
    categories_present = network_data["categories_present"]

    for category in categories_present:
        fig.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode="markers",
                marker=dict(size=10, color=category_colors.get(category, "#95a5a6")),
                name=category,
                showlegend=True,
            )
        )

    # Update layout for a larger view
    fig.update_layout(
        title=f"Jane Addams' Network in {int(year)}",
        font=dict(size=16),
        showlegend=True,
        legend=dict(
            title="Categories",
            xanchor="left",
            yanchor="bottom",
            x=0.01,
            y=0.01,
            bgcolor="rgba(255, 255, 255, 0.8)",
        ),
        margin=dict(b=20, l=5, r=5, t=40),
        width=900,
        height=900,
        plot_bgcolor="rgb(240, 245, 250)",
        xaxis=dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            range=[-2.5, 2.5],
        ),
        yaxis=dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            range=[-2.5, 2.5],
        ),
    )

    return fig

In [47]:
def create_yearly_static_networks(data):
    """Create static network visualizations for each year."""
    # Ensure output directory exists
    os.makedirs("yearly_networks", exist_ok=True)

    for year in sorted(data["year"].unique()):
        # Only process years in our range
        if 1901 <= year <= 1935:
            fig = create_static_network(year, data)
            if fig:
                fig.write_html(f"yearly_networks/network_{int(year)}.html")
                print(f"Created visualization for year {int(year)}")

    print(
        "All static visualizations have been created in the 'yearly_networks' folder."
    )


# Create static network visualizations
print("Creating static network visualizations...")
create_yearly_static_networks(addams_network)

Creating static network visualizations...
Created visualization for year 1901
Created visualization for year 1902
Created visualization for year 1903
Created visualization for year 1904
Created visualization for year 1905
Created visualization for year 1906
Created visualization for year 1907
Created visualization for year 1908
Created visualization for year 1909
Created visualization for year 1910
Created visualization for year 1911
Created visualization for year 1912
Created visualization for year 1913
Created visualization for year 1914
Created visualization for year 1915
Created visualization for year 1916
Created visualization for year 1917
Created visualization for year 1918
Created visualization for year 1919
Created visualization for year 1920
Created visualization for year 1921
Created visualization for year 1922
Created visualization for year 1923
Created visualization for year 1924
Created visualization for year 1925
Created visualization for year 1926
Created visualization 

In [48]:
def add_year_navigation(fig, years_list):
    """Add navigation controls including left and right arrow buttons to the figure."""
    # Set up the layout with title
    fig.update_layout(
        title="Jane Addams' Network Over Time",
        font=dict(size=18),
        showlegend=True,
        legend=dict(
            title="Categories",
            xanchor="left",
            yanchor="bottom",
            x=0.01,
            y=0.01,
            bgcolor="rgba(255, 255, 255, 0.8)",
        ),
        margin=dict(b=20, l=5, r=5, t=60),
        width=900,
        height=900,
        plot_bgcolor="rgb(240, 245, 250)",
        xaxis=dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            range=[-2.5, 2.5],
        ),
        yaxis=dict(
            showgrid=False,
            zeroline=False,
            showticklabels=False,
            range=[-2.5, 2.5],
        ),
    )

    # Create the slider steps
    slider_steps = []
    for year in years_list:
        slider_steps.append(
            dict(
                method="animate",
                args=[
                    [str(int(year))],
                    dict(
                        frame=dict(duration=300, redraw=True),
                        mode="immediate",
                        transition=dict(duration=300),
                    ),
                ],
                label=str(int(year)),
            )
        )

    # Create year slider
    sliders = [
        dict(
            active=0,
            currentvalue={"prefix": "Year: "},
            pad={"t": 50},
            steps=slider_steps,
        )
    ]

    # Create updatemenus for play/pause buttons
    updatemenus = [
        dict(
            type="buttons",
            showactive=False,
            buttons=[
                dict(
                    label="Play",
                    method="animate",
                    args=[
                        None,
                        dict(
                            frame=dict(duration=500, redraw=True),
                            fromcurrent=True,
                            mode="immediate",
                            transition=dict(duration=500),
                        ),
                    ],
                ),
                dict(
                    label="Pause",
                    method="animate",
                    args=[
                        [None],
                        dict(
                            frame=dict(duration=0, redraw=False),
                            mode="immediate",
                            transition=dict(duration=0),
                        ),
                    ],
                ),
            ],
            direction="left",
            pad=dict(r=10, t=85),
            x=0.1,
            y=0,
        ),
    ]

    # Update layout with sliders and buttons
    fig.update_layout(
        updatemenus=updatemenus,
        sliders=sliders,
    )

    # We can't directly modify fig.config, so we'll handle this in save_interactive_visualization
    return fig

In [49]:
def create_interactive_visualization(data):
    """Create an interactive visualization with animation controls and left/right navigation."""
    category_colors = get_category_colors()

    # Generate data for each year
    all_years = sorted([y for y in data["year"].unique() if 1901 <= y <= 1935])
    networks_by_year = {}

    # Prepare data for each year
    for year in all_years:
        network_data = create_network_data(year, data)
        if network_data:
            networks_by_year[year] = network_data

    # Make sure we have data
    if not networks_by_year:
        print("No data available for visualization")
        return None

    # Collect ALL unique categories across all years
    all_categories = set()
    for network in networks_by_year.values():
        all_categories.update(network["categories_present"])

    # Convert to a sorted list for consistent ordering
    all_categories = sorted(list(all_categories))
    print(f"All possible categories across all years: {all_categories}")

    # Create the initial figure with data from the first available year
    first_year = min(networks_by_year.keys())
    initial_data = networks_by_year[first_year]

    # Create figure
    fig = go.Figure()

    # Add initial edges
    fig.add_trace(
        go.Scatter(
            x=initial_data["edge_x"],
            y=initial_data["edge_y"],
            line=dict(width=0.8, color="#888888"),
            hoverinfo="none",
            mode="lines",
            showlegend=False,
            name="Edges",
        )
    )

    # Add initial nodes
    fig.add_trace(
        go.Scatter(
            x=initial_data["node_x"],
            y=initial_data["node_y"],
            mode="markers+text",
            marker=dict(
                size=initial_data["node_sizes"],
                color=initial_data["node_colors"],
                line=dict(width=1, color="#ffffff"),
                opacity=0.85,
            ),
            text=initial_data["node_text"],
            textposition=initial_data["node_textpositions"],
            textfont=dict(size=10, color="#000000"),
            hovertext=initial_data["node_hover"],
            hoverinfo="text",
            showlegend=False,
            name="Nodes",
        )
    )

    # Add ALL category traces, but only show those present in first year
    for category in all_categories:
        # Check if this category is present in the first year
        is_visible = category in initial_data["categories_present"]

        fig.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode="markers",
                marker=dict(size=10, color=category_colors.get(category, "#95a5a6")),
                name=category,
                showlegend=True,
                visible=is_visible,
            )
        )

    # Create frames for each year
    frames = []
    years_list = sorted(networks_by_year.keys())

    for year in years_list:
        network = networks_by_year[year]

        # Create base traces (edges and nodes)
        frame_data = [
            # Edge trace (always index 0)
            go.Scatter(
                x=network["edge_x"],
                y=network["edge_y"],
                line=dict(width=0.8, color="#888888"),
                hoverinfo="none",
                mode="lines",
                showlegend=False,
            ),
            # Node trace (always index 1)
            go.Scatter(
                x=network["node_x"],
                y=network["node_y"],
                mode="markers+text",
                marker=dict(
                    size=network["node_sizes"],
                    color=network["node_colors"],
                    line=dict(width=1, color="#ffffff"),
                    opacity=0.85,
                ),
                text=network["node_text"],
                textposition=network["node_textpositions"],
                textfont=dict(size=10, color="#000000"),
                hovertext=network["node_hover"],
                hoverinfo="text",
                showlegend=False,
            ),
        ]

        # Add ALL category traces, but set visibility based on presence in this year
        for category in all_categories:
            is_visible = category in network["categories_present"]

            frame_data.append(
                go.Scatter(
                    x=[None],
                    y=[None],
                    mode="markers",
                    marker=dict(
                        size=10, color=category_colors.get(category, "#95a5a6")
                    ),
                    name=category,
                    showlegend=True,
                    visible=is_visible,
                )
            )

        # Create the frame with all traces
        frame = go.Frame(
            data=frame_data,
            name=str(int(year)),
        )
        frames.append(frame)

    # Add frames to figure
    fig.frames = frames

    # Add navigation controls with left/right arrows
    add_year_navigation(fig, years_list)

    return fig

In [50]:
def save_interactive_visualization(fig):
    """Save the interactive visualization to HTML file with working arrow buttons."""
    # Extract years from the frames
    years_list = [int(frame.name) for frame in fig.frames]
    years_js_array = "[" + ",".join(map(str, years_list)) + "]"

    # Create additional HTML for the left/right navigation
    additional_html = f"""
    <style>
        .year-nav-button {{
            position: absolute;
            top: 50%;
            transform: translateY(-50%);
            background-color: rgba(74, 110, 172, 0.9);
            color: white;
            border: 3px solid white;
            border-radius: 50%;
            width: 50px;
            height: 50px;
            font-size: 30px;
            cursor: pointer;
            z-index: 10000;
            display: flex;
            align-items: center;
            justify-content: center;
            box-shadow: 0 4px 8px rgba(0,0,0,0.3);
        }}
        
        #prev-year {{
            left: 30px;
        }}
        
        #next-year {{
            right: 30px;
        }}
    </style>

    <script>
        // Wait for the page to fully load
        window.addEventListener('load', function() {{
            // Find the plotly graph container
            const plotlyContainer = document.querySelector('.plotly');
            
            if (!plotlyContainer) {{
                return;
            }}
            
            // Get the ID of the plotly graph (the parent element)
            const graphElement = plotlyContainer.closest('[id]');
            const graphId = graphElement ? graphElement.id : null;
            
            if (!graphId) {{
                return;
            }}
            
            // Variables to track state
            let currentYearIndex = 0;
            const years = {years_js_array};
            
            // Create the buttons with arrow symbols
            const prevButton = document.createElement('button');
            prevButton.id = 'prev-year';
            prevButton.className = 'year-nav-button';
            prevButton.innerHTML = '&#8592;';
            
            const nextButton = document.createElement('button');
            nextButton.id = 'next-year';
            nextButton.className = 'year-nav-button';
            nextButton.innerHTML = '&#8594;';
            
            // Add buttons to the container
            plotlyContainer.appendChild(prevButton);
            plotlyContainer.appendChild(nextButton);
            
            // Navigate to previous year
            prevButton.addEventListener('click', function() {{
                if (currentYearIndex > 0) {{
                    currentYearIndex--;
                    const year = years[currentYearIndex];
                    try {{
                        Plotly.animate(graphId, [year.toString()], {{
                            transition: {{ duration: 300 }},
                            frame: {{ duration: 300 }}
                        }});
                    }} catch (e) {{
                        console.error("Error navigating to previous year:", e);
                    }}
                }}
            }});
            
            // Navigate to next year
            nextButton.addEventListener('click', function() {{
                if (currentYearIndex < years.length - 1) {{
                    currentYearIndex++;
                    const year = years[currentYearIndex];
                    try {{
                        Plotly.animate(graphId, [year.toString()], {{
                            transition: {{ duration: 300 }},
                            frame: {{ duration: 300 }}
                        }});
                    }} catch (e) {{
                        console.error("Error navigating to next year:", e);
                    }}
                }}
            }});
            
            // Find the current year from the slider
            const currentYearElement = document.querySelector('.slider-current-value');
            if (currentYearElement) {{
                const yearText = currentYearElement.textContent;
                const yearMatch = yearText.match(/Year: (\\d+)/);
                if (yearMatch) {{
                    const year = parseInt(yearMatch[1]);
                    currentYearIndex = years.indexOf(year);
                }}
            }}
            
            // Make buttons visible through styles
            prevButton.style.display = 'flex';
            prevButton.style.visibility = 'visible';
            prevButton.style.opacity = '1';
            nextButton.style.display = 'flex';
            nextButton.style.visibility = 'visible';
            nextButton.style.opacity = '1';
        }});
    </script>
    """

    # Save the figure to an HTML file
    fig.write_html("network.html", include_plotlyjs=True, full_html=True)

    # Now open the file, add our custom HTML, and save it again
    with open("network.html", "r") as file:
        html_content = file.read()

    # Insert our custom HTML before the </body> tag
    updated_html = html_content.replace("</body>", additional_html + "</body>")

    with open("network.html", "w") as file:
        file.write(updated_html)

    print("Interactive visualization with arrow navigation saved to 'network.html'")


# Create interactive visualization
print("Creating interactive visualization...")
interactive_fig = create_interactive_visualization(addams_network)

if interactive_fig:
    # Save the interactive visualization with navigation arrows
    print("Saving interactive visualization...")
    save_interactive_visualization(interactive_fig)
    print("Interactive visualization saved successfully.")
else:
    print("Failed to create interactive visualization.")

    print("\nAll analysis and visualization complete!")

Creating interactive visualization...
All possible categories across all years: ['Academic Work', 'General Correspondence', 'Peace Work', 'Personal Relations', 'Political Activism', 'Social Reform']
Saving interactive visualization...
Interactive visualization with arrow navigation saved to 'network.html'
Interactive visualization saved successfully.
