# Novel Scenification - Tag Analysis Notebook

This notebook allows you to run the tag analysis script and explore the resulting data.

## Project Overview

This project analyzes scene usage in English novels circa 1800 by processing custom-tagged HTML files to extract metrics on narrative techniques.

## 1. Run Tag Analysis

First, let's run the `count_tags.py` script which will:
- Process HTML files in `data/input/`
- Generate CSV files in `data/counts/`
- Create a summary Excel file at `data/tag_counts_summary.xlsx`
- Generate Markdown summaries in `data/SUMMARY.md` and `data/SAMPLES.md`

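The script itself is executed in the "Run the external tag analysis script" cell further below. First install the dependencies with the next cell and restart the kernel.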

## Install the library package dependencies

In [None]:
# Install the packages listed in requirements.txt. The %pip magic (rather than
# !pip) installs into the environment of the kernel running this notebook.
%pip install -r requirements.txt

## IMPORTANT: After installing, make sure to RESTART the notebook kernel

## Run the external tag analysis script


In [None]:
!python count_tags.py

## 2. Import Libraries

Now let's import the necessary libraries for data analysis and visualization:

In [None]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML, Markdown

# Plot defaults shared by the figures in this notebook:
# seaborn's white grid theme and a 12x8 inch default figure size.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': [12, 8]})

## 3. Explore Input and Output Files

In [None]:
# Collect the HTML inputs in alphabetical order
input_files = sorted(glob.glob('data/input/*.html'))
print(f"Found {len(input_files)} HTML files in data/input/")

# Preview at most five files together with their size on disk (in KB)
print("\nInput HTML files:")
for file in input_files[:5]:
    size_kb = os.path.getsize(file) / 1024
    print(f"- {os.path.basename(file)} ({size_kb:.1f} KB)")

# Mention how many files were left out of the preview
hidden_inputs = len(input_files) - 5
if hidden_inputs > 0:
    print(f"...and {hidden_inputs} more files")

In [None]:
# Collect the generated per-text CSV files in alphabetical order
output_files = sorted(glob.glob('data/counts/*.csv'))
print(f"Found {len(output_files)} CSV files in data/counts/")

# Preview at most five file names
print("\nOutput CSV files:")
for file in output_files[:5]:
    print(f"- {os.path.basename(file)}")

# Mention how many files were left out of the preview
hidden_outputs = len(output_files) - 5
if hidden_outputs > 0:
    print(f"...and {hidden_outputs} more files")

## 4. Load and Display Summary Data

In [None]:
import re


def extract_title(sheet_name):
    """Return a readable title for a value taken from the ``Sheet`` column.

    Args:
        sheet_name: Cell value. It may be a plain title or the text of an
            ``=HYPERLINK(...)`` formula.

    Returns:
        For HYPERLINK formulas, the last double-quoted argument. This assumes
        Excel's ``HYPERLINK(link_location, friendly_name)`` form, where the
        display text comes last; with a single quoted argument that argument
        is returned. Any other value is returned unchanged.
    """
    if isinstance(sheet_name, str) and '=HYPERLINK' in sheet_name:
        quoted_args = re.findall(r'"(.+?)"', sheet_name)
        if quoted_args:
            # The first quoted argument is the link target, not the title
            return quoted_args[-1]
    return sheet_name


# Check if the Excel summary file exists
excel_path = 'data/tag_counts_summary.xlsx'
if os.path.exists(excel_path):
    print(f"Excel summary file exists at: {excel_path}")

    # Open the workbook once and close it again when done, so the file handle
    # is not left open after this cell has run
    with pd.ExcelFile(excel_path) as workbook:
        # Display available sheets
        sheets = workbook.sheet_names
        print(f"\nAvailable sheets in the Excel file: {sheets}")

        # Load the Summary sheet
        summary_df = pd.read_excel(workbook, sheet_name='Summary')

    # Extract book titles from Sheet column (which may contain HYPERLINK formulas)
    summary_df['Title'] = summary_df['Sheet'].apply(extract_title)

    # Display the key columns
    display_cols = ['Title', 'Total_Words', 'Total_Tags', 'Chapter_Count',
                    'SceneAction_Count', 'SceneAction_Words',
                    'SceneDia_Count', 'SceneDia_Words']

    # Show what is available instead of failing on the first missing column
    missing_cols = [col for col in display_cols if col not in summary_df.columns]
    if missing_cols:
        print(f"\nColumns missing from the Summary sheet: {missing_cols}")

    display(summary_df[[col for col in display_cols if col in summary_df.columns]])
else:
    print("Excel summary file not found. Please run the tag analysis script first.")

## 5. Visualize Basic Metrics

Let's create some basic visualizations of the tag data across texts:

In [None]:
# Create visualizations if summary data is available
if 'summary_df' in locals():
    # Titles are assumed to start with a four-digit year.
    # Cast to str first so the .str accessor also works on non-string cells.
    summary_df['Title'] = summary_df['Title'].astype(str)
    # Year becomes 0 (placeholder) when a title has no leading four-digit year
    summary_df['Year'] = (
        summary_df['Title']
        .str.extract(r'^(\d{4})', expand=False)
        .astype(float)
        .fillna(0)
        .astype(int)
    )

    # Sort by year
    sorted_df = summary_df.sort_values('Year')

    # Shortened title for axis labels: the text between the leading year and the
    # next number; fall back to the full title when the pattern does not match
    sorted_df['Short_Title'] = (
        sorted_df['Title']
        .str.extract(r'^\d{4}\s+(.+?)\s+\d', expand=False)
        .fillna(sorted_df['Title'])
    )

    # Plot total word counts
    plt.figure(figsize=(14, 6))
    ax = sns.barplot(x='Short_Title', y='Total_Words', data=sorted_df)
    plt.title('Total Word Count by Text')
    plt.xticks(rotation=45, ha='right')
    # Leave head room above the tallest bar for its year label
    ax.margins(y=0.1)

    # Put each year label directly on top of its bar; skip the placeholder year 0.
    # NOTE(review): assumes one bar per row, i.e. unique Short_Title values — confirm.
    for i, (year, words) in enumerate(zip(sorted_df['Year'], sorted_df['Total_Words'])):
        if year > 0 and pd.notna(words):
            ax.text(i, words, str(year), ha='center', va='bottom',
                    fontweight='bold', color='black')

    plt.tight_layout()
    plt.show()

    # Prepare data for scene type distribution (one row per text and scene type)
    scene_data = pd.melt(sorted_df,
                         id_vars=['Short_Title', 'Year'],
                         value_vars=['SceneAction_Words', 'SceneDia_Words'],
                         var_name='Scene Type', value_name='Word Count')

    # Clean up scene type names for display (literal replacement, not a regex)
    scene_data['Scene Type'] = scene_data['Scene Type'].str.replace('_Words', '', regex=False)

    # Plot scene type distribution
    plt.figure(figsize=(14, 6))
    sns.barplot(x='Short_Title', y='Word Count', hue='Scene Type', data=scene_data)
    plt.title('Scene Type Distribution (Word Count)')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()
else:
    print("No summary data available for visualization. Run the tag analysis first.")

## 6. Analyze Scene Type Percentages

In [None]:
# Calculate and visualize scene type percentages
if 'sorted_df' in locals():
    # Share of each text's words inside SceneAction / SceneDia; the rest is "Other"
    sorted_df['SceneAction_Pct'] = sorted_df['SceneAction_Words'] / sorted_df['Total_Words'] * 100
    sorted_df['SceneDia_Pct'] = sorted_df['SceneDia_Words'] / sorted_df['Total_Words'] * 100
    sorted_df['Other_Pct'] = 100 - sorted_df['SceneAction_Pct'] - sorted_df['SceneDia_Pct']

    pct_cols = ['SceneAction_Pct', 'SceneDia_Pct', 'Other_Pct']

    # Long format for plotting: one row per text and scene type
    pct_data = pd.melt(sorted_df,
                       id_vars=['Short_Title', 'Year'],
                       value_vars=pct_cols,
                       var_name='Scene Type', value_name='Percentage')

    # Clean up scene type names (literal replacement, not a regex)
    pct_data['Scene Type'] = pct_data['Scene Type'].str.replace('_Pct', '', regex=False)

    # Plot percentages
    plt.figure(figsize=(14, 6))
    sns.barplot(x='Short_Title', y='Percentage', hue='Scene Type', data=pct_data)
    plt.title('Percentage of Text by Scene Type')
    plt.xticks(rotation=45, ha='right')
    plt.ylabel('Percentage of Total Words')
    plt.tight_layout()
    plt.show()

    # Year == 0 is the placeholder stored when a title has no leading four-digit
    # year. Keeping those rows would put a data point at x=0 and stretch the
    # trend axis across many empty centuries, so leave them out.
    dated_df = sorted_df[sorted_df['Year'] > 0]
    undated_count = len(sorted_df) - len(dated_df)
    if undated_count:
        print(f"\nNote: {undated_count} text(s) without a recognizable year "
              "are excluded from the by-year statistics.")

    # Calculate and display average percentages by year
    year_groups = dated_df.groupby('Year')
    year_stats = (
        year_groups
        .agg({
            'SceneAction_Pct': 'mean',
            'SceneDia_Pct': 'mean',
            'Other_Pct': 'mean',
            'Short_Title': 'count'
        })
        .reset_index()
        .rename(columns={'Short_Title': 'Text_Count'})
    )

    print("\nAverage Scene Type Usage by Year:")
    display(year_stats)

    if year_stats.empty:
        print("No texts with a recognizable year; skipping the trend plot.")
    else:
        # Plot trends over time
        year_melt = pd.melt(year_stats,
                            id_vars=['Year', 'Text_Count'],
                            value_vars=pct_cols,
                            var_name='Scene Type', value_name='Percentage')

        year_melt['Scene Type'] = year_melt['Scene Type'].str.replace('_Pct', '', regex=False)

        plt.figure(figsize=(12, 6))
        sns.lineplot(x='Year', y='Percentage', hue='Scene Type',
                     marker='o', markersize=10, linewidth=2,
                     data=year_melt)

        # Annotate each year with the number of texts behind its average
        for row in year_stats.itertuples(index=False):
            plt.text(row.Year, 5, f"n={int(row.Text_Count)}", ha='center', fontweight='bold')

        plt.title('Scene Type Usage Trends by Year')
        plt.ylabel('Average Percentage')
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()
else:
    print("No summary data available for percentage analysis. Run the visualization cell first.")

## 7. Explore Tag Frequency Data

In [None]:
# Load and analyze tag frequency data
if os.path.exists(excel_path):
    try:
        # Load the Summary Freq Tags sheet
        freq_tags_df = pd.read_excel(excel_path, sheet_name='Summary Freq Tags')

        # Tag columns are those ending in _Count, minus the per-text bookkeeping columns
        count_suffix = '_Count'
        non_tag_cols = ('Total_Tags', 'Chapter_Count')
        tag_cols = [col for col in freq_tags_df.columns
                    if isinstance(col, str)
                    and col.endswith(count_suffix)
                    and col not in non_tag_cols]

        # Strip only the trailing suffix; str.replace would also remove an
        # occurrence of "_Count" in the middle of a tag name
        tag_names = [col[:-len(count_suffix)] for col in tag_cols]

        print(f"Found {len(tag_names)} unique tags in the corpus")

        # Corpus-wide totals per tag; a tag may lack a matching _Words column
        tag_counts = {tag: freq_tags_df[f"{tag}_Count"].sum() for tag in tag_names}
        tag_words = {tag: freq_tags_df[f"{tag}_Words"].sum()
                     for tag in tag_names
                     if f"{tag}_Words" in freq_tags_df.columns}

        # Create DataFrames for plotting (top 15 of each measure)
        count_df = pd.DataFrame(list(tag_counts.items()), columns=['Tag', 'Count'])
        count_df = count_df.sort_values('Count', ascending=False).head(15)

        words_df = pd.DataFrame(list(tag_words.items()), columns=['Tag', 'Word Count'])
        words_df = words_df.sort_values('Word Count', ascending=False).head(15)

        # One bar chart per measure: frequency first, then word count
        chart_specs = [
            (count_df, 'Count', 'Top 15 Most Frequent Tags'),
            (words_df, 'Word Count', 'Top 15 Tags by Word Count'),
        ]
        for chart_df, value_col, chart_title in chart_specs:
            if chart_df.empty:
                print(f"No data available for: {chart_title}")
                continue
            plt.figure(figsize=(14, 6))
            sns.barplot(x='Tag', y=value_col, data=chart_df)
            plt.title(chart_title)
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout()
            plt.show()

    except Exception as e:
        # Best effort: report the problem (e.g. a missing sheet) and keep the notebook running
        print(f"Error analyzing tag frequency: {e}")
else:
    print("Excel summary file not found. Please run the tag analysis script first.")

## 8. Analyze an Individual Text

Let's create a function to analyze tag patterns in a specific text:

In [None]:
def analyze_text(text_name=None):
    """Analyze tag distributions in a specific text.

    Prints the document totals, displays the 15 most frequent tags and draws
    two bar charts (top tags by frequency and by word count).

    Args:
        text_name: Substring to look for in the CSV file names under
            ``data/counts/`` (the directory part of the path is ignored).
            The first match in alphabetical order is used. If None, the
            alphabetically first file is analyzed.

    Returns:
        DataFrame with the rows of the chosen CSV minus the
        ``totaldoctagswords`` totals row, or None when no CSV file exists or
        none matches ``text_name``.
    """
    # Sorted so that "the first file" is the same on every platform
    csv_files = sorted(glob.glob('data/counts/*.csv'))

    if not csv_files:
        print("No CSV files found. Run the tag analysis first.")
        return None

    # If no text specified, use the first one
    if text_name is None:
        csv_file = csv_files[0]
    else:
        # Match on the file name only; matching the whole path would let
        # 'data' or 'counts' select every file
        matches = [f for f in csv_files if text_name in os.path.basename(f)]
        if not matches:
            print(f"No files found matching '{text_name}'")
            return None
        csv_file = matches[0]

    file_label = os.path.basename(csv_file)
    print(f"Analyzing: {file_label}")

    # Load the CSV
    df = pd.read_csv(csv_file)

    # Document totals live in a dedicated row; tolerate files that lack it
    is_totals_row = df['tag'] == 'totaldoctagswords'
    if is_totals_row.any():
        totals = df[is_totals_row].iloc[0]
        print(f"Total tags: {totals['tag_count']}")
        print(f"Total words: {totals['word_count']}")
    else:
        print("No 'totaldoctagswords' row found; document totals unavailable.")

    # Remove totals row
    df = df[~is_totals_row]

    # Show top tags by frequency
    freq_df = df.sort_values('tag_count', ascending=False).head(15)
    print("\nTop 15 most frequent tags:")
    display(freq_df[['tag', 'tag_count', 'word_count']])

    word_df = df.sort_values('word_count', ascending=False).head(15)

    # Same chart for both measures: frequency first, then word count
    chart_specs = [
        (freq_df, 'tag_count', f'Top Tags by Frequency in {file_label}'),
        (word_df, 'word_count', f'Top Tags by Word Count in {file_label}'),
    ]
    for chart_df, value_col, chart_title in chart_specs:
        plt.figure(figsize=(14, 6))
        sns.barplot(x='tag', y=value_col, data=chart_df)
        plt.title(chart_title)
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

    return df

In [None]:
# Analyze the first text (if available).
# Keep output_files sorted, as in the file listing earlier in the notebook,
# so that re-running this cell does not replace it with an unordered list.
output_files = sorted(glob.glob('data/counts/*.csv'))
if output_files:
    text_df = analyze_text()
else:
    print("No output files found. Please run the tag analysis script first.")

## 9. Analyze Nested Tag Combinations

Let's examine compound tags (those with an underscore, indicating nesting) across the corpus:

In [None]:
def analyze_compound_tags():
    """Analyze nested tag combinations across all texts.

    Reads every CSV under ``data/counts/``, keeps the rows whose tag contains
    an underscore, sums ``tag_count`` and ``word_count`` per tag over all
    texts, and displays and plots the top 15 by frequency and by word count.

    Returns:
        DataFrame with the compound-tag rows of all texts (one row per text
        and tag, with a ``text`` column holding the CSV file name without its
        extension). It is empty when no compound tag exists. None when no CSV
        file is found.
    """
    # Sorted for a reproducible row order in the combined frame
    csv_files = sorted(glob.glob('data/counts/*.csv'))
    if not csv_files:
        print("No CSV files found. Run the tag analysis first.")
        return None

    # Combine data from all files, remembering which text each row came from
    all_data = []
    for csv_file in csv_files:
        df = pd.read_csv(csv_file)
        # splitext drops only the extension; str.replace would also remove a
        # '.csv' that occurs in the middle of a file name
        df['text'] = os.path.splitext(os.path.basename(csv_file))[0]
        all_data.append(df)

    combined_df = pd.concat(all_data, ignore_index=True)

    # Filter for compound tags (containing a literal underscore). Casting to
    # str keeps the mask strictly boolean even when a tag cell is empty/NaN.
    is_compound = combined_df['tag'].astype(str).str.contains('_', regex=False)
    compound_tags = combined_df[is_compound]

    if compound_tags.empty:
        print("No nested tag combinations found.")
        return compound_tags

    # Sum counts across all texts
    tag_totals = compound_tags.groupby('tag').agg({
        'tag_count': 'sum',
        'word_count': 'sum'
    }).reset_index()

    # Same table and chart for both measures: frequency first, then word count
    report_specs = [
        ('tag_count',
         "Top nested tag combinations by frequency:",
         'Top Nested Tag Combinations by Frequency'),
        ('word_count',
         "\nTop nested tag combinations by word count:",
         'Top Nested Tag Combinations by Word Count'),
    ]
    for value_col, heading, chart_title in report_specs:
        top_tags = tag_totals.sort_values(value_col, ascending=False).head(15)
        print(heading)
        display(top_tags)

        plt.figure(figsize=(14, 6))
        sns.barplot(x='tag', y=value_col, data=top_tags)
        plt.title(chart_title)
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

    return compound_tags

# Run the analysis
compound_data = analyze_compound_tags()