# Chapter 2 Replication: Genre Classification Study

This notebook visualizes the results from Ted Underwood's Chapter 2 genre classification models using Altair.

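The data covers classification of detective fiction, science fiction, gothic fiction, and other literary genres in texts from the nineteenth and twentieth centuries (the model loaded below spans first-publication dates from 1826 to 1999).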

In [1]:
import pandas as pd
import altair as alt
import numpy as np
import os

# Lift Altair's default cap on the number of rows a chart's dataset may
# contain, so charts built from larger DataFrames do not raise MaxRowsError.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Load Model Results

In [2]:
# Where the model output lives and which model to inspect.
output_dir = 'model_output'
model_name = 'detectnewgatesensation'  # Change this to visualize different models

# Read the per-volume predictions written for the selected model.
predictions_path = os.path.join(output_dir, model_name + '.csv')
df = pd.read_csv(predictions_path)

# Quick sanity report: row count and column names, then a preview of the table.
print("Loaded {} predictions".format(len(df)))
print("Columns: {}".format(df.columns.tolist()))
print("\nFirst few rows:")
df.head()

Loaded 574 predictions
Columns: ['volid', 'dateused', 'pubdate', 'birthdate', 'firstpub', 'gender', 'nation', 'allwords', 'logistic', 'realclass', 'trainflag', 'author', 'title', 'genretags']

First few rows:


Unnamed: 0,volid,dateused,pubdate,birthdate,firstpub,gender,nation,allwords,logistic,realclass,trainflag,author,title,genretags
0,uc2.ark+=13960=t70v8bj6k,1922,1922,0,1922,m,,15.526054,0.445023,1,1,"French, Joseph Lewis,",Masterpieces of mystery,locdetmyst | teamblack | locghost
1,nyp.33433074866413,1898,1908,1865,1898,m,uk,14.281603,0.111203,0,1,"Kipling, Rudyard,",The day's work,random | teamred
2,20352,1958,1958,1932,1958,m,us,14.059487,0.191117,0,1,"Updike, John,",The poorhouse fair,chirandom | teamblack
3,njp.32101032308387,1832,1881,1803,1832,m,uk,20.78851,0.170084,1,1,"Lytton, Edward Bulwer Lyt",Eugene Aram,newgate | teamred
4,uva.x000674483,1888,1888,1851,1888,f,uk,15.037266,0.132885,0,1,"Ward, Humphry,",Robert Elsmere,random | teamblack


In [3]:
# Load feature coefficients.
#
# The column names are supplied here rather than taken from the file, so the
# file must be read with header=None: otherwise pandas consumes the first data
# line as a header and that feature silently disappears once the names are
# overwritten.
# NOTE(review): this assumes the .coefs.csv file is normally headerless
# (word, coefficient, impact per line) -- confirm against the file on disk.
coef_columns = ['word', 'coefficient', 'impact']
coefs = pd.read_csv(
    os.path.join(output_dir, f'{model_name}.coefs.csv'),
    header=None,
    names=coef_columns,
)

# Stay compatible with a file that *does* start with a header line: in that
# case the first "coefficient" cell is text that does not parse as a number,
# so drop that row and restore numeric dtypes for the value columns.
if len(coefs) > 0:
    first_coefficient = coefs['coefficient'].iloc[0]
    if isinstance(first_coefficient, str) and pd.isna(
        pd.to_numeric(first_coefficient, errors='coerce')
    ):
        coefs = coefs.iloc[1:].reset_index(drop=True)
        for numeric_column in ('coefficient', 'impact'):
            coefs[numeric_column] = pd.to_numeric(coefs[numeric_column])

print(f"\nLoaded {len(coefs)} feature coefficients")
coefs.head(10)


Loaded 2999 feature coefficients


Unnamed: 0,word,coefficient,impact
0,taught,-1.878489,-1114876000.0
1,their,-1.843144,-2048796.0
2,them,-1.727107,-2424631.0
3,shouted,-1.679311,-207481700.0
4,difference,-1.602835,-585283900.0
5,kissed,-1.597678,-322771300.0
6,tired,-1.596287,-257401400.0
7,weeks,-1.543603,-181952300.0
8,kiss,-1.531457,-425701400.0
9,live,-1.510197,-101261100.0


## Model Performance Overview

In [4]:
# Turn the model's score into a hard prediction using a 0.5 cut-off.
# Both sides of the cut-off are built explicitly so a missing score is
# counted in neither group.
predicted_positive = df['logistic'] > 0.5
predicted_negative = df['logistic'] <= 0.5
actual_positive = df['realclass'] == 1
actual_negative = df['realclass'] == 0

# A volume is "correct" when its thresholded prediction equals its label.
df['correct'] = predicted_positive.astype(int) == df['realclass']
accuracy = df['correct'].mean()
print(f"Overall Accuracy: {accuracy:.2%}")

# Confusion-matrix cells, printed in the order TP, TN, FP, FN.
outcome_masks = [
    ("\nTrue Positives", predicted_positive & actual_positive),
    ("True Negatives", predicted_negative & actual_negative),
    ("False Positives", predicted_positive & actual_negative),
    ("False Negatives", predicted_negative & actual_positive),
]
for outcome_label, outcome_mask in outcome_masks:
    print(f"{outcome_label}: {outcome_mask.sum()}")

Overall Accuracy: 91.46%

True Positives: 261
True Negatives: 264
False Positives: 23
False Negatives: 26


## Visualizations

In [5]:
# 1. Distribution of Logistic Regression Scores
#
# One bin definition is shared by the x-axis and the tooltip. Every encoding
# field that is not aggregated becomes a group-by key for count(), so a raw
# (unbinned) 'logistic' tooltip field would split each bin into one group per
# distinct score and flatten the counts to ~1.
score_bins = alt.Bin(maxbins=50)

score_dist = alt.Chart(df).mark_area(opacity=0.3, interpolate='monotone').encode(
    x=alt.X('logistic:Q', bin=score_bins, title='Prediction Score (0=negative, 1=positive)'),
    # stack=None overlays the two translucent class distributions on a common
    # baseline instead of stacking one on top of the other.
    y=alt.Y('count()', stack=None),
    color=alt.Color('realclass:N', scale=alt.Scale(domain=[0, 1], range=['#1f77b4', '#ff7f0e']),
                     title='Actual Class'),
    tooltip=[
        alt.Tooltip('logistic:Q', bin=score_bins, title='Prediction Score (binned)'),
        'count():Q',
        'realclass:N',
    ]
).properties(
    width=700,
    height=300,
    title=f'{model_name.title()} - Distribution of Prediction Scores'
)

score_dist.display()

In [6]:
# 2. Predictions Over Time
# Work on a narrow copy: publication year, model score, true label, plus the
# derived decade (year floored to a multiple of 10) and thresholded prediction.
df_time = df[['firstpub', 'logistic', 'realclass']].assign(
    decade=lambda frame: (frame['firstpub'] // 10 * 10).astype(int),
    prediction_class=lambda frame: (frame['logistic'] > 0.5).astype(int),
)

# Per-decade totals: labelled positives, predicted positives, mean score, volume count.
decade_stats = (
    df_time.groupby('decade')
    .agg(
        actual_positive=('realclass', 'sum'),
        predicted_positive=('prediction_class', 'sum'),
        mean_score=('logistic', 'mean'),
        count=('logistic', 'count'),
    )
    .reset_index()
)

# Convert the two positive counts into percentages of the volumes in each decade.
share_columns = ['actual_positive', 'predicted_positive']
decade_melted = decade_stats[['decade'] + share_columns].copy()
for share_column in share_columns:
    decade_melted[share_column] = decade_melted[share_column] / decade_stats['count'] * 100

# Long format (one row per decade and category) is what Altair expects.
decade_viz = decade_melted.melt(id_vars=['decade'], var_name='Category', value_name='Percentage')

category_scale = alt.Scale(domain=share_columns, range=['#2ca02c', '#d62728'])

temporal = (
    alt.Chart(decade_viz)
    .mark_line(point=True, size=3)
    .encode(
        x=alt.X('decade:O', title='Decade'),
        y=alt.Y('Percentage:Q', title='% of Texts in Positive Class'),
        color=alt.Color('Category:N', scale=category_scale, title=''),
        tooltip=['decade:O', 'Percentage:Q', 'Category:N'],
    )
    .properties(
        width=700,
        height=300,
        title=f'{model_name.title()} - Genre Distribution Over Time',
    )
)

temporal.display()

In [7]:
# 3. Top Positive and Negative Features
# Keep the 30 coefficients with the largest magnitude, label each by sign
# (zero counts as negative), then order them from most negative to most positive.
coefs_sorted = (
    coefs.sort_values('coefficient', key=abs, ascending=False)
    .head(30)
    .assign(
        type=lambda top: np.where(
            top['coefficient'] > 0, 'Positive (genre)', 'Negative (not genre)'
        )
    )
    .sort_values('coefficient')
)

sign_scale = alt.Scale(
    domain=['Negative (not genre)', 'Positive (genre)'],
    range=['#1f77b4', '#ff7f0e'],
)

features = (
    alt.Chart(coefs_sorted)
    .mark_bar()
    .encode(
        x='coefficient:Q',
        y=alt.Y('word:N', sort='x', title=''),
        color=alt.Color('type:N', scale=sign_scale, title=''),
        tooltip=['word:N', 'coefficient:Q'],
    )
    .properties(
        width=600,
        height=600,
        title=f'{model_name.title()} - Top 30 Feature Coefficients',
    )
)

features.display()

In [8]:
# 4. Confusion Matrix as Heatmap
from itertools import product

# Threshold the score once; the same predicted class is reused for all four cells.
predicted_class = (df['logistic'] > 0.5).astype(int)
class_labels = {0: 'Not Genre', 1: 'Genre'}

# One record per (actual, predicted) pair, visited in the order
# (0, 0), (0, 1), (1, 0), (1, 1).
confusion_data = [
    {
        'Actual': class_labels[actual],
        'Predicted': class_labels[predicted],
        'Count': ((df['realclass'] == actual) & (predicted_class == predicted)).sum(),
    }
    for actual, predicted in product([0, 1], repeat=2)
]

confusion_df = pd.DataFrame(confusion_data)

# Heatmap layer: cell colour encodes the count.
confusion = alt.Chart(confusion_df).mark_rect().encode(
    x='Predicted:N',
    y='Actual:N',
    color=alt.Color('Count:Q', scale=alt.Scale(scheme='blues')),
    tooltip=['Actual:N', 'Predicted:N', 'Count:Q']
).properties(
    width=300,
    height=300,
    title=f'{model_name.title()} - Confusion Matrix'
)

# Text layer: print the count in the centre of each cell.
text = alt.Chart(confusion_df).mark_text(dx=0, dy=0, size=20, fontWeight='bold').encode(
    x='Predicted:N',
    y='Actual:N',
    text='Count:Q'
)

(confusion + text).display()

In [9]:
# 5. Scatter Plot: Actual vs Predicted with Jitter
scatter_data = df[['logistic', 'realclass', 'firstpub', 'title', 'author']].copy()

# Vertical jitter only separates overlapping points around the 0 and 1 class
# lines. It is drawn from a seeded local generator so the figure is identical
# on every Restart & Run All and NumPy's global random state is left untouched.
JITTER_SEED = 42
jitter_rng = np.random.default_rng(JITTER_SEED)
scatter_data['jitter'] = jitter_rng.normal(0, 0.02, len(scatter_data))
scatter_data['realclass_jitter'] = scatter_data['realclass'] + scatter_data['jitter']

scatter = alt.Chart(scatter_data).mark_circle(opacity=0.4, size=50).encode(
    x=alt.X('logistic:Q', title='Predicted Score'),
    y=alt.Y('realclass_jitter:Q', title='Actual Class (0=Not Genre, 1=Genre)'),
    color=alt.Color('firstpub:Q', scale=alt.Scale(scheme='viridis'), title='Publication Year'),
    # The tooltip reports the true (un-jittered) class.
    tooltip=['logistic:Q', 'realclass:N', 'firstpub:Q', 'title:N', 'author:N']
).properties(
    width=700,
    height=400,
    title=f'{model_name.title()} - Predictions vs Actual (Colored by Year)'
)

scatter.display()

## Summary Statistics

In [10]:
# Final text report: dataset size, headline metrics, date range, strongest words.
# Relies on `accuracy` computed in the Model Performance Overview cell.

# Boolean masks shared by the counts and metrics below (0.5 score cut-off).
is_genre = df['realclass'] == 1
is_not_genre = df['realclass'] == 0
flagged_genre = df['logistic'] > 0.5
flagged_not_genre = df['logistic'] <= 0.5

print(f"\n=== {model_name.upper()} Results ===")
print(f"\nDataset:")
print(f"  Total volumes: {len(df)}")
print(f"  Positive cases: {is_genre.sum()}")
print(f"  Negative cases: {is_not_genre.sum()}")

print(f"\nPerformance:")
print(f"  Accuracy: {accuracy:.2%}")

# Confusion counts needed for precision and recall.
tp = (flagged_genre & is_genre).sum()
fp = (flagged_genre & is_not_genre).sum()
fn = (flagged_not_genre & is_genre).sum()

# Each ratio falls back to 0 when its denominator is empty.
precision = 0
if (tp + fp) > 0:
    precision = tp / (tp + fp)

recall = 0
if (tp + fn) > 0:
    recall = tp / (tp + fn)

f1 = 0
if (precision + recall) > 0:
    f1 = 2 * (precision * recall) / (precision + recall)

print(f"  Precision: {precision:.2%}")
print(f"  Recall: {recall:.2%}")
print(f"  F1-Score: {f1:.2%}")

first_publication = df['firstpub']
print(f"\nTemporal Range:")
print(f"  Earliest: {first_publication.min()}")
print(f"  Latest: {first_publication.max()}")

# Ten largest and ten smallest coefficients, shown as plain-text tables.
report_columns = ['word', 'coefficient']

print(f"\nTop 10 Positive Words (indicate genre):")
strongest_positive = coefs.nlargest(10, 'coefficient')[report_columns]
print(strongest_positive.to_string(index=False))

print(f"\nTop 10 Negative Words (indicate NOT genre):")
strongest_negative = coefs.nsmallest(10, 'coefficient')[report_columns]
print(strongest_negative.to_string(index=False))


=== DETECTNEWGATESENSATION Results ===

Dataset:
  Total volumes: 574
  Positive cases: 287
  Negative cases: 287

Performance:
  Accuracy: 91.46%
  Precision: 91.90%
  Recall: 90.94%
  F1-Score: 91.42%

Temporal Range:
  Earliest: 1826
  Latest: 1999

Top 10 Positive Words (indicate genre):
         word  coefficient
       murder     2.538932
        crime     2.387019
       motive     2.257554
       police     2.167845
investigation     2.139250
        guilt     2.045979
       arrest     2.042453
     contents     2.042367
    detective     1.973398
    connected     1.891481

Top 10 Negative Words (indicate NOT genre):
      word  coefficient
    taught    -1.878489
     their    -1.843144
      them    -1.727107
   shouted    -1.679311
difference    -1.602835
    kissed    -1.597678
     tired    -1.596287
     weeks    -1.543603
      kiss    -1.531457
      live    -1.510197
