# Shakespeare Classification

Three Goals:
1. Visualize and explore relationships between characters in all of Shakespeare's plays.
2. Determine a way to compare these networks of relationships
3. Build a model that uses this comparison metric to distinguish between comedies and tragedies *without looking at any dialogue.*

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import re

## Download Data

We're downloading the dialogue from all of Shakespeare's plays via Kaggle. The cell below contains the command you'd run in your terminal; to use it you need a Kaggle account and API key.

In [None]:
import subprocess
import sys
import os
# Download step, kept for reference only (needs Kaggle API credentials):
# subprocess.run("kaggle datasets download -d kingburrito666/shakespeare-plays && unzip shakespeare-plays.zip",shell=True)

# Load every line of dialogue into a single dataframe and report its size
plays_df = pd.read_csv("Shakespeare_data.csv")
print("{} rows".format(len(plays_df)))
plays_df.head()

In [None]:
# Drop stage directions (rows with no Act.Scene.Line reference).
# .copy() makes the filtered result an independent frame; without it the
# column assignments below write into a slice of the original dataframe
# and pandas emits SettingWithCopyWarning.
plays_df = plays_df[plays_df['ActSceneLine'].notna()].copy()

# Split the "act.scene.line" string into three numeric columns
plays_df[['Act','Scene','Line']] = plays_df['ActSceneLine'].str.split('.',expand = True).astype(float)
plays_df = plays_df.drop('ActSceneLine',axis=1)

# Standardize play casing so titles match the genre lists used later.
# The vectorised .str accessor also passes missing values through instead
# of raising the way a per-element x.title() call would.
plays_df['Play'] = plays_df['Play'].str.title()
plays_df.head()

### Remove plays defined as "histories"

In [None]:
# Play titles grouped by genre. The spelling and casing follow the output of
# str.title() (e.g. "Henry Iv"), because that is how the "Play" column is
# normalised before these lists are compared against it.

# Histories: excluded from the analysis
histories = [
    "King John",
    "Henry Iv",
    "Henry Vi Part 1",
    "Henry V",
    "Henry Vi Part 2",
    "Henry Vi Part 3",
    "Henry Viii",
    "Richard Ii",
    "Richard Iii",
]

# Comedies
comedies = [
    "A Midsummer Nights Dream",
    "A Comedy Of Errors",
    "Taming Of The Shrew",
    "Two Gentlemen Of Verona",
    "Loves Labours Lost",
    "The Tempest",
    "A Winters Tale",
    "Cymbeline",
    "Pericles",
    "Alls Well That Ends Well",
    "Measure For Measure",
    "Troilus And Cressida",
    "Twelfth Night",
    "As You Like It",
    "Much Ado About Nothing",
    "Merchant Of Venice",
    "Merry Wives Of Windsor",
]

# Tragedies
tragedies = [
    "Macbeth",
    "Titus Andronicus",
    "Romeo And Juliet",
    "King Lear",
    "Hamlet",
    "Othello",
    "Julius Caesar",
    "Antony And Cleopatra",
    "Coriolanus",
    "Timon Of Athens",
]

# Keep only the rows whose play is not one of the histories
is_history = plays_df["Play"].isin(histories)
plays_df = plays_df[~is_history]

n_rows, n_cols = plays_df.shape
print("{} rows and {} columns".format(n_rows, n_cols))
plays_df.head()

All the plays we have to work with

In [None]:
# Distinct play titles remaining after the histories were removed, in order
# of first appearance in the dataframe.
play_list = plays_df["Play"].unique()
# (A bare `play_list` expression used to sit here; it is not the last
# statement of the cell, so it displayed nothing and has been removed.)
print(len(play_list))
print(play_list)

In [None]:
# The play whose character network is built in the cells that follow
play_name = "Romeo And Juliet"

# Every line of dialogue belonging to that play
belongs_to_play = plays_df['Play'] == play_name
single_play = plays_df[belongs_to_play]
single_play.head()

## Build play network

### Drop characters who speak 5 times or fewer

In [None]:
# Count the rows of dialogue attributed to each character; the count lands
# directly in a column named "Count"
lines_per_character = single_play.groupby(['Player']).size()
top_characters = lines_per_character.reset_index(name='Count')

# Only characters with more than 5 rows stay in the network
speaks_enough = top_characters["Count"] > 5
top_characters = top_characters[speaks_enough]
top_characters.head()

### Create Graph and add all the characters

In [None]:
# Start from an empty undirected graph and add one node per retained character
character_graph = nx.Graph()
for character_name in top_characters["Player"]:
    character_graph.add_node(character_name)

### Go scene by scene and add links between all the characters who spoke in that scene

We're updating the weights as we go scene by scene so that each edge's weight ends up equal to the number of scenes in which both characters speak.

In [None]:
# Group the play by the Act/Scene and get how much each character spoke in that scene 
character_counts = single_play.groupby(['Act','Scene','Player']).size().reset_index()
character_counts = character_counts[character_counts["Player"].isin(top_characters["Player"])]
character_counts.rename(columns = {0: 'Count'}, inplace = True)
character_counts.head()

In [None]:
from itertools import combinations

# Visit the play one (act, scene) group at a time
for _, scene_rows in character_counts.groupby(['Act','Scene']):
    # Characters that speak in this scene
    speakers = scene_rows["Player"].tolist()
    # A scene with speakers [A,B,C] contributes the edges (A,B), (A,C), (B,C)
    for first, second in combinations(speakers, 2):
        if not character_graph.has_edge(first, second):
            # First scene these two share: create the edge with weight 1
            character_graph.add_edge(first, second, weight =1)
            continue
        # The pair already shared an earlier scene: bump the weight by one
        character_graph[first][second]['weight'] += 1

In [None]:
# Render the character network for the selected play
from networkx.drawing.nx_agraph import graphviz_layout

genre = "Comedy" if play_name in comedies else "Tragedy"
plt.figure(figsize = (8,6), dpi = 180)
plt.title("{} ({})".format(play_name, genre))

# Node positions computed by Graphviz's "neato" program
pos = graphviz_layout(character_graph, prog="neato")

# One weight per edge, in edge-iteration order. The raw (unscaled) weights
# are used for both the line width and the line colour.
weights = np.array([weight for _, _, weight in character_graph.edges(data='weight')])

kwargs = dict(
    with_labels=True,
    node_size=400,
    node_color='grey',
    font_size=8,
    font_weight='semibold',
    width=weights,
    edge_color=weights,
    edge_cmap=plt.cm.Blues,
    pos=pos,
)
nx.draw_networkx(character_graph, **kwargs)
plt.tight_layout()
plt.axis('off')
# plt.savefig("./visualizations/{}.png".format(play_name))
plt.show()
# plt.close()

### Save the character graph to a file for later

In [None]:
# Persist the graph so the classification cells can reload it by play title.
# nx.write_gpickle was removed in NetworkX 3.0; it was a thin wrapper around
# pickle.dump, so the standard library writes the same kind of .gpickle file.
import os
import pickle

# Create the output folder on first use instead of failing on a missing path
os.makedirs("./graphs", exist_ok=True)
with open("./graphs/{}.gpickle".format(play_name), "wb") as graph_file:
    pickle.dump(character_graph, graph_file, pickle.HIGHEST_PROTOCOL)

# *Repeat with all other plays...*

![SegmentLocal](networks.gif "segment")

# NetLSD: Calculate Heat Traces

The idea behind classifying these plays is that at some global and local levels, tragedies and comedies have different patterns of communication. This can be captured by simulating how the dialogue "flows" through the network. This uses the heat kernel and is directly analogous to modelling how heat diffuses throughout a system.

In [None]:
from sklearn.model_selection import train_test_split
# All non-history play titles, comedies first, plus a parallel array of
# one-letter genre codes ("c" / "t") in the same order
data = np.array(comedies+tragedies)
labels = np.repeat(["c", "t"], [len(comedies), len(tragedies)])

# Genre code -> (display name, plot colour)
lm = {"c": ("Comedy","blue"), "t": ("Tragedy","red")}

# Hold out a third of the plays for testing; the fixed seed keeps the split reproducible
priors, test_data, prior_labels, test_labels = train_test_split(
    data,
    labels,
    test_size=0.33,
    random_state=42,
)

## Calculate point of truth heat signature

In [None]:
import netlsd
import pickle

# Settings shared by every netlsd.heat call: 250 log-spaced timescales from
# 1e-2 to 1e2, and the "empty" normalization option.
kwargs = {
    "timescales": np.logspace(-2, 2, 250),
    "normalization": "empty"
}


def get_sig(title):
    """Return the NetLSD heat signature of the saved graph for play *title*.

    The graph is read from ``./graphs/<title>.gpickle`` and passed to
    ``netlsd.heat`` together with the shared ``kwargs``.

    Raises:
        FileNotFoundError: if no graph file has been saved for *title*.
    """
    # nx.read_gpickle was removed in NetworkX 3.0; the .gpickle files are
    # plain pickles, so the standard library can load them directly.
    # NOTE: unpickling can execute arbitrary code. Only load graph files that
    # this notebook produced itself.
    with open("./graphs/{}.gpickle".format(title), "rb") as graph_file:
        graph = pickle.load(graph_file)
    return netlsd.heat(graph, **kwargs)


# One heat signature per training play, in the same order as `priors`
prior_heat_sigs = [get_sig(title) for title in priors]

# Plot every training play's heat signature on a shared log-time axis
fig, ax = plt.subplots(figsize = (8,8))
ax.set_title("Observed Heat Signatures")
ax.set_xlabel("Time")
ax.set_ylabel('h(t)', fontsize = 15)
ax.set_xscale('log')
ax.grid()

# One curve per play, coloured according to its genre code
for sig, ptype in zip(prior_heat_sigs, prior_labels):
    _, curve_colour = lm[ptype]
    ax.plot(kwargs["timescales"], sig, color=curve_colour)

# Empty proxy lines give the legend exactly one entry per genre
for genre_name, genre_colour in lm.values():
    ax.plot([], [], label=genre_name, color=genre_colour)
ax.legend(loc="best")
plt.show()
# plt.savefig("./visualizations/training_heat_sigs.png")

### Predict Class of Test Data via KNN

In [None]:
from netlsd import compare as l2_distance

def knn_predict(title, training_heat_sigs, training_titles, labels_train,k=5):
    """Predict the genre label of play *title* by k-nearest-neighbour vote.

    Args:
        title: play whose saved graph is turned into a heat signature via
            ``get_sig`` (so it uses the same settings as the training data).
        training_heat_sigs: heat signatures of the training plays.
        training_titles: titles of the training plays, parallel to
            ``training_heat_sigs``.
        labels_train: genre labels of the training plays, parallel to
            ``training_heat_sigs``.
        k: number of nearest training plays that take part in the vote.

    Returns:
        The most frequent "Play Type" among the ``k`` closest training plays
        (the first entry returned by ``mode()``).

    Side effects:
        Prints *title* followed by the table of its ``k`` nearest neighbours.
    """
    query_sig = get_sig(title)
    # A dataframe keeps titles, labels and distances aligned while sorting
    ranked = pd.DataFrame({
        "Play Title": training_titles,
        "Play Type": labels_train,
        "Distance From Input": [l2_distance(query_sig, train_sig) for train_sig in training_heat_sigs],
    }).sort_values("Distance From Input")
    nearest = ranked.head(k)
    print(title)
    print(nearest)
    return nearest["Play Type"].mode()[0]


# Classify each held-out play against the training signatures, then show the
# predicted labels next to the true ones
pred = []
for play in test_data:
    pred.append(knn_predict(play, prior_heat_sigs, priors, prior_labels, k=5))
print(pred)
print(test_labels)

### Create confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
import os

classes = ["Comedy", "Tragedy"]
# NOTE: the predictions are passed as the first argument, so rows index the
# PREDICTED class and columns the TRUE class. That matches the axis labels set
# below and the cells that index cm for precision and recall.
# labels=[...] pins the matrix to 2x2 in comedy/tragedy order even when one
# class is missing from both the predictions and the test labels; without it
# the matrix would shrink and no longer line up with `classes`.
cm = confusion_matrix(pred,test_labels, labels=["c", "t"])
title = 'Shakespear Classification Confusion Matrix'

fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
ax.figure.colorbar(im, ax=ax)
# We want to show all ticks...
ax.set(xticks=np.arange(cm.shape[1]),
        yticks=np.arange(cm.shape[0]),
        # ... and label them with the respective list entries
        xticklabels=classes, yticklabels=classes,
        title=title,
        ylabel='Predicted label',
        xlabel='True label')

# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
            rotation_mode="anchor")

# Write each count into its cell; switch to white text on the darker cells
# (counts above half the maximum) so the number stays readable.
thresh = cm.max() / 2.
fmt = 'd'
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(j, i, format(cm[i, j], fmt),
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black")
fig.tight_layout()
# Create the output folder on first use so savefig cannot fail on a missing path
os.makedirs("./visualizations", exist_ok=True)
plt.savefig("./visualizations/confusion_mat.png")

In [None]:
# Binary problem, so the metrics can be read straight off the matrix.
# Comedy is treated as the positive class and tragedy as the negative one.
# cm was built with the predictions first, so the first index is the
# predicted class and the second index is the true class.

# Predicted comedy, actually a comedy
true_positives = cm[0, 0]
# Predicted comedy, actually a tragedy
false_positives = cm[0, 1]
# Predicted tragedy, actually a comedy
false_negatives = cm[1, 0]

In [None]:
# Precision = TP / (TP + FP): the share of predicted comedies that really are comedies.
# If nothing was predicted as a comedy the ratio is undefined; report 0.0
# instead of dividing by zero.
predicted_positives = true_positives + false_positives
precision = true_positives / predicted_positives if predicted_positives else 0.0
precision

In [None]:
# Recall = TP / (TP + FN): the share of actual comedies that were predicted as comedies.
# If the test set holds no comedies the ratio is undefined; report 0.0
# instead of dividing by zero.
actual_positives = true_positives + false_negatives
recall = true_positives / actual_positives if actual_positives else 0.0
recall

In [None]:
# F1 score: harmonic mean of precision and recall.
# When both are zero the harmonic mean is undefined; report 0.0 instead of
# dividing by zero.
precision_plus_recall = precision + recall
f_score = 2*((precision*recall)/precision_plus_recall) if precision_plus_recall else 0.0
f_score