In [1]:
import pandas as pd
import numpy as np
from sys import stdout
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


Read in csv

In [2]:
df = pd.read_csv("../data/Shakespeare_data.csv")
df

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"
...,...,...,...,...,...,...
111391,111392,A Winters Tale,38.0,5.3.180,LEONTES,"Lead us from hence, where we may leisurely"
111392,111393,A Winters Tale,38.0,5.3.181,LEONTES,Each one demand an answer to his part
111393,111394,A Winters Tale,38.0,5.3.182,LEONTES,Perform'd in this wide gap of time since first
111394,111395,A Winters Tale,38.0,5.3.183,LEONTES,We were dissever'd: hastily lead away.


Remove NaN values

In [3]:
df = df.dropna()
df

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"
5,6,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils
6,7,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.
7,8,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil
...,...,...,...,...,...,...
111390,111391,A Winters Tale,38.0,5.3.179,LEONTES,"Is troth-plight to your daughter. Good Paulina,"
111391,111392,A Winters Tale,38.0,5.3.180,LEONTES,"Lead us from hence, where we may leisurely"
111392,111393,A Winters Tale,38.0,5.3.181,LEONTES,Each one demand an answer to his part
111393,111394,A Winters Tale,38.0,5.3.182,LEONTES,Perform'd in this wide gap of time since first


In [28]:
unique_players = df["Player"].unique()
len(unique_players)

934

# One-hot encoding for plays
I chose to use one-hot encoding so that I could run these features through a random forest model. Given that there are 934 different players, I think a Random Forest Classifier will achieve the best performance compared to a nearest neighbors approach or an SVM.

In [4]:
unique_plays = df['Play'].unique()
print(unique_plays)
len(unique_plays)

['Henry IV' 'Henry VI Part 1' 'Henry VI Part 2' 'Henry VI Part 3'
 'Alls well that ends well' 'As you like it' 'Antony and Cleopatra'
 'A Comedy of Errors' 'Coriolanus' 'Cymbeline' 'Hamlet' 'Henry V'
 'Henry VIII' 'King John' 'Julius Caesar' 'King Lear' 'Loves Labours Lost'
 'macbeth' 'Measure for measure' 'Merchant of Venice'
 'Merry Wives of Windsor' 'A Midsummer nights dream'
 'Much Ado about nothing' 'Othello' 'Pericles' 'Richard II' 'Richard III'
 'Romeo and Juliet' 'Taming of the Shrew' 'The Tempest' 'Timon of Athens'
 'Titus Andronicus' 'Troilus and Cressida' 'Twelfth Night'
 'Two Gentlemen of Verona' 'A Winters Tale']


36

In [5]:
act_scene_line_cols = ['Act','Scene','Line']
cols = list(unique_plays)
cols.extend(act_scene_line_cols)
cols.append('PlayerLinenumber')
cols

['Henry IV',
 'Henry VI Part 1',
 'Henry VI Part 2',
 'Henry VI Part 3',
 'Alls well that ends well',
 'As you like it',
 'Antony and Cleopatra',
 'A Comedy of Errors',
 'Coriolanus',
 'Cymbeline',
 'Hamlet',
 'Henry V',
 'Henry VIII',
 'King John',
 'Julius Caesar',
 'King Lear',
 'Loves Labours Lost',
 'macbeth',
 'Measure for measure',
 'Merchant of Venice',
 'Merry Wives of Windsor',
 'A Midsummer nights dream',
 'Much Ado about nothing',
 'Othello',
 'Pericles',
 'Richard II',
 'Richard III',
 'Romeo and Juliet',
 'Taming of the Shrew',
 'The Tempest',
 'Timon of Athens',
 'Titus Andronicus',
 'Troilus and Cressida',
 'Twelfth Night',
 'Two Gentlemen of Verona',
 'A Winters Tale',
 'Act',
 'Scene',
 'Line',
 'PlayerLinenumber']

Use dictionary for faster one-hot encoding of play names. I know pandas has their own function to get one-hot encoding values but it still requires you to rebuild the dataframe so this seemed just as simple as that.

In [6]:
play_dictionary = dict()
for i in range(len(unique_plays)):
    play_dictionary[unique_plays[i]] = i

## Create new dataframe that can be fed into a classifier. 
In general, machine learning algorithms do not handle long text well (except when the model is explictly designed for processing it). In this case, I expect that the player lines are unique enough to where one-hot encoding them will not be useful so I chose to drop the column. However, if your sole goal was to make a classifier on these plays the best model would probably be some dictionary on the player lines and then handling any overlaps by differentiating between play and act/scene/line.

In [7]:
act_col = 36
scene_col = 37
line_col = 38
playerlinenumber_col = 39
rows = []
for index,row in df.iterrows():
    r = [0]*40
    play = play_dictionary[row['Play']]
    r[play] = 1
    act,scene,line = row['ActSceneLine'].split('.')
    r[act_col] = act
    r[scene_col] = scene
    r[line_col] = line
    r[playerlinenumber_col] = row['PlayerLinenumber']
    rows.append(r)
    if index % 1000 == 0:
        stdout.write("\r%d" % len(rows))
        stdout.flush()
data = pd.DataFrame(rows,columns=cols)
data

104772

Unnamed: 0,Henry IV,Henry VI Part 1,Henry VI Part 2,Henry VI Part 3,Alls well that ends well,As you like it,Antony and Cleopatra,A Comedy of Errors,Coriolanus,Cymbeline,...,Timon of Athens,Titus Andronicus,Troilus and Cressida,Twelfth Night,Two Gentlemen of Verona,A Winters Tale,Act,Scene,Line,PlayerLinenumber
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1.0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,2,1.0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,3,1.0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,4,1.0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,5,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105147,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,5,3,179,38.0
105148,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,5,3,180,38.0
105149,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,5,3,181,38.0
105150,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,5,3,182,38.0


In [8]:
players = pd.get_dummies(df['Player'], prefix='player')
players

Unnamed: 0,player_A Lord,player_A Patrician,player_A Player,player_AARON,player_ABERGAVENNY,player_ABHORSON,player_ABRAHAM,player_ACHILLES,player_ADAM,player_ADRIAN,...,player_Wife,player_YORK,player_YOUNG CLIFFORD,player_YOUNG SIWARD,player_Young LUCIUS,player_Young MARCIUS,player_of BUCKINGHAM,player_of King Henry VI,player_of Prince Edward,player_of young Princes
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111390,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
111391,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
111392,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
111393,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Save data to prevent needing to reprocess data

In [29]:
data.to_csv(r'../data/modified_shakespeare_data.csv', index = None, header=True)

## Create Training and Testing data
A train-test split of 75/25 was used. Since our dataset is explicitly trying to guess what player said what line and we have a complete list of all the lines for the plays we're working with it seemed the model would perform best if no validation set was used. If the goal was to design a model that could be fed some data and then generalize over parts of plays it may have not seen I would use a validation set.

In [9]:
labels = np.array(players)
features = np.array(data)

In [10]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25)

In [11]:
rf = RandomForestClassifier(n_estimators = 30, verbose=2)

In [12]:
rf.fit(train_features, train_labels)

building tree 1 of 30


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.3s remaining:    0.0s


building tree 2 of 30
building tree 3 of 30
building tree 4 of 30
building tree 5 of 30
building tree 6 of 30
building tree 7 of 30
building tree 8 of 30
building tree 9 of 30
building tree 10 of 30
building tree 11 of 30
building tree 12 of 30
building tree 13 of 30
building tree 14 of 30
building tree 15 of 30
building tree 16 of 30
building tree 17 of 30
building tree 18 of 30
building tree 19 of 30
building tree 20 of 30
building tree 21 of 30
building tree 22 of 30
building tree 23 of 30
building tree 24 of 30
building tree 25 of 30
building tree 26 of 30
building tree 27 of 30
building tree 28 of 30
building tree 29 of 30
building tree 30 of 30


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 12.8min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=None, verbose=2,
            warm_start=False)

In [15]:
predictions = rf.predict(test_features)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  4.3min finished


In [27]:
errors = []
for i in range(len(predictions)):
    prediction = predictions[i]
    label = test_labels[i]
    expected = np.where(label == 1)
    predicted = prediction[expected]
    if predicted == 1:
        errors.append(0)
    else:
        errors.append(1)
print("Number of test samples:", len(test_labels))
print("Accuracy:", str(round(1 -sum(errors)/len(errors),3)*100)+"%")

Number of test samples: 26288
Accuracy: 78.60000000000001%


# Results
An accuracy of 78.6% on a model with over 900 classes is pretty good. By just using the play, act, scene, line, and player line number a lot of the data was differntiable by the random forest classifier.