# Welcome to Shakespeare play classification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
df = pd.read_csv(Path("data/Shakespeare_data.csv"))

In [3]:
df.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


## We can see we have NaN so we need to do some mvr

In [4]:
df.dtypes

Dataline              int64
Play                 object
PlayerLinenumber    float64
ActSceneLine         object
Player               object
PlayerLine           object
dtype: object

In [5]:
df.loc[df['PlayerLinenumber'].isnull()]

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."


In [6]:
df['PlayerLinenumber'].fillna(0, inplace=True)

In [7]:
df.loc[df['ActSceneLine'].isnull()]

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,0.0,,,ACT I
1,2,Henry IV,0.0,,,SCENE I. London. The palace.
2,3,Henry IV,0.0,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
111,112,Henry IV,10.0,,WESTMORELAND,Exeunt
112,113,Henry IV,10.0,,WESTMORELAND,SCENE II. London. An apartment of the Prince's.
113,114,Henry IV,10.0,,WESTMORELAND,Enter the PRINCE OF WALES and FALSTAFF
214,215,Henry IV,33.0,,FALSTAFF,Enter POINS
264,265,Henry IV,52.0,,PRINCE HENRY,Exit Falstaff
299,300,Henry IV,61.0,,POINS,Exit Poins
323,324,Henry IV,62.0,,PRINCE HENRY,Exit


In [8]:
df['ActSceneLine'].fillna("trans", inplace=True)

In [9]:
df.loc[df['ActSceneLine'].isnull()]

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine


In [10]:
df.loc[df['Player'].isnull()]

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,0.0,trans,,ACT I
1,2,Henry IV,0.0,trans,,SCENE I. London. The palace.
2,3,Henry IV,0.0,trans,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
2015,2016,Henry IV,10.0,trans,,Eastcheap. The Boar's-Head Tavern.
2016,2017,Henry IV,10.0,trans,,Enter FALSTAFF and BARDOLPH
29459,29460,Cymbeline,7.0,2.2.55,,An ante-chamber adjoining Imogen's apartments.
29460,29461,Cymbeline,7.0,trans,,Enter CLOTEN and Lords


In [11]:
df['Player'].fillna("OTHER", inplace=True)

In [12]:
df.loc[df['Player'].isnull()]

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine


In [13]:
df.loc[df['PlayerLine'].isnull()]

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine


In [14]:
df[df.isnull().any(axis=1)]

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine


## We can see we have filled all the missing values

In [15]:
df.rename(index=str, columns={"PlayerLinenumber": "LineNumber", "PlayerLine": "Line"}, inplace=True)

In [16]:
df.drop(columns=["Dataline"], inplace=True)

In [17]:
df.Player.nunique()

935

In [18]:
df['Player'] = df['Player'].astype(str)

In [19]:
df['LineLength'] = df.apply(lambda row : len(row.Line), axis=1)

In [20]:
df.head()

Unnamed: 0,Play,LineNumber,ActSceneLine,Player,Line,LineLength
0,Henry IV,0.0,trans,OTHER,ACT I,5
1,Henry IV,0.0,trans,OTHER,SCENE I. London. The palace.,28
2,Henry IV,0.0,trans,OTHER,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ...",96
3,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",38
4,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",42


In [21]:
import collections
df['MostCommonLetter'] = df.apply(lambda row : ord(collections.Counter(row.Line.lower()).most_common(1)[0][0]), axis=1)

In [22]:
df.head()

Unnamed: 0,Play,LineNumber,ActSceneLine,Player,Line,LineLength,MostCommonLetter
0,Henry IV,0.0,trans,OTHER,ACT I,5,97
1,Henry IV,0.0,trans,OTHER,SCENE I. London. The palace.,28,101
2,Henry IV,0.0,trans,OTHER,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ...",96,32
3,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",38,32
4,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",42,32


In [23]:
from sklearn import preprocessing

In [24]:
le = preprocessing.LabelEncoder()
le.fit(df['Player'])

LabelEncoder()

In [25]:
list(le.classes_)

['A Lord',
 'A Patrician',
 'A Player',
 'AARON',
 'ABERGAVENNY',
 'ABHORSON',
 'ABRAHAM',
 'ACHILLES',
 'ADAM',
 'ADRIAN',
 'ADRIANA',
 'ADRIANO DE ARMADO',
 'AEGEON',
 'AEMELIA',
 'AEMILIUS',
 'AENEAS',
 'AEdile',
 'AGAMEMNON',
 'AGRIPPA',
 'AJAX',
 'ALBANY',
 'ALCIBIADES',
 'ALENCON',
 'ALEXANDER',
 'ALEXAS',
 'ALICE',
 'ALL',
 'ALONSO',
 'AMIENS',
 'ANDROMACHE',
 'ANGELO',
 'ANGUS',
 'ANNE',
 'ANNE PAGE',
 'ANOTHER',
 'ANTIGONUS',
 'ANTIOCHUS',
 'ANTIPHOLUS',
 'ANTONIO',
 'ANTONY',
 'APEMANTUS',
 'ARCHBISHOP OF YORK',
 'ARCHIDAMUS',
 'ARIEL',
 'ARMADO',
 'ARRAGON',
 'ARTEMIDORUS',
 'ARTHUR',
 'ARVIRAGUS',
 'ATTENDANT',
 'AUDREY',
 'AUFIDIUS',
 'AUSTRIA',
 'AUTOLYCUS',
 'Abbot',
 'All',
 'All Citizens',
 'All Conspirators',
 'All Ladies',
 'All Lords',
 'All Servants',
 'All The Lords',
 'All The People',
 'All the Goths',
 'Apothecary',
 'Attendant',
 'Attendants',
 'BAGOT',
 'BALTHASAR',
 'BALTHAZAR',
 'BANQUO',
 'BAPTISTA',
 'BARDOLPH',
 'BARNARDINE',
 'BASSANIO',
 'BASSET',
 'BA

In [26]:
le.transform(df['Player'])

array([617, 617, 617, ..., 494, 494, 494], dtype=int64)

In [27]:
le2 = preprocessing.LabelEncoder()
le2.fit(df['Play'])
le2.transform(df['Play'])

array([9, 9, 9, ..., 2, 2, 2], dtype=int64)

In [28]:
df.head()

Unnamed: 0,Play,LineNumber,ActSceneLine,Player,Line,LineLength,MostCommonLetter
0,Henry IV,0.0,trans,OTHER,ACT I,5,97
1,Henry IV,0.0,trans,OTHER,SCENE I. London. The palace.,28,101
2,Henry IV,0.0,trans,OTHER,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ...",96,32
3,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",38,32
4,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",42,32


In [29]:
X = df[['LineNumber', 'LineLength', 'MostCommonLetter']]
Y = df['Player']

In [30]:
from sklearn.model_selection import train_test_split
import time
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = 0.2)

In [31]:
from sklearn import tree
model = tree.DecisionTreeClassifier()

In [32]:
model.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [33]:
y_predict = model.predict(X_test)
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, y_predict)

0.043536804308797125

In [34]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [35]:
mNB_predict = clf.predict(X_test)
accuracy_score(Y_test, mNB_predict)

0.008393177737881508

In [36]:
df.dtypes

Play                 object
LineNumber          float64
ActSceneLine         object
Player               object
Line                 object
LineLength            int64
MostCommonLetter      int64
dtype: object

In [37]:
df.head()

Unnamed: 0,Play,LineNumber,ActSceneLine,Player,Line,LineLength,MostCommonLetter
0,Henry IV,0.0,trans,OTHER,ACT I,5,97
1,Henry IV,0.0,trans,OTHER,SCENE I. London. The palace.,28,101
2,Henry IV,0.0,trans,OTHER,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ...",96,32
3,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",38,32
4,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",42,32


In [38]:
from textblob import TextBlob

In [39]:
df['Polarity'] = df.apply(lambda row : TextBlob(row.Line).sentiment.polarity, axis=1)
df['Subjectivity'] = df.apply(lambda row : TextBlob(row.Line).sentiment.subjectivity, axis=1)

In [40]:
df.head()

Unnamed: 0,Play,LineNumber,ActSceneLine,Player,Line,LineLength,MostCommonLetter,Polarity,Subjectivity
0,Henry IV,0.0,trans,OTHER,ACT I,5,97,0.0,0.0
1,Henry IV,0.0,trans,OTHER,SCENE I. London. The palace.,28,101,0.0,0.0
2,Henry IV,0.0,trans,OTHER,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ...",96,32,0.0,0.0
3,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",38,32,-0.2,0.15
4,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",42,32,0.0,0.0


In [41]:
"1.1.1".split(".")

['1', '1', '1']

In [42]:
def acs_split(string, index):
    if string == "trans":
        return 0
    else:
        return (int(string.split(".")[index]))

In [43]:
df['Act'] = df.apply(lambda row : acs_split(row.ActSceneLine, 0), axis=1)
df['Scene'] = df.apply(lambda row : acs_split(row.ActSceneLine, 1), axis=1)
df['ACSLineNum'] = df.apply(lambda row : acs_split(row.ActSceneLine, 2), axis=1)

In [44]:
df.head()

Unnamed: 0,Play,LineNumber,ActSceneLine,Player,Line,LineLength,MostCommonLetter,Polarity,Subjectivity,Act,Scene,ACSLineNum
0,Henry IV,0.0,trans,OTHER,ACT I,5,97,0.0,0.0,0,0,0
1,Henry IV,0.0,trans,OTHER,SCENE I. London. The palace.,28,101,0.0,0.0,0,0,0
2,Henry IV,0.0,trans,OTHER,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ...",96,32,0.0,0.0,0,0,0
3,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",38,32,-0.2,0.15,1,1,1
4,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",42,32,0.0,0.0,1,1,2


In [45]:
X = df[['LineNumber', 'LineLength', 'MostCommonLetter', 'Polarity', 'Subjectivity','Act','Scene','ACSLineNum']]
Y = df['Player']
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = 0.2)

In [46]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier()

In [47]:
random_forest.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [48]:
rf_predict = random_forest.predict(X_test)
accuracy_score(Y_test, y_predict)

0.004533213644524237

In [49]:
le3 = preprocessing.LabelEncoder()
le3.fit(df['Line'])
le3.transform(df['Line'])

array([ 2574, 60108, 23218, ..., 58003, 82723, 24488], dtype=int64)

In [50]:
X = df[['LineNumber', 'Line', 'LineLength']]
Y = df['Player']
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = 0.2)
random_forest2 = RandomForestClassifier()

In [51]:
random_forest2.fit(X_train, Y_train)
rf_predict = random_forest2.predict(X_test)
accuracy_score(Y_test, y_predict)

ValueError: could not convert string to float: 'From the inward of thee? One, but painted thus,'

In [None]:
df.head()

In [None]:
df['LineENC'] = le3.transform(df['Line'])

In [None]:
df.head()

In [None]:
df['PlayerENC'] = le.transform(df['Player'])
df['PlayENC'] = le2.transform(df['Play'])

In [None]:
df.head()

In [None]:
df