# Welcome to Shakespeare classifier

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
df = pd.read_csv(Path("data/Shakespeare_data.csv"))

In [3]:
df.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [4]:
df.dtypes

Dataline              int64
Play                 object
PlayerLinenumber    float64
ActSceneLine         object
Player               object
PlayerLine           object
dtype: object

## We need to check for missing values 

We find instances where PlayerLinenumber is NaN and replace it with 0

In [5]:
df.loc[df['PlayerLinenumber'].isnull()]

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."


In [6]:
df['PlayerLinenumber'].fillna(0, inplace=True)

We do the same for ActSceneLine but we replace with the string "trans" for transition in the play

In [7]:
df.loc[df['ActSceneLine'].isnull()]

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,0.0,,,ACT I
1,2,Henry IV,0.0,,,SCENE I. London. The palace.
2,3,Henry IV,0.0,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
111,112,Henry IV,10.0,,WESTMORELAND,Exeunt
112,113,Henry IV,10.0,,WESTMORELAND,SCENE II. London. An apartment of the Prince's.
113,114,Henry IV,10.0,,WESTMORELAND,Enter the PRINCE OF WALES and FALSTAFF
214,215,Henry IV,33.0,,FALSTAFF,Enter POINS
264,265,Henry IV,52.0,,PRINCE HENRY,Exit Falstaff
299,300,Henry IV,61.0,,POINS,Exit Poins
323,324,Henry IV,62.0,,PRINCE HENRY,Exit


In [8]:
df['ActSceneLine'].fillna("trans", inplace=True)

Finally, for Player we replace missing values with OTHER

In [9]:
df.loc[df['Player'].isnull()]

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,0.0,trans,,ACT I
1,2,Henry IV,0.0,trans,,SCENE I. London. The palace.
2,3,Henry IV,0.0,trans,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
2015,2016,Henry IV,10.0,trans,,Eastcheap. The Boar's-Head Tavern.
2016,2017,Henry IV,10.0,trans,,Enter FALSTAFF and BARDOLPH
29459,29460,Cymbeline,7.0,2.2.55,,An ante-chamber adjoining Imogen's apartments.
29460,29461,Cymbeline,7.0,trans,,Enter CLOTEN and Lords


In [10]:
df['Player'].fillna("OTHER", inplace=True)

## Rename the columns for more consistency

In [11]:
df.rename(index=str, columns={"PlayerLinenumber": "LineNumber", "PlayerLine": "Line"}, inplace=True)
df.drop(columns=["Dataline"], inplace=True)

In [12]:
df.head()

Unnamed: 0,Play,LineNumber,ActSceneLine,Player,Line
0,Henry IV,0.0,trans,OTHER,ACT I
1,Henry IV,0.0,trans,OTHER,SCENE I. London. The palace.
2,Henry IV,0.0,trans,OTHER,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


## Create new columns for LineLength and MostCommonLetter

In [13]:
import collections
df['LineLength'] = df.apply(lambda row : len(row.Line), axis=1)
df['MostCommonLetter'] = df.apply(lambda row : ord(collections.Counter(row.Line.lower()).most_common(1)[0][0]), axis=1)

In [14]:
df.head()

Unnamed: 0,Play,LineNumber,ActSceneLine,Player,Line,LineLength,MostCommonLetter
0,Henry IV,0.0,trans,OTHER,ACT I,5,97
1,Henry IV,0.0,trans,OTHER,SCENE I. London. The palace.,28,101
2,Henry IV,0.0,trans,OTHER,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ...",96,32
3,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",38,32
4,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",42,32


## Create new columns for Polarity and Subjectivity of the sentiment in each line

In [15]:
from textblob import TextBlob

df['Polarity'] = df.apply(lambda row : TextBlob(row.Line).sentiment.polarity, axis=1)
df['Subjectivity'] = df.apply(lambda row : TextBlob(row.Line).sentiment.subjectivity, axis=1)

## Doing some more feature engineering by splitting the ActSceneLine column into three different columns by parsing the string

In [16]:
def acs_split(string, index):
    if string == "trans":
        return 0
    else:
        return (int(string.split(".")[index]))

df['Act'] = df.apply(lambda row : acs_split(row.ActSceneLine, 0), axis=1)
df['Scene'] = df.apply(lambda row : acs_split(row.ActSceneLine, 1), axis=1)
df['ACSLineNum'] = df.apply(lambda row : acs_split(row.ActSceneLine, 2), axis=1)

In [17]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

## Create new columns that contain the encoding of the string values

In [18]:
df["PlayENC"] = le.fit_transform(df["Play"].astype(str))
df["PlayerENC"] = le.fit_transform(df["Player"].astype(str))
df["LineENC"] = le.fit_transform(df["Line"].astype(str))
df["ACSEnc"] = le.fit_transform(df["ActSceneLine"].astype(str))

In [19]:
df

Unnamed: 0,Play,LineNumber,ActSceneLine,Player,Line,LineLength,MostCommonLetter,Polarity,Subjectivity,Act,Scene,ACSLineNum,PlayENC,PlayerENC,LineENC,ACSEnc
0,Henry IV,0.0,trans,OTHER,ACT I,5,97,0.000000,0.000000,0,0,0,9,617,2574,16122
1,Henry IV,0.0,trans,OTHER,SCENE I. London. The palace.,28,101,0.000000,0.000000,0,0,0,9,617,60108,16122
2,Henry IV,0.0,trans,OTHER,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ...",96,32,0.000000,0.000000,0,0,0,9,617,23218,16122
3,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",38,32,-0.200000,0.150000,1,1,1,9,457,63734,324
4,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",42,32,0.000000,0.000000,1,1,2,9,457,25781,435
5,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils,46,32,0.136364,0.454545,1,1,3,9,457,5120,546
6,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.,39,32,-0.100000,0.200000,1,1,4,9,457,77594,657
7,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil,41,32,-0.250000,0.500000,1,1,5,9,457,51386,768
8,Henry IV,1.0,1.1.6,KING HENRY IV,Shall daub her lips with her own children's bl...,50,32,0.600000,1.000000,1,1,6,9,457,61525,781
9,Henry IV,1.0,1.1.7,KING HENRY IV,"Nor more shall trenching war channel her fields,",48,32,0.500000,0.500000,1,1,7,9,457,52258,792


## Split the data into testing and training set

In [20]:
from sklearn.model_selection import train_test_split

X = df[['LineLength', 'MostCommonLetter', 'Polarity', 'Subjectivity', 'Act', 'Scene', 'ACSLineNum', 'ACSEnc', 'PlayENC','LineENC']]
Y = df['PlayerENC']
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = 0.2)

## Running two different classifiers, Random Forest and Desicion Tree 

In [21]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier()

In [22]:
random_forest.fit(X_train, Y_train)
rf_predict = random_forest.predict(X_test)

### We get a decent prediction accuracy with random forest of about 60.9%

In [23]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, rf_predict)

0.6092908438061041

### Surpisingly, we get a somewhat better prediction accuracy rate of 63.3% using a decision tree

In [24]:
from sklearn import tree
model = tree.DecisionTreeClassifier()
model.fit(X_train, Y_train)
dt_predict = model.predict(X_test)
accuracy_score(Y_test, dt_predict)

0.6326750448833034