In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
from pylab import rcParams

from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB

import base64 
from facets_overview.generic_feature_statistics_generator import GenericFeatureStatisticsGenerator
from IPython.core.display import display, HTML

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from IPython.core.interactiveshell import InteractiveShell
# Show the result of every top-level expression in a cell, not only the last one
# (this is why cells below can display e.g. both df.head() and a summary).
InteractiveShell.ast_node_interactivity = "all"
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Import data to pandas dataframe

In [2]:
# Read the raw Shakespeare lines into a dataframe, preview the first rows,
# and count the missing values in each column.
df = pd.read_csv('../data/raw/Shakespeare_data.csv')
df.head()
df.isna().sum()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


Dataline               0
Play                   0
PlayerLinenumber       3
ActSceneLine        6243
Player                 7
PlayerLine             0
dtype: int64

# Clean Dataframe
Remove the rows that have null values.

In [3]:
# Keep only rows in which every column is populated, then summarise the result.
df = df.dropna(axis=0, how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105152 entries, 3 to 111394
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Dataline          105152 non-null  int64  
 1   Play              105152 non-null  object 
 2   PlayerLinenumber  105152 non-null  float64
 3   ActSceneLine      105152 non-null  object 
 4   Player            105152 non-null  object 
 5   PlayerLine        105152 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 5.6+ MB


# Feature Engineering
Now that the dataframe is free of null values, I will one-hot encode the play and label-encode the players to prepare a dataset for a classification model.

In [4]:
# Label-encode the speaker: each distinct Player value becomes an integer category code.
player_codes = df['Player'].astype('category').cat.codes
# One indicator column per distinct Play value.
one_hot = pd.get_dummies(df['Play'])
# Swap in the encoded Player column and append the indicator columns side by side.
df = pd.concat([df.assign(Player=player_codes), one_hot], axis=1)
df.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine,A Comedy of Errors,A Midsummer nights dream,A Winters Tale,Alls well that ends well,...,Richard III,Romeo and Juliet,Taming of the Shrew,The Tempest,Timon of Athens,Titus Andronicus,Troilus and Cressida,Twelfth Night,Two Gentlemen of Verona,macbeth
3,4,Henry IV,1.0,1.1.1,457,"So shaken as we are, so wan with care,",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Henry IV,1.0,1.1.2,457,"Find we a time for frighted peace to pant,",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,6,Henry IV,1.0,1.1.3,457,And breathe short-winded accents of new broils,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,7,Henry IV,1.0,1.1.4,457,To be commenced in strands afar remote.,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,8,Henry IV,1.0,1.1.5,457,No more the thirsty entrance of this soil,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**For PlayerLine, I will add a column called PlayerLineLength containing the length of the player line string.**

In [5]:
# Character count of each spoken line, stored as a new numeric feature column.
df = df.assign(PlayerLineLength=df['PlayerLine'].str.len())
df.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine,A Comedy of Errors,A Midsummer nights dream,A Winters Tale,Alls well that ends well,...,Romeo and Juliet,Taming of the Shrew,The Tempest,Timon of Athens,Titus Andronicus,Troilus and Cressida,Twelfth Night,Two Gentlemen of Verona,macbeth,PlayerLineLength
3,4,Henry IV,1.0,1.1.1,457,"So shaken as we are, so wan with care,",0,0,0,0,...,0,0,0,0,0,0,0,0,0,38
4,5,Henry IV,1.0,1.1.2,457,"Find we a time for frighted peace to pant,",0,0,0,0,...,0,0,0,0,0,0,0,0,0,42
5,6,Henry IV,1.0,1.1.3,457,And breathe short-winded accents of new broils,0,0,0,0,...,0,0,0,0,0,0,0,0,0,46
6,7,Henry IV,1.0,1.1.4,457,To be commenced in strands afar remote.,0,0,0,0,...,0,0,0,0,0,0,0,0,0,39
7,8,Henry IV,1.0,1.1.5,457,No more the thirsty entrance of this soil,0,0,0,0,...,0,0,0,0,0,0,0,0,0,41


__Drop the unwanted columns to prepare the target and feature sets.__

In [6]:
# Remove the raw identifier/text columns; what remains is the encoded Player,
# PlayerLinenumber, the per-play indicator columns and PlayerLineLength.
df = df.drop(columns=['Dataline', 'Play', 'ActSceneLine', 'PlayerLine'])
df.describe()

Unnamed: 0,PlayerLinenumber,Player,A Comedy of Errors,A Midsummer nights dream,A Winters Tale,Alls well that ends well,Antony and Cleopatra,As you like it,Coriolanus,Cymbeline,...,Romeo and Juliet,Taming of the Shrew,The Tempest,Timon of Athens,Titus Andronicus,Troilus and Cressida,Twelfth Night,Two Gentlemen of Verona,macbeth,PlayerLineLength
count,105152.0,105152.0,105152.0,105152.0,105152.0,105152.0,105152.0,105152.0,105152.0,105152.0,...,105152.0,105152.0,105152.0,105152.0,105152.0,105152.0,105152.0,105152.0,105152.0,105152.0
mean,36.765872,442.556718,0.018706,0.020561,0.031992,0.027817,0.033903,0.025458,0.035834,0.03572,...,0.029443,0.025078,0.021768,0.023775,0.024336,0.0332,0.023613,0.021245,0.022729,38.983833
std,39.989659,249.109112,0.135486,0.141909,0.175979,0.164449,0.180981,0.157513,0.185877,0.185591,...,0.169046,0.156363,0.145927,0.152349,0.154091,0.179158,0.151842,0.144202,0.149039,11.32345
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,10.0,225.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0
50%,25.0,455.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0
75%,50.0,638.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0
max,405.0,933.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,842.0


# Select target and feature columns

In [7]:
# Features are every column except the encoded Player; Player itself is the target.
feature = df.drop(columns='Player')
target = df['Player']
feature.head()

Unnamed: 0,PlayerLinenumber,A Comedy of Errors,A Midsummer nights dream,A Winters Tale,Alls well that ends well,Antony and Cleopatra,As you like it,Coriolanus,Cymbeline,Hamlet,...,Romeo and Juliet,Taming of the Shrew,The Tempest,Timon of Athens,Titus Andronicus,Troilus and Cressida,Twelfth Night,Two Gentlemen of Verona,macbeth,PlayerLineLength
3,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,38
4,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,42
5,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,46
6,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,39
7,1.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,41


# Split dataset to test/train and use a DecisionTree classifier
Split the dataset randomly into 70% train and 30% test. Then we will use a decision tree classifier with the Gini criterion, up to depth 5.

In [8]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    feature,
    target,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Fit a depth-limited decision tree (Gini impurity) on the training split.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split, so an unseeded tree can differ from run to run.
model = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=1)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print('\nTest Results')
# Score the predictions already computed above; model.score(x_test, y_test)
# would run predict on the whole test set a second time for the same number.
print('Accuracy: {}'.format(accuracy_score(y_test, y_pred)))

# Draw the fitted tree, with nodes coloured by plot_tree's `filled` option.
plt.figure(figsize=(10, 8), dpi=80, facecolor='w', edgecolor='k')
plot_tree(model, filled=True)
plt.show();


Test Results
Accuracy: 0.07420909148544982


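__This is a really bad result, with only about 7% accuracy. This is partly expected: a tree of depth 5 has at most 32 leaves, so it can predict at most 32 of the 934 encoded players. Now, let us try a Naive Bayes classifier.__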

# Naive Bayes Classifier

In [None]:
# Fit a Gaussian Naive Bayes classifier on the same train/test split.
model = GaussianNB()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print('\nTest Results')
# Score the predictions already computed above; model.score(x_test, y_test)
# would run predict on the whole test set a second time for the same number.
print('Accuracy: {}'.format(accuracy_score(y_test, y_pred)))

__The result is better than the decision tree classifier's, but it is still not acceptable. I tried to run the SVM classifier, but the model does not complete training at all.__