In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer

csvPath = "data/"
file = "Shakespeare_data.csv"

df = pd.DataFrame(pd.read_csv(csvPath + file))
# data cleaning, discard rows that have no player
df = df[df['Player'].notna()]

# only focus on Play Hamlet
df_Hamlet = df[df['Play']=='Hamlet']

cnt_M = 0
train = pd.DataFrame()
test = pd.DataFrame()
label = {}
# focusing on players who have more than 100 lines
# fix the random seed
np.random.seed(27)
for i in df_Hamlet['Player'].unique():
    temp = df_Hamlet[df_Hamlet['Player']==i]
    if temp.shape[0] > 100:
        cnt_M += 1
        label[i] = len(label)
        for r in range(temp.shape[0]):
            dice = np.random.random_sample()
            if dice > 0.3:
                train = train.append(temp.iloc[r])
            else:
                test = test.append(temp.iloc[r])


In [2]:
# visualize
print("label")
print(label)
print("\nTrain:"+str(train.shape))
print(train.iloc[0])


label
{'HORATIO': 0, 'KING CLAUDIUS': 1, 'LAERTES': 2, 'LORD POLONIUS': 3, 'HAMLET': 4, 'QUEEN GERTRUDE': 5, 'OPHELIA': 6}

Train:(2388, 6)
ActSceneLine                         1.1.16
Dataline                              32452
Play                                 Hamlet
Player                              HORATIO
PlayerLine          Friends to this ground.
PlayerLinenumber                         13
Name: 32451, dtype: object


In [3]:
# using Naive Bayes
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
import re

# training, using the single word to predict the player
words = []
player = []
for r in range(train.shape[0]):
    player_line = train.iloc[r]['PlayerLine']
    player_line = player_line.lower()
    t = re.findall(r'\w+', player_line)
    p = train.iloc[r]['Player']
    for w in t:
        words.append(w)
        player.append(p)
        
word_map = {}
words_encoded = []
for w in words:
    if w not in word_map:
        word_map[w] = len(word_map)
    
    words_encoded.append(word_map[w])

player_map = {}
player_list = []
player_encoded = []
for w in player:
    if w not in player_map:
        player_map[w] = len(player_map)
        player_list.append(w)
    
    player_encoded.append(player_map[w])


X = np.array([words_encoded])
XX = np.transpose(X)
model = GaussianNB()

model.fit(XX,player_encoded)

# testing, 
# assign special prediction for previous unseen word as unknown
# then using voting to determine the final prediction
for r in range(test.shape[0]):
    player_line = test.iloc[r]['PlayerLine']
    player_line = player_line.lower()
    t = re.findall(r'\w+', player_line)
    gt = test.iloc[r]['Player']
    p = []
    for w in t:
        if w in word_map:
            p.append(model.predict([[word_map[w]]])[0])
        else:
            p.append(-1)
    print(p)

[0, 4, 4, 4, 4, 4, 4]
[4, 4, 4, 4]
[4, -1, 0, 4, 4, 4, 4, 4, -1, -1]
[4, -1, 4, -1, -1, 4, 4, -1]
[4, 4]
[4, -1, 0, 4, -1, 4, 4, 0]
[4, 4, 4]
[0, 4, 0, 4, 0, 4, 0, 4, 4]
[0, 4, 4, 4, 4, 4, 4, 4, -1, 0, 4]
[4, 4, 4, -1, 4, 4, 4, 4, -1]
[4, -1, 4, 4, 4, -1, 0]
[0, 4, 4, -1, 4, 4, 4, 4, 0]
[-1, 0, 4, 4, 4, 4, -1, -1]
[0, 4, 4, 4, 4, 4, -1]
[4, -1, 4, -1, 4, 4, 4, 4]
[-1, 4, 4, 4, 4, 4, -1, 4]
[4, -1, -1, 4, 4, -1]
[4, 4, -1, -1, 4, 4, 4, 4]
[4, 4, 4]
[0, 4, 4, -1, 4, 4, 4, 0, 4]
[0, 0, 4, 0, -1, 4, 4, 0]
[-1, 4, 4, 4, -1, 4, 0]
[0, 0, 4, 4, 4, 4]
[0, 4, -1, -1, 4, 0, 4]
[4, 0, 4, 0, 4, 4, 4, 4, 4]
[4, 4, 4, 4, 4, 4, 4, 0, -1]
[4, 4, -1, 4, -1]
[-1, 4, 4, 4, 4, 4, -1, 0, -1, 4]
[4, 4, 4]
[4, 4, 4, 0, 4]
[4, 4, 4, 0, 4, 4, 4, 4, 0]
[4, 4, -1, 0, 4, 4, -1, 4]
[4, 0, -1, 0, 4, 4, 4, -1]
[0, 4, -1, 4, 4, 4, 4, 4]
[4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
[4, 4, 4, 4, 4, 4, 4, 4, 4, 0]
[4, 4, 4, 4]
[4, 4, 4, 4, 4, 4, 4, -1]
[4, -1, 4, -1, 0, 4, 4, -1]
[4, 4, 4, 4, 4, 4, 4, 4]
[4, 4, 4, 4, 4, 4]
[4, 4, 4, 

[4, -1, 4, -1, 4, 4, -1, 4, 4]
[4, 4, 4, -1, -1, 4, 4]
[4, 4, 4, -1, 0, 4]
[0, 4]
[4, 0, 4, 4]
[4, 0, 4, 4, 4, 4, 4]
[4, 4, 4, -1, 4, 4, 4, 0, 4, 4, 4, -1]
[4, 4, 4, 0, 4, 4, 4, 4, 4]
[4, 4, 4, 4, 4, 4, 4, 4, 4]
[4, 4, 4, 4, 4, -1, 4, 4, 4]
[4, -1, 0, 0]
[-1, 4, 4, 4, 4, -1, 4, 4]
[4, 4, 4, 4, 0, 0, -1]
[4, 4, -1, 4, 4, 4]
[4, 4, 4, 0, 4, -1, 4, 4, 4, -1]
[4, 4, 0, 4, -1]
[4, 4, 0, 4, 4, 4, 4, 4, 0, 4]
[4, 4, -1, 0, 4, 4, 4, 4]
[4, -1, 4, -1, 4, 4, -1]
[4, 4, 4, 4, 0, 4, -1, 4, 4]
[0, 4, 4, 4, 4, 4, 4, -1]
[4, 4, -1, 4, 4, 4, 0, -1, 4, 4]
[4, 0, 4, 4, 4, 4, 4, 4, 4, 0]
[4, 4, 4, 0, 4, 0]
[4, 4, 4, 4, 4, 4, 4, 4, 4]
[4, 0, 4, 4, 4, -1, 4, -1, 4, 4]
[4, -1, 4, 4, 4, 4]
[4, 4, 4, 0, 4, 4, -1, 4]
[4, 4, 4, 4, 4, 0, 4, 4, 4, -1]
[0, 4, 0, 0, 4, 4, 4, 4, 4]
[4, 4, 4, 4, 0, 4, 4, 4, 4, 4]
[4, 4, 4, -1, 4, 4, 4, -1]
[0, -1, 4, 4, 4, 4, 4]
[4, 4, 4, 4, -1, 4, 4, 0]
[0, 4, 4, 4, 4, 4, 4, 4]
[4, 4, 4, 0, 4, 4, 4, 4]
[4, 4, 4, -1, 4, -1]
[0, 0, 4]
[4, 4, 4, 0, 4, 0]
[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 

[4, 4, 4, 4, 4, 4, 4, 4]
[4, 0, 4, 4, 0]
[4, 4, 4, 4, 4, 4, 0, 4, 0, 4]
[4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
[0, 4, 4, 4]
[4, 4, 4, 4, 4, 4, 4, 0, 4]
[4, 4, 4, 4, 4]
[-1, 4, 4, -1, 4, -1, 4]
[4, 4, 4, -1, 4, 4]
[0, 4, 0, 4, 4, -1, -1]
[4, 4, 0, 4, 4, -1, 0, 4]
[4, -1, 4, 0, 4, 4, 4, 4]
[4, 4, 4, 4, 4, 4, -1, 4]
[4, 4, 4, -1, -1, 4, 4, 4, 4]
[4, -1, 4, 4, 4, 0, 4, 0, -1]
[4, 4, 0, 4, 4, 4, 4, 4]
[4, 0, -1, 4, -1, 4, -1, 4]
[4, -1, 4, 4, -1, -1]
[4, -1, 4, 4, 4, 4]
[4, -1, 4, 4, -1, 4, 0, 4, 4]
[4, 4, 4, 4, 0, 4, 4, 4, 4, 4]
[4, 4, 4, 4, 4, 4]
[0, 4, 4, 4, 4, 4, 4, 4, 4]
[4, 4, 4, -1]
[4, -1, 4, 4, 4, 0, 4, 0, 4]
[4, 4, -1, 4]
[4, 4, 4, 4, 4]
[4, 4, -1, 4, 4, 4, 4, 4]
[0, 4, 4, 4, 4, 4, 4, 4, -1]
[4, 0, 4, 0, 4, 4, 4, 4]
[4, 4]
[4, 4, 4, -1, 4, 4, 4, 4]
[4, 4, 4, -1, 4, 4, 4, 4]
[4, 0, 4, 4, 4, -1, 4, 4, 4]
[4, 4, 4, -1, 4, 4, 4, 4, 4]
[0, 4, 4, 4, 4, 4, 4, 4, 0, 4]
[4, -1, 4, 0, 0, 4, 0, 4]
[4, -1, 4, 0, 4, 4, 4, 4]
[4, -1, 0, 4, 4, -1, 4, 4]
[4, 4, 4, -1, 4, 4, 4, -1]
[4, 4, 4, 4, 0, 4, 4, 4

In [4]:
# analysis
# most of the prediction is 0, means there is something wrong
# let's look at the training data

print(player_map)
for pl in set(player):
    print(pl + ":" + str(player.count(pl)))

# now the result is kind of interesting
# because the lines of HAMLET appears most in the play HAMLET, 
# the trained naive bayes classifier will predict most word are saied by HAMLET
# let's see if downsample can help in this case


{'HORATIO': 0, 'KING CLAUDIUS': 1, 'LAERTES': 2, 'LORD POLONIUS': 3, 'HAMLET': 4, 'QUEEN GERTRUDE': 5, 'OPHELIA': 6}
KING CLAUDIUS:3018
QUEEN GERTRUDE:746
LORD POLONIUS:1965
OPHELIA:926
LAERTES:995
HAMLET:8565
HORATIO:1527


In [5]:

# since we know that the player who said least words is QUEEN GERTRUDE, who said 892 words
# let's down sample the training set to make sure eveyone has only 800 words
cnt_words = {}
for i in range(len(words)):
    pl = player[i]
    w = words[i]
    if pl in cnt_words:
        cnt_words[pl].append(w)
    else:
        cnt_words[pl] = [w]
# fix the random seed
np.random.seed(188)
subset_words = []
subset_player = []   
for pl in cnt_words:
    choosen_words = np.random.choice(cnt_words[pl], 800)
    for w in choosen_words:
        subset_words.append(w)
        subset_player.append(pl)
        
subset_word_map = {}
subset_words_encoded = []
for w in subset_words:
    if w not in subset_word_map:
        subset_word_map[w] = len(subset_word_map)
    
    subset_words_encoded.append(subset_word_map[w])

subset_player_encoded = []
for w in subset_player:
    subset_player_encoded.append(player_map[w])

for r in player_map:
    print(r + ":" + str(subset_player.count(r)))
    
x = np.array([subset_words_encoded])
xx = np.transpose(x)
model = GaussianNB()

model.fit(xx,subset_player_encoded)

# testing
for r in range(test.shape[0]):
    player_line = test.iloc[r]['PlayerLine']
    player_line = player_line.lower()
    t = re.findall(r'\w+', player_line)
    gt = test.iloc[r]['Player']
    p = []
    for w in t:
        if w in subset_word_map:
            p.append(model.predict([[subset_word_map[w]]])[0])
        else:
            p.append(-1)
    print(p)
    
# it seems better, but the model is in favor of HORATIO nownot quite what we expected.
# maybe we could try different approach.

HORATIO:800
KING CLAUDIUS:800
LAERTES:800
LORD POLONIUS:800
HAMLET:800
QUEEN GERTRUDE:800
OPHELIA:800
[0, 0, 0, 5, 0, -1, 1]
[0, 5, 4, 2]
[0, -1, 0, 0, -1, 4, 0, 2, -1, -1]
[0, -1, 0, -1, -1, 0, 0, -1]
[0, 1]
[0, -1, 0, 1, -1, 0, 0, 0]
[0, 2, 0]
[4, -1, 0, 1, 1, 0, 0, 0, 0]
[0, 0, 0, -1, 0, 0, 5, 0, -1, 0, 0]
[0, 0, 1, -1, 0, 0, 0, 1, -1]
[0, -1, 0, 0, -1, -1, 0]
[2, 0, 0, -1, 0, -1, 1, 0, 0]
[-1, 0, 1, 0, -1, 0, -1, -1]
[0, 0, 5, -1, 0, 0, -1]
[0, -1, 0, -1, 0, 0, -1, -1]
[-1, 0, 0, 5, 0, 0, -1, -1]
[0, -1, -1, 2, 0, -1]
[1, 2, -1, -1, 0, 0, 5, 2]
[0, 0, 0]
[1, 0, 3, -1, 0, 2, 5, 0, 5]
[1, 1, 0, 1, -1, 0, 2, 1]
[-1, -1, 0, 0, -1, 0, 0]
[0, 1, 0, 0, 0, 2]
[0, 0, -1, -1, 0, 0, 0]
[0, 0, 0, 0, 0, 5, 0, 0, 0]
[-1, 0, 5, 0, 5, 0, 0, 0, -1]
[0, 2, -1, 0, -1]
[-1, 0, 1, 0, 5, 0, -1, 0, -1, -1]
[0, 0, 0]
[1, 0, -1, 0, 5]
[1, 2, -1, 0, 0, 5, 0, 0, 1]
[0, 0, -1, 0, 0, 5, -1, 2]
[0, 0, -1, 0, 5, -1, 0, -1]
[0, 0, -1, 0, 0, -1, 0, 5]
[2, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 5, 1, -1, 0, -1]

[2, 5, 0, 0, 0, 0, -1, 1, 1]
[1, 4, 2, 0, 5, 0, 0, -1]
[0, 0, -1, 0, 0, 1, 0, -1, 0, 0]
[0, 0, 2, 0, 5, 0, 4, 5, -1, 0]
[2, 0, 0, 1, 0, 0]
[0, 0, 2, 0, 0, 0, 0, 1, -1]
[0, 0, 0, 3, 0, -1, 0, -1, 1, 0]
[1, -1, 0, 0, 5, -1]
[0, 0, 5, 2, 0, 5, -1, 0]
[4, 0, 0, 2, 0, 1, 2, 0, 0, -1]
[1, 5, 0, 0, 0, 2, 0, 5, -1]
[0, 0, 0, 2, 0, 0, 5, 5, 0, -1]
[-1, 0, 0, -1, 0, 0, 0, -1]
[1, -1, 5, 0, 0, 0, 5]
[0, 1, -1, 0, -1, -1, -1, 0]
[0, 0, 0, 0, 5, 2, 5, 0]
[5, 0, -1, 1, 0, 2, 2, 0]
[1, 5, 0, -1, 0, -1]
[0, 0, 5]
[1, 2, 0, 0, 0, 0]
[5, 0, 0, 0, 0, 0, 0, 1, 0, 4, 0]
[5, 0, 0, 4, 0, -1, 0, -1, 0]
[2, 0, 1, 0, 0, 1, 0, 0, 0, 2]
[0, -1, 0, 2, 0, 0, -1, 0, 0]
[-1, 0, 0, 0, 1, 0, 0, 0, -1]
[-1]
[1, 0, 0, -1, 0, -1, 0, 0, 0]
[0, 0, 0, 5, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0]
[0, 5, -1, 0, 0, 0, -1, 0, -1]
[-1, 5, -1, -1, -1]
[-1, 2, 5, 0, 0, 0, 0, -1, 0, 0]
[-1, 2, 0, 0, 5]
[0, 2]
[-1, 0, 0, -1]
[0, 5, 0, 5, 2, -1, 1, 5]
[0, -1, 0, 0, 5, 0, 4, 0, 0]
[0, -1, 5]
[5, 0, 5, 0, 2]
[0, -1, 0, -1, 0, 0, -1]
[-1, 0, 2, 

[1, 0, 1, 0, 5, 0, -1, 3]
[1, 0, 0, -1, 0, 0, 2, 0]
[2, -1, -1, 0, -1, -1, 5]
[0, 1, 0, 0, 0, 5, 0, 0, 3, 4]
[5, 1, -1, 0]
[0, -1, 0, 5, 2, 0, 0, 0, 2]
[0, -1, 0, 5, 0, 5]
[0, -1, 0, 0, -1, 1, 5, 0, 0]
[0, -1, 2, -1, 0, 0, -1]
[1, 0, 0, 5, 0, 2, 0, 0, 0, 0, 0]
[-1, 0, -1, 0, 5, 5, 0, 0]
[-1, -1, 0, 0, -1, -1]
[0, 0, 2, -1, -1, -1, 0, 0, 1]
[0, 0, 0, -1, 5, 1, 5]
[0, 0, 0, 1, 5, 0, 0, 5, -1, 0]
[0, 1, 0, 5, 1, 0, 5, -1]
[0, 5, 5, 5, 0, 0, -1]
[5, 0, 0, -1]
[1, 0, 5, -1, 0, 0, 0, -1, 0, 1, 0, 5]
[1, 1, 1, 0, 0, 1, 0, 5, -1, 0, -1, 0]
[1, 0, 0, 1, 4, 5, 0, 0, 0, -1]
[-1, 0, 5, 5, 1, 0, -1, 0, -1, 0]
[-1, 0, 0, -1, -1, 0, 0, 0, 1, 0, 0]
[0, -1, 0, 0, 0, 0, -1, -1, 0]
[0, 0, -1, 0, 0, -1]
[0, 0, 0, -1]
[0, 2, 0, 0, 1, 1, 0, 0, 4, 0, 0]
[0, -1, -1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1]
[0, 0, 0, 2, 0, 0, 0, -1, 5, 0, 4]
[0, 2, -1, 0, -1, 0, 0]
[-1, 0, -1, 1, 2, 1, 0, 1, 0]
[0, -1]
[1, 2, 0, 0, -1]
[1, 0, 0, 2]
[1, 0, 1]
[-1, 0, -1]
[0, 1, 0, 0, 0, 1, 0, 0, 0, -1, 0]
[0, -1, 0, -1, 0, 5, 0, 2, 1]


In [6]:
# but how would it be better if we train 7 classifiers for 7 classes?
models = []
prediction = []
for pl in player_map:
    this_player_encoded = []
    for w in subset_player:
        if w == pl:
            this_player_encoded.append(1)
        else:
            this_player_encoded.append(0)
    this_model = GaussianNB()
    this_model.fit(xx,this_player_encoded)
    models.append(this_model)
    prediction.append(pl)

# testing
for r in range(test.shape[0]):
    player_line = test.iloc[r]['PlayerLine']
    player_line = player_line.lower()
    t = re.findall(r'\w+', player_line)
    gt = test.iloc[r]['Player']
    max_p = 0
    pred = ""
    for m_id in range(len(models)):
        m = models[m_id]
        p = 0
        for w in t:
            if w in subset_word_map:
                p += m.predict([[subset_word_map[w]]])[0]
        if p > max_p:
            max_p = p
            pred = prediction[models.index(m)]
    if pred == "":
        print("NC / " + gt)
    else:
        print(pred + " / " + gt)
    
# hmm, the result looks bad too...
# Because we are trying to find the relation between words to player, but some words in testing set 
# might not be in the training set.


NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
QUEEN GERTRUDE / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
QUEEN GERTRUDE / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
QUEEN GERTRUDE / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
QUEEN GERTRUDE / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
QUEEN GERTRUDE / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC / HORATIO
NC /

NC / HAMLET
NC / HAMLET
QUEEN GERTRUDE / HAMLET
NC / HAMLET
QUEEN GERTRUDE / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
QUEEN GERTRUDE / HAMLET
QUEEN GERTRUDE / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
QUEEN GERTRUDE / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
QUEEN GERTRUDE / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
QUEEN GERTRUDE / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
QUEEN GERTRUDE / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
QUEEN GERTRUDE / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
QUEEN GERTRUDE / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
NC / HAMLET
QUEEN GERTRUDE / HAMLET
NC / HAMLET
NC /

In [13]:
# using Decision Tree
from sklearn import tree
dt = tree.DecisionTreeClassifier()
dt.fit(xx,subset_player_encoded)

for r in range(test.shape[0]):
    player_line = test.iloc[r]['PlayerLine']
    player_line = player_line.lower()
    t = re.findall(r'\w+', player_line)
    gt = test.iloc[r]['Player']
    p = []
    for w in t:
        if w in word_map:
            p.append(dt.predict([[word_map[w]]])[0])
        else:
            p.append(-1)
    
    # get prediction
    cnt_pred = {}
    for it in p:
        if it in cnt_pred:
            cnt_pred[it] += 1
        else:
            cnt_pred[it] = 1
    max_k = 0
    pred = ""
    for k in cnt_pred:
        if cnt_pred[k] > max_k:
            max_k = cnt_pred[k]
            if k >= 0:
                pred = player_list[k]
            else:
                pred = ""
    
    if pred == "":
        print("NC / " + gt)
    else:
        print(pred + " / " + gt)


LORD POLONIUS / HORATIO
HAMLET / HORATIO
HORATIO / HORATIO
NC / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
LAERTES / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
NC / HORATIO
KING CLAUDIUS / HORATIO
NC / HORATIO
HORATIO / HORATIO
NC / HORATIO
OPHELIA / HORATIO
NC / HORATIO
HAMLET / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
NC / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
OPHELIA / HORATIO
HORATIO / HORATIO
NC / HORATIO
OPHELIA / HORATIO
KING CLAUDIUS / HORATIO
OPHELIA / HORATIO
OPHELIA / HORATIO
HORATIO / HORATIO
OPHELIA / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
OPHELIA / HORATIO
HORATIO / HORATIO
KING CLAUDIUS / HORATIO
NC / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
QUEEN GERTRUDE / HORATIO
OPHELIA / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
HAMLET / HORATIO
LORD POLONIUS / HORATIO
OPHELIA / HORATIO
HORATIO / HORATIO
LORD POLONIUS / HORATIO
LORD POLONIUS / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
HORA

HORATIO / HAMLET
HORATIO / HAMLET
LORD POLONIUS / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
LORD POLONIUS / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
OPHELIA / HAMLET
KING CLAUDIUS / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
LORD POLONIUS / HAMLET
HORATIO / HAMLET
LORD POLONIUS / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
NC / HAMLET
LORD POLONIUS / HAMLET
NC / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
LORD POLONIUS / HAMLET
OPHELIA / HAMLET
NC / HAMLET
OPHELIA / HAMLET
HORATIO / HAMLET
NC / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
LORD POLONIUS / HAMLET
NC / HAMLET
LAERTES / HAMLET
HORATIO / HAMLET
NC / HAMLET
LORD POLONIUS / HAMLET
HORATIO / HAMLET
QUEEN GERTRUDE / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
LORD POLONIUS / HAMLET
KING CLAUDIUS / HAMLET
OPHELIA / HAMLET
NC / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
HORATIO / 

In [14]:
# using Random Forest
from sklearn.ensemble import RandomForestClassifier


clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
clf = clf.fit(xx,subset_player_encoded)

for r in range(test.shape[0]):
    player_line = test.iloc[r]['PlayerLine']
    player_line = player_line.lower()
    t = re.findall(r'\w+', player_line)
    gt = test.iloc[r]['Player']
    p = []
    for w in t:
        if w in word_map:
            p.append(clf.predict([[word_map[w]]])[0])
        else:
            p.append(-1)
    
    # get prediction
    cnt_pred = {}
    for it in p:
        if it in cnt_pred:
            cnt_pred[it] += 1
        else:
            cnt_pred[it] = 1
    max_k = 0
    pred = ""
    for k in cnt_pred:
        if cnt_pred[k] > max_k:
            max_k = cnt_pred[k]
            if k >= 0:
                pred = player_list[k]
            else:
                pred = ""
    
    if pred == "":
        print("NC / " + gt)
    else:
        print(pred + " / " + gt)



LORD POLONIUS / HORATIO
HAMLET / HORATIO
HORATIO / HORATIO
NC / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
LORD POLONIUS / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
NC / HORATIO
LORD POLONIUS / HORATIO
NC / HORATIO
HORATIO / HORATIO
NC / HORATIO
OPHELIA / HORATIO
NC / HORATIO
HAMLET / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
NC / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
OPHELIA / HORATIO
KING CLAUDIUS / HORATIO
NC / HORATIO
OPHELIA / HORATIO
KING CLAUDIUS / HORATIO
OPHELIA / HORATIO
OPHELIA / HORATIO
NC / HORATIO
OPHELIA / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
LORD POLONIUS / HORATIO
HORATIO / HORATIO
KING CLAUDIUS / HORATIO
NC / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
QUEEN GERTRUDE / HORATIO
OPHELIA / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
OPHELIA / HORATIO
HAMLET / HORATIO
LORD POLONIUS / HORATIO
OPHELIA / HORATIO
HORATIO / HORATIO
LORD POLONIUS / HORATIO
LORD POLONIUS / HORATIO
HORATIO / HORATIO
HORATIO / HORATIO
HORATIO /

HORATIO / LORD POLONIUS
HORATIO / LORD POLONIUS
HORATIO / LORD POLONIUS
KING CLAUDIUS / LORD POLONIUS
KING CLAUDIUS / LORD POLONIUS
HORATIO / LORD POLONIUS
OPHELIA / LORD POLONIUS
HORATIO / LORD POLONIUS
LAERTES / LORD POLONIUS
OPHELIA / LORD POLONIUS
HORATIO / LORD POLONIUS
HORATIO / LORD POLONIUS
HORATIO / LORD POLONIUS
NC / LORD POLONIUS
HAMLET / LORD POLONIUS
HORATIO / LORD POLONIUS
HORATIO / LORD POLONIUS
HORATIO / LORD POLONIUS
LAERTES / LORD POLONIUS
KING CLAUDIUS / LORD POLONIUS
HORATIO / LORD POLONIUS
QUEEN GERTRUDE / LORD POLONIUS
OPHELIA / LORD POLONIUS
HORATIO / LORD POLONIUS
HORATIO / LORD POLONIUS
HORATIO / LORD POLONIUS
HORATIO / LORD POLONIUS
OPHELIA / LORD POLONIUS
HORATIO / LORD POLONIUS
OPHELIA / LORD POLONIUS
OPHELIA / LORD POLONIUS
NC / LORD POLONIUS
HORATIO / LORD POLONIUS
HORATIO / LORD POLONIUS
KING CLAUDIUS / LORD POLONIUS
LORD POLONIUS / LORD POLONIUS
HORATIO / LORD POLONIUS
OPHELIA / LORD POLONIUS
HORATIO / LORD POLONIUS
LORD POLONIUS / LORD POLONIUS
QUEEN GE

KING CLAUDIUS / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
HAMLET / HAMLET
NC / HAMLET
KING CLAUDIUS / HAMLET
LORD POLONIUS / HAMLET
HORATIO / HAMLET
KING CLAUDIUS / HAMLET
OPHELIA / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
QUEEN GERTRUDE / HAMLET
HORATIO / HAMLET
LORD POLONIUS / HAMLET
HORATIO / HAMLET
OPHELIA / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
NC / HAMLET
OPHELIA / HAMLET
KING CLAUDIUS / HAMLET
OPHELIA / HAMLET
OPHELIA / HAMLET
LORD POLONIUS / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
OPHELIA / HAMLET
HORATIO / HAMLET
HORATIO / HAMLET
LORD POLONIUS / HAMLET
HORATIO / HAMLET
LORD POLONIUS / HAMLET
HORATIO / HAMLET
NC / HAMLET
LORD POLONIUS / HAMLET
LORD POLONIUS / HAMLET
NC / HAMLET
QUEEN GERTRUDE / HAMLET
KING CLAUDIUS / HAMLET
LORD POLONIUS / HAMLET
HORATIO / HAMLET
OPHELIA / HAMLET
OPHELIA / HAMLET
NC / HAMLET
LORD POLONIUS / HAMLET
HORATIO / HAMLET
LORD POLONIUS / HAMLET
OPHELIA / HAMLET
OPHELIA / HAMLET
OPHELIA / HAMLET
HORA

In [15]:
# compare decision tree and random forest
from sklearn.model_selection import cross_val_score

test_x = []
test_y = []
for r in range(test.shape[0]):
    player_line = test.iloc[r]['PlayerLine']
    player_line = player_line.lower()
    t = re.findall(r'\w+', player_line)
    gt = test.iloc[r]['Player']
    for w in t:
        if w in subset_word_map:
            test_x.append(subset_word_map[w])
            test_y.append(player_map[gt])
            
a_test_x = np.array([test_x])
a_test_x = np.transpose(a_test_x)


scores = cross_val_score(dt, a_test_x, test_y, cv=5)
print("decision tree: " + str(scores.mean()))
scores = cross_val_score(clf, a_test_x, test_y, cv=5)
print("random forest: " + str(scores.mean()))


decision tree: 0.43645569720105176
random forest: 0.4459689759523224


In [16]:
# now let's focus on predict "HAMLET" or not

HAMLET_encoded = []
for w in player:
    if w == "HAMLET":
        HAMLET_encoded.append(1)
    else:
        HAMLET_encoded.append(0)

# using NB
model_NB = GaussianNB()
model_NB.fit(XX,HAMLET_encoded)

# using Decision tree
model_tree = tree.DecisionTreeClassifier()
model_tree.fit(XX,HAMLET_encoded)

# using random forest
model_clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
model_clf = clf.fit(XX,HAMLET_encoded)

# get test
test_X = []
test_Y = []
for r in range(test.shape[0]):
    player_line = test.iloc[r]['PlayerLine']
    player_line = player_line.lower()
    t = re.findall(r'\w+', player_line)
    gt = test.iloc[r]['Player']
    for w in t:
        if w in word_map:
            test_X.append(word_map[w])
            if gt == "HAMLET":
                test_Y.append(1)
            else:
                test_Y.append(0)

a_test_X = np.array([test_X])
a_test_X = np.transpose(a_test_X)
scores = cross_val_score(model_NB, a_test_X, test_Y, cv=5)
print("navie bayes: " + str(scores.mean()))
scores = cross_val_score(model_tree, a_test_X, test_Y, cv=5)
print("decision tree: " + str(scores.mean()))  
scores = cross_val_score(model_clf, a_test_X, test_Y, cv=5)
print("random forest: " + str(scores.mean()))  




navie bayes: 0.5104665010080174
decision tree: 0.5126305864212002
random forest: 0.5116213189137376
