In [1]:
# Daniel Bandala @ nov 2022
from collections import Counter

import numpy as np
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [2]:
with open('hamlet.txt', 'r') as file:
    hamlet = file.read().replace('\n', ' ').rstrip().lower()
with open('juliuscaesar.txt', 'r') as file:
    juliuscaesar = file.read().replace('\n', ' ').rstrip().lower()
with open('macbeth.txt', 'r') as file:
    macbeth = file.read().replace('\n', ' ').rstrip().lower()
with open('othello.txt', 'r') as file:
    othello = file.read().replace('\n', ' ').rstrip().lower()

In [3]:
# tokenization
hamlet_tok = word_tokenize(hamlet)
juliuscaesar_tok = word_tokenize(juliuscaesar)
macbeth_tok = word_tokenize(macbeth)
othello_tok = word_tokenize(othello)
print("Tokens document hamlet: ",len(hamlet_tok))
print("Tokens document juliuscaesar: ",len(juliuscaesar_tok))
print("Tokens document macbeth: ",len(macbeth_tok))
print("Tokens document othello: ",len(othello_tok))

Tokens document hamlet:  40485
Tokens document juliuscaesar:  26513
Tokens document macbeth:  23232
Tokens document othello:  36599


In [4]:
# stemming
stemmer = PorterStemmer()
hamlet_stem = [stemmer.stem(i) for i in hamlet_tok]
juliuscaesar_stem = [stemmer.stem(i) for i in juliuscaesar_tok]
macbeth_stem = [stemmer.stem(i) for i in macbeth_tok]
othello_stem = [stemmer.stem(i) for i in othello_tok]
print("Stems document hamlet: ",len(hamlet_stem))
print("Stems document juliuscaesar: ",len(juliuscaesar_stem))
print("Stems document macbeth: ",len(macbeth_stem))
print("Stems document othello: ",len(othello_stem))

Stems document hamlet:  40485
Stems document juliuscaesar:  26513
Stems document macbeth:  23232
Stems document othello:  36599


In [6]:
# retrieve bag of words
words_bag = sorted(set(hamlet_tok).union(set(juliuscaesar_tok)).union(set(macbeth_tok)).union(set(othello_tok)))
print("Global vocabulary size: ",len(words_bag))

Global vocabulary size:  8838


In [7]:
# conditioning
docs = [hamlet_tok,juliuscaesar_tok,macbeth_tok,othello_tok]
C = len(docs)
W = len(words_bag)

In [8]:
# number of occurrences
words_vec = np.array([[docs[j].count(words_bag[i]) for i in range(W)] for j in range(C)])
print("Global vocabulary size: ",words_vec.shape)

Global vocabulary size:  (4, 8838)


In [9]:
# calculate distance
print("Words bag distance between Hamlet and Julius Caesar: ", np.linalg.norm(words_vec[0]-words_vec[1]))
print("Words bag distance between Hamlet and Julius Macbeth: ", np.linalg.norm(words_vec[0]-words_vec[2]))
print("Words bag distance between Hamlet and Julius Othello: ", np.linalg.norm(words_vec[0]-words_vec[3]))
print("Words bag distance between Julius Caesar and Macbeth: ", np.linalg.norm(words_vec[1]-words_vec[2]))
print("Words bag distance between Julius Caesar and Othello: ", np.linalg.norm(words_vec[1]-words_vec[3]))
print("Words bag distance between Macbeth and Othello: ", np.linalg.norm(words_vec[2]-words_vec[3]))

Words bag distance between Hamlet and Julius Caesar:  1857.9499455044531
Words bag distance between Hamlet and Julius Macbeth:  2296.975620245021
Words bag distance between Hamlet and Julius Othello:  1492.5809860774725
Words bag distance between Julius Caesar and Macbeth:  972.3800697258248
Words bag distance between Julius Caesar and Othello:  1566.3799028332814
Words bag distance between Macbeth and Othello:  1920.467911734013
