In [1]:
# Daniel Bandala @ nov 2022
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [2]:
with open('hamlet.txt', 'r') as file:
    hamlet = file.read().replace('\n', ' ').rstrip().lower()
with open('juliuscaesar.txt', 'r') as file:
    juliuscaesar = file.read().replace('\n', ' ').rstrip().lower()
with open('macbeth.txt', 'r') as file:
    macbeth = file.read().replace('\n', ' ').rstrip().lower()
with open('othello.txt', 'r') as file:
    othello = file.read().replace('\n', ' ').rstrip().lower()

In [3]:
# tokenization
hamlet_tok = word_tokenize(hamlet)
juliuscaesar_tok = word_tokenize(juliuscaesar)
macbeth_tok = word_tokenize(macbeth)
othello_tok = word_tokenize(othello)
print("Tokens document hamlet: ",len(hamlet_tok))
print("Tokens document juliuscaesar: ",len(juliuscaesar_tok))
print("Tokens document macbeth: ",len(macbeth_tok))
print("Tokens document othello: ",len(othello_tok))

Tokens document hamlet:  40485
Tokens document juliuscaesar:  26513
Tokens document macbeth:  23232
Tokens document othello:  36599


In [4]:
# stemming
stemmer = PorterStemmer()
hamlet_stem = [stemmer.stem(i) for i in hamlet_tok]
juliuscaesar_stem = [stemmer.stem(i) for i in juliuscaesar_tok]
macbeth_stem = [stemmer.stem(i) for i in macbeth_tok]
othello_stem = [stemmer.stem(i) for i in othello_tok]
print("Stems document hamlet: ",len(hamlet_stem))
print("Stems document juliuscaesar: ",len(juliuscaesar_stem))
print("Stems document macbeth: ",len(macbeth_stem))
print("Stems document macbeth: ",len(othello_stem))

Stems document hamlet:  40485
Stems document juliuscaesar:  26513
Stems document macbeth:  23232
Stems document macbeth:  36599


In [5]:
# retrieve global vocabulary - convert to list
glob_words = sorted(set(hamlet_stem).union(set(juliuscaesar_stem)).union(set(macbeth_stem)).union(set(othello_stem)))
print("Global vocabulary size: ",len(glob_words))

Global vocabulary size:  6562


In [6]:
# conditioning
words = ["caesar", "brutus"]
docs = [hamlet_tok,juliuscaesar_tok,macbeth_tok,othello_tok]
C = len(docs)
W = len(words)

In [7]:
# number of occurrences
words_freq = [[docs[j].count(words[i]) for i in range(W)] for j in range(C)]
words_freq

[[2, 1], [267, 361], [1, 0], [1, 0]]

In [8]:
# max occurrences
max_words_freq = [max(max(words_freq[i]),1) for i in range(C)]
max_words_freq

[2, 361, 1, 1]

In [9]:
# term frequency matrix
tf = [[docs[j].count(words[i])/max_words_freq[j] for j in range(C)] for i in range(W)]
print("Frequency matrix\n doc1 \t doc2 \t doc3 \t doc4 \n"+'\n'.join(['\t'.join([str(round(cell,4)) for cell in row]) for row in tf]))

Frequency matrix
 doc1 	 doc2 	 doc3 	 doc4 
1.0	0.7396	1.0	1.0
0.5	1.0	0.0	0.0


In [10]:
# calculate distance between words
word_distance = (tf[0][0]-tf[1][0])**2+(tf[0][1]-tf[1][1])**2+(tf[0][2]-tf[1][2])**2+(tf[0][3]-tf[1][3])**2

In [11]:
print("Eclidean distance between "+words[0]+" and "+words[1]+": "+str(word_distance))

Eclidean distance between caesar and brutus: 2.3178018124477253
