In [2]:
import pandas as pd
import numpy as np
import string

In [3]:
data = pd.read_csv("./assets/poems.csv")
documents = data["Poem"]

In [4]:
documents = documents.apply(lambda x: str.lower(x))

In [6]:
characters = set()
for doc in documents:
    words = doc.split(" ")
    for word in words:
        for char in word:
            if char not in characters:
                characters.add(char)

print(sorted(characters))

['!', '"', '&', "'", '(', ')', ',', '-', '.', '/', '0', '1', '2', '5', '8', ':', ';', '?', '[', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', 'è', 'é', '–', '—', '’', '“', '”', '\u2028']


In [7]:
for char in characters:
    if char not in string.ascii_lowercase:
        documents = documents.apply(lambda x: x.replace(char, ""))

In [8]:
tokenized_docs = [doc.split() for doc in documents]
tokenized_docs[0][0:10]

['a', 'woman', 'walks', 'by', 'the', 'bench', 'im', 'sitting', 'onwith', 'her']

In [9]:
unique_words = set(word for doc in tokenized_docs for word in doc)

In [10]:
embeddings = {word: np.random.rand(12) for word in unique_words}

In [11]:
dtm = np.zeros((len(tokenized_docs), len(unique_words)))

In [12]:
word_to_index = {word: i for i, word in enumerate(unique_words)}

In [13]:
for doc_idx, doc in enumerate(tokenized_docs):
    for word in doc:
        word_idx = word_to_index[word]
        dtm[doc_idx, word_idx] +=1

In [14]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    similarity = dot_product / (norm_vec1 * norm_vec2)
    return similarity

In [15]:
similarity = cosine_similarity(dtm[0], dtm[1])
print(similarity)

0.23588370942962128


In [16]:
num_docs = dtm.shape[0]
similarity_matrix = np.zeros((num_docs, num_docs))

for i in range(num_docs):
    for j in range(num_docs):
        if i != j:
            similarity_matrix[i, j] = cosine_similarity(dtm[i], dtm[j])
        else:
            similarity_matrix[i, j] = 1  # Document is perfectly similar to itself

In [17]:
sorted_indices = np.argsort(similarity_matrix[0])[::-1]
sorted_indices[1]

38

In [18]:
print(documents[0])
print(documents[sorted_indices[1]])

a woman walks by the bench im sitting onwith her dog that looks part lab part buickstops and asks if i would like to dancei smile tell her of course i do we decideon a waltz that she begins to hum
a woman was playinga man looking onand the mould of her faceand her neck and her hairwhich the rays fell uponof the two candles theresent him mentally strayingin some fancyplacewhere pain had no tracea cowled apparitioncame pushing betweenand her notes seemed to sighand the lights to burn paleas a spell
