In [None]:
!pip install nltk

In [None]:
import os
import string
import copy
import math
import operator
from collections import Counter, OrderedDict
from itertools import combinations

import numpy as np

import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
# Fetch the NLTK data packages quietly. The 'stopwords' corpus is what
# nltk.corpus.stopwords.words('english') reads in get_document_word_vector.
nltk.download('stopwords', quiet=True)
# NOTE(review): nothing in the visible cells appears to use 'punkt'
# (tokenizing is done with TreebankWordTokenizer) - confirm it is still needed.
nltk.download('punkt', quiet=True)

In [None]:
# Prefer a copy of the book in the current working directory; otherwise fall
# back to /content (presumably the Colab working directory - confirm).
cains_fp = "cains.txt" if os.path.exists("cains.txt") else "/content/cains.txt"

# Read the whole book into memory as a single string.
with open(cains_fp, mode='r', encoding="utf-8") as cains_file:
    text = cains_file.read()

__Analysing cosine similarity between pages__

In [None]:
def get_document_word_vector(text, verbose=True):
    """Build a normalised term-frequency vector and a sorted lexicon for `text`.

    Pipeline: delete every character in ``string.punctuation``, lower-case,
    tokenize with ``TreebankWordTokenizer``, drop the NLTK English stop words,
    then divide each remaining token's count by the number of remaining tokens.

    Parameters
    ----------
    text : str
        Raw document text.
    verbose : bool, default True
        When True, print the diagnostic trace (lengths, samples and most
        common tokens) after each stage. Pass False to silence it, e.g. when
        the function is called once per page in a loop.

    Returns
    -------
    document_vector : dict
        Maps token -> count / number of non-stop-word tokens, inserted in
        order of decreasing count. Empty when no token survives filtering.
    lexicon : list
        The distinct non-stop-word tokens in sorted order.
    """
    if verbose:
        print('input text length: ', len(text))
        print('input text beginning: ', text[:200])

    # Punctuation is deleted, not replaced by a space, so the pieces around it
    # are joined (e.g. "don't" becomes "dont").
    text_no_punct = text.translate(str.maketrans('', '', string.punctuation))
    if verbose:
        print('(no punctuation) text length: ', len(text_no_punct))
        print('(no punctuation) text beginning: ', text_no_punct[:200])

    tokens = TreebankWordTokenizer().tokenize(text_no_punct.lower())
    if verbose:
        token_counts = Counter(tokens)
        # Note: despite the label, this is the number of *distinct* tokens.
        print('total tokens: ', len(token_counts))
        print('sample tokens: ', tokens[:100])
        print('tokens sorted by occurences: ', token_counts.most_common()[:100])

    # Named stop_words so the local does not shadow the `stopwords` module
    # imported at the top of the notebook.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    token_counts = Counter(tokens)
    if verbose:
        print('(no stopwords) total tokens: ', len(token_counts))
        print('(no stopwords) sample tokens: ', tokens[:100])
        print('(no stopwords) tokens sorted by occurences: ', token_counts.most_common()[:100])

    # Relative frequency of each token among the non-stop-word tokens. When no
    # token survives, the comprehension is empty and no division happens.
    tokens_num = len(tokens)
    document_vector = {
        token: count / tokens_num
        for token, count in token_counts.most_common()
    }
    if verbose:
        print('built word vector of length', len(document_vector))

    lexicon = sorted(token_counts)
    return document_vector, lexicon

In [None]:
# Word-frequency vector and sorted lexicon for the whole book.
all_book_wv, all_book_lexicon = get_document_word_vector(text)

# Show the first 50 lexicon entries, then the full frequency vector.
print(all_book_lexicon[:50], all_book_wv, sep='\n')

In [None]:
# All-zero template vector with one slot per whole-book lexicon entry, keyed in
# lexicon order so that every page vector copied from it shares the same layout.
zero_vector = OrderedDict.fromkeys(all_book_lexicon, 0)
print(len(zero_vector))

In [None]:
# Split the book into "pages": each line of the file is treated as one page.
with open(cains_fp, 'r', encoding="utf-8") as cains_file:
    all_file = cains_file.read()
book_pages = all_file.splitlines()

print('pages count: ', len(book_pages))
print('1st page: ', book_pages[0])

# Build the membership set once instead of re-deriving it from the lexicon
# list on every page.
all_book_tokens = set(all_book_lexicon)
doc_vectors = []

for page_text in book_pages:
    # Start from the zero template so every page vector has the same keys in
    # the same order (cosine_sim compares the values positionally).
    vec = copy.copy(zero_vector)
    page_wv, page_lexicon = get_document_word_vector(page_text)
    if not all_book_tokens.issuperset(page_lexicon):
        # A page token missing from the whole-book lexicon would add a new key
        # and break the shared layout. Stopping here leaves doc_vectors holding
        # only the pages processed so far.
        print('ERROR! the page lexicon is out of the all text lexicon..')
        break
    for word, freq in page_wv.items():
        # page_wv values are already relative frequencies; the extra division
        # by the lexicon size scales all components by the same constant, so
        # it does not change cosine similarities.
        vec[word] = freq / len(all_book_lexicon)
    doc_vectors.append(vec)

In [None]:
def cosine_sim(vec1, vec2):
    """Return the cosine similarity of two dict-based vectors.

    The values of both mappings are compared positionally (in iteration
    order), so the two vectors must have been built with the same key order.

    Parameters
    ----------
    vec1, vec2 : mapping
        Mappings whose ``.values()`` are the numeric vector components.

    Returns
    -------
    float
        dot(vec1, vec2) / (|vec1| * |vec2|), or 0.0 when either vector has
        zero magnitude (e.g. a page with no tokens), where the cosine is
        undefined.

    Raises
    ------
    ValueError
        If the two vectors do not have the same number of components.
    """
    values_1 = list(vec1.values())
    values_2 = list(vec2.values())
    if len(values_1) != len(values_2):
        raise ValueError(
            "vectors must have the same length, got %d and %d"
            % (len(values_1), len(values_2))
        )

    dot_prod = sum(a * b for a, b in zip(values_1, values_2))
    mag_1 = math.sqrt(sum(a ** 2 for a in values_1))
    mag_2 = math.sqrt(sum(b ** 2 for b in values_2))

    # An all-zero vector has no direction; report "no similarity" instead of
    # raising ZeroDivisionError.
    if mag_1 == 0 or mag_2 == 0:
        return 0.0
    return dot_prod / (mag_1 * mag_2)

In [None]:
# Sanity check of cosine_sim: the cosine similarity of a vector with itself must be 1.
print("cosine=", cosine_sim(doc_vectors[0], doc_vectors[0]))

In [None]:
# Compare the first 100 pages, but never more pages than were actually
# vectorised (a shorter doc_vectors would otherwise raise IndexError).
n_pages = min(100, len(doc_vectors))
comb = list(combinations(range(n_pages), 2))

dict_cos = {}
# Symmetric similarity matrix; the diagonal stays at 1 (each page vs itself).
map_cos = np.ones((n_pages, n_pages))
for i, j in comb:
    cos_ij = cosine_sim(doc_vectors[i], doc_vectors[j])
    # Labels use 1-based page numbers.
    dict_cos["cosine (%d, %d)" % (i+1, j+1)] = cos_ij
    map_cos[i, j] = cos_ij
    map_cos[j, i] = cos_ij

# (label, similarity) pairs ordered from most to least similar.
desc = sorted(dict_cos.items(), key=operator.itemgetter(1), reverse=True)

In [None]:
# Ranked listing of every page pair, most similar first (ranks are 1-based).
print("pages similarity in decreasing order\n")
for rank, pair in enumerate(desc, start=1):
    print(rank, pair)

In [None]:
N = 10
print(f"the top {N} closest pages for each page\n")
# np.argsort is ascending, so the last N indices of each row are the most
# similar pages, listed from least to most similar (1-based page numbers).
# The list includes the page itself because the diagonal of map_cos is 1.
for i in range(map_cos.shape[0]):
    print(i + 1, np.argsort(map_cos[i, :])[-N:] + 1)