In [766]:
import gensim
print(dir(gensim))

['NullHandler', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_matutils', 'corpora', 'interfaces', 'logger', 'logging', 'matutils', 'models', 'parsing', 'scripts', 'similarities', 'summarization', 'topic_coherence', 'utils']


In [767]:
import os
cwd = os.getcwd()
sourcefile = cwd + "/sources/pap.txt"

In [768]:
raw_documents = []

with open(sourcefile) as f: 
    contents = f.read()
    for note in contents.split("#"):
            if len(note) > 0:
                raw_documents.append(note)
                
print("Number of documents:",len(raw_documents))

Number of documents: 51574


In [769]:
from nltk.tokenize import word_tokenize
import string
gen_docs = [[w.lower() for w in word_tokenize(text) if w not in string.punctuation] 
            for text in raw_documents]

In [770]:
dictionary = gensim.corpora.Dictionary(gen_docs)
print("Number of words in dictionary:", len(dictionary))

Number of words in dictionary: 223502


In [771]:
### TF-IDF PART ###

In [772]:
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

In [773]:
tf_idf = gensim.models.TfidfModel(corpus)

In [774]:
def get_sim_tfidf(doc):
    query_doc = [w.lower() for w in doc]
    query_doc_bow = dictionary.doc2bow(query_doc)
    query_doc_tf_idf = tf_idf[query_doc_bow]

    sim = gensim.similarities.MatrixSimilarity(corpus) 
    return sim[query_doc_tf_idf]

In [775]:
import numpy as np
def get_top_n_tfidf(n, doc_id):
    doc_index = doc_id - 1
    sim = get_sim_tfidf(gen_docs[doc_index])
    top_per_doc = sim.argsort()[-n:][::-1]
    top_per_doc = [num + 1 for num in top_per_doc]
    return top_per_doc

In [776]:
### GRAPH PART ###

In [777]:
graph_corpus = []
for i in range(0,10):
    graph_corpus.append(0)

In [778]:
stopwordsfile = cwd + "/sources/stopwords.txt"
stop_words = []

with open(stopwordsfile) as f: 
    contents = f.read()
    for word in contents.split(", "):
            if len(word) > 0:
                stop_words.append(word)

In [779]:
import copy
filtered_dictionary = copy.deepcopy(dictionary)

for stop_word in stop_words:
    if stop_word in filtered_dictionary.token2id:
        filtered_dictionary.filter_tokens(bad_ids=[filtered_dictionary.token2id[stop_word]])

In [780]:
def get_zeros():
    zeros = []
    size = len(filtered_dictionary)
    for i in range (0, size):
        row = []
        for r in range (0, size):
            row.append(0)
        zeros.append(row)
    return zeros

In [781]:
def get_doc_matrix(doc, k):
    doc_matrix = get_zeros()
    
    query_doc = [w.lower() for w in doc]    
    dict_size = len(filtered_dictionary)
    doc_size = len(query_doc)
    
    for i in range(0, doc_size):
        for slide in range(0, k+1):
            if i+slide < doc_size and query_doc[i] in filtered_dictionary.token2id and query_doc[i+slide] in filtered_dictionary.token2id:
                first = filtered_dictionary.token2id[query_doc[i]]
                second = filtered_dictionary.token2id[query_doc[i+slide]]
                doc_matrix[first][second] += 1
    return doc_matrix

In [782]:
def get_doc_vector(doc, k):
    doc_matrix = get_doc_matrix(doc, k)
    doc_vector = []
    
    counter = 0
    
    dict_size = len(filtered_dictionary)
    
    for i in range(0, dict_size):
        for j in range(0, dict_size):
            if doc_matrix[i][j] > 0:
                doc_vector.append((counter, doc_matrix[i][j]))
            counter += 1
    return doc_vector

In [783]:
def get_graph_corpus(k):
    if graph_corpus[k] != 0:
        return graph_corpus[k]
    else:
        doc_count = len(gen_docs)
        corpus = []
        for i in range(0, doc_count):
            v = get_doc_vector(gen_docs[i], k)
            corpus.append(v)
        graph_corpus[k] = corpus
        return corpus

In [784]:
def get_sim_graph(doc, k):
    corpus = get_graph_corpus(k)
    doc_vector = get_doc_vector(doc, k)
    sim = gensim.similarities.MatrixSimilarity(corpus)
    return sim[doc_vector]

In [785]:
def get_top_n_graph(n, doc_id, k):
    doc_index = doc_id - 1
    sim = get_sim_graph(gen_docs[doc_index], k)
    top_per_doc = sim.argsort()[-n:][::-1]
    top_per_doc = [num + 1 for num in top_per_doc]
    return top_per_doc

In [786]:
### ALTERNATIVE GRAPH PART ###

In [787]:
matrix_flat = []
doc_count = len(gen_docs)
for i in range(0,doc_count):
    matrix_flat.append([])

In [788]:
import numpy as np
from scipy.spatial import distance

def get_top_n_graph_sparsed(n, doc_id, k):
    doc_index = doc_id - 1
    
    doc_matrix = np.array([get_doc_matrix(gen_docs[doc_index], k)], dtype = np.int64)
    doc_flat_matrix = np.hstack(doc_matrix)
    doc_flat_matrix = np.hstack(doc_flat_matrix)
    
    doc_count = len(gen_docs)
    dist_to_note = {}
    for i in range(0, doc_count):
        if len(matrix_flat[i]) != 0:
            m_flat = matrix_flat[i]
        else:
            m = np.array([get_doc_matrix(gen_docs[i], k)], dtype = np.int64)
            m_flat = np.hstack(m)
            m_flat = np.hstack(m_flat)
            matrix_flat[i] = m_flat
            
        dist = distance.cosine(doc_flat_matrix, m_flat)
        dist_to_note[dist] = i+1
    
    n_items = dist_to_note
    
    top_per_doc  = dict(sorted(dist_to_note.items(), key=lambda x: x[0], reverse=False)[:n])
    top_per_doc = list(top_per_doc.values())
    return top_per_doc

In [789]:
### EXPERIMENT ###

In [790]:
def experiment(note_id, k_list, n):
    print('base note:', note_id)
    print('tf-idf: ',get_top_n_tfidf(n, note_id))
    for k in k_list:
        print(k,'graph:',get_top_n_graph_sparsed(n, note_id, k))

In [None]:
experiment(121,[1,2,3],10)

base note: 121
tf-idf:  [121, 8312, 239, 17666, 1199, 7812, 2173, 373, 26133, 10411]
