In [456]:
import gensim
print(dir(gensim))

['NullHandler', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', '_matutils', 'corpora', 'interfaces', 'logger', 'logging', 'matutils', 'models', 'parsing', 'scripts', 'similarities', 'summarization', 'topic_coherence', 'utils']


In [457]:
import os
cwd = os.getcwd()
sourcefile = cwd + "/sources/pap.txt"

In [458]:
raw_documents = []

with open(sourcefile) as f: 
    contents = f.read()
    for note in contents.split("#"):
            if len(note) > 0:
                raw_documents.append(note)
                
print("Number of documents:",len(raw_documents))

Number of documents: 51574


In [459]:
from nltk.tokenize import word_tokenize
import string
gen_docs = [[w.lower() for w in word_tokenize(text) if w not in string.punctuation] 
            for text in raw_documents]

In [460]:
dictionary = gensim.corpora.Dictionary(gen_docs)
dictionary.filter_n_most_frequent(2)
print("Number of words in dictionary:", len(dictionary))

Number of words in dictionary: 223500


In [461]:
### TF-IDF PART ###

In [462]:
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]

In [463]:
tf_idf = gensim.models.TfidfModel(corpus)

In [464]:
def get_sim_tfidf(doc):
    query_doc = [w.lower() for w in doc]
    query_doc_bow = dictionary.doc2bow(query_doc)
    query_doc_tf_idf = tf_idf[query_doc_bow]

    sim = gensim.similarities.MatrixSimilarity(corpus) 
    return sim[query_doc_tf_idf]

In [467]:
import numpy as np
def get_top_n_tfidf(n, doc_id):
    doc_index = doc_id - 1
    top_n = []
    sim = get_sim_tfidf(gen_docs[doc_index])
    top_per_doc = sim.argsort()[-n:][::-1]
    top_per_doc = [num + 1 for num in top_per_doc]
    return top_per_doc

In [468]:
### GRAPH PART ###

In [469]:
graph_corpus = []
for i in range(0,10):
    graph_corpus.append(0)

In [None]:
stopwordsfile = cwd + "/sources/stopwords.txt"
stop_words = []

with open(stopwordsfile) as f: 
    contents = f.read()
    for word in contents.split(", "):
            if len(word) > 0:
                stop_words.append(word)

In [None]:
import copy
filtered_dictionary = copy.deepcopy(dictionary)

for stop_word in stop_words:
    if stop_word in filtered_dictionary.token2id:
        filtered_dictionary.filter_tokens(bad_ids=[filtered_dictionary.token2id[stop_word]])

In [470]:
def get_zeros():
    zeros = []
    size = len(filtered_dictionary)
    for i in range (0, size):
        row = []
        for r in range (0, size):
            row.append(0)
        zeros.append(row)
    return zeros

In [471]:
def get_doc_matrix(doc, k):
    doc_matrix = get_zeros()
    
    query_doc = [w.lower() for w in doc]    
    dict_size = len(filtered_dictionary)
    doc_size = len(query_doc)
    
    for i in range(0, doc_size):
        for slide in range(0, k+1):
            if i+slide < doc_size and query_doc[i] in filtered_dictionary.token2id and query_doc[i+slide] in filtered_dictionary.token2id:
                first = filtered_dictionary.token2id[query_doc[i]]
                second = filtered_dictionary.token2id[query_doc[i+slide]]
                doc_matrix[first][second] += 1
    return doc_matrix

In [472]:
def get_doc_vector(doc, k):
    doc_matrix = get_doc_matrix(doc, k)
    doc_vector = []
    
    counter = 0
    
    dict_size = len(filtered_dictionary)
    
    for i in range(0, dict_size):
        for j in range(0, dict_size):
            if doc_matrix[i][j] > 0:
                doc_vector.append((counter, doc_matrix[i][j]))
            counter += 1
    return doc_vector

In [473]:
def get_graph_corpus(k):
    if graph_corpus[k] != 0:
        return graph_corpus[k]
    else:
        doc_count = len(gen_docs)
        corpus = []
        for i in range(0, doc_count):
            v = get_doc_vector(gen_docs[i], k)
            corpus.append(v)
        graph_corpus[k] = corpus
        return corpus

In [474]:
def get_sim_graph(doc, k):
    corpus = get_graph_corpus(k)
    doc_vector = get_doc_vector(doc, k)
    sim = gensim.similarities.MatrixSimilarity(corpus)
    return sim[doc_vector]

In [475]:
def get_top_n_graph(n, doc_id, k):
    doc_index = doc_id - 1
    top_n = []
    sim = get_sim_graph(gen_docs[doc_index], k)
    top_per_doc = sim.argsort()[-n:][::-1]
    top_per_doc = [num + 1 for num in top_per_doc]
    return top_per_doc

In [None]:
### EXPERIMENT ###

In [476]:
def experiment(note_id, k_list):
    print('base note:', note_id)
    print('tf-idf: ',get_top_n_tfidf(10, note_id))
    for k in k_list:
        print(k,'graph:',get_top_n_graph(10, note_id, k))

In [None]:
experiment(121,[1,2,3])

base note: 121
tf-idf:  [121, 239, 8312, 26439, 2669, 42838, 3435, 7812, 2173, 17666]
