In [290]:
from gensim import corpora, models, utils, similarities
from gensim.models import LsiModel

In [291]:
import os
cwd = os.getcwd()
sourcefile = cwd + "/sources/pap.txt"

In [292]:
documents = []

with open(sourcefile) as f: 
    contents = f.read()
    for note in contents.split("#"):
            if len(note) > 0:
                documents.append(note)
       
print("Number of documents:",len(documents))

Number of documents: 51574


In [293]:
stopwordsfile = cwd + "/sources/stopwords.txt"
stop_words = []

with open(stopwordsfile) as f: 
    contents = f.read()
    for word in contents.split(", "):
            if len(word) > 0:
                stop_words.append(word)

In [294]:
# Mapping from any form to the base form of a word

dictionaryfile = cwd + "/sources/odm.txt"
word_to_base_form = {}

with open(dictionaryfile, 'r', newline='', encoding='latin2') as sourcefile:
    content = sourcefile.readlines()
    for line in content:
        line = line.rstrip()
        allForms = line.split(', ')
        base_form = allForms[0].lower()
        for form in allForms:
            word_to_base_form[form] = base_form

In [295]:
def primary_form(word):
    if word in word_to_base_form.keys():
        return word_to_base_form[word]
    return word

In [296]:
# Conversion to primary form, removing punctuation and skipping stop words

from nltk.tokenize import word_tokenize
import string
texts = [[primary_form(w.lower()) for w in word_tokenize(text) if w not in string.punctuation and w not in stop_words] 
            for text in documents]

In [297]:
# Removing hapax legomena

from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
        
texts = [[token for token in text if frequency[token] > 1] for text in texts]

In [298]:
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')
print(dictionary)

Dictionary(46611 unique tokens: ['1991-95', '1997', '1998', '59', 'andrzej']...)


In [None]:
# Converting each document into the bag-of-words format

corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)
# print(corpus)

In [None]:
# Running LSA

corpus = corpora.MmCorpus('/tmp/deerwester.mm')
id2word = corpora.Dictionary.load('/tmp/deerwester.dict')

lsi = models.LsiModel(corpus, id2word=id2word, num_topics=200, chunksize=1)
print(lsi)

In [None]:
# SIMILATIRY

In [None]:
def experiment(base_note_id, n):
    actual_index = base_note_id - 1
    query_doc = documents[actual_index] 
    vec_bow = dictionary.doc2bow(query_doc.lower().split())
    vec_lsi = lsi[vec_bow] # convert the query to LSI space
    # print(vec_lsi)
    
    index = similarities.MatrixSimilarity(lsi[corpus])
    index.save('/tmp/deerwester.index')
    index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')
    sims = index[vec_lsi] # perform a similarity query against the corpus
    result = list(enumerate(sims))
    # print(result)
    
    top_per_doc = sorted(result, key=lambda x: x[1], reverse=True)[:5]
    top_per_doc = list(item[0] for item in top_per_doc)

    top_per_doc = [x+1 for x in top_per_doc]
    print(top_per_doc)
    return top_per_doc

In [None]:
experiment(121,10)