Topic Indexing - LSI using Gensim
===

This document implements Topic Indexing using a gensim implementation of Latent Semantic Indexing (LSI)

Import Libraries
---

In [137]:
from gensim import corpora, models, similarities 
import os
from six import iteritems

Prepare corpus
---

In [138]:
directory_path = r"c:\users\avour\Documents\GitHub\topic-indexing-lsa-gensim"
corpus_name = "my_corpus.txt"
corpus_path = os.path.join(directory_path, corpus_name)
print(corpus_path)

c:\users\avour\Documents\GitHub\topic-indexing-lsa-gensim\my_corpus.txt


Generate Dataset
---

In [139]:
stop_list = set('for a of the and to is has they be are as from their in'.split())

# Generate Dictionary
with open(corpus_path, 'r') as file:
    dictionary = corpora.Dictionary(line.lower().split() for line in file)

    # remove stop words and words that appear only once
    stop_ids = [dictionary.token2id[stopword] for stopword in stop_list
        if stopword in dictionary.token2id]
    once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq==1]
    dictionary.filter_tokens(stop_ids+once_ids)
    dictionary.compactify()

    
# Corpus Streaming
class MyCorpus(object):
    def __iter__(self):
        with open(corpus_path, 'r') as file:
            for line in file:
                # Assume there's one document per line, tokens separated by whitespace
                yield dictionary.doc2bow(line.lower().split())

corpus = MyCorpus()

# Save corpus and dictionary
corpora.MmCorpus.serialize('something.mm', corpus)
dictionary.save('something.dict')

Load Dataset
---

In [140]:
# Corpus of documents represented as a stream of vectors
if(os.path.exists('something.dict')):
    corpus = corpora.MmCorpus('something.mm')
    dictionary = corpora.Dictionary.load('something.dict')
    print('Used saved dataset')
else:
    print('Please generate data set')

Used saved dataset


Implement LSI
---

In [141]:
# Initialize tfidf model
tfidf = models.TfidfModel(corpus)

# Use tfidf model to transform vectors
corpus_tfidf = tfidf[corpus]

# Perform LSI tranformation
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) #Initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

lsi.save('model.lsi')
lsi = models.LsiModel.load('model.lsi')

# Print results
print('LSI topics: ')
lsi.print_topics(2)
print('\n')

print('Corpus LSI ')
with open(corpus_path, 'r') as file:
    for i, line in enumerate(file):
        topic_index = max(corpus_lsi[i], key=lambda item:abs(item[1]))[0]
        print('Topic : ', topic_index)
        print(corpus_lsi[i], " # " + line)
       
    print('\n')

LSI topics: 


Corpus LSI 
Topic :  1
[(0, -0.22626029643702006), (1, 0.41505425844926125)]  # Teaching computer programming has long been known to be a complex process.

Topic :  1
[(0, -0.25027660053479917), (1, 0.32718116265433828)]  # Software Visualization (SV) is the use of graphical and textual formalisms to describe the execution of computer programs.

Topic :  1
[(0, -0.3600028052743221), (1, 0.38302786637598507)]  # The approach of using SV over a network as an educational tool raises futher issues as to how SV can be most appropriately included within a computer programming curriculum.

Topic :  0
[(0, -0.28215966245084323), (1, 0.22003522668466829)]  # Some SV designers have claimed their product applies equally well from novice through to expert.

Topic :  0
[(0, -0.37630383510885124), (1, 0.23681236336692577)]  # Recent SV evaluation studies suggest important differences in the way novices and experts are able use SVs.

Topic :  0
[(0, -0.61574514122530655), (1, -0.237283