Topic Indexing - LSI using Gensim
===

This notebook implements topic indexing using gensim's implementation of Latent Semantic Indexing (LSI). It is meant for my own educational purposes.

Import Libraries
---

In [16]:
from gensim import corpora, models, similarities 
import os
from six import iteritems

Prepare corpus
---

In [17]:
# Location of the corpus file. The directory can be overridden without editing
# the notebook by setting the TOPIC_INDEXING_DIR environment variable; when it
# is not set, the original hard-coded location is used.
directory_path = os.environ.get(
    "TOPIC_INDEXING_DIR",
    r"c:\users\avour\Documents\GitHub\topic-indexing-lsa-gensim",
)
corpus_name = r"mycorpus.txt"
corpus_path = os.path.join(directory_path, corpus_name)
print(corpus_path)

c:\users\avour\Documents\GitHub\topic-indexing-lsa-gensim\mycorpus.txt


Generate Dataset
---

In [18]:
# Frequent function words to exclude from the vocabulary
stop_list = set('for a of the and to is has they be are as from their in'.split())

# Build the vocabulary by streaming the corpus file: each line is one document,
# lower-cased and split on whitespace.
with open(corpus_path, 'r') as corpus_file:
    dictionary = corpora.Dictionary(text.lower().split() for text in corpus_file)

# The file has been fully consumed above, so the pruning happens after it is closed.
# Ids of the stop words that actually occur in the vocabulary ...
stop_ids = [
    dictionary.token2id[word]
    for word in stop_list
    if word in dictionary.token2id
]
# ... and ids of tokens whose entry in dictionary.dfs equals 1.
once_ids = [
    token_id
    for token_id, doc_freq in iteritems(dictionary.dfs)
    if doc_freq == 1
]

# Drop both groups, then reassign ids so they are contiguous again.
dictionary.filter_tokens(stop_ids + once_ids)
dictionary.compactify()

    
# Corpus Streaming
class MyCorpus(object):
    """Streamed corpus that re-reads the file at ``corpus_path`` on every iteration.

    The file is assumed to hold one document per line, with tokens separated
    by whitespace. Both ``corpus_path`` and ``dictionary`` are looked up as
    module-level names at iteration time.
    """

    def __iter__(self):
        """Yield ``dictionary.doc2bow`` of the lower-cased tokens of each line."""
        with open(corpus_path, 'r') as handle:
            for raw_line in handle:
                tokens = raw_line.lower().split()
                yield dictionary.doc2bow(tokens)

# Streamed view over the corpus file; documents are produced lazily on iteration
corpus = MyCorpus()

# Persist the bag-of-words corpus (Matrix Market format) and the vocabulary
# so that later cells can reload them from disk.
corpora.MmCorpus.serialize(fname='something.mm', corpus=corpus)
dictionary.save('something.dict')


Load Dataset
---

In [19]:
# Reload the serialized corpus (a stream of bag-of-words vectors) and its dictionary.
# Both files are loaded below, so both must exist: checking only the dictionary
# would crash on a missing 'something.mm' instead of asking for regeneration.
if os.path.exists('something.dict') and os.path.exists('something.mm'):
    corpus = corpora.MmCorpus('something.mm')
    dictionary = corpora.Dictionary.load('something.dict')
    print('Used saved dataset')
else:
    print('Please generate data set')


Used saved dataset


Implement LSI
---

In [20]:
# Initialize tfidf model
tfidf = models.TfidfModel(corpus)

# Use tfidf model to transform vectors
corpus_tfidf = tfidf[corpus]

# Perform LSI tranformation
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) #Initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi

lsi.save('model.lsi')
lsi = models.LsiModel.load('model.lsi')

# Print topics
print('LSI topics: ')
lsi.print_topics()
print('\n')

LSI topics: 




Visualize Final Corpus
---

In [21]:
print('Corpus LSI ')
with open(corpus_path, 'r') as file:
    # Walk the raw lines and their LSI vectors in lockstep. Streaming computes
    # each vector once (indexing corpus_lsi[i] twice recomputed it) and does
    # not require the underlying corpus to support random access.
    for line, doc_lsi in zip(file, corpus_lsi):
        if doc_lsi:
            # Dominant topic = the one with the largest absolute weight
            topic_index = max(doc_lsi, key=lambda item: abs(item[1]))[0]
        else:
            # A document with an empty LSI vector has no dominant topic;
            # max() would raise ValueError on it.
            topic_index = None
        print('Topic : ', topic_index)
        print(doc_lsi, " # " + line)

    print('\n')

Corpus LSI 
Topic :  1
[(0, 0.066007833960899834), (1, 0.5200703306361858)]  # Human machine interface for lab abc computer applications

Topic :  1
[(0, 0.19667592859141947), (1, 0.76095631677000586)]  # A survey of user opinion of computer system response time

Topic :  1
[(0, 0.089926399724459483), (1, 0.72418606267525143)]  # The EPS user interface management system

Topic :  1
[(0, 0.075858476521777518), (1, 0.63205515860034345)]  # System and human system engineering testing of EPS

Topic :  1
[(0, 0.10150299184979705), (1, 0.57373084830029586)]  # Relation of user perceived response time to error measurement

Topic :  0
[(0, 0.70321089393783187), (1, -0.16115180214025332)]  # The generation of random binary unordered trees

Topic :  0
[(0, 0.87747876731198415), (1, -0.1675890686465884)]  # The intersection graph of paths in trees

Topic :  0
[(0, 0.90986246868185883), (1, -0.14086553628718412)]  # Graph minors IV Widths of trees and well quasi ordering

Topic :  0
[(0, 0.6165825