# LSA with gensim

In [1]:
from gensim import corpora
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import TfidfModel
import re
import gensim
import tempfile
TEMP_FOLDER = tempfile.gettempdir()

## Load data and make dictionary

In [10]:
# Read Gutenberg data
with open('datasets/gutenberg.txt', 'r') as gutenberg:
    sherlock = gutenberg.read().replace('\n', ' ')
    
sherlock_sentences = sent_tokenize(sherlock)

documents = [[re.sub(r'[^\w]', ' ', word).lower().encode('utf-8') for word
              in sentence.split() if word not in stopwords.words('english')]
              for sentence in sherlock_sentences]

# Remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for sentence in documents:
    for token in sentence:
        frequency[token] += 1
              
documents = [[token for token in sentence if frequency[token] > 1] 
             for sentence in documents]

dictionary = corpora.Dictionary(documents)
dictionary.save('/tmp/gutenberg.dict')



In [11]:
print dictionary
print documents

# The function doc2bow converts an array to a bag of words format
print dictionary.doc2bow(["load", "theory", "to", "the", "garden"])

Dictionary(16 unique tokens: [u'load', u'hops ', u'used', u'natural', u'cognitive']...)
[['in', 'cognitive', 'cognitive', 'load', 'used'], ['cognitive', 'load', 'theory', 'john', 'sweller'], ['sweller', 'used', 'cognitive', 'load'], ['cognitive', 'load', 'theory', 'cognitive', 'load'], ['john'], ['beer', 'drink', 'drink'], ['beer', 'brewed'], ['carbonation', 'beer '], ['beer', 'brewed', 'hops ', 'natural'], ['included', 'used', 'hops '], ['in', 'natural', 'carbonation'], ['beer ', 'included', 'beer', 'beer', 'prayer', 'beer ', 'prayer', 'beer']]
[(0, 1), (6, 1)]


In [12]:
import os

# Build corpus. Convert dictionary entries to integer tuples
corpus = [dictionary.doc2bow(text) for text in documents]

# Storke corpus on dics
# corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'gutenberg.mm'), corpus)

In [5]:
# Load id-> word mapping (the dictionary)
# id2word = gensim.corpora.Dictionary.load_from_text('/tmp/gutenberg.dict')

# Load corpus iterator
#mm = gensim.corpora.MmCorpus('/tmp/gutenberg.mm')

## Convert to tf-idf

In [13]:
# Build Tf-IDF model
from gensim.models.tfidfmodel import TfidfModel

# Durch die TfidfModel Klasse kann ich die Document Term Matrix gewichten.
# Wichtige (infrequente) Wörter werden höher gewichtet als frequente Wörter.
# Beide Matrizen kann ich später in das LSI(LSA) Modell hineinfügen
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

## Run LSA over tf-idf

In [17]:
# Calculate LSA with both corpus (weighted and unweighted)
from gensim.models.lsimodel import LsiModel

# Die num_topics des Models geben nichts anderes an als die Dimensionen, die
# verwendet werden, um die Reihen der Matrix zu verkleinern. Dies findet durch
# die Singular Value Decomposition statt. 
lsa       = LsiModel(corpus      , num_topics=2)
lsa_tfidf = LsiModel(corpus_tfidf, num_topics=2)
print(lsa)
print(lsa_tfidf)

LsiModel(num_terms=16, num_topics=2, decay=1.0, chunksize=20000)
LsiModel(num_terms=16, num_topics=2, decay=1.0, chunksize=20000)


## Cosine Similarities between all documents

In [18]:
from gensim.similarities import MatrixSimilarity

# Interestingly we can do similarities with a different corpus. We
# trained the LSI-Model with our tf-idf corpus but we could use
# just another one
similarity = MatrixSimilarity(lsa_tfidf[corpus_tfidf], num_features=lsa_tfidf.num_terms)

print similarity

MatrixSimilarity<12 docs, 16 features>


In [32]:
cosine_similarities = list(enumerate(similarity))

# Obviously LSA could divide 
print(cosine_similarities)
print(cosine_similarities[4][1])

print(lsa.print_topics(num_topics=2))

[(0, array([ 1.        ,  0.96896333,  0.99281341,  0.97989964,  0.91932237,
        0.08354242,  0.09665251,  0.16798048,  0.15166092,  0.56825316,
        0.39827818,  0.12378429], dtype=float32)), (1, array([ 0.96896333,  1.        ,  0.99158341,  0.99880189,  0.98806596,
       -0.16539106, -0.15239467, -0.08092515, -0.09739138,  0.34720302,
        0.15916471, -0.12536113], dtype=float32)), (2, array([ 0.99281341,  0.99158341,  1.        ,  0.99673116,  0.95980746,
       -0.03631249, -0.02315469,  0.04880091,  0.03228242,  0.46569604,
        0.2856442 ,  0.00414221], dtype=float32)), (3, array([ 0.97989964,  0.99880189,  0.99673116,  1.        ,  0.97934443,
       -0.1169306 , -0.10384742, -0.03205244, -0.04857108,  0.39267901,
        0.20728645, -0.07666072], dtype=float32)), (4, array([ 0.91932237,  0.98806596,  0.95980746,  0.97934443,  0.99999994,
       -0.3153272 , -0.30280808, -0.23348542, -0.2495281 ,  0.19861057,
        0.0051976 , -0.27668115], dtype=float32)), (5, 