# LSA with gensim

In [39]:
from gensim import corpora
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import TfidfModel
import re
import gensim
import tempfile
TEMP_FOLDER = tempfile.gettempdir()

## Load data and make dictionary

In [48]:
# Read Gutenberg data
with open('datasets/gutenberg.txt', 'r') as gutenberg:
    sherlock = gutenberg.read().replace('\n', ' ')
    
sherlock_sentences = sent_tokenize(sherlock)

documents = [[re.sub(r'[^\w]', ' ', word).lower().encode('utf-8') for word
              in sentence.split() if word not in stopwords.words('english')]
              for sentence in sherlock_sentences]

# Remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for sentence in documents:
    for token in sentence:
        frequency[token] += 1
              
documents = [[token for token in sentence if frequency[token] > 1] 
             for sentence in documents]

dictionary = corpora.Dictionary(documents)
dictionary.save('/tmp/gutenberg.dict')

In [53]:
print dictionary
print documents

# The function doc2bow converts an array to a bag of words format
print dictionary.doc2bow(["load", "theory", "to", "the", "garden"])

Dictionary(6 unique tokens: [u'load', u'used', u'theory', u'sweller', u'john']...)
[['cognitive', 'cognitive', 'load', 'used'], ['cognitive', 'load', 'theory', 'john', 'sweller'], ['sweller', 'used', 'cognitive', 'load'], ['cognitive', 'load', 'theory', 'cognitive', 'load'], ['john']]
[(0, 1), (5, 1)]


In [54]:
import os

# Build corpus. Convert dictionary entries to integer tuples
corpus = [dictionary.doc2bow(text) for text in documents]

# Storke corpus on dics
# corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'gutenberg.mm'), corpus)

In [41]:
# Load id-> word mapping (the dictionary)
# id2word = gensim.corpora.Dictionary.load_from_text('/tmp/gutenberg.dict')

# Load corpus iterator
#mm = gensim.corpora.MmCorpus('/tmp/gutenberg.mm')

## Convert to tf-idf

In [55]:
# Build Tf-IDF model
from gensim.models.tfidfmodel import TfidfModel

# Durch die TfidfModel Klasse kann ich die Document Term Matrix gewichten.
# Wichtige (infrequente) Wörter werden höher gewichtet als frequente Wörter.
# Beide Matrizen kann ich später in das LSI(LSA) Modell hineinfügen
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

## Run LSA over tf-idf

In [56]:
# Calculate LSA with both corpus (weighted and unweighted)
from gensim.models.lsimodel import LsiModel

# Die num_topics des Models geben nichts anderes an als die Dimensionen, die
# verwendet werden, um die Reihen der Matrix zu verkleinern. Dies findet durch
# die Singular Value Decomposition statt. 
lsa       = LsiModel(corpus      , num_topics=300)
lsa_tfidf = LsiModel(corpus_tfidf, num_topics=300)
print(lsa)
print(lsa_tfidf)

LsiModel(num_terms=6, num_topics=300, decay=1.0, chunksize=20000)
LsiModel(num_terms=6, num_topics=300, decay=1.0, chunksize=20000)


## Cosine Similarities between all documents

In [57]:
from gensim.similarities import MatrixSimilarity

# Interestingly we can do similarities with a different corpus. Whe
# trained the LSI-Model with our tf-idf corpus but we could use
# just another one
similarity = MatrixSimilarity(lsa_tfidf[corpus_tfidf], num_features=lsa_tfidf.num_terms)

print similarity

MatrixSimilarity<5 docs, 6 features>


In [81]:
cosine_similarities = list(enumerate(similarity))

print(cosine_similarities)
print(cosine_similarities[4][1])



[(0, array([  9.99999940e-01,   8.84811655e-02,   7.10719287e-01,
         2.57363170e-01,   3.07810361e-10], dtype=float32)), (1, array([ 0.08848117,  0.99999994,  0.43518433,  0.57696968,  0.56626439], dtype=float32)), (2, array([  7.10719287e-01,   4.35184330e-01,   1.00000012e+00,
         1.34221151e-01,  -1.78835329e-08], dtype=float32)), (3, array([  2.57363170e-01,   5.76969683e-01,   1.34221151e-01,
         1.00000000e+00,   3.22521032e-09], dtype=float32)), (4, array([  3.07810361e-10,   5.66264391e-01,  -1.78835329e-08,
         3.22521032e-09,   1.00000000e+00], dtype=float32))]
[  3.07810361e-10   5.66264391e-01  -1.78835329e-08   3.22521032e-09
   1.00000000e+00]
