# LSA with whole texts

In [4]:
import re
import enchant
import numpy
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models.tfidfmodel import TfidfModel
from gensim.models.lsimodel import LsiModel
from gensim.similarities import MatrixSimilarity

In [5]:
# Read christmas carol data set
with open('datasets/christmas_carol.txt', 'r') as dickens:
    the_book = dickens.read()

In [6]:
# Remove punctuation
the_book = "".join(c for c in the_book if c not in ('!','.',':'))

# Split text into paragraphs
paragraphs = re.split('\s{4,}', the_book)

# Only allow english words to appear in the documents
d = enchant.Dict('en_US')

# Enable stemmer
porter_stemmer = PorterStemmer()

# List of stopwords
stops = stopwords.words('english')

documents = [[porter_stemmer.stem(word.decode('utf-8')).lower() for word in paragraph.split() 
              if d.check(porter_stemmer.stem(word.decode('utf-8'))) and 
             word.lower() not in stops] 
             for paragraph in paragraphs]

In [7]:
# Build dictionary
dictionary = corpora.Dictionary(documents)

# Build corpus
corpus = [dictionary.doc2bow(paragraph) for paragraph in documents]

# Build tf-idf model
tfidf = TfidfModel(corpus)

# Change corpus according to model
tfidf_corpus = tfidf[corpus]

In [8]:
# Run LSA
lsa = LsiModel(corpus, num_topics=300)

# Build matrix similarities
similarities = MatrixSimilarity(lsa[corpus], num_features=lsa.num_terms)

In [9]:
cosine_similarities = list(enumerate(similarities))

first_paragraph = cosine_similarities[0][1]

# Get most similar paragraphs
most_similar_paragraphs = sorted(enumerate(first_paragraph), 
                                 key=lambda x: x[1], reverse=True)

print(most_similar_paragraphs[0:10])

print(documents[0])
print(documents[most_similar_paragraphs[1][0]])
print(documents[most_similar_paragraphs[2][0]])

[(0, 0.9999997), (1, 0.33231094), (101, 0.33022419), (114, 0.33022419), (584, 0.28640732), (4, 0.26280689), (45, 0.26116392), (209, 0.23293547), (527, 0.22403502), (83, 0.22130106)]
[u'dead', u'begin', u'doubt', u'burial', u'sign', u'chief', u'mourner', u'sign', u'name', u'good', u'upon', u'chose', u'put', u'hand', u'old', u'marley', u'dead', u'door-nail']
[u'mind', u"don't", u'mean', u'say', u'dead', u'door-nail', u'might', u'regard', u'coffin-nail', u'deadest', u'trade', u'wisdom', u'ancestor', u'unhallow', u'hand', u'shall', u'disturb', u'done', u'therefor', u'permit', u'marley', u'dead', u'door-nail']
[u'doubt']


In [31]:
print(lsa.print_topics(num_topics=10, num_words=5))

[(0, '-0.322*"5" + -0.268*"84" + -0.242*"54" + -0.234*"227" + -0.222*"153"'), (1, '0.815*"227" + -0.256*"199" + -0.124*"251" + 0.123*"18" + -0.109*"84"'), (2, '0.427*"199" + -0.397*"54" + 0.331*"227" + -0.183*"35" + 0.174*"251"'), (3, '-0.559*"5" + 0.285*"2" + 0.261*"46" + 0.241*"54" + 0.170*"3"'), (4, '-0.378*"2" + -0.368*"3" + 0.291*"153" + -0.224*"5" + 0.213*"84"'), (5, '-0.408*"54" + 0.309*"153" + -0.287*"199" + -0.250*"5" + 0.216*"3"'), (6, '0.302*"3" + -0.255*"619" + 0.233*"153" + -0.217*"304" + -0.187*"244"'), (7, '0.324*"304" + 0.211*"54" + -0.197*"2" + -0.190*"440" + 0.168*"422"'), (8, '-0.316*"54" + 0.262*"619" + 0.233*"2" + -0.200*"219" + 0.199*"30"'), (9, '-0.382*"338" + -0.348*"84" + 0.217*"5" + 0.202*"153" + -0.190*"13"')]


In [68]:
# Compare another corpus to lsi
text = ["marley", "dead", "door-nail", "burial", "old"]

text_tuples = dictionary.doc2bow(text)

all_sims = similarities[lsa[text_tuples]]

print(sorted(enumerate(all_sims), key=lambda x: x[1], reverse=True))[0:10]
print(paragraphs[0])

[(0, 0.71732837), (1, 0.54634875), (45, 0.43563884), (4, 0.41289711), (258, 0.40463182), (687, 0.33629864), (644, 0.3023279), (417, 0.29983068), (521, 0.29162228), (548, 0.28124219)]
﻿MARLEY was dead to begin with There is no doubt
whatever about that The register of his burial was
signed by the clergyman, the clerk, the undertaker,
and the chief mourner Scrooge signed it and
Scrooge's name was good upon 'Change, for anything he
chose to put his hand to Old Marley was as dead as a
door-nail
