In [1]:
# Based on code at:
# - https://nlpforhackers.io/topic-modeling/
# - https://medium.com/@sherryqixuan/topic-modeling-and-pyldavis-visualization-86a543e21f58
# - https://radimrehurek.com/gensim/auto_examples/core/run_topics_and_transformations.html
# - https://monkeylearn.com/topic-analysis/

In [2]:
# Paths of the three input documents that make up the corpus.
file, file2, file3 = (
    '../common-data/Example-TDBank-PersonalAcctAgree.txt',
    '../common-data/nobel-2020/dataset-nyt-nobel2020.txt',
    '../l1-wordcloud/data/output/Example-CSCE771_001_Fall2020.txt',
)

In [3]:
# Get raw text as string.
# Decode explicitly so the result does not depend on the platform's default
# encoding. NOTE(review): assumes the data files are UTF-8 — confirm.
with open(file, encoding='utf-8') as f:
    text = f.read()
with open(file2, encoding='utf-8') as f:
    text2 = f.read()
with open(file3, encoding='utf-8') as f:
    text3 = f.read()

In [4]:
# Importing libraries
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords

In [5]:
# -- If not downloaded already
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')

In [6]:
# Routines for cleaning text
# Number of topics requested from both the LDA and the LSI model below.
# NOTE(review): the corpus holds only three documents, and the recorded LSI
# output shows Topic #3..#9 empty — consider a smaller value.
NUM_TOPICS = 10
# English stopword list from NLTK; clean_text() drops any token found in it.
STOPWORDS = stopwords.words('english')
 
def clean_text(text):
    """Tokenize ``text`` and keep only the word-like, non-stopword tokens.

    The text is lower-cased and split with ``word_tokenize``. A token is kept
    when it is not in ``STOPWORDS`` and begins with at least three characters
    that are letters or hyphens. ``re.match`` anchors only at the start of the
    token, so a token such as ``abc1`` is kept.

    Args:
        text: Raw document text as a single string.

    Returns:
        List of the surviving tokens, in document order.
    """
    # Raw string: '\-' inside a normal literal is an invalid escape sequence
    # and triggers a warning on modern Python. The pattern is unchanged.
    word_pattern = re.compile(r'[a-zA-Z\-][a-zA-Z\-]{2,}')
    # Set membership is O(1); scanning the stopword list per token is not.
    stopword_set = set(STOPWORDS)
    return [
        token
        for token in word_tokenize(text.lower())
        if token not in stopword_set and word_pattern.match(token)
    ]

In [7]:
# gensim works on tokenized documents with stopwords already removed.
# Clean the three raw documents, keeping their original order.
tokenized_data = [clean_text(raw_doc) for raw_doc in (text, text2, text3)]

In [8]:
# Vocabulary: maps every distinct token to a numeric id.
dictionary = corpora.Dictionary(tokenized_data)

# Convert each tokenized document into its numeric bag-of-words form.
corpus = list(map(dictionary.doc2bow, tokenized_data))

In [9]:
# Build the LDA model.
# LDA training is randomized; a fixed seed makes the topics reproducible
# across a kernel restart and full re-run.
lda_model = models.LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    random_state=42,
)


In [10]:
print("LDA Model:")

# For every topic, show its 10 highest-weighted words.
for topic_id in range(NUM_TOPICS):
    topic_terms = lda_model.print_topic(topic_id, 10)
    print("Topic #%s:" % topic_id, topic_terms)

print("=" * 20)

LDA Model:
Topic #0: 0.032*"account" + 0.015*"may" + 0.013*"deposit" + 0.010*"funds" + 0.010*"bank" + 0.007*"day" + 0.007*"business" + 0.007*"check" + 0.006*"accounts" + 0.006*"agreement"
Topic #1: 0.019*"account" + 0.010*"may" + 0.009*"deposit" + 0.007*"accounts" + 0.006*"funds" + 0.006*"bank" + 0.005*"check" + 0.005*"day" + 0.005*"available" + 0.005*"checking"
Topic #2: 0.022*"account" + 0.016*"may" + 0.009*"bank" + 0.009*"deposit" + 0.007*"accounts" + 0.006*"funds" + 0.006*"check" + 0.006*"personal" + 0.006*"available" + 0.005*"checks"
Topic #3: 0.029*"account" + 0.016*"may" + 0.010*"accounts" + 0.008*"funds" + 0.008*"personal" + 0.007*"deposit" + 0.007*"day" + 0.007*"statement" + 0.006*"bank" + 0.006*"agreement"
Topic #4: 0.042*"account" + 0.017*"may" + 0.012*"deposit" + 0.010*"accounts" + 0.009*"funds" + 0.009*"day" + 0.008*"interest" + 0.008*"bank" + 0.008*"personal" + 0.008*"check"
Topic #5: 0.021*"account" + 0.014*"may" + 0.010*"accounts" + 0.009*"deposit" + 0.009*"funds" + 0.0

In [11]:
# Fit an LSI model on the same bag-of-words corpus and vocabulary.
lsi_model = models.LsiModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
)
 

In [12]:
print("LSI Model:")

# For every topic, show its 10 highest-weighted words.
for topic_id in range(NUM_TOPICS):
    topic_terms = lsi_model.print_topic(topic_id, 10)
    print("Topic #%s:" % topic_id, topic_terms)

print("=" * 20)

LSI Model:
Topic #0: 0.606*"account" + 0.275*"may" + 0.209*"deposit" + 0.179*"bank" + 0.172*"accounts" + 0.167*"funds" + 0.128*"check" + 0.127*"day" + 0.114*"personal" + 0.106*"available"
Topic #1: 0.445*"course" + 0.270*"students" + 0.191*"project" + 0.175*"class" + 0.143*"grade" + 0.143*"assignments" + 0.143*"student" + 0.143*"code" + 0.143*"presentation" + 0.128*"work"
Topic #2: -0.544*"prize" + -0.427*"nobel" + -0.155*"peace" + -0.155*"recipients" + -0.154*"awarded" + -0.117*"ceremony" + -0.117*"literature" + -0.116*"economic" + -0.116*"monday" + -0.110*"work"
Topic #3: 
Topic #4: 
Topic #5: 
Topic #6: 
Topic #7: 
Topic #8: 
Topic #9: 


In [16]:
# - Install if not already there
# pip install pyLDAvis

In [17]:
import pyLDAvis.gensim as gensimvis
import pyLDAvis

In [18]:
# Prepare the pyLDAvis visualization data from the LDA model, the
# bag-of-words corpus and the dictionary.
vis_data1 = gensimvis.prepare(lda_model, corpus, dictionary)

In [19]:
# Show the prepared LDA visualization as this cell's output.
pyLDAvis.display(vis_data1)

In [13]:
# Testing topics over new 'documents'.
# Distinct names on purpose: reusing `text2` here would overwrite the second
# corpus document loaded earlier, so re-running the tokenizing cell
# afterwards would silently use this sentence instead.
query_text1 = "The prize for the best bank goes to a company which does not given any loan, said Mr. Nobel."
bow1 = dictionary.doc2bow(clean_text(query_text1))
query_text2 = "The prize for the best AI goes to Mr. Turing."
bow2 = dictionary.doc2bow(clean_text(query_text2))

In [14]:
# Project each query document into LSI topic space and print the resulting
# (topic_id, weight) pairs, one query per line.
print(lsi_model[bow1], lsi_model[bow2], sep="\n")
 

[(0, 0.196377471446783), (1, 0.036297351106966094), (2, -1.001516259772634)]
[(0, 0.0011662491436687289), (1, 0.020661628955633834), (2, -0.5428556516258171)]


In [15]:
# Topic distribution of each query document under LDA, printed as
# (topic_id, weight) pairs, one query per line.
print(lda_model[bow1], lda_model[bow2], sep="\n")

[(0, 0.011186165), (1, 0.011185625), (2, 0.8993197), (3, 0.011185981), (4, 0.011188335), (5, 0.011186078), (6, 0.011185884), (7, 0.011185798), (8, 0.011187073), (9, 0.01118937)]
[(0, 0.033376634), (1, 0.03337822), (2, 0.699611), (3, 0.033376195), (4, 0.033379067), (5, 0.033375327), (6, 0.033376783), (7, 0.03337479), (8, 0.0333755), (9, 0.033376433)]


In [20]:
# Finding similarity using gensim's similarity index.
# Import the class directly rather than the `similarities` module: a later
# cell rebinds the name `similarities` to the query results, which would
# shadow the module and break this cell if it were run again.
from gensim.similarities import MatrixSimilarity

# Index the LDA representation of every corpus document.
lda_index = MatrixSimilarity(lda_model[corpus])


In [21]:
 
# Query the index with the second test document, then rank the corpus
# documents from most to least similar as (doc_index, score) pairs.
query_scores = lda_index[lda_model[bow2]]
similarities = sorted(enumerate(query_scores), key=lambda pair: pair[1], reverse=True)

In [22]:
# Top most similar documents:
# (document index, similarity score) pairs, highest score first.
print(similarities)


[(1, 0.98991275), (2, 0.30249947), (0, 0.061040036)]
