In [1]:
# Based on code at:
# - https://nlpforhackers.io/topic-modeling/
# - https://medium.com/@sherryqixuan/topic-modeling-and-pyldavis-visualization-86a543e21f58
# - https://radimrehurek.com/gensim/auto_examples/core/run_topics_and_transformations.html
# - https://monkeylearn.com/topic-analysis/

In [2]:
base_location = 'https://github.com/biplav-s/course-nl/tree/master/'
file = base_location + 'common-data/Example-TDBank-PersonalAcctAgree.txt'
file2 = base_location + 'common-data/nobel-2020/dataset-nyt-nobel2020.txt'
file3 = base_location + 'l1-wordcloud/data/output/Example-CSCE771_001_Fall2020.txt'

In [3]:
import urllib

In [4]:
# Get raw text as string.

text = urllib.request.urlopen(file).read().decode('utf-8')
text2 = urllib.request.urlopen(file2).read().decode('utf-8')
text3 = urllib.request.urlopen(file3).read().decode('utf-8')

# - if files were local 
#with open(file) as f:
#    text = f.read()
#with open(file2) as f:
#    text2 = f.read()
#with open(file3) as f:
#    text3 = f.read()

In [5]:
# To confirm content
#print (text)

In [6]:
# Importing libraries
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords

In [7]:
# -- If not downloaded already
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')

In [8]:
# Routines for cleaning text
NUM_TOPICS = 10
STOPWORDS = stopwords.words('english')
 
def clean_text(text):
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
    return cleaned_text

In [9]:
# For gensim we need to tokenize the data and filter out stopwords
tokenized_data = []

# Adding docs one by one
tokenized_data.append(clean_text(text))
tokenized_data.append(clean_text(text2))
tokenized_data.append(clean_text(text3))

In [10]:
# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)
 
# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]

In [11]:
# Build the LDA model
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)


In [12]:
print("LDA Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))
 
print("=" * 20)

LDA Model:
Topic #0: 0.155*"class=" + 0.084*"blob-code" + 0.077*"blob-num" + 0.075*"js-blob-rnum" + 0.073*"blob-code-inner" + 0.071*"data-line-number=" + 0.065*"js-file-line" + 0.063*"js-line-number" + 0.063*"js-code-nav-line-number" + 0.022*"quot"
Topic #1: 0.101*"class=" + 0.066*"js-line-number" + 0.061*"blob-num" + 0.060*"js-code-nav-line-number" + 0.055*"js-file-line" + 0.055*"data-line-number=" + 0.054*"quot" + 0.045*"js-blob-rnum" + 0.042*"blob-code" + 0.039*"blob-code-inner"
Topic #2: 0.088*"class=" + 0.058*"js-file-line" + 0.051*"js-code-nav-line-number" + 0.044*"js-line-number" + 0.042*"blob-num" + 0.040*"data-line-number=" + 0.036*"blob-code-inner" + 0.034*"blob-code" + 0.034*"js-blob-rnum" + 0.017*"quot"
Topic #3: 0.092*"class=" + 0.070*"quot" + 0.061*"js-blob-rnum" + 0.061*"js-line-number" + 0.058*"js-file-line" + 0.054*"blob-num" + 0.053*"data-line-number=" + 0.051*"blob-code" + 0.047*"js-code-nav-line-number" + 0.046*"blob-code-inner"
Topic #4: 0.137*"class=" + 0.067*"js-

In [13]:
# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)
 

In [14]:
print("LSI Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lsi_model.print_topic(idx, 10))
 
print("=" * 20)

LSI Model:
Topic #0: 0.589*"class=" + 0.283*"data-line-number=" + 0.283*"blob-num" + 0.283*"js-code-nav-line-number" + 0.283*"js-line-number" + 0.283*"blob-code-inner" + 0.283*"js-blob-rnum" + 0.283*"blob-code" + 0.283*"js-file-line" + 0.107*"quot"
Topic #1: 0.946*"quot" + 0.100*"true" + 0.078*"class=" + -0.068*"data-line-number=" + -0.068*"js-blob-rnum" + -0.068*"blob-code-inner" + -0.068*"js-file-line" + -0.068*"blob-num" + -0.068*"blob-code" + -0.068*"js-line-number"
Topic #2: 0.368*"account" + -0.352*"course" + -0.214*"students" + 0.167*"prize" + -0.151*"project" + -0.139*"class" + 0.131*"nobel" + 0.120*"deposit" + 0.117*"may" + -0.114*"assignments"
Topic #3: 
Topic #4: 
Topic #5: 
Topic #6: 
Topic #7: 
Topic #8: 
Topic #9: 


In [16]:
# - Intstall if not already there
# !pip install pyLDAvis

In [17]:
import pyLDAvis.gensim as gensimvis
import pyLDAvis

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [18]:
vis_data1 = gensimvis.prepare(lda_model, corpus, dictionary)

  and should_run_async(code)
  head(R).drop('saliency', 1)


In [19]:
pyLDAvis.display(vis_data1)

  and should_run_async(code)
