### INF 385
Text mining, topic modeling and all that.

In [1]:
# hide warnings to keep things tidy.
import warnings
warnings.filterwarnings('ignore')

import string

# NLTK (Natural Language Toolkit; http://www.nltk.org/)
# is a super-useful package for natural language processing
# stuff 
import nltk 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


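##
# Note: the NLTK functions used below rely on data resources that must be
# downloaded separately; run nltk.download() once to fetch them (the
# tokenizer, POS tagger, NE chunker and stopword list are all used here).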
import gensim # topic modeling
from gensim import corpora, models

In [2]:
# example sentence used by the tokenizing / tagging / chunking cells below
sentence = "Hi, my name is Byron"

In [3]:
# Split the sentence into word and punctuation tokens with NLTK's
# word tokenizer, then show them.
tokens = nltk.tokenize.word_tokenize(sentence)
print(tokens)

['Hi', ',', 'my', 'name', 'is', 'Byron']


In [4]:
# Tag each token with its part of speech; the result is a list of
# (token, tag) pairs.
pos_tags = nltk.tag.pos_tag(tokens)
print(pos_tags)

[('Hi', 'NNP'), (',', ','), ('my', 'PRP$'), ('name', 'NN'), ('is', 'VBZ'), ('Byron', 'NNP')]


In [5]:
# Chunk the tagged tokens into named entities and print the resulting tree.
# With binary=True the entities are labelled just 'NE' (as in the output
# below) rather than by entity type.
ne_tree = nltk.chunk.ne_chunk(pos_tags, binary=True)
print(ne_tree)

(S (NE Hi/NNP) ,/, my/PRP$ name/NN is/VBZ (NE Byron/NNP))


Credit for this example goes to Jordan Barber: https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html (I have modified it a bit).

In [6]:
# Five short sample documents that serve as the toy corpus for the topic
# model. The list is written out first; the individual names are then bound
# to its elements.
doc_set = [
    "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.",
    "My mother spends a lot of time driving my brother around to baseball practice.",
    "Some health experts suggest that driving may cause increased tension and blood pressure.",
    "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.",
    "Health professionals say that brocolli is good for your health.",
]

# keep the per-document names available as well
doc_a, doc_b, doc_c, doc_d, doc_e = doc_set

In [7]:
# now 'tokenize' these: lowercase each document, split it into tokens with
# NLTK, and drop English stop words (like 'the', 'a', etc.) and punctuation.

# The stop list does not depend on the document, so build it once up front
# instead of once per document. string.punctuation is a str, so extend()
# adds each punctuation character as its own entry.
stop_list = stopwords.words('english')
stop_list.extend(string.punctuation)
# set copy of the stop list for fast membership tests in the filter below
stop_set = set(stop_list)

tokenized_docs = []
for doc in doc_set:
    # lowercase the words
    doc_lowercase = doc.lower()
    # now tokenize via NLTK
    doc_tokens = nltk.tokenize.word_tokenize(doc_lowercase)
    # keep only the tokens that are neither stop words nor punctuation
    doc_tokens = [word for word in doc_tokens if word not in stop_set]
    tokenized_docs.append(doc_tokens)

In [8]:
# so we have a list of tokens (words) for each document; show the first one
print(tokenized_docs[0])

['brocolli', 'good', 'eat', 'brother', 'likes', 'eat', 'good', 'brocolli', 'mother']


In [9]:
# Reduce every token to its stem with a Porter stemmer, keeping the
# one-list-per-document structure, then show the first document.
p_stemmer = PorterStemmer()
tokenized_and_stemmed = [
    list(map(p_stemmer.stem, doc)) for doc in tokenized_docs
]
print(tokenized_and_stemmed[0])

['brocolli', 'good', 'eat', 'brother', 'like', 'eat', 'good', 'brocolli', 'mother']


In [10]:
# Build the gensim Dictionary object from the stemmed documents; printing
# it shows how many unique tokens it holds.
dictionary = gensim.corpora.Dictionary(documents=tokenized_and_stemmed)
print(dictionary)

Dictionary(32 unique tokens: ['time', 'expert', 'good', 'lot', 'pressur']...)


In [11]:
# Assemble the corpus by converting each stemmed document into its
# bag-of-words vector, then show the vector of the first document.
corpus = list(map(dictionary.doc2bow, tokenized_and_stemmed))
print(corpus[0])

[(0, 2), (1, 2), (2, 2), (3, 1), (4, 1), (5, 1)]


This list of tuples represents our first document. Each tuple is a (term ID, term frequency) pair, so if `print(dictionary.token2id)` says that brocolli's ID is 0, then the first tuple indicates that brocolli appeared twice in `doc_a`. `doc2bow()` only includes terms that actually occur: terms that do not occur in a document will not appear in that document's vector.

Now we are ready to fit our LDA topic model.

In [12]:
# Fit an LDA topic model with 3 topics, making 20 passes over the corpus.
# LDA training is randomized, so pin random_state; otherwise the topics
# (and their order) can differ from one run of the notebook to the next.
lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=3,
                                            id2word = dictionary,
                                            passes=20,
                                            random_state=42)

In [13]:
# show the top 3 words (with their weights) for each of the 3 topics
lda_model.print_topics(num_topics=3, num_words=3)

['0.065*pressur + 0.065*health + 0.064*drive',
 '0.074*brother + 0.074*mother + 0.074*drive',
 '0.130*brocolli + 0.130*good + 0.091*eat']