In [4]:
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

In [5]:
# tokenizer whose tokens are the runs of word characters matched by \w+
tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# set copy of the stop words: membership tests below become hash lookups
# instead of a scan over the whole list for every token
en_stop_lookup = set(en_stop)

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

# create sample documents
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health."

# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# one entry per document: the list of its lower-cased, stop-word-filtered,
# stemmed tokens
texts = []

# loop through document list
# (distinct names for the document and the tokens: reusing a single name for
# the loop variable and the comprehension variables lets the comprehensions
# overwrite the loop variable on Python 2)
for doc in doc_set:

    # clean and tokenize document string
    raw = doc.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [token for token in tokens if token not in en_stop_lookup]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]

    # add tokens to list
    texts.append(stemmed_tokens)

print(texts)

[[u'brocolli', u'good', u'eat', u'brother', u'like', u'eat', u'good', u'brocolli', u'mother'], [u'mother', u'spend', u'lot', u'time', u'drive', u'brother', u'around', u'basebal', u'practic'], [u'health', u'expert', u'suggest', u'drive', u'may', u'caus', u'increas', u'tension', u'blood', u'pressur'], [u'often', u'feel', u'pressur', u'perform', u'well', u'school', u'mother', u'never', u'seem', u'drive', u'brother', u'better'], [u'health', u'profession', u'say', u'brocolli', u'good', u'health']]


In [6]:
# build the id <-> term dictionary from the tokenized documents
dictionary = corpora.Dictionary(texts)

# print every term next to its integer id, walking the ids in ascending order
vocab_size = len(dictionary)
for token_id in range(vocab_size):
    print('{0}: {1}'.format(token_id, dictionary[token_id]))

0: brocolli
1: good
2: like
3: brother
4: mother
5: eat
6: around
7: basebal
8: drive
9: lot
10: time
11: spend
12: practic
13: tension
14: expert
15: may
16: suggest
17: caus
18: pressur
19: health
20: blood
21: increas
22: school
23: often
24: feel
25: never
26: well
27: better
28: perform
29: seem
30: say
31: profession


In [7]:
# convert each tokenized document into its bag-of-words form via the
# dictionary, keeping the documents in their original order
corpus = []
for text in texts:
    corpus.append(dictionary.doc2bow(text))

print(corpus)

[[(0, 2), (1, 2), (2, 1), (3, 1), (4, 1), (5, 2)], [(3, 1), (4, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)], [(8, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1)], [(3, 1), (4, 1), (8, 1), (18, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1)], [(0, 1), (1, 1), (19, 2), (30, 1), (31, 1)]]


In [8]:
# generate LDA model
# LDA training is randomized, so without a fixed seed the learned topics (and
# every result computed from the model afterwards) change on each run.
# Passing random_state makes a fresh-kernel re-run repeatable.
# NOTE: outputs recorded from an earlier unseeded run will not match exactly.
LDA_RANDOM_STATE = 42
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=20,
    random_state=LDA_RANDOM_STATE,
)

print(ldamodel)

LdaModel(num_terms=32, num_topics=3, decay=0.5, chunksize=2000)


In [9]:
# summarize each of the 3 topics by 4 of its terms and their weights
topic_summaries = ldamodel.print_topics(num_topics=3, num_words=4)
print(topic_summaries)

[(0, u'0.140*"health" + 0.080*"brocolli" + 0.080*"good" + 0.080*"profession"'), (1, u'0.082*"mother" + 0.082*"brother" + 0.057*"eat" + 0.057*"drive"'), (2, u'0.065*"pressur" + 0.065*"drive" + 0.064*"may" + 0.064*"blood"')]


In [20]:
# Document-topic matrix: X[d][t] is the weight of topic t in document d.
#
# get_document_topics() yields (topic_id, probability) pairs and can leave out
# topics whose probability is below the model's minimum-probability cutoff.
# Appending the probabilities in the order returned would then give rows of
# different lengths whose columns no longer line up with topic ids. Each row is
# therefore pre-filled with 0.0 and written by topic id, so every row has
# exactly ldamodel.num_topics entries.
X = []
for doc in corpus:
    doc_rel = [0.0] * ldamodel.num_topics
    for topic_id, prob in ldamodel.get_document_topics(doc):
        doc_rel[topic_id] = prob
    X.append(doc_rel)
print(X)

[[0.03677226496815364, 0.92966997931160267, 0.033557755720243831], [0.033789172460376563, 0.93200945265102164, 0.034201374888601796], [0.0318222820656427, 0.031070727352879149, 0.93710699058147817], [0.026014331943600272, 0.94707144940938903, 0.026914218647010782], [0.90154308147739137, 0.049792826203051949, 0.048664092319556651]]
