In [2]:
# `from __future__` imports have to be the first statement of a module.
# IPython tolerates them later in a cell, but putting this one first keeps
# the cell valid when the notebook is exported as a plain script.
from __future__ import unicode_literals

import textacy
import textacy.datasets
import spacy

# Disabled in the original; no cell shown in this notebook uses `nlp`.
#nlp=spacy.load('en')

# Report the library versions so the outputs below can be tied to a known
# setup. Each line is formatted into a single string first, so print() gives
# the same output under Python 2 and Python 3.
print('textacy version: {}'.format(textacy.__version__))
print('spacy version: {}'.format(spacy.__version__))

textacy version: 0.4.1
spacy version: 1.9.0


### Prepare corpus

In [3]:
from textacy.vsm import Vectorizer
from textacy.tm.topic_model import TopicModel

# Handle on textacy's CapitolWords dataset.
cw = textacy.datasets.CapitolWords()

# Read at most 1000 records and split them on the 'text' field into a
# stream of texts and a stream of the accompanying metadata.
text_stream, metadata_stream = textacy.fileio.split_record_fields(
    cw.records(limit=1000),
    'text',
    itemwise=False,
)

# Build an English ('en') corpus from both streams. The bare name on the
# last line makes the notebook display the corpus summary.
corpus = textacy.Corpus(
    'en',
    texts=text_stream,
    metadatas=metadata_stream,
)
corpus

Corpus(1000 docs; 538150 tokens)

### Vectorize words and create DTM

In textacy, the `Vectorizer` class transforms one or more tokenized documents into a document-term matrix (DTM).

The DTM has shape (#docs, #unique terms). The weighting scheme can be one of: tf, tf-idf, or binary.


The first time we use a vectorizer, we need to initialize it with parameters. Transforming the term lists into a DTM with `.fit_transform()` is also what trains (fits) the vectorizer. To reuse the same fitted vectorizer later, call `.transform()` instead of `.fit_transform()`.

Note: every time we run a transform, we need to supply the term lists again.

In [38]:
# One term list per document, taken from the first 500 documents of the
# corpus (not the first 500 words). This is a generator expression, so it
# can be consumed only once and must be rebuilt before each vectorizer call.
terms_list = (
    doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
    for doc in corpus[:500]
)

# Configure the vectorizer.
vectorizer = Vectorizer(
    weighting='tfidf',
    normalize=True,
    smooth_idf=True,
    min_df=3,
    max_df=0.95,
    max_n_terms=100000,
)

# Fit the vectorizer on the term lists and obtain the document-term matrix
# in the same step.
doc_term_matrix = vectorizer.fit_transform(terms_list)

# Uncomment to inspect the matrix:
#print(doc_term_matrix)

### Topic modeling

Topic models in Textacy are built on scikit-learn. 


In [21]:
# Initialize an NMF topic model; n_topics is the number of topics.
model = textacy.tm.TopicModel('nmf', n_topics=10)

# Train the model on the document-term matrix.
model.fit(doc_term_matrix)

# Use the trained model to map the same matrix onto topics.
doc_topic_matrix = model.transform(doc_term_matrix)

# print() with one argument behaves identically on Python 2 and Python 3,
# unlike the Python 2-only print statement used before.
print(doc_topic_matrix)

[[ 0.10034641  0.0253678   0.         ...,  0.          0.04499283
   0.00073213]
 [ 0.07038761  0.          0.         ...,  0.00804994  0.          0.        ]
 [ 0.13657403  0.00983649  0.         ...,  0.          0.07124204  0.        ]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.02743931  0.10817356  0.06424196 ...,  0.          0.          0.08101567]
 [ 0.          0.0272248   0.         ...,  0.          0.          0.39016118]]


In [36]:
# Save the fitted model under a path relative to the working directory.
# The previous user-specific absolute path exists only on the author's
# machine, so the cell failed everywhere else. Change MODEL_PATH to store
# the file somewhere else.
MODEL_PATH = 'mymodel_nmf.pkl'
model.save(MODEL_PATH)

### Model Interpretation

Input parameters:

topics / docs: a list of indices (e.g. `[0, 1, ...]` or `range(7)`) selecting which topics or docs to report

top_n: number of results returned for each selected topic or doc

In [25]:
# Show the top terms that occur in the selected topics.
#
# The output line is formatted into one string before printing. Without
# `from __future__ import print_function`, Python 2 reads
# print('topic', topic_idx, ...) as printing a tuple, which shows up as
# (u'topic', 0, u':', u'...') instead of a readable line.
for topic_idx, top_terms in model.top_topic_terms(
        vectorizer.id_to_term, topics=[0, 5], top_n=10):
    print('topic {} : {}'.format(topic_idx, '   '.join(top_terms)))

(u'topic', 0, u':', u'people   go   want   bill   think   money   $   program   work   year')
(u'topic', 5, u':', u'amendment   chairman   clerk   designate   offer   mr.   sanders   withdraw   vermont   gentleman')


In [28]:
# For each selected topic, print the topic index followed by the titles of
# its top documents.

top_docs_by_topic = model.top_topic_docs(doc_topic_matrix, topics=[2, 7], top_n=3)
for topic_idx, top_docs in top_docs_by_topic:
    print(topic_idx)
    for doc_id in top_docs:
        # doc_id indexes into the corpus; the title comes from its metadata.
        title = corpus[doc_id].metadata['title']
        print(title)

2
EMERGENCY RELIEF
REBUTTAL TO PRESIDENTIAL SPEECH
FOREIGN RELATIONS AUTHORIZATION ACT, FISCAL YEARS 1996 AND 1997-- CONFERENCE REPORT
7
FOREIGN OPERATIONS, EXPORT FINANCING, AND RELATED PROGRAMS APPROPRIATIONS ACT, 1997
RECESS
BALANCED BUDGET DOWNPAYMENT ACT, II


In [34]:
# Show which topics rank highest in the selected documents.
#
# The line is formatted into one string before printing: under Python 2
# without `print_function`, print(a, ':', b) prints a tuple repr such as
# (u'TITLE', u':', (0, 8, 9, 7)) rather than "TITLE : (0, 8, 9, 7)".
for doc_idx, topics in model.top_doc_topics(doc_topic_matrix, docs=[3, 4], top_n=4):
    title = corpus[doc_idx].metadata['title']
    print('{} : {}'.format(title, topics))

(u"EXAMINING THE SPEAKER'S UPCOMING TRAVEL SCHEDULE", u':', (0, 8, 9, 7))
(u'FLOODING IN PENNSYLVANIA', u':', (0, 9, 8, 7))


In [35]:
# Print the weight of each topic, one "index weight" pair per line.
#
# Formatting into a single string avoids the Python 2 behaviour where
# print(i, val) prints the tuple (i, val) instead of two separate values.
for i, val in enumerate(model.topic_weights(doc_topic_matrix)):
    print('{} {}'.format(i, val))

(0, 0.31936436172376825)
(1, 0.11837332487112134)
(2, 0.097933043951754895)
(3, 0.083756427304841438)
(4, 0.098375356008242557)
(5, 0.070792376456240674)
(6, 0.044612691461782955)
(7, 0.045166978725944329)
(8, 0.057162099048094439)
(9, 0.064463340448209144)


### Model Visualization

In [37]:
# Draw a termite plot of the fitted model for 25 terms, with the terms
# sorted by seriation. topics=-1 presumably selects every topic; confirm
# against the textacy documentation.

import matplotlib.pyplot as plt
model.termite_plot(
    doc_term_matrix,
    vectorizer.id_to_term,
    topics=-1,
    n_terms=25,
    sort_terms_by='seriation',
)
# Left disabled as in the original:
#plt.show()

<matplotlib.axes._subplots.AxesSubplot at 0x14ec34150>