In [1]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Tunable sizes shared by the cells below.
# Number of documents kept from the fetched corpus (used to slice dataset.data).
n_samples = 2000
# Passed as max_features to both vectorizers.
n_features = 1000
# Number of topics requested from both the NMF and the LDA model.
n_topics = 10
# Number of terms print_top_words shows for each topic.
n_top_words = 20

In [3]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic held by ``model``.

    Parameters
    ----------
    model : object exposing ``components_``
        ``components_`` is iterated row by row; each row must support
        ``argsort()``.
    feature_names : sequence
        Indexed with the positions produced by ``argsort()`` to obtain the
        term that is printed.
    n_top_words : int
        Number of terms to print per topic, largest weight first.

    For each row a ``Topic #<index>:`` header is printed, followed by the
    selected terms joined with single spaces. One blank line is printed
    after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        # argsort() is ascending: flip it, then keep the leading positions.
        ranked_positions = weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[pos] for pos in ranked_positions]
        print(" ".join(top_terms))
    print()

In [4]:
# Fetch the 20 Newsgroups corpus with headers, footers and quoted replies
# removed, then keep only the first n_samples documents.
print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
data_samples = dataset.data[:n_samples]
load_seconds = time() - t0
print("done in %0.3fs." % load_seconds)

Loading dataset...
done in 1.903s.


In [5]:
# NMF is fitted on tf-idf weights: build the vectorizer with document-frequency
# bounds, a vocabulary capped at n_features and English stop words, then time
# only the fit_transform call.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=n_features,
    stop_words='english',
)
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
tfidf_seconds = time() - t0
print("done in %0.3fs." % tfidf_seconds)

Extracting tf-idf features for NMF...
done in 0.493s.


In [11]:
# Display the dimensions of the tf-idf matrix returned by fit_transform.
tfidf.shape

(2000, 1000)

In [12]:
# LDA is fitted on raw term counts: configure a CountVectorizer with the same
# document-frequency bounds, vocabulary cap and stop-word list as the tf-idf
# vectorizer, then time only the fit_transform call.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=n_features,
    stop_words='english',
)
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
tf_seconds = time() - t0
print("done in %0.3fs." % tf_seconds)

Extracting tf features for LDA...
done in 0.498s.


In [13]:
# Display the dimensions of the term-count matrix returned by fit_transform.
tf.shape

(2000, 1000)

In [14]:
# Fit an NMF decomposition with n_topics components on the tf-idf matrix.
# The timer starts before the estimator is constructed, so construction is
# included in the reported duration.
# NOTE(review): newer scikit-learn releases replace `alpha` with
# `alpha_W`/`alpha_H` -- confirm the target version before upgrading.
print("Fitting the NMF model with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(
    n_components=n_topics,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
)
nmf.fit(tfidf)
nmf_seconds = time() - t0
print("done in %0.3fs." % nmf_seconds)


Fitting the NMF model with tf-idf features, n_samples=2000 and n_features=1000...
done in 0.332s.


In [15]:
# Show the n_top_words highest-weighted terms of every NMF topic.
print("\nTopics in NMF model:")
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `get_feature_names_out()`. Use whichever accessor this install
# provides so the cell runs on both old and new releases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)


Topics in NMF model:
Topic #0:
just people don like think know good time make use way really right say ve want government did ll going
Topic #1:
windows file dos files program using use os problem help running drivers pc ftp ms version available screen software work
Topic #2:
god jesus bible faith christian christ christians does heaven sin believe lord life church mary atheism belief human love religion
Topic #3:
edu soon com send university internet mit ftp mail cc article pub information hope mac email blood home contact program
Topic #4:
thanks know does mail advance hi info interested email anybody card looking help like appreciated information video send list need
Topic #5:
drive drives hard disk floppy software mac scsi computer controller power apple mb rom pc problem card internal problems cable
Topic #6:
window manager application motif problem display graphics use standard time possible try using screen tried doesn faq sun certain area
Topic #7:
game team games year win pla

In [16]:
# Fit an online-learning LDA model with n_topics topics on the raw term
# counts; only the fit() call is timed.
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
# The topic count is passed positionally on purpose: the keyword was called
# `n_topics` in old scikit-learn, renamed to `n_components` in 0.19, and the
# old spelling was removed in 0.21. It is the first constructor argument in
# every release, so the positional form works with all of them.
lda = LatentDirichletAllocation(n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

Fitting LDA models with tf features, n_samples=2000 and n_features=1000...
done in 4.159s.


In [17]:
# Show the n_top_words highest-weighted terms of every LDA topic.
print("\nTopics in LDA model:")
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `get_feature_names_out()`. Use whichever accessor this install
# provides so the cell runs on both old and new releases.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0:
edu com mail send graphics ftp pub available contact university list faq ca information cs 1993 program sun uk mit
Topic #1:
don like just know think ve way use right good going make sure ll point got need really time doesn
Topic #2:
christian think atheism faith pittsburgh new bible radio games alt lot just religion like book read play time subject believe
Topic #3:
drive disk windows thanks use card drives hard version pc software file using scsi help does new dos controller 16
Topic #4:
hiv health aids disease april medical care research 1993 light information study national service test led 10 page new drug
Topic #5:
god people does just good don jesus say israel way life know true fact time law want believe make think
Topic #6:
55 10 11 18 15 team game 19 period play 23 12 13 flyers 20 25 22 17 24 16
Topic #7:
car year just cars new engine like bike good oil insurance better tires 000 thing speed model brake driving performance
Topic #8:
people said

In [19]:
# Peek at the first 20 vocabulary terms of the count vectorizer.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer
# `get_feature_names_out()` when it exists and fall back otherwise.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    vocabulary_terms = tf_vectorizer.get_feature_names_out()
else:
    vocabulary_terms = tf_vectorizer.get_feature_names()
# list(...) keeps the displayed value a plain list whichever accessor ran.
list(vocabulary_terms[:20])

[u'00',
 u'000',
 u'10',
 u'100',
 u'11',
 u'12',
 u'128',
 u'13',
 u'130',
 u'14',
 u'15',
 u'16',
 u'17',
 u'18',
 u'19',
 u'1992',
 u'1993',
 u'20',
 u'200',
 u'21']

In [22]:
# Display the fitted LDA weight matrix; print_top_words iterates over its rows,
# one per topic.
lda.components_

array([[  4.96604155,   4.3537397 ,  21.42539886, ...,   1.57926488,
          1.33933502,   1.20988436],
       [  0.48391034,   1.85845783,  14.04720958, ...,  74.59501615,
         59.36116266,   0.27698642],
       [  0.18708486,   0.13728929,   0.31409364, ...,   1.02679042,
          2.56259123,   0.13662652],
       ..., 
       [  3.22343848,  39.1368944 ,  11.24910558, ...,  23.37779481,
          3.06315114,   0.15230766],
       [  1.41871388,  47.53082031,  16.14390001, ...,  82.46751192,
         16.51319941,  28.11660323],
       [  4.02759659,   1.24781464,  13.26101699, ...,  29.0225105 ,
          0.24834416,   0.13033208]])

In [23]:
# Display the dimensions of the LDA weight matrix; the recorded output
# (10, 1000) matches n_topics x n_features as configured above.
lda.components_.shape

(10, 1000)