In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Eight newsgroups: two each from the rec, comp and sci hierarchies,
# plus one talk group and one soc group.
cats = [
    'rec.motorcycles',
    'rec.sport.baseball',
    'comp.graphics',
    'comp.windows.x',
    'talk.politics.mideast',
    'soc.religion.christian',
    'sci.electronics',
    'sci.med',
]

# NOTE: despite its name, `df` is whatever fetch_20newsgroups returns (not
# necessarily a DataFrame); only its `.data` attribute is used below.
# Headers, footers and quoted replies are removed from each post.
df = fetch_20newsgroups(
    subset='all',
    categories=cats,
    remove=('headers', 'footers', 'quotes'),
    random_state=0,
)

# Count unigrams and bigrams with English stop words removed, keeping at most
# 1000 features; min_df / max_df bound the document frequency of kept terms.
cv = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    max_features=1000,
)
feat = cv.fit_transform(df.data)
print(feat.shape)

(7862, 1000)


In [2]:
# Eight topics, matching the number of newsgroup categories selected earlier
# (presumably intentional). fit() returns the estimator itself, so building
# and fitting the model can be chained into one statement.
lda = LatentDirichletAllocation(n_components=8, random_state=0).fit(feat)

# components_ has one row per topic and one column per vocabulary term.
print(lda.components_.shape)
lda.components_

(8, 1000)


array([[2.46251560e+02, 1.18842248e+02, 1.51715288e+02, ...,
        1.00147234e+02, 7.63673375e+01, 1.17028758e+02],
       [1.25033020e-01, 1.25052288e-01, 1.25003012e-01, ...,
        1.10644583e+02, 1.51405141e-01, 5.09788954e+01],
       [1.25103419e-01, 1.25075224e-01, 1.25082214e-01, ...,
        6.72008817e+01, 1.25138615e-01, 2.48516614e+00],
       ...,
       [1.05055615e+02, 4.94858011e-01, 2.52075927e+01, ...,
        1.80695744e+01, 1.25115936e-01, 8.33321314e+00],
       [1.25147502e-01, 2.27058083e+02, 5.45176328e+00, ...,
        1.41751120e+00, 7.67217701e+01, 4.49861794e+01],
       [1.25096012e-01, 4.05666840e+00, 1.25049904e-01, ...,
        1.63821915e+02, 1.25049991e-01, 1.49550227e-01]])

In [3]:
def display_topics(model, names, words):
    """Print the highest-weighted feature names of every topic in ``model``.

    For each row of ``model.components_`` a header line ``Topic # <index>`` is
    printed, followed by one line containing the top terms separated by
    single spaces, ordered from the largest weight to the smallest.

    Parameters
    ----------
    model : object
        Anything exposing a ``components_`` attribute whose iteration yields
        one weight vector per topic; each vector must support ``argsort()``.
    names : sequence of str
        Feature names, indexable by the column positions of ``components_``.
    words : int
        Number of top terms to show for each topic.

    Returns
    -------
    None
        The function only writes to standard output.
    """
    for topic_no, weights in enumerate(model.components_):
        print('Topic #', topic_no)

        # argsort() is ascending, so reverse it to rank the heaviest terms first.
        ranked = weights.argsort()[::-1]

        top_terms = []
        for feature_idx in ranked[:words]:
            top_terms.append(names[feature_idx])
        print(' '.join(top_terms))

In [4]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and fall back to the old
# accessor only on releases that predate it. The result is converted to a
# list so `names` keeps the type the old method returned.
if hasattr(cv, 'get_feature_names_out'):
    names = list(cv.get_feature_names_out())
else:
    names = cv.get_feature_names()

# Show the 15 highest-weighted terms of each fitted topic.
display_topics(lda, names, 15)

Topic # 0
year said don didn know game just time went people think did like say home
Topic # 1
god people jesus church think believe christ say does don christian know christians bible faith
Topic # 2
know does thanks like question information help time post advance book just looking group read
Topic # 3
edu com graphics mail ftp information available data pub list computer send software ca 3d
Topic # 4
israel jews jewish israeli dos dos arab turkish people war turkey dos state government greek history
Topic # 5
file image use program window jpeg windows display version color server files using available motif
Topic # 6
armenian armenians people health medical armenia disease turkish patients cancer russian 10 azerbaijan children 92
Topic # 7
like just don ve use good think time know way make used bike want need
