In [1]:
from sklearn.datasets import fetch_20newsgroups

# Newsgroup categories to keep from the 20 Newsgroups corpus.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
    'comp.sys.ibm.pc.hardware',
    'sci.crypt',
]

# Load only the training split, restricted to the categories above.
newsgroups_train = fetch_20newsgroups(
    categories=categories,
    subset='train',
)

# Quick sanity check: how many documents were loaded and which labels exist.
print(
    '#Train set size:',
    len(newsgroups_train.data),
)
print(
    '#Selected categories:',
    newsgroups_train.target_names,
)

#Train set size: 3219
#Selected categories: ['alt.atheism', 'comp.graphics', 'comp.sys.ibm.pc.hardware', 'sci.crypt', 'sci.space', 'talk.religion.misc']


In [3]:
from sklearn.feature_extraction.text import CountVectorizer 

# Build a bag-of-words count matrix from the training documents.
#
# token_pattern is written as a raw string so that `\w` is handed to the regex
# engine verbatim. In a normal string literal `\w` is an invalid escape
# sequence, which triggers a DeprecationWarning/SyntaxWarning on recent Python
# versions. The compiled pattern itself is unchanged.
cv = CountVectorizer(
    token_pattern=r"[\w']{3,}",  # tokens: runs of 3+ word characters or apostrophes
    stop_words='english',        # use scikit-learn's built-in English stop-word list
    max_features=2000,           # cap on vocabulary size
    min_df=5,                    # lower document-frequency cutoff (absolute count)
    max_df=0.5,                  # upper document-frequency cutoff (proportion of documents)
)

# Learn the vocabulary and return the document-term count matrix in one step.
review_cv = cv.fit_transform(newsgroups_train.data)

In [6]:
# Topic modelling with LDA (scikit-learn's LatentDirichletAllocation).
# Hyper-parameters set below: n_components, max_iter, topic_word_prior,
# doc_topic_prior, learning_method, n_jobs, random_state

from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# Print NumPy floats with three digits of precision.
np.set_printoptions(precision=3)

# Configure the topic model.
lda = LatentDirichletAllocation(
    n_components=10,            # number of topics to extract
    learning_method='online',
    max_iter=5,
    doc_topic_prior=1.0,
    topic_word_prior=0.1,
    random_state=0,
    n_jobs=-1,                  # number of processors to use
)

# Fit on the count matrix and obtain the per-document topic weights.
review_topics = lda.fit_transform(review_cv)
print('#shape of review_topics:', review_topics.shape)
print('#Sample of review_topics:', review_topics[0])

# Average each topic's weight over all documents (axis 0).
# NOTE: the printed label says "Sum", but the value shown is this mean.
gross_topic_weights = review_topics.mean(axis=0)
print('#Sum of topic weights of documents:', gross_topic_weights)

# The fitted topic-word matrix is exposed as `components_`.
print('#shape of topic word distribution:', lda.components_.shape)

#shape of review_topics: (3219, 10)
#Sample of review_topics: [0.903 0.007 0.027 0.008 0.007 0.008 0.007 0.007 0.007 0.018]
#Sum of topic weights of documents: [0.087 0.083 0.085 0.115 0.115 0.126 0.098 0.072 0.07  0.148]
#shape of topic word distribution: (10, 2000)
