# 잠재디리클레할당(LDA, Latent Dirichlet Allocation)

## 환경준비

In [1]:
!pip install pyLDAvis==2.1.2

Collecting pyLDAvis==2.1.2
  Downloading pyLDAvis-2.1.2.tar.gz (1.6 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting wheel>=0.23.0
  Using cached wheel-0.37.1-py2.py3-none-any.whl (35 kB)
Collecting scipy>=0.18.0
  Downloading scipy-1.7.3-cp39-cp39-win_amd64.whl (34.3 MB)
Collecting pandas>=0.17.0
  Downloading pandas-1.4.0-cp39-cp39-win_amd64.whl (10.5 MB)
Collecting jinja2>=2.7.2
  Using cached Jinja2-3.0.3-py3-none-any.whl (133 kB)
Collecting numexpr
  Downloading numexpr-2.8.1-cp39-cp39-win_amd64.whl (88 kB)
Collecting pytest
  Downloading pytest-6.2.5-py3-none-any.whl (280 kB)
Collecting future
  Downloading future-0.18.2.tar.gz (829 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting funcy
  Downloading funcy-1.17-py2.py3-none-any.whl (33 kB)
Collecting MarkupSafe>=2.0
  Using cached MarkupSafe-2.0.1-cp39-cp39-win_amd64.whl (14 kB)
Collecting pytz>=

In [2]:
import warnings
warnings.filterwarnings(action='ignore')

##1)  sklearn 활용

In [3]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

#뉴스 다운로드 및 전처리
def get_news(apply_split=True) :
  #20newsgroup 다운로드
  dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
  documents = dataset.data

  news_df = pd.DataFrame({'document':documents})
  news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ") # 특수 문자 제거
  news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3])) # 길이가 3이하인 단어는 제거 (길이가 짧은 단어 제거)
  news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())# 전체 단어에 대한 소문자 변환
  tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split()) # 토큰화

  stop_words = stopwords.words('english') # NLTK 불용어 조회

  if apply_split :
    return tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])
  else :
    return tokenized_doc.apply(lambda x: ' '.join([item for item in x if item not in stop_words]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [4]:
#공백으로 토큰 분리
def my_tokenizer(text):
    return text.split()

tokenized_docs = get_news(False)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

    
tfidf_vect = TfidfVectorizer(tokenizer = my_tokenizer)
tfidf = tfidf_vect.fit_transform(tokenized_docs)
lda = LatentDirichletAllocation(n_components=20, 
                                max_iter=20, 
                                learning_method='online', 
                                random_state=100)

lda_output = lda.fit_transform(tfidf)

In [6]:
#!pip install pyLDAvis
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda, tfidf, tfidf_vect, mds='tsne')
pyLDAvis.display(vis)

##2) gensim 활용

In [7]:
from gensim import corpora
from gensim.models import LdaModel, TfidfModel

tokenized_docs = get_news()
id2word = corpora.Dictionary(tokenized_docs)
corpus_TDM = [id2word.doc2bow(doc) for doc in tokenized_docs]
tfidf = TfidfModel(corpus_TDM)
corpus_TFIDF = tfidf[corpus_TDM]

n = 20
lda = LdaModel(corpus=corpus_TFIDF,
                    id2word=id2word,
                    num_topics=n, 
                    random_state=100)

for t in lda.print_topics() :
  print(t)

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


(0, '0.003*"people" + 0.002*"think" + 0.002*"jesus" + 0.002*"would" + 0.002*"know" + 0.002*"much" + 0.002*"good" + 0.002*"government" + 0.002*"well" + 0.002*"right"')
(1, '0.002*"yankees" + 0.002*"catbyte" + 0.002*"dtmedin" + 0.002*"ingr" + 0.001*"alomar" + 0.001*"corp" + 0.001*"warrant" + 0.001*"math" + 0.001*"detroit" + 0.001*"gilmour"')
(2, '0.002*"phillies" + 0.002*"hawks" + 0.002*"tiff" + 0.002*"candida" + 0.001*"patent" + 0.001*"tapes" + 0.001*"rosicrucian" + 0.001*"kawasaki" + 0.001*"partition" + 0.001*"lens"')
(3, '0.002*"polygon" + 0.002*"intellect" + 0.002*"chastity" + 0.002*"cadre" + 0.002*"shameful" + 0.002*"skepticism" + 0.001*"banks" + 0.001*"surrender" + 0.001*"gordon" + 0.001*"backup"')
(4, '0.003*"would" + 0.003*"people" + 0.003*"like" + 0.003*"think" + 0.002*"said" + 0.002*"many" + 0.002*"know" + 0.002*"time" + 0.002*"israel" + 0.002*"could"')
(5, '0.002*"space" + 0.002*"satellite" + 0.001*"sleep" + 0.001*"score" + 0.001*"nubus" + 0.001*"indiana" + 0.001*"orbit" + 0.0

In [8]:
corpus_TDM = [id2word.doc2bow(doc) for doc in tokenized_docs]
tfidf = TfidfModel(corpus_TDM)
corpus_TFIDF = tfidf[corpus_TDM]

In [9]:

import pyLDAvis.gensim 

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus_TFIDF, id2word, mds='tsne')
pyLDAvis.display(vis)

ModuleNotFoundError: No module named 'pyLDAvis.gensim'