<a href="https://colab.research.google.com/github/cateto/python4NLP/blob/main/topic_modeling/LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
# Fetch the 20 Newsgroups posts in a shuffled (seeded) order, with headers,
# footers and quoted replies removed from each post.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
# Keep the raw post texts under a short name for the following cells.
document = dataset.data
# Show how many posts were loaded.
len(document)

11314

In [2]:
# Inspect the raw text of the post at index 1.
document[1]

"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

In [3]:
# Print the newsgroup names exposed by the dataset object.
print(dataset.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [4]:
news_df = pd.DataFrame({'document': document})
# Build the cleaned text in a single assignment:
#   1. Replace every non-letter character with a space. `regex=True` must be
#      passed explicitly: pandas >= 2.0 treats the pattern as a literal string
#      by default, which would silently leave all special characters in place.
#   2. Drop words of three characters or fewer (a heuristic suited to English),
#      then lowercase what remains.
news_df['clean_doc'] = (
    news_df['document']
    .str.replace(r"[^a-zA-Z]", " ", regex=True)
    .apply(lambda text: ' '.join(w for w in text.split() if len(w) > 3).lower())
)

  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
# Show the cleaned text of the post at index 1.
news_df['clean_doc'][1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy your logic runs steam sorry pity sorry that have these feelings denial about faith need well just pretend that will happily ever after anyway maybe start newsgroup atheist hard bummin much forget your flintstone chewables bake timmons'

In [6]:
import nltk
# Download NLTK's 'stopwords' resource.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from nltk.corpus import stopwords
# Load the English stop-word list.
stop_words = stopwords.words('english')
# Membership tests against a set are constant-time; testing against the list
# would rescan it for every token of every document.
stop_word_set = set(stop_words)
# Tokenize on whitespace and keep only the tokens that are not stop words.
tokenized_doc = news_df['clean_doc'].apply(
    lambda text: [token for token in text.split() if token not in stop_word_set]
)

In [8]:
# Print the token list of the post at index 1.
print(tokenized_doc[1])

['yeah', 'expect', 'people', 'read', 'actually', 'accept', 'hard', 'atheism', 'need', 'little', 'leap', 'faith', 'jimmy', 'logic', 'runs', 'steam', 'sorry', 'pity', 'sorry', 'feelings', 'denial', 'faith', 'need', 'well', 'pretend', 'happily', 'ever', 'anyway', 'maybe', 'start', 'newsgroup', 'atheist', 'hard', 'bummin', 'much', 'forget', 'flintstone', 'chewables', 'bake', 'timmons']


In [9]:
# Preview the first five tokenized posts.
tokenized_doc[:5]

0    [well, sure, story, seem, biased, disagree, st...
1    [yeah, expect, people, read, actually, accept,...
2    [although, realize, principle, strongest, poin...
3    [notwithstanding, legitimate, fuss, proposal, ...
4    [well, change, scoring, playoff, pool, unfortu...
Name: clean_doc, dtype: object

In [10]:
from gensim import corpora
# Build the token <-> integer id mapping from all tokenized posts.
dictionary = corpora.Dictionary(tokenized_doc)
# Turn every post into its bag-of-words form: a list of (token_id, count) pairs.
corpus = list(map(dictionary.doc2bow, tokenized_doc))
# Show the bag-of-words encoding of the post at index 1.
print(corpus[1])

[(52, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 1), (88, 1), (89, 1)]


In [11]:
# Look up which token the dictionary stores under id 66.
print(dictionary[66])

faith


In [12]:
# Number of entries in the dictionary.
len(dictionary)

64281

In [13]:
import gensim
NUM_TOPICS = 20
# Fixed seed for LDA training. Without it the model is initialised randomly,
# so topic ids and top words change on every run and the notebook's recorded
# output cannot be reproduced.
RANDOM_STATE = 1
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=RANDOM_STATE,
)
# Print the four highest-weighted words of every topic; passing NUM_TOPICS keeps
# the listing complete if the constant above is changed.
topics = ldamodel.print_topics(num_topics=NUM_TOPICS, num_words=4)
for topic in topics:
  print(topic)

(0, '0.013*"health" + 0.011*"medical" + 0.008*"disease" + 0.007*"among"')
(1, '0.009*"president" + 0.008*"government" + 0.006*"states" + 0.006*"state"')
(2, '0.028*"space" + 0.011*"nasa" + 0.006*"launch" + 0.006*"earth"')
(3, '0.038*"file" + 0.027*"output" + 0.027*"entry" + 0.016*"program"')
(4, '0.013*"encryption" + 0.012*"chip" + 0.011*"keys" + 0.010*"security"')
(5, '0.007*"using" + 0.006*"files" + 0.006*"system" + 0.006*"code"')
(6, '0.012*"people" + 0.010*"said" + 0.006*"armenians" + 0.006*"killed"')
(7, '0.009*"card" + 0.007*"system" + 0.007*"scsi" + 0.007*"drive"')
(8, '0.012*"engine" + 0.007*"unit" + 0.006*"block" + 0.005*"radar"')
(9, '0.017*"stream" + 0.012*"filename" + 0.009*"contest" + 0.009*"length"')
(10, '0.018*"game" + 0.017*"team" + 0.012*"play" + 0.012*"games"')
(11, '0.019*"would" + 0.014*"like" + 0.012*"know" + 0.010*"time"')
(12, '0.024*"armenian" + 0.017*"turkish" + 0.013*"armenians" + 0.010*"genocide"')
(13, '0.016*"year" + 0.008*"last" + 0.007*"runs" + 0.007*"ca

In [14]:
!pip install pyLDAvis



In [20]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# Switch pyLDAvis into notebook mode before building the visualization.
pyLDAvis.enable_notebook()

# Prepare the visualization data from the trained LDA model, the bag-of-words
# corpus and the dictionary.
lda_viz = gensimvis.prepare(ldamodel, corpus, dictionary)
# Show the prepared visualization as this cell's output.
pyLDAvis.display(lda_viz)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
