<a href="https://colab.research.google.com/github/as9786/NLP/blob/main/TopicModeling/Gensim_lda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset

In [2]:
import pandas as pd  
from sklearn.datasets import fetch_20newsgroups

In [3]:
# Load the 20 Newsgroups posts with headers, footers, and quoted replies
# stripped; shuffling uses a fixed seed so the order is reproducible.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
documents = dataset.data
len(documents)

11314

In [4]:
# Look at one raw document before any cleaning
documents[1]

"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

In [5]:
# 20 topics (newsgroups) in total
print(dataset.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


## Data preprocessing

In [6]:
# Text preprocessing
news_df = pd.DataFrame({'document': documents})
# Replace every non-alphabetic character with a space.
# regex=True must be explicit: without it pandas emits a FutureWarning on
# older versions and, from pandas 2.0 on, treats the pattern as a literal
# string, which would silently leave the text uncleaned.
news_df['clean_doc'] = news_df['document'].str.replace(r"[^a-zA-Z]", " ", regex=True)
# Drop short words (length 3 or less)
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
# Lowercase every word
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

  news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")


In [7]:
# Same document as above, after cleaning
news_df['clean_doc'][1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy your logic runs steam sorry pity sorry that have these feelings denial about faith need well just pretend that will happily ever after anyway maybe start newsgroup atheist hard bummin much forget your flintstone chewables bake timmons'

In [8]:
# Split each cleaned document on whitespace into a list of tokens
tokenized_doc = news_df['clean_doc'].apply(str.split)

In [9]:
# Tokenized data for all 11,314 news posts
print(len(tokenized_doc))

# Inspect the first few rows
tokenized_doc[:5]

11314


0    [well, sure, about, story, seem, biased, what,...
1    [yeah, expect, people, read, actually, accept,...
2    [although, realize, that, principle, your, str...
3    [notwithstanding, legitimate, fuss, about, thi...
4    [well, will, have, change, scoring, playoff, p...
Name: clean_doc, dtype: object

# Corpus

In [10]:
from gensim import corpora

# Assign an integer id to every token, then encode each document as a list of
# (token id, occurrence count) pairs.
dictionary = corpora.Dictionary(tokenized_doc)
corpus = list(map(dictionary.doc2bow, tokenized_doc))

In [11]:
# corpus[i]: list of (word id, count) pairs for the words that appear in the i-th news post
print(corpus[1]) 

# dictionary[j]: look up which word has id j
print(dictionary[66])

[(0, 1), (2, 1), (20, 1), (60, 2), (66, 1), (72, 2), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 2), (85, 1), (86, 1), (87, 1), (88, 1), (89, 2), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 2), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 2), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 1)]
well


# LDA

In [12]:
import gensim

# Number of topics to learn (k = 20)
NUM_TOPICS = 20

# passes: how many times training goes over the corpus;
# num_words: how many words to show for each topic.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
)
topics = ldamodel.print_topics(num_words=4)
# One topic tuple per line
print(*topics, sep="\n")

(0, '0.021*"with" + 0.019*"have" + 0.019*"that" + 0.015*"this"')
(1, '0.012*"this" + 0.009*"with" + 0.009*"will" + 0.008*"from"')
(2, '0.024*"output" + 0.024*"file" + 0.022*"entry" + 0.012*"program"')
(3, '0.009*"each" + 0.009*"gordon" + 0.008*"pitt" + 0.008*"cover"')
(4, '0.010*"from" + 0.009*"with" + 0.007*"were" + 0.007*"their"')
(5, '0.007*"compass" + 0.007*"candida" + 0.006*"coli" + 0.005*"infections"')
(6, '0.018*"bike" + 0.010*"riding" + 0.010*"motorcycle" + 0.009*"plastic"')
(7, '0.036*"space" + 0.015*"nasa" + 0.008*"launch" + 0.008*"earth"')
(8, '0.057*"that" + 0.023*"this" + 0.021*"have" + 0.014*"they"')
(9, '0.034*"that" + 0.015*"this" + 0.012*"have" + 0.011*"with"')
(10, '0.009*"lock" + 0.007*"cursor" + 0.006*"binaries" + 0.005*"polygon"')
(11, '0.014*"encryption" + 0.011*"security" + 0.011*"will" + 0.011*"clipper"')
(12, '0.035*"they" + 0.029*"that" + 0.018*"were" + 0.012*"there"')
(13, '0.047*"drive" + 0.032*"scsi" + 0.024*"disk" + 0.019*"hard"')
(14, '0.019*"game" + 0.01

In [13]:
# Print the top 10 words of every topic (same call as above with num_words=10).
# The formatted topics are built once instead of on every iteration, and the
# topic count is taken from the model rather than a hard-coded 20, so the loop
# keeps working if the number of topics changes.
for topic in ldamodel.print_topics(num_topics=ldamodel.num_topics, num_words=10):
    print(topic)

(0, '0.021*"with" + 0.019*"have" + 0.019*"that" + 0.015*"this" + 0.008*"about" + 0.008*"there" + 0.007*"like" + 0.007*"from" + 0.006*"would" + 0.006*"some"')
(1, '0.012*"this" + 0.009*"with" + 0.009*"will" + 0.008*"from" + 0.008*"number" + 0.008*"list" + 0.007*"information" + 0.007*"your" + 0.007*"which" + 0.006*"available"')
(2, '0.024*"output" + 0.024*"file" + 0.022*"entry" + 0.012*"program" + 0.010*"sale" + 0.009*"build" + 0.008*"line" + 0.008*"shipping" + 0.008*"section" + 0.008*"entries"')
(3, '0.009*"each" + 0.009*"gordon" + 0.008*"pitt" + 0.008*"cover" + 0.008*"banks" + 0.007*"soon" + 0.007*"good" + 0.007*"surrender" + 0.006*"skepticism" + 0.006*"intellect"')
(4, '0.010*"from" + 0.009*"with" + 0.007*"were" + 0.007*"their" + 0.005*"turkish" + 0.005*"states" + 0.005*"armenian" + 0.005*"health" + 0.004*"april" + 0.004*"government"')
(5, '0.007*"compass" + 0.007*"candida" + 0.006*"coli" + 0.005*"infections" + 0.005*"dept" + 0.004*"acid" + 0.003*"yeast" + 0.003*"symptoms" + 0.003*"in

# 시각화

In [14]:
!pip install pyLDAvis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting funcy
  Downloading funcy-1.17-py2.py3-none-any.whl (33 kB)
Collecting sklearn
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyLDAvis, sklearn
  Building wheel for pyLDAvis (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-3.3.1-py2.py3-none-any.whl size=136898 sha256=3fca073f13dae3b320416ac20aa4c0a7d55f7c38a5fbe6024ec4a221efddab1a
  Stored 

In [20]:
import pyLDAvis.gensim_models
# Enable pyLDAvis's notebook mode before displaying (presumably needed for inline rendering)
pyLDAvis.enable_notebook()
# Prepare the visualization data from the trained model, the bag-of-words corpus, and the dictionary
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

  default_term_info = default_term_info.sort_values(


# 문서별 주제 분포 보기

In [21]:
# Print the topic distribution inferred for the first five documents.
# Each topic_list is a list of (topic id, proportion) pairs; the printed
# message reads "topic proportions of document i".
for i, topic_list in enumerate(ldamodel[corpus]):
    if i==5:
        break
    print(i,'번째 문서의 topic 비율은',topic_list)

0 번째 문서의 topic 비율은 [(4, 0.2128784), (8, 0.31373358), (9, 0.37629592), (18, 0.087789744)]
1 번째 문서의 topic 비율은 [(3, 0.17371447), (8, 0.6635982), (9, 0.10566696), (13, 0.041334167)]
2 번째 문서의 topic 비율은 [(8, 0.58771676), (9, 0.1908939), (18, 0.21089557)]
3 번째 문서의 topic 비율은 [(0, 0.23302074), (1, 0.0314722), (4, 0.10636031), (7, 0.015120131), (8, 0.35763875), (11, 0.19669713), (19, 0.05262554)]
4 번째 문서의 topic 비율은 [(0, 0.34843418), (1, 0.052264348), (8, 0.34182933), (14, 0.23247217)]


In [22]:
def make_topictable_per_doc(ldamodel, corpus):
    """Build a table holding the dominant topic of every document in ``corpus``.

    Args:
        ldamodel: Trained topic model. It must support ``ldamodel[corpus]``
            (yielding one item per document) and expose ``per_word_topics``.
        corpus: Bag-of-words corpus to run through the model.

    Returns:
        A ``pandas.DataFrame`` with one row per document, in corpus order,
        and three columns labelled 0, 1, 2:

        * 0 -- id of the topic with the largest proportion,
        * 1 -- that proportion, rounded to 4 decimals,
        * 2 -- the document's list of ``(topic id, proportion)`` pairs as
          returned by the model.

        A document for which the model returns no topics still gets a row
        (NaN in columns 0 and 1) so that row positions always line up with
        document numbers.
    """
    # Rows are collected in a list and turned into a DataFrame once at the
    # end: DataFrame.append was removed in pandas 2.0, and appending row by
    # row copies the whole frame each time.
    rows = []

    for topic_list in ldamodel[corpus]:
        # With per_word_topics enabled the model yields a tuple per document
        # whose first element is the document-topic list.
        doc_topics = topic_list[0] if ldamodel.per_word_topics else topic_list

        if doc_topics:
            # max() returns the first pair with the highest proportion, the
            # same pair a stable descending sort would put first.
            topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
            rows.append((int(topic_num), round(prop_topic, 4), doc_topics))
        else:
            rows.append((float("nan"), float("nan"), doc_topics))

    return pd.DataFrame(rows, columns=[0, 1, 2])

In [23]:
# reset_index() turns the row index into a regular column, which is used as
# the document number.
topictable = make_topictable_per_doc(ldamodel, corpus).reset_index()
# Column labels: document number, dominant topic, dominant topic's proportion,
# proportion of each topic.
topictable.columns = ['문서 번호', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
topictable.head(10)

Unnamed: 0,문서 번호,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,0,9.0,0.376,"[(4, 0.2129043), (8, 0.31398344), (9, 0.376007..."
1,1,8.0,0.6635,"[(3, 0.17370449), (8, 0.663537), (9, 0.1057351..."
2,2,8.0,0.5877,"[(8, 0.5877168), (9, 0.19089381), (18, 0.21089..."
3,3,8.0,0.3575,"[(0, 0.23318882), (1, 0.031465646), (4, 0.1063..."
4,4,0.0,0.3484,"[(0, 0.34840965), (1, 0.052265704), (8, 0.3418..."
5,5,8.0,0.3786,"[(4, 0.13135333), (8, 0.378598), (9, 0.3266510..."
6,6,4.0,0.6609,"[(0, 0.32472172), (4, 0.66094315)]"
7,7,8.0,0.7969,"[(4, 0.04348442), (8, 0.7969105), (18, 0.14856..."
8,8,8.0,0.5396,"[(0, 0.16555351), (1, 0.112085424), (7, 0.0219..."
9,9,0.0,0.7491,"[(0, 0.74913615), (6, 0.012333616), (9, 0.1611..."
