In [2]:
### Topic Modeling
# Topic은 주제를 말합니다. 토픽 모델링은 기계 학습 및 자연어 처리분야에서 토픽은 문서 집합의 추상적인
# 주제를 발견하기 위해서 사용하는 통계적 모델 중 하나로, 본문에 숨겨진 의미 구조를 발견하기 위해 사용
# 되어지는 텍스트 마이닝 기법. 

## 1) 잠재 의미 분석(Latent Semantic Anaylysis, LSA)

# BoW에 기반한 DTM이나 TF-IDF는 기본적으로 단어의 빈도 수를 이용한 수치화 방법이기 때문에 단어의 의미를
# 고려하지 못한다는 단점이 있었습니다. (이를 토픽 모델링 관점에서는 단어의 토픽을 고려하지 못한다고도 
# 합니다.) 이를 위한 대안으로 DTM의 잠재된(Latent) 의미를 이끌어내는 방법으로 
# 잠재 의미 분석(Latent Semantic Analysis, LSA)이라는 방법이 있습니다. 
# 잠재 의미 분석(Latent Semantic Indexing, LSI)이라고 부르기도 합니다. 이하 LSA라고 명명하겠습니다.

# 이 방법을 이해하기 위해서는 선형대수학의 특이값 분해(Singular Value Decomposition, SVD)를 이해할 
# 필요가 있습니다. 이하 이를 SVD라고 명명하겠습니다. 이 실습에서는 SVD를 수행하는 구체적인 선형대수학에 
# 대해서는 설명하지 않고, SVD가 갖고있는 의미를 이해하는 것에 초점을 맞춥니다.

#### 1. 특이값 분해(Singular Value Decomposition, SVD)  : https://wikidocs.net/24949

In [23]:
import numpy as np
A = np.array([[0,0,0,1,0,1,1,0,0],[0,0,0,1,1,0,1,0,0],[0,1,1,0,2,0,0,0,0],[1,0,0,0,0,0,0,1,1]])
print("DTM의 크기 : ", np.shape(A))

#Full SVD
U, s, VT = np.linalg.svd(A, full_matrices = True)
print('행렬 U :')
print(U.round(2))
print('행렬 U의 크기(shape) :',np.shape(U))

DTM의 크기 :  (4, 9)
행렬 U :
[[-0.24  0.75  0.   -0.62]
 [-0.51  0.44 -0.    0.74]
 [-0.83 -0.49 -0.   -0.27]
 [-0.   -0.    1.    0.  ]]
행렬 U의 크기(shape) : (4, 4)


In [5]:
print('특이값 벡터 :')
print(s.round(2))
print('특이값 벡터의 크기(shape) :',np.shape(s))

특이값 벡터 :
[2.69 2.05 1.73 0.77]
특이값 벡터의 크기(shape) : (4,)


In [12]:
### Numpy의 linalg.svd()는 특이값 분해의 결과로 대각 행렬이 아니라 특이값 리스트를 반환
### 때문에 수식을 대각행렬로 바꾸어 주어야 한다. 특이값을 s에 저장하고 대각 행렬의 크기를 생성한 후
### 그 행렬에 특이값을 삽입

## 대각 행렬의 크기 4x9의 임의의 행렬 생성
S = np.zeros((4,9))

## 특이값을 대각행렬에 삽입
S[:4, :4] = np.diag(s)

print('대각 행렬 S :')
print(S.round(2))

print('대각 행렬의 크기(shape) :')
print(np.shape(S))

대각 행렬 S :
[[2.69 0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   2.05 0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   1.73 0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.77 0.   0.   0.   0.   0.  ]]
대각 행렬의 크기(shape) :
(4, 9)


In [14]:
print("직교행렬 VT :")
print(VT.round(2))
print("직교 행렬 VT의 크기(shape) : ")
print(np.shape(VT))

직교행렬 VT :
[[-0.   -0.31 -0.31 -0.28 -0.8  -0.09 -0.28 -0.   -0.  ]
 [ 0.   -0.24 -0.24  0.58 -0.26  0.37  0.58 -0.   -0.  ]
 [ 0.58 -0.    0.    0.   -0.    0.   -0.    0.58  0.58]
 [ 0.   -0.35 -0.35  0.16  0.25 -0.8   0.16 -0.   -0.  ]
 [-0.   -0.78 -0.01 -0.2   0.4   0.4  -0.2   0.    0.  ]
 [-0.29  0.31 -0.78 -0.24  0.23  0.23  0.01  0.14  0.14]
 [-0.29 -0.1   0.26 -0.59 -0.08 -0.08  0.66  0.14  0.14]
 [-0.5  -0.06  0.15  0.24 -0.05 -0.05 -0.19  0.75 -0.25]
 [-0.5  -0.06  0.15  0.24 -0.05 -0.05 -0.19 -0.25  0.75]]
직교 행렬 VT의 크기(shape) : 
(9, 9)


In [15]:
### UxSxVT가 기존 행렬 A와 같은지 확인
## Numpy의 allclose()는 2개의 행렬이 동일하면 True을 반환
np.allclose(A, np.dot(np.dot(U,S),VT).round(2))

True

In [25]:
#### 절단된 SVD(Truncated SVD)

## 특이값 상위 2개만 보존
S = S[:2,:2]

print("대각 행렬 S : ")
print(S.round(2))

대각 행렬 S : 
[[2.69 0.  ]
 [0.   2.05]]


In [24]:
## 직교 행렬 U에 대해서도 2개의 열만 남기고 제거
U = U[:,:2]
print("행렬 U : ")
print(U.round(2))

행렬 U : 
[[-0.24  0.75]
 [-0.51  0.44]
 [-0.83 -0.49]
 [-0.   -0.  ]]


In [26]:
## 전치행렬인 VT 2의 행만 남기고 제거
VT = VT[:2,:]
print("직교행렬 VT : ")
print(VT.round(2))

직교행렬 VT : 
[[-0.   -0.31 -0.31 -0.28 -0.8  -0.09 -0.28 -0.   -0.  ]
 [ 0.   -0.24 -0.24  0.58 -0.26  0.37  0.58 -0.   -0.  ]]


In [27]:
A_prime = np.dot(np.dot(U,S), VT)
print(A)
print(A_prime.round(2))

[[0 0 0 1 0 1 1 0 0]
 [0 0 0 1 1 0 1 0 0]
 [0 1 1 0 2 0 0 0 0]
 [1 0 0 0 0 0 0 1 1]]
[[ 0.   -0.17 -0.17  1.08  0.12  0.62  1.08 -0.   -0.  ]
 [ 0.    0.2   0.2   0.91  0.86  0.45  0.91  0.    0.  ]
 [ 0.    0.93  0.93  0.03  2.05 -0.17  0.03  0.    0.  ]
 [ 0.    0.    0.    0.    0.    0.    0.    0.    0.  ]]


In [28]:
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 8.6 MB/s eta 0:00:00
Collecting click
  Downloading click-8.1.3-py3-none-any.whl (96 kB)
     ---------------------------------------- 96.6/96.6 kB 5.8 MB/s eta 0:00:00
Collecting regex>=2021.8.3
  Downloading regex-2022.4.24-cp310-cp310-win_amd64.whl (262 kB)
     -------------------------------------- 262.0/262.0 kB 8.1 MB/s eta 0:00:00
Installing collected packages: regex, click, nltk
Successfully installed click-8.1.3 nltk-3.7 regex-2022.4.24


In [29]:
### 실습 (LSA)
## 데이터 : 사이킷런에서 Twenty Newsgroups이라고 불리는 20개의 다른 주제를 가진 뉴스그룹 
# 데이터를 제공

import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [31]:
### dataset
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=("headers","footers","quotes"))
documents = dataset.data
print("샘플의 숫 : ", len(documents))

샘플의 숫 :  11314


In [41]:
documents[1]

"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

In [39]:
print(dataset.target_names)
dataset.target[0]

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


17

In [51]:
### 텍스트 전처리
news_df = pd.DataFrame({'document':documents})
# print(news_df.head(2))
## 특수 문자 제거
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]"," ")  # 영문자아닌 것을 " "

## 길이가 3이하인 단어는 제거 (길이가 짧은 단어 제거)
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: " ".join([w for w in x.split() if len(w)>3]))
## 전체 단어에 대한 소문자 변환
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())
news_df['clean_doc'].head()

  news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]"," ")  # 영문자아닌 것을 " "


0    well sure about story seem biased what disagre...
1    yeah expect people read actually accept hard a...
2    although realize that principle your strongest...
3    notwithstanding legitimate fuss about this pro...
4    well will have change scoring playoff pool unf...
Name: clean_doc, dtype: object

In [53]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vmuser\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [59]:
### NLTK를 사용하여 불용어 제거
stop_words = stopwords.words('english')
tokenized_doc = news_df['clean_doc'].apply(lambda x:x.split())
print(len(tokenized_doc[0]))
tokenized_doc = tokenized_doc.apply(lambda x:[item for item in x if item not in stop_words])
print(len(tokenized_doc[0]))

85
61


In [60]:
print(tokenized_doc[1])

['yeah', 'expect', 'people', 'read', 'actually', 'accept', 'hard', 'atheism', 'need', 'little', 'leap', 'faith', 'jimmy', 'logic', 'runs', 'steam', 'sorry', 'pity', 'sorry', 'feelings', 'denial', 'faith', 'need', 'well', 'pretend', 'happily', 'ever', 'anyway', 'maybe', 'start', 'newsgroup', 'atheist', 'hard', 'bummin', 'much', 'forget', 'flintstone', 'chewables', 'bake', 'timmons']


In [62]:
### TF-IDF 행렬 만들기
# 역토큰화
detokenized_doc = []
for i in range(len(news_df)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

news_df['clean_doc'] = detokenized_doc

In [63]:
news_df['clean_doc'][1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy logic runs steam sorry pity sorry feelings denial faith need well pretend happily ever anyway maybe start newsgroup atheist hard bummin much forget flintstone chewables bake timmons'

In [64]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000, max_df=0.5, smooth_idf=True)
## max_df : 0.0 ~ 1.0 사이의 값을 쓰고, 최대 문서에 걸친 포함된 단어까지 단어 꾸러미에 담아 사용할지 결정
X = vectorizer.fit_transform(news_df['clean_doc'])

### TF-IDF 행렬의 크기 확인
print("TF-IDF 행렬의 크기 : ", X.shape)

TF-IDF 행렬의 크기 :  (11314, 1000)


In [66]:
### 토픽 모델링(Topic 모델링)
svd_model = TruncatedSVD(n_components=20, algorithm='randomized',n_iter=100, random_state=122)
svd_model.fit(X)
len(svd_model.components_)

20

In [67]:
np.shape(svd_model.components_)

(20, 1000)

In [80]:
terms = vectorizer.get_feature_names()

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(5)) for i in topic.argsort()[:-n - 1:-1]])

get_topics(svd_model.components_, terms)


Topic 1: [('like', 0.21386), ('know', 0.20046), ('people', 0.19293), ('think', 0.17805), ('good', 0.15128)]
Topic 2: [('thanks', 0.32888), ('windows', 0.29088), ('card', 0.18069), ('drive', 0.17455), ('mail', 0.15111)]
Topic 3: [('game', 0.37064), ('team', 0.32443), ('year', 0.28154), ('games', 0.2537), ('season', 0.18419)]
Topic 4: [('drive', 0.53324), ('scsi', 0.20165), ('hard', 0.15628), ('disk', 0.15578), ('card', 0.13994)]
Topic 5: [('windows', 0.40399), ('file', 0.25436), ('window', 0.18044), ('files', 0.16078), ('program', 0.13894)]
Topic 6: [('chip', 0.16114), ('government', 0.16009), ('mail', 0.15625), ('space', 0.1507), ('information', 0.13562)]
Topic 7: [('like', 0.67086), ('bike', 0.14236), ('chip', 0.11169), ('know', 0.11139), ('sounds', 0.10371)]
Topic 8: [('card', 0.46633), ('video', 0.22137), ('sale', 0.21266), ('monitor', 0.15463), ('offer', 0.14643)]
Topic 9: [('know', 0.46047), ('card', 0.33605), ('chip', 0.17558), ('government', 0.1522), ('video', 0.14356)]
Topic 10

In [81]:
### 2) 잠재 디리클레 할당(Latent Dirichlet Allocation, LDA)
# 토픽 모델링의 대표적인 알고리즘. 
# LDA는 문서들은 토픽들이 혼합되어 구성되어 있으며, 토픽들은 확률 분포에 기반하여 단어들 생성한다고 가정
# 데이터가 주어지면, LDA는 문서가 생성되는 과정을 역추적합니다. 
# 참고 링크 : https://lettier.com/projects/lda-topic-modeling/


In [82]:
### 개요
# LDA을 일종의 블랙박스로 보고, LDA에 문저 집합을 입력하면, 어떤 결과를 보여주는 보면.... 
#
## 실습 : 전처리 과정은 LSA와 동일... 
tokenized_doc[:5]

0    [well, sure, story, seem, biased, disagree, st...
1    [yeah, expect, people, read, actually, accept,...
2    [although, realize, principle, strongest, poin...
3    [notwithstanding, legitimate, fuss, proposal, ...
4    [well, change, scoring, playoff, pool, unfortu...
Name: clean_doc, dtype: object

In [83]:
### gensim을 사용.... 
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable
Collecting gensim
  Downloading gensim-4.2.0-cp310-cp310-win_amd64.whl (23.9 MB)
     ---------------------------------------- 23.9/23.9 MB 6.2 MB/s eta 0:00:00
Collecting smart-open>=1.8.1
  Downloading smart_open-6.0.0-py3-none-any.whl (58 kB)
     ---------------------------------------- 58.4/58.4 kB ? eta 0:00:00
Collecting Cython==0.29.28
  Downloading Cython-0.29.28-py2.py3-none-any.whl (983 kB)
     -------------------------------------- 983.8/983.8 kB 4.8 MB/s eta 0:00:00
Installing collected packages: smart-open, Cython, gensim
Successfully installed Cython-0.29.28 gensim-4.2.0 smart-open-6.0.0


In [93]:
from gensim import corpora
dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
print(len(corpus))

11314


In [90]:
dictionary[64280]

'unis'

In [88]:
len(dictionary)

64281

In [92]:
### LDA 모델 훈련시키기
import gensim
NUM_TOPICS = 20 
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary, \
    passes=15 )
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.016*"israel" + 0.016*"armenian" + 0.014*"jews" + 0.013*"turkish"')
(1, '0.015*"health" + 0.012*"medical" + 0.008*"disease" + 0.007*"pain"')
(2, '0.077*"drive" + 0.050*"scsi" + 0.047*"disk" + 0.030*"hard"')
(3, '0.012*"public" + 0.010*"encryption" + 0.010*"information" + 0.009*"government"')
(4, '0.013*"said" + 0.008*"went" + 0.007*"back" + 0.007*"know"')
(5, '0.019*"period" + 0.017*"play" + 0.012*"power" + 0.010*"goal"')
(6, '0.008*"jesus" + 0.007*"would" + 0.007*"people" + 0.006*"believe"')
(7, '0.010*"cover" + 0.010*"xlib" + 0.010*"request" + 0.009*"copies"')
(8, '0.017*"file" + 0.011*"program" + 0.009*"available" + 0.008*"files"')
(9, '0.013*"like" + 0.013*"good" + 0.009*"would" + 0.008*"much"')
(10, '0.016*"nist" + 0.013*"pitcher" + 0.012*"ncsl" + 0.008*"riders"')
(11, '0.030*"game" + 0.026*"team" + 0.021*"games" + 0.020*"year"')
(12, '0.014*"guns" + 0.012*"weapons" + 0.012*"control" + 0.011*"firearms"')
(13, '0.029*"space" + 0.011*"nasa" + 0.008*"research" + 0.007*"center"'

In [94]:
### LDA 시각화 하기(pyLDAvis)
!pip install pyLDAvis

Defaulting to user installation because normal site-packages is not writeable
Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
     ---------------------------------------- 1.7/1.7 MB 10.5 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting funcy
  Downloading funcy-1.17-py2.py3-none-any.whl (33 kB)
Collecting numexpr
  Downloading numexpr-2.8.1-cp310-cp310-win_amd64.whl (88 kB)
     ---------------------------------------- 88.5/88.5 kB ? eta 0:00:00
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
  Preparing metadata (setup.py): started
  Preparing met

In [96]:
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [97]:
### 문서별 토픽 분포보기
for i, topic_list in enumerate(ldamodel[corpus]):
    if i == 10:
        break
    print(i,'번째 문서의 topic 비율은 ',topic_list)

0 번째 문서의 topic 비율은  [(0, 0.23375575), (6, 0.17070535), (12, 0.2972339), (17, 0.28538674)]
1 번째 문서의 topic 비율은  [(2, 0.05882811), (6, 0.38454646), (11, 0.047093105), (13, 0.09000181), (16, 0.055350535), (17, 0.345743)]
2 번째 문서의 topic 비율은  [(0, 0.24619749), (17, 0.73925614)]
3 번째 문서의 topic 비율은  [(1, 0.017805321), (3, 0.4039655), (6, 0.055324227), (9, 0.30071715), (17, 0.21041375)]
4 번째 문서의 topic 비율은  [(8, 0.0671257), (11, 0.65415573), (17, 0.24722628)]
5 번째 문서의 topic 비율은  [(5, 0.053972457), (6, 0.66026723), (9, 0.18977498), (12, 0.059575796)]
6 번째 문서의 topic 비율은  [(3, 0.025229674), (7, 0.024935847), (8, 0.113978475), (9, 0.47931907), (14, 0.041016668), (15, 0.056635603), (19, 0.24971002)]
7 번째 문서의 topic 비율은  [(0, 0.13695526), (12, 0.053616814), (15, 0.18578279), (17, 0.61051136)]
8 번째 문서의 topic 비율은  [(1, 0.16774239), (6, 0.0579454), (8, 0.07481113), (10, 0.2610358), (17, 0.4163219)]
9 번째 문서의 topic 비율은  [(3, 0.06738132), (9, 0.7368607), (13, 0.04074479), (19, 0.14388151)]


In [98]:
def make_topictable_per_doc(ldamodel, corpus):
    topic_table = pd.DataFrame()

    # 몇 번째 문서인지를 의미하는 문서 번호와 해당 문서의 토픽 비중을 한 줄씩 꺼내온다.
    for i, topic_list in enumerate(ldamodel[corpus]):
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
        doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
        # 각 문서에 대해서 비중이 높은 토픽순으로 토픽을 정렬한다.
        # EX) 정렬 전 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (10번 토픽, 5%), (12번 토픽, 21.5%), 
        # Ex) 정렬 후 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (12번 토픽, 21.5%), (10번 토픽, 5%)
        # 48 > 25 > 21 > 5 순으로 정렬이 된 것.

        # 모든 문서에 대해서 각각 아래를 수행
        for j, (topic_num, prop_topic) in enumerate(doc): #  몇 번 토픽인지와 비중을 나눠서 저장한다.
            if j == 0:  # 정렬을 한 상태이므로 가장 앞에 있는 것이 가장 비중이 높은 토픽
                topic_table = topic_table.append(pd.Series([int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)
                # 가장 비중이 높은 토픽과, 가장 비중이 높은 토픽의 비중과, 전체 토픽의 비중을 저장한다.
            else:
                break
    return(topic_table)


In [99]:
topictable = make_topictable_per_doc(ldamodel, corpus)
topictable = topictable.reset_index()
topictable.columns = ['문서 번호','가장 비중이 높은 토픽','가장 높은 토픽의 비중','각 토픽의 비중']
topictable[:10]

  topic_table = topic_table.append(pd.Series([int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)
  topic_table = topic_table.append(pd.Series([int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)


Unnamed: 0,문서 번호,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,0,12,0.2972,"[(0, 0.23376156), (6, 0.17066287), (12, 0.2972..."
1,1,6,0.3846,"[(2, 0.058833312), (6, 0.38463426), (11, 0.047..."
2,2,17,0.7393,"[(0, 0.24618088), (17, 0.7392728)]"
3,3,3,0.404,"[(1, 0.017805187), (3, 0.4039732), (6, 0.05528..."
4,4,11,0.6542,"[(8, 0.067186035), (11, 0.6541859), (17, 0.247..."
5,5,6,0.6605,"[(5, 0.053967286), (6, 0.6604782), (9, 0.18958..."
6,6,9,0.4793,"[(3, 0.025229456), (7, 0.02493606), (8, 0.1139..."
7,7,17,0.6105,"[(0, 0.13695353), (12, 0.05361709), (15, 0.185..."
8,8,17,0.4162,"[(1, 0.16772518), (6, 0.058040794), (8, 0.0748..."
9,9,9,0.7368,"[(3, 0.067380965), (9, 0.73684776), (13, 0.040..."


In [105]:

import pandas as pd
import urllib.request
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

#urllib.request.urlretrieve("https://www.kaggle.com/datasets/therohk/million-headlines?select=abcnews-date-text.csv", filename="abcnews-date-text.csv")

data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)
print('뉴스 제목 개수 :',len(data))



  data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)


뉴스 제목 개수 : 1226258


In [106]:
print(data.head(5))

   publish_date                                      headline_text
0      20030219  aba decides against community broadcasting lic...
1      20030219     act fire witnesses must be aware of defamation
2      20030219     a g calls for infrastructure protection summit
3      20030219           air nz staff in aust strike for pay rise
4      20030219      air nz strike to affect australian travellers


In [107]:
text = data[['headline_text']]
text.head(5)

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [108]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vmuser\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vmuser\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [109]:
text['headline_text'] = text.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = text.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)


In [110]:
print(text.head(5))

                                       headline_text
0  [aba, decides, against, community, broadcastin...
1  [act, fire, witnesses, must, be, aware, of, de...
2  [a, g, calls, for, infrastructure, protection,...
3  [air, nz, staff, in, aust, strike, for, pay, r...
4  [air, nz, strike, to, affect, australian, trav...


In [111]:
stop_words = stopwords.words('english')
text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop_words)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop_words)])


In [112]:
print(text.head(5))

                                       headline_text
0   [aba, decides, community, broadcasting, licence]
1    [act, fire, witnesses, must, aware, defamation]
2     [g, calls, infrastructure, protection, summit]
3          [air, nz, staff, aust, strike, pay, rise]
4  [air, nz, strike, affect, australian, travellers]


In [113]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vmuser\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vmuser\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\omw-1.4.zip.


True

In [114]:
text['headline_text'] = text['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])
print(text.head(5))

                                       headline_text
0       [aba, decide, community, broadcast, licence]
1      [act, fire, witness, must, aware, defamation]
2      [g, call, infrastructure, protection, summit]
3          [air, nz, staff, aust, strike, pay, rise]
4  [air, nz, strike, affect, australian, travellers]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = text['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])


In [115]:
tokenized_doc = text['headline_text'].apply(lambda x: [word for word in x if len(word) > 3])
print(tokenized_doc[:5])

0       [decide, community, broadcast, licence]
1      [fire, witness, must, aware, defamation]
2    [call, infrastructure, protection, summit]
3                   [staff, aust, strike, rise]
4      [strike, affect, australian, travellers]
Name: headline_text, dtype: object


In [116]:
# 역토큰화 (토큰화 작업을 역으로 되돌림)
detokenized_doc = []
for i in range(len(text)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

text['headline_text'] = detokenized_doc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = detokenized_doc


In [117]:
text['headline_text'][:5]

0       decide community broadcast licence
1       fire witness must aware defamation
2    call infrastructure protection summit
3                   staff aust strike rise
4      strike affect australian travellers
Name: headline_text, dtype: object

In [118]:
# 상위 1,000개의 단어를 보존 
vectorizer = TfidfVectorizer(stop_words='english', max_features= 1000)
X = vectorizer.fit_transform(text['headline_text'])

# TF-IDF 행렬의 크기 확인
print('TF-IDF 행렬의 크기 :',X.shape)

TF-IDF 행렬의 크기 : (1226258, 1000)


In [129]:
lda_model = LatentDirichletAllocation(n_components=10,learning_method='online',random_state=777,max_iter=1)

In [130]:
lda_top = lda_model.fit_transform(X)

In [131]:
print(lda_model.components_)
print(lda_model.components_.shape) 

[[1.00000865e-01 1.00000439e-01 1.00001997e-01 ... 1.00006873e-01
  1.00003405e-01 1.00005209e-01]
 [1.00001642e-01 1.00000829e-01 6.40533260e+02 ... 1.00009111e-01
  1.00004890e-01 5.79474578e+02]
 [1.00001468e-01 1.00000275e-01 1.00001496e-01 ... 1.00004592e-01
  1.00001786e-01 1.00005396e-01]
 ...
 [1.00002822e-01 1.00000923e-01 1.00001462e-01 ... 1.00009141e-01
  1.00005015e-01 1.00008595e-01]
 [1.00004695e-01 1.00002038e-01 1.00001419e-01 ... 1.00004812e-01
  1.00002548e-01 1.00007925e-01]
 [1.07423402e+02 2.03964360e+02 1.00002187e-01 ... 1.00006822e-01
  1.00003052e-01 1.00006310e-01]]
(10, 1000)


In [132]:
# 단어 집합. 1,000개의 단어가 저장됨.
terms = vectorizer.get_feature_names()

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n - 1:-1]])

get_topics(lda_model.components_,terms)

Topic 1: [('queensland', 12908.75), ('sydney', 10948.96), ('melbourne', 8900.08), ('change', 7262.83), ('crash', 6153.23)]
Topic 2: [('australia', 19355.9), ('australian', 13286.11), ('leave', 4930.96), ('speak', 4845.83), ('perth', 4709.2)]
Topic 3: [('donald', 9114.15), ('live', 7908.02), ('federal', 4711.88), ('rise', 4630.85), ('victorian', 4567.89)]
Topic 4: [('health', 6349.63), ('tasmania', 6141.91), ('report', 5567.09), ('plan', 4834.44), ('time', 4744.41)]
Topic 5: [('state', 6086.17), ('open', 6074.01), ('coast', 6015.08), ('restrictions', 5961.35), ('woman', 5921.79)]
Topic 6: [('trump', 15903.94), ('police', 13931.38), ('home', 7318.39), ('test', 7241.0), ('market', 6529.05)]
Topic 7: [('government', 9187.9), ('record', 6384.64), ('border', 6378.89), ('help', 5807.23), ('people', 5620.56)]
Topic 8: [('coronavirus', 48038.98), ('covid', 19540.99), ('victoria', 10827.28), ('china', 8358.46), ('death', 7181.89)]
Topic 9: [('case', 10138.89), ('charge', 8386.79), ('court', 8195



In [133]:
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda_model, X, vectorizer)
pyLDAvis.display(vis)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
