# 잠재 디리클레 할당(LDA) 실습2
<ol>
    <li>뉴스 기사 제목 데이터에 대한 이해</li>
    <li>텍스트 전처리</li>
    <li>TF-IDF 행렬 만들기</li>
    <li>토픽 모델링</li>
</ol>

## 뉴스 기사 제목 데이터에 대한 이해

In [2]:
import pandas as pd
import urllib.request
#urllib.request.urlretrieve("https://raw.githubusercontent.com/franciscadias/data/master/abcnews-date-text.csv", filename="abcnews-date-text.csv")
data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False)

In [3]:
data = data[:20000] #일부만 사용

In [4]:
text = data[['headline_text']]
text.head(5)

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


## 텍스트 전처리
<ol>
    <li>불용어 제거</li>
    <li>표제어 추출</li>
    <li>길이가 짧은 단어 제거</li>
</ol>

In [6]:
# 단어 토큰화
import nltk
text['headline_text'] = text.apply(lambda row: nltk.word_tokenize(row['headline_text']), axis=1)
text

Unnamed: 0,headline_text
0,"[aba, decides, against, community, broadcastin..."
1,"[act, fire, witnesses, must, be, aware, of, de..."
2,"[a, g, calls, for, infrastructure, protection,..."
3,"[air, nz, staff, in, aust, strike, for, pay, r..."
4,"[air, nz, strike, to, affect, australian, trav..."
...,...
19995,"[analyst, blames, govt, for, abc, digital, dem..."
19996,"[application, for, second, wagga, brothel]"
19997,"[aussie, dollar, hits, new, heights]"
19998,"[australian, coach, leaves, french, club, brive]"


In [7]:
# 불용어 제거
from nltk.corpus import stopwords
stop = stopwords.words('english')
text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop)])
text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = text['headline_text'].apply(lambda x: [word for word in x if word not in (stop)])


Unnamed: 0,headline_text
0,"[aba, decides, community, broadcasting, licence]"
1,"[act, fire, witnesses, must, aware, defamation]"
2,"[g, calls, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"
...,...
19995,"[analyst, blames, govt, abc, digital, demise]"
19996,"[application, second, wagga, brothel]"
19997,"[aussie, dollar, hits, new, heights]"
19998,"[australian, coach, leaves, french, club, brive]"


In [8]:
# 표제어 추출
from nltk.stem import WordNetLemmatizer
text['headline_text'] = text['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])
text # 3인칭 단수 -> 1인칭, 과거 현재형 동사 -> 현재형으로 변환

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = text['headline_text'].apply(lambda x: [WordNetLemmatizer().lemmatize(word, pos='v') for word in x])


Unnamed: 0,headline_text
0,"[aba, decide, community, broadcast, licence]"
1,"[act, fire, witness, must, aware, defamation]"
2,"[g, call, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"
...,...
19995,"[analyst, blame, govt, abc, digital, demise]"
19996,"[application, second, wagga, brothel]"
19997,"[aussie, dollar, hit, new, heights]"
19998,"[australian, coach, leave, french, club, brive]"


In [9]:
# 길이가 짧은(3이하) 단어 제거
tokenized_doc = text['headline_text'].apply(lambda x: [word for word in x if len(word) > 3])
tokenized_doc

0                [decide, community, broadcast, licence]
1               [fire, witness, must, aware, defamation]
2             [call, infrastructure, protection, summit]
3                            [staff, aust, strike, rise]
4               [strike, affect, australian, travellers]
                              ...                       
19995            [analyst, blame, govt, digital, demise]
19996              [application, second, wagga, brothel]
19997                          [aussie, dollar, heights]
19998    [australian, coach, leave, french, club, brive]
19999         [australia, always, qaeda, target, expert]
Name: headline_text, Length: 20000, dtype: object

## TF-IDF 행렬 만들기

In [10]:
# 역토큰화 (토큰화 작업을 되돌림)
detokenized_doc = []
for i in range(len(text)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

text['headline_text'] = detokenized_doc # 다시 text['headline_text']에 재저장
text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text['headline_text'] = detokenized_doc # 다시 text['headline_text']에 재저장


Unnamed: 0,headline_text
0,decide community broadcast licence
1,fire witness must aware defamation
2,call infrastructure protection summit
3,staff aust strike rise
4,strike affect australian travellers
...,...
19995,analyst blame govt digital demise
19996,application second wagga brothel
19997,aussie dollar heights
19998,australian coach leave french club brive


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', 
max_features= 1000) # 상위 1,000개의 단어를 보존 
X = vectorizer.fit_transform(text['headline_text'])
X.shape # TF-IDF 행렬의 크기 확인

(20000, 1000)

## 토픽 모델링 - LDA 수행

In [12]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model=LatentDirichletAllocation(n_components=10,
                                    learning_method='online',
                                    random_state=777,
                                    max_iter=1)
lda_top=lda_model.fit_transform(X)
print(lda_model.components_)
print(lda_model.components_.shape) 

[[ 0.10009995  0.10010818  0.10011588 ...  0.10011054  0.10016133
   0.10012603]
 [ 0.10008226  0.10010319  0.10009765 ...  0.10010516  0.10051457
   0.10012813]
 [ 0.1001514   0.10013558  0.10012901 ...  0.10010288  0.10013559
   0.10012032]
 ...
 [ 0.10013912  0.10010712 24.49911176 ...  0.10010938  0.10011624
   0.10013254]
 [ 0.10012317  0.1001066   0.10011805 ...  0.10011223  0.10013261
   0.1001039 ]
 [ 0.10010023 47.54413517  0.10011057 ...  0.10009603  0.1001182
   0.10315478]]
(10, 1000)


In [13]:
terms = vectorizer.get_feature_names() # 단어 집합. 1,000개의 단어가 저장됨.

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(2)) for i in topic.argsort()[:-n - 1:-1]])
        
get_topics(lda_model.components_,terms)

Topic 1: [('probe', 98.89), ('crash', 93.62), ('drug', 86.2), ('record', 76.6), ('injure', 75.01)]
Topic 2: [('warn', 113.48), ('iraq', 107.39), ('concern', 103.39), ('australia', 86.11), ('world', 82.59)]
Topic 3: [('lead', 125.23), ('fight', 95.67), ('continue', 90.19), ('french', 82.81), ('road', 78.45)]
Topic 4: [('report', 136.05), ('final', 80.74), ('appeal', 74.56), ('china', 65.73), ('chief', 63.3)]
Topic 5: [('police', 185.59), ('urge', 163.69), ('consider', 105.29), ('leave', 92.33), ('defend', 88.29)]
Topic 6: [('kill', 156.08), ('return', 98.85), ('test', 96.53), ('case', 80.12), ('minister', 78.5)]
Topic 7: [('budget', 173.78), ('govt', 166.77), ('boost', 141.93), ('miss', 99.88), ('change', 98.55)]
Topic 8: [('plan', 217.39), ('sars', 169.18), ('charge', 162.03), ('trial', 103.55), ('rise', 97.6)]
Topic 9: [('council', 171.44), ('claim', 165.23), ('open', 115.92), ('welcome', 108.06), ('clear', 81.85)]
Topic 10: [('face', 180.08), ('fund', 166.62), ('court', 151.09), ('ta