In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
print('샘플의 수 :',len(documents))

documents[1]

print(dataset.target_names)

샘플의 수 : 11314
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [2]:
news_df = pd.DataFrame({'document':documents})
# 특수 문자 제거
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")
# 길이가 3이하인 단어는 제거 (길이가 짧은 단어 제거)
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# 전체 단어에 대한 소문자 변환
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

  news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")


In [3]:
news_df['clean_doc'][1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy your logic runs steam sorry pity sorry that have these feelings denial about faith need well just pretend that will happily ever after anyway maybe start newsgroup atheist hard bummin much forget your flintstone chewables bake timmons'

In [4]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# NLTK로부터 불용어를 받아온다.
stop_words = stopwords.words('english')
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split()) # 토큰화
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])
# 불용어를 제거합니다.

In [6]:
print(tokenized_doc[1])

['yeah', 'expect', 'people', 'read', 'actually', 'accept', 'hard', 'atheism', 'need', 'little', 'leap', 'faith', 'jimmy', 'logic', 'runs', 'steam', 'sorry', 'pity', 'sorry', 'feelings', 'denial', 'faith', 'need', 'well', 'pretend', 'happily', 'ever', 'anyway', 'maybe', 'start', 'newsgroup', 'atheist', 'hard', 'bummin', 'much', 'forget', 'flintstone', 'chewables', 'bake', 'timmons']


In [7]:
# 역토큰화 (토큰화 작업을 역으로 되돌림)
detokenized_doc = []
for i in range(len(news_df)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

news_df['clean_doc'] = detokenized_doc

In [8]:
news_df['clean_doc'][1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy logic runs steam sorry pity sorry feelings denial faith need well pretend happily ever anyway maybe start newsgroup atheist hard bummin much forget flintstone chewables bake timmons'

In [9]:
vectorizer = TfidfVectorizer(stop_words='english', max_features= 1000, # 상위 1,000개의 단어를 보존 
max_df = 0.5, smooth_idf=True)

X = vectorizer.fit_transform(news_df['clean_doc'])

# TF-IDF 행렬의 크기 확인
print('TF-IDF 행렬의 크기 :',X.shape)

TF-IDF 행렬의 크기 : (11314, 1000)


토픽 모델링

In [10]:
svd_model = TruncatedSVD(n_components=10, algorithm='randomized', n_iter=100, random_state=122)
svd_model.fit(X)
len(svd_model.components_)

10

In [11]:
terms = vectorizer.get_feature_names() # 단어 집합. 1,000개의 단어가 저장됨.

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(5)) for i in topic.argsort()[:-n - 1:-1]])
get_topics(svd_model.components_,terms)

Topic 1: [('like', 0.21386), ('know', 0.20046), ('people', 0.19293), ('think', 0.17805), ('good', 0.15128)]
Topic 2: [('thanks', 0.32888), ('windows', 0.29088), ('card', 0.18069), ('drive', 0.17455), ('mail', 0.15111)]
Topic 3: [('game', 0.37064), ('team', 0.32443), ('year', 0.28154), ('games', 0.2537), ('season', 0.18419)]
Topic 4: [('drive', 0.53324), ('scsi', 0.20165), ('hard', 0.15628), ('disk', 0.15578), ('card', 0.13994)]
Topic 5: [('windows', 0.40399), ('file', 0.25436), ('window', 0.18044), ('files', 0.16078), ('program', 0.13894)]
Topic 6: [('chip', 0.16114), ('government', 0.16009), ('mail', 0.15625), ('space', 0.1507), ('information', 0.13562)]
Topic 7: [('like', 0.67086), ('bike', 0.14236), ('chip', 0.11169), ('know', 0.11139), ('sounds', 0.10371)]
Topic 8: [('card', 0.46633), ('video', 0.22137), ('sale', 0.21266), ('monitor', 0.15463), ('offer', 0.14643)]
Topic 9: [('know', 0.46047), ('card', 0.33605), ('chip', 0.17558), ('government', 0.1522), ('video', 0.14356)]
Topic 10



LDA 실행 코드 

In [12]:
!pip install genism

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement genism (from versions: none)[0m
[31mERROR: No matching distribution found for genism[0m


In [13]:
from gensim import corpora 
dictionary = corpora.Dictionary(tokenized_doc) 
corpus = [dictionary.doc2bow(text) for text in tokenized_doc] 
print(corpus[1]) # 수행된 결과에서 두번째 뉴스 출력. 첫번째 문서의 인덱스는 0

[(52, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 1), (88, 1), (89, 1)]


In [14]:
import gensim
NUM_TOPICS = 10 # 20개의 토픽, k=20
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.010*"would" + 0.008*"people" + 0.007*"think" + 0.005*"know"')
(1, '0.013*"space" + 0.005*"university" + 0.005*"nasa" + 0.005*"information"')
(2, '0.010*"people" + 0.009*"would" + 0.005*"government" + 0.004*"think"')
(3, '0.008*"game" + 0.006*"team" + 0.006*"play" + 0.005*"first"')
(4, '0.012*"file" + 0.007*"program" + 0.006*"available" + 0.006*"files"')
(5, '0.009*"bike" + 0.006*"ground" + 0.005*"wire" + 0.004*"good"')
(6, '0.011*"year" + 0.006*"last" + 0.006*"league" + 0.006*"team"')
(7, '0.007*"nrhj" + 0.005*"wwiz" + 0.004*"bxom" + 0.004*"gizw"')
(8, '0.021*"armenian" + 0.016*"turkish" + 0.015*"armenians" + 0.007*"turks"')
(9, '0.009*"would" + 0.008*"like" + 0.007*"know" + 0.007*"drive"')


In [15]:
print(ldamodel.print_topics())

[(0, '0.010*"would" + 0.008*"people" + 0.007*"think" + 0.005*"know" + 0.005*"like" + 0.005*"jesus" + 0.005*"believe" + 0.005*"even" + 0.004*"many" + 0.004*"time"'), (1, '0.013*"space" + 0.005*"university" + 0.005*"nasa" + 0.005*"information" + 0.005*"research" + 0.004*"data" + 0.004*"center" + 0.004*"available" + 0.004*"also" + 0.003*"national"'), (2, '0.010*"people" + 0.009*"would" + 0.005*"government" + 0.004*"think" + 0.004*"said" + 0.004*"know" + 0.004*"well" + 0.003*"time" + 0.003*"right" + 0.003*"like"'), (3, '0.008*"game" + 0.006*"team" + 0.006*"play" + 0.005*"first" + 0.005*"games" + 0.005*"like" + 0.005*"went" + 0.005*"back" + 0.004*"think" + 0.004*"would"'), (4, '0.012*"file" + 0.007*"program" + 0.006*"available" + 0.006*"files" + 0.005*"information" + 0.005*"output" + 0.004*"entry" + 0.004*"window" + 0.004*"version" + 0.004*"system"'), (5, '0.009*"bike" + 0.006*"ground" + 0.005*"wire" + 0.004*"good" + 0.004*"cover" + 0.004*"henrik" + 0.004*"ride" + 0.004*"pain" + 0.004*"mile

문서별 토픽 분포 보기 

In [16]:
for i, topic_list in enumerate(ldamodel[corpus]):
    if i==5:
        break
    print(i,'번째 문서의 topic 비율은',topic_list)

0 번째 문서의 topic 비율은 [(2, 0.9854825)]
1 번째 문서의 topic 비율은 [(0, 0.91893375), (6, 0.06155213)]
2 번째 문서의 topic 비율은 [(0, 0.16560353), (2, 0.6986081), (3, 0.036119662), (4, 0.08998995)]
3 번째 문서의 topic 비율은 [(0, 0.20255503), (2, 0.1899682), (4, 0.3128432), (6, 0.110196546), (9, 0.17697272)]
4 번째 문서의 topic 비율은 [(3, 0.96666044)]


In [17]:
def make_topictable_per_doc(ldamodel, corpus):
    topic_table = pd.DataFrame()

    # 몇 번째 문서인지를 의미하는 문서 번호와 해당 문서의 토픽 비중을 한 줄씩 꺼내온다.
    for i, topic_list in enumerate(ldamodel[corpus]):
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
        doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
        # 각 문서에 대해서 비중이 높은 토픽순으로 토픽을 정렬한다.
        # EX) 정렬 전 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (10번 토픽, 5%), (12번 토픽, 21.5%), 
        # Ex) 정렬 후 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (12번 토픽, 21.5%), (10번 토픽, 5%)
        # 48 > 25 > 21 > 5 순으로 정렬이 된 것.

        # 모든 문서에 대해서 각각 아래를 수행
        for j, (topic_num, prop_topic) in enumerate(doc): #  몇 번 토픽인지와 비중을 나눠서 저장한다.
            if j == 0:  # 정렬을 한 상태이므로 가장 앞에 있는 것이 가장 비중이 높은 토픽
                topic_table = topic_table.append(pd.Series([int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)
                # 가장 비중이 높은 토픽과, 가장 비중이 높은 토픽의 비중과, 전체 토픽의 비중을 저장한다.
            else:
                break
    return(topic_table)

In [18]:
topictable = make_topictable_per_doc(ldamodel, corpus)
topictable = topictable.reset_index() # 문서 번호을 의미하는 열(column)로 사용하기 위해서 인덱스 열을 하나 더 만든다.
topictable.columns = ['문서 번호', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
topictable[:10]

Unnamed: 0,문서 번호,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,0,2.0,0.9855,"[(2, 0.98548245)]"
1,1,0.0,0.9189,"[(0, 0.9189321), (6, 0.061554026)]"
2,2,2.0,0.6986,"[(0, 0.16562147), (2, 0.69862056), (3, 0.03609..."
3,3,4.0,0.3128,"[(0, 0.20266289), (2, 0.18990725), (4, 0.31283..."
4,4,3.0,0.9667,"[(3, 0.96666044)]"
5,5,0.0,0.7514,"[(0, 0.7513692), (6, 0.070813775), (8, 0.14864..."
6,6,8.0,0.7088,"[(3, 0.010084361), (5, 0.013729511), (8, 0.708..."
7,7,2.0,0.5061,"[(0, 0.31030843), (2, 0.5060919), (5, 0.156614..."
8,8,3.0,0.3683,"[(0, 0.22279061), (1, 0.05294561), (2, 0.18968..."
9,9,9.0,0.6729,"[(2, 0.13092947), (5, 0.1719134), (8, 0.016003..."
