In [1]:
import pandas as pd
import pyLDAvis
import pyLDAvis.lda_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

# Read the CSV behind the short link and take its '내용' column as the corpus.
df_Seoul120 = pd.read_csv(
    "https://bit.ly/seoul-120-text-csv"
)
Seoul120_content = df_Seoul120.loc[:, '내용']
# Preview the first three documents.
Seoul120_content.head(3)

0    아빠 육아휴직 장려금   업무개요  남성근로자의 육아휴직을 장려하고 양육에 따른 경...
1    서울산업진흥원 서울메이드란 서울의 감성을 담은 다양하고 새로운 경험을 제공하기 위해...
2    강북구 정비중  업무개요  투명 폐트병을 교환보상하므로 수거율을 높이고 폐기물을 감...
Name: 내용, dtype: object

In [2]:
# Words the tokenizer drops by default; currently empty.
stopwords = list()

In [3]:
# Define the tokenizing function
from konlpy.tag import Okt
okt = Okt()

def tokenizer(raw, pos=["Noun","Alpha","Verb","Number"], stopword=stopwords):
    """Tokenize `raw` with Okt and return the tokens that pass the filters.

    Args:
        raw: Text handed to ``okt.pos``.
        pos: Tags to keep; tokens carrying any other tag are dropped.
        stopword: Words to drop. The default is the module-level
            ``stopwords`` object as bound when this function was defined.

    Returns:
        A list of tokens, in the order ``okt.pos`` yields them, that are
        longer than one character, have a tag in ``pos``, and are not in
        ``stopword``.
    """
    kept = []
    # stem=True requests stemmed forms, e.g. 바뀌나 -> 바뀌다
    for word, tag in okt.pos(raw, stem=True):
        if len(word) <= 1:
            continue
        if tag not in pos or word in stopword:
            continue
        kept.append(word)
    return kept

### TfidfVectorizer
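`max_df=`: ignore terms that appear in more than this share of the documents (a float is read as a proportion); `min_df=`: ignore terms that appear in fewer than this many documents (an int is read as an absolute count).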

In [4]:
# TF-IDF vectorizer built around the custom Okt-based tokenizer.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfVectorizer = TfidfVectorizer(
    tokenizer=tokenizer,
    # token_pattern is unused once a tokenizer is supplied; setting it to None
    # stops scikit-learn from warning about the ignored default pattern.
    token_pattern=None,
    max_df=0.95,  # drop terms found in more than 95% of the documents
    min_df=2,     # drop terms found in fewer than 2 documents
)

In [5]:
# Learn the vocabulary and build the document-term TF-IDF matrix.
tfidf = tfidfVectorizer.fit_transform(Seoul120_content)
# Preview the first document's row. Slice the sparse matrix before calling
# toarray() so only that single row is converted to a dense array.
tfidf[:1].toarray()



array([[0., 0., 0., ..., 0., 0., 0.]])

In [6]:
import numpy as np
# (row, column) indices of the positive TF-IDF weights in the first document.
# Only the first row is densified instead of the whole matrix.
np.where(tfidf[:1].toarray() > 0)

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([  66,   73,  207,  210,  328,  349,  368,  378,  401,  456,  749,
         753,  769,  890,  935,  939, 1005, 1105, 1139, 1391, 1401, 1616,
        1618, 1662, 1680, 1845, 1935, 2142, 2207, 2221, 2227, 2314, 2421,
        2467, 2507, 2535, 2607, 2816, 2990, 3009, 3038, 3039, 3186, 3633,
        3731, 3827, 3834, 3886, 4098, 4121, 4384, 4425, 4515, 4531, 4545,
        4605, 4675, 4714, 4811, 4827, 4890, 4926, 5115, 5207, 5341, 5386,
        5429, 5430, 5569, 5629, 5630, 5958, 6236, 6405, 6486, 6755, 6809]))

## n_components=3

In [7]:
# 3-topic LDA model. random_state is fixed so a fresh Restart & Run All
# yields the same topics; n_jobs=-1 uses all available cores.
latentDirichletAllocation3 = LatentDirichletAllocation(
    n_components=3,
    n_jobs=-1,
    random_state=42,
)

In [8]:
# Fit the 3-topic model on the TF-IDF document-term matrix.
# NOTE(review): LDA is usually fit on raw term counts (CountVectorizer);
# confirm that TF-IDF weights are the intended input here.
latentDirichletAllocation3.fit(tfidf)

In [9]:
# Prepare the pyLDAvis data for the 3-topic model from the fitted LDA,
# the TF-IDF matrix, and the vectorizer that produced it.
vis = pyLDAvis.lda_model.prepare(latentDirichletAllocation3, tfidf, tfidfVectorizer)

In [10]:
# Enable pyLDAvis notebook output, then show the prepared visualization.
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

## n_components=5

In [11]:
# 5-topic LDA model. random_state is fixed so a fresh Restart & Run All
# yields the same topics; n_jobs=-1 uses all available cores.
latentDirichletAllocation5 = LatentDirichletAllocation(
    n_components=5,
    n_jobs=-1,
    random_state=42,
)

In [12]:
# Fit the 5-topic model on the TF-IDF document-term matrix.
latentDirichletAllocation5.fit(tfidf)

In [13]:
# Prepare the pyLDAvis data for the 5-topic model; this rebinds `vis`.
vis = pyLDAvis.lda_model.prepare(latentDirichletAllocation5, tfidf, tfidfVectorizer)

In [14]:
# Enable pyLDAvis notebook output, then show the prepared visualization.
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

## n_components=7

In [19]:
# 7-topic LDA model. random_state is fixed so a fresh Restart & Run All
# yields the same topics; n_jobs=-1 uses all available cores.
latentDirichletAllocation7 = LatentDirichletAllocation(
    n_components=7,
    n_jobs=-1,
    random_state=42,
)

In [16]:
# Fit the 7-topic model on the TF-IDF document-term matrix.
latentDirichletAllocation7.fit(tfidf)

In [17]:
# Prepare the pyLDAvis data for the 7-topic model; this rebinds `vis`.
vis = pyLDAvis.lda_model.prepare(latentDirichletAllocation7, tfidf, tfidfVectorizer)

In [18]:
# Enable pyLDAvis notebook output, then show the prepared visualization.
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)