In [None]:
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
from collections import Counter
from scipy.sparse import dok_matrix
import numpy as np
from nltk.stem import WordNetLemmatizer
import stop_words
import ujson as json
import re

# Stop-word set used by the tokenizer: the stock English list from the
# `stop_words` package plus extra words this notebook filters out on top of it.
stopwords = {
    *stop_words.get_stop_words('en'),
    'quote', 'pmquote', 'amquote', 'just', 'don', 'one', 'thing', 'even',
    'way', 'maybe', 'also', 'please', 'well', 'actually', 'something',
    'going', 'anything', 'le', 'ever', 'say', 'see', 'likely', 'per',
    'another', 'someone', 'let', 'anyone', 'doesn', 'include', 'doe',
}

# Shared lemmatizer instance; its `lemmatize` method is applied to every token.
lemmatizer = WordNetLemmatizer()

In [None]:
def parse_string(input_string):
    """Tokenize a raw text into a list of cleaned, lemmatized words.

    The text is lower-cased, anything starting with ``http`` up to the next
    whitespace is blanked out, and every non-letter character is treated as a
    separator. Each remaining token is lemmatized with the module-level
    ``lemmatizer``; a lemma is kept only if it is longer than two characters
    and is not in the module-level ``stopwords`` set.

    Args:
        input_string: The text to tokenize.

    Returns:
        The kept lemmas, in their original order (duplicates preserved).
    """
    text = re.sub(r'http\S+', ' ', input_string.lower())
    tokens = re.sub( "[^a-zA-Z]", " ", text).split()

    kept = []
    for token in tokens:
        lemma = lemmatizer.lemmatize(token)
        # Stop words are checked on the lemma, not on the raw token.
        if len(lemma) > 2 and lemma not in stopwords:
            kept.append(lemma)
    return kept

In [None]:
# Load the whole post dump into memory as `posts`.
# The encoding is given explicitly: JSON is UTF-8, while open() would otherwise
# fall back to the platform default and can mis-decode or fail on non-ASCII text.
with open('./bitcoin_post.json', encoding='utf-8') as f:
    posts = json.load(f)

In [None]:
# Quick sanity check of the load: how many posts there are, then one sample record.
post_count = len(posts)
print(post_count)

first_post = posts[0]
print(first_post)

In [None]:
MIN_WORD_FREQ = 10   # words seen fewer times than this across the corpus are dropped
MIN_POST_WORDS = 5   # posts left with fewer words than this are dropped

preprocessed_data = []
voca = set()
word_freq = Counter()

# Read the dump once (explicit UTF-8, since JSON is UTF-8 and the platform
# default encoding may differ).
with open('./bitcoin_post.json', encoding='utf-8') as f:
    posts = json.load(f)

# Pass 1: tokenize every post exactly once, keep the tokens on the post and
# accumulate corpus-wide word frequencies.
for post in posts:
    post['words'] = parse_string(post['body'])
    word_freq.update(post['words'])

# Pass 2: reuse the stored tokens, drop rare words, and keep only posts that
# still have enough words left.
for post in posts:
    post_words = [w for w in post['words'] if word_freq[w] >= MIN_WORD_FREQ]
    if len(post_words) < MIN_POST_WORDS:
        continue

    voca.update(post_words)
    post['words'] = post_words
    preprocessed_data.append(post)

# Sort the vocabulary so word ids are the same on every run; iteration order
# of a set of strings is not stable between interpreter sessions.
voca = sorted(voca)
word2id = {w: i for i, w in enumerate(voca)}
del posts
# Slow cell: tokenizing the full corpus takes on the order of a minute.

In [None]:
# Document-term matrix: one row per kept post, one column per vocabulary word,
# each cell holding how often the word occurs in the post.
tdm = dok_matrix((len(preprocessed_data), len(voca)), dtype=np.float32)
for row, post in enumerate(preprocessed_data):
    # Count each post's words first, then write every (row, column) cell once.
    for word, count in Counter(post['words']).items():
        tdm[row, word2id[word]] = count

# Convert to CSR and rescale with sklearn's `normalize` (default settings).
tdm = normalize(tdm.tocsr())
# Takes about 1 minute.

In [None]:
K = 7  # number of topics (NMF components)

# Fix the random state so the factorization, and therefore the topic
# numbering and top words shown in later cells, is the same on every run.
nmf = NMF(n_components=K, random_state=0)
W = nmf.fit_transform(tdm)   # per-document topic weights; later cells take argmax over axis 1
H = nmf.components_          # per-topic word weights; H[k] is indexed by vocabulary id

In [None]:
# For every topic, print its ten highest-weighted vocabulary words on one line.
for k in range(K):
    print(f"{k}th topic")
    ranked_word_ids = np.argsort(H[k])[::-1]  # ascending argsort, reversed
    print(''.join(f"{voca[idx]} " for idx in ranked_word_ids[:10]))

In [None]:
from sklearn.manifold import TSNE

TSNE_SAMPLE_SIZE = 2000   # upper bound on the number of documents embedded
TSNE_SEED = 0             # seeds both the sampling and t-SNE for reproducible plots

# Sample documents WITHOUT replacement so no document appears twice in the
# embedding, and never ask for more documents than the corpus contains.
n_documents = len(preprocessed_data)
random_index = np.random.RandomState(TSNE_SEED).choice(
    n_documents, size=min(TSNE_SAMPLE_SIZE, n_documents), replace=False)
document_2d = TSNE(init='pca', random_state=TSNE_SEED).fit_transform(tdm[random_index].toarray())

# Assign each sampled document to its highest-weighted topic, then group the
# sample positions (indices into random_index / document_2d) by topic.
document_topic = W[random_index, :].argmax(axis=1)
topic_document_indexes = [[] for _ in range(K)]
for i, topic in enumerate(document_topic):
    topic_document_indexes[topic].append(i)

# Takes about 2-3 minutes.

In [None]:
from bokeh.models import HoverTool, Legend
from bokeh.palettes import Category20
from bokeh.io import show, output_notebook
from bokeh.plotting import figure, ColumnDataSource
output_notebook()

# Horizontal extent of the embedding. Both bounds come from the x column
# (column 0); padding is a fraction of the span so it widens the view whatever
# the sign of the coordinates. The right side gets the larger margin.
x_min = float(document_2d[:, 0].min())
x_max = float(document_2d[:, 0].max())
x_span = x_max - x_min

# Figure and toolbar setup.
p = figure(plot_width=900, plot_height=600,
           toolbar_location='above',
           x_range=(x_min - 0.05 * x_span, x_max + 0.2 * x_span))

# Build one data source and one circle renderer per topic.
# NOTE(review): Category20[10] holds 10 colors, so K > 10 would raise IndexError here.
circles = []
for k, document_indexes in enumerate(topic_document_indexes):
    # document_indexes are positions in the sample; map them back to the posts.
    sampled_posts = [preprocessed_data[random_index[i]] for i in document_indexes]
    document_source = ColumnDataSource(data={
        'x': document_2d[document_indexes, 0],
        'y': document_2d[document_indexes, 1],
        'topic': [k for _ in document_indexes],
        'title': [sampled_post['title'] for sampled_post in sampled_posts],
        'body': [sampled_post['body'][:75] for sampled_post in sampled_posts],  # first 75 characters only
        'color': [Category20[10][k] for _ in document_indexes],
    })
    circles.append(p.circle('x', 'y', color='color', legend='topic', source=document_source, size=6))

# A few interactions: hover tooltips on the points, and click-to-hide legend entries.
p.add_tools(HoverTool(tooltips=[('topic', '@topic'), ("title", "@title"), ('body', '@body')], renderers=circles, mode='mouse'))
p.legend.click_policy = 'hide'
show(p)

In [None]:
# Repeat the topic summary: each topic's ten highest-weighted vocabulary words.
for k in range(K):
    print(f"{k}th topic")
    top_word_ids = H[k].argsort()[::-1][:10]  # ascending argsort, reversed, first ten
    topic_line = ''.join(f"{voca[idx]} " for idx in top_word_ids)
    print(topic_line)