## Quickstart

http://bokeh.pydata.org/en/latest/docs/user_guide/quickstart.html

In [1]:
import bokeh.sampledata

In [3]:
from bokeh.plotting import figure, output_file, show

# Sample series: five x positions and their y values.
x = list(range(1, 6))
y = [6, 7, 2, 4, 5]

# Route rendering to a static HTML file on disk.
output_file("lines.html")

# Build an empty figure carrying the title and both axis captions.
p = figure(
    title="simple line example",
    x_axis_label='x',
    y_axis_label='y',
)

# Draw the series as a line of width 2, listed in the legend as "Temp.".
p.line(
    x,
    y,
    legend="Temp.",
    line_width=2,
)

# Display the finished figure.
show(p)

## Test: Word2Vec word map from heungbunnolbu.txt

In [95]:
# Load the whole story as a single string. The text is Korean, so the encoding
# is pinned instead of relying on the platform default.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open('heungbunnolbu.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [96]:
from konlpy.tag import Twitter

# Notebook-level tagger instance; `tokenize` calls its `pos` method.
t = Twitter()

In [97]:
def tokenize(doc, tagger=None):
    """Split ``doc`` into normalized, stemmed ``"token/POS"`` strings.

    Args:
        doc: Text to analyze.
        tagger: Object whose ``pos(doc, norm=True, stem=True)`` returns an
            iterable of string sequences (e.g. ``(token, tag)`` pairs).
            Defaults to the notebook-level tagger ``t``.

    Returns:
        A list with one ``"/"``-joined string per item returned by the tagger.
    """
    if tagger is None:
        tagger = t  # fall back to the tagger created earlier in the notebook
    # The loop variable is deliberately not called `t`: reusing that name
    # shadowed the tagger inside the comprehension.
    return ['/'.join(pair) for pair in tagger.pos(doc, norm=True, stem=True)]

In [98]:
%%time
# Tag the full text in one call. The result is a single flat list of
# "token/POS" strings; despite the name it holds every part of speech,
# not only nouns.
text_nouns = tokenize(text)

CPU times: user 160 ms, sys: 467 ms, total: 627 ms
Wall time: 521 ms


In [99]:
# Peek at the first ten "token/POS" strings to sanity-check the tagger output.
text_nouns[:10]

['옛날/Noun',
 '아주/Noun',
 '멀다/Adjective',
 '옛날/Noun',
 ',/Punctuation',
 '착하다/Adjective',
 '아우/Noun',
 '와/Josa',
 '욕심/Noun',
 '쟁이/Suffix']

In [100]:
import gensim

In [101]:
%%time
# NOTE(review): `text_nouns` is a flat list of strings at this point, while
# Word2Vec takes an iterable of sentences (each a list of tokens). Every
# "token/POS" string is therefore consumed as a sentence of single characters,
# which is why the vocabulary printed further down consists of characters.
model = gensim.models.Word2Vec(text_nouns, size=300, window=4, min_count=1, workers=4)

CPU times: user 93 ms, sys: 12.1 ms, total: 105 ms
Wall time: 86 ms


In [102]:
# (total number of tokens, number of distinct tokens)
len(text_nouns), len(set(text_nouns))

(898, 364)

In [103]:
# Upper bound on how many vocabulary entries are sliced out for plotting.
top = 1000

In [104]:
# At most `top` rows of `model.syn0`, the model's learned word vectors.
vectors = model.syn0[:top]

In [105]:
# Vocabulary entries for the same slice, used as point labels.
# NOTE(review): relies on `index2word` being ordered like the rows of `syn0` — confirm.
labels = model.index2word[:top]

In [106]:
# Inspect the first ten vocabulary entries of the trained model.
labels[:10]

['/', 'o', 'n', 'u', 'N', 'a', 't', 'e', '다', 'r']

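...  
왜죠...? (Why are the labels single characters?)

`Word2Vec` expects a list of sentences, where each sentence is itself a list of tokens. `text_nouns` above is one flat list of strings, so every `"token/POS"` string was treated as a sentence and split into characters. The following cells tokenize the file line by line so that each line becomes one sentence.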

In [118]:
# Read the story line by line and keep only non-blank lines, so that each
# remaining line can be tokenized as its own sentence. Kept lines are left
# unchanged, including their trailing newline.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open('heungbunnolbu.txt', 'r', encoding='utf-8') as f:
    # `strip()` also drops lines that contain only whitespace.
    texts = [line for line in f if line.strip()]

In [119]:
# Show the first two non-blank lines of the story.
texts[:2]

['옛날 아주 먼 옛날, 착한 아우와 욕심쟁이 형이 한 마을에 살았습니다. 아버지가 물려준 재산은 형인 놀부가 몽땅 차지해서 동생인 흥부는 가난하게 살아야 했습니다. 아이가 많은 흥부는 아무리 열심히 일해도 살림이 나아지지 않았습니다.\n',
 '어느날 먹을 것이 떨어진 흥부는 할 수 없이 놀부를 찾아 갔습니다. "형님, 쌀 좀 꾸어 주세요. 아이들이 굶고 있어요. 가을이 되면 꼭 갚겠습니다." "뭐라고?" 이 게으름뱅이 같은 놈! 열심히 일 할 생각은 않고 구걸이나 다니다니! 썩 나가거라!" 놀부는 담뱃대를 휘두르며 소리를 질러 흥부를 쫒아 버렸습니다. \n']

In [120]:
# One token list per line of the story, i.e. a list of sentences.
text_nouns = [tokenize(paragraph) for paragraph in texts]

In [122]:
%%time
# Retrain on the per-line token lists, so each vocabulary entry is a whole
# "token/POS" string rather than a single character.
model = gensim.models.Word2Vec(text_nouns, size=300, window=4, min_count=1, workers=4)

CPU times: user 54.6 ms, sys: 14.2 ms, total: 68.8 ms
Wall time: 81.4 ms


In [124]:
# Word vectors of the retrained model (at most `top` rows).
vectors = model.syn0[:top]

In [125]:
# Refresh the labels from the retrained model so they line up with `vectors`;
# the existing `labels` list was built from the earlier character-level model.
labels = model.index2word[:top]

In [126]:
# Confirm the vocabulary now holds whole "token/POS" entries.
labels[:10]

['./Punctuation',
 '하다/Verb',
 '가/Josa',
 '이/Josa',
 '박/Noun',
 '을/Josa',
 ',/Punctuation',
 '는/Josa',
 '를/Josa',
 '"/Punctuation']

In [127]:
import numpy as np
from sklearn.manifold import TSNE

In [128]:
# Project the word vectors down to 2-D for plotting. The shape guard keeps the
# cell re-runnable: once `vectors` has two columns it is left untouched.
if np.shape(vectors)[1] > 2:
    tsne = TSNE(
        perplexity=10,
        n_components=2,
        init='random',
        n_iter=1000,
        verbose=1,
        learning_rate=500,
        method='exact',
        random_state=0,  # fixed seed so the projection is repeatable for a given input
    )
    vectors = tsne.fit_transform(vectors)

[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 364 / 364
[t-SNE] Mean sigma: 0.002942
[t-SNE] KL divergence after 100 iterations with early exaggeration: 23.584786
[t-SNE] Error after 375 iterations: 23.584786


In [129]:
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, LabelSet

In [130]:
# Column data for the plot: the two projected coordinates plus the word
# displayed at each point.
source = ColumnDataSource(
    data={
        'x': vectors[:, 0],
        'y': vectors[:, 1],
        'word': labels,
    }
)

In [131]:
# Text annotations: place each `word` at the x/y columns of `source`.
label_set = LabelSet(x='x',y='y',text='word',source=source)

In [132]:
# Comma-separated names of the toolbar tools requested for the figure.
# NOTE(review): `resize` may not exist in newer Bokeh releases — confirm
# against the installed version.
tools = "pan,wheel_zoom,box_zoom,reset,resize"

In [133]:
# 900x900 figure for the word map; the tool string is passed wrapped in a
# one-element list.
# NOTE(review): 'title' looks like a placeholder caption.
p = figure(plot_width=900,plot_height=900,tools=[tools],title='title')

In [134]:
# One semi-transparent marker per word, read from the x/y columns of `source`.
p.circle('x','y',size=5,source=source,alpha=0.6)

In [135]:
# Attach the text labels to the figure.
p.add_layout(label_set)

In [136]:
# Render the word map.
# NOTE(review): output_file("lines.html") from the quickstart cell appears to
# still be in effect here, so this likely writes to that same file — confirm.
show(p)

## Working in the Notebook

http://bokeh.pydata.org/en/latest/docs/user_guide/notebook.html

In [138]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.layouts import row

In [139]:
# Configure Bokeh to render plots inline in the notebook.
output_notebook()

In [140]:
# Show the current figure `p` again after enabling notebook output.
show(p)