In [1]:
import csv

import numpy as np
import pandas as pd
import tensorflow as tf

# Seed NumPy's global random generator so NumPy-driven randomness repeats on a fresh run.
np.random.seed(seed=777)

In [2]:
# Load the training corpus as a two-column frame. The file is read as tab-separated with no
# header row, so the column names are supplied here: a "__label__X" class tag and the
# space-separated, POS-tagged tokens of one review.
#
# quoting=csv.QUOTE_NONE: the text column is raw review text, not CSV-quoted data. With the
# default quoting, a field that starts with '"' is read as a quoted field and swallows
# everything up to the next '"' (possibly across line breaks), silently merging rows.
data = pd.read_csv(
    '../5-0. FastText/train_corpus.txt',
    delimiter='\t',
    names=['label', 'words'],
    quoting=csv.QUOTE_NONE,
)

In [3]:
# Peek at the first five rows to sanity-check the parsed label / words columns.
data.head(n=5)

Unnamed: 0,label,words
0,__label__N,아_IC 더빙_NNG ._SF ._SY 진짜_MAG 짜증_NNG 나_NP 네요_VC...
1,__label__P,흠_IC ._SF .._SY 포스터_NNP 보고_JKB 초딩_NNG 영화_NNG 줄...
2,__label__N,너무_MAG 재_XPN 밓었_UNKNOWN 다_MAG 그래서_VV+EC 보_VX 는...
3,__label__N,교도소_NNG 이야기_NNG 구먼_VCP+EF ._SF ._SY 솔직히_MAG 재미...
4,__label__P,사이몬페그_NNP 의_JKG 익살_NNG 스런_XSA+ETM 연기_NNG 가_JKS...


In [4]:
# Tokenise every review into a list of "<surface>_<POS>" strings.
# str() guards against non-string cells (e.g. NaN for an empty review); such a cell
# becomes the single token 'nan'.
token_lists = [str(w).strip().split(' ') for w in data.words.values]

# Reviews have different lengths, so the token lists are ragged. Fill a 1-D object array
# explicitly: np.array(list_of_lists) merely warns about ragged input on older NumPy but
# raises ValueError from NumPy 1.24 onwards.
x_data = np.empty(len(token_lists), dtype=object)
for i, tokens in enumerate(token_lists):
    x_data[i] = tokens

# Drop the "__label__" prefix so only the bare class name remains (e.g. 'N' or 'P').
y_data = np.array([w.replace('__label__','') for w in data.label.values])
print(x_data.shape)
print(y_data.shape)

(149985,)
(149985,)


In [23]:
tagging_filter = []
tmp = []
for words in x_data[:1000]:
    for word in words:
        w = word.split('_')
        if len(w) > 1:
            pos1 = w[1]
            if len(pos1) > 0 and len(pos1[0]) > 0 and pos1[0] in ['N']:
                tmp.append(word)
    if len(tmp) > 0:
        tagging_filter.append(tmp)

In [24]:
from gensim.models import word2vec
import multiprocessing
model = word2vec.Word2Vec(
    workers     = multiprocessing.cpu_count(),
    window      = 5,      # Context window size 
    size        = 300,    # 300차원짜리 벡터스페이스에 embedding
    min_count   = 1000,     # 등장 횟수가 10 이하인 단어는 무시
    sg          = 0,      # 0이면 CBOW, 1이면 skip-gram을 사용
    batch_words = 100,    # 사전을 구축할때 한번에 읽을 단어 수
    iter        = 100     # 보통 딥러닝에서 말하는 epoch과 비슷한, 반복 횟수
    )

model.build_vocab(tagging_filter)
model.train(tagging_filter)
# 트레이닝이 완료되면 init_sims 명령으로 필요없는 메모리를 unload 시킨다.
model.init_sims(replace=True)

In [25]:
# Grab the model's embedding matrix and display it.
# Presumably row i belongs to the word model.index2word[i] (the plotting cell pairs the two
# up in that order) — TODO confirm for the installed gensim version.
vectors = model.syn0
vectors

array([[ 0.00144855, -0.0473314 , -0.02546046, ...,  0.02198311,
        -0.04224599, -0.04596429],
       [ 0.07621625,  0.03478968, -0.03362486, ..., -0.12553377,
         0.03926117, -0.02914982],
       [-0.03797698, -0.10297445,  0.01636764, ..., -0.04329744,
        -0.01821846, -0.07366881],
       ..., 
       [ 0.01703958,  0.02020932,  0.00251819, ...,  0.06026173,
         0.06738231, -0.03503033],
       [-0.06826647, -0.07715841, -0.0342704 , ...,  0.14538567,
        -0.00164928, -0.02938739],
       [ 0.03275493,  0.0289843 , -0.06080511, ...,  0.0112748 ,
         0.06513945,  0.05452209]], dtype=float32)

In [26]:
from sklearn.manifold import TSNE
# Reduce the word vectors with t-SNE. n_components is the dimensionality of the output, so
# 2 yields points that can be drawn directly on a Cartesian plane.
# random_state pins the stochastic optimisation: without it the layout depends on whatever
# state NumPy's global generator happens to be in, so re-running this cell (or running the
# cells in a different order) produces a different picture.
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, random_state=777)
vectors_simple = tsne.fit_transform(vectors)

In [27]:
from bokeh.plotting import figure, show, output_notebook, output_file
from bokeh.models import HoverTool, ColumnDataSource, LabelSet
# Render bokeh output inline in the notebook.
output_notebook()

# Toolbar tools offered on the figure.
TOOLS = "pan,wheel_zoom,box_zoom,reset,resize"

# One record per vocabulary word: its 2-D t-SNE coordinates and the word itself.
source = ColumnDataSource(data={
    'x': vectors_simple[:, 0],
    'y': vectors_simple[:, 1],
    'word': model.index2word,
})

# Scatter plot of the projected word vectors.
p = figure(plot_width=700, plot_height=700, tools=[TOOLS], title="test")
p.circle('x', 'y', size=7, source=source, alpha=0.4, fill_color="red", line_color="#ff9900")

# Small text label beside each point showing the word it represents.
labels = LabelSet(
    x='x', y='y', text='word', level='glyph',
    x_offset=5, y_offset=5,
    source=source, render_mode='canvas', text_font_size="3pt",
)
p.add_layout(labels)

show(p)