# Doc2Vec model
> * Positive or Negative

In [1]:
import pickle

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

## 모델 생성을 위한 함수정의

In [2]:
import multiprocessing
cores = multiprocessing.cpu_count()
def Make_Doc2Vec_Model(data, size, dm, dm_concat, dm_mean, hs, negative, epoch, window, alpha, min_alpha, workers, tagger):
    """Train a gensim Doc2Vec model on ``data``, save it under ./model/ and return it.

    Args:
        data: Corpus handed to ``build_vocab`` and ``train``. It is iterated
            more than once, so it must be re-iterable (e.g. a list);
            presumably a list of TaggedDocument-like items -- verify against caller.
        size: Passed to Doc2Vec as ``vector_size``.
        dm, dm_concat, dm_mean, hs, negative, alpha, min_alpha, workers:
            Forwarded unchanged to the Doc2Vec constructor.
        epoch: Passed to Doc2Vec as ``epochs`` and reused for ``train``.
        window: Context window size. When ``None`` the argument is omitted so
            that gensim's own default applies.
        tagger: Label of the tokenizer that produced ``data``; only used to
            build the file name of the saved model.

    Returns:
        The trained Doc2Vec model (also saved to disk).
    """
    import os
    from datetime import datetime

    from gensim.models import doc2vec
    from tqdm import tqdm

    start = datetime.now()
    modelPath = './model/'
    modelName = 'doc2vec_size-{}_epoch-{}_window-{}_negative-{}_hs-{}_dm-{}_dm_concat-{}_dm_mean-{}_by-{}.model'.format(
        size, epoch, window, negative, hs, dm, dm_concat, dm_mean, tagger)
    modelName = modelPath + modelName
    print(modelName)

    # Make sure the output directory exists *before* training, so a long
    # training run cannot be lost to a failing save() at the very end.
    os.makedirs(modelPath, exist_ok=True)

    params = dict(vector_size=size, dm=dm, dm_concat=dm_concat, dm_mean=dm_mean,
                  negative=negative, hs=hs, alpha=alpha, min_alpha=min_alpha,
                  workers=workers, epochs=epoch)
    # Only forward `window` when the caller chose one; otherwise keep gensim's default.
    if window is not None:
        params['window'] = window
    d2v_model = doc2vec.Doc2Vec(**params)

    d2v_model.build_vocab(tqdm(data))
    # `epochs` is the supported attribute name; the old `iter` alias is deprecated.
    # NOTE(review): the tqdm bar here appears to cover only the first pass over
    # the corpus, not the whole multi-epoch training -- confirm.
    d2v_model.train(tqdm(data), total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

    # The reported running time deliberately excludes saving, as before.
    end = datetime.now()
    d2v_model.save(modelName)
    print("Total running time: ", end - start)
    return d2v_model

# Doc2Vec 생성

* vector size
 [ 1000, 2000 ]

In [3]:
import numpy as np
import pandas as pd

## 감정 분석을 위한 rawdata

In [None]:
# Load the raw sentiment corpus; header=None means the columns are numbered 0, 1, ...
rawdata = pd.read_csv(
    './data/sentiment_data/raw_data_for_sentiment.txt',
    encoding='utf-8',
    header=None,
)
print(rawdata.shape)

## Making Doc2Vec Using tagger Twitter

In [4]:
from collections import namedtuple
from gensim.models.doc2vec import TaggedDocument
# NOTE: this rebinds the TaggedDocument name imported from gensim on the previous
# line to a plain namedtuple with the two fields `words` and `tags`.
# NOTE(review): the .pickled files written below presumably reference this
# notebook-level class, so removing this line may break loading them -- confirm
# before cleaning up.
TaggedDocument = namedtuple('TaggedDocument', 'words tags')



### Tagging

In [None]:
from ckonlpy.tag import Twitter as ctwitter
# Shared tagger instance used by tokenize1() below.
ct = ctwitter()

In [None]:
# twitter
def tokenize1(doc):
    """Return the POS-tagged tokens of ``doc`` as '/'-joined strings (e.g. 'word/Noun')."""
    tagged_pairs = ct.pos(doc)
    return list(map('/'.join, tagged_pairs))

In [None]:
# Run only when no pickled tokenization exists yet.
# Iterate the two columns directly instead of doing two `.loc[idx][...]` row
# lookups per document; column 0 is tokenized, column 1 is kept as the label.
raw_doc_ct = [
    (tokenize1(text), label)
    for text, label in tqdm(zip(rawdata[0].values, rawdata[1].values), total=len(rawdata))
]
# Use a context manager so the file is flushed and closed deterministically.
with open('./data/pre_data/tagged_data/pre_data_by_ct_for_sentiment_analysis.pickled', 'wb') as fout:
    pickle.dump(raw_doc_ct, fout)

### Doc2Vec 기본 포맷으로 변경

In [None]:
# Run only when no pickled tagged corpus exists yet.

# Wrap each (tokens, label) pair as a TaggedDocument whose single tag is the label.
tagged_ct = [TaggedDocument(d, [c]) for d, c in tqdm(raw_doc_ct)]
# Use a context manager so the file is flushed and closed deterministically.
with open('./data/pre_data/tagged_data/pre_by_ct_data_tagged_run_docs.pickled', 'wb') as fout:
    pickle.dump(tagged_ct, fout)

In [None]:
# The tokenized list is not referenced again below; unbind it.
del raw_doc_ct

## model 만들기

In [None]:
# Run when the pickled tagged corpus already exists.
# NOTE: pickle.load executes arbitrary code from the file; only load files you created yourself.
with open('./data/pre_data/tagged_data/pre_by_ct_data_tagged_run_docs.pickled', 'rb') as fin:
    tagged_ct = pickle.load(fin)

### train dataset & test dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Run only when no pickled train/test split exists yet.
# 90/10 split with a fixed seed so the split is reproducible.
train, test = train_test_split(tagged_ct, test_size=0.1, random_state=42)
del tagged_ct
# Use context managers so both files are flushed and closed deterministically.
with open('./data/pre_data/train_test_Data/pre_by_ct_train.pickled', 'wb') as fout:
    pickle.dump(train, fout)
with open('./data/pre_data/train_test_Data/pre_by_ct_test.pickled', 'wb') as fout:
    pickle.dump(test, fout)

In [5]:
# Run when the pickled train/test split already exists.
# NOTE: pickle.load executes arbitrary code from the file; only load files you created yourself.
with open('./data/pre_data/train_test_Data/pre_by_ct_train.pickled', 'rb') as fin:
    train = pickle.load(fin)
with open('./data/pre_data/train_test_Data/pre_by_ct_test.pickled', 'rb') as fin:
    test = pickle.load(fin)

### model 1

In [6]:
from konlpy.utils import pprint

* size : Dimensionality of the feature vectors 
* dm : 1 - distributed memory (PV-DM)  
* dm_concat : 1 - use concatenation of context vectors rather than sum/average  
* dm_mean : 0 - use the sum of the context word vectors (1 would use the mean)  
> dm_mean only applies when dm is used in non-concatenative mode.
* negative : 7 - negative specifies how many 'noise words' should be drawn.
* hs : 0 - hierarchical softmax 사용여부
* window : 5 - The maximum distance between the current and predicted word within a sentence.  
* alpha : the initial learning rate  
* min_alpha : learning rate will linearly drop to min_alpha as training progresses

In [7]:
%%time
#PV-DM W/
d2v_model = Make_Doc2Vec_Model(data=train, size = 2000, dm = 1, dm_concat = 1,
                   dm_mean = 0, negative = 7, hs = 0, epoch = 20, window = 5,
                   alpha = 0.025, min_alpha = 0.025, workers = cores, tagger = 'ct')
pprint(d2v_model.most_similar('문재인/Noun'))
pprint(d2v_model.most_similar('노무현/Noun'))
pprint(d2v_model.most_similar('박근혜/Noun'))

./model/doc2vec_size-2000_epoch-20_window-5_negative-7_hs-0_dm-1_dm_concat-1_dm_mean-0_by-ct.model


100%|██████████| 442359/442359 [00:17<00:00, 24601.61it/s]
100%|██████████| 442359/442359 [2:04:35<00:00, 59.17it/s]  


Total running time:  9:02:40.866757


  """


[('김종인/Noun', 0.5183357000350952),
 ('안상수/Noun', 0.5107406377792358),
 ('박지원/Noun', 0.4917794466018677),
 ('추미애/Noun', 0.4915170669555664),
 ('손학규/Noun', 0.48670122027397156),
 ('김한길/Noun', 0.4859696328639984),
 ('이종걸/Noun', 0.48380544781684875),
 ('최병렬/Noun', 0.48350751399993896),
 ('강재섭/Noun', 0.46817871928215027),
 ('정몽준/Noun', 0.46718764305114746)]


  


[('이명박/Noun', 0.5241331458091736),
 ('金泳三/Foreign', 0.5214452743530273),
 ('노태우/Noun', 0.4929990768432617),
 ('盧泰愚/Foreign', 0.47707486152648926),
 ('김영삼/Noun', 0.4678577184677124),
 ('루카셴코/Noun', 0.4637618660926819),
 ('마두로/Noun', 0.4572169780731201),
 ('박정희/Noun', 0.45470479130744934),
 ('金大中/Foreign', 0.4526221752166748),
 ('김대중/Noun', 0.45118850469589233)]


  import sys


[('이명박/Noun', 0.45259445905685425),
 ('朴/Foreign', 0.41956573724746704),
 ('노무현/Noun', 0.38582926988601685),
 ('문재인/Noun', 0.38446545600891113),
 ('盧泰愚/Foreign', 0.37503623962402344),
 ('이회창/Noun', 0.37414059042930603),
 ('盧/Foreign', 0.3680959939956665),
 ('마두로/Noun', 0.36593005061149597),
 ('아로요/Noun', 0.36101600527763367),
 ('안상수/Noun', 0.3571595549583435)]
Wall time: 9h 3min 23s


In [8]:
# Unbind the trained model (Make_Doc2Vec_Model already saved it to disk) before training the next one.
del d2v_model

### model 2

* size : Dimensionality of the feature vectors 
* dm : 1 - distributed memory (PV-DM)  
* dm_concat : 0 - use the sum/average of context vectors instead of concatenation  
* dm_mean : 1 - use the mean of the context word vectors (0 would use the sum)  
> dm_mean only applies when dm is used in non-concatenative mode.
* negative : 7 - negative specifies how many 'noise words' should be drawn.
* hs : 0 - hierarchical softmax 사용여부
* window : 10 - The maximum distance between the current and predicted word within a sentence.  
* alpha : the initial learning rate  
* min_alpha : learning rate will linearly drop to min_alpha as training progresses

In [9]:
%%time
#PV-DM w/
d2v_model = Make_Doc2Vec_Model(data=train, size = 2000, dm = 1, dm_concat = 0,
                   dm_mean = 1, negative = 7, hs = 0, epoch = 20, window = 10,
                   alpha = 0.025, min_alpha = 0.025, workers = cores, tagger = 'ct')
pprint(d2v_model.most_similar('문재인/Noun'))
pprint(d2v_model.most_similar('노무현/Noun'))
pprint(d2v_model.most_similar('박근혜/Noun'))

./model/doc2vec_size-2000_epoch-20_window-10_negative-7_hs-0_dm-1_dm_concat-0_dm_mean-1_by-ct.model


100%|██████████| 442359/442359 [00:35<00:00, 12308.80it/s]
100%|██████████| 442359/442359 [04:41<00:00, 1571.53it/s]


Total running time:  1:27:47.740909


  """


[('문/Noun', 0.24678705632686615),
 ('이태근/Noun', 0.2418072372674942),
 ('조수용/Noun', 0.21648380160331726),
 ('이인찬/Noun', 0.2156585156917572),
 ('손학규/Noun', 0.20564328134059906),
 ('원희목/Noun', 0.19503045082092285),
 ('정세균/Noun', 0.19401618838310242),
 ('김상백/Noun', 0.19292360544204712),
 ('드림위즈/Noun', 0.19290193915367126),
 ('마에하라/Noun', 0.19176803529262543)]


  


[('노/Noun', 0.3435490131378174),
 ('이명박/Noun', 0.3290066123008728),
 ('박근혜/Noun', 0.27620890736579895),
 ('김영삼/Noun', 0.2651824355125427),
 ('▒金/Foreign', 0.2198270857334137),
 ('셀라야/Noun', 0.21787340939044952),
 ('면노/Noun', 0.21710556745529175),
 ('탄자/Noun', 0.2153436541557312),
 ('기록관/Noun', 0.21247893571853638),
 ('취임일/Noun', 0.21080957353115082)]


  import sys


[('박/Noun', 0.37544625997543335),
 ('이명박/Noun', 0.3380833864212036),
 ('노무현/Noun', 0.27620890736579895),
 ('면노/Noun', 0.2632753849029541),
 ('김영삼/Noun', 0.2564114034175873),
 ('■朴/Foreign', 0.2518764138221741),
 ('“朴/Foreign', 0.2517133951187134),
 ('아사드/Noun', 0.24524010717868805),
 ('지명전/Noun', 0.24189071357250214),
 ('바첼레트/Noun', 0.24139946699142456)]
Wall time: 1h 27min 54s


In [10]:
# Unbind the trained model (Make_Doc2Vec_Model already saved it to disk) before training the next one.
del d2v_model

### model 3

* size : Dimensionality of the feature vectors 
* dm : 0 - distributed bag of words (PV-DBOW)
* dm_concat : 0 - no concatenation of context vectors  
* dm_mean : 0 - use the sum of the context word vectors  
> dm_concat and dm_mean only apply when dm = 1 (PV-DM), so they have no effect for this model.
* negative : 7 - negative specifies how many 'noise words' should be drawn.
* hs : 0 - hierarchical softmax 사용여부
* window : 5 - The maximum distance between the current and predicted word within a sentence. It is not passed explicitly here (window = None in the call), so gensim's default of 5 is used.  
* alpha : the initial learning rate  
* min_alpha : learning rate will linearly drop to min_alpha as training progresses

In [11]:
%%time
# PV - DBOW
d2v_model = Make_Doc2Vec_Model(data=train, size = 2000, dm = 0, dm_concat = 0,
                   dm_mean = 0, negative = 7, hs = 0, epoch = 20, window = None,
                   alpha = 0.025, min_alpha = 0.025, workers = cores, tagger = 'ct')
pprint(d2v_model.most_similar('문재인/Noun'))
pprint(d2v_model.most_similar('노무현/Noun'))
pprint(d2v_model.most_similar('박근혜/Noun'))

./model/doc2vec_size-2000_epoch-20_window-None_negative-7_hs-0_dm-0_dm_concat-0_dm_mean-0_by-ct.model


100%|██████████| 442359/442359 [00:17<00:00, 24951.43it/s]
100%|██████████| 442359/442359 [05:08<00:00, 1433.95it/s]


Total running time:  1:24:22.376557


  """


[('카오스/Noun', 0.10608500987291336),
 ('통과하던/Verb', 0.10094453394412994),
 ('매복/Noun', 0.08865201473236084),
 ('티롤/Noun', 0.08777494728565216),
 ('서민수/Noun', 0.08608179539442062),
 ('인디안/Noun', 0.08595235645771027),
 ('라이트헤비급/Noun', 0.08498610556125641),
 ('통용/Noun', 0.0843491330742836),
 ('외부/Noun', 0.08415405452251434),
 ('입단속/Noun', 0.08380858600139618)]


  


[('방배동/Noun', 0.09651711583137512),
 ('makim/Alpha', 0.09540006518363953),
 ('빈칸/Noun', 0.09387905150651932),
 ('최윤정/Noun', 0.08827579021453857),
 ('로동당/Noun', 0.08640139549970627),
 ('탈옥/Noun', 0.08564408868551254),
 ('수출상품/Noun', 0.08333203196525574),
 ('단언적/Noun', 0.0822734534740448),
 ('저커버그/Noun', 0.0815555602312088),
 ('통금/Noun', 0.08134506642818451)]


  import sys


[('장보/Noun', 0.10854393988847733),
 ('사망하는/Verb', 0.097142793238163),
 ('읍성/Noun', 0.09556154906749725),
 ('막노동/Noun', 0.09376087784767151),
 ('약속어음/Noun', 0.09169736504554749),
 ('내쉰/Verb', 0.08903086185455322),
 ('생산자물가지수/Noun', 0.08887159824371338),
 ('격화됐/Noun', 0.08878053724765778),
 ('판타스틱/Noun', 0.08716996014118195),
 ('조직책/Noun', 0.08614011108875275)]
Wall time: 1h 24min 59s


In [12]:
# Unbind the Twitter-tagged train/test splits and the last trained model
# before the Mecab-based run below.
del train
del test
del d2v_model

## Making Doc2Vec Using tagger mecab

In [13]:
from collections import namedtuple
from gensim.models.doc2vec import TaggedDocument
# NOTE: this rebinds the TaggedDocument name imported from gensim on the previous
# line to a plain namedtuple with the two fields `words` and `tags`.
# NOTE(review): the .pickled files written below presumably reference this
# notebook-level class, so removing this line may break loading them -- confirm
# before cleaning up.
TaggedDocument = namedtuple('TaggedDocument', 'words tags')

### tagging

In [None]:
from konlpy.tag import Mecab
# Shared tagger instance used by tokenize2() below.
mecab = Mecab()

In [None]:
# mecab
def tokenize2(doc):
    """Return the POS-tagged tokens of ``doc`` as '/'-joined strings (e.g. 'word/NNP')."""
    tagged_pairs = mecab.pos(doc)
    return list(map('/'.join, tagged_pairs))

In [None]:
# Run only when no pickled tokenization exists yet.
# Iterate the two columns directly instead of doing two `.loc[idx][...]` row
# lookups per document; column 0 is tokenized, column 1 is kept as the label.
raw_doc_mecab = [
    (tokenize2(text), label)
    for text, label in tqdm(zip(rawdata[0].values, rawdata[1].values), total=len(rawdata))
]
# Use a context manager so the file is flushed and closed deterministically.
with open('./data/pre_data/tagged_data/pre_data_by_mecab_for_sentiment_analysis.pickled', 'wb') as fout:
    pickle.dump(raw_doc_mecab, fout)

### Doc2Vec 기본 포맷으로 변경

In [None]:
# Run only when no pickled tagged corpus exists yet.
# Wrap each (tokens, label) pair as a TaggedDocument whose single tag is the label.
tagged_mecab = [TaggedDocument(d, [c]) for d, c in raw_doc_mecab]
# Use a context manager so the file is flushed and closed deterministically.
with open('./data/pre_data/tagged_data/pre_by_mecab_data_tagged_run_docs.pickled', 'wb') as fout:
    pickle.dump(tagged_mecab, fout)

In [None]:
# The tokenized list is not referenced again below; unbind it.
del raw_doc_mecab

## model 만들기

In [None]:
# Run when the pickled tagged corpus already exists.
# NOTE: pickle.load executes arbitrary code from the file; only load files you created yourself.
with open('./data/pre_data/tagged_data/pre_by_mecab_data_tagged_run_docs.pickled', 'rb') as fin:
    tagged_mecab = pickle.load(fin)

### train dataset & test dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Run only when no pickled train/test split exists yet.
# 90/10 split with a fixed seed so the split is reproducible.
train2, test2 = train_test_split(tagged_mecab, test_size=0.1, random_state=42)
# Release the Mecab corpus that was just split. The original deleted
# `tagged_ct` here, which is the Twitter corpus and may no longer exist.
del tagged_mecab
# Use context managers so both files are flushed and closed deterministically.
with open('./data/pre_data/train_test_Data/pre_by_mecab_train.pickled', 'wb') as fout:
    pickle.dump(train2, fout)
with open('./data/pre_data/train_test_Data/pre_by_mecab_test.pickled', 'wb') as fout:
    pickle.dump(test2, fout)

In [14]:
# Run when the pickled train/test split already exists.
# NOTE: pickle.load executes arbitrary code from the file; only load files you created yourself.
with open('./data/pre_data/train_test_Data/pre_by_mecab_train.pickled', 'rb') as fin:
    train2 = pickle.load(fin)
with open('./data/pre_data/train_test_Data/pre_by_mecab_test.pickled', 'rb') as fin:
    test2 = pickle.load(fin)


### model 1

* size : Dimensionality of the feature vectors 
* dm : 1 - distributed memory (PV-DM)  
* dm_concat : 1 - use concatenation of context vectors rather than sum/average  
* dm_mean : 0 - use the sum of the context word vectors (1 would use the mean)  
> dm_mean only applies when dm is used in non-concatenative mode.
* negative : 7 - negative specifies how many 'noise words' should be drawn.
* hs : 0 - hierarchical softmax 사용여부
* window : 5 - The maximum distance between the current and predicted word within a sentence.  
* alpha : the initial learning rate  
* min_alpha : learning rate will linearly drop to min_alpha as training progresses

In [15]:
from konlpy.utils import pprint

In [16]:
%%time
#PV-DM W/
d2v_model = Make_Doc2Vec_Model(data=train2, size = 2000, dm = 1, dm_concat = 1,
                   dm_mean = 0, negative = 7, hs = 0, epoch = 20, window = 5,
                   alpha = 0.025, min_alpha = 0.025, workers = cores, tagger = 'mecab')
pprint(d2v_model.most_similar('문재인/NNP'))
pprint(d2v_model.most_similar('노무현/NNP'))
pprint(d2v_model.most_similar('박근혜/NNP'))

./model/doc2vec_size-2000_epoch-20_window-5_negative-7_hs-0_dm-1_dm_concat-1_dm_mean-0_by-mecab.model


100%|██████████| 442359/442359 [00:25<00:00, 17177.83it/s]
100%|██████████| 442359/442359 [2:32:40<00:00, 48.29it/s]  


Total running time:  9:11:46.111764


  """


[('안철수/NNP', 0.5724848508834839),
 ('김무성/NNP', 0.5294156074523926),
 ('김한길/NNP', 0.525905430316925),
 ('박지원/NNP', 0.5178284645080566),
 ('안상수/NNP', 0.5129897594451904),
 ('정몽준/NNP', 0.5129006505012512),
 ('김종인/NNP', 0.5113946199417114),
 ('손학규/NNP', 0.5102208852767944),
 ('문희상/NNP', 0.50984126329422),
 ('정세균/NNP', 0.5082828402519226)]


  


[('이명박/NNP', 0.5427179932594299),
 ('노태우/NNP', 0.5334432125091553),
 ('김대중/NNP', 0.5133453607559204),
 ('전두환/NNP', 0.502194344997406),
 ('김영삼/NNP', 0.49199628829956055),
 ('아로요/NNP', 0.4718013107776642),
 ('MB/SL', 0.4612730145454407),
 ('박정희/NNP', 0.45217856764793396),
 ('盧/NNG', 0.448153018951416),
 ('盧/NNG', 0.44479823112487793)]


  import sys


[('이명박/NNP', 0.5788164138793945),
 ('MB/SL', 0.4639942944049835),
 ('노태우/NNP', 0.45901602506637573),
 ('노무현/NNP', 0.4367173910140991),
 ('박/NNP', 0.43140652775764465),
 ('어김없/VA', 0.428242027759552),
 ('문재인/NNP', 0.42159518599510193),
 ('서청원/NNP', 0.41485822200775146),
 ('윤보선/NNP', 0.41140493750572205),
 ('김무성/NNP', 0.408000648021698)]
Wall time: 9h 15min 42s


In [17]:
# Unbind the trained model (Make_Doc2Vec_Model already saved it to disk) before training the next one.
del d2v_model

### model 2

* size : Dimensionality of the feature vectors 
* dm : 1 - distributed memory (PV-DM)  
* dm_concat : 0 - use the sum/average of context vectors instead of concatenation  
* dm_mean : 1 - use the mean of the context word vectors (0 would use the sum)  
> dm_mean only applies when dm is used in non-concatenative mode.
* negative : 7 - negative specifies how many 'noise words' should be drawn.
* hs : 0 - hierarchical softmax 사용여부
* window : 10 - The maximum distance between the current and predicted word within a sentence.  
* alpha : the initial learning rate  
* min_alpha : learning rate will linearly drop to min_alpha as training progresses

In [18]:
%%time
#PV-DBOW
d2v_model = Make_Doc2Vec_Model(data=train2, size = 2000, dm = 1, dm_concat = 0,
                   dm_mean = 1, negative = 7, hs = 0, epoch = 20, window = 10,
                   alpha = 0.025, min_alpha = 0.025, workers = cores, tagger = 'mecab')
pprint(d2v_model.most_similar('문재인/NNP'))
pprint(d2v_model.most_similar('노무현/NNP'))
pprint(d2v_model.most_similar('박근혜/NNP'))

./model/doc2vec_size-2000_epoch-20_window-10_negative-7_hs-0_dm-1_dm_concat-0_dm_mean-1_by-mecab.model


100%|██████████| 442359/442359 [00:41<00:00, 10705.90it/s]
100%|██████████| 442359/442359 [04:36<00:00, 1599.22it/s]


Total running time:  1:23:08.893789


  """


[('문/VV+ETM', 0.28360697627067566),
 ('문재/NNG', 0.2664259672164917),
 ('이태근/NNP', 0.24687394499778748),
 ('전혜숙/NNP', 0.2448309361934662),
 ('정세균/NNP', 0.24112921953201294),
 ('원희목/NNP', 0.23862120509147644),
 ('조순형/NNP', 0.224394291639328),
 ('손학규/NNP', 0.2221895456314087),
 ('정호준/NNP', 0.21453070640563965),
 ('추미애/NNP', 0.21410048007965088)]


  


[('노/NNP', 0.34591615200042725),
 ('이명박/NNP', 0.3437889814376831),
 ('박근혜/NNP', 0.29280489683151245),
 ('나온다든지/VV+EC', 0.2774888277053833),
 ('노/IC', 0.27348142862319946),
 ('김영삼/NNP', 0.2703419327735901),
 ('ㆍ노무현/UNKNOWN', 0.2680293917655945),
 ('노/XPN', 0.2668951451778412),
 ('유셴코/NNP', 0.26135867834091187),
 ('룰라/IC', 0.2592846155166626)]


  import sys


[('박/NNP', 0.4148608148097992),
 ('정문목/NNP', 0.30864912271499634),
 ('노무현/NNP', 0.2928048372268677),
 ('이명박/NNP', 0.2873270511627197),
 ('우말라/NNP', 0.2748571038246155),
 ('이기흥/NNP', 0.2692067623138428),
 ('노/NNP', 0.2534322440624237),
 ('나온다든지/VV+EC', 0.2485388070344925),
 ('유셴코/NNP', 0.24410727620124817),
 ('김영삼/NNP', 0.244068905711174)]
Wall time: 1h 23min 16s


In [19]:
# Unbind the trained model (Make_Doc2Vec_Model already saved it to disk) before training the next one.
del d2v_model

### model 3

* size : Dimensionality of the feature vectors 
* dm : 0 - distributed bag of words (PV-DBOW)
* dm_concat : 0 - no concatenation of context vectors  
* dm_mean : 0 - use the sum of the context word vectors  
> dm_concat and dm_mean only apply when dm = 1 (PV-DM), so they have no effect for this model.
* negative : 7 - negative specifies how many 'noise words' should be drawn.
* hs : 0 - hierarchical softmax 사용여부
* window : 5 - The maximum distance between the current and predicted word within a sentence. It is not passed explicitly here (window = None in the call), so gensim's default of 5 is used.  
* alpha : the initial learning rate  
* min_alpha : learning rate will linearly drop to min_alpha as training progresses

In [20]:
%%time
#PV-DM w/
d2v_model = Make_Doc2Vec_Model(data=train2, size = 2000, dm = 0, dm_concat = 0,
                   dm_mean = 0, negative = 7, hs = 0, epoch = 20, window = None,
                   alpha = 0.025, min_alpha = 0.025, workers = cores, tagger = 'mecab')
pprint(d2v_model.most_similar('문재인/NNP'))
pprint(d2v_model.most_similar('노무현/NNP'))
pprint(d2v_model.most_similar('박근혜/NNP'))

./model/doc2vec_size-2000_epoch-20_window-None_negative-7_hs-0_dm-0_dm_concat-0_dm_mean-0_by-mecab.model


100%|██████████| 442359/442359 [00:15<00:00, 28443.12it/s]
100%|██████████| 442359/442359 [03:48<00:00, 1933.01it/s]


Total running time:  1:15:33.483903


  """


[('백화점식/NNP', 0.09711657464504242),
 ('버스정류장/NNP', 0.08856742829084396),
 ('사회지도층/NNP', 0.08808135986328125),
 ('햇발/NNG', 0.08589445054531097),
 ('불가피/XR', 0.08572833985090256),
 ('htchoi/SL', 0.08566995710134506),
 ('나빴/VA+EP', 0.08528344333171844),
 ('원화/NNG', 0.08455102145671844),
 ('강종/NNP', 0.08450424671173096),
 ('토론회/NNG', 0.08347360789775848)]


  


[('중소기업청장/NNP', 0.09881342947483063),
 ('맵/NNP', 0.09794344753026962),
 ('폭격/NNG', 0.09161762148141861),
 ('알파인/NNP', 0.0907985121011734),
 ('예치/NNG', 0.09036141633987427),
 ('코르코바두산/NNP', 0.08970692753791809),
 ('박노해/NNP', 0.08800055086612701),
 ('절충/NNP', 0.0862005352973938),
 ('해진다고/XSA+EC+VX+EC', 0.08506476879119873),
 ('봉화군/NNG', 0.08451708406209946)]


  import sys


[('OTC/SL', 0.10523991286754608),
 ('무샤라프/NNP', 0.09607601165771484),
 ('김한표/NNP', 0.09233810007572174),
 ('패권국/NNP', 0.09138218313455582),
 ('방북/NNG', 0.08726222813129425),
 ('역도계/NNP', 0.08249004185199738),
 ('어니/NNP', 0.08202850818634033),
 ('옛적/NNG', 0.08199796080589294),
 ('인거/NNG', 0.0809951201081276),
 ('누들/NNG', 0.08086809515953064)]
Wall time: 1h 15min 40s


In [21]:
# Unbind the Mecab-tagged train/test splits and the last trained model.
del train2
del test2
del d2v_model