# Doc2Vec model
> * Positive or Negative

## * Paragraph Vector
> Le and Mikolov 2014 introduces the Paragraph Vector, which outperforms more naïve representations of documents such as averaging the Word2vec word vectors of a document. The idea is straightforward: we act as if a paragraph (or document) is just another vector like a word vector, but we will call it a paragraph vector. We determine the embedding of the paragraph in vector space in the same way as words. Our paragraph vector model considers local word order like bag of n-grams, but gives us a denser representation in vector space compared to a sparse, high-dimensional representation.

> * Paragraph Vector - Distributed Memory (PV-DM)
>> This is the Paragraph Vector model analogous to Continuous-bag-of-words Word2vec. The paragraph vectors are obtained by training a neural network on the fake task of inferring a center word based on context words and a context paragraph. A paragraph is a context for all words in the paragraph, and a word in a paragraph can have that paragraph as a context.

> * Paragraph Vector - Distributed Bag of Words (PV-DBOW)
>> This is the Paragraph Vector model analogous to Skip-gram Word2vec. The paragraph vectors are obtained by training a neural network on the fake task of predicting words randomly sampled from a paragraph, given only that paragraph's vector (the context words are ignored in the input).

In [1]:
import pickle
import os 
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

## 모델 생성을 위한 함수정의

In [2]:
import multiprocessing

# CPU count reported by the OS; used as the `workers` argument of the training calls in this notebook.
cores = multiprocessing.cpu_count()
def Make_Doc2Vec_Model(data, size, dm, dm_concat, dm_mean, hs, negative, epoch, window, alpha, min_alpha, workers, tagger):
    """Train a gensim Doc2Vec model on `data`, save it under ./model/ and return it.

    Args:
        data: re-iterable collection of tagged documents (e.g. the
            TaggedDocument namedtuples built in this notebook). It is
            iterated once to build the vocabulary and again for training.
        size: dimensionality of the vectors (passed as `vector_size`).
        dm: 1 selects PV-DM, 0 selects PV-DBOW.
        dm_concat, dm_mean, hs, negative, alpha, min_alpha, workers:
            forwarded unchanged to `doc2vec.Doc2Vec`.
        epoch: number of training epochs (passed as `epochs`).
        window: context window size. When None the argument is omitted, so
            gensim's own default is used.
        tagger: label of the tokenizer that produced `data`; it is only used
            in the saved model's file name.

    Returns:
        The trained Doc2Vec model (also saved to disk).
    """
    import os
    from datetime import datetime
    from gensim.models import doc2vec
    from tqdm import tqdm
    tqdm.pandas(desc="progress-bar")

    start = datetime.now()
    modelPath = './model/'
    modelName = 'doc2vec_size-{}_epoch-{}_window-{}_negative-{}_hs-{}_dm-{}_dm_concat-{}_dm_mean-{}_by-{}.model'.format(
        size, epoch, window, negative, hs, dm, dm_concat, dm_mean, tagger)
    modelName = modelPath + modelName
    print(modelName)

    # Create the output directory before training so that a multi-hour run
    # cannot be lost at save time because the directory is missing.
    os.makedirs(modelPath, exist_ok=True)

    params = dict(vector_size=size, dm=dm, dm_concat=dm_concat,
                  dm_mean=dm_mean, negative=negative, hs=hs,
                  alpha=alpha, min_alpha=min_alpha, workers=workers, epochs=epoch)
    # Only pass `window` when the caller supplied one; otherwise gensim's default applies.
    if window is not None:
        params['window'] = window
    d2v_model = doc2vec.Doc2Vec(**params)

    d2v_model.build_vocab(tqdm(data))
    # `Doc2Vec.iter` is a deprecated alias (removed in gensim 4); `epochs` holds the same value.
    d2v_model.train(tqdm(data), total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

    # The reported running time covers vocabulary building and training, not saving.
    end = datetime.now()
    d2v_model.save(modelName)
    print("Total running time: ", end - start)
    return d2v_model

# Doc2Vec 생성

In [3]:
import numpy as np
import pandas as pd

## 감정 분석을 위한 rawdata

In [None]:
# Load the raw sentiment corpus. header=None: the file has no header row,
# so the columns are labelled 0, 1, ...
rawdata = pd.read_csv(
    './data/sentiment_data/raw_data_for_sentiment.txt',
    encoding='utf-8',
    header=None,
)
print(rawdata.shape)

## Making Doc2Vec Using tagger Twitter

In [4]:
from collections import namedtuple
from gensim.models.doc2vec import TaggedDocument
# Rebinds the name TaggedDocument (shadowing the gensim class imported just above)
# to a namedtuple that carries an extra `sentiment` field next to `words` and `tags`.
TaggedDocument = namedtuple('TaggedDocument', ['words', 'tags', 'sentiment'])



### Tagging

In [None]:
from ckonlpy.tag import Twitter as ctwitter
ct = ctwitter()

In [None]:
# twitter
def tokenize1(doc):
    """Tag `doc` with the ckonlpy Twitter tagger `ct` and return each result joined with '/'."""
    tagged = ct.pos(doc)
    return ['/'.join(pair) for pair in tagged]

In [None]:
# Cache of the tokenized corpus. The existence check has to test the very file
# that is loaded and written below ('pre_raw_data_by_ct_...'); checking a
# differently named file means the cache is never found.
raw_doc_ct_path = './data/pre_data/tagged_data/pre_raw_data_by_ct_for_doc2vec_sentiment_analysis.pickled'
if os.path.isfile(raw_doc_ct_path):
    with open(raw_doc_ct_path, 'rb') as fh:
        raw_doc_ct = pickle.load(fh)
else:
    # One (tokens, [document tag], [value of column 1]) tuple per row of rawdata.
    raw_doc_ct = [(tokenize1(rawdata.loc[idx][0]), ['doc_'+str(idx)], [rawdata.loc[idx][1]]) for idx in tqdm(rawdata.index)]
    # `with` guarantees the pickle is flushed and the handle closed.
    with open(raw_doc_ct_path, 'wb') as fh:
        pickle.dump(raw_doc_ct, fh)

### Doc2Vec 기본 포맷으로 변경

In [None]:
# Cache of the corpus converted to TaggedDocument namedtuples.
# Unpickling looks the class up by name, so TaggedDocument must already be defined.
tagged_ct_path = './data/pre_data/tagged_data/pre_data_by_ct_tagged_doc2vec_sentiment_analysis.pickled'
if os.path.isfile(tagged_ct_path):
    with open(tagged_ct_path, 'rb') as fh:
        tagged_ct = pickle.load(fh)
else:
    tagged_ct = [TaggedDocument(words, tags, sentiment) for words, tags, sentiment in tqdm(raw_doc_ct)]
    # `with` guarantees the pickle is flushed and the handle closed.
    with open(tagged_ct_path, 'wb') as fh:
        pickle.dump(tagged_ct, fh)

In [None]:
# Unbind the intermediate token tuples so their memory can be released.
del raw_doc_ct

### train dataset & test dataset

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the tagged documents for testing; the fixed seed keeps the split reproducible.
train, test = train_test_split(
    tagged_ct,
    test_size=0.1,
    random_state=42,
)

In [5]:
# Load the train/test splits from disk when they were saved before; otherwise
# persist the splits currently held in `train` / `test`.
# Files are opened with `with` so the handles are closed (and writes flushed) deterministically.
train_ct_path = './data/pre_data/train_test_Data/pre_data_by_ct_train_for_doc2vec_sentiment_analysis.pickled'
test_ct_path = './data/pre_data/train_test_Data/pre_data_by_ct_test_for_doc2vec_sentiment_analysis.pickled'

if os.path.isfile(train_ct_path):
    with open(train_ct_path, 'rb') as fh:
        train = pickle.load(fh)
else:
    with open(train_ct_path, 'wb') as fh:
        pickle.dump(train, fh)

if os.path.isfile(test_ct_path):
    with open(test_ct_path, 'rb') as fh:
        test = pickle.load(fh)
else:
    with open(test_ct_path, 'wb') as fh:
        pickle.dump(test, fh)

In [None]:
# Unbind the full tagged corpus so its memory can be released.
del tagged_ct

### model 1

In [6]:
from konlpy.utils import pprint

* size : Dimensionality of the feature vectors 
* dm : 1 - distributed memory (PV-DM)  
* dm_concat : 1 - concatenate the context vectors rather than summing/averaging them  
* dm_mean : 0 - use the sum (not the mean) of the context word vectors  
> dm_mean only applies when dm is used in non-concatenative mode.
* negative : 7 - negative specifies how many 'noise words' should be drawn.
* hs : 0 - hierarchical softmax 사용여부
* window : 5 - The maximum distance between the current and predicted word within a sentence.  
* alpha : the initial learning rate  
* min_alpha : learning rate will linearly drop to min_alpha as training progresses

In [7]:
%%time
# PV-DM with concatenated context vectors (dm=1, dm_concat=1), window 5
# NOTE(review): alpha == min_alpha keeps the learning rate constant over all epochs — confirm this is intended.
d2v_model = Make_Doc2Vec_Model(data=train, size = 1000, dm = 1, dm_concat = 1,
                   dm_mean = 0, negative = 7, hs = 0, epoch = 20, window = 5,
                   alpha = 0.025, min_alpha = 0.025, workers = cores, tagger = 'ct')
# Word neighbours are looked up on the word vectors (`.wv`); calling
# most_similar on the model itself is deprecated in gensim 3.x and removed in 4.0.
for query in ('문재인/Noun', '노무현/Noun', '박근혜/Noun'):
    pprint(d2v_model.wv.most_similar(query))

./model/doc2vec_size-1000_epoch-20_window-5_negative-7_hs-0_dm-1_dm_concat-1_dm_mean-0_by-ct.model


100%|██████████| 442359/442359 [00:17<00:00, 25390.23it/s]
100%|██████████| 442359/442359 [1:36:44<00:00, 76.20it/s]


Total running time:  4:55:11.823557


  """


[('문국현/Noun', 0.5745145678520203),
 ('손학규/Noun', 0.5639787912368774),
 ('정몽준/Noun', 0.5287864208221436),
 ('추미애/Noun', 0.5242229104042053),
 ('김종인/Noun', 0.5189003944396973),
 ('심상정/Noun', 0.5182517766952515),
 ('조순형/Noun', 0.5097339153289795),
 ('안상수/Noun', 0.5079224109649658),
 ('이정희/Noun', 0.49288955330848694),
 ('송영길/Noun', 0.4892556071281433)]


  


[('盧/Foreign', 0.5360435247421265),
 ('이명박/Noun', 0.4909990727901459),
 ('金泳三/Foreign', 0.4755754768848419),
 ('盧泰愚/Foreign', 0.467711865901947),
 ('金大中/Foreign', 0.4661300778388977),
 ('김영삼/Noun', 0.4571351706981659),
 ('마두로/Noun', 0.4521355926990509),
 ('김대중/Noun', 0.4520781636238098),
 ('노태우/Noun', 0.44878318905830383),
 ('무샤라프/Noun', 0.44861674308776855)]


  import sys


[('MB/Alpha', 0.4731568396091461),
 ('이명박/Noun', 0.4706622362136841),
 ('朴/Foreign', 0.4702518582344055),
 ('盧/Foreign', 0.4422094225883484),
 ('최병렬/Noun', 0.4401675760746002),
 ('노무현/Noun', 0.4201396107673645),
 ('아로요/Noun', 0.4178679585456848),
 ('盧泰愚/Foreign', 0.41626977920532227),
 ('무가베/Noun', 0.4150621294975281),
 ('라모스/Noun', 0.41151320934295654)]
Wall time: 4h 55min 50s


In [8]:
# Unbind the model (Make_Doc2Vec_Model already saved it to disk) before training the next one.
del d2v_model

### model 2

* size : Dimensionality of the feature vectors 
* dm : 1 - distributed memory (PV-DM)  
* dm_concat : 0 - don't concatenate the context vectors; combine them by sum/average instead  
* dm_mean : 1 - use the mean (rather than the sum) of the context word vectors  
> dm_mean only applies when dm is used in non-concatenative mode.
* negative : 7 - negative specifies how many 'noise words' should be drawn.
* hs : 0 - hierarchical softmax 사용여부
* window : 10 - The maximum distance between the current and predicted word within a sentence.  
* alpha : the initial learning rate  
* min_alpha : learning rate will linearly drop to min_alpha as training progresses

In [9]:
%%time
# PV-DM with averaged context vectors (dm=1, dm_concat=0, dm_mean=1), window 10
# NOTE(review): alpha == min_alpha keeps the learning rate constant over all epochs — confirm this is intended.
d2v_model = Make_Doc2Vec_Model(data=train, size = 1000, dm = 1, dm_concat = 0,
                   dm_mean = 1, negative = 7, hs = 0, epoch = 20, window = 10,
                   alpha = 0.025, min_alpha = 0.025, workers = cores, tagger = 'ct')
# Word neighbours are looked up on the word vectors (`.wv`); calling
# most_similar on the model itself is deprecated in gensim 3.x and removed in 4.0.
for query in ('문재인/Noun', '노무현/Noun', '박근혜/Noun'):
    pprint(d2v_model.wv.most_similar(query))

./model/doc2vec_size-1000_epoch-20_window-10_negative-7_hs-0_dm-1_dm_concat-0_dm_mean-1_by-ct.model


100%|██████████| 442359/442359 [00:15<00:00, 29377.88it/s]
100%|██████████| 442359/442359 [02:21<00:00, 3123.48it/s]


Total running time:  0:41:51.613135


  """


[('손학규/Noun', 0.28437939286231995),
 ('정세균/Noun', 0.2816855311393738),
 ('문/Noun', 0.28093063831329346),
 ('김한길/Noun', 0.2715970575809479),
 ('추미애/Noun', 0.2682795226573944),
 ('이태근/Noun', 0.2642826437950134),
 ('김중권/Noun', 0.2622499167919159),
 ('김/Noun', 0.2603785991668701),
 ('김종인/Noun', 0.25953930616378784),
 ('이인찬/Noun', 0.25789928436279297)]


  


[('노/Noun', 0.4571610391139984),
 ('이명박/Noun', 0.3688324987888336),
 ('김영삼/Noun', 0.3431937098503113),
 ('노태우/Noun', 0.34158068895339966),
 ('박근혜/Noun', 0.3341344892978668),
 ('金大中/Foreign', 0.30319347977638245),
 ('취임일/Noun', 0.30027008056640625),
 ('박/Noun', 0.2979949712753296),
 ('면노/Noun', 0.2967212200164795),
 ('“金大中/Foreign', 0.29175442457199097)]


  import sys


[('박/Noun', 0.45748814940452576),
 ('이명박/Noun', 0.39562541246414185),
 ('김영삼/Noun', 0.36602863669395447),
 ('노/Noun', 0.3357464671134949),
 ('노태우/Noun', 0.33475083112716675),
 ('노무현/Noun', 0.33413445949554443),
 ('金泳三/Foreign', 0.32288122177124023),
 ('김대중/Noun', 0.3181714117527008),
 ('김전/Noun', 0.3008650541305542),
 ('盧泰愚/Foreign', 0.3006989359855652)]
Wall time: 42min


In [10]:
# Unbind the model (Make_Doc2Vec_Model already saved it to disk) before training the next one.
del d2v_model

### model 3

* size : Dimensionality of the feature vectors 
* dm : 0 - distributed bag of words (PV-DBOW)
* dm_concat : 0 - no concatenation of context vectors (only relevant for PV-DM, i.e. not used when dm = 0)  
* dm_mean : 0 - use the sum (not the mean) of the context word vectors (only relevant for PV-DM, i.e. not used when dm = 0)  
> dm_mean only applies when dm is used in non-concatenative mode.
* negative : 7 - negative specifies how many 'noise words' should be drawn.
* hs : 0 - hierarchical softmax 사용여부
* window : None - not passed to Doc2Vec, so gensim's default (5) applies. The maximum distance between the current and predicted word within a sentence.  
* alpha : the initial learning rate  
* min_alpha : learning rate will linearly drop to min_alpha as training progresses

In [11]:
%%time
# PV-DBOW (dm=0). window=None makes Make_Doc2Vec_Model omit the window argument.
# NOTE(review): alpha == min_alpha keeps the learning rate constant over all epochs — confirm this is intended.
d2v_model = Make_Doc2Vec_Model(data=train, size = 1000, dm = 0, dm_concat = 0,
                   dm_mean = 0, negative = 7, hs = 0, epoch = 20, window = None,
                   alpha = 0.025, min_alpha = 0.025, workers = cores, tagger = 'ct')
# Word neighbours are looked up on the word vectors (`.wv`); calling
# most_similar on the model itself is deprecated in gensim 3.x and removed in 4.0.
# NOTE(review): with dm=0 and gensim's default dbow_words=0 the word vectors are
# presumably left untrained, so these word-level neighbours may be meaningless —
# confirm, and consider dbow_words=1 if word similarities are needed.
for query in ('문재인/Noun', '노무현/Noun', '박근혜/Noun'):
    pprint(d2v_model.wv.most_similar(query))

./model/doc2vec_size-1000_epoch-20_window-None_negative-7_hs-0_dm-0_dm_concat-0_dm_mean-0_by-ct.model


100%|██████████| 442359/442359 [00:18<00:00, 23646.48it/s]
100%|██████████| 442359/442359 [01:55<00:00, 3833.25it/s]


Total running time:  0:32:03.951003


  """


[('사항/Noun', 0.1305052787065506),
 ('추진/Noun', 0.12882840633392334),
 ('해결될/Verb', 0.12366967648267746),
 ('놀람/Noun', 0.12347520887851715),
 ('부과방식/Noun', 0.1232670471072197),
 ('유가공/Noun', 0.12200957536697388),
 ('상의/Noun', 0.12150819599628448),
 ('동자/Noun', 0.12081922590732574),
 ('크로네/Noun', 0.12057416141033173),
 ('로맨스/Noun', 0.11997126042842865)]


  


[('표현하기/Verb', 0.1285589337348938),
 ('創黨/Foreign', 0.12728844583034515),
 ('궤적/Noun', 0.1240358054637909),
 ('할부금/Noun', 0.1219300925731659),
 ('향토기업/Noun', 0.12023414671421051),
 ('Customer/Alpha', 0.11999914795160294),
 ('최중경/Noun', 0.11913421750068665),
 ('계기/Noun', 0.11692361533641815),
 ('농식품부/Noun', 0.1168595626950264),
 ('홍대앞/Noun', 0.1159517914056778)]


  import sys


[('에일린/Noun', 0.12598663568496704),
 ('스테이크하우스/Noun', 0.12526842951774597),
 ('핵심역량/Noun', 0.12277888506650925),
 ('오큘러스/Noun', 0.12205159664154053),
 ('자제령/Noun', 0.12167312204837799),
 ('choo/Alpha', 0.12156837433576584),
 ('김근/Noun', 0.12118206918239594),
 ('431/Number', 0.12076354771852493),
 ('바꾸시/Verb', 0.12017674744129181),
 ('말하시/Verb', 0.11960489302873611)]
Wall time: 32min 13s


In [12]:
# Unbind the Twitter-tagged splits and the last model before the mecab section starts.
del train
del test
del d2v_model

## Making Doc2Vec Using tagger mecab

In [13]:
from collections import namedtuple
from gensim.models.doc2vec import TaggedDocument
# Rebinds the name TaggedDocument (shadowing the gensim class imported just above)
# to a namedtuple that carries an extra `sentiment` field next to `words` and `tags`.
TaggedDocument = namedtuple('TaggedDocument', ['words', 'tags', 'sentiment'])

### tagging

In [None]:
from konlpy.tag import Mecab
mecab = Mecab()

In [None]:
# mecab
def tokenize2(doc):
    """Tag `doc` with the konlpy Mecab tagger `mecab` and return each result joined with '/'."""
    tagged = mecab.pos(doc)
    return ['/'.join(pair) for pair in tagged]

In [None]:
# Cache of the mecab-tokenized corpus; one path is used for the check, the load and the dump.
raw_doc_mecab_path = './data/pre_data/tagged_data/pre_raw_data_by_mecab_for_doc2vec_sentiment_analysis.pickled'
if os.path.isfile(raw_doc_mecab_path):
    with open(raw_doc_mecab_path, 'rb') as fh:
        raw_doc_mecab = pickle.load(fh)
else:
    # One (tokens, [document tag], [value of column 1]) tuple per row of rawdata.
    raw_doc_mecab = [(tokenize2(rawdata.loc[idx][0]), ['doc_'+str(idx)], [rawdata.loc[idx][1]]) for idx in tqdm(rawdata.index)]
    # `with` guarantees the pickle is flushed and the handle closed.
    with open(raw_doc_mecab_path, 'wb') as fh:
        pickle.dump(raw_doc_mecab, fh)

### Doc2Vec 기본 포맷으로 변경

In [None]:
# Cache of the mecab corpus converted to TaggedDocument namedtuples.
# Unpickling looks the class up by name, so TaggedDocument must already be defined.
tagged_mecab_path = './data/pre_data/tagged_data/pre_data_by_mecab_tagged_doc2vec_sentiment_analysis.pickled'
if os.path.isfile(tagged_mecab_path):
    with open(tagged_mecab_path, 'rb') as fh:
        tagged_mecab = pickle.load(fh)
else:
    tagged_mecab = [TaggedDocument(words, tags, sentiment) for words, tags, sentiment in tqdm(raw_doc_mecab)]
    # `with` guarantees the pickle is flushed and the handle closed.
    with open(tagged_mecab_path, 'wb') as fh:
        pickle.dump(tagged_mecab, fh)

In [None]:
# Unbind the intermediate token tuples so their memory can be released.
del raw_doc_mecab

### train dataset & test dataset

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the tagged documents for testing; the fixed seed keeps the split reproducible.
train2, test2 = train_test_split(
    tagged_mecab,
    test_size=0.1,
    random_state=42,
)

In [14]:
# Load the train/test splits from disk when they were saved before; otherwise
# persist the splits currently held in `train2` / `test2`.
# Files are opened with `with` so the handles are closed (and writes flushed) deterministically.
train_mecab_path = './data/pre_data/train_test_Data/pre_data_by_mecab_train_for_doc2vec_sentiment_analysis.pickled'
test_mecab_path = './data/pre_data/train_test_Data/pre_data_by_mecab_test_for_doc2vec_sentiment_analysis.pickled'

if os.path.isfile(train_mecab_path):
    with open(train_mecab_path, 'rb') as fh:
        train2 = pickle.load(fh)
else:
    with open(train_mecab_path, 'wb') as fh:
        pickle.dump(train2, fh)

if os.path.isfile(test_mecab_path):
    with open(test_mecab_path, 'rb') as fh:
        test2 = pickle.load(fh)
else:
    with open(test_mecab_path, 'wb') as fh:
        pickle.dump(test2, fh)

In [None]:
# Unbind the full tagged corpus so its memory can be released.
del tagged_mecab


### model 1

* size : Dimensionality of the feature vectors 
* dm : 1 - distributed memory (PV-DM)  
* dm_concat : 1 - concatenate the context vectors rather than summing/averaging them  
* dm_mean : 0 - use the sum (not the mean) of the context word vectors  
> dm_mean only applies when dm is used in non-concatenative mode.
* negative : 7 - negative specifies how many 'noise words' should be drawn.
* hs : 0 - hierarchical softmax 사용여부
* window : 5 - The maximum distance between the current and predicted word within a sentence.  
* alpha : the initial learning rate  
* min_alpha : learning rate will linearly drop to min_alpha as training progresses

In [15]:
from konlpy.utils import pprint

In [16]:
%%time
# PV-DM with concatenated context vectors (dm=1, dm_concat=1), window 5
# NOTE(review): alpha == min_alpha keeps the learning rate constant over all epochs — confirm this is intended.
d2v_model = Make_Doc2Vec_Model(data=train2, size = 1000, dm = 1, dm_concat = 1,
                   dm_mean = 0, negative = 7, hs = 0, epoch = 20, window = 5,
                   alpha = 0.025, min_alpha = 0.025, workers = cores, tagger = 'mecab')
# Word neighbours are looked up on the word vectors (`.wv`); calling
# most_similar on the model itself is deprecated in gensim 3.x and removed in 4.0.
for query in ('문재인/NNP', '노무현/NNP', '박근혜/NNP'):
    pprint(d2v_model.wv.most_similar(query))

./model/doc2vec_size-1000_epoch-20_window-5_negative-7_hs-0_dm-1_dm_concat-1_dm_mean-0_by-mecab.model


100%|██████████| 442359/442359 [00:17<00:00, 25506.64it/s]
100%|██████████| 442359/442359 [1:36:20<00:00, 76.53it/s]


Total running time:  4:51:59.582521


  """


[('김종인/NNP', 0.5914317965507507),
 ('김문수/NNP', 0.5768975615501404),
 ('김두관/NNP', 0.5701437592506409),
 ('정몽준/NNP', 0.5679817199707031),
 ('최병렬/NNP', 0.5669589638710022),
 ('안철수/NNP', 0.5669171214103699),
 ('김무성/NNP', 0.566095769405365),
 ('이정희/NNP', 0.5634468793869019),
 ('김한길/NNP', 0.5606587529182434),
 ('정세균/NNP', 0.5414032936096191)]


  


[('이명박/NNP', 0.5704056024551392),
 ('노태우/NNP', 0.539089560508728),
 ('박정희/NNP', 0.4880334734916687),
 ('김대중/NNP', 0.48197466135025024),
 ('MB/SL', 0.47288140654563904),
 ('盧/NNG', 0.47176772356033325),
 ('노/XPN', 0.4700508117675781),
 ('최규하/NNP', 0.46087783575057983),
 ('무바라크/NNP', 0.457191526889801),
 ('전두환/NNP', 0.45221325755119324)]


  import sys


[('이명박/NNP', 0.5519698262214661),
 ('이재오/NNP', 0.4740789830684662),
 ('문재인/NNP', 0.4669095575809479),
 ('노무현/NNP', 0.4510551691055298),
 ('김희옥/NNP', 0.43907028436660767),
 ('盧/NNG', 0.4387660026550293),
 ('김무성/NNP', 0.43777239322662354),
 ('정몽준/NNP', 0.42740482091903687),
 ('문희상/NNP', 0.4208645224571228),
 ('이계/NNP', 0.4183489680290222)]
Wall time: 4h 52min 22s


In [17]:
# Unbind the model (Make_Doc2Vec_Model already saved it to disk) before training the next one.
del d2v_model

### model 2

* size : Dimensionality of the feature vectors 
* dm : 1 - distributed memory (PV-DM)  
* dm_concat : 0 - don't concatenate the context vectors; combine them by sum/average instead  
* dm_mean : 1 - use the mean (rather than the sum) of the context word vectors  
> dm_mean only applies when dm is used in non-concatenative mode.
* negative : 7 - negative specifies how many 'noise words' should be drawn.
* hs : 0 - hierarchical softmax 사용여부
* window : 10 - The maximum distance between the current and predicted word within a sentence.  
* alpha : the initial learning rate  
* min_alpha : learning rate will linearly drop to min_alpha as training progresses

In [18]:
%%time
# PV-DM with averaged context vectors (dm=1, dm_concat=0, dm_mean=1), window 10
# (dm=1 selects PV-DM, not PV-DBOW.)
# NOTE(review): alpha == min_alpha keeps the learning rate constant over all epochs — confirm this is intended.
d2v_model = Make_Doc2Vec_Model(data=train2, size = 1000, dm = 1, dm_concat = 0,
                   dm_mean = 1, negative = 7, hs = 0, epoch = 20, window = 10,
                   alpha = 0.025, min_alpha = 0.025, workers = cores, tagger = 'mecab')
# Word neighbours are looked up on the word vectors (`.wv`); calling
# most_similar on the model itself is deprecated in gensim 3.x and removed in 4.0.
for query in ('문재인/NNP', '노무현/NNP', '박근혜/NNP'):
    pprint(d2v_model.wv.most_similar(query))

./model/doc2vec_size-1000_epoch-20_window-10_negative-7_hs-0_dm-1_dm_concat-0_dm_mean-1_by-mecab.model


100%|██████████| 442359/442359 [00:20<00:00, 21154.88it/s]
100%|██████████| 442359/442359 [02:25<00:00, 3035.91it/s]


Total running time:  0:44:00.894669


  """


[('정세균/NNP', 0.301902174949646),
 ('문재/NNG', 0.28970351815223694),
 ('문/VV+ETM', 0.2805789113044739),
 ('김중권/NNP', 0.2722403407096863),
 ('이태근/NNP', 0.2683906555175781),
 ('이기택/NNP', 0.26732707023620605),
 ('손학규/NNP', 0.2630409598350525),
 ('원희목/NNP', 0.2590258717536926),
 ('김한길/NNP', 0.25734183192253113),
 ('이회창/NNP', 0.25231292843818665)]


  


[('노/NNP', 0.4737602472305298),
 ('이명박/NNP', 0.43864765763282776),
 ('박근혜/NNP', 0.4373871684074402),
 ('김영삼/NNP', 0.3979782462120056),
 ('박/NNP', 0.3549848198890686),
 ('박정희/NNP', 0.3496416211128235),
 ('노/XPN', 0.34903544187545776),
 ('노/IC', 0.3286374807357788),
 ('나온다든지/VV+EC', 0.32059842348098755),
 ('노태우/NNP', 0.30990082025527954)]


  import sys


[('박/NNP', 0.5090948343276978),
 ('노무현/NNP', 0.4373871684074402),
 ('노/NNP', 0.4117463231086731),
 ('이명박/NNP', 0.3763102889060974),
 ('김전/NNP', 0.36156490445137024),
 ('김영삼/NNP', 0.3613571226596832),
 ('노태우/NNP', 0.330650269985199),
 ('노/XPN', 0.3244098424911499),
 ('이기흥/NNP', 0.31733354926109314),
 ('이회창/NNP', 0.3152332901954651)]
Wall time: 44min 10s


In [19]:
# Unbind the model (Make_Doc2Vec_Model already saved it to disk) before training the next one.
del d2v_model

### model 3

* size : Dimensionality of the feature vectors 
* dm : 0 - distributed bag of words (PV-DBOW)
* dm_concat : 0 - no concatenation of context vectors (only relevant for PV-DM, i.e. not used when dm = 0)  
* dm_mean : 0 - use the sum (not the mean) of the context word vectors (only relevant for PV-DM, i.e. not used when dm = 0)  
> dm_mean only applies when dm is used in non-concatenative mode.
* negative : 7 - negative specifies how many 'noise words' should be drawn.
* hs : 0 - hierarchical softmax 사용여부
* window : None - not passed to Doc2Vec, so gensim's default (5) applies. The maximum distance between the current and predicted word within a sentence.  
* alpha : the initial learning rate  
* min_alpha : learning rate will linearly drop to min_alpha as training progresses

In [20]:
%%time
# PV-DBOW (dm=0, not PV-DM). window=None makes Make_Doc2Vec_Model omit the window argument.
# NOTE(review): alpha == min_alpha keeps the learning rate constant over all epochs — confirm this is intended.
d2v_model = Make_Doc2Vec_Model(data=train2, size = 1000, dm = 0, dm_concat = 0,
                   dm_mean = 0, negative = 7, hs = 0, epoch = 20, window = None,
                   alpha = 0.025, min_alpha = 0.025, workers = cores, tagger = 'mecab')
# Word neighbours are looked up on the word vectors (`.wv`); calling
# most_similar on the model itself is deprecated in gensim 3.x and removed in 4.0.
# NOTE(review): with dm=0 and gensim's default dbow_words=0 the word vectors are
# presumably left untrained, so these word-level neighbours may be meaningless —
# confirm, and consider dbow_words=1 if word similarities are needed.
for query in ('문재인/NNP', '노무현/NNP', '박근혜/NNP'):
    pprint(d2v_model.wv.most_similar(query))

./model/doc2vec_size-1000_epoch-20_window-None_negative-7_hs-0_dm-0_dm_concat-0_dm_mean-0_by-mecab.model


100%|██████████| 442359/442359 [00:15<00:00, 28935.19it/s]
100%|██████████| 442359/442359 [01:46<00:00, 4162.28it/s]


Total running time:  0:33:27.333062


  """


[('恒/NNG', 0.13732600212097168),
 ('동대문을/NNP', 0.12650419771671295),
 ('지니/VV', 0.11878867447376251),
 ('296/SN', 0.11741693317890167),
 ('말투/NNG', 0.11736077070236206),
 ('끈다/VV+EF', 0.11415259540081024),
 ('쿠치마/NNP', 0.11363274604082108),
 ('공사발주/NNP', 0.11336299777030945),
 ('들쑤셔/VV+EC', 0.11329080164432526),
 ('밝혀라/VV+EF', 0.11317186057567596)]


  


[('하사관/NNG', 0.14039146900177002),
 ('내치/VV', 0.13206955790519714),
 ('한랭전선/NNP', 0.13179874420166016),
 ('이명박정부/NNP', 0.1309611052274704),
 ('무진장/MAG', 0.13049069046974182),
 ('재외국민/NNP', 0.12885761260986328),
 ('Yes/SL', 0.12869225442409515),
 ('Blue/SL', 0.12623512744903564),
 ('저탄소/NNP', 0.12375755608081818),
 ('가구별/NNP', 0.1205480769276619)]


  import sys


[('전라선/NNG', 0.13626524806022644),
 ('주무른/VV+ETM', 0.13590066134929657),
 ('행복나눔/NNP', 0.13064438104629517),
 ('깍두기/NNG', 0.12917208671569824),
 ('안내서비스/NNP', 0.12885552644729614),
 ('베일/NNG', 0.12714152038097382),
 ('버스비/NNP', 0.12303851544857025),
 ('부대사/NNP', 0.12252257019281387),
 ('먹잇감/NNP', 0.12188868224620819),
 ('집념/NNG', 0.12122316658496857)]
Wall time: 33min 40s


In [21]:
# Unbind the mecab-tagged splits and the last model now that all runs are finished.
del train2
del test2
del d2v_model