<br></br>
# **gensim | doc2vec**
<br></br>
## **1 네이버 리뷰 문장의 활용**
네이버 영화리뷰 단어모델 만들기

In [1]:
from konlpy.tag import Twitter
# Shared KoNLPy tagger instance; tokenize() below calls twitter.pos() on it.
# NOTE(review): newer KoNLPy releases appear to rename the Twitter class to
# Okt -- confirm against the installed version.
twitter = Twitter()

def read_data(filename):
    """Load a tab-separated text file and return a random ~10% sample of rows.

    Every line of the file is split on tab characters. Line 0 (presumably the
    header row -- confirm against the data file) is never sampled. Rows are
    drawn with replacement, so the result may contain duplicates.

    Args:
        filename: Path to a UTF-8 encoded, tab-separated text file.

    Returns:
        A list of ``len(lines) // 10`` rows, each row being the list of
        tab-separated fields of one line. Empty when the file has fewer
        than 10 lines.
    """
    from random import randrange

    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]

    # randrange() excludes its upper bound, so indices stay within
    # 1 .. len(data) - 1. The previous randint(1, len(data)) could return
    # len(data) itself and raise IndexError.
    sample_size = len(data) // 10
    return [data[randrange(1, len(data))] for _ in range(sample_size)]

def tokenize(doc):
    """Split a document into POS-tagged tokens using the shared tagger.

    Runs ``twitter.pos`` on ``doc`` with normalisation and stemming enabled
    and joins the parts of each returned item with ``/``.

    Args:
        doc: The text to tokenize.

    Returns:
        A list of strings, one per tagged item, e.g. ``'영화/Noun'``.
    """
    tagged_items = twitter.pos(doc, norm=True, stem=True)
    tokens = []
    for item in tagged_items:
        tokens.append('/'.join(item))
    return tokens

In [2]:
%%time
from collections import namedtuple
train_data        = read_data('./data/ratings_train.txt')
train_docs        = [(tokenize(row[1]), row[2]) for row in train_data[1:]]
TaggedDocument    = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]

CPU times: user 27.2 s, sys: 271 ms, total: 27.5 s
Wall time: 18.6 s


In [3]:
from pprint import pprint
# Inspect the first tagged document to check the token format and its tag.
pprint(tagged_train_docs[0])

TaggedDocument(words=['영화/Noun', '역사상/Noun', '통/Noun', '들다/Verb', "'/Punctuation", '거대/Noun', '제작비/Noun', '가/Josa', '들어가다/Verb', '러닝/Noun', '타임/Noun', '의/Josa', '절반/Noun', '분량/Noun', '의/Josa', '사족/Noun', '이/Josa', '붙다/Verb', '영화/Noun', '목록/Noun', "'/Punctuation", '이/Noun', '있다/Adjective', '다섯/Noun', '손가락/Noun', '안/Noun', '에는/Josa', '꼭/Noun', '들어가다/Verb', '것/Noun', '이다/Josa'], tags=['0'])


<br></br>
## **2 doc2vec 파라미터 설정 및 학습**

In [4]:
%%time
from gensim.models import doc2vec
doc_vectorizer = doc2vec.Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.025, seed=1234)
doc_vectorizer.build_vocab(tagged_train_docs)

for epoch in range(10):
    doc_vectorizer.train(tagged_train_docs, 
                         total_examples = doc_vectorizer.corpus_count, 
                         epochs = doc_vectorizer.epochs)
    doc_vectorizer.alpha -= 0.002
    doc_vectorizer.min_alpha = doc_vectorizer.alpha 

# 학습이 완료된 모델의 데이터를 저장한다
doc_vectorizer.save('./data/doc2vec.model')

CPU times: user 1min, sys: 4.96 s, total: 1min 5s
Wall time: 34.7 s


<br></br>
## **3 doc2vec 모델 활용**

In [5]:
from gensim.models import doc2vec
from pprint import pprint
# Reload the model from the same path the training cell saved it to, so the
# cells below can run without retraining.
doc_vectorizer = doc2vec.Doc2Vec.load('./data/doc2vec.model')

In [6]:
# List the tokens most similar to '공포/Noun' together with their similarity
# scores. Tokens are looked up in the same 'word/POS' form that tokenize()
# produces.
pprint(doc_vectorizer.wv.most_similar('공포/Noun'))

[('장르/Noun', 0.4658203721046448),
 ('교과서/Noun', 0.43489059805870056),
 ('효과/Noun', 0.4230882525444031),
 ('↓/Foreign', 0.4171814024448395),
 ('서스펜스/Noun', 0.41564539074897766),
 ('싱겁다/Adjective', 0.41195330023765564),
 ('단순/Noun', 0.40946850180625916),
 ('범죄/Noun', 0.4074081778526306),
 ('거부/Noun', 0.40322983264923096),
 ('코미디/Noun', 0.39780300855636597)]


  if np.issubdtype(vec.dtype, np.int):


In [7]:
# Similarity score between the word vectors of two specific tokens.
doc_vectorizer.wv.similarity('공포/Noun', 'ㅋㅋ/KoreanParticle')

  if np.issubdtype(vec.dtype, np.int):


0.021508785

In [8]:
# Analogy-style query: rank tokens by similarity to the `positive` tokens
# while steering away from the `negative` token.
pprint(doc_vectorizer.wv.most_similar(positive=['여자/Noun', '공포/Noun'], 
                                      negative=['남자/Noun']))

[('코미디/Noun', 0.3807429075241089),
 ('싱겁다/Adjective', 0.3694385290145874),
 ('단순/Noun', 0.3490898311138153),
 ('거부/Noun', 0.34379300475120544),
 ('인지/Noun', 0.3303176462650299),
 ('피/Noun', 0.31789112091064453),
 ('나서다/Verb', 0.31452712416648865),
 ('장르/Noun', 0.3000449240207672),
 ('애매/Noun', 0.2979757785797119),
 ('서스펜스/Noun', 0.29294371604919434)]


  if np.issubdtype(vec.dtype, np.int):


In [9]:
# Infer a document vector for a new, already-tokenized document and show only
# its first 10 components.
doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun'])[:10]

array([-1.5043770e-06,  1.2723691e-02,  2.3739752e-03, -3.8722011e-03,
        6.5828748e-03, -1.9027082e-03,  1.3088444e-02,  4.3375581e-03,
        5.9464159e-03,  3.2994461e-03], dtype=float32)

In [10]:
# Sum of all components of the inferred vector for the same tokens.
# NOTE(review): this runs inference again instead of reusing the previous
# cell's result; the two vectors may not be identical -- confirm.
doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun']).sum()

-0.012216017