# Library

In [32]:
import sys
from os import listdir, path
from pyknp import Jumanpp
from gensim import models
from gensim.models.doc2vec import LabeledSentence

# Load Data

In [33]:
def corpus_files():
    dirs = [path.join('./data/text',x) for x in listdir('./data/text') if not x.endswith('.txt')]
    print(dirs)
    docs = [path.join(x, y) for x in dirs for y in listdir(x) if not x.startswith('LICENSE')]
    
    return docs

In [19]:
def read_document(path):
    listdir(path)
    with open(path, 'r') as f:
        return f.read()

In [36]:
def split_into_words(text):
    results = Jumanpp().analysis(text)
    return [mrph.midasi for mrph in results.mrph_list()]

In [38]:
def doc_to_sentence(doc, name):
    words = split_into_words(doc)
    return LabeledSentence(words=words, tags=[name])

In [22]:
def corpus_to_sentences(corpus):
    docs = [read_document(x) for x in corpus]
    sentences = []
    for idx, (doc, name) in enumerate(zip(docs, corpus)):
        sys.stdout.write('\r前処理中 {}/{}'.format(idx, len(corpus)))
#         yield doc_to_sentence(doc, name)
        sentences.append(doc_to_sentence(doc, name))
    return sentences

In [39]:
corpus = corpus_files()
sentences = corpus_to_sentences(corpus)

['./data/text/topic-news', './data/text/sports-watch', './data/text/it-life-hack', './data/text/livedoor-homme', './data/text/movie-enter', './data/text/peachy', './data/text/kaden-channel', './data/text/dokujo-tsushin', './data/text/smax']
前処理中 0/7376

  return LabeledSentence(words=words, tags=[name])


前処理中 6175/7376

KeyboardInterrupt: 

In [40]:
corpus = corpus_files()
sentences = corpus_to_sentences(corpus)

model = models.Doc2Vec(sentences, dm=0, size=300, windows=15, alpha=0.025,\
                      min_alpha=0.025, min_count=1, sample=1e-6)

print('\n訓練開始')
for epoch in range(20):
    print('Epoch: {}'.format(epoch + 1))
    model.train(sentences)
    model.alpha -= (0.025 - 0.0001) / 19
    model.min_alpha = model.alpha

['./data/text/topic-news', './data/text/sports-watch', './data/text/it-life-hack', './data/text/livedoor-homme', './data/text/movie-enter', './data/text/peachy', './data/text/kaden-channel', './data/text/dokujo-tsushin', './data/text/smax']
前処理中 17/7376

  return LabeledSentence(words=words, tags=[name])


前処理中 7375/7376




訓練開始
Epoch: 1


ValueError: You must specify either total_examples or total_words, for proper job parameters updationand progress calculations. The usual value is total_examples=model.corpus_count.

In [52]:
print('\n訓練開始')
for epoch in range(20):
    print('Epoch: {}'.format(epoch + 1))
    model.train(sentences,total_examples = sum([len(sen) for sen in sentences]), epochs =1)
    model.alpha -= (0.025 - 0.0001) / 19
    model.min_alpha = model.alpha


訓練開始
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19
Epoch: 20


In [53]:
model.save('doc2vec.model')
model = models.Doc2Vec.load('doc2vec.model')

In [63]:
model.docvecs.most_similar('./data/text/livedoor-homme/livedoor-homme-5625149.txt', topn=1)

[('./data/text/kaden-channel/kaden-channel-6130802.txt', 0.9987810254096985)]

In [62]:
model.docvecs.most_similar('./data/text/dokujo-tsushin/dokujo-tsushin-4778030.txt', topn=1)

[('./data/text/kaden-channel/kaden-channel-6282788.txt', 0.9984859228134155)]

In [61]:
model.docvecs.similarity('./data/text/livedoor-homme/livedoor-homme-4700669.txt', './data/text/movie-enter/movie-enter-5947726.txt')

0.9986588

In [60]:
model.docvecs.similarity('./data/text/livedoor-homme/livedoor-homme-4700669.txt', './data/text/peachy/peachy-4289213.txt')

0.99816