<a href="https://colab.research.google.com/github/aso2001054/AI_teach2020/blob/master/2020AI0404_Doc2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Doc2Vecで文章を学習したコーパスモデルを作って保存

### データリストの作成

In [None]:

# Aozora Bunko works used as the training corpus --- (*1)
# Each entry pairs one author ("name" plus the base "url" of that author's files)
# with a list of books ("name" plus "zipname"); the download URL of a book is
# the author's "url" followed by the book's "zipname".
# NOTE(review): the name `list` shadows Python's built-in `list` for the rest of
# the notebook; book_list() reads this global, so rename both together.
list = [
    {"author":{
        "name":"宮澤 賢治",
        "url":"https://www.aozora.gr.jp/cards/000081/files/"}, 
     "books":[
        {"name":"銀河鉄道の夜","zipname":"43737_ruby_19028.zip"},
        {"name":"注文の多い料理店","zipname":"1927_ruby_17835.zip"},
        {"name":"セロ弾きのゴーシュ","zipname":"470_ruby_3987.zip"},
        {"name":"やまなし","zipname":"46605_ruby_29758.zip"},
        {"name":"どんぐりと山猫","zipname":"43752_ruby_17595.zip"},
    ]},
    {"author":{
        "name":"芥川 竜之介",
        "url":"https://www.aozora.gr.jp/cards/000879/files/"}, 
     "books":[
        {"name":"羅生門","zipname":"127_ruby_150.zip"},
        {"name":"鼻","zipname":"42_ruby_154.zip"},
        {"name":"河童","zipname":"69_ruby_1321.zip"},
        {"name":"歯車","zipname":"42377_ruby_34744.zip"},
        {"name":"老年","zipname":"131_ruby_241.zip"},
    ]},
    {"author":{
        "name":"ポー エドガー・アラン",
        "url":"https://www.aozora.gr.jp/cards/000094/files/"}, 
     "books":[
        {"name":"ウィリアム・ウィルスン","zipname":"2523_ruby_19896.zip"},
        {"name":"落穴と振子","zipname":"1871_ruby_17551.zip"},
        {"name":"黒猫","zipname":"530_ruby_20931.zip"},
        {"name":"群集の人","zipname":"56535_ruby_69925.zip"},
        {"name":"沈黙","zipname":"56537_ruby_70425.zip"},
    ]},
    {"author":{
        "name":"紫式部",
        "url":"https://www.aozora.gr.jp/cards/000052/files/"}, 
     "books":[
        {"name":"源氏物語 01 桐壺","zipname":"5016_ruby_9746.zip"},
        {"name":"源氏物語 02 帚木","zipname":"5017_ruby_9752.zip"},
        {"name":"源氏物語 03 空蝉","zipname":"5018_ruby_9754.zip"},
        {"name":"源氏物語 04 夕顔","zipname":"5019_ruby_9761.zip"},
        {"name":"源氏物語 05 若紫","zipname":"5020_ruby_11253.zip"},
    ]},
]

### MeCabのインストール

In [None]:
# 形態素分析ライブラリーMeCab と 辞書(mecab-ipadic-NEologd)のインストール 
!apt-get -q -y install sudo file mecab libmecab-dev mecab-ipadic-utf8 git curl python-mecab > /dev/null
!pip install mecab-python3 > /dev/null

# MeCabの実行時の指定パスをインストールパスにリンクさせる
# シンボリックリンク（/etc/mecabrcを/usr/local/etc/mecabrcで参照できるようにする）
!ln -s /etc/mecabrc /usr/local/etc/mecabrc

ln: failed to create symbolic link '/usr/local/etc/mecabrc': File exists


In [None]:
def book_list():
  """Yield an (author, book) pair for every book in the global corpus `list`.

  `author` is the entry's "author" dict and `book` is one element of the
  entry's "books" list; pairs are produced in corpus order.
  """
  for entry in list:
    writer = entry["author"]
    yield from ((writer, work) for work in entry["books"])
   

In [None]:
import zipfile
import os.path
import urllib.request as req
def read_book(author, book):
  """Return the text of a book, downloading its zip archive if needed.

  Args:
    author: dict whose "url" is the base URL the archive name is appended to.
    book: dict whose "zipname" is the archive file name; it is also used as
      the local cache path in the current working directory.

  Returns:
    The contents of the first ".txt" member of the archive decoded as
    Shift-JIS, or None when the archive has no ".txt" member.
  """
  archive = book["zipname"]
  # Download only when the archive is not already cached locally.
  if not os.path.exists(archive):
    req.urlretrieve(author["url"] + archive, archive)
  with zipfile.ZipFile(archive, "r") as zf:
    text_members = (m for m in zf.namelist() if os.path.splitext(m)[1] == ".txt")
    first_text = next(text_members, None)
    if first_text is None:
      return None
    with zf.open(first_text, "r") as f:
      return f.read().decode("shift-jis")

In [None]:
import MeCab

# Shared MeCab tagger instance; split_words() reads this module-level name.
mecab = MeCab.Tagger()

def split_words(text):
  """Tokenize text with the global MeCab tagger and keep only content words.

  Nouns are kept in their surface form. Verbs and adjectives are replaced by
  feature field 6 (the base form in the IPA-dictionary feature layout) so that
  inflected forms collapse to one token. All other parts of speech are dropped.

  Args:
    text: the string to analyze.

  Returns:
    A list of token strings in document order.
  """
  node = mecab.parseToNode(text)
  wakati_words = []
  while node is not None:
    features = node.feature.split(",")
    hinshi = features[0]
    # The original tested ["動詞", "形容詞"] in both branches, which made the
    # base-form branch unreachable and silently dropped every noun.
    if hinshi == "名詞":
      wakati_words.append(node.surface)
    elif hinshi in ("動詞", "形容詞"):
      # Fall back to the surface form when the base form is missing or "*".
      base = features[6] if len(features) > 6 else "*"
      wakati_words.append(base if base != "*" else node.surface)
    node = node.next
  return wakati_words


In [None]:
from gensim import models
from gensim.models.doc2vec import TaggedDocument
# Build one TaggedDocument per book, tagged "<author name>:<book title>".
documents = []
for author, book in book_list():
  words = read_book(author, book)
  wakati_words = split_words(words)
  document = TaggedDocument(wakati_words, [author["name"] + ":" + book["name"]])
  # Show only the tag and token count; printing the whole document floods the output.
  print(document.tags[0], len(wakati_words))
  documents.append(document)

# Train once, after every book has been collected. The class is `Doc2Vec`;
# `models.doc2vec` is the module and calling it raised TypeError.
model = models.Doc2Vec(documents, dm=0, vector_size=300, window=15, min_count=1)
model.save('aozora.model')
print("モデル作成完了")



TaggedDocument(['現れる', '付く', 'する', 'ち', 'ち', '流', 'れ', 'し', 'い', '白い', 'こく', 'つるし', '黒い', 'せ', '白く', 'けぶっ', 'し', 'かけ', 'あげ', 'あげ', 'あげよ', 'やめ', '読ん', 'ねむく', '読む', '読む', 'ない', 'わから', 'き', 'ち', 'する', '早く', '見つけ', 'わかっ', 'いる', 'いき', 'お', 'い', 'よく', '立ちあがり', '立っ', 'みる', '答える', 'でき', 'ふりかえっ', '見', 'く', 'わらい', 'し', 'なっ', 'しまい', 'い', 'よっ', 'く', '思い', '答える', 'でき', '困', 'っ', 'す', 'し', 'あげ', '立ち上がっ', 'でき', 'い', '見', 'い', 'いで', 'い', 'し', '白い', 'いい', '見', '見える', 'なっ', 'うなずき', 'なり', '知っ', 'い', '知っ', 'いる', '読ん', 'あっ', '読む', 'もっ', 'き', 'ひろげ', 'ある', 'つく', 'し', 'い', 'しゃし', '見', 'わす', 'れる', 'なかっ', 'へんじ', 'し', 'し', 'つらく', '出', 'ない', 'なっ', '知っ', 'き', 'どく', 'がっ', 'へんじ', 'し', '考える', 'たまらない', 'あわ', 'れ', 'する', 'い', '考える', 'す', 'あたる', 'ち', 'ち', '流', '考える', '似', 'い', 'ち', 'ち', '細', 'うかん', 'いる', 'あぶら', 'あたる', 'あたる', 'い', 'くう', 'ある', '速', 'える', 'ち', 'かん', 'いる', 'いる', '見る', '深い', '青く', '見える', '深', 'く', '遠い', '集まっ', '見え', '白く', '見える', 'なさい', '光る', 'す', 'はいっ', 'し', '光る', '光っ', 'いる', '考え', 'あっ', 'ち', 'ある', 'し', '立っ',

TypeError: ignored

In [None]:
from gensim import models
# Load the Doc2Vec model from 'aozora.model', the file the training cell saves.
model = models.Doc2Vec.load('aozora.model')

In [None]:
import urllib.request as req
import zipfile
import os.path

def read_book(url, zipname):
  """Return the text of a book archive, downloading it if not cached.

  NOTE(review): this redefinition replaces the earlier read_book(author, book)
  from a previous cell, so re-running the training cell after this one fails.

  Args:
    url: full URL of the zip archive.
    zipname: local file name used to cache the archive.

  Returns:
    The contents of the first ".txt" member decoded as Shift-JIS, or None when
    the archive contains no ".txt" member.
  """
  if not os.path.exists(zipname):
    req.urlretrieve(url, zipname)
  with zipfile.ZipFile(zipname, "r") as zf:
    for filename in zf.namelist():
      # Skip non-text members (e.g. images); the original decoded whatever
      # member happened to come first in the archive.
      if os.path.splitext(filename)[1].lower() != ".txt":
        continue
      with zf.open(filename, "r") as f:
        return f.read().decode("shift-jis")
  return None


In [None]:
def similar(title, url):
  """Print the three training documents most similar to the given book.

  The book is read with read_book(), tokenized with split_words(), and its
  vector is inferred with the global Doc2Vec `model`.

  Args:
    title: label printed in the header line.
    url: URL of the book's zip archive; its last path segment is used as the
      local file name.
  """
  zipname = url.split("/")[-1]

  words = read_book(url, zipname)
  wakati_words = split_words(words)
  vector = model.infer_vector(wakati_words)
  # gensim 4 renamed `docvecs` to `dv`; prefer the new attribute and fall back
  # to the old one so the cell works on both major versions.
  doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs
  print("---[", title, "]と似た作品は?---")
  print(doc_vectors.most_similar([vector], topn=3))
  print("")

In [None]:
# Books to compare against the trained corpus, as (label, zip URL) pairs.
# The Poe label previously had a full-width colon after the author and a stray
# ":" inside the title; it now follows the "author:title" form of the others.
targets = [
  ("宮沢賢治:よだかの星", "https://www.aozora.gr.jp/cards/000081/files/473_ruby_467.zip"),
  ("芥川龍之介:犬と笛", "https://www.aozora.gr.jp/cards/000879/files/56_ruby_845.zip"),
  ("ポーエドガー・アラン:マリー・ロジェエの怪事件", "https://www.aozora.gr.jp/cards/000094/files/4261_ruby_54182.zip"),
  ("紫式部:源氏物語　06　末摘花", "https://www.aozora.gr.jp/cards/000052/files/5021_ruby_11106.zip"),
]
for title, url in targets:
  similar(title, url)