In [1]:
from bs4 import BeautifulSoup
import requests
import logging
import jaconv
from gensim import corpora
from gensim import models
from janome.tokenizer import Tokenizer
from pprint import pprint
import pandas as pd, numpy as np
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
url = "https://www.aozora.gr.jp/access_ranking/2019_xhtml.html"
URL = ""
res = requests.get(url)
res.encoding = 'shift-jis'
soup = BeautifulSoup(res.content, "html.parser")

In [3]:
url_list = [url["href"] for i, url in enumerate(soup.find_all("a", target="_blank")) if i < 50]

In [4]:
title = []
name = []
text = []
for url in url_list:
    res = requests.get(url)
    url_start = url[:37]
    res.encoding = 'shift-jis'
    soup = BeautifulSoup(res.content, "html.parser")
    for i, a in enumerate(soup.find_all("a")):
        if i == 7:
            url_end = a["href"][1:]
    url = url_start + url_end
    res = requests.get(url)
    res.encoding = 'shift-jis'
    soup = BeautifulSoup(res.content, "html.parser")
    title.append(soup.find("h1").string)
    name.append(soup.find("h2").string)
    for tag in soup.find_all(["rt", "rp"]):
        tag.decompose()
    text.append(soup.find("div",{'class': 'main_text'}).get_text())

In [5]:
df = pd.DataFrame({'タイトル': title, '作者': name, '本文': text})

In [6]:
df.head()

Unnamed: 0,タイトル,作者,本文
0,〔雨ニモマケズ〕,宮澤賢治,\r\n雨ニモマケズ\r\n風ニモマケズ\r\n雪ニモ夏ノ暑サニモマケヌ\r\n丈夫ナカラダ...
1,走れメロス,太宰治,\r\n　メロスは激怒した。必ず、かの邪智暴虐の王を除かなければならぬと決意した。メロスには...
2,山月記,中島敦,\r\n　隴西の李徴は博学才穎、天宝の末年、若くして名を虎榜に連ね、ついで江南尉に補せられた...
3,こころ,夏目漱石,\n上　先生と私\n\n\n一\n\r\n　私はその人を常に先生と呼んでいた。だからここでも...
4,羅生門,芥川龍之介,\r\n　ある日の暮方の事である。一人の下人が、羅生門の下で雨やみを待っていた。\r\n　広...


In [7]:
t = Tokenizer()
wakati_list = []
for i, doc in enumerate(text):
    print(i)
    if i == 0:
        doc = jaconv.kata2hira(doc)
    tokens = t.tokenize(doc)
    wakati = []
    for token in tokens:
        if token.part_of_speech.split(',')[0] not in ['記号']:
            wakati.append(token.surface)
    wakati_list.append(wakati)
#     test
    if i > 1:
        break

0
1
2


In [8]:
for wakati in wakati_list:
    print(wakati[:10])

['雨', 'に', 'も', 'まけ', 'ず', '風', 'に', 'も', 'まけ', 'ず']
['メロス', 'は', '激怒', 'し', 'た', '必ず', 'かの', '邪智', '暴虐', 'の']
['隴西', 'の', '李', '徴', 'は', '博学', '才', '穎', '天宝', 'の']


In [9]:
dictionary = corpora.Dictionary(wakati_list)
print('==={単語: ID}===')
pprint(dictionary.token2id)

2020-10-21 04:58:41,983 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-10-21 04:58:42,005 : INFO : built Dictionary(2203 unique tokens: ['あらゆる', 'ある', 'あれ', 'い', 'いつも']...) from 3 documents (total 8970 corpus positions)


==={単語: ID}===
{'あ': 132,
 'ああ': 133,
 'あからめ': 134,
 'あげよ': 135,
 'あげる': 136,
 'あさましい': 1455,
 'あす': 137,
 'あたり': 138,
 'あちこち': 139,
 'あっ': 140,
 'あっぱれ': 141,
 'あて': 142,
 'あと': 1456,
 'あなた': 143,
 'あの': 144,
 'あぶない': 1457,
 'あらゆる': 0,
 'あり': 1458,
 'ありがたい': 145,
 'ありがとう': 146,
 'ある': 1,
 'あるい': 147,
 'あれ': 2,
 'あろ': 1459,
 'あわや': 1460,
 'あんな': 148,
 'い': 3,
 'いい': 149,
 'いいえ': 150,
 'いう': 1461,
 'いえ': 151,
 'いきり立っ': 152,
 'いくばく': 1462,
 'いくぶん': 153,
 'いけ': 154,
 'いちど': 155,
 'いっ': 156,
 'いっそ': 157,
 'いつ': 158,
 'いつも': 4,
 'いのち': 159,
 'いひ': 5,
 'いふ': 6,
 'いま': 160,
 'いや': 161,
 'いよ': 162,
 'いよいよ': 163,
 'いる': 164,
 'いろ': 165,
 'いわ': 1463,
 'う': 7,
 'うずくまり': 166,
 'うたい': 167,
 'うたっ': 168,
 'うだろ': 1464,
 'うち': 169,
 'うとうと': 170,
 'うめく': 171,
 'うらみ': 172,
 'うるさく': 173,
 'うれしい': 174,
 'うんと': 175,
 'え': 176,
 'えい': 177,
 'えて': 1465,
 'お': 178,
 'おいおい': 179,
 'おくれ': 180,
 'おっしゃる': 181,
 'おどろい': 182,
 'おまえ': 183,
 'おめおめ': 1466,
 'おめでとう': 184,
 'おら': 1467,
 'おろおろ': 8,
 'お互い': 185,
 'お詫び': 186

In [10]:
corpus = list(map(dictionary.doc2bow, wakati_list))
print('===(単語ID, 出現回数)===')
pprint(corpus)

===(単語ID, 出現回数)===
[[(0, 1),
  (1, 1),
  (2, 3),
  (3, 1),
  (4, 1),
  (5, 2),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 2),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 4),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 7),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 7),
  (38, 1),
  (39, 5),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 2),
  (44, 1),
  (45, 1),
  (46, 15),
  (47, 1),
  (48, 11),
  (49, 1),
  (50, 5),
  (51, 1),
  (52, 3),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 3),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 7),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 2),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 7),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 1),
  (89, 6),

In [11]:
test_model = models.TfidfModel(corpus)
corpus_tfidf = test_model[corpus]
print('===(単語ID, TF-IDF)===')
for doc in corpus_tfidf:
    print(doc[:4])

2020-10-21 04:58:42,937 : INFO : collecting document frequencies
2020-10-21 04:58:42,944 : INFO : PROGRESS: processing document #0
2020-10-21 04:58:42,946 : INFO : calculating IDF weights for 3 documents and 2203 features (2543 matrix non-zeros)


===(単語ID, TF-IDF)===
[(0, 0.09466959838459445), (2, 0.10481919601528028), (4, 0.09466959838459445), (5, 0.1893391967691889)]
[(2, 0.010053208106213159), (10, 0.00753990607965987), (13, 0.0025133020265532898), (40, 0.0050266040531065795)]
[(15, 0.008028064744658793), (26, 0.004014032372329396), (29, 0.004014032372329396), (88, 0.01204209711698819)]


In [12]:
texts_tfidf = []
for doc in corpus_tfidf:
    text_tfidf = []
    for word in doc:
        text_tfidf.append([word[1], dictionary[word[0]]])
    texts_tfidf.append(text_tfidf)
listed_texts_tfidf = []
for i in texts_tfidf:
    listed_texts_tfidf.append(sorted(i, reverse=True))
print('===[単語, TF-IDF]===')
for i in listed_texts_tfidf:
    print(i[:3])

===[単語, TF-IDF]===
[[0.3786783935383778, '菩薩'], [0.28400879515378336, 'まけ'], [0.20963839203056056, '南無']]
[[0.5175463366837201, '私'], [0.5175463366837201, 'メロス'], [0.2437902965756691, 'が']]
[[0.5002990376871312, '己'], [0.3331646869033399, 'が'], [0.25689807182908136, 'で']]


In [13]:
for i in range(len(listed_texts_tfidf)):
    print('')
    print('%s.' % i, '〜%s〜' % df['タイトル'][i])
    for i, text in enumerate(listed_texts_tfidf[i]):
        print(text)
        if i > 100:
            break


0. 〜〔雨ニモマケズ〕〜
[0.3786783935383778, '菩薩']
[0.28400879515378336, 'まけ']
[0.20963839203056056, '南無']
[0.1893391967691889, '朿']
[0.1893391967691889, 'いひ']
[0.1397589280203737, '行']
[0.10481919601528028, 'あれ']
[0.09466959838459445, '［＃「']
[0.09466959838459445, '雪']
[0.09466959838459445, '野菜']
[0.09466959838459445, '釈迦牟尼']
[0.09466959838459445, '負']
[0.09466959838459445, '萓']
[0.09466959838459445, '稲']
[0.09466959838459445, '瞋']
[0.09466959838459445, '看病']
[0.09466959838459445, '病気']
[0.09466959838459445, '玄米']
[0.09466959838459445, '無辺']
[0.09466959838459445, '浄']
[0.09466959838459445, '松']
[0.09466959838459445, '暑']
[0.09466959838459445, '慾']
[0.09466959838459445, '小さな']
[0.09466959838459445, '安立']
[0.09466959838459445, '如来']
[0.09466959838459445, '多']
[0.09466959838459445, '夏']
[0.09466959838459445, '味噌']
[0.09466959838459445, '合']
[0.09466959838459445, '南無妙法蓮華経']
[0.09466959838459445, '北']
[0.09466959838459445, '仏']
[0.09466959838459445, '丈夫']
[0.09466959838459445, 'ー']
[0.09466959838459

In [20]:
text_list = []
for i in range(len(listed_texts_tfidf)):
    texts = []
    for i, text in enumerate(listed_texts_tfidf[i]):
        texts.append(text[1])
        if i > 100:
            break
    text_list.append(texts)

In [41]:
docs

[['菩薩',
  'まけ',
  '南無',
  '朿',
  'いひ',
  '行',
  'あれ',
  '［＃「',
  '雪',
  '野菜',
  '釈迦牟尼',
  '負',
  '萓',
  '稲',
  '瞋',
  '看病',
  '病気',
  '玄米',
  '無辺',
  '浄',
  '松',
  '暑',
  '慾',
  '小さな',
  '安立',
  '如来',
  '多',
  '夏',
  '味噌',
  '合',
  '南無妙法蓮華経',
  '北',
  '仏',
  '丈夫',
  'ー',
  'ゐる',
  'ゐ',
  'わらっ',
  'わたし',
  'わすれ',
  'わかり',
  'ゎやそしょうがあれば',
  'よば',
  'やめろ',
  'もち',
  'みんな',
  'みき',
  'ほめ',
  'ぶん',
  'ぶ',
  'ひどり',
  'ひ',
  'はなみ',
  'のぼ',
  'なつ',
  'ながし',
  'でく',
  'つまらない',
  'つかれ',
  'たべ',
  'しづか',
  'さむ',
  'さうな',
  'こども',
  'こ',
  'けん',
  'くに',
  'きし',
  'がら',
  'かんじょう',
  'おろおろ',
  'いふ',
  'いつも',
  'あらゆる',
  'き',
  '風',
  '雨',
  '野原',
  '西',
  '母',
  '死に',
  '林',
  '東',
  '小屋',
  '宝',
  '四',
  '南',
  '入れ',
  'よく',
  'やり',
  'とき',
  'そして',
  'じ',
  'く',
  'からだ'],
 ['私',
  'メロス',
  'が',
  'で',
  '無い',
  'おまえ',
  '王',
  'セリヌンティウス',
  'ます',
  'いる',
  '妹',
  'いま',
  '市',
  '走る',
  'たち',
  'ここ',
  'か',
  '村',
  'です',
  'それから',
  'ない',
  '陽',
  '殺し',
  '下さい',
  'わし',
  'すぐ',
  '来',
  'なら',
  'く

In [35]:
docs = text_list
companies = title[:3]

In [36]:
# 参考記事： http://qiita.com/okappy/items/32a7ba7eddf8203c9fa1
class LabeledListSentence(object):
    def __init__(self, words_list, labels):
        self.words_list = words_list
        self.labels = labels

    def __iter__(self):
        for i, words in enumerate(self.words_list):
            yield models.doc2vec.LabeledSentence(words, ['%s' % self.labels[i]])

In [37]:
# ライブラリ読み込み
from gensim import models

# gensim にクチコミを登録
# クチコミに会社名を付与するため、参考記事で実装されていた拡張クラスを使っています
sentences = LabeledListSentence(docs, companies)

# doc2vec の学習条件設定
# alpha: 学習率 / min_count: X回未満しか出てこない単語は無視
# size: ベクトルの次元数 / iter: 反復回数 / workers: 並列実行数
model = models.Doc2Vec(alpha=0.025, min_count=5,
                       size=100, iter=20, workers=4)

# doc2vec の学習前準備(単語リスト構築)
model.build_vocab(sentences)

# Wikipedia から学習させた単語ベクトルを無理やり適用して利用することも出来ます
model.intersect_word2vec_format('./data/wiki/wiki2vec.bin', binary=True)

# 学習実行
model.train(sentences)

# セーブ
# model.save('./data/doc2vec.model')

# 学習後はモデルをファイルからロード可能
# model = models.Doc2Vec.load('./data/doc2vec.model')

# 順番が変わってしまうことがあるので会社リストは学習後に再呼び出し
# companies = model.docvecs.offset2doctag

2020-10-21 05:10:06,410 : INFO : collecting all words and their counts
  yield models.doc2vec.LabeledSentence(words, ['%s' % self.labels[i]])
2020-10-21 05:10:06,412 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-10-21 05:10:06,414 : INFO : collected 285 word types and 3 unique tags from a corpus of 3 examples and 299 words
2020-10-21 05:10:06,416 : INFO : Loading a fresh vocabulary
2020-10-21 05:10:06,420 : INFO : effective_min_count=5 retains 0 unique words (0% of original 285, drops 285)
2020-10-21 05:10:06,422 : INFO : effective_min_count=5 leaves 0 word corpus (0% of original 299, drops 299)
2020-10-21 05:10:06,426 : INFO : deleting the raw counts dictionary of 285 items
2020-10-21 05:10:06,430 : INFO : sample=0.001 downsamples 0 most-common words
2020-10-21 05:10:06,432 : INFO : downsampling leaves estimated 0 word corpus (0.0% of prior 0)
2020-10-21 05:10:06,435 : INFO : estimated required memory for 0 words and 100 dimensions: 1800 bytes
20

RuntimeError: you must first build vocabulary before training the model