## MeCabによる形態素解析

In [18]:
import zipfile
import glob
import re

import MeCab
import ipadic
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [11]:
# Sample sentence to analyse.
text = "Udemyは素晴らしいオンライン学習サービスです"
# Build a MeCab tagger configured with the arguments supplied by the ipadic package.
m = MeCab.Tagger(ipadic.MECAB_ARGS)
# parse() returns the whole analysis as one string: a "surface\tfeatures" line
# per token, terminated by an "EOS" line.
m1 = m.parse(text)
m1

'Udemy\t名詞,固有名詞,組織,*,*,*,*\nは\t助詞,係助詞,*,*,*,*,は,ハ,ワ\n素晴らしい\t形容詞,自立,*,*,形容詞・イ段,基本形,素晴らしい,スバラシイ,スバラシイ\nオンライン\t名詞,一般,*,*,*,*,オンライン,オンライン,オンライン\n学習\t名詞,サ変接続,*,*,*,*,学習,ガクシュウ,ガクシュー\nサービス\t名詞,サ変接続,*,*,*,*,サービス,サービス,サービス\nです\t助動詞,*,*,*,特殊・デス,基本形,です,デス,デス\nEOS\n'

In [8]:
# Print the analysis one token line at a time (the final element after the
# trailing newline is an empty string, hence the blank line at the end).
for row in m1.split("\n"):
    print(row)

Udemy	名詞,固有名詞,組織,*,*,*,*
は	助詞,係助詞,*,*,*,*,は,ハ,ワ
素晴らしい	形容詞,自立,*,*,形容詞・イ段,基本形,素晴らしい,スバラシイ,スバラシイ
オンライン	名詞,一般,*,*,*,*,オンライン,オンライン,オンライン
学習	名詞,サ変接続,*,*,*,*,学習,ガクシュウ,ガクシュー
サービス	名詞,サ変接続,*,*,*,*,サービス,サービス,サービス
です	助動詞,*,*,*,特殊・デス,基本形,です,デス,デス
EOS



### 名詞だけ取り出す

In [10]:
# Collect the surface forms of the tokens whose part of speech is "名詞" (noun).
#
# Each MeCab row has the form "surface\tfeature1,feature2,..." and the first
# feature is the part of speech. Only that field is tested: a substring test on
# the whole row would also match non-nouns such as prefixes tagged
# "接頭詞,名詞接続", or any surface/reading that happens to contain "名詞".
noun_list = []
for row in m1.split("\n"):
    surface, sep, features = row.partition("\t")
    # The "EOS" marker and the trailing empty row have no tab, so `sep` is "".
    if sep and features.split(",")[0] == "名詞":
        noun_list.append(surface)
    print(row)
noun_list

Udemy	名詞,固有名詞,組織,*,*,*,*
は	助詞,係助詞,*,*,*,*,は,ハ,ワ
素晴らしい	形容詞,自立,*,*,形容詞・イ段,基本形,素晴らしい,スバラシイ,スバラシイ
オンライン	名詞,一般,*,*,*,*,オンライン,オンライン,オンライン
学習	名詞,サ変接続,*,*,*,*,学習,ガクシュウ,ガクシュー
サービス	名詞,サ変接続,*,*,*,*,サービス,サービス,サービス
です	助動詞,*,*,*,特殊・デス,基本形,です,デス,デス
EOS



['Udemy', 'オンライン', '学習', 'サービス']

### livedoorニュースを取得して形態素解析

In [None]:
# サンプルデータを用意
zip_file = 'topic-news_sub.zip'

extract_dir = ''

with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

In [25]:
# Collect the paths of the article text files under sample_data/.
# NOTE(review): glob.glob() does not guarantee a particular ordering, so the
# document indices used later can differ between environments; wrap in
# sorted(...) if reproducible indices matter.
files = glob.glob("sample_data/*.txt")

In [27]:
def _read_utf8(path):
    """Return the full contents of the UTF-8 encoded text file at *path*."""
    with open(path, "r", encoding="utf-8") as handle:
        return handle.read()


# Load every article into memory, keeping the same order as `files`.
documents = [_read_utf8(file_path) for file_path in files]

In [29]:
# Inspect the raw article texts that were loaded.
documents

['http://news.livedoor.com/article/detail/5903225/\n2011-10-02T10:00:00+0900\n悪評が次から次へと溢れ出る川越シェフ\n最近、TVでも人気者の川越シェフ。代官山のおしゃれなイタリアンレストランのオーナーシェフでありながら、タレントとも軽妙なトークを交わすマルチな才能で、TV業界でもその地位を不動のものとしつつある。また、その甘いマスクに魅せられた、「川越女子」なる固定ファンまで存在するという。\n\nしかし、ここにきて川越シェフの悪評がどんどん噴出しているようだ。最近では、ミニストップの川越シェフプロデュースクリスマスケーキが、1万円という超高額で売り出されることが話題となった。\n\n【関連記事】\nミニストップの川越シェフプロデュースクリスマスケーキがバカ高いと話題に\nhttp://news.livedoor.com/topics/detail/5899457/\n\n普通のケーキなら、原価は200円程度。川越シェフプロデュースだけに、多少豪華に作ってあるかもしれないが、それでも「ぼったくり」というネット上の反応が多かった。また、それ以上に、「有名になった途端、ここぞとばかりに金儲けしている」という声も散見された。\n\nまた、普段のマスコミ対応の悪さもかなりのものだ。数年前までは、川越シェフなどほとんど誰も知らないような状態だったが、有名となった今は手の平を返すように、週刊誌の記者なを見下すような態度が見られるようになったという。\n\nマスコミのおかげで有名になっておきながら、恩知らずとはこのことだろう。\n\n【関連記事】\nシェフのタレント化はもうたくさん!?\u3000マスコミが嫌うタレントシェフ\nhttp://news.livedoor.com/topics/detail/5801160/\n\nしかし、単なるシェフ風情が調子に乗ってしまうのも無理はない。何せ、「株式会社TATSUYA KAWAGOE」の年商は約9,000万円、さらにTVへの出演料も高額に上る。\n\nまた、6月にTV番組で「バツ2」の独身であることを告白している。あの甘いマスクに有名人とくれば、モテないはずはない。つまり、カネもオンナも“やりたい放題”なのだろう。\n\n【関連記事】\n離婚告白川越達也

#### 名詞だけを抜き出す
正規表現を用いて日本語のみ抽出

In [None]:
# Build, for every article, a single space-separated string of its nouns so the
# result can be fed to a whitespace/word-boundary based vectorizer.
document_list = []

m = MeCab.Tagger(ipadic.MECAB_ARGS)
# Matches one hiragana, katakana or kanji character (plus the iteration mark 々).
# A noun is kept only if it contains at least one such character, which drops
# purely alphanumeric tokens (e.g. "TV", numbers, URL fragments).
pattern = re.compile(r"[ぁ-んァ-ヶ一-龥々]")

for document in documents:
    m1 = m.parse(document)

    noun_list = []
    for row in m1.split("\n"):
        # Each token row is "surface\tfeatures"; the end marker is the bare
        # line "EOS" with no tab.
        surface, sep, features = row.partition("\t")
        if not sep:
            if row == "EOS":
                # Compare the whole row so that a token whose surface is
                # literally "EOS" does not terminate parsing early.
                break
            # Skip rows without features instead of raising IndexError.
            continue
        # The feature string starts with the part of speech.
        if features[:2] == "名詞" and pattern.search(surface):
            noun_list.append(surface)
    words = " ".join(noun_list)
    document_list.append(words)


In [38]:
# Inspect the space-separated noun strings built for each document.
document_list

['悪評 次 次 川越 シェフ 最近 人気 者 川越 シェフ 代官山 おしゃれ イタリアン レストラン オーナー シェフ タレント 軽妙 トーク マルチ 才能 業界 地位 不動 もの マスク 川越 女子 固定 ファン 存在 ここ 川越 シェフ 悪評 噴出 よう 最近 ミニストップ 川越 シェフプロデュースクリスマスケーキ 万 円 高額 こと 話題 関連 記事 ミニストップ 川越 シェフプロデュースクリスマスケーキ バカ 話題 普通 ケーキ 原価 円 程度 川越 シェフ プロデュース 豪華 くり ネット 上 反応 それ 以上 有名 途端 ここ 金儲け 声 散見 普段 マスコミ 対応 さ かなり もの 数 年 前 川越 シェフ 誰 よう 状態 有名 今 手の平 よう 週刊 誌 記者 よう 態度 よう マスコミ おかげ 有名 恩知らず こと 関連 記事 シェフ タレント 化 たくさん マスコミ タレント シェフ シェフ 風情 調子 の 無理 株式会社 年商 万 円 出演 料 高額 月 番組 バツ 独身 こと 告白 マスク 有名人 モテ はず カネ オンナ 放題 の 関連 記事 離婚 告白 川越 達也 シェフ 週 ネイルサロン 収録 時 口紅 川越 シェフ 才 年下 美人 秘書 シェフ デキ 説 否定 天狗 っぷり 盛ん 報道 よう 最近 収録 現場 ブチギレ こと よう 現場 スタッフ 評判 の 関連 記事 川越 シェフ 我慢 限界 ブチギレ あと 肝心 料理 腕 デパ 地下 惣菜 レベル 意見 それ 上 目線 評論 疑問符 化けの皮 十分 本人 の 関連 記事 大人気 イケメンシェフ 川越 達也 氏 カネ 評判 川越 シェフ 程度 買い 視聴 者 レベル 問題 メディア 戦略 よう 頭 残念 人 言葉 日本 景気 ため パー お金 もの 関連 記事 川越 達也 プロデュース 商品 件 川越 ドッグフード 存在 川越 達也 毎日 川越 達也 監修 商品 川越 女子 川越 達也 メニュー 川越 シェフ キムチ 川越 達也 ドヤ 顔 川越 女子 ブチギレ 私 気 気 川越 達也 シェフ',
 '写真 流出 アイドル たち アイドル 男性 一緒 プリクラ ニャンニャン 写真 流出 スキャンダル 発展 ケース アイドル 交際 表沙汰 の 今 昔 ご法度 女性 アイドル ファン 男

In [39]:
# Vectorize the noun strings: one row per document, one column per term.
# NOTE(review): with default settings TfidfVectorizer's token_pattern keeps only
# tokens of two or more characters, so single-character nouns present in
# document_list (e.g. "次", "者", "円") are presumably dropped from the
# vocabulary — confirm whether that is intended, or pass
# token_pattern=r"(?u)\b\w+\b" to keep them.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(document_list)

In [40]:
# Show the summary (dtype, stored elements, shape) of the sparse TF-IDF matrix.
tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6663 stored elements and shape (100, 3294)>

In [44]:
# Convert the sparse TF-IDF matrix to a dense array and view it as a DataFrame.
pd.DataFrame(tfidf_matrix.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,3254,3255,3256,3257,3258,3259,3260,3261,3262,3263,3264,3265,3266,3267,3268,3269,3270,3271,3272,3273,3274,3275,3276,3277,3278,3279,3280,3281,3282,3283,3284,3285,3286,3287,3288,3289,3290,3291,3292,3293
0,0.0,0.0,0.0,0.0,0.0,0.023218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02858,0.0,0.031146,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.062292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.048635,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.036343,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.025668,0.0,...,0.036964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.073928,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### コサイン類似度の関数
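$$
\text{Cos類似度}(\mathbf{a}, \mathbf{b}) = \frac{\mathbf{a} \cdot \mathbf{b}}{\|\mathbf{a}\|\,\|\mathbf{b}\|} = \frac{a_1b_1 + a_2b_2 + \dots + a_nb_n}{\sqrt{a_1^2+a_2^2+\dots+a_n^2}\,\sqrt{b_1^2+b_2^2+\dots+b_n^2}}
$$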


In [45]:
def cos_sim(vec_a, vec_b):
    """Return the cosine similarity of two 1-D vectors.

    The similarity is the dot product divided by the product of the Euclidean
    norms.

    Args:
        vec_a: First vector (array-like of numbers).
        vec_b: Second vector, same length as ``vec_a``.

    Returns:
        The cosine similarity as a float. If either vector has zero norm the
        similarity is undefined; ``0.0`` is returned instead of NaN so that
        downstream operations such as ``np.argmax`` are not poisoned.
    """
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if norm_product == 0:
        # Avoid 0/0 -> NaN (and the accompanying RuntimeWarning).
        return 0.0
    return np.dot(vec_a, vec_b) / norm_product

In [55]:
# Number of rows of the TF-IDF matrix (one row per document).
num = tfidf_matrix.shape[0]
# Square matrix that will hold the similarity of every document pair (i, j).
cos_sim_matrix = np.zeros((num, num))

In [56]:
# Fill the pairwise cosine-similarity matrix with the hand-written cos_sim().
#
# The sparse matrix is converted to a dense array once up front; converting
# individual rows inside the inner loop would repeat the same sparse-to-dense
# work 2 * num * num times for identical results.
dense_vectors = tfidf_matrix.toarray()

for i in range(num):
    vec_a = dense_vectors[i]  # row i is invariant across the inner loop
    for j in range(num):
        cos_sim_matrix[i, j] = cos_sim(vec_a, dense_vectors[j])

In [58]:
# View the hand-computed similarity matrix as a DataFrame.
pd.DataFrame(cos_sim_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,1.000000,0.022710,0.015538,0.017553,0.009409,0.014440,0.010969,0.007492,0.020132,0.006399,0.004916,0.017104,0.018703,0.020998,0.016503,0.017612,0.015006,0.009055,0.016175,0.001394,0.021398,0.014936,0.024654,0.014651,0.007639,0.013533,0.007107,0.013936,0.009848,0.010243,0.020839,0.013712,0.014607,0.015765,0.012912,0.011940,0.003548,0.014598,0.027424,0.011558,...,0.009284,0.008012,0.028670,0.018209,0.007000,0.010156,0.025607,0.006556,0.010796,0.016640,0.007670,0.018573,0.007721,0.020341,0.013332,0.010523,0.018290,0.010052,0.007769,0.016068,0.029872,0.010316,0.015329,0.006938,0.015076,0.022207,0.015003,0.018247,0.006231,0.013149,0.034409,0.015197,0.006960,0.023845,0.007047,0.014256,0.009097,0.019439,0.003494,0.004380
1,0.022710,1.000000,0.034542,0.015970,0.018366,0.015958,0.015647,0.011941,0.010752,0.061438,0.027067,0.023834,0.042407,0.012589,0.009264,0.025130,0.011577,0.008055,0.012439,0.001313,0.040390,0.027526,0.017585,0.018500,0.004797,0.005043,0.007256,0.014563,0.004383,0.016700,0.005641,0.031242,0.010974,0.022391,0.026653,0.025829,0.015253,0.016099,0.019133,0.078869,...,0.007507,0.009538,0.017667,0.009342,0.014785,0.016393,0.043894,0.002623,0.008097,0.028922,0.010728,0.005283,0.019919,0.026347,0.012845,0.015441,0.031221,0.017251,0.009027,0.021817,0.011364,0.018993,0.007411,0.001618,0.010714,0.012880,0.180326,0.017933,0.013892,0.012528,0.008751,0.009804,0.043899,0.019993,0.006667,0.020644,0.027634,0.050744,0.001035,0.005053
2,0.015538,0.034542,1.000000,0.021207,0.058747,0.022948,0.010561,0.017898,0.018978,0.024315,0.086284,0.024923,0.018720,0.006901,0.267810,0.023116,0.017653,0.012221,0.061683,0.001635,0.028101,0.017212,0.004761,0.019249,0.000000,0.023329,0.008345,0.015991,0.206558,0.026044,0.022267,0.035847,0.014757,0.009456,0.035118,0.012382,0.171472,0.020549,0.027577,0.061086,...,0.011976,0.004190,0.011691,0.017537,0.035639,0.008227,0.009734,0.004056,0.004153,0.027865,0.024908,0.006162,0.013495,0.034949,0.016993,0.012313,0.027725,0.006204,0.018128,0.009129,0.007190,0.041152,0.010728,0.045030,0.009824,0.004552,0.020355,0.031041,0.004095,0.008965,0.008009,0.022879,0.008352,0.020846,0.010121,0.010624,0.021217,0.013562,0.004805,0.002072
3,0.017553,0.015970,0.021207,1.000000,0.008038,0.006696,0.019709,0.008395,0.012995,0.025412,0.004069,0.008214,0.017444,0.004345,0.001732,0.013935,0.031435,0.011716,0.005762,0.002216,0.008918,0.022346,0.004516,0.008107,0.014689,0.003847,0.024552,0.010943,0.010252,0.007378,0.011688,0.010483,0.008320,0.029075,0.009186,0.009328,0.004748,0.011777,0.015301,0.016721,...,0.003919,0.003421,0.005894,0.016616,0.003195,0.001791,0.016333,0.010962,0.009753,0.035275,0.009370,0.009203,0.012415,0.009270,0.008969,0.004882,0.009072,0.011873,0.007609,0.011772,0.006967,0.002831,0.020019,0.009679,0.005040,0.013355,0.028461,0.013509,0.011568,0.014636,0.014070,0.004913,0.033730,0.007448,0.007045,0.007950,0.015064,0.008343,0.006416,0.018425
4,0.009409,0.018366,0.058747,0.008038,1.000000,0.003243,0.100169,0.009583,0.016584,0.003341,0.021265,0.019232,0.046186,0.014595,0.035778,0.018462,0.022150,0.002794,0.085707,0.003568,0.043214,0.035379,0.000673,0.113712,0.014476,0.008403,0.012346,0.011940,0.016309,0.022928,0.011089,0.014703,0.048928,0.006701,0.053104,0.005927,0.026795,0.004369,0.014219,0.008186,...,0.000872,0.003295,0.011032,0.004638,0.001003,0.007723,0.011217,0.004469,0.008194,0.019527,0.016977,0.013034,0.008076,0.006667,0.021460,0.007928,0.036049,0.032319,0.009906,0.018829,0.011091,0.023201,0.002422,0.037369,0.016675,0.003157,0.042397,0.006714,0.002747,0.004276,0.004687,0.033521,0.018698,0.008635,0.004621,0.034328,0.013516,0.028376,0.008834,0.004522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.014256,0.020644,0.010624,0.007950,0.034328,0.020487,0.005161,0.052809,0.016663,0.009370,0.010266,0.016499,0.008271,0.011376,0.005659,0.006899,0.009678,0.005667,0.009969,0.009504,0.008172,0.004520,0.010540,0.008651,0.014734,0.018616,0.005562,0.011098,0.004071,0.011303,0.004001,0.011998,0.008356,0.009370,0.012581,0.021063,0.014330,0.012382,0.010642,0.012118,...,0.005035,0.000816,0.005317,0.018745,0.010818,0.012787,0.010295,0.000662,0.007283,0.019196,0.001868,0.015883,0.003542,0.008797,0.004458,0.007044,0.004033,0.008942,0.005224,0.189264,0.008750,0.015342,0.004468,0.003891,0.014072,0.008908,0.001645,0.030177,0.004139,0.003848,0.009361,0.002236,0.005335,0.012819,0.002430,1.000000,0.008310,0.011850,0.007484,0.004972
96,0.009097,0.027634,0.021217,0.015064,0.013516,0.005452,0.006211,0.006733,0.004589,0.015406,0.002229,0.012553,0.004799,0.041763,0.016071,0.047396,0.009786,0.002693,0.014478,0.000800,0.033789,0.040772,0.005209,0.021682,0.004935,0.006819,0.000000,0.010187,0.006737,0.114354,0.016261,0.004647,0.010631,0.028956,0.017316,0.034445,0.006502,0.022413,0.017218,0.031337,...,0.002788,0.002852,0.005088,0.009293,0.019902,0.012475,0.015838,0.012641,0.010146,0.029272,0.024549,0.012420,0.053994,0.009119,0.002516,0.004225,0.042565,0.023730,0.023966,0.006699,0.005680,0.033270,0.007043,0.011546,0.040338,0.003973,0.014806,0.018834,0.007114,0.018962,0.003589,0.013967,0.032735,0.014212,0.006290,0.008310,1.000000,0.022451,0.001697,0.004946
97,0.019439,0.050744,0.013562,0.008343,0.028376,0.003809,0.009368,0.008645,0.009235,0.027599,0.017398,0.021489,0.017670,0.015266,0.003415,0.035527,0.009698,0.005082,0.014822,0.000590,0.049219,0.016536,0.005961,0.025485,0.016898,0.017421,0.008807,0.008633,0.016748,0.012421,0.015697,0.001938,0.016840,0.014282,0.023673,0.023093,0.001826,0.008403,0.003130,0.059946,...,0.005030,0.007084,0.012266,0.020180,0.023260,0.020582,0.007627,0.005204,0.005545,0.037216,0.001318,0.005173,0.009132,0.040130,0.025831,0.006513,0.024652,0.002322,0.008553,0.024205,0.008260,0.074424,0.006785,0.011284,0.013663,0.010443,0.037729,0.024887,0.005978,0.017645,0.003616,0.004397,0.007914,0.015219,0.004778,0.011850,0.022451,1.000000,0.013941,0.012914
98,0.003494,0.001035,0.004805,0.006416,0.008834,0.018147,0.005950,0.005560,0.001637,0.005704,0.001196,0.021267,0.023133,0.018574,0.005849,0.001363,0.018477,0.000000,0.000716,0.000000,0.000000,0.013483,0.001856,0.009914,0.002937,0.001766,0.004757,0.012797,0.004363,0.007776,0.006261,0.000000,0.004866,0.013940,0.011167,0.016263,0.000000,0.003738,0.000756,0.002839,...,0.000432,0.005133,0.004168,0.003291,0.000437,0.007966,0.038219,0.000000,0.002211,0.011469,0.001695,0.004640,0.008452,0.002424,0.009434,0.003133,0.091962,0.000000,0.005602,0.034152,0.005450,0.017090,0.003346,0.000000,0.002401,0.007706,0.030781,0.012156,0.003519,0.006133,0.009009,0.011437,0.007869,0.004565,0.005861,0.007484,0.001697,0.013941,1.000000,0.012288


#### sklearnを使えば1行でできる

In [59]:
# Pairwise cosine similarity between all rows of the TF-IDF matrix, computed by
# scikit-learn in a single call (accepts the sparse matrix directly).
cos_sim_matrix_sklearn = cosine_similarity(tfidf_matrix)

In [61]:
# View the scikit-learn similarity matrix as a DataFrame for comparison with
# the hand-computed one.
pd.DataFrame(cos_sim_matrix_sklearn)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,1.000000,0.022710,0.015538,0.017553,0.009409,0.014440,0.010969,0.007492,0.020132,0.006399,0.004916,0.017104,0.018703,0.020998,0.016503,0.017612,0.015006,0.009055,0.016175,0.001394,0.021398,0.014936,0.024654,0.014651,0.007639,0.013533,0.007107,0.013936,0.009848,0.010243,0.020839,0.013712,0.014607,0.015765,0.012912,0.011940,0.003548,0.014598,0.027424,0.011558,...,0.009284,0.008012,0.028670,0.018209,0.007000,0.010156,0.025607,0.006556,0.010796,0.016640,0.007670,0.018573,0.007721,0.020341,0.013332,0.010523,0.018290,0.010052,0.007769,0.016068,0.029872,0.010316,0.015329,0.006938,0.015076,0.022207,0.015003,0.018247,0.006231,0.013149,0.034409,0.015197,0.006960,0.023845,0.007047,0.014256,0.009097,0.019439,0.003494,0.004380
1,0.022710,1.000000,0.034542,0.015970,0.018366,0.015958,0.015647,0.011941,0.010752,0.061438,0.027067,0.023834,0.042407,0.012589,0.009264,0.025130,0.011577,0.008055,0.012439,0.001313,0.040390,0.027526,0.017585,0.018500,0.004797,0.005043,0.007256,0.014563,0.004383,0.016700,0.005641,0.031242,0.010974,0.022391,0.026653,0.025829,0.015253,0.016099,0.019133,0.078869,...,0.007507,0.009538,0.017667,0.009342,0.014785,0.016393,0.043894,0.002623,0.008097,0.028922,0.010728,0.005283,0.019919,0.026347,0.012845,0.015441,0.031221,0.017251,0.009027,0.021817,0.011364,0.018993,0.007411,0.001618,0.010714,0.012880,0.180326,0.017933,0.013892,0.012528,0.008751,0.009804,0.043899,0.019993,0.006667,0.020644,0.027634,0.050744,0.001035,0.005053
2,0.015538,0.034542,1.000000,0.021207,0.058747,0.022948,0.010561,0.017898,0.018978,0.024315,0.086284,0.024923,0.018720,0.006901,0.267810,0.023116,0.017653,0.012221,0.061683,0.001635,0.028101,0.017212,0.004761,0.019249,0.000000,0.023329,0.008345,0.015991,0.206558,0.026044,0.022267,0.035847,0.014757,0.009456,0.035118,0.012382,0.171472,0.020549,0.027577,0.061086,...,0.011976,0.004190,0.011691,0.017537,0.035639,0.008227,0.009734,0.004056,0.004153,0.027865,0.024908,0.006162,0.013495,0.034949,0.016993,0.012313,0.027725,0.006204,0.018128,0.009129,0.007190,0.041152,0.010728,0.045030,0.009824,0.004552,0.020355,0.031041,0.004095,0.008965,0.008009,0.022879,0.008352,0.020846,0.010121,0.010624,0.021217,0.013562,0.004805,0.002072
3,0.017553,0.015970,0.021207,1.000000,0.008038,0.006696,0.019709,0.008395,0.012995,0.025412,0.004069,0.008214,0.017444,0.004345,0.001732,0.013935,0.031435,0.011716,0.005762,0.002216,0.008918,0.022346,0.004516,0.008107,0.014689,0.003847,0.024552,0.010943,0.010252,0.007378,0.011688,0.010483,0.008320,0.029075,0.009186,0.009328,0.004748,0.011777,0.015301,0.016721,...,0.003919,0.003421,0.005894,0.016616,0.003195,0.001791,0.016333,0.010962,0.009753,0.035275,0.009370,0.009203,0.012415,0.009270,0.008969,0.004882,0.009072,0.011873,0.007609,0.011772,0.006967,0.002831,0.020019,0.009679,0.005040,0.013355,0.028461,0.013509,0.011568,0.014636,0.014070,0.004913,0.033730,0.007448,0.007045,0.007950,0.015064,0.008343,0.006416,0.018425
4,0.009409,0.018366,0.058747,0.008038,1.000000,0.003243,0.100169,0.009583,0.016584,0.003341,0.021265,0.019232,0.046186,0.014595,0.035778,0.018462,0.022150,0.002794,0.085707,0.003568,0.043214,0.035379,0.000673,0.113712,0.014476,0.008403,0.012346,0.011940,0.016309,0.022928,0.011089,0.014703,0.048928,0.006701,0.053104,0.005927,0.026795,0.004369,0.014219,0.008186,...,0.000872,0.003295,0.011032,0.004638,0.001003,0.007723,0.011217,0.004469,0.008194,0.019527,0.016977,0.013034,0.008076,0.006667,0.021460,0.007928,0.036049,0.032319,0.009906,0.018829,0.011091,0.023201,0.002422,0.037369,0.016675,0.003157,0.042397,0.006714,0.002747,0.004276,0.004687,0.033521,0.018698,0.008635,0.004621,0.034328,0.013516,0.028376,0.008834,0.004522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.014256,0.020644,0.010624,0.007950,0.034328,0.020487,0.005161,0.052809,0.016663,0.009370,0.010266,0.016499,0.008271,0.011376,0.005659,0.006899,0.009678,0.005667,0.009969,0.009504,0.008172,0.004520,0.010540,0.008651,0.014734,0.018616,0.005562,0.011098,0.004071,0.011303,0.004001,0.011998,0.008356,0.009370,0.012581,0.021063,0.014330,0.012382,0.010642,0.012118,...,0.005035,0.000816,0.005317,0.018745,0.010818,0.012787,0.010295,0.000662,0.007283,0.019196,0.001868,0.015883,0.003542,0.008797,0.004458,0.007044,0.004033,0.008942,0.005224,0.189264,0.008750,0.015342,0.004468,0.003891,0.014072,0.008908,0.001645,0.030177,0.004139,0.003848,0.009361,0.002236,0.005335,0.012819,0.002430,1.000000,0.008310,0.011850,0.007484,0.004972
96,0.009097,0.027634,0.021217,0.015064,0.013516,0.005452,0.006211,0.006733,0.004589,0.015406,0.002229,0.012553,0.004799,0.041763,0.016071,0.047396,0.009786,0.002693,0.014478,0.000800,0.033789,0.040772,0.005209,0.021682,0.004935,0.006819,0.000000,0.010187,0.006737,0.114354,0.016261,0.004647,0.010631,0.028956,0.017316,0.034445,0.006502,0.022413,0.017218,0.031337,...,0.002788,0.002852,0.005088,0.009293,0.019902,0.012475,0.015838,0.012641,0.010146,0.029272,0.024549,0.012420,0.053994,0.009119,0.002516,0.004225,0.042565,0.023730,0.023966,0.006699,0.005680,0.033270,0.007043,0.011546,0.040338,0.003973,0.014806,0.018834,0.007114,0.018962,0.003589,0.013967,0.032735,0.014212,0.006290,0.008310,1.000000,0.022451,0.001697,0.004946
97,0.019439,0.050744,0.013562,0.008343,0.028376,0.003809,0.009368,0.008645,0.009235,0.027599,0.017398,0.021489,0.017670,0.015266,0.003415,0.035527,0.009698,0.005082,0.014822,0.000590,0.049219,0.016536,0.005961,0.025485,0.016898,0.017421,0.008807,0.008633,0.016748,0.012421,0.015697,0.001938,0.016840,0.014282,0.023673,0.023093,0.001826,0.008403,0.003130,0.059946,...,0.005030,0.007084,0.012266,0.020180,0.023260,0.020582,0.007627,0.005204,0.005545,0.037216,0.001318,0.005173,0.009132,0.040130,0.025831,0.006513,0.024652,0.002322,0.008553,0.024205,0.008260,0.074424,0.006785,0.011284,0.013663,0.010443,0.037729,0.024887,0.005978,0.017645,0.003616,0.004397,0.007914,0.015219,0.004778,0.011850,0.022451,1.000000,0.013941,0.012914
98,0.003494,0.001035,0.004805,0.006416,0.008834,0.018147,0.005950,0.005560,0.001637,0.005704,0.001196,0.021267,0.023133,0.018574,0.005849,0.001363,0.018477,0.000000,0.000716,0.000000,0.000000,0.013483,0.001856,0.009914,0.002937,0.001766,0.004757,0.012797,0.004363,0.007776,0.006261,0.000000,0.004866,0.013940,0.011167,0.016263,0.000000,0.003738,0.000756,0.002839,...,0.000432,0.005133,0.004168,0.003291,0.000437,0.007966,0.038219,0.000000,0.002211,0.011469,0.001695,0.004640,0.008452,0.002424,0.009434,0.003133,0.091962,0.000000,0.005602,0.034152,0.005450,0.017090,0.003346,0.000000,0.002401,0.007706,0.030781,0.012156,0.003519,0.006133,0.009009,0.011437,0.007869,0.004565,0.005861,0.007484,0.001697,0.013941,1.000000,0.012288


#### もっとも類似度の高い文章を抽出する

In [66]:
# 対角の1を0に
np.fill_diagonal(cos_sim_matrix, 0)

doc_index = np.unravel_index(np.argmax(cos_sim_matrix), cos_sim_matrix.shape)

In [67]:
# Similarity score of the most similar document pair.
cos_sim_matrix[doc_index]

np.float64(0.6108464979109193)

In [68]:
# (row, column) indices of the most similar document pair.
doc_index

(np.int64(38), np.int64(40))

In [69]:
# First article of the most similar pair. The index is taken from doc_index
# rather than hard-coded, because document order depends on the file listing
# and a fixed number would silently show the wrong article elsewhere.
documents[doc_index[0]]

'http://news.livedoor.com/article/detail/5937779/\n2011-10-14T16:44:00+0900\nビッグダディに「実はけっこう金持ち」疑惑が浮上\n男手ひとつで4男4女を育ててきたビッグダディこと林下清志さんを中心とする大家族に密着したドキュメント番組「痛快!ビッグダディ」は、2006年の放送開始以来、10月で通算13回目の放送となり、お茶の間での人気も定着した感がある。\n\n現在、ビッグダディの家族は4男4女に再婚した18歳年下の妻の連れ子1男4女が加わり、合計15人の大家族に膨れ上がった。\n\nしかし、ここにきて「ビッグダディの金回りがおかしい」という疑惑がネット上で噴出している。\n\nもともとは奄美大島で接骨院を営んでいたビッグダディだが、十分に稼げず、豊田では雇われる形で出稼ぎに出ていた。出稼ぎしなければならないほど逼迫した状況で、しかも10人以上の子供がいるとなれば、いくら子供手当てがあるとはいえ、家計が火の車であることは容易に想像できる。\n\nしかし、10月1日、8日と2週連続で放映された同番組では、豊田の接骨院を辞め小豆島へ移住、さらに引越しに際してはマイクロバスをチャーター、小豆島に一軒家を購入、そして小豆島で接骨院を開業するなど、やけに金回りがいいのだ。また、ネット上の番組ウォッチャーによる、「子供がたまにブランド品のTシャツを着ている」といった書き込みも目立つ。\n\n常識的に考えれば、接骨院の開業には少なくとも100万円以上の資金が必要だろう。番組内では「友人から借りた」としていたが、そんなにあっさりお金を貸してくれる友人がいるなら、もっと早い段階で助けを求めていそうなものだが…。\n\nネット上では「知名度を生かして講演会に出演し儲けている」といった報告も相次いだ。講演会に関しては、2011年5月に実際に開催されていた模様で、参加した聴衆がブログにその模様を記している。下手な芸能人よりも知名度があるビッグダディの現状を考えると、講演会のギャラは最低でも数十万円になることが予想される。\n\n【関連リンク】\n・【ビックダディ講演会】ダディの子育て論\n\nまた、「TV局から多額のギャラを受け取っているのではないか」という疑惑も噴出した。一部週刊誌ではギャラの存在について否定してい

In [70]:
# Second article of the most similar pair. The index is taken from doc_index
# rather than hard-coded, because document order depends on the file listing
# and a fixed number would silently show the wrong article elsewhere.
documents[doc_index[1]]

'http://news.livedoor.com/article/detail/5938279/\n2011-10-16T10:00:00+0900\n″ビッグダディ″放送内容に様々な疑惑が浮上\nテレビ朝日系列で放映されている人気番組『痛快!!ビッグダディ』。\n\n整体師の林下清志さん（通称ビッグダディ）とその妻が、様々な土地に移り住みながら13人の子どもを育てるという、いわゆる"大家族モノ"のドキュメント番組である。\n\n2011年10月までに特別編を含め18回も放送され、その度に高視聴率を叩き出す人気ぶりだが、最近はビッグダディの過去や番組の舞台裏を探るニュースも話題になっている。\n\n【関連情報】\n・ビッグダディが“衝撃の過去”をネットで明かしていた疑惑が浮上\nhttp://news.livedoor.com/article/detail/5930673/\n\n仕事をはじめ常に行き当たりばったりに見えるビッグダディ。しかし移住先は「何らかの支援制度を設けた地域を狙っている」と指摘する声も。\n\n【関連情報】\n・移住者に支援がある地域を狙うビッグダディ（リアルライブ?）\nhttp://news.livedoor.com/article/detail/5927775/\n\nまた一方では「子供がたまにブランド品のTシャツを着ている」などの書き込みも目立ち「ビッグダディの金回りがおかしい」という疑惑もネット上で噴出した。\n\n【関連情報】\n・ビッグダディに「実はけっこう金持ち」疑惑が浮上\nhttp://news.livedoor.com/article/detail/5937779/\n\n従来の大家族モノを裏切って「確信犯的」とも指摘されているが、貧しいながらも明るく懸命に生きる姿まで「ヤラセだった」というオチではないことを祈りたい。\n\n【関連情報】\n・裏のストーリー隠すビッグダディ\nhttp://news.livedoor.com/topics/detail/5929592/\n'