In [1]:
from gensim import corpora
from gensim import models
from gensim.models.word2vec import Word2Vec, Text8Corpus
from gensim.utils import simple_preprocess
from janome.tokenizer import Tokenizer
from pprint import pprint
import pandas as pd
import numpy as np
import logging

In [2]:
# Send INFO-level log records (e.g. the gensim progress lines visible in the
# cell outputs) to the notebook, stamped with time and severity.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [3]:
# Toy corpus: five short documents whose whitespace-separated tokens are
# drawn from the three-symbol vocabulary a / b / c.
documents = ['a b c a', 'c b c', 'b b a', 'a c c', 'c b a']

In [4]:
# Tokenise each document on whitespace: one list of tokens per document.
texts = [document.split() for document in documents]

In [5]:
# Display the tokenised documents (one list of tokens per input string).
texts

[['a', 'b', 'c', 'a'],
 ['c', 'b', 'c'],
 ['b', 'b', 'a'],
 ['a', 'c', 'c'],
 ['c', 'b', 'a']]

In [6]:
# Build the vocabulary: every distinct token in `texts` gets an integer id.
dictionary = corpora.Dictionary(documents=texts)

# Show the token -> id mapping (the header is Japanese for
# "word -> id conversion dictionary").
token_to_id = dictionary.token2id
print('===単語->idの変換辞書===')
pprint(token_to_id)

2020-10-19 02:50:11,319 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-10-19 02:50:11,322 : INFO : built Dictionary(3 unique tokens: ['a', 'b', 'c']) from 5 documents (total 16 corpus positions)


===単語->idの変換辞書===
{'a': 0, 'b': 1, 'c': 2}


In [7]:
# Convert every tokenised document into its bag-of-words form: a list of
# (token_id, count) pairs, as shown in the printed result.
corpus = [dictionary.doc2bow(tokens) for tokens in texts]
print('===corpus化されたtexts===')
pprint(corpus)

===corpus化されたtexts===
[[(0, 2), (1, 1), (2, 1)],
 [(1, 1), (2, 2)],
 [(0, 1), (1, 2)],
 [(0, 1), (2, 2)],
 [(0, 1), (1, 1), (2, 1)]]


In [8]:
# Fit a TF-IDF model on the bag-of-words corpus; the log output shows the
# document frequencies and IDF weights being computed at this point.
test_model = models.TfidfModel(corpus)

2020-10-19 02:50:11,371 : INFO : collecting document frequencies
2020-10-19 02:50:11,373 : INFO : PROGRESS: processing document #0
2020-10-19 02:50:11,380 : INFO : calculating IDF weights for 5 documents and 3 features (12 matrix non-zeros)


In [9]:
# Re-weight the bag-of-words corpus with the fitted model; iterating the
# result yields one list of (token_id, tfidf_weight) pairs per document.
corpus_tfidf = test_model[corpus]

In [10]:
# Print the TF-IDF vector of every document as (token_id, weight) pairs.
# The header uses print rather than pprint: pprint emits the repr of a string,
# so the header was shown wrapped in quotes, unlike the other header lines in
# this notebook.
print('===結果表示===')
for doc in corpus_tfidf:
    print(doc)

'===結果表示==='
[(0, 0.816496580927726), (1, 0.408248290463863), (2, 0.408248290463863)]
[(1, 0.447213595499958), (2, 0.894427190999916)]
[(0, 0.447213595499958), (1, 0.894427190999916)]
[(0, 0.447213595499958), (2, 0.894427190999916)]
[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]


In [11]:
# Swap each integer token id for its token string so the TF-IDF weights can
# be read directly: one [token, weight] pair per distinct token per document.
texts_tfidf = [
    [[dictionary[token_id], weight] for token_id, weight in doc]
    for doc in corpus_tfidf
]

print('===結果表示===')
for row in texts_tfidf:
    print(row)

===結果表示===
[['a', 0.816496580927726], ['b', 0.408248290463863], ['c', 0.408248290463863]]
[['b', 0.447213595499958], ['c', 0.894427190999916]]
[['a', 0.447213595499958], ['b', 0.894427190999916]]
[['a', 0.447213595499958], ['c', 0.894427190999916]]
[['a', 0.5773502691896257], ['b', 0.5773502691896257], ['c', 0.5773502691896257]]


In [12]:
# Create a single janome tokenizer instance, reused for every article below.
t = Tokenizer()

In [13]:
# Load the news dataset from its CSV file.
# NOTE(review): this is an absolute path, so the cell only runs on a machine
# that has the same /work/data layout.
NEWS_CSV_PATH = '/work/data/BunshuOnline/news.csv'
df = pd.read_csv(NEWS_CSV_PATH)

In [14]:
# Pull the article bodies out of the `news_page_list` column as a plain list.
doc_list = list(df['news_page_list'])

In [15]:
# Will hold one list of tokens per news article.
wakati_list = []

In [16]:
# Tokenise every article into a list of surface strings.
# The list is reset here so this cell is idempotent: when the only
# initialisation lives in another cell, re-running this one appends every
# article a second time.
wakati_list = []
for doc in doc_list:
    # wakati=True asks janome for plain token strings; list() materialises
    # whatever iterable tokenize() returns.
    wakati_list.append(list(t.tokenize(doc, wakati=True)))

In [17]:
# Display the tokenised articles.
# NOTE(review): this echoes the entire nested list, which makes the saved
# output very long; consider previewing a small slice instead.
wakati_list

[['「',
  '例年',
  'なら',
  '『',
  '駅伝',
  'は',
  'トラック',
  'と',
  'は',
  '別物',
  'だ',
  'から',
  '、',
  '1',
  '年生',
  'は',
  '戦力',
  'として',
  'あまり',
  '期待',
  'し',
  'すぎ',
  'ない',
  'ほう',
  'が',
  'いい',
  '』',
  'という',
  '話',
  'が',
  '出る',
  'ん',
  'です',
  '。',
  'でも',
  '、',
  '今年',
  'は',
  'ちょっと',
  '雰囲気',
  'が',
  '違い',
  'ます',
  'ね',
  '」',
  ' ',
  'そんな',
  '風',
  'に',
  '今季',
  'の',
  '驚き',
  'を',
  '語る',
  'の',
  'は',
  '、',
  'スポーツ',
  '紙',
  'の',
  '駅伝',
  '担当',
  '記者',
  'だ',
  '。',
  ' ',
  '春先',
  'から',
  '続く',
  'コロナ',
  '禍',
  'の',
  '中',
  'で',
  '、',
  '今年',
  'は',
  'ここ',
  'まで',
  'スポーツ',
  '界',
  'も',
  '大きな',
  '影響',
  'を',
  '受け',
  'て',
  'き',
  'た',
  '。',
  'それ',
  'は',
  '学生',
  '長距離',
  '界',
  'において',
  'も',
  '同様',
  'で',
  '、',
  '春',
  'から',
  '夏',
  'にかけて',
  '大会',
  'の',
  '中止',
  'は',
  'もちろん',
  '、',
  '記録',
  '会',
  'や',
  '各校',
  'の',
  '練習',
  'に',
  'も',
  '大きな',
  '支障',
  'が',
  '出',
  '続け',
  'て',
  'い',
  'た',
  '。',
  ' ',
  '本来',
  'なら',
  'ば',