In [6]:
import gensim
import MeCab
import os
import glob
import pickle

In [7]:
# Directory that holds the prepared input data.
root_dir = './jawiki_data/'
# Common path prefix shared by the numbered input files.
infile_base = '%sjawiki-latest-pages-articles-' % root_dir
# Originally labelled "number of articles"; used as the highest input-file number.
N_ARTICLES = 551
# Separator placed between articles in the tokenized output files.
MARK_ARTICLE = 20 * '='
# Fraction of the articles to use (30% only, to save time).
sample_ratio = 0.3

# File-name suffixes of the generated files.
SUFFIX_SPLIT = '_split_v2.txt'
SUFFIX_TOPIC = '_topicid_v2.txt'

In [10]:
# Alternative dictionary location that was used on another machine:
#     /opt/local/lib/mecab/dic/mecab-ipadic-neologd
_MECAB_ARGS = "-Ochasen -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd"
_KEPT_POS = ('名詞', '形容詞', '動詞')  # noun, adjective, verb
_default_tagger = None


def _get_default_tagger():
    """Create the shared MeCab tagger on first use and return it afterwards."""
    global _default_tagger
    if _default_tagger is None:
        tagger = MeCab.Tagger(_MECAB_ARGS)
        # Kept from the original code; presumably the usual mecab-python
        # workaround of parsing once before parseToNode is used.
        tagger.parse('')
        _default_tagger = tagger
    return _default_tagger


def ja_tokenize(text, tagger=None):
    """Tokenize Japanese text into base forms of nouns, adjectives and verbs.

    The text is processed line by line. For every morpheme whose first
    feature field (part of speech) is one of '名詞', '形容詞' or '動詞', the
    base form is appended to the result; a '\\n' entry is appended after each
    input line.

    Args:
        text: String to tokenize; may contain several '\\n'-separated lines.
        tagger: Optional object exposing ``parseToNode(str)`` that returns a
            linked list of nodes with ``feature``, ``surface`` and ``next``
            attributes. When omitted, one shared ``MeCab.Tagger`` is created
            lazily and reused, instead of loading the dictionary on each call.

    Returns:
        A flat list of strings: kept base forms interleaved with '\\n' markers.
    """
    mecab = tagger if tagger is not None else _get_default_tagger()
    results = []
    for line in text.split('\n'):
        node = mecab.parseToNode(line)
        while node:
            features = node.feature.split(',')
            # 'BOS/EOS' sentinel nodes are skipped implicitly: their first
            # field is never one of the kept parts of speech.
            if features[0] in _KEPT_POS:
                # A full feature string has 9 fields with the base form third
                # from the end. Shorter strings (as MeCab reports for words it
                # does not know) carry no base form, so indexing from the end
                # would pick an unrelated field.
                base = features[-3] if len(features) >= 9 else '*'
                if base == '*':
                    # No base form available: fall back to the surface text.
                    base = node.surface
                results.append(base)
            node = node.next
        results.append('\n')
    return results

def to_article_words(nums):
    """Tokenize the numbered input files and write the per-file word files.

    For every number ``i`` in ``nums`` the file ``infile_base + '%03d' % i +
    '.txt'`` is read, lower-cased, stripped of some link markup, split into
    articles on the '\\n\\n[[' delimiter and tokenized with ``ja_tokenize``.
    The tokenized articles of one input file are joined with ``MARK_ARTICLE``
    and written to the matching ``SUFFIX_SPLIT`` file. In addition, one index
    file (named after the first and last number, with ``SUFFIX_TOPIC``)
    receives a '<file no>-<article no>: <first line of the article>' line per
    article.

    Args:
        nums: Iterable of input-file numbers. It is materialised into a list,
            so generators are accepted; an empty iterable is a no-op.

    Returns:
        None. The results are written to disk.
    """
    nums = list(nums)
    if not nums:
        # Nothing to process; nums[0] / nums[-1] would raise on an empty list.
        return

    fname_topicid =\
        infile_base + '%03d-%03d' % (nums[0], nums[-1]) + SUFFIX_TOPIC
    # Context managers close every file even if tokenization fails midway.
    with open(fname_topicid, 'w') as g:
        for i in nums:
            path = infile_base + '%03d' % i + '.txt'
            print(path)
            file_split = path.replace('.txt', SUFFIX_SPLIT)

            with open(path, 'r') as f:
                text = f.read()
            text = text.lower()
            # Drop the '[[' in front of file/image links; presumably so that
            # they cannot be mistaken for the '\n\n[[' article delimiter.
            for word in ['file:', 'ファイル:', 'image:', '画像:']:
                text = text.replace('[[' + word, word)
            text = text.replace(']]', '').replace(':', ' ')

            # Collect the articles and join once, instead of growing one
            # large string with repeated '+='.
            tokenized_articles = []
            for j, s in enumerate(text.split('\n\n[[')):
                tokenized_articles.append(' '.join(ja_tokenize(s)))
                g.write('%d-%d: %s\n' % (i, j+1, s.split('\n')[0].replace('[', '').strip()))
            with open(file_split, 'w') as out:
                out.write(MARK_ARTICLE.join(tokenized_articles))

In [None]:
# Convert the raw article text into tokenized word files.
nums = range(1, 1 + N_ARTICLES)
to_article_words(nums)

./jawiki_data/jawiki-latest-pages-articles-001.txt
./jawiki_data/jawiki-latest-pages-articles-002.txt
./jawiki_data/jawiki-latest-pages-articles-003.txt
./jawiki_data/jawiki-latest-pages-articles-004.txt
./jawiki_data/jawiki-latest-pages-articles-005.txt
./jawiki_data/jawiki-latest-pages-articles-006.txt
./jawiki_data/jawiki-latest-pages-articles-007.txt
./jawiki_data/jawiki-latest-pages-articles-008.txt
./jawiki_data/jawiki-latest-pages-articles-009.txt
./jawiki_data/jawiki-latest-pages-articles-010.txt
./jawiki_data/jawiki-latest-pages-articles-011.txt
./jawiki_data/jawiki-latest-pages-articles-012.txt
./jawiki_data/jawiki-latest-pages-articles-013.txt
./jawiki_data/jawiki-latest-pages-articles-014.txt
./jawiki_data/jawiki-latest-pages-articles-015.txt
./jawiki_data/jawiki-latest-pages-articles-016.txt
./jawiki_data/jawiki-latest-pages-articles-017.txt
./jawiki_data/jawiki-latest-pages-articles-018.txt
./jawiki_data/jawiki-latest-pages-articles-019.txt
./jawiki_data/jawiki-latest-pag

In [None]:
# Load the tokenized words of a random sample of the articles.
import random
random.seed(0)  # fixed seed so that the same articles are sampled on every run

splitfiles = sorted(glob.glob(root_dir + '*' + SUFFIX_SPLIT))

docs = []
j = 0  # running id of the sampled document
# Context managers guarantee that the index file and every input file are
# closed, even if reading or sampling fails part-way through.
with open(infile_base + 'resample_docid_v2.txt', 'w') as g:
    for i, file_split in enumerate(splitfiles):
        print(file_split)
        with open(file_split, 'r') as f:
            texts0 = f.read().split(MARK_ARTICLE)
        n = len(texts0)
        # Keep sample_ratio of the articles of this file, in their original order.
        indices = sorted(random.sample(range(n), int(sample_ratio * n)))
        for idx in indices:
            docs.append(texts0[idx].split())
            # '<doc id>: <file position>-<article position>', both 1-based.
            # NOTE(review): i+1 equals the input-file number only if the sorted
            # split files are numbered consecutively from 001 - confirm.
            g.write('%d: %d-%d\n' % (j, i+1, idx+1))
            j += 1

In [None]:
# Build the dictionary from the per-article word lists.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it is a single
# call with no gap between the check and the creation.
os.makedirs('./jawiki_model', exist_ok=True)
dictionary = gensim.corpora.Dictionary(docs)
dictionary.save_as_text('./jawiki_model/jawiki_wordid_resample_no-filtered_v2.txt')

# Remove the less important words (thresholds: no_below=20, no_above=0.3;
# keep_n=None puts no cap on the dictionary size), then save the filtered
# dictionary under a separate name.
dictionary.filter_extremes(no_below=20, no_above=0.3, keep_n=None)
dictionary.save_as_text('./jawiki_model/jawiki_wordid_resample_v2.txt')

In [None]:
# Build the word-based representation (corpus) of every article:
# one doc2bow result per sampled document, in the order of docs.
corpus = list(map(dictionary.doc2bow, docs))
gensim.corpora.MmCorpus.serialize('./jawiki_model/jawiki_bow_resample_v2.mm', corpus)

In [None]:
# Apply TF-IDF weighting to the corpus.
tfidf = gensim.models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Persist the weighted corpus with pickle.
with open('./jawiki_model/jawiki_tfidf_resample_v2.dump', 'wb') as f:
    pickle.dump(corpus_tfidf, f)

In [None]:
# Fit an LDA model on the corpus created above.
NUM_TOPICS = 100  # the number of topics was set to 100
# Fixed seed for the model's random state, so that runs are repeatable as far
# as the library allows (multi-worker training may still vary between runs).
LDA_RANDOM_STATE = 0

dictionary = gensim.corpora.Dictionary.load_from_text('./jawiki_model/jawiki_wordid_resample_v2.txt')
# NOTE: pickle.load can execute arbitrary code; only load dumps that were
# written by this notebook.
with open('./jawiki_model/jawiki_tfidf_resample_v2.dump', mode='rb') as f:
    corpus_tfidf = pickle.load(f)

lda = gensim.models.ldamulticore.LdaMulticore(corpus=corpus_tfidf, id2word=dictionary,
                                               num_topics=NUM_TOPICS, workers=3, minimum_probability=0.001,
                                               passes=20, chunksize=10000,
                                               random_state=LDA_RANDOM_STATE)
lda.save('./jawiki_model/jawiki_lda_v2.model')