In [38]:
import glob
import os
import re
import sqlite3
from collections import Counter
from bs4 import BeautifulSoup
from tqdm import tqdm
import MeCab
from pathlib import Path

In [39]:
# Glob pattern selecting the Wikipedia text files to process
# (relative to the notebook's working directory).
WIKI_FILES_DIRECTORY = '../data/wikipedia/*/*'
# Absolute path of the working directory at the time this cell is executed.
ABSOLUTE_PATH = Path().resolve()
# Name of the SQLite database file that receives the words/sentences tables.
DB_FILE = 'corpus_data.db'

In [40]:
class WikiProcessing():
    '''Helpers for locating Wikipedia text files and reading their text.'''

    # Matches one "<...>" span on a single line in which "doc" never occurs.
    # Closing tags such as "</a>" are covered too, because "/" is an
    # ordinary character inside the span.
    _NON_DOC_TAG = re.compile(r'\<((?!doc).)*?\>')

    def get_wikifiles(self, wiki_files_directory):
        '''Return the paths matching a glob pattern, in sorted order.

        Args:
            wiki_files_directory: glob pattern such as '../data/wikipedia/*/*'.

        Returns:
            A list of matching path strings. The list is sorted because
            ``glob.glob`` makes no ordering guarantee, and the processing
            order decides which id each word later receives.
        '''
        return sorted(glob.glob(wiki_files_directory))

    def separate_paragraph_array(self, document):
        '''Read a file object and return the text of its <doc> elements.

        Args:
            document: an open text file object; it is read to the end.

        Returns:
            A list of strings obtained by concatenating the text of every
            <doc> element and splitting the result on newlines.
        '''
        contents = document.read()
        # Strip every tag except the <doc ...> / </doc> markers.
        contents = self._NON_DOC_TAG.sub('', contents)
        # Wrap everything in a single root element; without it the XML
        # parser only sees the first <doc> element.
        contents = '<docs>\n' + contents + '</docs>'
        soup = BeautifulSoup(contents, "xml")

        wiki_text = ''.join(
            wiki_item.get_text() for wiki_item in soup.find_all('doc')
        )
        return wiki_text.split('\n')

In [41]:
class DbModel:
    '''Wrapper around the SQLite database holding the corpus tables.

    Opening a ``DbModel`` connects to ``db_file`` and makes sure the
    ``words`` and ``sentences`` tables exist, so the same code path works for
    a new file, an existing database, an empty file and ``':memory:'``.
    '''
    def __init__(self, db_file):
        '''Connect to ``db_file`` and create the tables if they are missing.

        Args:
            db_file: path handed to ``sqlite3.connect``.
        '''
        self.db_file = db_file
        self._init_process()
        self._create_table()

    def _init_process(self):
        '''Open the connection and a cursor, stored as ``conn`` and ``cur``.'''
        self.conn = sqlite3.connect(self.db_file)
        self.cur = self.conn.cursor()

    def _create_table(self):
        '''Create the ``words`` and ``sentences`` tables when absent.

        Text columns are declared TEXT: a column declared STRING gets NUMERIC
        affinity in SQLite, which turns numeric-looking text into numbers.

        Returns:
            The ``(connection, cursor)`` pair.
        '''
        self.cur.execute('''CREATE TABLE IF NOT EXISTS words (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            word TEXT,
            frequency INTEGER
        )''')
        self.cur.execute('''CREATE TABLE IF NOT EXISTS sentences (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            file_name TEXT,
            word_sentence TEXT,
            index_sentence TEXT
        )''')
        return self.conn, self.cur

    def select_all_records_words_table(self):
        '''Return every row of ``words`` as ``(id, word, frequency)`` tuples.'''
        select_word_sql = 'SELECT * FROM words'
        self.cur.execute(select_word_sql)
        return self.cur.fetchall()

    def insert_records_words_table(self, words_frequency):
        '''Insert one ``words`` row per (word, frequency) pair.

        Args:
            words_frequency: a mapping (e.g. ``Counter``) from word to count,
                or anything else ``dict()`` accepts.

        The rows are not committed here; see ``close_connection``.
        '''
        insert_word_sql = 'INSERT INTO words (word, frequency) values (?,?)'
        inserted_info = list(dict(words_frequency).items())
        self.cur.executemany(insert_word_sql, inserted_info)

    def insert_records_sentences_table(self, inserted_info):
        '''Insert rows into ``sentences``.

        Args:
            inserted_info: iterable of
                ``(file_name, word_sentence, index_sentence)`` tuples.

        The rows are not committed here; see ``close_connection``.
        '''
        insert_sentence_sql = '''
            INSERT INTO sentences (
                file_name, word_sentence, index_sentence
            ) values (?,?,?)
        '''
        self.cur.executemany(insert_sentence_sql, inserted_info)

    def close_connection(self):
        '''Commit pending changes and close the connection.'''
        self.conn.commit()
        self.conn.close()

In [42]:
class WordProcessing():
    '''Counts word frequencies and converts paragraphs to word-id sequences.'''

    # A token is kept only if it contains at least one hiragana, katakana or
    # kanji character from these ranges.
    _JAPANESE_PATTERN = re.compile(r'[ぁ-んァ-ヶ一-龥]+')

    def __init__(self, corpus, docs_files):
        '''Store the inputs and create the MeCab tagger.

        Args:
            corpus: object providing ``separate_paragraph_array(file_obj)``.
            docs_files: list of paths of the documents to process.
        '''
        self.corpus = corpus
        self.docs_files = docs_files
        self.file_num = len(docs_files)
        # Original author's note: Ochasen is used as the parser so that base
        # forms rather than surface forms can be used.
        self.tagger = MeCab.Tagger('-Ochasen')

    def _read_paragraphs(self, docs_file):
        '''Return the paragraphs of one document file.'''
        # Explicit UTF-8 so that reading does not depend on the platform's
        # default encoding (assumes the dump files are UTF-8 - TODO confirm).
        with open(docs_file, encoding='utf-8') as doc:
            return self.corpus.separate_paragraph_array(doc)

    def _iter_japanese_base_forms(self, paragraph_sentence):
        '''Yield the base form of every Japanese token in a paragraph.'''
        node = self.tagger.parseToNode(paragraph_sentence)
        while node:
            # node.feature is a comma-separated list: [part of speech,
            # subclass 1, subclass 2, subclass 3, conjugation type,
            # conjugation form, base form, reading, pronunciation].
            # Index 6 is the base form, used instead of the surface form.
            word = node.feature.split(",")[6]
            if self._JAPANESE_PATTERN.search(word):
                yield word
            node = node.next

    def extract_words_fequency(self):
        '''Count how often each Japanese base-form word occurs.

        Returns:
            A ``Counter`` mapping word to occurrence count over all files in
            ``self.docs_files``, with keys in order of first appearance.
        '''
        words_frequency = Counter()
        progress = tqdm(
            self.docs_files, total=self.file_num,
            desc='extract words fequency')
        for docs_file in progress:
            for paragraph_sentence in self._read_paragraphs(docs_file):
                words_frequency.update(
                    self._iter_japanese_base_forms(paragraph_sentence))
        return words_frequency

    def _create_word2idx_dict(self, words_info):
        '''Build a word -> id dict from ``(id, word, ...)`` rows.'''
        return {word_info[1]: word_info[0] for word_info in words_info}

    def transfer_sentence_word2idx(self, words_info, db_model):
        '''Store every paragraph together with its word-id sequence.

        Args:
            words_info: rows whose first two items are ``(id, word)``.
            db_model: object providing
                ``insert_records_sentences_table(rows)``.

        For each file, one ``(file path, paragraph, ids)`` row is inserted per
        paragraph that contains at least one Japanese word. ``ids`` is the
        word ids in order, each followed by ``', '``.

        Raises:
            KeyError: if a word found in a paragraph is missing from
                ``words_info``.
        '''
        word_stoi = self._create_word2idx_dict(words_info)

        progress = tqdm(
            self.docs_files, total=self.file_num,
            desc='transfer sentence word2idx')
        for wiki_file in progress:
            inserted_info = []
            for paragraph_sentence in self._read_paragraphs(wiki_file):
                paragraph_words = ''.join(
                    str(word_stoi[word]) + ', '
                    for word in
                    self._iter_japanese_base_forms(paragraph_sentence)
                )
                # Keep only paragraphs that produced at least one word id.
                if paragraph_words:
                    inserted_info.append((
                        wiki_file, paragraph_sentence, paragraph_words
                    ))

            # Register this file's sentences in the database.
            db_model.insert_records_sentences_table(inserted_info)

In [43]:
wiki = WikiProcessing()

# Collect the paths of the Wikipedia files.
wiki_files = wiki.get_wikifiles(WIKI_FILES_DIRECTORY)

# Count the words that appear in Wikipedia and how often each one occurs.
wiki_word_processing = WordProcessing(wiki, wiki_files)
wiki_words_frequency = wiki_word_processing.extract_words_fequency()

# NOTE: rows are appended with INSERT, so running this cell against a database
# file that already holds data adds the words and sentences a second time.
db_model = DbModel(DB_FILE)
try:
    # Store the words and their frequencies in the database.
    db_model.insert_records_words_table(wiki_words_frequency)

    # Store the text as sequences of word ids in the database.
    words_info = db_model.select_all_records_words_table()
    wiki_word_processing.transfer_sentence_word2idx(words_info, db_model)
except BaseException:
    # Discard the partial inserts and release the database file; this also
    # covers an interrupted kernel (KeyboardInterrupt).
    db_model.conn.rollback()
    db_model.conn.close()
    raise
else:
    db_model.close_connection()

extract words fequency: 100%|██████████| 100/100 [00:32<00:00,  2.53it/s]
transfer sentence word2idx: 100%|██████████| 100/100 [00:32<00:00,  3.13it/s]
