In [1]:
import pathlib
import pandas as pd
import MeCab
import sqlite3

In [2]:
# https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp
# encoding: utf8
from __future__ import unicode_literals
import re
import unicodedata

def remove_speaker(s):
    """Drop the leading token of ``s`` and return the remainder.

    Everything before the first half-width space is treated as the leading
    token (presumably the speaker label -- confirm against the input data).
    If that token contains "委員長" or "議長" the whole string is discarded
    and "" is returned; otherwise the text after the first space is returned
    ("" when there is no space at all).
    """
    head, _, rest = s.partition(" ")
    if any(title in head for title in ("委員長", "議長")):
        return ""
    return rest

def unicode_normalize(cls, s):
    """Apply NFKC normalization only to runs of characters listed in ``cls``.

    ``cls`` is the body of a regex character class (it is interpolated into
    ``[...]``).  ``s`` is split on maximal runs of those characters; each run
    is NFKC-normalized and every other piece is kept as is.  Finally every
    full-width hyphen-minus "－" in the result is replaced with ASCII "-".
    """
    run_pattern = re.compile('([{}]+)'.format(cls))

    pieces = []
    for chunk in run_pattern.split(s):
        if run_pattern.match(chunk):
            chunk = unicodedata.normalize('NFKC', chunk)
        pieces.append(chunk)

    return ''.join(pieces).replace('－', '-')

def remove_extra_spaces(s):
    """Collapse runs of spaces and drop single spaces next to CJK text.

    Runs of half-width / ideographic spaces are first collapsed to one
    half-width space.  Then a single space is removed wherever it sits
    between two characters of the CJK blocks below, between a CJK character
    and a Basic Latin character, or between a Basic Latin character and a
    CJK character (in that order).  Spaces between two Basic Latin
    characters are kept.
    """
    cjk_blocks = (
        '\u4E00-\u9FFF'  # CJK UNIFIED IDEOGRAPHS
        '\u3040-\u309F'  # HIRAGANA
        '\u30A0-\u30FF'  # KATAKANA
        '\u3000-\u303F'  # CJK SYMBOLS AND PUNCTUATION
        '\uFF00-\uFFEF'  # HALFWIDTH AND FULLWIDTH FORMS
    )
    latin = '\u0000-\u007F'

    result = re.sub('[ 　]+', ' ', s)

    for left, right in ((cjk_blocks, cjk_blocks),
                        (cjk_blocks, latin),
                        (latin, cjk_blocks)):
        gap = re.compile('([{}]) ([{}])'.format(left, right))
        # One pass cannot remove two spaces that share a neighbour
        # ("あ い う"), so repeat until a pass changes nothing.
        changed = 1
        while changed:
            result, changed = gap.subn(r'\1\2', result)

    return result

def normalize_neologd(s):
    """Normalize one utterance before it is handed to MeCab.

    Steps, in order:
      1. ``remove_speaker`` then ``str.strip``.
      2. NFKC-normalize full-width digits/letters and half-width kana.
      3. Unify hyphen variants to "-", long-vowel-mark variants to "ー",
         and delete tilde variants.
      4. Map the listed half-width punctuation to full-width counterparts.
      5. ``remove_extra_spaces``.
      6. NFKC-normalize the listed full-width punctuation again
         (＝, ・, 「, 」 are not in that list and stay full-width).
      7. Replace ’ with ' and ” with ".
    """
    text = remove_speaker(s).strip()
    text = unicode_normalize('０-９Ａ-Ｚａ-ｚ｡-ﾟ', text)

    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        ('[˗֊‐‑‒–⁃⁻₋−]+', '-'),   # normalize hyphens
        ('[﹣－ｰ—―─━ー]+', 'ー'),  # normalize choonpus
        ('[~∼∾〜〰～]', ''),       # remove tildes
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    narrow = '!"#$%&\'()*+,-./:;<=>?@[¥]^_`{|}~｡､･｢｣'
    wide = '！”＃＄％＆’（）＊＋，－．／：；＜＝＞？＠［￥］＾＿｀｛｜｝〜。、・「」'
    text = text.translate(dict(zip(map(ord, narrow), map(ord, wide))))

    text = remove_extra_spaces(text)
    text = unicode_normalize('！”＃＄％＆’（）＊＋，－．／：；＜＞？＠［￥］＾＿｀｛｜｝〜', text)  # keep ＝,・,「,」
    text = text.replace('’', '\'')
    text = text.replace('”', '"')
    return text

In [3]:
def wakati(df, stop_words, speakers,
           dic_path='/Users/ayakowatanabe/opt/anaconda3/lib/mecab/dic/mecab-ipadic-neologd'):
    """Tokenize every utterance in ``df`` and keep only the content nouns.

    Each value of ``df["発言内容"]`` is normalized with ``normalize_neologd``
    and parsed by MeCab.  A morpheme's surface form is kept when all of the
    following hold:
      * its part of speech is "名詞" (noun);
      * its first sub-category is none of 非自立 / 代名詞 / 数 / 副詞可能 / 接尾;
      * its second sub-category is not "人名" (person name);
      * the surface is in neither ``stop_words`` nor ``speakers``.

    Args:
        df: DataFrame with a "発言内容" column of strings.
        stop_words: iterable of surface forms to drop.
        speakers: iterable of speaker names to drop (non-string entries such
            as NaN are harmless; they simply never match a surface).
        dic_path: MeCab dictionary directory passed to ``-d``.  The default
            is the path that used to be hard-coded; override it on other
            machines.

    Returns:
        list[str]: one entry per row of ``df``, the kept surfaces joined by a
        single space ("" when nothing is kept).
    """
    # see https://qiita.com/8_hisakichi_8/items/d6894803d2ebabf3d33b
    excluded_pos1 = {"非自立", "代名詞", "数", "副詞可能", "接尾"}
    # One set lookup per morpheme instead of two linear list scans.
    blocked = set(stop_words) | set(speakers)

    mecab = MeCab.Tagger('-d {}'.format(dic_path))

    word_lst = []
    for text in df["発言内容"].apply(normalize_neologd):
        kept = []
        for row in mecab.parse(text).strip().split('\n'):
            fields = row.split('\t')
            if len(fields) < 2:
                # "EOS" marker or an empty line: no feature column.
                continue
            surface = fields[0]
            features = fields[1].split(',')
            # Pad so a short feature list cannot raise IndexError.
            pos, pos1, pos2 = (features + ['*', '*', '*'])[:3]

            if pos != '名詞':
                continue
            if pos1 in excluded_pos1 or pos2 == "人名":
                continue
            if surface in blocked:
                continue
            kept.append(surface)
        word_lst.append(' '.join(kept))
    return word_lst

In [4]:
#  https://aidemy.net/magazine/688/
def sloth(slothlib_path='http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt'):
    """Download the SlothLib Japanese stop-word list.

    The resource is a plain-text file, so the response body is decoded
    directly instead of being run through an HTML parser.  Parsing it as
    HTML and stringifying the result wraps the text in markup, which glues
    tags onto the first and last words after whitespace splitting.

    Args:
        slothlib_path: URL of the word list; defaults to the SlothLib file.

    Returns:
        list[str]: the whitespace-separated words of the file.

    Raises:
        RuntimeError: if the server does not answer with HTTP 200, so an
            error page is never mistaken for a stop-word list.
    """
    import urllib3

    http = urllib3.PoolManager()
    response = http.request('GET', slothlib_path)
    if response.status != 200:
        raise RuntimeError(
            'failed to download stop words from {} (HTTP {})'.format(
                slothlib_path, response.status))

    # utf-8-sig also strips a leading BOM if the file has one.
    # NOTE(review): the file is assumed to be UTF-8 -- confirm.
    return response.data.decode('utf-8-sig').split()

In [5]:
def insert_sql(df, db_path='speech.db', table='speech'):
    """Append the rows of ``df`` to a table in a SQLite database.

    Args:
        df: DataFrame to store; its index is not written.
        db_path: SQLite database file (created if missing).
        table: destination table; rows are appended if it already exists.

    The connection is always closed, even when the insert fails, so calling
    this once per input file does not leak open database handles.
    """
    conn = sqlite3.connect(db_path)
    try:
        df.to_sql(table, con=conn, if_exists='append', index=False)
        conn.commit()
    finally:
        conn.close()

In [6]:
# Fetch the stop-word list once; it is passed to wakati() for every CSV file.
STOPWORDS = sloth()

In [11]:
# Source CSV column -> column name stored in the database (order is kept).
COLUMN_MAP = {
    "発言ID": "id",
    "院名": "house",
    "会議名": "committee",
    "号数": "vol",
    "日付": "speech_date",
    "発言者名": "speaker",
    "発言者所属会派": "party",
    "発言内容": "speech",
}

# One directory per year ("2002" ... "2022"), each holding the CSV files.
for year in range(2002, 2023):
    csv_dir = pathlib.Path(str(year))
    for file in sorted(str(p) for p in csv_dir.glob('*.csv')):
        df = pd.read_csv(file, encoding='utf-8')
        speakers = list(df["発言者名"])
        word_lst = wakati(df, STOPWORDS, speakers)
        # Replace the raw utterance with its space-joined nouns.
        df["発言内容"] = pd.Series(word_lst)

        # Keep rows with no speaker title, drop the "会議録情報" pseudo
        # speaker, and drop utterances that ended up empty.
        keep = (
            df["発言者肩書き"].isnull()
            & (df["発言者名"] != "会議録情報")
            & (df["発言内容"] != '')
        )
        df = df.loc[keep, list(COLUMN_MAP)].rename(columns=COLUMN_MAP)
        insert_sql(df)

  df["発言内容"] = pd.Series(word_lst)
  df["発言内容"] = pd.Series(word_lst)
