In [1]:
import pathlib
import pandas as pd
import MeCab
import sqlite3

In [20]:
conn = sqlite3.connect('raw_speech.db') 
c = conn.cursor()

In [11]:
# https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp
# encoding: utf8
from __future__ import unicode_literals
import re
import unicodedata

def remove_speaker(s):
    sep = " "
    s1 = s.split(sep)[0]
    s2 = sep.join(s.split(sep)[1:])
    if "委員長" in s1 or "議長" in s1:
        return ""
    return s2

def unicode_normalize(cls, s):
    pt = re.compile('([{}]+)'.format(cls))

    def norm(c):
        return unicodedata.normalize('NFKC', c) if pt.match(c) else c

    s = ''.join(norm(x) for x in re.split(pt, s))
    s = re.sub('－', '-', s)
    return s

def remove_extra_spaces(s):
    s = re.sub('[ 　]+', ' ', s)
    blocks = ''.join(('\u4E00-\u9FFF',  # CJK UNIFIED IDEOGRAPHS
                      '\u3040-\u309F',  # HIRAGANA
                      '\u30A0-\u30FF',  # KATAKANA
                      '\u3000-\u303F',  # CJK SYMBOLS AND PUNCTUATION
                      '\uFF00-\uFFEF'   # HALFWIDTH AND FULLWIDTH FORMS
                      ))
    basic_latin = '\u0000-\u007F'

    def remove_space_between(cls1, cls2, s):
        p = re.compile('([{}]) ([{}])'.format(cls1, cls2))
        while p.search(s):
            s = p.sub(r'\1\2', s)
        return s

    s = remove_space_between(blocks, blocks, s)
    s = remove_space_between(blocks, basic_latin, s)
    s = remove_space_between(basic_latin, blocks, s)
    return s

def normalize_neologd(s):
    s = remove_speaker(s)
    s = s.strip()
    s = unicode_normalize('０-９Ａ-Ｚａ-ｚ｡-ﾟ', s)

    def maketrans(f, t):
        return {ord(x): ord(y) for x, y in zip(f, t)}

    s = re.sub('[˗֊‐‑‒–⁃⁻₋−]+', '-', s)  # normalize hyphens
    s = re.sub('[﹣－ｰ—―─━ー]+', 'ー', s)  # normalize choonpus
    s = re.sub('[~∼∾〜〰～]', '', s)  # remove tildes
    s = s.translate(
        maketrans('!"#$%&\'()*+,-./:;<=>?@[¥]^_`{|}~｡､･｢｣',
              '！”＃＄％＆’（）＊＋，－．／：；＜＝＞？＠［￥］＾＿｀｛｜｝〜。、・「」'))

    s = remove_extra_spaces(s)
    s = unicode_normalize('！”＃＄％＆’（）＊＋，－．／：；＜＞？＠［￥］＾＿｀｛｜｝〜', s)  # keep ＝,・,「,」
    s = re.sub('[’]', '\'', s)
    s = re.sub('[”]', '"', s)
    return s

In [12]:
def wakati(df):
    text = df["発言内容"].apply(normalize_neologd)
    mecab = MeCab.Tagger('-d /Users/ayakowatanabe/opt/anaconda3/lib/mecab/dic/mecab-ipadic-neologd')
    mecab_text = [mecab.parse(t).strip() for t in text]

    all = []
    for line in mecab_text:
        morphemes = []
        
        cols = line.split('\n')
        for col in cols:
            c = col.split('\t')
            if(len(c) < 2):
                continue
            res_cols = c[1].split(',')

            morpheme = {
                'surface': c[0],
                'pos': res_cols[0],
                'pos1': res_cols[1],
                'pos2' : res_cols[2],
                'base': res_cols[6]
            }
            morphemes.append(morpheme)
        all.append(morphemes)
    # https://qiita.com/8_hisakichi_8/items/d6894803d2ebabf3d33b
    word_lst = []
    morpheme_lst = []
    for line in all:
        word_row = []
        morpheme_row = []
        for morphemes in line:
            word_row.append(morphemes['surface'])
            morpheme_row.append(morphemes['pos']+morphemes['pos1'])
        word_lst.append(' '.join(word_row))
        morpheme_lst.append(' '.join(morpheme_row))
    return word_lst, morpheme_lst

In [4]:
# Stopwords 使う場合
# #  https://aidemy.net/magazine/688/
# def sloth():
#     import urllib3
#     from bs4 import BeautifulSoup

#     slothlib_path = 'http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt'
#     http = urllib3.PoolManager()
#     #↑urlib3系のおまじない
#     slothlib_file =http.request('GET',slothlib_path)
#     soup=BeautifulSoup(slothlib_file.data,'lxml')
#     soup=str(soup).split()#soupは文字列じゃないので注意
#     return soup

In [21]:
for path in range(2002, 2024):
    csv_path = pathlib.Path(str(path))
    file_lst = sorted([str(path) for path in csv_path.glob('*.csv')])
    print(csv_path)
    for file in file_lst:
        print(file)
        df = pd.read_csv(file, encoding= 'utf-8')
        speakers = [speaker for speaker in df["発言者名"]]
        word_lst, mor_lst = wakati(df)
        df["発言内容"] = pd.Series(word_lst)
        df["形態素"] = pd.Series(mor_lst)
        df = df[df["発言者肩書き"].isnull()]
        df = df[df["発言者名"] != "会議録情報"]
        df = df[df["発言内容"] != '']
        df = df[["発言ID", "院名", "会議名", "号数", "日付", "発言者名", "発言者所属会派", "発言内容", "形態素"]]
        df = df.rename(columns={"発言ID": "id", "院名": "house", "会議名": "committee", "号数": "vol", "日付": "speech_date", \
            "発言者名": "speaker", "発言者所属会派": "party", "発言内容": "speech", "形態素": "morpheme"})
        df.to_sql('raw_speech', con=conn, if_exists='append', index=None)

2002
2002/kokkai_speech_20021.csv
2002/kokkai_speech_200210.csv
2002/kokkai_speech_200211.csv
2002/kokkai_speech_200212.csv
2002/kokkai_speech_20022.csv
2002/kokkai_speech_20023.csv
2002/kokkai_speech_20024.csv
2002/kokkai_speech_20025.csv
2002/kokkai_speech_20026.csv
2002/kokkai_speech_20027.csv
2002/kokkai_speech_20028.csv
2002/kokkai_speech_20029.csv
2003
2003/kokkai_speech_20031.csv
2003/kokkai_speech_200310.csv
2003/kokkai_speech_200311.csv
2003/kokkai_speech_200312.csv
2003/kokkai_speech_20032.csv
2003/kokkai_speech_20033.csv
2003/kokkai_speech_20034.csv
2003/kokkai_speech_20035.csv
2003/kokkai_speech_20036.csv
2003/kokkai_speech_20037.csv
2003/kokkai_speech_20038.csv
2003/kokkai_speech_20039.csv
2004
2004/kokkai_speech_20041.csv
2004/kokkai_speech_200410.csv
2004/kokkai_speech_200411.csv
2004/kokkai_speech_200412.csv
2004/kokkai_speech_20042.csv
2004/kokkai_speech_20043.csv
2004/kokkai_speech_20044.csv
2004/kokkai_speech_20045.csv
2004/kokkai_speech_20046.csv
2004/kokkai_speech_

  df["発言内容"] = pd.Series(word_lst)
  df["形態素"] = pd.Series(mor_lst)


2010/kokkai_speech_201010.csv
2010/kokkai_speech_201011.csv
2010/kokkai_speech_201012.csv
2010/kokkai_speech_20102.csv
2010/kokkai_speech_20103.csv
2010/kokkai_speech_20104.csv
2010/kokkai_speech_20105.csv
2010/kokkai_speech_20106.csv
2010/kokkai_speech_20107.csv
2010/kokkai_speech_20108.csv
2010/kokkai_speech_20109.csv
2011
2011/kokkai_speech_20111.csv
2011/kokkai_speech_201110.csv
2011/kokkai_speech_201111.csv
2011/kokkai_speech_201112.csv
2011/kokkai_speech_20112.csv
2011/kokkai_speech_20113.csv
2011/kokkai_speech_20114.csv
2011/kokkai_speech_20115.csv
2011/kokkai_speech_20116.csv
2011/kokkai_speech_20117.csv
2011/kokkai_speech_20118.csv
2011/kokkai_speech_20119.csv
2012
2012/kokkai_speech_20121.csv
2012/kokkai_speech_201210.csv
2012/kokkai_speech_201211.csv
2012/kokkai_speech_201212.csv
2012/kokkai_speech_20122.csv
2012/kokkai_speech_20123.csv
2012/kokkai_speech_20124.csv
2012/kokkai_speech_20125.csv
2012/kokkai_speech_20126.csv
2012/kokkai_speech_20127.csv
2012/kokkai_speech_20128

  df["発言内容"] = pd.Series(word_lst)
  df["形態素"] = pd.Series(mor_lst)


2015/kokkai_speech_201512.csv
2015/kokkai_speech_20152.csv
2015/kokkai_speech_20153.csv
2015/kokkai_speech_20154.csv
2015/kokkai_speech_20155.csv
2015/kokkai_speech_20156.csv
2015/kokkai_speech_20157.csv
2015/kokkai_speech_20158.csv
2015/kokkai_speech_20159.csv
2016
2016/kokkai_speech_20161.csv
2016/kokkai_speech_201610.csv
2016/kokkai_speech_201611.csv
2016/kokkai_speech_201612.csv
2016/kokkai_speech_20162.csv
2016/kokkai_speech_20163.csv
2016/kokkai_speech_20164.csv
2016/kokkai_speech_20165.csv
2016/kokkai_speech_20166.csv
2016/kokkai_speech_20167.csv


  df["発言内容"] = pd.Series(word_lst)
  df["形態素"] = pd.Series(mor_lst)


2016/kokkai_speech_20168.csv
2016/kokkai_speech_20169.csv
2017
2017/kokkai_speech_20171.csv
2017/kokkai_speech_201710.csv
2017/kokkai_speech_201711.csv


  df["発言内容"] = pd.Series(word_lst)
  df["形態素"] = pd.Series(mor_lst)


2017/kokkai_speech_201712.csv
2017/kokkai_speech_20172.csv
2017/kokkai_speech_20173.csv
2017/kokkai_speech_20174.csv
2017/kokkai_speech_20175.csv
2017/kokkai_speech_20176.csv
2017/kokkai_speech_20177.csv
2017/kokkai_speech_20178.csv
2017/kokkai_speech_20179.csv
2018
2018/kokkai_speech_20181.csv
2018/kokkai_speech_201810.csv
2018/kokkai_speech_201811.csv
2018/kokkai_speech_201812.csv
2018/kokkai_speech_20182.csv
2018/kokkai_speech_20183.csv
2018/kokkai_speech_20184.csv
2018/kokkai_speech_20185.csv
2018/kokkai_speech_20186.csv
2018/kokkai_speech_20187.csv
2018/kokkai_speech_20188.csv
2018/kokkai_speech_20189.csv
2019
2019/kokkai_speech_20191.csv


  df["発言内容"] = pd.Series(word_lst)
  df["形態素"] = pd.Series(mor_lst)


2019/kokkai_speech_201910.csv
2019/kokkai_speech_201911.csv
2019/kokkai_speech_201912.csv
2019/kokkai_speech_20192.csv
2019/kokkai_speech_20193.csv
2019/kokkai_speech_20194.csv
2019/kokkai_speech_20195.csv
2019/kokkai_speech_20196.csv
2019/kokkai_speech_20197.csv
2019/kokkai_speech_20198.csv
2019/kokkai_speech_20199.csv
2020
2020/kokkai_speech_20201.csv


  df["発言内容"] = pd.Series(word_lst)
  df["形態素"] = pd.Series(mor_lst)
  df["発言内容"] = pd.Series(word_lst)
  df["形態素"] = pd.Series(mor_lst)


2020/kokkai_speech_202010.csv
2020/kokkai_speech_202011.csv
2020/kokkai_speech_202012.csv
2020/kokkai_speech_20202.csv
2020/kokkai_speech_20203.csv
2020/kokkai_speech_20204.csv
2020/kokkai_speech_20205.csv
2020/kokkai_speech_20206.csv
2020/kokkai_speech_20207.csv
2020/kokkai_speech_20208.csv
2020/kokkai_speech_20209.csv
2021
2021/kokkai_speech_20211.csv
2021/kokkai_speech_202110.csv
2021/kokkai_speech_202111.csv
2021/kokkai_speech_202112.csv
2021/kokkai_speech_20212.csv
2021/kokkai_speech_20213.csv
2021/kokkai_speech_20214.csv
2021/kokkai_speech_20215.csv
2021/kokkai_speech_20216.csv
2021/kokkai_speech_20217.csv
2021/kokkai_speech_20218.csv
2021/kokkai_speech_20219.csv
2022
2022/kokkai_speech_20221.csv
2022/kokkai_speech_202210.csv
2022/kokkai_speech_202211.csv
2022/kokkai_speech_202212.csv
2022/kokkai_speech_20222.csv
2022/kokkai_speech_20223.csv
2022/kokkai_speech_20224.csv
2022/kokkai_speech_20225.csv
2022/kokkai_speech_20226.csv
2022/kokkai_speech_20227.csv
2022/kokkai_speech_20228

  df["発言内容"] = pd.Series(word_lst)
  df["形態素"] = pd.Series(mor_lst)


2022/kokkai_speech_20229.csv
2023
2023/kokkai_speech_20231.csv
2023/kokkai_speech_20232.csv
2023/kokkai_speech_20233.csv


In [32]:
conn.close()