In [16]:
!cat ai.ja/ai.ja.txt | cabocha -f1 > ai.ja.txt.parsed

In [14]:
!cat neko.txt | cabocha -f1 > neko.txt.cabocha

# No40

In [17]:
class Morph:
    def __init__(self, dc):
        self.surface = dc['surface']
        self.base = dc['base']
        self.pos = dc['pos']
        self.pos1 = dc['pos1']


def parseCabocha(block):
    res = []
    for line in block.split('\n'):
        if line == '':
            return res
        elif line[0] == '*':
            continue
        (surface, attr) = line.split('\t')
        attr = attr.split(',')
        lineDict = {
            'surface': surface,
            'base': attr[6],
            'pos': attr[0],
            'pos1': attr[1]
        }
        res.append(Morph(lineDict))


filename = 'ai.ja.txt.parsed'
with open(filename, mode='rt', encoding='utf-8') as f:
    blockList = f.read().split('EOS\n')
blockList = list(filter(lambda x: x != '', blockList))
blockList = [parseCabocha(block) for block in blockList]
for m in blockList[2]:
    print(vars(m))

{'surface': '『', 'base': '『', 'pos': '記号', 'pos1': '括弧開'}
{'surface': '日本', 'base': '日本', 'pos': '名詞', 'pos1': '固有名詞'}
{'surface': '大', 'base': '大', 'pos': '接頭詞', 'pos1': '名詞接続'}
{'surface': '百科全書', 'base': '百科全書', 'pos': '名詞', 'pos1': '一般'}
{'surface': '(', 'base': '*', 'pos': '名詞', 'pos1': 'サ変接続'}
{'surface': 'ニッポニカ', 'base': '*', 'pos': '名詞', 'pos1': '一般'}
{'surface': ')』', 'base': '*', 'pos': '名詞', 'pos1': 'サ変接続'}
{'surface': 'の', 'base': 'の', 'pos': '助詞', 'pos1': '連体化'}
{'surface': '解説', 'base': '解説', 'pos': '名詞', 'pos1': 'サ変接続'}
{'surface': 'で', 'base': 'で', 'pos': '助詞', 'pos1': '格助詞'}
{'surface': '、', 'base': '、', 'pos': '記号', 'pos1': '読点'}
{'surface': '情報', 'base': '情報', 'pos': '名詞', 'pos1': '一般'}
{'surface': '工学', 'base': '工学', 'pos': '名詞', 'pos1': '一般'}
{'surface': '者', 'base': '者', 'pos': '名詞', 'pos1': '接尾'}
{'surface': '・', 'base': '・', 'pos': '記号', 'pos1': '一般'}
{'surface': '通信', 'base': '通信', 'pos': '名詞', 'pos1': 'サ変接続'}
{'surface': '工学', 'base': '工学', 'pos': '名詞', 'pos1'

# No41

In [18]:
class Morph:
    def __init__(self, dc):
        self.surface = dc['surface']
        self.base = dc['base']
        self.pos = dc['pos']
        self.pos1 = dc['pos1']


class Chunk:
    def __init__(self, morphs, dst):
        self.morphs = morphs    # 形態素（Morphオブジェクト）のリスト
        self.dst = dst          # 係り先文節インデックス番号
        self.srcs = []          # 係り元文節インデックス番号のリスト


def parseCabocha(block):
    def checkCreateChunk(tmp):
        if len(tmp) > 0:
            c = Chunk(tmp, dst)
            res.append(c)
            tmp = []
        return tmp

    res = []
    tmp = []
    dst = None
    for line in block.split('\n'):
        if line == '':
            tmp = checkCreateChunk(tmp)
        elif line[0] == '*':
            dst = line.split(' ')[2].rstrip('D')
            tmp = checkCreateChunk(tmp)
        else:
            (surface, attr) = line.split('\t')
            attr = attr.split(',')
            lineDict = {
                'surface': surface,
                'base': attr[6],
                'pos': attr[0],
                'pos1': attr[1]
            }
            tmp.append(Morph(lineDict))

    for i, r in enumerate(res):
        res[int(r.dst)].srcs.append(i)
    return res


filename = 'ai.ja.txt.parsed'
with open(filename, mode='rt', encoding='utf-8') as f:
    blockList = f.read().split('EOS\n')
blockList = list(filter(lambda x: x != '', blockList))
blockList = [parseCabocha(block) for block in blockList]
for m in blockList[7]:
    print([mo.surface for mo in m.morphs], m.dst, m.srcs)

['一方', '、'] 2 []
['計算', '知能'] 12 []
['（', 'CI', '）', 'は'] 4 [0]
['開発', 'や'] 5 []
['学習', 'を'] 6 [2]
['繰り返す'] 8 [3]
['こと', 'を'] 8 [4]
['基本', 'と'] 9 []
['し', 'て', 'いる'] 12 [5, 6]
['（', '例えば', '、'] 12 [7]
['パラメータ', '調整', '、'] 12 []
['コネクショニズム', 'の'] 21 []
['システム', '）', '。'] 16 [1, 8, 9, 10]
['学習', 'は'] 15 []
['経験', 'に'] 16 []
['基づく'] 21 [13]
['手法', 'で', 'あり', '、'] 18 [12, 14]
['非', '記号', '的', 'AI', '、'] 20 []
['美しく', 'ない'] 20 [16]
['AI', '、'] 21 []
['ソフトコンピューティング', 'と'] 26 [17, 18]
['関係', 'し', 'て', 'いる', '。'] 23 [11, 15, 19]
['その'] 26 []
['手法', 'として', 'は', '、'] 25 [21]
['以下', 'の'] 26 []
['もの', 'が'] -1 [23]
['ある', '。'] -1 [20, 22, 24, 25, 26]
