In [None]:
from itertools import chain
import pycrfsuite
import sklearn
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer

In [None]:
# コーパス読み込み
import codecs
class CorpusReader(object):
    """Load a tab-separated corpus and split it into train and test sentences.

    Every non-blank line of the file is one morpheme whose fields are
    separated by tabs; a blank line terminates the current sentence.  The
    first 90% of the sentences (rounded down) form the training split and
    the remaining sentences form the test split.
    """

    def __init__(self, path):
        """Read the UTF-8 file at ``path``, print the training-set size and
        store both splits on the instance.
        """
        sents = []
        sent = []
        with codecs.open(path, encoding='utf-8') as f:
            for line in f:
                # codecs.open performs no newline translation, so a blank
                # line can arrive as '\n' or '\r\n'; both end a sentence.
                if line.rstrip('\r\n') == '':
                    sents.append(sent)
                    sent = []
                    continue
                morph_info = line.strip().split('\t')
                sent.append(morph_info)  # keep the morpheme's fields
        if sent:
            # The file did not end with a blank line: keep the last sentence
            # instead of silently dropping it.
            sents.append(sent)
        train_num = int(len(sents) * 0.9)  # 90% for training, the rest for testing
        print(train_num)
        self.__train_sents = sents[:train_num]
        self.__test_sents = sents[train_num:]

    def iob_sents(self, name):
        """Return the sentences of split ``name`` ('train' or 'test').

        Any other name yields ``None``.
        """
        if name == 'train':
            return self.__train_sents
        elif name == 'test':
            return self.__test_sents
        else:
            return None
    

In [None]:
# 文字種取得
def is_hiragana(ch):
    """Return True when the code point of ``ch`` lies in U+3040..U+309F
    (the Unicode Hiragana block), otherwise False.
    """
    code_point = ord(ch)
    return code_point >= 0x3040 and code_point <= 0x309F

def is_katakana(ch):
    """Return True when the code point of ``ch`` lies in U+30A0..U+30FF
    (the Unicode Katakana block), otherwise False.
    """
    code_point = ord(ch)
    return code_point >= 0x30A0 and code_point <= 0x30FF

def get_character_type(ch):
    """Classify a single character and return its type tag.

    The checks run in this order and the first match wins:
    whitespace -> 'ZSPACE', digit -> 'ZDIGIT', lowercase -> 'ZLLET',
    uppercase -> 'ZULET', hiragana -> 'HIRAG', katakana -> 'KATAK';
    anything else is 'OTHER'.
    """
    if ch.isspace():
        return 'ZSPACE'
    if ch.isdigit():
        return 'ZDIGIT'
    if ch.islower():
        return 'ZLLET'
    if ch.isupper():
        return 'ZULET'
    if is_hiragana(ch):
        return 'HIRAG'
    return 'KATAK' if is_katakana(ch) else 'OTHER'

def get_character_types(string):
    """Summarise the character types occurring in ``string``.

    Each character is classified with ``get_character_type``; the distinct
    tags are sorted and joined with '-' (e.g. a word mixing hiragana and
    katakana gives 'HIRAG-KATAK').  An empty string gives ''.
    """
    distinct_types = {get_character_type(ch) for ch in string}
    return '-'.join(sorted(distinct_types))

In [None]:
# 品詞細分類の取得
def extract_pos_with_subtype(morph):
    """Return the part-of-speech fields of ``morph`` joined with '-'.

    ``morph`` is one corpus row: index 0 is the surface form and the
    part-of-speech fields follow, terminated by the first '*' field.

    Raises:
        ValueError: if no '*' field follows the surface form.
    """
    # Search from index 1: index 0 is the surface form, which can itself be
    # the token '*' and must not be mistaken for the terminator.
    idx = morph.index('*', 1)
    return '-'.join(morph[1:idx])

In [None]:
# 単語を特徴量に変換する
def word2features(sent, i):
    """Build the list of feature strings for the morpheme at index ``i``.

    The result starts with 'bias' and the word / character-type / POS
    features of the current morpheme, followed by one group per context
    offset in the order -2, -1, +1, +2:

    * when the neighbour exists, its word, type and postag features are
      added with an '<offset>:' prefix; for the two preceding neighbours an
      'iobtag' feature is added too, read from the neighbour's last field
      (the same column ``sent2labels`` returns as the label);
    * when it does not exist, 'BOS' (backward offsets) or 'EOS' (forward
      offsets) is added instead.  The marker is therefore emitted twice at
      the first/last position and once at the second/second-to-last one.
    """
    def describe(morph, prefix, with_iobtag):
        # Feature strings for one morpheme, in the fixed order
        # word, type, postag and optionally iobtag.
        surface = morph[0]
        described = [
            prefix + 'word=' + surface,
            prefix + 'type=' + get_character_types(surface),
            prefix + 'postag=' + extract_pos_with_subtype(morph),
        ]
        if with_iobtag:
            described.append(prefix + 'iobtag=' + morph[-1])
        return described

    features = ['bias']
    features.extend(describe(sent[i], '', False))

    for offset in (-2, -1, 1, 2):
        looks_back = offset < 0
        neighbour = i + offset
        if looks_back:
            available = neighbour >= 0
        else:
            available = neighbour < len(sent)

        if available:
            # '%+d:' renders the prefixes '-2:', '-1:', '+1:' and '+2:'.
            features.extend(describe(sent[neighbour], '%+d:' % offset, looks_back))
        else:
            features.append('BOS' if looks_back else 'EOS')

    return features


def sent2features(sent):
    """Return one feature list per morpheme of ``sent``, in sentence order.

    Each entry is the result of ``word2features(sent, position)``.
    """
    features_per_token = []
    for position in range(len(sent)):
        features_per_token.append(word2features(sent, position))
    return features_per_token


def sent2labels(sent):
    """Return the label of every morpheme in ``sent``.

    The label is the last field of each morpheme row.
    """
    labels = []
    for morph in sent:
        labels.append(morph[-1])
    return labels


def sent2tokens(sent):
    """Return the surface form of every morpheme in ``sent``.

    The surface form is the first field of each morpheme row.
    """
    tokens = []
    for morph in sent:
        tokens.append(morph[0])
    return tokens

In [None]:
# main: load the corpus and derive features/labels for both splits
c = CorpusReader('corpus.txt')  # corpus file to read
train_sents, test_sents = c.iob_sents('train'), c.iob_sents('test')

# Training split: per-sentence feature sequences and label sequences
X_train = list(map(sent2features, train_sents))
y_train = list(map(sent2labels, train_sents))

# Test split: per-sentence feature sequences and label sequences
X_test = list(map(sent2features, test_sents))
y_test = list(map(sent2labels, test_sents))


In [None]:
# Training: feed every training sentence to a CRF trainer and fit the model

trainer = pycrfsuite.Trainer(verbose=False) # create the trainer with verbose output disabled

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq) # register one (feature sequence, label sequence) pair
    
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
}) # apply the training parameters

trainer.train('model.crfsuite') # train the model; 'model.crfsuite' is the path the tagger cell opens afterwards


In [None]:
# ラベル評価
def bio_classification_report(y_true, y_pred):
    """Return a scikit-learn classification report for IOB-tagged sequences.

    ``y_true`` and ``y_pred`` are lists of per-sentence label lists.  Both
    are flattened and binarized with a ``LabelBinarizer`` fitted on the true
    labels.  The 'O' tag is left out of the report, and the remaining tags
    are ordered by the part after the first '-' and then by the part before
    it (so 'B-X' and 'I-X' end up next to each other).
    """
    binarizer = LabelBinarizer()
    true_matrix = binarizer.fit_transform(list(chain.from_iterable(y_true)))
    pred_matrix = binarizer.transform(list(chain.from_iterable(y_pred)))

    def entity_then_prefix(tag):
        # 'B-LOC' -> ['LOC', 'B']
        return tag.split('-', 1)[::-1]

    reported_tags = sorted(set(binarizer.classes_) - {'O'}, key=entity_then_prefix)

    # Column index of every class in the binarized matrices.
    column_of = {}
    for column, label in enumerate(binarizer.classes_):
        column_of[label] = column

    return classification_report(
        true_matrix,
        pred_matrix,
        labels=[column_of[tag] for tag in reported_tags],
        target_names=reported_tags,
    )

In [None]:
# Label prediction and evaluation

tagger = pycrfsuite.Tagger()
tagger.open('model.crfsuite')  # open the model file named in the training cell

example_sent = test_sents[1]  # test sentence to display token by token

sent = sent2tokens(example_sent)  # surface forms of the example sentence
predicted = tagger.tag(sent2features(example_sent))  # predicted labels
correct = sent2labels(example_sent)  # gold labels

# The loop names are chosen so that they do not rebind `c`, which holds the
# CorpusReader instance; rebinding it would break any later `c.iob_sents(...)`.
for token, predicted_label, correct_label in zip(sent, predicted, correct):
    print(token, predicted_label, correct_label)

# Tag every test sentence and print the per-tag report.
y_pred = [tagger.tag(xseq) for xseq in X_test]
print(bio_classification_report(y_test, y_pred))