# 第3章 分類問題

## 3.3 Naive Bayes

In [4]:
# 訓練データのファイルを読み込んで、素性の
# インデックス辞書featsとラベルのインデックス辞書
# labelsを返す関数

def Make_dict(file):
    feats,labels = {},{}
    findex,lindex = 0,0
    with open(file,encoding='utf-8') as f:
        for line in f:
            list=line.split(' ')
            for item in list:
                word,right = item.split(":")
                # 素性の処理
                if (word not in feats) and (word != "#label#"):
                    feats[word] = findex
                    findex += 1
                elif word == "#label#":
                    right = right.replace('\n','')
                    if right not in labels:
                        labels[right] = lindex
                        lindex += 1
    return feats,labels



In [8]:
features,labels = Make_dict(r'.\cls-acl10-processed\jp\books\train.processed')
print(type(features))
print(len(features))
print(labels)

<class 'dict'>
19983
{'positive': 0, 'negative': 1}


In [9]:
# ファイルと素性辞書とラベルの辞書を
# 引数にとり、用例ベクトルのリストと
# 対応する答えのリストを返す関数
def Make_sample_vectors(file,feats,label_dict):
    samples,label_list = [],[]
    with open(file,encoding='utf-8') as f:
        for line in f:
            list = line.split(' ')
            asample = [0] * len(feats)
            for item in list:
                word,right = item.split(":")
                if word == "#label#":
                    label_list.append(int(label_dict[right.replace('\n','')]))
                else:
                    if word in feats:
                        asample[feats[word]] = int(right)
            samples.append(asample)
    return samples,label_list

In [11]:
train_X,train_y = Make_sample_vectors(r'.\cls-acl10-processed\jp\books\train.processed',features,labels)

In [12]:
test_X,test_y = Make_sample_vectors(r'.\cls-acl10-processed\jp\books\train.processed',features,labels)

In [13]:
print(len(train_X))
print(len(train_y))
print(len(test_X))
print(len(test_y))

2000
2000
2000
2000


### naive_bayes モジュールによる Naive Bayes

In [14]:
from sklearn.naive_bayes import BernoulliNB
cl = BernoulliNB()
cl.fit(train_X,train_y)



BernoulliNB()

In [15]:
cl.score(test_X,test_y)

0.939

In [17]:
cl = BernoulliNB(alpha=0.5)

In [18]:
from sklearn.naive_bayes import MultinomialNB
cl = MultinomialNB()
cl.fit(train_X,train_y)
cl.score(test_X,test_y)


0.972

## 3.4 文書分類の評価

#### モジュールによる評価

In [19]:
test_ans_list = cl.predict(test_X)

In [20]:
# 精度を計算する
from sklearn.metrics import precision_score

precision_score(test_y,test_ans_list,average=None)

array([0.99372385, 0.95210728])

In [21]:
test_ans_list_2 = test_ans_list.tolist()
print(test_ans_list_2.count(0))
print(test_ans_list_2.count(1))


956
1044


In [22]:
print([test_ans_list_2[i] == 0 and test_ans_list_2[i] == test_y[i] for i in range(len(test_ans_list_2))].count(True))
print([test_ans_list_2[i] == 1 and test_ans_list_2[i] == test_y[i] for i in range(len(test_ans_list_2))].count(True))


950
994


In [23]:
(950 + 994) / (956 + 1044)

0.972

In [24]:
((950 / 956) + (994 / 1044)) / 2

0.9729155645329357

In [25]:
print(precision_score(test_y,test_ans_list,average='micro'))
print(precision_score(test_y,test_ans_list,average='macro'))

0.972
0.9729155645329357


In [26]:
# 再現率を計算する
from sklearn.metrics import recall_score
print(recall_score(test_y,test_ans_list,average=None))
print(recall_score(test_y,test_ans_list,average='micro'))
print(recall_score(test_y,test_ans_list,average='macro'))


[0.95  0.994]
0.972
0.972


In [27]:
# F値を計算する
from sklearn.metrics import f1_score
print(f1_score(test_y,test_ans_list,average=None))
print(f1_score(test_y,test_ans_list,average='micro'))
print(f1_score(test_y,test_ans_list,average='macro'))



[0.97137014 0.97260274]
0.972
0.9719864414376558
