# 朴素贝叶斯算法

![朴素贝叶斯](../images/朴素贝叶斯/朴素贝叶斯算法.jpg)

![贝叶斯估计](../images/朴素贝叶斯/贝叶斯估计.jpg)

In [1]:
import sys
import time
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

In [2]:
data = load_breast_cancer()
X, y = data['data'], data['target']
train_X, test_X, train_y, test_y = train_test_split(X, y)
print('train size: %s' % len(train_X))
print('test size: %s' % len(test_X))

train size: 426
test size: 143


In [3]:
class NaiveBayes(object):
    def __init__(self, train_X, train_y):
        self.train_X = np.asarray(train_X)
        self.train_y = np.asarray(train_y)
        self.__train()
        
    def __train(self):
        # 总样本数
        N = len(self.train_X)
        # 类别
        self.classes = np.unique(self.train_y)
#         self.classes = {self.classes[i]: i for i in range(len(self.classes))}
        # 特征数
        N_features = self.train_X.shape[1]
        # 每个特征的取值
        Fij = [np.unique(self.train_X[:, i]) for i in range(N_features)]
        # 计算每个类别的样本数目
        self.sum_Yck = {}
        for i in self.classes:
            self.sum_Yck[i] = 0
        
        # 计算在类别等于Ck时，每个特征的子特征出现的次数，采用拉普拉斯平滑
        self.sum_Yck_Aij = {}
        for i in self.classes:
            # 将每个类别下，每个特征的取值都设置为1
            self.sum_Yck_Aij[i] = {i: {j: 1 for j in range(len(Fij[i]))} for i in range(N_features)}
        for i in range(N):
            # 更新类别计数
            cls = self.train_y[i]
            self.sum_Yck[cls] += 1
            for j in range(N_features):
                # 更新不同类别下，特征取值计数
                for k in range(len(Fij[j])):
                    self.sum_Yck_Aij[cls][j][k] += 1
        
        self.Priori = {k: v / N for k, v in self.sum_Yck.items()}
        self.Posterior = self.sum_Yck_Aij
        for i in self.classes:
            for j in range(N_features):
                for k in range(len(Fij[j])):
                    self.Posterior[i][j][k] /= self.sum_Yck[i] + sum([len(i) for i in Fij])
        return self.Priori, self.Posterior
    
    def __predict(self, test_X):
        preds = [1] * len(self.classes)
        for i in self.Posterior.keys():
            preds[i] *= self.Priori[i]
            for j in self.Posterior[i].keys():
                for k in self.Posterior[i][j].keys():
                    try:
                        preds[i] *= self.Posterior[i][j][k]
                    except:
                        preds[i] *= (1 / self.sum_Yck[i])

        return self.classes[np.argmax(preds)]
    
    def predict(self, test_X):
        res = []
        for i in test_X:
            res.append(self.__predict(i))
        return res

In [4]:
%time
nb = NaiveBayes(train_X, train_y)
preds = nb.predict(test_X)
print('算法准确率: %s' % (sum(preds == test_y) / len(test_y)))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs
算法准确率: 0.3706293706293706
