### Naive Bayes

In [13]:
import numpy as np
import pandas as pd

In [14]:
def nb_fit(X, y, alpha=0):
    """Fit a categorical naive Bayes model by counting.

    Parameters
    ----------
    X : pandas.DataFrame
        One row per sample, one column per discrete feature.
    y : pandas.DataFrame or pandas.Series
        Class labels, one per row of ``X``. For a DataFrame only the first
        column is used. Rows are matched to ``X`` by position, not by index.
    alpha : float, default 0
        Additive (Laplace) smoothing strength. ``0`` gives the plain
        maximum-likelihood estimates; ``1`` is classic Laplace smoothing.
        Both the class priors and the conditional probabilities are smoothed.

    Returns
    -------
    classes : numpy.ndarray
        Distinct labels in order of first appearance.
    class_prior : pandas.Series
        ``P(y = c)`` indexed by label.
    class_condition_prob : dict
        Maps ``(feature, value, label)`` to ``P(feature = value | y = label)``.
        With ``alpha == 0`` only combinations that occur in the data are
        stored; with ``alpha > 0`` every value a feature takes anywhere in
        ``X`` gets an entry for every class.

    Raises
    ------
    ValueError
        If ``alpha`` is negative or ``X`` and ``y`` differ in length.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    # Work on a 1-D label Series so masks are 1-D as well.
    labels = y.iloc[:, 0] if isinstance(y, pd.DataFrame) else pd.Series(y)
    if len(labels) != len(X):
        raise ValueError("X and y must have the same number of rows")

    # Distinct labels and how often each occurs.
    classes = labels.unique()
    class_count = labels.value_counts()
    # Class priors; with alpha == 0 this is simply count / N.
    class_prior = (class_count + alpha) / (len(labels) + alpha * len(classes))

    # Positional row masks per class, computed once instead of once per feature.
    class_masks = {j: (labels == j).to_numpy() for j in classes}

    # Conditional probability table keyed by (feature, value, label).
    class_condition_prob = dict()
    for col in X.columns:
        feature = X[col]
        # Every value this feature takes; needed for the smoothing denominator
        # and to give unseen (value, class) pairs a non-zero probability.
        all_values = feature.dropna().unique()
        for j in classes:
            # Counts of each feature value among the samples of class j.
            counts = feature[class_masks[j]].value_counts()
            if alpha > 0:
                counts = counts.reindex(all_values, fill_value=0)
            denom = class_count.loc[j] + alpha * len(all_values)
            for value, count in counts.items():
                class_condition_prob[(col, value, j)] = (count + alpha) / denom
    return classes, class_prior, class_condition_prob

In [15]:
def predict(X_test, model=None):
    """Predict the most probable class for a single sample.

    Parameters
    ----------
    X_test : mapping
        Anything with ``.items()`` yielding ``(feature, value)`` pairs,
        e.g. a dict or a pandas Series.
    model : tuple, optional
        ``(classes, class_prior, class_condition_prob)`` as returned by
        ``nb_fit``. When omitted, the notebook-level variables of the same
        names are used.

    Returns
    -------
    The element of ``classes`` with the largest value of
    ``P(y) * prod_i P(x_i | y)`` (the numerator of Bayes' rule). Ties go to
    the class that comes first in ``classes``.
    """
    if model is None:
        # Fall back to the fitted parameters stored in the notebook namespace.
        model = (classes, class_prior, class_condition_prob)
    labels, prior, cond_prob = model

    scores = []
    for c in labels:
        # Product of the per-feature conditional probabilities for class c.
        likelihood = 1
        for feature, value in X_test.items():
            # A (feature, value, class) combination never seen in training
            # has estimated probability 0 rather than being an error.
            likelihood *= cond_prob.get((feature, value, c), 0.0)
        # Prior times likelihood: the unnormalised posterior for class c.
        scores.append(prior[c] * likelihood)
    return labels[np.argmax(scores)]

In [16]:
# Build the toy training set: two discrete features and a binary label.
x1 = [1] * 5 + [2] * 5 + [3] * 5
x2 = list('SMMSSSMMLLLMMLL')
y = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1]

# Assemble the columns into one frame and preview the first rows.
df = pd.DataFrame(data=dict(x1=x1, x2=x2, y=y))
df.head()

Unnamed: 0,x1,x2,y
0,1,S,-1
1,1,M,-1
2,1,M,1
3,1,S,1
4,1,S,-1


In [17]:
# Preview the last five rows of the toy set.
df.tail(n=5)

Unnamed: 0,x1,x2,y
10,3,L,1
11,3,M,1
12,3,M,1
13,3,L,1
14,3,L,-1


In [18]:
# Split the frame into the feature columns and a one-column label frame.
X = df.loc[:, ['x1', 'x2']]
y = df.loc[:, ['y']]

In [19]:
# Fit on the full toy set, then show each learned parameter on its own line.
classes, class_prior, class_condition_prob = nb_fit(X=X, y=y)
print(classes, class_prior, class_condition_prob, sep='\n')

[-1  1]
 1    0.6
-1    0.4
Name: y, dtype: float64
{('x1', 1, -1): 0.5, ('x1', 2, -1): 0.3333333333333333, ('x1', 3, -1): 0.16666666666666666, ('x1', 3, 1): 0.4444444444444444, ('x1', 2, 1): 0.3333333333333333, ('x1', 1, 1): 0.2222222222222222, ('x2', 'S', -1): 0.5, ('x2', 'M', -1): 0.3333333333333333, ('x2', 'L', -1): 0.16666666666666666, ('x2', 'M', 1): 0.4444444444444444, ('x2', 'L', 1): 0.4444444444444444, ('x2', 'S', 1): 0.1111111111111111}


In [20]:
# Query sample: one value per feature column.
X_test = dict(x1=2, x2='S')

In [21]:
# Classify the query sample and report the winning label.
print('测试数据预测类别为：', predict(X_test=X_test))

测试数据预测类别为： -1
