# 贝叶斯公式

$$P(A|B)=\frac{P(B|A)\,P(A)}{P(B)}$$

## 示例一

In [1]:
import numpy as np
# Let event B be "a red ball has been drawn" and
# event A be "container A was chosen".
# Given: P(B) = 8/20 and P(A) = 1/2.
# The probability of drawing a red ball from container A is P(B|A) = 7/10.

# Question: the drawn ball is red; what is the probability that it came from container A?
# Bayes' rule: P(A|B) = P(B|A) * P(A) / P(B) = (7/10) * (1/2) / (8/20) = 0.875

# Factors are grouped as P(B|A) * P(A) / P(B) so the expression mirrors the formula.
np.round((7 / 10) * (1 / 2) / (8 / 20), 3)  # 7/8

0.875

## 示例二

例如：一座别墅在过去的 20 年里一共发生过 2 次被盗，
别墅的主人有一条狗，狗平均每周晚上叫 3 次，
在盗贼入侵时狗叫的概率被估计为 0.9，问题是：在狗叫的时候发生入侵的概率是多少？

In [2]:
# Event A, a burglary:              P(A)   = 2 / (20 * 365)  (2 burglaries over 20 years of days)
# Event B, the dog barks:           P(B)   = 3 / 7           (3 barks per week)
# The dog barks during a break-in:  P(B|A) = 0.9

# Probability of a break-in given that the dog barks: P(A|B) = P(B|A) * P(A) / P(B)
np.round(0.9 * (2 / (20 * 365)) / (3 / 7), 5)

0.00058

# 三种贝叶斯模型使用

## 高斯分布的朴素贝叶斯

In [3]:
import numpy as np
from sklearn import datasets
# "naive": simple-minded, unsophisticated
from sklearn.naive_bayes import GaussianNB # Gaussian NB (Naive Bayes)
from sklearn.model_selection import train_test_split
# Iris flowers: natural measurements, assumed here to follow a normal distribution
# Features: sepal length/width and petal length/width
X,y = datasets.load_iris(return_X_y=True)

In [5]:
# The features are treated as normally distributed.
# Refit the same estimator on 1000 train/test splits and average the test accuracy.
model = GaussianNB()
score = 0
for trial in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    accuracy = model.fit(X_train, y_train).score(X_test, y_test)
    # Each split contributes 1/1000 of its accuracy, so `score` ends up as the mean.
    score += accuracy / 1000
print('高斯朴素贝叶斯算法平均预测准确率是：', score)

高斯朴素贝叶斯算法平均预测准确率是： 0.9534999999999985


## 伯努利分布朴素贝叶斯

In [7]:
# Ask yourself: do our features really follow a binomial (Bernoulli) distribution???
from sklearn.naive_bayes import BernoulliNB
# Refit the same estimator on 1000 train/test splits and average the test accuracy.
model = BernoulliNB()
score = 0
for trial in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    accuracy = model.fit(X_train, y_train).score(X_test, y_test)
    # Each split contributes 1/1000 of its accuracy, so `score` ends up as the mean.
    score += accuracy / 1000
print('伯努利分布素贝叶斯算法平均预测准确率是：', score)

伯努利分布素贝叶斯算法平均预测准确率是： 0.26752631578947483


## 多项式分布

In [8]:
# Author's note: plant data is taken to follow a multinomial distribution.
# Example: human height binned into discrete levels (very short, short, medium,
# tall, very tall) satisfies a multinomial distribution.
# The multinomial and Gaussian distributions are similar to some degree.
from sklearn.naive_bayes import MultinomialNB  # generalization of the binomial distribution
# Refit the same estimator on 1000 train/test splits and average the test accuracy.
model = MultinomialNB()
score = 0
for trial in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    accuracy = model.fit(X_train, y_train).score(X_test, y_test)
    # Each split contributes 1/1000 of its accuracy, so `score` ends up as the mean.
    score += accuracy / 1000
print('多项式分布朴素贝叶斯算法平均预测准确率是：', score)

多项式分布朴素贝叶斯算法平均预测准确率是： 0.8154473684210523


In [9]:
# Display the feature matrix X to inspect the raw values the models above were fitted on
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

# 文本分类

## 英文one-hot编码

In [10]:
# jieba: a word-segmentation library for Python, written by Chinese developers
# It splits a sentence into individual words
import jieba # pip install jieba

In [12]:
data = ['My dog ate my homework.',
        'My cat ate the fish.',
        'Precision things are very few in the world,that is the reason there is only you!']

# Tokenize every sentence, dropping the space and punctuation tokens.
tokens = []
for s in data:
    for token in jieba.lcut(s):
        if token not in (' ', ',', '.', '!'):
            tokens.append(token)

# Deduplicate the tokens to obtain the vocabulary array.
result = np.unique(np.array(tokens))
result

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/jt/qjpzhhld5znczz2yp8zyt7xw0000gn/T/jieba.cache
Loading model cost 0.582 seconds.
Prefix dict has been built successfully.


array(['My', 'Precision', 'are', 'ate', 'cat', 'dog', 'few', 'fish',
       'homework', 'in', 'is', 'my', 'only', 'reason', 'that', 'the',
       'there', 'things', 'very', 'world', 'you'], dtype='<U9')

In [14]:
# Raw jieba tokenization of one sentence: spaces and punctuation come back as tokens too
jieba.lcut('My dog ate my homework.')

['My', ' ', 'dog', ' ', 'ate', ' ', 'my', ' ', 'homework', '.']

In [15]:
# Word-vector conversion: handle the sentences one at a time,
# converting each word of a sentence in turn.
ignored_tokens = (' ', ',', '.', '!')
for s in data:
    sentence = [token for token in jieba.lcut(s) if token not in ignored_tokens]

    # Word embedding: turn each word into numbers.
    # Comparing a word against the vocabulary `result` gives a 0/1 vector per word.
    word_embedding = [(word == result).astype(np.int8) for word in sentence]
    print(s, np.array(word_embedding), sep='\n')

My dog ate my homework.
[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]]
My cat ate the fish.
[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]]
Precision things are very few in the world,that is the reason there is only you!
[[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0

## 中文one-hot编码

In [16]:
import jieba
import numpy as np

In [17]:
# Alternative word order for the second sentence: "the monk's mouth kissed the nun"
data = ['喜欢上一个人','尼姑亲吻了和尚的嘴巴','老师你教的都是没用的东西']

# Flatten the jieba tokens of all sentences into one list (duplicates kept).
result = [word for sentence_text in data for word in jieba.lcut(sentence_text)]
result

['喜欢',
 '上',
 '一个',
 '人',
 '尼姑',
 '亲吻',
 '了',
 '和尚',
 '的',
 '嘴巴',
 '老师',
 '你',
 '教',
 '的',
 '都',
 '是',
 '没用',
 '的',
 '东西']

In [18]:
# Deduplicate the token list to build the dictionary (vocabulary) array.
result = np.unique(result)
result

array(['一个', '上', '东西', '了', '亲吻', '人', '你', '和尚', '喜欢', '嘴巴', '尼姑', '教',
       '是', '没用', '的', '老师', '都'], dtype='<U2')

In [21]:
# One-hot encode every sentence against the vocabulary held in `result`.
for s in data:
    sentence = jieba.lcut(s)
    # Word vectors: one 0/1 row per word, together representing the sentence.
    word_embedding = [(word == result).astype(np.int8) for word in sentence]
    print(s)
    print(np.array(word_embedding))

喜欢上一个人
[[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]]
尼姑亲吻了和尚的嘴巴
[[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]]
老师你教的都是没用的东西
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
