In [1]:
# -*- coding:utf-8 -*- #

## 9.6.2 朴素贝叶斯

In [2]:
import numpy as np
import jieba

def load():
    """Return the toy movie-review corpus as token lists plus its labels.

    Returns:
        A ``(documents, labels)`` tuple. ``documents`` holds one entry per
        review, each being the result of ``jieba.lcut`` on that review.
        ``labels`` is ``[0, 0, 0, 1, 1, 1]``: the first three reviews are
        labelled 0 and the last three are labelled 1.
    """
    reviews = (
        '不知道该说什么, 这么烂的抄袭片也能上映, 我感到很尴尬',
        '天呐。一个大写的滑稽。',
        '剧情太狗血，演技太浮夸，结局太无语。总体太渣了。这一个半小时废了。',
        '画面很美，音乐很好听，主角演的很到位，很值得一看的电影，男主角很帅很帅，赞赞赞',
        '超级喜欢的一部爱情影片',
        '故事情节吸引人，演员演的也很好，电影里的歌也好听，总之值得一看，看了之后也会很感动的。',
    )
    labels = [0, 0, 0, 1, 1, 1]
    # Segment every review into its words.
    documents = [jieba.lcut(review) for review in reviews]
    return documents, labels

def create_vocab(data):
    """Collect the distinct tokens that occur anywhere in ``data``.

    Args:
        data: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list containing every distinct token exactly once. The list is
        built from a set, so its order is unspecified.
    """
    # A single set union over all documents drops repeated words.
    unique_tokens = set().union(*data)
    return list(unique_tokens)

def words_to_vec(vocab_list, vocab_set):
    """Encode a tokenized sentence as a 0/1 presence vector over the vocabulary.

    Args:
        vocab_list: Sequence of vocabulary words; position ``i`` of the result
            corresponds to ``vocab_list[i]``.
        vocab_set: Words of the sentence to encode. Despite the name, any
            iterable of words works; repeated words have no extra effect.

    Returns:
        A 1-D float numpy array of length ``len(vocab_list)`` holding 1 where
        the vocabulary word occurs in the sentence and 0 elsewhere. Sentence
        words that are not in the vocabulary are ignored.
    """
    ret = np.zeros(len(vocab_list))  # one slot per vocabulary word, 0 = absent
    # Build the word -> column lookup once, so each sentence word costs one
    # dictionary lookup instead of two linear scans of the vocabulary.
    # setdefault keeps the first position of a repeated vocabulary word,
    # which is the position list.index() would have returned.
    position = {}
    for idx, word in enumerate(vocab_list):
        position.setdefault(word, idx)
    for word in vocab_set:
        idx = position.get(word)
        if idx is not None:
            ret[idx] = 1  # the word occurs in this sentence
    return ret

def train(X, y):
    """Estimate the naive Bayes parameters from presence vectors and labels.

    Args:
        X: 2-D numpy array with one row per sentence and one column per
            vocabulary word.
        y: Labels indexed like the rows of ``X``; a row counts as class 1
            when its label equals 1 and as class 0 otherwise.

    Returns:
        A ``(p0_vec, p1_vec, percent)`` tuple: the per-word log-probabilities
        for class 0 and for class 1, and ``sum(y) / rows`` (the share of
        class-1 rows when the labels are 0/1).
    """
    n_rows = X.shape[0]
    n_cols = X.shape[1]
    percent = sum(y) / float(n_rows)

    # Smoothing: every word count starts at 1 and every class total at 2.0,
    # so an unseen word never produces log(0).
    # NOTE(review): the denominator offset is the fixed constant 2.0 rather
    # than the number of columns, so the smoothed values of one class need
    # not sum to 1 -- confirm this is intended.
    word_counts = {0: np.ones(n_cols), 1: np.ones(n_cols)}
    class_totals = {0: 2.0, 1: 2.0}

    for row_idx in range(n_rows):
        label = 1 if y[row_idx] == 1 else 0
        row = X[row_idx]
        word_counts[label] += row          # element-wise per-word tally
        class_totals[label] += sum(row)    # words seen in this class overall

    p1_vec = np.log(word_counts[1] / class_totals[1])
    p0_vec = np.log(word_counts[0] / class_totals[0])
    return p0_vec, p1_vec, percent

def predict(X, p0_vec, p1_vec, percent):
    """Classify one presence vector with the parameters returned by ``train``.

    Args:
        X: Presence vector of the sentence to classify.
        p0_vec: Per-word log-probabilities for class 0.
        p1_vec: Per-word log-probabilities for class 1.
        percent: Prior probability of class 1.

    Returns:
        1 when the class-1 log score is strictly greater than the class-0
        log score, otherwise 0 (ties therefore go to class 0).
    """
    def log_score(word_log_probs, prior):
        # Log-likelihood of the words that are present plus the log prior.
        return sum(X * word_log_probs) + np.log(prior)

    positive = log_score(p1_vec, percent)
    negative = log_score(p0_vec, 1.0 - percent)
    return 1 if positive > negative else 0

if __name__ == '__main__':
    # Demo: train on the six labelled reviews from load() and classify one
    # new sentence.
    sentences,y = load()
    vocab_list = create_vocab(sentences)
    # Encode every training sentence as a presence vector over the vocabulary.
    X=[]
    for sentence in sentences:
        X.append(words_to_vec(vocab_list, sentence))
    p0_vec, p1_vec, percent = train(np.array(X), np.array(y))
    # Tokenize the unseen sentence and encode it the same way; words that are
    # not in the vocabulary are ignored by words_to_vec.
    test = jieba.lcut('抄袭得那么明显也是醉了！')
    test_X = np.array(words_to_vec(vocab_list, test))
    # Print the tokens followed by the predicted label (0 or 1).
    print(test,'分类',predict(test_X, p0_vec, p1_vec, percent))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.458 seconds.
Prefix dict has been built succesfully.


['抄袭', '得', '那么', '明显', '也', '是', '醉', '了', '！'] 分类 0


## 9.6.3 贝叶斯网络

In [None]:
# 此程序段需要在 Python 2系统中运行

from bayesian.bbn import build_bbn

def f_prize_door(prize_door):
    """Prior probability that the prize is behind ``prize_door``.

    The value does not depend on the argument: every door gets the same
    probability. ``1.0 / 3`` is used instead of a truncated decimal so that
    the priors of the three doors sum to 1 up to floating-point rounding.
    """
    return 1.0 / 3
def f_guest_door(guest_door):
    """Prior probability that the guest picks ``guest_door``.

    The value does not depend on the argument: every door gets the same
    probability. ``1.0 / 3`` is used instead of a truncated decimal so that
    the priors of the three doors sum to 1 up to floating-point rounding.
    """
    return 1.0 / 3
def f_monty_door(prize_door, guest_door, monty_door):
    """Probability that Monty opens ``monty_door`` given the other two doors.

    Args:
        prize_door: Door hiding the prize.
        guest_door: Door picked by the guest.
        monty_door: Door whose opening is being scored.

    Returns:
        0 when ``monty_door`` is the prize door or the guest's door, 0.5 when
        the guest picked the prize door (Monty chooses between the two
        remaining doors), and 1 otherwise (only one door is left for Monty).
    """
    # Monty never opens the prize door or the door the guest already chose.
    if prize_door == monty_door or guest_door == monty_door:
        return 0
    # The guest guessed right: two doors remain and Monty picks either one.
    if prize_door == guest_door:
        return 0.5
    # The guest guessed wrong: exactly one non-prize door is left to open.
    return 1

if __name__ == '__main__':
    # Build the network from the three probability functions; every variable
    # ranges over the same three door labels.
    g = build_bbn(
        f_prize_door, f_guest_door, f_monty_door,
        domains={
            'prize_door': ['A', 'B', 'C'],
            'guest_door': ['A', 'B', 'C'],
            'monty_door': ['A', 'B', 'C'],
        })
    # Query the network assuming the guest chose door A and Monty opened door B.
    g.q(guest_door='A', monty_door='B')