In [1]:
# -*- coding:utf-8 -*- #

## 9.6.2 樸素貝葉斯

In [2]:
import numpy as np
import jieba

def load():
    """Return the built-in demo reviews, tokenised, together with their labels.

    Each hard-coded review sentence is split into a list of words with
    ``jieba.lcut``.

    Returns:
        A ``(documents, labels)`` tuple: ``documents`` is a list with one
        token list per review, and ``labels`` is ``[0,0,0,1,1,1]`` (the first
        three reviews carry label 0, the last three label 1).
    """
    reviews = ['不知道該說什麼, 這麼爛的抄襲片也能上映, 我感到很尷尬',
               '天吶。一個大寫的滑稽。',
               '劇情太狗血，演技太浮誇，結局太無語。總體太渣了。這一個半小時廢了。',
               '畫面很美，音樂很好聽，主角演的很到位，很值得一看的電影，男主角很帥很帥，贊贊贊',
               '超級喜歡的一部愛情影片',
               '故事情節吸引人，演員演的也很好，電影裏的歌也好聽，總之值得一看，看了之後也會很感動的。']
    labels = [0,0,0,1,1,1]
    # Segment every sentence into its word tokens.
    documents = [jieba.lcut(text) for text in reviews]
    return documents, labels

def create_vocab(data):
    """Collect the distinct words that occur anywhere in ``data``.

    Args:
        data: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list holding every distinct token once. The order comes from
        iterating a ``set`` and is therefore unspecified.
    """
    unique_words = set()
    for tokens in data:
        # The set silently drops tokens that were already seen.
        unique_words.update(tokens)
    return list(unique_words)

def words_to_vec(vocab_list, vocab_set):  # convert a sentence into a vocabulary-indicator vector
    """Encode a token sequence as a 0/1 presence vector over ``vocab_list``.

    Args:
        vocab_list: list of vocabulary words; position ``i`` of the result
            corresponds to ``vocab_list[i]``.
        vocab_set: iterable of tokens of the sentence to encode (despite the
            name, the callers in this file pass a token list). Tokens must be
            hashable.

    Returns:
        A float NumPy array of length ``len(vocab_list)`` that is 1 at the
        position of every vocabulary word present in ``vocab_set`` and 0
        elsewhere. Tokens that are not in the vocabulary are ignored.
    """
    # Build a word -> index table once so each token lookup is O(1) instead
    # of scanning the vocabulary list twice (`in` plus `.index`) per token.
    # setdefault keeps the FIRST index of a duplicated vocabulary word, which
    # is what list.index would have returned.
    position = {}
    for idx, vocab_word in enumerate(vocab_list):
        position.setdefault(vocab_word, idx)

    ret = np.zeros(len(vocab_list))  # one slot per vocabulary word, 0 = absent
    for word in vocab_set:
        idx = position.get(word)
        if idx is not None:
            ret[idx] = 1  # the word occurs in this sentence
    return ret

def train(X, y):
    """Fit the naive Bayes model: per-class log word likelihoods and the prior.

    Args:
        X: 2-D array-like with one row per sentence and one column per
            vocabulary word (e.g. the vectors built by ``words_to_vec``).
        y: 1-D array-like of labels, one per row of ``X``. Rows whose label
            equals 1 contribute to the class-1 statistics; every other row
            contributes to the class-0 statistics.

    Returns:
        A ``(p0_vec, p1_vec, percent)`` tuple. ``p0_vec`` / ``p1_vec`` are
        arrays holding, per vocabulary word, the log of the smoothed word
        frequency within class 0 / class 1. ``percent`` is ``sum(y) / rows``,
        i.e. the share of positive examples when labels are 0/1.

    Raises:
        ZeroDivisionError: if ``X`` has no rows.
    """
    # Accept plain (nested) lists as well as ndarrays.
    X = np.asarray(X)
    y = np.asarray(y)
    rows = X.shape[0]
    cols = X.shape[1]
    percent = float(np.sum(y)) / float(rows)  # share of positive examples

    is_pos = (y == 1)
    # Numerators start at 1 and denominators at 2.0 so that a word never seen
    # in a class does not yield log(0).
    # NOTE(review): the denominator offset is a fixed 2.0 rather than the
    # vocabulary size, so the per-class values do not sum to 1 -- confirm
    # this is the intended smoothing.
    p1_arr = np.ones(cols) + X[is_pos].sum(axis=0)   # per-word counts, class 1
    p0_arr = np.ones(cols) + X[~is_pos].sum(axis=0)  # per-word counts, class 0
    p1_count = 2.0 + X[is_pos].sum()    # total (in-vocabulary) words, class 1
    p0_count = 2.0 + X[~is_pos].sum()   # total (in-vocabulary) words, class 0

    # Work in log space so predict() can add terms instead of multiplying
    # many small probabilities.
    p1_vec = np.log(p1_arr / p1_count)
    p0_vec = np.log(p0_arr / p0_count)
    return p0_vec, p1_vec, percent

def predict(X, p0_vec, p1_vec, percent):
    """Classify one encoded sentence by comparing the two class log-scores.

    Args:
        X: indicator vector of the sentence (same length as the log vectors).
        p0_vec: per-word log likelihoods for class 0.
        p1_vec: per-word log likelihoods for class 1.
        percent: prior probability of class 1; ``1.0 - percent`` is used as
            the prior of class 0.

    Returns:
        1 when the class-1 score is strictly greater than the class-0 score,
        otherwise 0 (ties go to class 0).
    """
    # Each score is the sum of the log likelihoods of the words present plus
    # the log prior of the class.
    score_pos = sum(X * p1_vec) + np.log(percent)
    score_neg = sum(X * p0_vec) + np.log(1.0 - percent)
    return 1 if score_pos > score_neg else 0

if __name__ == '__main__':
    # Tokenised training sentences and their labels.
    sentences, y = load()
    vocab_list = create_vocab(sentences)
    # Encode every training sentence as an indicator vector over the vocabulary.
    X = [words_to_vec(vocab_list, sentence) for sentence in sentences]
    p0_vec, p1_vec, percent = train(np.array(X), np.array(y))
    # Classify a sentence that is not part of the training data.
    test = jieba.lcut('抄襲得那麼明顯也是醉了！')
    test_X = np.array(words_to_vec(vocab_list, test))
    print(test,'分類',predict(test_X, p0_vec, p1_vec, percent))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.458 seconds.
Prefix dict has been built succesfully.


['抄襲', '得', '那麼', '明顯', '也', '是', '醉', '了', '！'] 分類 0


## 9.6.3 貝葉斯網絡

In [None]:
# 此程序段需要在 Python 2系統中運行

from bayesian.bbn import build_bbn

def f_prize_door(prize_door):
    """Prior probability that the prize is behind ``prize_door``.

    The prior is uniform over three doors, so the same value is returned for
    every door.
    """
    # Exactly one third, so the three-door prior sums to 1 instead of the
    # 0.99999999 a truncated literal gives. Float operands keep the division
    # true division under Python 2 as well.
    return 1.0 / 3.0
def f_guest_door(guest_door):
    """Prior probability that the guest picks ``guest_door``.

    The prior is uniform over three doors, so the same value is returned for
    every door.
    """
    # Exactly one third, so the three-door prior sums to 1 instead of the
    # 0.99999999 a truncated literal gives. Float operands keep the division
    # true division under Python 2 as well.
    return 1.0 / 3.0
def f_monty_door(prize_door, guest_door, monty_door):
    """Probability that Monty opens ``monty_door`` given the other two doors.

    Args:
        prize_door: door hiding the prize.
        guest_door: door chosen by the guest.
        monty_door: door Monty opens.

    Returns:
        0 if Monty's door is the prize door or the guest's door, 0.5 if the
        guest picked the prize door (Monty chooses between the two remaining
        doors), and 1 otherwise (only one door is left for Monty).
    """
    # Monty never reveals the prize and never opens the door the guest chose.
    if monty_door == prize_door or monty_door == guest_door:
        return 0
    # Guest guessed right: two doors are open to Monty, one of two.
    # Guest guessed wrong: exactly one non-prize door remains.
    return 0.5 if prize_door == guest_door else 1

if __name__ == '__main__':
    # Build the network from the three factor functions; every variable
    # ranges over the same three doors.
    g = build_bbn(
        f_prize_door, f_guest_door, f_monty_door,
        domains={
            'prize_door': ['A', 'B', 'C'],
            'guest_door': ['A', 'B', 'C'],
            'monty_door': ['A', 'B', 'C'],
        })
    # Query with evidence: the guest chose door A and Monty opened door B.
    g.q(guest_door='A', monty_door='B')