In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (y_test != y_pred).sum()))
print("accruacy: ", (75-4)/75)

Number of mislabeled points out of a total 75 points : 4
accruacy:  0.9466666666666667


In [70]:
import glob, re
from collections import defaultdict

#处理词语
def tokenize(message):
    message = message.lower()
    all_words = re.findall("[a-z0-9']+", message) #找出所有单词
    return set(all_words)   #set 去重


# 训练数据的格式：(message, is_spam)
# 返回dict, key是单个单词， value是list [spam_count, non_spam_count]
# 在spam和non_spam的出现次数
def count_words(training_set):
    counts = defaultdict(lambda: [0,0])
    for message, is_spam in training_set:
        for word in tokenize(message):
            counts[word][0 if is_spam else 1] += 1
    return counts

# 计算单词出现概率
# 返回 三元的list [word, p(w|spam), p(w|not spam)]
# 可调整 k 的输入
def find_word_probs(counts, total_spams, total_non_spams, k=0.5):
    return [(w,
            (spam + k) / (total_spams + 2*k), 
            (non_spam+k) / (total_non_spams + 2*k))
           for w, (spam, non_spam) in counts.items()]

#根据每个词在spam 非spam的概率，计算新的邮件为垃圾的概率
def find_spam_probs(word_probs, new_message):
    message_words = tokenize(new_message)
    #概率转为log 方便计算
    log_prob_if_spam = log_prob_if_not_spam = 0.0
    
    # 遍历每个单词， 如果单词出现在新的  message里，计算log_prob
    for word, prob_if_spam, prob_if_not_spam in word_probs:
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
            
        #单词没有出现在message里，概率为 1-出现的概率
        else:
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)
            
        #log换回正常的数值
        prob_if_spam = math.exp(log_prob_if_spam)
        prob_if_not_spam = math.exp(log_prob_if_not_spam)
        
        return prob_if_spam / (prob_if_spam + prob_if_not_spam)

            
#训练
class NaiveBayesClassifier:
    def __init__(self, k=0.5):
        self.k = k
        self.word_probs = []
        
    def train(self, dataset):
        
        #计算spam在训练集的数量
        spam_lst = [is_spam for message, is_spam in dataset if is_spam]
        num_spams = len(spam_lst)
        num_non_spams = len(dataset) - num_spams
        print('num-spams: ', num_spams, 'num_non_spams: ',num_non_spams)
        
        word_counts = count_words(dataset)
        self.word_probs = find_word_probs(word_counts, num_spams, 
                                          num_non_spams,self.k)
        
    def classify(self, message):
        return find_spam_probs(self.word_probs, message)
    
    
# '''
# 函数输出的例子 
# 输入两个邮件，一封spam 另一个non_spam
# input str list:
# [('this is a spam tesing', 1), ('not a spam, noring cat dog', 0)]

# #计算出每一个单词在spam non_spam出现的次数
# count = {'tesing': [1, 0],
#              'spam': [1, 1],
#              'is': [1, 0],
#              'a': [1, 1],
#              'this': [1, 0],
#              'not': [0, 1],
#              'dog': [0, 1],
#              'cat': [0, 1],
#              'noring': [0, 1]})

# #该单词分别在两种邮件的概率
# word_prob = 
# [('tesing', 0.6666666666666666, 0.3333333333333333),
#  ('spam', 0.6666666666666666, 0.6666666666666666),
#  ('is', 0.6666666666666666, 0.3333333333333333),
#  ('a', 0.6666666666666666, 0.6666666666666666),
#  ('this', 0.6666666666666666, 0.3333333333333333),
#  ('not', 0.3333333333333333, 0.6666666666666666),
#  ('dog', 0.3333333333333333, 0.6666666666666666),
#  ('cat', 0.3333333333333333, 0.6666666666666666),
#  ('noring', 0.3333333333333333, 0.6666666666666666)]
# '''

In [71]:
#辅助函数
# """split data into fractions [prob, 1 - prob]"""
def split_data(data, prob):
    results = [], []
    for row in data:
        results[0 if random.random() < prob else 1].append(row)
        
    return results

def train_test_split(x, y, test_pct):
    data = zip(x, y) # pair corresponding values
    train, test = split_data(data, 1 - test_pct) # split the data set of pairs
    x_train, y_train = zip(*train) # magical un-zip trick
    x_test, y_test = zip(*test)
    
    return x_train, x_test, y_train, y_test


In [76]:
#导入数据
import glob, re
import random
from collections import Counter
import math
path = 'spam/*/*'

# glob.glob 返回所有匹配的字符
def read_from_file(path):
    
    data = []
    
    for fn in glob.glob(path):
        
        is_spam = "ham" not in fn

        with open(fn, 'r', encoding='utf-8',errors='ignore') as file:
            for line in file:
                #print(line)
                if line.startswith("Subject:"):
                    # remove the leading "Subject: " and keep what's left
                    subject = re.sub(r"^Subject: ", "", line).strip()
                    data.append((subject, is_spam))
                    
    return data


random.seed(0)
data = read_from_file(path)
train_data, test_data = split_data(data, 0.75)
classifier = NaiveBayesClassifier()
classifier.train(train_data)

#计算结果
# triplets (subject, actual is_spam, predicted spam probability)
predictions = [(subject, is_spam, classifier.classify(subject))
              for subject, is_spam in test_data]

#print(predictions)
# 计算(actual is_spam, predicted is_spam)
counts = Counter((is_spam, spam_probability > 0.5)
                 for _, is_spam, spam_probability in predictions)

#print(counts)


num-spams:  364 num_non_spams:  2183


In [75]:
#输出
#Counter({(False, False): 673, (True, True): 84, 
#(True, False): 47, (False, True): 26})

#
acc = (673+47) / (673+84+47+26)
print(acc)

0.8674698795180723
