## 1. 糖尿病预测

In [50]:
import csv

def loadCsv(filename):
    """Read a CSV file and convert every field to float.

    parameters:
        filename -- path of the CSV file to read
    return:
        dataset -- list of rows, each row being a list of floats
    raises:
        ValueError -- if a field cannot be parsed as a float
    """

    # The context manager closes the file handle deterministically;
    # newline="" is what the csv module requires of files handed to it.
    with open(filename, "rt", newline="") as csv_file:
        # csv.reader yields [] for blank lines; drop those so an empty row
        # cannot break label lookups later in the pipeline.
        return [[float(field) for field in row]
                for row in csv.reader(csv_file) if row]

# Demo: load the Pima Indians diabetes CSV and report how many rows were read.
filename = "pima-indians-diabetes.csv"
dataset = loadCsv(filename)
print("%s has %d rows" % (filename, len(dataset)))
# Uncomment to dump every parsed row:
#print(dataset)

pima-indians-diabetes.csv has 768 rows


In [51]:
import random

def splitDataset(dataset, radio):
    """Randomly split a dataset into a training part and a held-out part.

    parameters:
        dataset -- list of rows; it is copied, never modified
        radio -- fraction of the rows to place in the training set
    return:
        [trainSet, rest] -- trainSet holds int(len(dataset)*radio) rows drawn
        at random without replacement; rest holds the rows that were not
        drawn, in their original relative order
    """

    remaining = list(dataset)
    wanted = int(len(dataset) * radio)
    picked = []
    for _ in range(wanted):
        # Move one randomly chosen row from the pool into the training set.
        picked.append(remaining.pop(random.randrange(len(remaining))))
    return [picked, remaining]

# Demo: split a five-row toy dataset 67/33. Note this rebinds the global
# `dataset`, and the message prints the row lists themselves, not their sizes.
dataset = [[1], [2], [3], [4], [5]]
splitRatio = 0.67
train, test = splitDataset(dataset, splitRatio)
print('Split {0} rows into train with {1} and test with {2}'.format(len(dataset), train, test))

Split 5 rows into train with [[4], [1], [3]] and test with [[2], [5]]


In [52]:
def splitByClass(dataset):
    """Group rows by class label, read from the last element of each row.

    return:
        groups -- dict mapping each label to the list of rows carrying it,
        with labels in order of first appearance
    """

    groups = {}
    for row in dataset:
        # setdefault creates the bucket the first time a label is seen.
        groups.setdefault(row[-1], []).append(row)
    return groups

# Demo: three toy rows whose last column is the class label (1 or 0).
dataset = [[1,20,1], [2,21,0], [3,22,1]]
separated = splitByClass(dataset)
print('Separated instances: {0}'.format(separated))

Separated instances: {1: [[1, 20, 1], [3, 22, 1]], 0: [[2, 21, 0]]}


In [62]:
import math

def mean(numbers):
    """Return the arithmetic mean of a sized collection of numbers.

    Raises ZeroDivisionError when the collection is empty.
    """

    total = sum(numbers)
    count = float(len(numbers))
    return total / count

def stddev(numbers):
    """Return the sample standard deviation (n-1 in the denominator).

    Raises ZeroDivisionError when fewer than two values are supplied.
    """

    center = mean(numbers)
    squared_deviations = ((value - center) ** 2 for value in numbers)
    variance = sum(squared_deviations) / float(len(numbers) - 1)
    return math.sqrt(variance)

def summarize(dataset):
    """Compute the mean and standard deviation of every feature column.

    parameters:
        dataset -- 2-D list, shape=(size, feature); the last column of each
                   row is the class label and is excluded from the result
    return:
        summaries -- list of (mean, stddev) tuples, one per feature column;
                     an empty list when the dataset has no rows
    """

    # zip(*dataset) transposes rows into columns. The label column is dropped
    # before any statistics are computed, so no work is wasted on it and
    # non-numeric labels (e.g. 'a'/'b') cannot raise inside mean()/stddev().
    feature_columns = list(zip(*dataset))[:-1]
    return [(mean(column), stddev(column)) for column in feature_columns]

# Demo: statistics of a plain list, then per-column summaries of a toy dataset
# whose last column is the class label.
numbers = [1,2,3,4,5]
print('Summary of {0}: mean={1}, stdev={2}'.format(numbers, mean(numbers), stddev(numbers)))
dataset = [[1,20,0], [2,21,1], [3,22,0]]
summary = summarize(dataset)
print('Attribute summaries: {0}'.format(summary))

Summary of [1, 2, 3, 4, 5]: mean=3.0, stdev=1.5811388300841898
Attribute summaries: [(2.0, 1.0), (21.0, 1.0)]


In [54]:
def summarizeByClass(dataset):
    """Build per-class feature statistics for a labelled dataset.

    return:
        summaries -- dict mapping each class label to the list of
        (mean, stddev) tuples that summarize() returns for that class's rows
    """

    rows_by_label = splitByClass(dataset)
    return {label: summarize(rows) for label, rows in rows_by_label.items()}

# Demo: two rows per class so that every per-class standard deviation is defined.
dataset = [[1,20,1], [2,21,0], [3,22,1], [4,22,0]]
summary = summarizeByClass(dataset)
print('Summary by class value: {0}'.format(summary))

Summary by class value: {1: [(2.0, 1.4142135623730951), (21.0, 1.4142135623730951)], 0: [(3.0, 1.4142135623730951), (21.5, 0.7071067811865476)]}


In [55]:
def calGsProb(x, mean, stddev):
    """Evaluate the Gaussian probability density at x.

    parameters:
        x -- the feature value
        mean -- mean of the Gaussian
        stddev -- standard deviation of the Gaussian
    return:
        the density value, used as the likelihood of x for a class
    raises:
        ZeroDivisionError -- when stddev is 0
    """

    squared_deviation = math.pow(x - mean, 2)
    twice_variance = 2 * math.pow(stddev, 2)
    exponent_term = math.exp(-squared_deviation / twice_variance)
    normalizer = math.sqrt(2 * math.pi) * stddev
    return 1 / normalizer * exponent_term

# Demo: density of x = 71.5 under a Gaussian with mean 73 and std dev 6.2.
# The mean is bound to `mu` rather than `mean`: at notebook top level,
# `mean = 73` would rebind the global name and clobber the mean() function
# that stddev()/summarize() call.
x = 71.5
mu = 73
stdev = 6.2
probability = calGsProb(x, mu, stdev)
print('Probability of belonging to this class: {0}'.format(probability))

Probability of belonging to this class: 0.06248965759370005


In [56]:
def calClassProb(summaries, inputVector):
    """Compute, for every class, the joint likelihood of the input's features.

    parameters:
        summaries -- dict mapping each class to its list of (mean, stddev)
        inputVector -- the sample; entry i is matched with summary i, so a
                       trailing class label is never read
    return:
        prob -- dict mapping each class to the product of the per-feature
                Gaussian densities (no class prior is multiplied in)
    """

    prob = {}
    for label, feature_stats in summaries.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(feature_stats):
            likelihood *= calGsProb(inputVector[position], mu, sigma)
        prob[label] = likelihood
    return prob

# Demo: one feature per class. The '?' stands in for the unknown label and is
# never read, because each class has a single (mean, stddev) summary.
summaries = {0:[(1, 0.5)], 1:[(20, 5.0)]}
inputVector = [1.1, '?']
probabilities = calClassProb(summaries, inputVector)
print('Probabilities for each class: {0}'.format(probabilities))

Probabilities for each class: {0: 0.7820853879509118, 1: 6.298736258150442e-05}


In [57]:
def predict(summaries, inputVector):
    """Predict the class of a single sample.

    parameters:
        summaries -- dict mapping each class to its list of (mean, stddev)
        inputVector -- the sample to classify (may carry its class label)
    return:
        the class with the largest likelihood; the first such class wins on
        ties, and None is returned when summaries is empty
    """

    scores = calClassProb(summaries, inputVector)
    winner = None
    top_score = -1
    for label, score in scores.items():
        # Keep the current winner unless there is none yet or this class
        # strictly beats it.
        if winner is not None and not score > top_score:
            continue
        winner, top_score = label, score
    return winner

# Demo: classify one sample against two classes described by a single feature.
summaries = {'A':[(1, 0.5)], 'B':[(20, 5.0)]} # only one feature dimension
inputVector = [1.1, '?']
result = predict(summaries, inputVector)
print('Prediction: {0}'.format(result))

Prediction: A


In [58]:
def getPredictions(summaries, testSet):
    """Predict a class for every sample in testSet.

    return:
        list of predicted labels, in the same order as testSet
    """

    return [predict(summaries, sample) for sample in testSet]

# Demo: batch prediction for two samples, one near each class mean.
summaries = {'A':[(1, 0.5)], 'B':[(20, 5.0)]}
testSet = [[1.1, '?'], [19.1, '?']]
predictions = getPredictions(summaries, testSet)
print('Predictions: {0}'.format(predictions))

Predictions: ['A', 'B']


In [59]:
def getAcc(testSet, predictions):
    """Return the percentage of samples whose prediction matches the label.

    parameters:
        testSet -- list of rows whose last element is the true label
        predictions -- predicted labels, aligned with testSet by index
    return:
        accuracy as a float in [0, 100]
    raises:
        ZeroDivisionError -- when testSet is empty
    """

    hits = sum(1 for i, row in enumerate(testSet) if row[-1] == predictions[i])
    return hits / float(len(testSet)) * 100

# Demo: two of the three predictions match the label stored in the last column.
testSet = [[1,1,1,'a'], [2,2,2,'a'], [3,3,3,'b']]
predictions = ['a', 'a', 'a']
accuracy = getAcc(testSet, predictions)
print('Accuracy: {0}'.format(accuracy))

Accuracy: 66.66666666666666


In [71]:
import csv
import random
import math

def loadCsv(filename):
    """Read a CSV file and convert every field to float.

    parameters:
        filename -- path of the CSV file to read
    return:
        dataset -- list of rows, each row being a list of floats
    raises:
        ValueError -- if a field cannot be parsed as a float
    """

    # The context manager closes the file handle deterministically;
    # newline="" is what the csv module requires of files handed to it.
    with open(filename, "rt", newline="") as csv_file:
        # csv.reader yields [] for blank lines; drop those so an empty row
        # cannot break label lookups later in the pipeline.
        return [[float(field) for field in row]
                for row in csv.reader(csv_file) if row]

def splitDataset(dataset, radio):
    """Randomly split a dataset into a training part and a held-out part.

    parameters:
        dataset -- list of rows; it is copied, never modified
        radio -- fraction of the rows to place in the training set
    return:
        [trainSet, rest] -- trainSet holds int(len(dataset)*radio) rows drawn
        at random without replacement; rest holds the rows that were not
        drawn, in their original relative order
    """

    remaining = list(dataset)
    wanted = int(len(dataset) * radio)
    picked = []
    for _ in range(wanted):
        # Move one randomly chosen row from the pool into the training set.
        picked.append(remaining.pop(random.randrange(len(remaining))))
    return [picked, remaining]

def splitByClass(dataset):
    """Group rows by class label, read from the last element of each row.

    return:
        groups -- dict mapping each label to the list of rows carrying it,
        with labels in order of first appearance
    """

    groups = {}
    for row in dataset:
        # setdefault creates the bucket the first time a label is seen.
        groups.setdefault(row[-1], []).append(row)
    return groups

def mean(numbers):
    """Return the arithmetic mean of a sized collection of numbers.

    Raises ZeroDivisionError when the collection is empty.
    """

    total = sum(numbers)
    count = float(len(numbers))
    return total / count

def stddev(numbers):
    """Return the sample standard deviation (n-1 in the denominator).

    Raises ZeroDivisionError when fewer than two values are supplied.
    """

    center = mean(numbers)
    squared_deviations = ((value - center) ** 2 for value in numbers)
    variance = sum(squared_deviations) / float(len(numbers) - 1)
    return math.sqrt(variance)

def summarize(dataset):
    """Compute the mean and standard deviation of every feature column.

    parameters:
        dataset -- 2-D list, shape=(size, feature); the last column of each
                   row is the class label and is excluded from the result
    return:
        summaries -- list of (mean, stddev) tuples, one per feature column;
                     an empty list when the dataset has no rows
    """

    # zip(*dataset) transposes rows into columns. The label column is dropped
    # before any statistics are computed, so no work is wasted on it and
    # non-numeric labels cannot raise inside mean()/stddev().
    feature_columns = list(zip(*dataset))[:-1]
    return [(mean(column), stddev(column)) for column in feature_columns]

def summarizeByClass(dataset):
    """Build per-class feature statistics for a labelled dataset.

    return:
        summaries -- dict mapping each class label to the list of
        (mean, stddev) tuples that summarize() returns for that class's rows
    """

    rows_by_label = splitByClass(dataset)
    return {label: summarize(rows) for label, rows in rows_by_label.items()}

def calGsProb(x, mean, stddev):
    """Evaluate the Gaussian probability density at x.

    parameters:
        x -- the feature value
        mean -- mean of the Gaussian
        stddev -- standard deviation of the Gaussian
    return:
        the density value, used as the likelihood of x for a class
    raises:
        ZeroDivisionError -- when stddev is 0
    """

    squared_deviation = math.pow(x - mean, 2)
    twice_variance = 2 * math.pow(stddev, 2)
    exponent_term = math.exp(-squared_deviation / twice_variance)
    normalizer = math.sqrt(2 * math.pi) * stddev
    return 1 / normalizer * exponent_term

def calClassProb(summaries, inputVector):
    """Compute, for every class, the joint likelihood of the input's features.

    parameters:
        summaries -- dict mapping each class to its list of (mean, stddev)
        inputVector -- the sample; entry i is matched with summary i, so a
                       trailing class label is never read
    return:
        prob -- dict mapping each class to the product of the per-feature
                Gaussian densities (no class prior is multiplied in)
    """

    prob = {}
    for label, feature_stats in summaries.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(feature_stats):
            likelihood *= calGsProb(inputVector[position], mu, sigma)
        prob[label] = likelihood
    return prob

def predict(summaries, inputVector):
    """Predict the class of a single sample.

    parameters:
        summaries -- dict mapping each class to its list of (mean, stddev)
        inputVector -- the sample to classify (may carry its class label)
    return:
        the class with the largest likelihood; the first such class wins on
        ties, and None is returned when summaries is empty
    """

    scores = calClassProb(summaries, inputVector)
    winner = None
    top_score = -1
    for label, score in scores.items():
        # Keep the current winner unless there is none yet or this class
        # strictly beats it.
        if winner is not None and not score > top_score:
            continue
        winner, top_score = label, score
    return winner

def getPredictions(summaries, testSet):
    """Predict a class for every sample in testSet.

    return:
        list of predicted labels, in the same order as testSet
    """

    return [predict(summaries, sample) for sample in testSet]

def getAcc(testSet, predictions):
    """Return the percentage of samples whose prediction matches the label.

    parameters:
        testSet -- list of rows whose last element is the true label
        predictions -- predicted labels, aligned with testSet by index
    return:
        accuracy as a float in [0, 100]
    raises:
        ZeroDivisionError -- when testSet is empty
    """

    hits = sum(1 for i, row in enumerate(testSet) if row[-1] == predictions[i])
    return hits / float(len(testSet)) * 100

# End-to-end run on the Pima Indians diabetes data: load, split 80/20,
# fit per-class Gaussian summaries, predict the held-out rows, report accuracy.
# No random seed is set in this cell, so the split (and the accuracy) can
# differ between runs.
filename = 'pima-indians-diabetes.csv'
dataset = loadCsv(filename)
radio = 0.8
trainSet, testSet = splitDataset(dataset, radio)
print("trainSet has %d data\ntestSet has %d data" %(len(trainSet), len(testSet)))
# Fit the model: per-class (mean, stddev) for every feature
model = summarizeByClass(trainSet)
# Predict the held-out rows
pred = getPredictions(model, testSet)
acc = getAcc(testSet, pred)
# %d truncates the float accuracy to its integer part
print("Accuracy is %d%%" % acc)

trainSet has 614 data
testSet has 154 data
Accuracy is 79%


## 2. 红酒预测

### 所有属性都是连续的，每个特征维度表示：    
- 0）类别
- 1）酒精
- 2）苹果酸
- 3）灰分
- 4）灰分的碱度  
- 5）镁
- 6）总酚
- 7）类黄酮
- 8）非黄烷类酚
- 9）原花青素
- 10）颜色强度
- 11）色调
- 12）稀释葡萄酒的OD280 / OD315
- 13）脯氨酸  
样本分布:
- 1级59个  
- 2级71个  
- 3级48个  

In [72]:
import csv
import random
import math

def loadCsv(filename):
    """Read a CSV file and convert every field to float.

    parameters:
        filename -- path of the CSV file to read
    return:
        dataset -- 2-D list, shape=(size, feature), every entry a float
    raises:
        ValueError -- if a field cannot be parsed as a float
    """

    # The context manager closes the file handle deterministically;
    # newline="" is what the csv module requires of files handed to it.
    with open(filename, "rt", newline="") as csv_file:
        # csv.reader yields [] for blank lines; drop those so an empty row
        # cannot break the row[0] label lookups later in the pipeline.
        return [[float(field) for field in row]
                for row in csv.reader(csv_file) if row]

def splitDataset(dataset, radio):
    """Randomly split a dataset into a training part and a validation part.

    parameters:
        dataset -- list of rows; it is copied, never modified
        radio -- fraction of the rows to place in the training set
    return:
        trainSet -- int(len(dataset)*radio) rows drawn at random without
                    replacement
        rest -- the rows that were not drawn, in their original relative order
    """

    remaining = list(dataset)
    wanted = int(len(dataset) * radio)
    picked = []
    for _ in range(wanted):
        # Move one randomly chosen row from the pool into the training set.
        picked.append(remaining.pop(random.randrange(len(remaining))))
    return [picked, remaining]

def splitByClass(dataset):
    """Group rows by class label, read from the first element of each row.

    return:
        groups -- dict mapping each label to the list of rows carrying it,
        shape=(size, feature) per class, labels in order of first appearance
    """

    groups = {}
    for row in dataset:
        # setdefault creates the bucket the first time a label is seen.
        groups.setdefault(row[0], []).append(row)
    return groups

def mean(numbers):
    """Return the arithmetic mean of a sized collection of numbers.

    Raises ZeroDivisionError when the collection is empty.
    """

    total = sum(numbers)
    count = float(len(numbers))
    return total / count

def stddev(numbers):
    """Return the sample standard deviation (n-1 in the denominator).

    Raises ZeroDivisionError when fewer than two values are supplied.
    """

    center = mean(numbers)
    squared_deviations = ((value - center) ** 2 for value in numbers)
    variance = sum(squared_deviations) / float(len(numbers) - 1)
    return math.sqrt(variance)

def summarize(dataset):
    """Compute the mean and standard deviation of every feature column.

    parameters:
        dataset -- 2-D list, shape=(size, feature); the first column of each
                   row is the class label and is excluded from the result
    return:
        summaries -- list of (mean, stddev) tuples, one per feature column;
                     an empty list when the dataset has no rows
    """

    # zip(*dataset) transposes rows into columns. The leading label column is
    # dropped before any statistics are computed, so no work is wasted on it
    # and non-numeric labels cannot raise inside mean()/stddev().
    feature_columns = list(zip(*dataset))[1:]
    return [(mean(column), stddev(column)) for column in feature_columns]

def summarizeByClass(dataset):
    """Build per-class feature statistics for a labelled dataset.

    return:
        summaries -- dict mapping each class label to the list of
        (mean, stddev) tuples that summarize() returns for that class's rows
    """

    rows_by_label = splitByClass(dataset)
    return {label: summarize(rows) for label, rows in rows_by_label.items()}

def calGsProb(x, mean, stddev):
    """Evaluate the Gaussian probability density at x.

    parameters:
        x -- the feature value
        mean -- mean of the Gaussian
        stddev -- standard deviation of the Gaussian
    return:
        the density value, used as the likelihood of x for a class
    raises:
        ZeroDivisionError -- when stddev is 0
    """

    squared_deviation = math.pow(x - mean, 2)
    twice_variance = 2 * math.pow(stddev, 2)
    exponent_term = math.exp(-squared_deviation / twice_variance)
    normalizer = math.sqrt(2 * math.pi) * stddev
    return 1 / normalizer * exponent_term

def calClassProb(summaries, inputVector):
    """Compute, for every class, the joint likelihood of the input's features.

    parameters:
        summaries -- dict mapping each class to its list of (mean, stddev)
        inputVector -- the sample; entry 0 is the class label, so summary i
                       is matched with entry i+1
    return:
        prob -- dict mapping each class to the product of the per-feature
                Gaussian densities (no class prior is multiplied in)
    """

    prob = {}
    for label, feature_stats in summaries.items():
        likelihood = 1
        # Start counting at 1 to step over the label stored in position 0.
        for position, (mu, sigma) in enumerate(feature_stats, 1):
            likelihood *= calGsProb(inputVector[position], mu, sigma)
        prob[label] = likelihood
    return prob

def predict(summaries, inputVector):
    """Predict the class of a single sample.

    parameters:
        summaries -- dict mapping each class to its list of (mean, stddev)
        inputVector -- the sample to classify (carries its class label)
    return:
        the class with the largest likelihood; the first such class wins on
        ties, and None is returned when summaries is empty
    """

    scores = calClassProb(summaries, inputVector)
    winner = None
    top_score = -1
    for label, score in scores.items():
        # Keep the current winner unless there is none yet or this class
        # strictly beats it.
        if winner is not None and not score > top_score:
            continue
        winner, top_score = label, score
    return winner

def getPredictions(summaries, testSet):
    """Predict a class for every sample in testSet.

    return:
        list of predicted labels, in the same order as testSet
    """

    return [predict(summaries, sample) for sample in testSet]

def getAcc(testSet, predictions):
    """Return the percentage of samples whose prediction matches the label.

    parameters:
        testSet -- list of rows whose first element is the true label
        predictions -- predicted labels, aligned with testSet by index
    return:
        accuracy as a float in [0, 100]
    raises:
        ZeroDivisionError -- when testSet is empty
    """

    # The label lives in column 0 for this dataset.
    hits = sum(1 for i, row in enumerate(testSet) if row[0] == predictions[i])
    return hits / float(len(testSet)) * 100

# End-to-end run on the wine data (class label in column 0): load, split
# 80/20, fit per-class Gaussian summaries, predict the held-out rows, report
# accuracy. No random seed is set in this cell, so results can differ per run.
filename = "wine.csv"
dataset = loadCsv(filename)
radio = 0.8
trainSet, testSet = splitDataset(dataset, radio)
print("trainSet has %d data\ntestSet has %d data" %(len(trainSet), len(testSet)))
model = summarizeByClass(trainSet)
pred = getPredictions(model, testSet)
acc = getAcc(testSet, pred)
# %d truncates the float accuracy to its integer part
print("Accuracy is %d%%" % acc)

trainSet has 142 data
testSet has 36 data
Accuracy is 94%


## 3. sklearn实现

In [82]:
import csv
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

def loadCsv(filename):
    """Read a CSV file and convert every field to float.

    parameters:
        filename -- path of the CSV file to read
    return:
        dataset -- 2-D list, shape=(size, feature), every entry a float
    raises:
        ValueError -- if a field cannot be parsed as a float
    """

    # The context manager closes the file handle deterministically;
    # newline="" is what the csv module requires of files handed to it.
    with open(filename, "rt", newline="") as csv_file:
        # csv.reader yields [] for blank lines; drop those so the result stays
        # rectangular and can be turned into a 2-D numpy array.
        return [[float(field) for field in row]
                for row in csv.reader(csv_file) if row]

# Same wine experiment with scikit-learn's GaussianNB.
# Column 0 is the class label; the remaining columns are the features.
filename = "wine.csv"
dataset = np.array(loadCsv(filename))
x = dataset[:, 1:]
y = dataset[:, 0]
radio = 0.8
# Drive the split from `radio` rather than a separate hard-coded test_size, so
# the two values cannot drift apart; the test set is the complement.
# random_state=0 makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=radio, random_state=0)
print("trainSet has %d data\ntestSet has %d data" %(x_train.shape[0], x_test.shape[0]))
clf = GaussianNB()
clf.fit(x_train, y_train)
pred = clf.predict(x_test)
# clf.score returns the mean accuracy as a fraction; %d truncates the percentage.
# The message previously misspelled "Accuracy" as "Accaracy".
print("Accuracy is %d%%" % (100*clf.score(x_test, y_test)))

trainSet has 142 data
testSet has 36 data
Accaracy is 91%


In [85]:
# Inspect the parameters GaussianNB learned from the training data.
print("每个类的先验概率: \n", clf.class_prior_)
print("每个类包含的训练数据: \n", clf.class_count_)
print("每个类每个特征的均值: \n", clf.theta_)
# `sigma_` was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# `var_`; prefer `var_` and fall back to `sigma_` only on older releases.
print("每个类每个特征的方差: \n", clf.var_ if hasattr(clf, "var_") else clf.sigma_)

每个类的先验概率: 
 [0.31690141 0.38732394 0.29577465]
每个类包含的训练数据: 
 [45. 55. 42.]
每个类每个特征的均值: 
 [[1.37246667e+01 2.03088889e+00 2.45066667e+00 1.69111111e+01
  1.05133333e+02 2.83311111e+00 3.00066667e+00 2.85777778e-01
  1.93933333e+00 5.49644444e+00 1.06844444e+00 3.17688889e+00
  1.10460000e+03]
 [1.22656364e+01 1.90800000e+00 2.24127273e+00 2.03145455e+01
  9.59090909e+01 2.21309091e+00 1.97963636e+00 3.56181818e-01
  1.66563636e+00 2.93181818e+00 1.05974545e+00 2.79654545e+00
  5.27800000e+02]
 [1.31340476e+01 3.34714286e+00 2.44166667e+00 2.13928571e+01
  1.00071429e+02 1.70285714e+00 7.82619048e-01 4.56428571e-01
  1.17095238e+00 7.32928569e+00 6.90714286e-01 1.67809524e+00
  6.35119048e+02]]
每个类每个特征的方差: 
 [[1.96885413e-01 4.82993067e-01 5.35245243e-02 7.15841262e+00
  9.79823139e+01 1.25886401e-01 1.61240080e-01 4.60047490e-03
  1.69715635e-01 1.59395455e+00 1.23936601e-02 1.38841956e-01
  4.68123734e+04]
 [3.21981685e-01 8.38954908e-01 9.04154701e-02 1.15540619e+01
  3.17282736e+02 2