Commit a19ffd4 ("test")

5 files changed: +530 −0 lines changed

Apriori/AprioriTest.py

Lines changed: 128 additions & 0 deletions
#coding:utf-8


class Apriori():
    def __init__(self):
        pass

    '''
    Association analysis has two goals: finding frequent itemsets and finding association rules.
    '''

    '''
    Frequent itemsets: {...}
    A dataset containing N distinct items has 2^N - 1 possible itemset combinations.
    Support:
        The support of an itemset is the fraction of the records in the dataset that contain it.
    Apriori principle: if an itemset is frequent, then all of its subsets are frequent too.
    Equivalently, if an itemset is infrequent, then all of its supersets are infrequent.
    '''

    def _createC1(self, dataSet):
        C1 = []
        for transaction in dataSet:
            for item in transaction:
                if [item] not in C1:
                    C1.append([item])
        C1.sort()
        return [frozenset(c) for c in C1]  # use frozensets so the itemsets can be used as dict keys

    def _scanD(self, D, Ck, minSupport=0.5):
        ssCnt = {}
        for tid in D:
            for can in Ck:
                if can.issubset(tid):
                    ssCnt[can] = ssCnt.get(can, 0) + 1
        numItems = len(D)
        retList = []
        supportK = {}
        for key in ssCnt:
            support = ssCnt[key] / float(numItems)  # compute the support
            if support >= minSupport:
                retList.append(key)
            supportK[key] = support
        return retList, supportK

    def aprioriGen(self, Lk, k):  # k >= 2
        retList = []
        lenLk = len(Lk)
        for i in range(lenLk):
            for j in range(i + 1, lenLk):
                L1 = sorted(Lk[i])[:k - 2]  # sort before slicing so set iteration order cannot matter
                L2 = sorted(Lk[j])[:k - 2]
                if L1 == L2:  # if the first k-2 elements are equal; when k is 3, {0,1},{0,2},{1,2} -> {0,1} U {0,2} -> {0,1,2}
                    retList.append(Lk[i] | Lk[j])
        return retList

    def apriori(self, dataSet, minSupport=0.5):  # minSupport: minimum support
        D = [set(t) for t in dataSet]  # convert each transaction to a set
        C1 = self._createC1(dataSet)  # build C1 as a list of frozensets
        L1, supp1 = self._scanD(D, C1, minSupport)  # build L1 from C1 and minSupport
        L = []
        supportData = {}
        L.append(L1)
        supportData.update(supp1)
        k = 2
        while len(L[k - 2]) > 1:
            Ck = self.aprioriGen(L[k - 2], k)  # build Ck
            Lk, suppK = self._scanD(D, Ck, minSupport)  # build Lk from Ck and minSupport
            L.append(Lk)
            supportData.update(suppK)
            k += 1
        return L, supportData

    '''
    Association rules: X -> Y
    Confidence:
        confidence(diapers -> wine) = support({diapers, wine}) / support({diapers})
    If a rule does not meet the minimum confidence requirement, then none of the rules
    built from subsets of it will meet the minimum confidence requirement either.
    '''

    def _calcConf(self, freqSet, H, supportData, brl, minConf=0.7):  # H: candidate consequents (rule right-hand sides), e.g. {0}, {1}
        prunedH = []
        for conseq in H:
            conf = supportData[freqSet] / supportData[freqSet - conseq]  # compute the confidence
            if conf >= minConf:
                print(freqSet - conseq, '-->', conseq, 'conf:', conf)
                brl.append((freqSet - conseq, conseq, conf))
                prunedH.append(conseq)
        return prunedH

    def _rulesFromConseq(self, freqSet, H, supportData, brl, minConf=0.7):  # H: candidate consequents, e.g. {0}, {1}
        m = len(H[0])
        if len(freqSet) > (m + 1):
            Hmp1 = self.aprioriGen(H, m + 1)  # merge consequents
            Hmp = self._calcConf(freqSet, Hmp1, supportData, brl, minConf)  # Hmp: merged consequents that pass the confidence test, e.g. {0,1}
            if len(Hmp) > 1:  # if more than one consequent remains, try merging further
                self._rulesFromConseq(freqSet, Hmp, supportData, brl, minConf)

    def generateRules(self, L, supportData, minConf=0.7):  # minConf: minimum confidence
        bigRuleList = []
        for i in range(1, len(L)):  # start building rules from itemsets with two or more items
            for freqSet in L[i]:
                H1 = [frozenset([item]) for item in freqSet]  # single-item consequents, e.g. {0}, {1}
                if i > 1:
                    self._rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)  # generate candidate rules
                else:
                    self._calcConf(freqSet, H1, supportData, bigRuleList, minConf)  # evaluate the rules
        return bigRuleList


def loadDataSet():
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]


if __name__ == '__main__':
    dataSet = loadDataSet()
    ap = Apriori()
    L, suppData = ap.apriori(dataSet, minSupport=0.5)
    print(L)
    print(suppData)
    rules = ap.generateRules(L, suppData, minConf=0.6)
    print(rules)
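
As a quick sanity check (an illustrative sketch added here, not part of the commit; the support() helper is hypothetical), the numbers for the sample dataset can be verified by hand: {2, 5} appears in 3 of the 4 transactions, so support({2, 5}) = 0.75, and confidence({5} -> {2}) = support({2, 5}) / support({5}) = 0.75 / 0.75 = 1.0, so the rule {5} --> {2} survives even a high confidence threshold.

# Hand-check of support and confidence on the sample transactions (illustrative sketch).
dataSet = [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

def support(itemset, transactions):
    # fraction of transactions containing every item of itemset
    hits = sum(1 for t in transactions if set(itemset) <= set(t))
    return hits / float(len(transactions))

s_25 = support({2, 5}, dataSet)  # 3 of 4 transactions -> 0.75
s_5 = support({5}, dataSet)      # 3 of 4 transactions -> 0.75
print(s_25, s_5, s_25 / s_5)     # 0.75 0.75 1.0, i.e. confidence({5} -> {2}) = 1.0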

DesicionTree/DesicionTreeTest.py

Lines changed: 128 additions & 0 deletions
#coding:utf-8

import math


class DesicionTree():
    def __init__(self):
        pass

    def _calcShannonEnt(self, dataSet):  # compute the entropy of the dataset
        numEntries = len(dataSet)
        classCounts = {}
        for data in dataSet:
            currentLabel = data[-1]
            classCounts[currentLabel] = classCounts.get(currentLabel, 0) + 1
        '''
        Information of an outcome: -log2(pi)
        Entropy: the expected value of the information, sum(-pi * log2(pi))
        '''
        shannonEnt = 0.0
        for key in classCounts:
            prob = classCounts[key] / float(numEntries)
            shannonEnt -= prob * math.log(prob, 2)  # log base 2
        return shannonEnt

    def _splitDataSet(self, dataSet, axis, value):
        retDataSet = []
        for data in dataSet:
            if data[axis] == value:
                reduceddata = data[:axis]
                reduceddata.extend(data[axis + 1:])
                retDataSet.append(reduceddata)
        return retDataSet

    def _chooseBestFeatureToSplit(self, dataSet):
        numFeatures = len(dataSet[0]) - 1  # the last column is the class label
        baseEntropy = self._calcShannonEnt(dataSet)
        bestInfoGain = 0
        bestFeature = -1
        for i in range(numFeatures):  # iterate over all the features
            featList = [data[i] for data in dataSet]
            values = set(featList)
            '''
            Conditional entropy: sum(pj * entropy of the j-th subset)
            '''
            # compute the conditional entropy of the dataset given this feature
            newEntropy = 0.0
            for value in values:
                subDataSet = self._splitDataSet(dataSet, i, value)
                prob = len(subDataSet) / float(len(dataSet))
                newEntropy += prob * self._calcShannonEnt(subDataSet)
            '''
            Information gain = entropy - conditional entropy
            '''
            infoGain = baseEntropy - newEntropy
            if infoGain > bestInfoGain:
                bestInfoGain = infoGain
                bestFeature = i
        return bestFeature

    def _majorityCnt(self, classList):
        classCount = {}
        for vote in classList:
            classCount[vote] = classCount.get(vote, 0) + 1
        sortedClassCount = sorted(classCount.items(), key=lambda xx: xx[1], reverse=True)
        return sortedClassCount[0][0]

    def fit(self, dataSet, featLabels):
        classList = [data[-1] for data in dataSet]
        if classList.count(classList[0]) == len(classList):
            return classList[0]  # all class labels are identical: return that label
        if len(dataSet[0]) == 1:  # labels differ but all features are used up: return the majority label
            return self._majorityCnt(classList)
        bestFeat = self._chooseBestFeatureToSplit(dataSet)
        bestFeatLabel = featLabels[bestFeat]
        tree = {bestFeatLabel: {}}
        featLabels_copy = featLabels[:]  # copy so the caller's featLabels is not modified
        featLabels_copy.remove(bestFeatLabel)
        featList = [data[bestFeat] for data in dataSet]
        values = set(featList)
        for value in values:
            subfeatLabels_copy = featLabels_copy[:]  # copy the list rather than sharing a reference
            tree[bestFeatLabel][value] = self.fit(self._splitDataSet(dataSet, bestFeat, value), subfeatLabels_copy)
        return tree

    def predict(self, tree, featLabels, testVec):
        firstStr = list(tree.keys())[0]
        secondDict = tree[firstStr]
        featIndex = featLabels.index(firstStr)
        key = testVec[featIndex]
        valueOfFeat = secondDict[key]
        if isinstance(valueOfFeat, dict):
            classLabel = self.predict(valueOfFeat, featLabels, testVec)
        else:
            classLabel = valueOfFeat
        return classLabel


def loadDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    featLabels = ['no surfacing', 'flippers']  # feature labels
    return dataSet, featLabels


if __name__ == '__main__':
    myDataSet, myFeatLabels = loadDataSet()
    print(myDataSet, myFeatLabels)
    dt = DesicionTree()
    myTree = dt.fit(myDataSet, myFeatLabels)
    print(myTree)
    results = dt.predict(myTree, myFeatLabels, [1, 1])
    print(results)
    results = dt.predict(myTree, myFeatLabels, [0, 1])
    print(results)
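
For reference (an illustrative sketch added here, not part of the commit; the entropy() helper is hypothetical), the numbers behind the first split can be recomputed directly: the sample set has 2 'yes' and 3 'no' labels, giving a base entropy of about 0.971; splitting on 'no surfacing' yields an information gain of about 0.420 versus about 0.171 for 'flippers', which is why 'no surfacing' becomes the root of the printed tree.

import math

# Hand-check of the entropy and information-gain numbers (illustrative sketch).
def entropy(labels):
    counts = {}
    for label in labels:
        counts[label] = counts.get(label, 0) + 1
    n = float(len(labels))
    return -sum(c / n * math.log(c / n, 2) for c in counts.values())

base = entropy(['yes', 'yes', 'no', 'no', 'no'])  # ~0.971
# splitting on 'no surfacing' gives the subsets {1: [yes, yes, no], 0: [no, no]}
gain_surfacing = base - (3 / 5.0 * entropy(['yes', 'yes', 'no']) + 2 / 5.0 * entropy(['no', 'no']))
# splitting on 'flippers' gives the subsets {1: [yes, yes, no, no], 0: [no]}
gain_flippers = base - (4 / 5.0 * entropy(['yes', 'yes', 'no', 'no']) + 1 / 5.0 * entropy(['no']))
print(gain_surfacing)  # ~0.420, the larger gain, so 'no surfacing' is chosen first
print(gain_flippers)   # ~0.171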

HMM/ViterbiTest.py

Lines changed: 101 additions & 0 deletions
# -*- coding: utf-8 -*-
'''
An HMM (hidden Markov model) is a statistical model for describing hidden, unknown parameters.
A classic example:
A friend in Tokyo chooses one activity each day, {walk in the park, shop, clean the room},
depending on that day's weather {rainy, sunny}.
All I can see are her tweets: "Ah, I walked in the park the day before yesterday,
went shopping yesterday, and cleaned my room today!"
From those tweets I can infer Tokyo's weather over those three days.
In this example, the activities are the observed states and the weather is the hidden state.
Finding the most likely hidden state sequence is one of the three canonical HMM problems,
and it is usually solved with the Viterbi algorithm.
The Viterbi algorithm finds the shortest path (in -log(prob), i.e. the maximum probability)
through the HMM.
'''

# HMM description: lambda = (states, observations, start_probability, transition_probability, emission_probability)
states = ('Rainy', 'Sunny')

observations = ('walk', 'shop', 'clean')

start_probability = {'Rainy': 0.6, 'Sunny': 0.4}

transition_probability = {
    'Rainy': {'Rainy': 0.7, 'Sunny': 0.3},
    'Sunny': {'Rainy': 0.4, 'Sunny': 0.6},
}

emission_probability = {
    'Rainy': {'walk': 0.1, 'shop': 0.4, 'clean': 0.5},
    'Sunny': {'walk': 0.6, 'shop': 0.3, 'clean': 0.1},
}

# print the path-probability table
def print_dptable(V):
    print('', end=' ')
    for t in range(len(V)):
        print("%7d" % t, end=' ')
    print('')
    for y in V[0].keys():
        print("%.5s:" % y, end=' ')
        for t in range(len(V)):
            print("%.7s" % ("%f" % V[t][y]), end=' ')
        print('')

def viterbi(stas, obs, start_p, trans_p, emit_p):
    '''
    :param stas: hidden states
    :param obs: observation sequence
    :param start_p: initial probabilities (of the hidden states)
    :param trans_p: transition probabilities (between hidden states)
    :param emit_p: emission probabilities (probability of a hidden state producing an observed state)
    :return:
    Idea:
    Define V[time][today's weather] = probability, where "today's weather is X" means the weather
    on the preceding days has already been fixed to its most likely values; this probability is a
    running product.
    Because my friend went for a walk on day one, V[day 1][rainy] = start[rainy] * emit[rainy][walk]
    = 0.6 * 0.1 = 0.06, and likewise V[day 1][sunny] = 0.24. Intuitively, since she went out on day
    one and she prefers walking when it is sunny, sunny should be the more likely weather; the
    numbers agree with the intuition.
    From day two on, each weather Y gets (probability the previous day's weather was X) *
    (probability of X transitioning to Y) * (probability of that day's activity under Y).
    Since the previous day's weather X has two possibilities, Y gets two candidate probabilities;
    keep the larger one as V[day 2][Y] and append today's weather to the result sequence.
    Finally, compare V[last day][rainy] with V[last day][sunny] and take the sequence behind the
    larger one; that is the answer.
    '''

    # path-probability table: V[time][hidden state] = probability
    V = [{}]
    # an intermediate variable mapping each hidden state to the best path ending in it
    path = {}

    # initialization (t == 0)
    for y in stas:
        V[0][y] = start_p[y] * emit_p[y][obs[0]]
        path[y] = [y]  # record the initial path; the key is the state y
    print(V)
    print(path)

    # run the Viterbi recursion (t > 0)
    for t in range(1, len(obs)):
        V.append({})

        new_path = {}
        for y in stas:
            '''hidden-state probability = P(previous state is y0) * P(y0 -> y) * P(y emits the current observation)'''
            # the maximum probability for y and the corresponding previous state sta
            (prob, sta) = max([(V[t - 1][y0] * trans_p[y0][y] * emit_p[y][obs[t]], y0) for y0 in stas])
            # record the maximum hidden-state probability
            V[t][y] = prob
            # record the path
            new_path[y] = path[sta] + [y]  # record the current path; the key is the state y
        print(V)
        print(new_path)

        # no need to keep the old paths
        path = new_path

    print_dptable(V)

    # find the most probable final state
    (prob, sta) = max([(V[len(obs) - 1][y], y) for y in stas])
    return prob, path[sta]

def example():
    return viterbi(states,
                   observations,
                   start_probability,
                   transition_probability,
                   emission_probability)

print(example())
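
For this classic parameter set the recursion can be traced by hand (a check added for illustration, assuming the definitions above are in scope): day 1 gives V = {Rainy: 0.06, Sunny: 0.24}; observing 'shop' gives V = {Rainy: 0.0384, Sunny: 0.0432}, both reached from Sunny; observing 'clean' gives V = {Rainy: 0.01344, Sunny: 0.002592}, with Rainy reached from Rainy. So example() should return (0.01344, ['Sunny', 'Rainy', 'Rainy']).

# Hand-check of the expected result (illustrative sketch):
#   day 1: V[Rainy] = 0.6 * 0.1 = 0.06,          V[Sunny] = 0.4 * 0.6 = 0.24
#   day 2: V[Rainy] = 0.24 * 0.4 * 0.4 = 0.0384, V[Sunny] = 0.24 * 0.6 * 0.3 = 0.0432
#   day 3: V[Rainy] = 0.0384 * 0.7 * 0.5 = 0.01344 (the maximum)
prob, path = example()
assert abs(prob - 0.01344) < 1e-9
assert path == ['Sunny', 'Rainy', 'Rainy']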
