Commit a19ffd4 ("test")

5 files changed: +530 −0 lines changed

Apriori/AprioriTest.py

Lines changed: 128 additions & 0 deletions
#coding:utf-8


class Apriori():
    def __init__(self):
        pass

    '''
    Association analysis has two goals: finding frequent itemsets and finding association rules.
    '''

    '''
    Frequent itemsets: {...}
    A dataset containing N distinct items has 2^N - 1 possible itemset combinations.
    Support:
        The support of an itemset is the fraction of the records in the dataset that contain it.
    Apriori principle: if an itemset is frequent, then all of its subsets are frequent too.
    Equivalently, if an itemset is infrequent, then all of its supersets are infrequent.
    '''

    def _createC1(self, dataSet):
        C1 = []
        for transaction in dataSet:
            for item in transaction:
                if [item] not in C1:
                    C1.append([item])
        C1.sort()
        return [frozenset(c) for c in C1]  # use frozensets so the itemsets can be used as dict keys

    def _scanD(self, D, Ck, minSupport=0.5):
        ssCnt = {}
        for tid in D:
            for can in Ck:
                if can.issubset(tid):
                    ssCnt[can] = ssCnt.get(can, 0) + 1
        numItems = len(D)
        retList = []
        supportK = {}
        for key in ssCnt:
            support = ssCnt[key] / float(numItems)  # compute the support
            if support >= minSupport:
                retList.append(key)
            supportK[key] = support
        return retList, supportK

    def aprioriGen(self, Lk, k):  # k >= 2
        retList = []
        lenLk = len(Lk)
        for i in range(lenLk):
            for j in range(i + 1, lenLk):
                L1 = sorted(Lk[i])[:k - 2]  # sort before slicing so set iteration order cannot matter
                L2 = sorted(Lk[j])[:k - 2]
                if L1 == L2:  # if the first k-2 elements are equal; when k is 3, {0,1},{0,2},{1,2} -> {0,1} U {0,2} -> {0,1,2}
                    retList.append(Lk[i] | Lk[j])
        return retList

    def apriori(self, dataSet, minSupport=0.5):  # minSupport: minimum support
        D = [set(t) for t in dataSet]  # convert each transaction to a set
        C1 = self._createC1(dataSet)  # build C1 as a list of frozensets
        L1, supp1 = self._scanD(D, C1, minSupport)  # build L1 from C1 and minSupport
        L = []
        supportData = {}
        L.append(L1)
        supportData.update(supp1)
        k = 2
        while len(L[k - 2]) > 1:
            Ck = self.aprioriGen(L[k - 2], k)  # build Ck
            Lk, suppK = self._scanD(D, Ck, minSupport)  # build Lk from Ck and minSupport
            L.append(Lk)
            supportData.update(suppK)
            k += 1
        return L, supportData

    '''
    Association rules: X -> Y
    Confidence:
        confidence(diapers -> wine) = support({diapers, wine}) / support({diapers})
    If a rule does not meet the minimum confidence requirement, then none of the rules
    built from subsets of it will meet the minimum confidence requirement either.
    '''

    def _calcConf(self, freqSet, H, supportData, brl, minConf=0.7):  # H: candidate consequents (rule right-hand sides), e.g. {0}, {1}
        prunedH = []
        for conseq in H:
            conf = supportData[freqSet] / supportData[freqSet - conseq]  # compute the confidence
            if conf >= minConf:
                print(freqSet - conseq, '-->', conseq, 'conf:', conf)
                brl.append((freqSet - conseq, conseq, conf))
                prunedH.append(conseq)
        return prunedH

    def _rulesFromConseq(self, freqSet, H, supportData, brl, minConf=0.7):  # H: candidate consequents, e.g. {0}, {1}
        m = len(H[0])
        if len(freqSet) > (m + 1):
            Hmp1 = self.aprioriGen(H, m + 1)  # merge consequents
            Hmp = self._calcConf(freqSet, Hmp1, supportData, brl, minConf)  # Hmp: merged consequents that pass the confidence test, e.g. {0,1}
            if len(Hmp) > 1:  # if more than one consequent remains, try merging further
                self._rulesFromConseq(freqSet, Hmp, supportData, brl, minConf)

    def generateRules(self, L, supportData, minConf=0.7):  # minConf: minimum confidence
        bigRuleList = []
        for i in range(1, len(L)):  # start building rules from itemsets with two or more items
            for freqSet in L[i]:
                H1 = [frozenset([item]) for item in freqSet]  # single-item consequents, e.g. {0}, {1}
                if i > 1:
                    self._rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)  # generate candidate rules
                else:
                    self._calcConf(freqSet, H1, supportData, bigRuleList, minConf)  # evaluate the rules
        return bigRuleList


def loadDataSet():
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]


if __name__ == '__main__':
    dataSet = loadDataSet()
    ap = Apriori()
    L, suppData = ap.apriori(dataSet, minSupport=0.5)
    print(L)
    print(suppData)
    rules = ap.generateRules(L, suppData, minConf=0.6)
    print(rules)
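
As a quick sanity check (an illustrative sketch added here, not part of the commit; the support() helper is hypothetical), the numbers for the sample dataset can be verified by hand: {2, 5} appears in 3 of the 4 transactions, so support({2, 5}) = 0.75, and confidence({5} -> {2}) = support({2, 5}) / support({5}) = 0.75 / 0.75 = 1.0, so the rule {5} --> {2} survives even a high confidence threshold.

# Hand-check of support and confidence on the sample transactions (illustrative sketch).
dataSet = [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]

def support(itemset, transactions):
    # fraction of transactions containing every item of itemset
    hits = sum(1 for t in transactions if set(itemset) <= set(t))
    return hits / float(len(transactions))

s_25 = support({2, 5}, dataSet)  # 3 of 4 transactions -> 0.75
s_5 = support({5}, dataSet)      # 3 of 4 transactions -> 0.75
print(s_25, s_5, s_25 / s_5)     # 0.75 0.75 1.0, i.e. confidence({5} -> {2}) = 1.0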

DesicionTree/DesicionTreeTest.py

Lines changed: 128 additions & 0 deletions
#coding:utf-8

import math


class DesicionTree():
    def __init__(self):
        pass

    def _calcShannonEnt(self, dataSet):  # compute the entropy of the dataset
        numEntries = len(dataSet)
        classCounts = {}
        for data in dataSet:
            currentLabel = data[-1]
            classCounts[currentLabel] = classCounts.get(currentLabel, 0) + 1
        '''
        Information of an outcome: -log2(pi)
        Entropy: the expected value of the information, sum(-pi * log2(pi))
        '''
        shannonEnt = 0.0
        for key in classCounts:
            prob = classCounts[key] / float(numEntries)
            shannonEnt -= prob * math.log(prob, 2)  # log base 2
        return shannonEnt

    def _splitDataSet(self, dataSet, axis, value):
        retDataSet = []
        for data in dataSet:
            if data[axis] == value:
                reduceddata = data[:axis]
                reduceddata.extend(data[axis + 1:])
                retDataSet.append(reduceddata)
        return retDataSet

    def _chooseBestFeatureToSplit(self, dataSet):
        numFeatures = len(dataSet[0]) - 1  # the last column is the class label
        baseEntropy = self._calcShannonEnt(dataSet)
        bestInfoGain = 0
        bestFeature = -1
        for i in range(numFeatures):  # iterate over all the features
            featList = [data[i] for data in dataSet]
            values = set(featList)
            '''
            Conditional entropy: sum(pj * entropy of the j-th subset)
            '''
            # compute the conditional entropy of the dataset given this feature
            newEntropy = 0.0
            for value in values:
                subDataSet = self._splitDataSet(dataSet, i, value)
                prob = len(subDataSet) / float(len(dataSet))
                newEntropy += prob * self._calcShannonEnt(subDataSet)
            '''
            Information gain = entropy - conditional entropy
            '''
            infoGain = baseEntropy - newEntropy
            if infoGain > bestInfoGain:
                bestInfoGain = infoGain
                bestFeature = i
        return bestFeature

    def _majorityCnt(self, classList):
        classCount = {}
        for vote in classList:
            classCount[vote] = classCount.get(vote, 0) + 1
        sortedClassCount = sorted(classCount.items(), key=lambda xx: xx[1], reverse=True)
        return sortedClassCount[0][0]

    def fit(self, dataSet, featLabels):
        classList = [data[-1] for data in dataSet]
        if classList.count(classList[0]) == len(classList):
            return classList[0]  # all class labels are identical: return that label
        if len(dataSet[0]) == 1:  # labels differ but all features are used up: return the majority label
            return self._majorityCnt(classList)
        bestFeat = self._chooseBestFeatureToSplit(dataSet)
        bestFeatLabel = featLabels[bestFeat]
        tree = {bestFeatLabel: {}}
        featLabels_copy = featLabels[:]  # copy so the caller's featLabels is not modified
        featLabels_copy.remove(bestFeatLabel)
        featList = [data[bestFeat] for data in dataSet]
        values = set(featList)
        for value in values:
            subfeatLabels_copy = featLabels_copy[:]  # copy the list rather than sharing a reference
            tree[bestFeatLabel][value] = self.fit(self._splitDataSet(dataSet, bestFeat, value), subfeatLabels_copy)
        return tree

    def predict(self, tree, featLabels, testVec):
        firstStr = list(tree.keys())[0]
        secondDict = tree[firstStr]
        featIndex = featLabels.index(firstStr)
        key = testVec[featIndex]
        valueOfFeat = secondDict[key]
        if isinstance(valueOfFeat, dict):
            classLabel = self.predict(valueOfFeat, featLabels, testVec)
        else:
            classLabel = valueOfFeat
        return classLabel


def loadDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    featLabels = ['no surfacing', 'flippers']  # feature labels
    return dataSet, featLabels


if __name__ == '__main__':
    myDataSet, myFeatLabels = loadDataSet()
    print(myDataSet, myFeatLabels)
    dt = DesicionTree()
    myTree = dt.fit(myDataSet, myFeatLabels)
    print(myTree)
    results = dt.predict(myTree, myFeatLabels, [1, 1])
    print(results)
    results = dt.predict(myTree, myFeatLabels, [0, 1])
    print(results)
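
For reference (an illustrative sketch added here, not part of the commit; the entropy() helper is hypothetical), the numbers behind the first split can be recomputed directly: the sample set has 2 'yes' and 3 'no' labels, giving a base entropy of about 0.971; splitting on 'no surfacing' yields an information gain of about 0.420 versus about 0.171 for 'flippers', which is why 'no surfacing' becomes the root of the printed tree.

import math

# Hand-check of the entropy and information-gain numbers (illustrative sketch).
def entropy(labels):
    counts = {}
    for label in labels:
        counts[label] = counts.get(label, 0) + 1
    n = float(len(labels))
    return -sum(c / n * math.log(c / n, 2) for c in counts.values())

base = entropy(['yes', 'yes', 'no', 'no', 'no'])  # ~0.971
# splitting on 'no surfacing' gives the subsets {1: [yes, yes, no], 0: [no, no]}
gain_surfacing = base - (3 / 5.0 * entropy(['yes', 'yes', 'no']) + 2 / 5.0 * entropy(['no', 'no']))
# splitting on 'flippers' gives the subsets {1: [yes, yes, no, no], 0: [no]}
gain_flippers = base - (4 / 5.0 * entropy(['yes', 'yes', 'no', 'no']) + 1 / 5.0 * entropy(['no']))
print(gain_surfacing)  # ~0.420, the larger gain, so 'no surfacing' is chosen first
print(gain_flippers)   # ~0.171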

HMM/ViterbiTest.py

Lines changed: 101 additions & 0 deletions
# -*- coding: utf-8 -*-
'''
An HMM (hidden Markov model) is a statistical model for describing hidden, unknown parameters.
A classic example:
A friend in Tokyo chooses one activity each day, {walk in the park, shop, clean the room},
depending on that day's weather {rainy, sunny}.
All I can see are her tweets: "Ah, I walked in the park the day before yesterday,
went shopping yesterday, and cleaned my room today!"
From those tweets I can infer Tokyo's weather over those three days.
In this example, the activities are the observed states and the weather is the hidden state.
Finding the most likely hidden state sequence is one of the three canonical HMM problems,
and it is usually solved with the Viterbi algorithm.
The Viterbi algorithm finds the shortest path (in -log(prob), i.e. the maximum probability)
through the HMM.
'''

# HMM description: lambda = (states, observations, start_probability, transition_probability, emission_probability)
states = ('Rainy', 'Sunny')

observations = ('walk', 'shop', 'clean')

start_probability = {'Rainy': 0.6, 'Sunny': 0.4}

transition_probability = {
    'Rainy': {'Rainy': 0.7, 'Sunny': 0.3},
    'Sunny': {'Rainy': 0.4, 'Sunny': 0.6},
}

emission_probability = {
    'Rainy': {'walk': 0.1, 'shop': 0.4, 'clean': 0.5},
    'Sunny': {'walk': 0.6, 'shop': 0.3, 'clean': 0.1},
}

# print the path-probability table
def print_dptable(V):
    print('', end=' ')
    for t in range(len(V)):
        print("%7d" % t, end=' ')
    print('')
    for y in V[0].keys():
        print("%.5s:" % y, end=' ')
        for t in range(len(V)):
            print("%.7s" % ("%f" % V[t][y]), end=' ')
        print('')

def viterbi(stas, obs, start_p, trans_p, emit_p):
    '''
    :param stas: hidden states
    :param obs: observation sequence
    :param start_p: initial probabilities (of the hidden states)
    :param trans_p: transition probabilities (between hidden states)
    :param emit_p: emission probabilities (probability of a hidden state producing an observed state)
    :return:
    Idea:
    Define V[time][today's weather] = probability, where "today's weather is X" means the weather
    on the preceding days has already been fixed to its most likely values; this probability is a
    running product.
    Because my friend went for a walk on day one, V[day 1][rainy] = start[rainy] * emit[rainy][walk]
    = 0.6 * 0.1 = 0.06, and likewise V[day 1][sunny] = 0.24. Intuitively, since she went out on day
    one and she prefers walking when it is sunny, sunny should be the more likely weather; the
    numbers agree with the intuition.
    From day two on, each weather Y gets (probability the previous day's weather was X) *
    (probability of X transitioning to Y) * (probability of that day's activity under Y).
    Since the previous day's weather X has two possibilities, Y gets two candidate probabilities;
    keep the larger one as V[day 2][Y] and append today's weather to the result sequence.
    Finally, compare V[last day][rainy] with V[last day][sunny] and take the sequence behind the
    larger one; that is the answer.
    '''

    # path-probability table: V[time][hidden state] = probability
    V = [{}]
    # an intermediate variable mapping each hidden state to the best path ending in it
    path = {}

    # initialization (t == 0)
    for y in stas:
        V[0][y] = start_p[y] * emit_p[y][obs[0]]
        path[y] = [y]  # record the initial path; the key is the state y
    print(V)
    print(path)

    # run the Viterbi recursion (t > 0)
    for t in range(1, len(obs)):
        V.append({})

        new_path = {}
        for y in stas:
            '''hidden-state probability = P(previous state is y0) * P(y0 -> y) * P(y emits the current observation)'''
            # the maximum probability for y and the corresponding previous state sta
            (prob, sta) = max([(V[t - 1][y0] * trans_p[y0][y] * emit_p[y][obs[t]], y0) for y0 in stas])
            # record the maximum hidden-state probability
            V[t][y] = prob
            # record the path
            new_path[y] = path[sta] + [y]  # record the current path; the key is the state y
        print(V)
        print(new_path)

        # no need to keep the old paths
        path = new_path

    print_dptable(V)

    # find the most probable final state
    (prob, sta) = max([(V[len(obs) - 1][y], y) for y in stas])
    return prob, path[sta]

def example():
    return viterbi(states,
                   observations,
                   start_probability,
                   transition_probability,
                   emission_probability)

print(example())
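
For this classic parameter set the recursion can be traced by hand (a check added for illustration, assuming the definitions above are in scope): day 1 gives V = {Rainy: 0.06, Sunny: 0.24}; observing 'shop' gives V = {Rainy: 0.0384, Sunny: 0.0432}, both reached from Sunny; observing 'clean' gives V = {Rainy: 0.01344, Sunny: 0.002592}, with Rainy reached from Rainy. So example() should return (0.01344, ['Sunny', 'Rainy', 'Rainy']).

# Hand-check of the expected result (illustrative sketch):
#   day 1: V[Rainy] = 0.6 * 0.1 = 0.06,          V[Sunny] = 0.4 * 0.6 = 0.24
#   day 2: V[Rainy] = 0.24 * 0.4 * 0.4 = 0.0384, V[Sunny] = 0.24 * 0.6 * 0.3 = 0.0432
#   day 3: V[Rainy] = 0.0384 * 0.7 * 0.5 = 0.01344 (the maximum)
prob, path = example()
assert abs(prob - 0.01344) < 1e-9
assert path == ['Sunny', 'Rainy', 'Rainy']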
