In [4]:
from sklearn import datasets
import numpy as np
from sklearn.model_selection import train_test_split
def creatdataset():
    iris = datasets.load_iris() #加载 iris 数据集
    iris_feature = iris.data #特征数据
    iris_target = iris.target #分类数据
    labels = ['sepal length', 'sepal width', 'petal length', 'petal width'] # 定义标签
    
    # 划分训练集和测试集
    feature_train, feature_test, target_train, target_test = train_test_split(iris_feature, iris_target, test_size=0.33, random_state=42)
    
    # 对训练集和测试集做简单处理，对连续float值转化为int处理，离散化连续变量
    traindata = np.rint( np.column_stack((feature_train,target_train)) )
    testdata = np.rint( np.column_stack((feature_test,target_test)) )
    return traindata,testdata
creatdataset()

(array([[6., 3., 4., 1., 1.],
        [8., 3., 7., 2., 2.],
        [6., 3., 4., 2., 1.],
        [5., 4., 1., 0., 0.],
        [8., 3., 7., 2., 2.],
        [6., 3., 4., 1., 1.],
        [5., 3., 1., 0., 0.],
        [5., 4., 1., 0., 0.],
        [5., 4., 2., 0., 0.],
        [5., 2., 4., 1., 1.],
        [6., 3., 5., 2., 2.],
        [5., 3., 2., 0., 0.],
        [5., 3., 2., 0., 0.],
        [5., 3., 2., 0., 0.],
        [6., 3., 4., 1., 1.],
        [5., 3., 2., 0., 0.],
        [6., 3., 4., 1., 1.],
        [8., 4., 7., 2., 2.],
        [5., 3., 1., 0., 0.],
        [6., 3., 4., 1., 1.],
        [6., 2., 5., 2., 2.],
        [6., 4., 1., 0., 0.],
        [6., 3., 5., 2., 2.],
        [6., 3., 5., 2., 2.],
        [6., 2., 4., 1., 1.],
        [5., 3., 4., 2., 1.],
        [6., 3., 5., 2., 2.],
        [6., 2., 4., 1., 1.],
        [5., 4., 2., 0., 0.],
        [5., 2., 3., 1., 1.],
        [6., 3., 5., 2., 2.],
        [5., 3., 1., 0., 0.],
        [5., 3., 1., 0., 0.],
        [6

In [10]:
from math import log

def calcShannonEnt(dataSet):
    numEntries = len(dataSet) #nrows
    #为所有的分类类目创建字典
    labelCounts ={}
    for featVec in dataSet:
        currentLable=featVec[-1] #取得最后一列数据
        if currentLable not in labelCounts.keys():
            labelCounts[currentLable]=0
        labelCounts[currentLable]+=1
    #计算香农熵
    shannonEnt=0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

def splitDataSet(dataSet, axis, value):
    """
    划分数据集, 提取所有满足一个特征的值
    @ param dataSet: 数据集
    @ param axis: 划分数据集的特征
    @ param value: 提取出来满足某特征的list
    """
    retDataSet = []
    for featVec in dataSet:
        # 将相同数据特征的提取出来
        if featVec[axis] == value:
            reducedFeatVec = list(featVec[:axis])
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return np.array(retDataSet)

def chooseBestFeature(dataSet):
    """
    选择最优的划分属性
    @ param dataSet: 数据集
    @ return bestFeature: 最佳划分属性
    """
    # 属性的个数
    numFeature = len(dataSet[0])-1
    baseEntroy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeature):
        
        # 获取第i个特征所有可能的取值
        featureList = [example[i] for example in dataSet]
        
        # 去除重复值
        uniqueVals = set(featureList)
        
        newEntropy = 0.0
        
        for value in uniqueVals:
            
            subDataSet = splitDataSet(dataSet, i, value)
            
            # 特征为i的数据集占总数的比例
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        inforGain = baseEntroy - newEntropy
        
        if inforGain > bestInfoGain:
            bestInfoGain = inforGain
            bestFeature = i
    return bestFeature

def majorityCnt(classList):
    """
    递归构建决策树
    @ param classList: 类别列表
    @ return sortedClassCount[0][0]: 出现次数最多的类别
    """
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    # 排序
    sortedClassCount = sorted(classCount.items(), key=lambda item:item[1], reverse=True)
    # 返回出现次数最多的
    print("当前类别：",sortedClassCount[0][0])
    return sortedClassCount[0][0]

def createTree(dataSet, labels):
    """
    构造决策树
    @ param dataSet: 数据集
    @ param labels: 标签集
    @ return myTree: 决策树
    """
    classList = [example[-1] for example in dataSet]

    # 当类别与属性完全相同时停止
    if classList.count(classList[0]) == len(classList):
        print("同类别：",classList[0])
        return classList[0]
    
    print("当前决策树层次：",len(dataSet[0])-1)
    print("当前结点个数：",len(dataSet))
    # 遍历完所有特征值时，返回数量最多的
    if (len(dataSet[0]) == 1):
        print("多类别：",majorityCnt(classList))
        return majorityCnt(classList)
    
    # 获取最佳划分属性
    bestFeat = chooseBestFeature(dataSet)
    print("最佳划分特征下标：",bestFeat)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel:{}}
    
    # 清空labels[bestFeat]
    del(labels[bestFeat])
    
    # 取出此特征的所有取值
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    print("特征范围：",uniqueVals)
    
    for value in uniqueVals:
        print("划分特征取值：",value)
        subLabels = labels[:]
        print("剩余特征：",labels)
        #print(splitDataSet(dataSet, bestFeat, value))
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

mytree = createTree(traindata,labels)

当前决策树层次： 4
当前结点个数： 100
最佳划分特征下标： 2
特征范围： {1.4, 1.3, 1.9, 4.2, 4.5, 6.6, 6.7, 4.1, 3.5, 4.9, 1.5, 4.3, 5.0, 5.4, 6.1, 3.0, 4.0, 4.6, 4.4, 5.1, 5.6, 5.5, 5.9, 6.0, 1.6, 1.1, 1.7, 1.2, 3.3, 3.8, 4.8, 4.7, 5.3, 5.7, 5.2, 5.8, 6.3, 3.9}
划分特征取值： 1.4
剩余特征： ['sepal length', 'sepal width', 'petal width']
同类别： 0.0
划分特征取值： 1.3
剩余特征： ['sepal length', 'sepal width', 'petal width']
同类别： 0.0
划分特征取值： 1.9
剩余特征： ['sepal length', 'sepal width', 'petal width']
同类别： 0.0
划分特征取值： 4.2
剩余特征： ['sepal length', 'sepal width', 'petal width']
同类别： 1.0
划分特征取值： 4.5
剩余特征： ['sepal length', 'sepal width', 'petal width']
当前决策树层次： 3
当前结点个数： 3
最佳划分特征下标： 0
特征范围： {5.4, 4.9, 5.6}
划分特征取值： 5.4
剩余特征： ['sepal width', 'petal width']
同类别： 1.0
划分特征取值： 4.9
剩余特征： ['sepal width', 'petal width']
同类别： 2.0
划分特征取值： 5.6
剩余特征： ['sepal width', 'petal width']
同类别： 1.0
划分特征取值： 6.6
剩余特征： ['sepal length', 'sepal width', 'petal width']
同类别： 2.0
划分特征取值： 6.7
剩余特征： ['sepal length', 'sepal width', 'petal width']
同类别： 2.0
划分特征取值： 4.1
剩余特征： ['sepal leng

In [12]:
#将测试数据应用到已经训好的决策树上
def classify(inputTree,featLabels,testVec):
    firstStr = inputTree.keys()[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict): 
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else: classLabel = valueOfFeat
    return classLabel

def TestFunc(dTrees ,featLabels ,testData):
    for temp in testData:
        classLabel = classify(dTrees,featLabels,temp)
        print ("%s\t%s" % (temp,classLabel))


featLabels =  ['sepal length', 'sepal width', 'petal length', 'petal width']
#test Decision Tree
TestFunc(mytree,featLabels,feature_test) 

TypeError: 'dict_keys' object does not support indexing