In [57]:
import numpy as np
from math import log
import operator
from collections import Counter
import pandas as pd

In [32]:
# Parse the tab-separated lenses table into a list of string rows.
# The context manager closes the file handle as soon as parsing is done;
# previously the handle opened here was never closed.
with open("lenses.txt") as input_file:
    lenses = [inst.strip().split('\t') for inst in input_file]

In [34]:
# Sanity check: (rows, columns) of the parsed table.
np.shape(lenses)

(24, 5)

In [35]:
# Name lists kept for reference; neither is used by the cells visible below.
lensesLabels = [
    "age",
    "prescript",
    "astigmatic",
    "tearRate",
]
carLabels = ["unacc", "acc", "good", "vgood"]

In [17]:
def calc_ent(data):
    """Return the Shannon entropy (in bits) of the class labels in ``data``.

    The class label of each sample is read from the last column.

    Parameters
    ----------
    data : array-like, 2-D
        One row per sample, class label in the last column. Plain lists of
        rows are accepted and converted with ``np.asarray``.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed label frequencies ``p``.
        ``0.0`` is returned when ``data`` contains no rows.
    """
    data = np.asarray(data)
    if data.shape[0] == 0:
        # No samples: nothing to be uncertain about, and indexing the last
        # column of an empty 1-D array would raise.
        return 0.0
    label_counts = Counter(data[:, -1])
    counts = np.array(list(label_counts.values()))
    probs = counts / counts.sum()
    # Every count is >= 1, so log2 never sees a zero probability.
    return -(probs * np.log2(probs)).sum()

In [18]:
def splitDataSet(data, axis, value):
    """Return the rows of ``data`` whose column ``axis`` equals ``value``,
    with that column removed.

    Parameters
    ----------
    data : np.ndarray, 2-D
        Table to partition.
    axis : int
        Index of the column to test and then drop.
    value : object
        Value the column must equal for a row to be kept.

    Returns
    -------
    np.ndarray
        A new array with the matching rows and one fewer column; ``data``
        itself is not modified.
    """
    row_mask = data[:, axis] == value
    return np.delete(data[row_mask], axis, axis=1)

In [19]:
def chooseBestFeatureToSplit(data):
    """Pick the feature column whose split gives the largest information gain.

    Every column except the last one (the class label) is a candidate. For
    each candidate the rows are partitioned by distinct column value and the
    size-weighted entropy of the partitions is subtracted from the entropy of
    ``data``.

    Parameters
    ----------
    data : np.ndarray, 2-D
        Samples as rows, class label in the last column.

    Returns
    -------
    int
        Index of the best column, or ``-1`` when no candidate achieves a
        strictly positive gain (including when there are no feature columns).
    """
    total_rows = data.shape[0]
    parent_entropy = calc_ent(data)
    winner, winner_gain = -1, 0.0
    for col in range(data.shape[1] - 1):
        # Size-weighted entropy of the partitions induced by this column.
        children_entropy = sum(
            (
                (part.shape[0] * 1.0 / total_rows) * calc_ent(part)
                for part in (
                    splitDataSet(data, col, val) for val in set(data[:, col])
                )
            ),
            0.0,
        )
        gain = parent_entropy - children_entropy
        # Strict comparison: on ties the earliest column is kept.
        if gain > winner_gain:
            winner, winner_gain = col, gain
    return winner

In [20]:
def majorityCnt(classList):
    """Return the most frequent label in ``classList``.

    Ties are resolved however ``Counter.most_common`` orders them. An empty
    ``classList`` raises ``IndexError``.
    """
    tally = Counter(classList)
    top_label, _top_count = tally.most_common(1)[0]
    return top_label

In [48]:
def createTree(data, labels_ori):
    """Recursively build a decision tree as nested dictionaries.

    Parameters
    ----------
    data : np.ndarray, 2-D
        Samples as rows; feature columns first, class label in the last column.
    labels_ori : list
        Feature names, positionally aligned with the feature columns of
        ``data``. Extra trailing entries (e.g. the class column's name) are
        harmless. The list is copied, never mutated.

    Returns
    -------
    dict or label
        Either a class label (leaf) or ``{feature_name: {feature_value:
        subtree}}`` where each ``subtree`` is again a leaf or a dict.
    """
    labels = labels_ori[:]  # local copy so the caller's list stays intact
    classList = data[:, -1]

    # Leaf: every remaining sample carries the same class.
    if Counter(classList)[classList[0]] == classList.shape[0]:
        return classList[0]

    # Leaf: only the class column is left, so there is nothing to split on.
    # (The previous check tested the row count, which the purity test above
    # already covers, and let this case fall through to an invalid split.)
    if data.shape[1] == 1:
        return majorityCnt(classList)

    bestFeat = chooseBestFeatureToSplit(data)
    # Leaf: -1 means no feature yields a positive information gain. Using it
    # as an index would silently split on the last (class) column.
    if bestFeat < 0:
        return majorityCnt(classList)

    bestFeatLabel = labels[bestFeat]
    del labels[bestFeat]  # keep names aligned with the reduced column set

    branches = {}
    for value in set(data[:, bestFeat]):
        # The recursive call copies ``labels`` itself, so siblings cannot
        # interfere with one another.
        branches[value] = createTree(splitDataSet(data, bestFeat, value), labels)
    return {bestFeatLabel: branches}

In [49]:
# Load the training table; createTree below treats its last column as the class label.
data_ori = pd.read_csv("traindata.txt")

In [53]:
# Fit the tree on the whole training frame; the column names serve as the feature labels.
res = createTree(np.array(data_ori), list(data_ori.columns))

In [79]:
def classify(inputTree, featLabels, testVec, default='Unknown'):
    """Walk a nested-dict decision tree and return the label for one sample.

    Parameters
    ----------
    inputTree : dict
        Tree of the form ``{feature_name: {feature_value: subtree}}`` where a
        ``subtree`` is either another such dict or a class label. Any ``dict``
        subclass is treated as an internal node.
    featLabels : list
        Feature names; ``featLabels.index(name)`` gives the position of that
        feature inside ``testVec``.
    testVec : sequence
        Feature values of the sample to classify.
    default : object, optional
        Returned when the sample's value has no matching branch at some node.
        Defaults to ``'Unknown'``.

    Returns
    -------
    object
        The class label stored at the reached leaf, or ``default``.

    Raises
    ------
    ValueError
        If the tree tests a feature that is not listed in ``featLabels``.
    """
    feature_name = list(inputTree)[0]  # each node has exactly one feature key
    branches = inputTree[feature_name]
    feature_value = testVec[featLabels.index(feature_name)]
    try:
        outcome = branches[feature_value]
    except (KeyError, TypeError):
        # KeyError: value has no matching branch in this node.
        # TypeError: unhashable value, which can never be a branch key.
        return default
    if isinstance(outcome, dict):
        return classify(outcome, featLabels, testVec, default)
    return outcome

In [80]:
# Load the rows that will be classified with the fitted tree.
test_data = pd.read_csv("testdata.txt")

In [81]:
# Preview the first rows to check how the columns were parsed.
test_data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,vhigh,vhigh,2,2,small,med
1,vhigh,vhigh,2,2,med,med
2,vhigh,vhigh,2,2,med,high
3,vhigh,vhigh,2,4,small,high
4,vhigh,vhigh,2,4,med,med


In [82]:
# A single row as a plain list: the form in which rows are passed to classify as testVec.
list(test_data.iloc[0])

['vhigh', 'vhigh', '2', '2', 'small', 'med']

In [84]:
# Classify every test row; each row is handed to classify as a plain list.
pred_l = [
    classify(res, list(data_ori.columns), list(test_data.iloc[row_idx]))
    for row_idx in range(test_data.shape[0])
]

In [86]:
from collections import Counter

In [87]:
# Tally the predicted labels; 'Unknown' is classify's fallback when no branch matches a row's value.
Counter(pred_l)

Counter({'Unknown': 35, 'acc': 101, 'good': 12, 'unacc': 358, 'vgood': 13})