In [85]:
import numpy as np
import pandas as pd
import math
import json
import sys
import time

In [126]:
# entropy of a class distribution given as a sequence of per-class counts
def entropyValCounts(vals):
    """Return the base-2 entropy of a distribution given as class counts.

    Args:
        vals: iterable of non-negative counts, one per class (for example
            the result of ``Series.value_counts()``). It is iterated twice,
            so it must not be a one-shot generator.

    Returns:
        The entropy in bits; 0 when ``vals`` is empty or sums to zero.
    """
    size = sum(vals)

    entropy = 0
    for v in vals:
        # A zero count contributes nothing (0 * log 0 is taken as 0);
        # skipping it also keeps math.log(0) from raising ValueError.
        if v == 0:
            continue
        entropy -= (v / size) * math.log(v / size, 2)
    return entropy

def splitEntropy(le, gt):
    """Return the size-weighted entropy of a two-way split.

    Args:
        le: class counts for the rows on the "<=" side of the split.
        gt: class counts for the rows on the ">" side of the split.

    Returns:
        Each side's entropy (via entropyValCounts) weighted by that side's
        share of the total count, summed over both sides.

    Raises:
        ZeroDivisionError: if both sides together contain no rows.
    """
    sizeLe = sum(le)
    sizeGt = sum(gt)
    total = sizeLe + sizeGt

    weightLe = sizeLe / total
    weightGt = sizeGt / total
    return weightLe * entropyValCounts(le) + weightGt * entropyValCounts(gt)
    
def calcGainBetter(data, attr, p0):
    """Find the binary threshold on a numeric attribute with the best gain.

    Every distinct non-missing value ``v`` of ``data[attr]`` is tried as a
    threshold; rows are partitioned into ``<= v`` and ``> v`` and the gain is
    ``p0`` minus the weighted entropy of the two partitions.

    Args:
        data: DataFrame whose last column holds the class label.
        attr: name of the column to split on.
        p0: entropy of the class column before splitting.

    Returns:
        ``(bestSplit, bestGain)``. ``bestSplit`` is None and ``bestGain`` is
        0.0 when no threshold yields a strictly positive gain.
    """
    column = data[attr]
    labels = data.iloc[:, -1]

    bestSplit = None
    bestGain = 0.0
    # NaN compares False both ways, so a NaN candidate would leave both
    # partitions empty and make splitEntropy divide by zero; drop it up front.
    for v in column.dropna().unique():
        le = labels[column <= v].value_counts()
        gt = labels[column > v].value_counts()

        # A split with an empty side separates nothing (e.g. v is the
        # maximum value), so it can never be the best split.
        if le.empty or gt.empty:
            continue

        splitGain = p0 - splitEntropy(le, gt)
        if splitGain > bestGain:
            bestGain = splitGain
            bestSplit = v

    return bestSplit, bestGain

In [114]:
def findBestSplit(data, attr, p0):
    """Return the best threshold split for ``attr``.

    Thin wrapper that forwards its arguments unchanged to calcGainBetter and
    returns that function's result.
    """
    result = calcGainBetter(data, attr, p0)
    return result

In [148]:
def selectSplittingAttr(attrs, data, threshold):
    """Pick the attribute with the highest information gain.

    Args:
        attrs: mapping of attribute name -> declared domain size. A size
            below 1 marks the attribute as numeric (split on a threshold);
            any other size marks it as categorical.
        data: DataFrame whose last column holds the class label.
        threshold: the best gain must be strictly greater than this value
            for a split to be reported.

    Returns:
        ``(bestAttr, alpha)`` where ``alpha`` is the chosen threshold for a
        numeric attribute and None for a categorical one, or None when no
        attribute has a positive gain that also exceeds ``threshold``.
    """
    p0 = entropy(data.iloc[:, -1])
    bestGain = 0
    alpha = None
    bestAttr = None

    for a in attrs:
        if attrs[a] < 1:  # numeric attribute: search for the best threshold
            tmpAlpha, tmpGain = findBestSplit(data, a, p0)
        else:
            tmpAlpha = None
            tmpGain = p0 - entropyAttr(data, a)
        if tmpGain > bestGain:
            bestAttr = a
            bestGain = tmpGain
            alpha = tmpAlpha

    # bestAttr stays None when no attribute beats a gain of 0. Without this
    # check a negative threshold would return (None, None) instead of None.
    if bestAttr is not None and bestGain > threshold:
        return bestAttr, alpha
    return None

# Notebook check: load the heart dataset and ask for the best splitting
# attribute with a gain threshold of 0.2 (the cell echoes the result).
df, filename, tmp, attrs = readFiles("./data/heart.csv")   
selectSplittingAttr(attrs, df, 0.2)

('ST_Slope', None)

In [146]:
# entropy of a series of data
def entropy(classcol):
    """Return the base-2 entropy of the values in a pandas Series.

    Args:
        classcol: Series of class labels. Missing values are ignored by both
            ``count()`` and ``value_counts()``.

    Returns:
        The entropy in bits; 0 for an empty Series.
    """
    total = classcol.count()
    result = 0
    for freq in classcol.value_counts():
        share = freq / total
        result -= share * math.log(share, 2)
    return result

# entropy of an attribute in a dataset, over each value of the attribute
def entropyAttr(data, attr):
    """Return the weighted class entropy after splitting on ``attr``.

    Args:
        data: DataFrame whose last column holds the class label.
        attr: name of the column to split on.

    Returns:
        For each distinct value of ``attr``: the entropy of the class labels
        of the rows with that value, weighted by that group's share of
        ``len(data)``, summed over all values.
    """
    classColumn = data.columns[-1]
    # One column per attribute value, holding the class label of the rows
    # that carry that value.
    byValue = data.pivot(columns=attr, values=classColumn)
    rowCount = len(data)
    return sum(
        (byValue[c].count() / rowCount) * entropy(byValue[c])
        for c in byValue.columns
    )

    # class must be in last column
def c45(data, attrs, thresh):
    """Recursively induce a decision tree as nested dictionaries.

    The class label must be in the last column of ``data``.

    Args:
        data: DataFrame of training rows.
        attrs: mapping of candidate attribute name -> declared domain size,
            in the form selectSplittingAttr expects. It is not modified.
        thresh: minimum gain forwarded to selectSplittingAttr.

    Returns:
        ``{"leaf": {"decision", "p", "type"}}`` when recursion stops, where
        ``type`` is "allsame", "noAttrs" or "threshold"; otherwise
        ``{"node": {"var", "plurality", "edges"}}`` where every entry of
        ``edges`` is ``{"edge": {"value": ..., <subtree>}}``.
    """
    classes = data.iloc[:, -1]

    # base case 1: every row carries the same class
    labels = iter(classes)
    firstclass = next(labels, None)
    if all(c == firstclass for c in labels):
        # create leaf node for perfect purity
        return {"leaf": {
            "decision": firstclass,
            "p": 1.0,
            "type": "allsame"
        }}

    # Look the count up by label: a bare [0] on the value counts is a label
    # lookup for numeric class labels and would pick the wrong class.
    decision = classes.mode()[0]
    pluralityClass = {
        "decision": _toBuiltin(decision),
        "p": float(classes.value_counts()[decision] / len(classes))
    }

    # base case 2: nothing left to split on
    if len(attrs) == 0:
        pluralityClass.update({"type": "noAttrs"})
        return {"leaf": pluralityClass}   # leaf with the most frequent class

    # select splitting attr; the selector returns (attribute, threshold)
    asplit = selectSplittingAttr(attrs, data, thresh)
    if asplit is None or asplit[0] is None:
        pluralityClass.update({"type": "threshold"})
        return {"leaf": pluralityClass}

    attr, alpha = asplit

    # Work on a copy so sibling branches and the caller still see the
    # attributes that were available at this level.
    childAttrs = dict(attrs)
    childAttrs.pop(attr, None)

    if alpha is None:
        # categorical attribute: one edge per value present in the data
        partitions = [
            ({"value": _toBuiltin(value)}, data[data[attr] == value])
            for value in data[attr].unique()
        ]
    else:
        # Numeric attribute: binary split around the chosen threshold.
        # NOTE(review): the "direction" key is this function's own way of
        # telling the two sides apart; confirm it against the tree consumer.
        threshold = _toBuiltin(alpha)
        partitions = [
            ({"value": threshold, "direction": "le"}, data[data[attr] <= alpha]),
            ({"value": threshold, "direction": "gt"}, data[data[attr] > alpha]),
        ]

    newNode = {"node": {"var": attr, "plurality": pluralityClass, "edges": []}}
    for edge, relatedData in partitions:
        if len(relatedData) == 0:   # e.g. a NaN value matches no row
            continue
        edge.update(c45(relatedData, childAttrs, thresh))
        newNode["node"]["edges"].append({"edge": edge})
    return newNode


def _toBuiltin(value):
    """Convert a numpy scalar to a plain Python scalar so json can encode it."""
    return value.item() if isinstance(value, np.generic) else value

# Reads a training set csv file and a restrictions vector text file, returns arranged training set          
def readFiles(filename=None, restrictions=None):
    """Load a training-set CSV and an optional restrictions vector.

    CSV layout read by this function: the first data row holds one integer
    per column (stored in the returned attribute map; values below 1 mark a
    column as numeric), and the first cell of the second data row holds the
    name of the class column. Both rows are removed from the returned frame.

    Args:
        filename: path of the CSV file. When both arguments are None the
            paths are taken from ``sys.argv[1]`` and, if exactly three
            arguments are present, ``sys.argv[2]``.
        restrictions: optional path of a text file with one integer per
            column, separated by commas and/or whitespace; a 0 drops the
            column at that position.

    Returns:
        ``(df, filename, isLabeled, attrs)``: the data with the class column
        moved last when the file is labeled, the path that was read, whether
        the class cell was a string, and a mapping of attribute name ->
        declared size that excludes dropped columns and the last column.

    Raises:
        ValueError: if a column declared numeric holds non-numeric data.
    """
    if filename is None and restrictions is None:
        if len(sys.argv) < 2:
            print("Not enough arguments.")
            sys.exit(1)   # the bare exit() helper is not always available
        elif len(sys.argv) == 3:
            restrictions = sys.argv[2]
        filename = sys.argv[1]

    restr = None
    if restrictions is not None:
        with open(restrictions) as r:
            # Accept "1, 0, 1", "1,0,1" and stray whitespace or newlines.
            restr = [int(x) for x in r.read().replace(',', ' ').split()]

    df = pd.read_csv(filename)
    aclass = df.iloc[1, 0]

    attrs = {a: int(df[a][0]) for a in df.columns}

    isLabeled = isinstance(aclass, str)
    df = df.drop([0, 1], axis=0)
    if restr is not None:
        dropped = [v for i, v in enumerate(df.columns) if restr[i] == 0]
        df = df.drop(columns=dropped)
        # Dropped columns must leave the attribute map too, otherwise later
        # code would look them up in a frame that no longer has them.
        for v in dropped:
            attrs.pop(v, None)

    # The metadata rows can force a numeric column to be read as strings
    # (the class name sits in the first column); restore numeric values so
    # threshold comparisons are numeric instead of lexicographic.
    df = df.copy()
    for v in df.columns:
        if attrs[v] < 1:
            df[v] = pd.to_numeric(df[v])

    if isLabeled:
        df = df[[c for c in df if c != aclass] + [aclass]]

    attrs.pop(df.columns[-1])
    return df, filename, isLabeled, attrs

# runs c45 with data from file of name training data with restrictions in filename restrictions
def induceC45(trainingData=None, restrictions=None, threshold=0.2):
    """Load the training data and print its attribute map.

    Args:
        trainingData: CSV path forwarded to readFiles.
        restrictions: restrictions-file path forwarded to readFiles.
        threshold: currently unused (see the note below).

    Returns:
        None. NOTE: the tree construction below is commented out, so no tree
        is built or returned.
    """
    _frame, _path, _labeled, attrs = readFiles(trainingData, restrictions)
    print(attrs)
    # Disabled tree construction, kept for reference:
    #     tree={"dataset": filename}
    #     tree.update(c45(df, df.columns[:-1].tolist(), threshold))
    #     return tree


# prints a decision tree
def printTree(tree):
    """Write ``tree`` to ``tree.json`` and echo it to stdout.

    The file in the current working directory receives the compact JSON
    encoding; stdout receives the same data indented by two spaces with the
    key order left as is.
    """
    with open("tree.json", 'w') as out:
        json.dump(tree, out)
    pretty = json.dumps(tree, indent=2, sort_keys=False)
    print(pretty)
    

# Script entry point: run induceC45 on the heart dataset with a gain
# threshold of 0 and hand whatever it returns to printTree.
if __name__ == "__main__":
    printTree(induceC45("./data/heart.csv", threshold=0))

{'Age': 0, 'Sex': 2, 'ChestPainType': 4, 'RestingBP': 0, 'Cholesterol': 0, 'FastingBS': 2, 'RestingECG': 3, 'MaxHR': 0, 'ExerciseAngina': 2, 'Oldpeak': 0, 'ST_Slope': 3}
null
