# Lab 3: Supervised Learning - Classification Algorithms

### Aidan Barbieux and Eric Inman

In [59]:
import numpy as np
import pandas as pd
import math
import json

## C4.5 Algorithm

In [43]:
# entropy of a series of data
def entropy(classcol):
    vals = classcol.value_counts()
    size = classcol.count()
    entropy=0
    for v in vals:
        entropy -= (v/size) * math.log(v/size,2)
    return entropy

# entropy of an attribute in a dataset, over each value of the attribute
def entropyAttr(data, attr):
    vals = data.pivot(columns=attr,values=data.columns[-1])
    entropyTot = 0
    for c in vals.columns:
        entropyTot += (vals[c].count()/len(data)) * entropy(vals[c])
    return entropyTot

In [118]:
def selectSplittingAttr(attrs, data, threshold):
    p0 = entropy(data.iloc[:,-1])
    gain={}
    for a in attrs:
        gain[a] = p0 - entropyAttr(data, a) # info gain
    
    bestAttr=max(gain,key=gain.get)
    if gain[bestAttr] > threshold:
        return bestAttr
    else:
        return None

### C4.5

In [129]:
# class must be in last column
def c45(data, attrs, tree, thresh):
    # base case 1
    #print(data)
    classes = data.iloc[:,-1]
    firstclass = None
    allsame=True
    for c in classes:
        if c == None:
            firstclass = c
        elif c != firstclass:
            allsame=False
            break
            
    if allsame:
        #create leaf node for perfect purity
        tree["leaf"] = {
            "decision": firstclass,
            "p": 1.0
        }
    
    # base case 2
    if len(attrs) == 0:
        # create leaf node with most frequent class
        tree["leaf"] = {
            "decision": classes.mode()[0],
            "p": classes.value_counts()[classes.mode()][0]/len(classes)
        }
    
    # select splitting attr
    asplit = selectSplittingAttr(attrs, data, thresh)
    if asplit == None:
        tree["leaf"] = {
            "decision": classes.mode()[0],
            "p": classes.value_counts()[classes.mode()][0]/len(classes)
        }
        
    else:
        tree["node"] = {"var": asplit, "edges": []}
        possibleValues = data[asplit].unique()                # gets unique values in column
        for value in possibleValues:
            relatedData = data[(data == value).any(axis = 1)] # take rows that have that value
            relatedData = relatedData.drop(asplit, axis=1)    # remove the attribute from the data
            if len(relatedData.columns) != 1:
                treev = {"value": value}
                c45(relatedData, relatedData.columns[:-1], treev, thresh)
                edge = {"edge": treev}
                tree["node"]['edges'].append(edge)
                
        
        # populate node w c45 recursively, add newNode to tree

df = readArrange('agaricus-lepiota.csv')

tree={"dataset": 'agaricus-lepiota.csv'}
c45(df, df.columns[:-1], tree, 0.8)
print(json.dumps(tree, sort_keys=False, indent=4))
 

{
    "dataset": "agaricus-lepiota.csv",
    "node": {
        "var": "odor",
        "edges": [
            {
                "edge": {
                    "value": "p",
                    "leaf": {
                        "decision": "e",
                        "p": 0.517971442639094
                    }
                }
            },
            {
                "edge": {
                    "value": "a",
                    "leaf": {
                        "decision": "e",
                        "p": 0.9818913480885312
                    }
                }
            },
            {
                "edge": {
                    "value": "l",
                    "leaf": {
                        "decision": "p",
                        "p": 0.7468354430379747
                    }
                }
            },
            {
                "edge": {
                    "value": "n",
                    "leaf": {
                        "decision": "e",
               

In [98]:
%%capture cap --no-stderr
print(json.dumps(tree, sort_keys=False, indent=4))

with open('output.txt', 'w') as f:
    f.write(cap.stdout)
    
# saves the output to a file

In [3]:
def readArrange(filename):
    df = pd.read_csv(filename)
    aclass = df.iloc[1,0]
    df = df.drop([0,1], axis=0)
    df = df[[c for c in df if c not in [aclass]] + [aclass]]
    return df

## Data