## Classifier

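Takes a decision tree in JSON form (the output of tree induction) and a CSV data file, classifies every record in the CSV with the tree, and reports the accuracy and error rate.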

In [2]:
import numpy as np
import pandas as pd
import json
import sys

In [3]:
def readArrange(filename):
    """Read a CSV file and return its records with the class column last.

    The cell in the second data row, first column is taken as the name of
    the class column. The first two data rows (index labels 0 and 1) are
    then removed -- presumably they carry metadata rather than records;
    confirm against the file format. The original index labels of the
    remaining rows are kept.

    Args:
        filename: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        A DataFrame of the remaining rows whose last column is the class
        column.
    """
    raw = pd.read_csv(filename)
    class_name = raw.iloc[1, 0]
    records = raw.drop(index=[0, 1])
    # Keep the original column order for everything except the class column.
    feature_names = [name for name in records.columns if name != class_name]
    return records[feature_names + [class_name]]

In [6]:
def readFiles(file1=None, file2=None):
    """Load the CSV dataset and the JSON decision tree.

    When both arguments are omitted, the paths are taken from the command
    line: ``sys.argv[1]`` is the CSV file and ``sys.argv[2]`` the JSON file.

    Args:
        file1: Path of the CSV file; it is loaded with ``readArrange``.
        file2: Path of the JSON file holding the tree.

    Returns:
        A ``(data, tree)`` tuple: the DataFrame returned by ``readArrange``
        and the object parsed from the JSON file (a dictionary for a JSON
        object).

    Raises:
        ValueError: If exactly one of the two paths is supplied.
        SystemExit: With status 1 when the paths must come from the command
            line and the number of arguments is not exactly two.
    """
    if file1 is None and file2 is None:
        if len(sys.argv) != 3:
            # Covers both too few and too many arguments.
            print(
                "Expected exactly 2 arguments: <csv file> <json file>",
                file=sys.stderr,
            )
            # sys.exit is always available; the bare exit() helper is only
            # injected by the site module.
            sys.exit(1)
        file1, file2 = sys.argv[1], sys.argv[2]
    elif file1 is None or file2 is None:
        # Fail early with a clear message instead of an obscure error from
        # pandas or open() further down.
        raise ValueError("file1 and file2 must be given together or not at all")

    data = readArrange(file1)
    with open(file2, encoding="utf-8") as f:
        tree = json.load(f)

    return data, tree

In [31]:
def traverseTree(row, tree, nodeType):
    """Walk the decision tree for one record and return its decision.

    Args:
        row: Mapping from attribute name to value (e.g. a DataFrame row).
        tree: The current tree element: for ``"node"`` a dict with ``"var"``
            and ``"edges"``; for ``"leaf"`` a dict with ``"decision"``.
        nodeType: ``"node"`` or ``"leaf"``, describing ``tree``.

    Returns:
        The ``"decision"`` of the leaf that is reached, or ``None`` when no
        edge matches the record's value or ``nodeType`` is neither
        ``"node"`` nor ``"leaf"``.
    """
    current, kind = tree, nodeType

    # Descend while we are on an internal node.
    while kind == "node":
        observed = row[current["var"]]
        branch = next(
            (item["edge"] for item in current["edges"]
             if item["edge"]["value"] == observed),
            None,
        )
        if branch is None:
            # No outgoing edge carries this attribute value.
            return None
        # An edge holds either a "leaf" entry or a nested "node" entry.
        kind = "leaf" if "leaf" in branch else "node"
        current = branch[kind]

    if kind == "leaf":
        return current["decision"]
    return None

In [59]:
def classify(csvFile="nursery.csv", jsonFile="output.json"):
    """Classify every record of a CSV file with a JSON decision tree.

    Loads the data and tree through ``readFiles``, predicts a class for each
    row with ``traverseTree`` (starting at ``tree["node"]``), and compares
    the prediction with the value in the last column of the data. Prints the
    records with an added ``Prediction`` column, followed by the counts, the
    accuracy and the error rate.

    Args:
        csvFile: Path of the CSV data file. Defaults to ``"nursery.csv"``.
        jsonFile: Path of the JSON tree file. Defaults to ``"output.json"``.

    Returns:
        None. All results are printed.
    """
    data, tree = readFiles(csvFile, jsonFile)
    classColumn = data.columns[-1]

    numErrors = 0
    numCorrect = 0
    out = []
    for _, row in data.iterrows():
        prediction = traverseTree(row, tree["node"], "node")
        out.append(list(row) + [prediction])

        if prediction != row[classColumn]:
            numErrors += 1
        else:
            numCorrect += 1

    totalClassified = len(out)

    # With no records there is nothing to divide by; report both rates as 0
    # instead of raising ZeroDivisionError.
    accuracy = numCorrect / totalClassified if totalClassified else 0
    errorRate = numErrors / totalClassified if totalClassified else 0

    cols = list(data.columns) + ["Prediction"]

    print(pd.DataFrame(out, columns=cols))
    print("Total Records Classifed: ", totalClassified)
    print("Total Classified Correctly: ", numCorrect)
    print("Total Classified Incorrectly: ", numErrors)
    print("Accuracy: ", accuracy)
    print("Error Rate: ", errorRate)
    
# Run the classification and print its report.
classify()

          parents   has_nurs      form children     housing     finance  \
0           usual     proper  complete        1  convenient  convenient   
1           usual     proper  complete        1  convenient  convenient   
2           usual     proper  complete        1  convenient  convenient   
3           usual     proper  complete        1  convenient  convenient   
4           usual     proper  complete        1  convenient  convenient   
...           ...        ...       ...      ...         ...         ...   
12955  great_pret  very_crit    foster     more    critical      inconv   
12956  great_pret  very_crit    foster     more    critical      inconv   
12957  great_pret  very_crit    foster     more    critical      inconv   
12958  great_pret  very_crit    foster     more    critical      inconv   
12959  great_pret  very_crit    foster     more    critical      inconv   

              social       health       class Prediction  
0            nonprob  recommended   reco