In [84]:
import pandas as pd
import torch
import numpy as np
from skimage import io, transform
from math import log
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from decimal import Decimal


# Decision Tree Classifier

In [130]:
class DecisionTreeNode:
    def __init__(self, parent):
        self.parent=parent
        self.children= dict()
    
    def add_child(self, child_key, child_value):
        self.children[child_key] = child_value
        
    def get_children(self):
        return self.children
    
    def get_parent(self):
        return self.parent
    
    def get_attribute(self):
        return self.attribute
    
    def set_attribute(self, attribute):
        self.attribute = attribute
        
    def set_label(self, label):
        self.label=label
    
    def get_label(self):
        return self.label
    
    def __str__(self, level=1):
        text=""
        if hasattr(self, 'label'):
            text += "leaf: label = {}".format(self.label)
        else:
            text += "split {}, descendandts(".format(self.attribute)
            for value, child in self.children.items():
                text += "\n"+"   "*level+"branch = {}, child node:{}".format(value, child.__str__(level+1))
            
            text += ")"
        return text

In [127]:
def entropy(target_):
    h = 0
    for label_ in target_.unique():
        h += -((target_[target_==label_].size / target_.size)* log(target_[target_==label_].size / target_.size, 2))
    return h

def determine_split_attribute(data_, label_position_, attributes_):
    best_attribute_ = None
    best_gain_ = 0
    base_entropy_ = entropy(data_.iloc[:,label_position_])
    for attribute_ in attributes_:
        x_select_ = data_.loc[:,[attribute_, data_.columns[label_position_]]]
        information_gain_ = base_entropy_
        for value_ in x_select_.loc[:,attribute_].unique():
            #split_ = pd.concat([x_select_[x_select_==value_], target_], axis=1, join='inner')
            split_entropy_ = entropy(x_select_.iloc[:,label_position_])
            information_gain_ -= split_entropy_ * (x_select_.size / data_.size)
        
        if information_gain_ >= best_gain_:
            best_attribute_ = attribute_
            best_gain = information_gain_
            
    return best_attribute_

In [125]:
#param attributes_ should be a set of attributes
#param target_ should be a series (like y_train)
#param data_ should be a dataframe (like x_train)
def build_decision_tree(data_, attributes_, label_position_):
    node_ = DecisionTreeNode(None)
    if data_.iloc[:,label_position_].unique().size==1:
        node_.set_label(data_.iloc[0,label_position_])
        return node_
        
    if len(attributes_) == 0:
        node_.set_label(data_.iloc[:,label_position_].value_counts().head(1).last_valid_index())
        return node_
        
    else:
        split_attribute_ = determine_split_attribute(data_, label_position_, attributes_)
        print('splitting on: {}'.format(split_attribute_))
        node_.set_attribute(split_attribute_)
        split_select_ = data_.loc[:,split_attribute_]
        for split_value_ in split_select_.unique():
            child_data_ = data_[data_[split_attribute_] == split_value_]
            child_attributes_ = attributes_
            child_attributes_.remove(split_attribute_)
            #print('child_attributes: {}'.format(child_attributes_))
            node_.add_child(split_value_, build_decision_tree(child_data_,child_attributes_, label_position_))
            #print('currend subtree: {}'.format(node_))
            child_attributes_.add(split_attribute_)
            
    return node_

In [51]:
#data should be a dataframe (like x_train)
#root should be a a DecisionTreeNode (returned from build_decision_tree)
def make_prediction(root, data):
    predictions = dict()
    for i, point in data.iterrows():
        current_node_ = root
        not_predicted = True
        while not_predicted:
            if(hasattr(current_node_, 'label')):
                predictions[i]=current_node_.get_label()
                not_predicted = False
            else:
                split_value = point[current_node_.get_attribute()]
                try:
                    current_node_ = current_node_.get_children()[split_value]
                except KeyError:
                    current_node_ = list(current_node_.get_children().values())[0]
    result = pd.Series(predictions)
    return result

In [104]:
# TODO: model.fit() model.predict()
# requires df -> set of attributes

class DecisionTreeClassifier:
    def fit(self, data_, label_position_):
        self.tree=build_decision_tree(data_, extract_attribute_set(data_, label_position_), label_position_)
        
    def predict(self, data_):
        return make_prediction(self.tree, data_)
    
    def print_model(self):
        print(self.tree)

In [108]:
def extract_attribute_set(dataframe_, label_position_):
    indexes = [i for i in range(dataframe_.columns.size) if i != label_position_]
    x = dataframe_.iloc[:,indexes]
    return set(x.columns.values.tolist())

In [106]:
def load_data(path, header_included_):
    if header_included_:
        data = pd.read_csv(path)
    else:
        data = pd.read_csv(path, header=None)
        add_default = read_Bool("Do you want to add the default mushroom header? (True/False)")
        if add_default:
            assign_mushroom_header(data)
        
   # encoder = LabelEncoder()

    #benchmark speedup of encoding
    #for attr in shrooms:
        #data[attr] = encoder.fit_transform(data[attr].astype('str'))
        
    return data

def assign_mushroom_header(dataframe_):
    #Input column names from Mushroom Attributes.txt
    columns=['class','cap-shape','cap-surface','cap-color','bruises','odor','gill-attachment','gill-spacing','gill-size','gill-color','stalk-shape','stalk-root','stalk-surface-above-ring','stalk-surface-below-ring','stalk-color-above-ring','stalk-color-below-ring','veil-type','veil-color','ring-number','ring-type','spore-print-color','population','habitat']

    #Rename columns according to their real attributes
    dataframe_.set_axis(columns, axis='columns', inplace=True)
    
def split_data(dataframe_, test_size_):
    #y is our target class
    #y = dataframe_.iloc[:,label_position]
    #indexes = [i for i in range(dataframe_.columns.size) if i != label_position]
    #x is our attributes
    #x = dataframe_.iloc[:,indexes]
    
    #x_train_, x_test_, y_train_, y_test_ = train_test_split(x,y,test_size = test_size_)
    train_, test_ = train_test_split(dataframe_, test_size = test_size_)
    return train_, test_
    


In [128]:
def print_accuracy(real_values, predicted_values):
    stats= pd.crosstab(index = predicted_values, columns=real_values, margins=True, rownames= ['predicted'], colnames=['actual'])
    accuracy = np.sum(real_values == predicted_values) / predicted_values.size
    print(stats)
    print("The accuracy is: {}".format(accuracy))

In [135]:
# TODO: main for command line interface
def read_Bool(msg):
    text = input(msg)
    if text == "True":
        text = True
    elif text == "False":
        text = False
    else:
        return read_Bool(msg)
    return text
def read_float(msg):
    try:
        d = float(input(msg))
    except:
        print("invalid input, please try again")
        return read_float(msg)
    if 0<d<1:
        return d
    print("the relative size has to be between 0.0 and 1.0")
    return read_float(msg)
    
def read_int(msg):
    try:
        i = int(input(msg))
    except:
        print("invalid input, please try again")
        return read_int(msg)
    return i
    
def main():
    path = input("enter the absolute path of the dataset: ")
    header_included = read_Bool("is the header included in the dataset?: (True/False)")
    data = load_data(path, header_included)
    split_size = read_float("enter the relative size of the test set: ")
    train, test = split_data(data, split_size)
    model = DecisionTreeClassifier()
    label_position = read_int("enter the position of the class label in the dataset: ")
    print("Training the model")
    model.fit(train, label_position)
    print('your model: \n\n')
    model.print_model()
    print("\n\nevaluating on test set:")
    predict = model.predict(test)
    print('\n\n')
    print_accuracy(test.iloc[:,label_position], predict)
    
main()
    

enter the absolute path of the dataset: Data/Mushrooms.txt
is the header included in the dataset?: (True/False)False
Do you want to add the default mushroom header? (True/False)False
enter the relative size of the test set: 0.0
the relative size has to be between 0.0 and 1.0
enter the relative size of the test set: 0.1
enter the position of the class label in the dataset: 0.1
invalid input, please try again
enter the position of the class label in the dataset: 0
Training the model
splitting on: 19
splitting on: 18
splitting on: 17
splitting on: 20
splitting on: 16
splitting on: 15
splitting on: 21
splitting on: 22
splitting on: 14
splitting on: 13
splitting on: 12
splitting on: 11
splitting on: 10
splitting on: 14
splitting on: 13
splitting on: 12
splitting on: 11
splitting on: 10
splitting on: 9
splitting on: 8
splitting on: 7
splitting on: 6
splitting on: 5
splitting on: 8
splitting on: 7
splitting on: 6
splitting on: 5
splitting on: 8
splitting on: 7
splitting on: 6
splitting on: 5





actual        e    p   All
predicted                 
e          1044    0  1044
p             0  987   987
All        1044  987  2031
The accuracy is: 1.0
