In [41]:
# imports
import pandas as pd
import numpy as np
from scipy import stats

In [42]:
def h(plist: list) -> float:
    '''
    Calculates the Shannon entropy (base 2) for a given list of probabilities.

    Zero probabilities are skipped, following the usual convention
    ``0 * log2(0) = 0``; evaluating the term directly would give ``nan``
    and turn the whole sum into ``nan``.

    :param list plist: List of probabilities for each of classes
    :return: Calculated entropy
    :rtype: float
    :raises ValueError: If any probability is negative or the probabilities
        do not sum to 1 within a tolerance of 0.1
    '''
    # NOTE(review): 0.1 is a very loose tolerance for a probability sum;
    # kept unchanged so existing callers are not rejected.
    if any(p < 0 for p in plist) or abs(sum(plist) - 1) > 0.1:
        raise ValueError("Invalid list of probabilities")
    return sum(-p * np.log2(p) for p in plist if p > 0)

def cal_df_entropy(d: pd.DataFrame, label_name: str) -> float:
    '''
    Calculates the entropy of the label column of a DataFrame.

    The class probabilities are the relative frequencies of the distinct
    values in ``d[label_name]``. A frame with no rows has entropy 0.

    :param pd.DataFrame d: Selected pd.DataFrame
    :param label_name: Label column name in the given data frame
    :return: Calculated entropy
    :rtype: float
    '''
    labels = d[label_name]
    n_rows = len(labels)
    if n_rows == 0:
        # An empty partition carries no uncertainty. Without this guard the
        # probability list would be empty and h() would reject it because
        # it does not sum to 1.
        return 0.0
    return h([count / n_rows for count in labels.value_counts()])



In [43]:
LABEL = 'final evaluation'
dataset = pd.read_csv("nursery.csv")

# Replace every column by its integer category codes. For each column,
# categorical_index keeps the list of original category labels, so that the
# position of a label in that list is the code stored in the dataset.
categorical_index = {}
for column_name in dataset.columns:
    as_categorical = pd.Categorical(dataset[column_name])
    categorical_index[column_name] = list(as_categorical.categories)
    dataset[column_name] = as_categorical.codes

dataset

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,final evaluation
0,2,3,0,0,0,0,0,2,2
1,2,3,0,0,0,0,0,1,1
2,2,3,0,0,0,0,0,0,0
3,2,3,0,0,0,0,2,2,2
4,2,3,0,0,0,0,2,1,1
...,...,...,...,...,...,...,...,...,...
12955,0,4,2,3,1,1,2,1,3
12956,0,4,2,3,1,1,2,0,0
12957,0,4,2,3,1,1,1,2,3
12958,0,4,2,3,1,1,1,1,3


In [44]:
class Node:
    '''
    A tree node that is linked to its children and to its parent.

    The upward links let a node reconstruct the path from itself to the root,
    which is what get_rule_set and get_filtered_df rely on.
    '''

    def __init__(self, value=None):
        # Payload of the node. For a node with children it is used as a column
        # name by get_filtered_df; for a leaf it is printed as the left-hand
        # side of a rule by get_rule_set.
        self.value = value
        # Flag initialised to False; it is not read anywhere inside this class.
        self.resolved = False
        # Child nodes, in insertion order.
        self.children = []
        # Parent node, set by add_children (None for a root).
        self.parent = None
        # Position of this node in its parent's children list, set by
        # add_children.
        self.index = 0

    @staticmethod
    def _rule_for_leaf(leaf):
        '''
        Builds the rule string "<leaf value> | <ancestor value>=<child index>, ..."
        by walking from the leaf up to the root.
        '''
        rule = f"{leaf.value} | "
        child, ancestor = leaf, leaf.parent
        while ancestor is not None:
            rule += f"{ancestor.value}={child.index}, "
            child, ancestor = ancestor, ancestor.parent
        return rule

    def get_rule_set(self):
        '''
        Collects one rule string per leaf, in depth-first, left-to-right order.

        The traversal is iterative and moves upwards through parent links
        until it runs past the root. As a consequence, when it is called on
        a node that is not the root it also visits the rest of the tree
        (after this node's own subtree).

        :return: List of unique rule strings
        :rtype: list
        '''
        rules = []
        emitted = set()   # same content as `rules`, for O(1) duplicate checks
        visited = set()   # ids of the children already descended into
        node = self

        while node is not None:
            if len(node.children) == 0:
                rule = self._rule_for_leaf(node)
                # A leaf can be reached twice when the traversal started below
                # the root, so duplicates are filtered out.
                if rule not in emitted:
                    emitted.add(rule)
                    rules.append(rule)
                node = node.parent
                continue

            # Descend into the first child that has not been entered yet;
            # climb back up when there is none left.
            next_child = next(
                (child for child in node.children if id(child) not in visited),
                None,
            )
            if next_child is None:
                node = node.parent
            else:
                visited.add(id(next_child))
                node = next_child

        return rules

    def add_children(self, children: list):
        '''
        Appends the given nodes as children of this node, setting each
        child's parent link and its index in this node's children list.

        :param list children: Node instances to attach
        '''
        for child in children:
            assert isinstance(child, Node)
            child.parent = self
            child.index = len(self.children)
            self.children.append(child)

    def get_filtered_df(self, df: pd.DataFrame, all_features: list):
        '''
        Return filtered dataset and remaining features up to the current node

        For every ancestor on the path to the root, only the rows whose
        column ``ancestor.value`` equals the index of the child on that path
        are kept.

        :param pd.DataFrame df: Data frame to filter
        :param list all_features: All feature names; it is not modified
        :return: Tuple of (filtered data frame, features not used on the path)
        :raises ValueError: If an ancestor's value is not in all_features
        '''
        used_features = []
        child, ancestor = self, self.parent
        while ancestor is not None:
            used_features.append(ancestor.value)
            df = df[df[ancestor.value] == child.index].copy()
            child, ancestor = ancestor, ancestor.parent

        remaining = all_features.copy()
        for feature in used_features:
            remaining.remove(feature)

        return df, remaining


In [45]:
all_features = list(dataset.columns)
all_features.remove(LABEL)

dTree = Node()
current_node = dTree

# Grow the decision tree iteratively. A node whose value is still None has not
# been processed yet. Processing a node either turns it into a leaf (value =
# most frequent label code) or assigns it the feature with the highest
# information gain and creates one child per category of that feature.
while True:
    if current_node.value is not None:
        # Already processed: continue with its first unprocessed child, or
        # climb back up when every child is done.
        pending_child = next(
            (child for child in current_node.children if child.value is None),
            None,
        )
        if pending_child is None:
            if current_node.parent is None:
                break  # back at the root with nothing left to expand
            current_node = current_node.parent
            continue
        current_node = pending_child

    # cds: the rows of the dataset that reach the current node
    cds, available_features = current_node.get_filtered_df(dataset, all_features)

    if len(cds) == 0 or len(available_features) == 0 or cds[LABEL].nunique() == 1:
        # Leaf. If no rows reach this branch, fall back to the rows of the
        # parent so the leaf still gets a majority label.
        label_rows = cds
        if len(cds) == 0 and current_node.parent is not None:
            label_rows, _ = current_node.parent.get_filtered_df(dataset, all_features)
        current_node.resolved = True
        # Series.mode() returns the most frequent values in ascending order,
        # so .iloc[0] is the smallest one on ties. It replaces
        # stats.mode(...).mode[0], which fails on SciPy >= 1.11 where `mode`
        # is a scalar for 1-D input.
        current_node.value = label_rows[LABEL].mode().iloc[0]
        if current_node.parent is None:
            break  # the root itself is a leaf: nothing else to build
        current_node = current_node.parent
        continue

    # Best information gain found so far and the feature that achieves it.
    # Starting at -inf guarantees that a feature is always selected, even when
    # no feature has a strictly positive gain.
    max_ig = {'val': float('-inf'), 'name': None}
    total_h = cal_df_entropy(cds, LABEL)  # entropy of the rows at this node
    total_n = len(cds)
    for feature in available_features:
        feature_n = len(categorical_index[feature])  # number of categories of the feature

        cp = 0  # conditional entropy of the label given this feature
        for v in range(feature_n):
            sub_d = cds[cds[feature] == v]
            if len(sub_d) == 0:
                continue  # an empty partition has weight 0
            cp += (len(sub_d) / total_n) * cal_df_entropy(sub_d, LABEL)

        ig = total_h - cp
        if ig > max_ig['val']:
            max_ig['val'] = ig
            max_ig['name'] = feature

    current_node.value = max_ig['name']
    current_node.add_children([Node() for _ in range(len(categorical_index[max_ig['name']]))])

ruleSet = dTree.get_rule_set()

print(categorical_index)
print(len(ruleSet))
print('\n'.join(ruleSet))

{'parents': ['great_pret', 'pretentious', 'usual'], 'has_nurs': ['critical', 'improper', 'less_proper', 'proper', 'very_crit'], 'form': ['complete', 'completed', 'foster', 'incomplete'], 'children': ['1', '2', '3', 'more'], 'housing': ['convenient', 'critical', 'less_conv'], 'finance': ['convenient', 'inconv'], 'social': ['nonprob', 'problematic', 'slightly_prob'], 'health': ['not_recom', 'priority', 'recommended'], 'final evaluation': ['not_recom', 'priority', 'recommend', 'spec_prior', 'very_recom']}
839
0 | health=0, 
1 | finance=0, housing=0, children=0, form=0, parents=0, has_nurs=0, health=1, 
3 | finance=1, housing=0, children=0, form=0, parents=0, has_nurs=0, health=1, 
3 | housing=1, children=0, form=0, parents=0, has_nurs=0, health=1, 
3 | housing=2, children=0, form=0, parents=0, has_nurs=0, health=1, 
3 | children=1, form=0, parents=0, has_nurs=0, health=1, 
3 | children=2, form=0, parents=0, has_nurs=0, health=1, 
3 | children=3, form=0, parents=0, has_nurs=0, health=1, 
3