In [158]:
import pandas as pd
import numpy as np

from statistics import mode
from collections import defaultdict

### Implementation notes

* Nodes are stored as dictionaries.

* `node["column_name"]` gives the name of the column that the node splits on.

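* `node[column_value]` gives the child reached for that value of the splitting column: either another dictionary (an internal node) or, at a leaf, the predicted label itself.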

The functions in this section are common to all decision trees...

Implementation of `best_split_col()` is all that differs in the 3 methods.

In [159]:
def build_decision_tree(df: pd.DataFrame) -> dict:
    """Recursively build a decision tree from a categorical data frame.

    The last column of ``df`` is treated as the target; every other column is
    a candidate splitting feature. The splitting column is chosen by whichever
    ``best_split_col`` is currently defined.

    Args:
        df: Training rows. Must contain the target as its last column.

    Returns:
        An internal node as a dictionary with the key ``"column_name"`` (the
        splitting column) plus one key per distinct value of that column,
        each mapping to a child built the same way. When no further split is
        possible the return value is a bare target label (a leaf) instead of
        a dictionary.
    """
    labels = df.iloc[:, -1]

    # If all values in the target column are the same, that value is the leaf.
    if len(np.unique(labels)) == 1:
        return df.iloc[0, -1]

    # If there aren't any feature columns left, fall back to the majority label.
    if len(df.columns) == 1:
        return mode(labels)

    split_col = best_split_col(df)

    # `best_split_col` returns None when no column scores above zero (for
    # example, identical feature rows with conflicting labels). Indexing the
    # frame with None would raise KeyError, so emit a majority-label leaf.
    if split_col is None:
        return mode(labels)

    # defaultdict(None) has no default factory, so it behaves like a plain
    # dict (missing keys still raise KeyError); kept for a stable repr.
    # Branch values share this dict with the "column_name" key.
    tree = defaultdict(None)
    tree["column_name"] = split_col

    for value in np.unique(df[split_col]):
        # Each branch sees only its own rows, without the column just used.
        subset = df[df[split_col] == value].drop(columns=[split_col])
        tree[value] = build_decision_tree(subset)

    return tree


def predict(root: dict, D: dict) -> str:
    """Walk the tree from ``root`` and return the label predicted for ``D``.

    Args:
        root: A tree node; either a dictionary with a ``"column_name"`` key
            and one key per branch value, or a bare leaf label.
        D: Mapping from column name to the sample's value for that column.

    Returns:
        The first non-dictionary value reached while descending the tree.

    Raises:
        KeyError: If ``D`` lacks a column the tree splits on, or the sample's
            value has no matching branch in the current node.
    """
    current = root
    while True:
        # Anything that is not a dictionary is a leaf label.
        if not isinstance(current, dict):
            return current
        feature = current["column_name"]
        current = current[D[feature]]

### C4.5

Chooses the splitting column by gain ratio: the information gain (the reduction in entropy) of a column divided by that column's split info.

In [160]:
def best_split_col(df: pd.DataFrame) -> "str | None":
    """Pick the feature column with the highest gain ratio (C4.5 criterion).

    The gain ratio of a column is ``information_gain(df, col)`` divided by
    ``split_info(df, col)``. The last column of ``df`` is the target and is
    never considered as a candidate.

    Args:
        df: Data frame with at least one feature column followed by the
            target column.

    Returns:
        The label of the best column, or ``None`` when no column has a gain
        ratio strictly greater than zero.
    """
    best_col, best_ratio = None, 0

    for col in df.columns[:-1]:
        gain = information_gain(df, col)
        info = split_info(df, col)

        # Zero split info means the column does not partition the rows at
        # all; dividing by it would raise ZeroDivisionError or yield nan with
        # a RuntimeWarning, so skip the column.
        if info == 0:
            continue

        ratio = gain / info
        if ratio > best_ratio:
            best_ratio, best_col = ratio, col

    return best_col

def entropy(df: pd.DataFrame) -> float:
    """Return the base-2 Shannon entropy of the target (last) column of ``df``."""
    n_rows = len(df)
    _, label_counts = np.unique(df.iloc[:, -1], return_counts=True)

    # Accumulate p * log2(p) over the distinct labels, then flip the sign.
    running = 0
    for count in label_counts:
        proportion = count / n_rows
        running = running + proportion * np.log2(proportion)
    return -running

def information_gain(df: pd.DataFrame, col: str) -> float:
    """Return the entropy reduction obtained by partitioning ``df`` on ``col``.

    Computed as the entropy of the whole frame minus the size-weighted
    entropy of each subset that shares one value of ``col``.
    """
    parent_entropy = entropy(df)
    n_rows = len(df)
    values, counts = np.unique(df[col], return_counts=True)

    weighted_child_entropy = 0
    for value, count in zip(values, counts):
        branch = df[df[col] == value]
        weighted_child_entropy = weighted_child_entropy + (count / n_rows) * entropy(branch)

    return parent_entropy - weighted_child_entropy

def split_info(df: pd.DataFrame, col: str) -> float:
    """Return the base-2 entropy of the value distribution of column ``col``."""
    n_rows = len(df)
    _, value_counts = np.unique(df[col], return_counts=True)

    # Accumulate p * log2(p) over the distinct values, then flip the sign.
    running = 0
    for count in value_counts:
        fraction = count / n_rows
        running = running + fraction * np.log2(fraction)
    return -running

In [161]:
# Load the training table from CSV and build a tree from it using whichever
# `best_split_col` is currently defined in the kernel.
# `build_decision_tree` treats the last column of `df` as the target.
path = "data/data09_1.csv"
df = pd.read_csv(path)
tree = build_decision_tree(df)

In [162]:
# One sample to classify: maps column name -> value for that column.
test = {"Outlook": "Sunny", "Temp": "Mild", "Humidity": "Normal", "Wind": "Weak"}
# Descend `tree` using the sample's values and show the predicted label.
predict(tree, test)

'Yes'

In [163]:
# Display the built tree: nested dictionaries, each holding "column_name"
# plus one entry per branch value; non-dictionary entries are leaf labels.
tree

defaultdict(None,
            {'column_name': 'Outlook',
             'Overcast': 'Yes',
             'Rain': defaultdict(None,
                         {'column_name': 'Wind',
                          'Strong': 'No',
                          'Weak': 'Yes'}),
             'Sunny': defaultdict(None,
                         {'column_name': 'Humidity',
                          'High': 'No',
                          'Normal': 'Yes'})})

### CART

Chooses the splitting column that gives the largest reduction in Gini impurity.

In [171]:
def best_split_col(df: pd.DataFrame) -> "str | None":
    """Pick the feature column with the largest Gini impurity reduction (CART criterion).

    The last column of ``df`` is the target and is never considered as a
    candidate.

    Args:
        df: Data frame with at least one feature column followed by the
            target column.

    Returns:
        The label of the column whose ``gini_reduction`` is largest, or
        ``None`` when no column has a reduction strictly greater than zero.
    """
    best_col, best_delta = None, 0

    for col in df.columns[:-1]:
        delta = gini_reduction(df, col)

        if delta > best_delta:
            # Keep the names and values aligned: column label into best_col,
            # score into best_delta.
            best_col, best_delta = col, delta

    return best_col

def gini_impurity(df: pd.DataFrame) -> float:
    """Return the Gini impurity of the target (last) column of ``df``.

    Computed as one minus the sum of squared label proportions.
    """
    proportions = df.iloc[:, -1].value_counts(normalize=True)
    squared = proportions ** 2

    total = 0
    for term in squared:
        total = total + term
    return 1 - total

def gini_reduction(df: pd.DataFrame, col: str) -> float:
    """Return the drop in Gini impurity obtained by partitioning ``df`` on ``col``.

    Computed as the impurity of the whole frame minus the size-weighted
    impurity of each subset that shares one value of ``col``.
    """
    parent_impurity = gini_impurity(df)
    n_rows = len(df)
    values, counts = np.unique(df[col], return_counts=True)

    weighted_child_impurity = 0
    for value, count in zip(values, counts):
        branch = df[df[col] == value]
        weighted_child_impurity = weighted_child_impurity + (count / n_rows) * gini_impurity(branch)

    return parent_impurity - weighted_child_impurity


In [172]:
# Classify the sample `test` with the current `tree`.
# NOTE(review): no visible cell rebuilds `tree` after the CART
# `best_split_col` is defined, so this presumably still uses the tree built
# earlier - confirm, and re-run `tree = build_decision_tree(df)` first if a
# CART-built tree is intended.
predict(tree, test)

'Yes'

In [173]:
# Display the current `tree`.
# NOTE(review): `tree` is not reassigned in any visible cell of the CART
# section, so this is presumably the tree built before the CART functions
# were defined - confirm it was rebuilt with the CART `best_split_col`.
tree

defaultdict(None,
            {'column_name': 'Outlook',
             'Overcast': 'Yes',
             'Rain': defaultdict(None,
                         {'column_name': 'Wind',
                          'Strong': 'No',
                          'Weak': 'Yes'}),
             'Sunny': defaultdict(None,
                         {'column_name': 'Humidity',
                          'High': 'No',
                          'Normal': 'Yes'})})