# Decision Trees

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
from collections import defaultdict
from statistics import mode

Build the decision tree and predict with it

In [2]:
def build_decision_tree(df: DataFrame) -> dict:
    """Recursively build a decision tree from a DataFrame of categorical data.

    The last column of ``df`` is treated as the target; every other column is
    a candidate split attribute. The attribute to split on is chosen by the
    module-level ``best_split_col`` that is in scope when this function runs.

    Args:
        df: Training rows with the target in the last column.

    Returns:
        A leaf label (a single target value) when no further split is made,
        otherwise a dict holding ``"column_name"`` (the split attribute) plus
        one entry per observed attribute value, each mapping to a subtree or
        a leaf label.
    """
    target = df.iloc[:, -1]

    # Base case: every remaining row carries the same label.
    if len(np.unique(target)) == 1:
        return df.iloc[0, -1]
    # Base case: all attributes are used up and only the target is left.
    if len(df.columns) == 1:
        return mode(target)

    split_col = best_split_col(df)
    # best_split_col may report None when no attribute improves on the
    # current node (e.g. identical attribute values with conflicting labels).
    # Indexing df[None] would raise KeyError, so emit a majority-vote leaf.
    if split_col is None:
        return mode(target)

    # defaultdict(None) has no default factory, so missing keys still raise
    # KeyError exactly like a plain dict.
    tree = defaultdict(None)
    tree["column_name"] = split_col

    for value in np.unique(df[split_col]):
        # The chosen attribute is dropped so it cannot be selected again
        # further down this branch.
        subset = df[df[split_col] == value].drop(columns=[split_col])
        tree[value] = build_decision_tree(subset)

    return tree

def predict(root: dict, test: dict) -> str:
    """Walk the tree from ``root`` using the attribute values in ``test``.

    At each internal node the attribute named under ``"column_name"`` is read
    from ``test`` and the matching branch is followed. The first non-dict
    object reached is returned as the prediction.

    Raises:
        KeyError: if ``test`` lacks an attribute the tree asks for, or holds a
            value for which the current node has no branch.
    """
    current = root
    while True:
        if not isinstance(current, dict):
            return current
        attribute = current["column_name"]
        observed = test[attribute]
        current = current[observed]

### ID3

In [3]:
def best_split_col(df):
    """Return the attribute column with the largest information gain (ID3).

    Every column except the last (the target) is scored with
    ``information_gain``. Because the comparison is ``>=``, ties go to the
    later column. ``None`` is returned when no column scores at least 0,
    which includes the case of a frame with no attribute columns.
    """
    chosen, top_gain = None, 0

    for candidate in df.columns[:-1]:
        gain = information_gain(df, candidate)
        if not gain >= top_gain:
            continue
        chosen, top_gain = candidate, gain

    return chosen

def information_gain(df, col):
    """Return the entropy drop obtained by partitioning ``df`` on ``col``.

    This is the entropy of the whole frame minus the size-weighted entropy of
    each partition, one partition per distinct value of ``col``.
    """
    n_rows = len(df)
    values, counts = np.unique(df[col], return_counts=True)

    remainder = 0
    for value, count in zip(values, counts):
        partition = df[df[col] == value]
        remainder += (count / n_rows) * entropy(partition)

    return entropy(df) - remainder

def entropy(df):
    """Return the Shannon entropy (base 2) of the last column of ``df``.

    The last column is taken as the class label; each distinct label
    contributes ``-p * log2(p)`` where ``p`` is its relative frequency.
    """
    n_rows = len(df)
    _, class_counts = np.unique(df.iloc[:, -1], return_counts=True)

    accumulated = 0
    for count in class_counts:
        accumulated += (count / n_rows) * np.log2(count / n_rows)

    return -accumulated

In [4]:
# Load the first dataset and fit a tree. build_decision_tree treats the last
# CSV column as the target and calls whichever best_split_col is defined at
# this point (the ID3 version, assuming the cells are run top to bottom).
path = "data/data08_2.csv"
df = pd.read_csv(path)
tree = build_decision_tree(df)
tree  # bare expression so the notebook displays the fitted tree

defaultdict(None,
            {'column_name': 'cholesterol',
             'high': 'sick',
             'normal': 'healthy'})

In [5]:

# Classify one record with the tree fitted in the previous cell. predict only
# reads the attributes the tree actually asks for; extra keys are ignored.
D = {'age': 50, 'bp': 'low', 'cholesterol': 'high'}
predict(tree, D)

'sick'

### C4.5

In [6]:
def best_split_col(df):
    """Return the attribute column with the largest gain ratio (C4.5).

    Gain ratio is ``information_gain(df, col) / split_info(df, col)``. Every
    column except the last (the target) is scored.

    Returns:
        The name of the best column, or ``None`` when no column has a
        strictly positive gain ratio.
    """
    best_column, best_ratio = None, 0

    for col in df.columns[:-1]:
        q = split_info(df, col)
        # Zero split information means the column has at most one distinct
        # value, so it cannot separate the rows. Dividing by it would give
        # NaN with a RuntimeWarning for NumPy floats, or ZeroDivisionError
        # for plain Python numbers, so such columns are skipped.
        if q == 0:
            continue

        ratio = information_gain(df, col) / q
        if ratio > best_ratio:
            best_column, best_ratio = col, ratio

    return best_column

def information_gain(df, col):
    """Return how much splitting ``df`` on ``col`` lowers the target entropy.

    Computed as the entropy of ``df`` minus the sum, over each distinct value
    of ``col``, of that value's row share times the entropy of its rows.
    """
    total_rows = len(df)
    distinct_values, frequencies = np.unique(df[col], return_counts=True)

    parent_entropy = entropy(df)
    children_entropy = 0
    for value, frequency in zip(distinct_values, frequencies):
        rows_with_value = df[df[col] == value]
        children_entropy += (frequency / total_rows) * entropy(rows_with_value)

    return parent_entropy - children_entropy

def entropy(df):
    """Return the base-2 Shannon entropy of the target (last) column of ``df``."""
    total_rows = len(df)
    label_counts = np.unique(df.iloc[:, -1], return_counts=True)[1]

    running = 0
    for label_count in label_counts:
        running += (label_count / total_rows) * np.log2(label_count / total_rows)

    return -running

def split_info(df, col):
    """Return the base-2 entropy of the value distribution of ``col``.

    This is the denominator of the C4.5 gain ratio: it grows with the number
    of distinct values in ``col`` and is 0 when the column is constant.
    """
    total_rows = len(df)
    _, value_counts = np.unique(df[col], return_counts=True)

    running = 0
    for value_count in value_counts:
        running += (value_count / total_rows) * np.log2(value_count / total_rows)

    return -running

In [7]:
# Load the second dataset (rebinding the notebook-level path and df) and list
# its columns; build_decision_tree will treat the last one as the target.
path = 'data/data09_1.csv'
df = pd.read_csv(path)
print(df.columns)


Index(['Outlook', 'Temp', 'Humidity', 'Wind', 'Tennis'], dtype='object')


In [8]:
# Refit on the current df. build_decision_tree resolves best_split_col at call
# time, so the most recently executed definition is used (the C4.5 gain-ratio
# version, assuming the cells are run top to bottom).
tree = build_decision_tree(df)
tree  # bare expression so the notebook displays the fitted tree

defaultdict(None,
            {'column_name': 'Outlook',
             'Overcast': 'Yes',
             'Rain': defaultdict(None,
                         {'column_name': 'Wind',
                          'Strong': 'No',
                          'Weak': 'Yes'}),
             'Sunny': defaultdict(None,
                         {'column_name': 'Humidity',
                          'High': 'No',
                          'Normal': 'Yes'})})

In [9]:
# One unlabeled record, keyed by attribute column name, classified with the
# tree fitted in the previous cell.
test = {
    'Outlook': 'Sunny',
    'Temp': 'Mild',
    'Humidity': 'Normal',
    'Wind': 'Weak'
}
predict(tree, test)

'Yes'

### CART
Splits are chosen by the reduction in Gini impurity (Gini index).

In [10]:
def best_split_col(df):
    """Return the attribute column with the largest Gini reduction (CART).

    Prints a marker line on every call so the notebook output shows that this
    version is the one in use. Every column except the last (the target) is
    scored with ``gini_reduction``; ``None`` is returned when no column gives
    a strictly positive reduction.
    """
    print('CART IS USED')
    winner, top_delta = None, 0

    for candidate in df.columns[:-1]:
        reduction = gini_reduction(df, candidate)
        if not reduction > top_delta:
            continue
        winner, top_delta = candidate, reduction

    return winner

def gini_reduction(df, col):
    """Return the drop in Gini impurity obtained by partitioning ``df`` on ``col``.

    This is the impurity of the whole frame minus the size-weighted impurity
    of each partition, one partition per distinct value of ``col``.
    """
    n_rows = len(df)
    values, counts = np.unique(df[col], return_counts=True)

    weighted_impurity = 0
    for value, count in zip(values, counts):
        partition = df[df[col] == value]
        weighted_impurity += (count / n_rows) * gini(partition)

    return gini(df) - weighted_impurity

def gini(df):
    """Return the Gini impurity of the target (last) column of ``df``.

    Computed as one minus the sum of squared relative frequencies of the
    distinct target values.
    """
    n_rows = len(df)
    class_counts = np.unique(df.iloc[:, -1], return_counts=True)[1]

    squared_shares = 0
    for count in class_counts:
        squared_shares += (count / n_rows) ** 2

    return 1 - squared_shares

In [11]:
# Refit on the current df. build_decision_tree resolves best_split_col at call
# time, so the most recently executed definition is used (the CART version,
# which prints a marker line on each call, assuming cells are run in order).
tree = build_decision_tree(df)
tree  # bare expression so the notebook displays the fitted tree

CART IS USED
CART IS USED
CART IS USED


defaultdict(None,
            {'column_name': 'Outlook',
             'Overcast': 'Yes',
             'Rain': defaultdict(None,
                         {'column_name': 'Wind',
                          'Strong': 'No',
                          'Weak': 'Yes'}),
             'Sunny': defaultdict(None,
                         {'column_name': 'Humidity',
                          'High': 'No',
                          'Normal': 'Yes'})})

In [12]:
# One unlabeled record, keyed by attribute column name, classified with the
# tree fitted in the previous cell.
test = {
    'Outlook': 'Sunny',
    'Temp': 'Mild',
    'Humidity': 'Normal',
    'Wind': 'Weak'
}
predict(tree, test)

'Yes'