<a href="https://colab.research.google.com/github/akshay-r13/ds_from_scratch/blob/main/17_Decision_Trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 17 - Decision Trees

## What is a Decision Tree?

A decision tree uses a tree structure to represent a number of possible decision paths and the corresponding outcome for each path.

In [20]:
import math

In [21]:

# Function to compute entropy

def entropy(class_probabilities):
  """Return the Shannon entropy, in bits, of a list of class probabilities.

  Args:
    class_probabilities: iterable of probabilities, one per class.

  Returns:
    The sum of -p * log2(p) over every non-zero probability p.
    An empty iterable gives 0.
  """
  # math.log2 is used instead of math.log(p, 2): the two-argument form is
  # evaluated as log(p) / log(2) and can be off by a rounding error.
  return sum(-p * math.log2(p)
             for p in class_probabilities
             if p != 0)  # skip zeros: log(0) is undefined (tends to -infinity)

In [22]:
# All of the probability mass sits on a single class, so the entropy is zero.
entropy([0, 0, 0, 1])

0.0

In [23]:
from collections import Counter

In [24]:
# Function to compute class probabilities

def class_probabilities(labels):
  """Return the fraction of `labels` that belongs to each distinct class.

  Args:
    labels: a sized collection of hashable labels.

  Returns:
    A list with one fraction per distinct label, in the Counter's
    iteration order. An empty collection gives an empty list.
  """
  n_labels = len(labels)
  counts = Counter(labels)
  return [counts[cls] / n_labels for cls in counts]

In [25]:
# Three 'a' labels and one 'b' label out of four.
class_probabilities(['a', 'a', 'a', 'b'])

[0.75, 0.25]

In [26]:
def data_entropy(labelled_data):
  """Return the entropy of the labels in `labelled_data`.

  Args:
    labelled_data: iterable of two-item (features, label) pairs; only the
      label of each pair is used.

  Returns:
    The result of `entropy` applied to the class probabilities of the labels.
  """
  return entropy(
    class_probabilities([target for _features, target in labelled_data])
  )


In [39]:
# Function to compute partition entropy

def partition_entropy(subsets):
  """Return the entropy of a partition of labelled data into `subsets`.

  Each subset's entropy (computed with `data_entropy`) is weighted by the
  fraction of all items that fall into that subset, and the weighted values
  are summed.

  Args:
    subsets: iterable of sized collections of labelled data. One-shot
      iterators are accepted.

  Returns:
    The weighted entropy, or 0.0 when the subsets contain no items at all.
  """
  # Materialize once: the subsets are traversed twice below, and a one-shot
  # iterator would be exhausted by the first pass, silently producing 0.
  subsets = list(subsets)
  total_count = sum(len(subset) for subset in subsets)
  if total_count == 0:
    return 0.0  # no data at all; avoids dividing by zero below

  return sum((len(subset) / total_count) * data_entropy(subset)  # weighted entropy
             for subset in subsets)
    

## Creating a Decision Tree

In [40]:
# Labelled data set: one (attributes dict, boolean label) pair per row.
# Columns of the table below: level, lang, tweets, phd, label.
inputs = [
  ({'level': level, 'lang': lang, 'tweets': tweets, 'phd': phd}, label)
  for level, lang, tweets, phd, label in [
    ('Senior', 'Java',   'no',  'no',  False),
    ('Senior', 'Java',   'no',  'yes', False),
    ('Mid',    'Python', 'no',  'no',  True),
    ('Junior', 'Python', 'no',  'no',  True),
    ('Junior', 'R',      'yes', 'no',  True),
    ('Junior', 'R',      'yes', 'yes', False),
    ('Mid',    'R',      'yes', 'yes', True),
    ('Senior', 'Python', 'no',  'no',  False),
    ('Senior', 'R',      'yes', 'no',  True),
    ('Junior', 'Python', 'yes', 'no',  True),
    ('Senior', 'Python', 'yes', 'yes', True),
    ('Mid',    'Python', 'no',  'yes', True),
    ('Mid',    'Java',   'yes', 'no',  True),
    ('Junior', 'Python', 'no',  'yes', False),
  ]
]

In [41]:
from collections import defaultdict

In [42]:
# Function to partition data by an attribute

def partition_by(inputs, attribute):
  """Group labelled rows by the value each one has for `attribute`.

  Args:
    inputs: iterable of rows whose first element is a mapping of attributes.
    attribute: key to look up in each row's attribute mapping.

  Returns:
    A defaultdict(list) mapping each attribute value to the rows that have
    it, with rows kept in their original order.
  """
  buckets = defaultdict(list)
  for row in inputs:
    value = row[0][attribute]
    buckets[value].append(row)
  return buckets

In [43]:
# Function to compute partition entropy based on attribute

def partition_entropy_by(inputs, attribute):
  """Return the partition entropy obtained by splitting `inputs` on `attribute`.

  The rows are grouped with `partition_by`, and the resulting groups are
  passed to `partition_entropy`.
  """
  return partition_entropy(partition_by(inputs, attribute).values())

In [47]:
# Partition entropy of the full data set when it is split on 'level'.
partition_entropy_by(inputs, 'level')

0.6935361388961918

In [50]:
# Print the partition entropy for a split on each attribute so the
# candidates can be compared side by side.
for key in ['level', 'lang', 'phd', 'tweets']:
  print(key, partition_entropy_by(inputs, key))

level 0.6935361388961918
lang 0.8601317128547441
phd 0.8921589282623617
tweets 0.7884504573082896


In [54]:
# Inspect the rows that fall into the 'Mid' group of the split on 'level'.
partition_by(inputs, 'level')['Mid']

[({'lang': 'Python', 'level': 'Mid', 'phd': 'no', 'tweets': 'no'}, True),
 ({'lang': 'R', 'level': 'Mid', 'phd': 'yes', 'tweets': 'yes'}, True),
 ({'lang': 'Python', 'level': 'Mid', 'phd': 'yes', 'tweets': 'no'}, True),
 ({'lang': 'Java', 'level': 'Mid', 'phd': 'no', 'tweets': 'yes'}, True)]

In [56]:
# Keep the 'Senior' group of the split on 'level' so it can be analysed on its own.
senior_inputs =  partition_by(inputs, 'level')['Senior']
print(senior_inputs)

[({'level': 'Senior', 'lang': 'Java', 'tweets': 'no', 'phd': 'no'}, False), ({'level': 'Senior', 'lang': 'Java', 'tweets': 'no', 'phd': 'yes'}, False), ({'level': 'Senior', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}, False), ({'level': 'Senior', 'lang': 'R', 'tweets': 'yes', 'phd': 'no'}, True), ({'level': 'Senior', 'lang': 'Python', 'tweets': 'yes', 'phd': 'yes'}, True)]


In [57]:
# Repeat the per-attribute comparison, this time on the 'Senior' rows only.
for key in ['level', 'lang', 'phd', 'tweets']:
  print(key, partition_entropy_by(senior_inputs, key))

level 0.9709505944546686
lang 0.4
phd 0.9509775004326937
tweets 0.0
