In [19]:
# Labeled data set: each entry is an (attribute_dict, label) pair.
# Rows are listed as (level, lang, tweets, phd, label) and expanded into
# dicts whose keys are always in the order level, lang, tweets, phd.
inputs = [
    ({'level': level, 'lang': lang, 'tweets': tweets, 'phd': phd}, label)
    for level, lang, tweets, phd, label in (
        ('Senior', 'Java',   'no',  'no',  False),
        ('Senior', 'Java',   'no',  'yes', False),
        ('Mid',    'Python', 'no',  'no',  True),
        ('Junior', 'Python', 'no',  'no',  True),
        ('Junior', 'R',      'yes', 'no',  True),
        ('Junior', 'R',      'yes', 'yes', False),
        ('Mid',    'R',      'yes', 'yes', True),
        ('Senior', 'Python', 'no',  'no',  False),
        ('Senior', 'R',      'yes', 'no',  True),
        ('Junior', 'Python', 'yes', 'no',  True),
        ('Senior', 'Python', 'yes', 'yes', True),
        ('Mid',    'Python', 'no',  'yes', True),
        ('Mid',    'Java',   'yes', 'no',  True),
        ('Junior', 'Python', 'no',  'yes', False),
    )
]

In [20]:
import math

def entropy(class_probabilities):
    """Return the base-2 entropy of the given class probabilities.

    Falsy (zero) probabilities are skipped, because log(0) is undefined
    and such classes contribute nothing to the entropy."""
    nonzero = filter(None, class_probabilities)  # Keep truthy entries only.
    contributions = [-p * math.log(p, 2) for p in nonzero]
    return sum(contributions)

In [21]:
from collections import Counter

def class_probabilities(labels):
    """Return the fraction of `labels` accounted for by each distinct label.

    One fraction is produced per distinct label, in the order the labels
    are stored in the Counter built from `labels`."""
    n = len(labels)
    fractions = []
    for occurrences in Counter(labels).values():
        fractions.append(occurrences / n)
    return fractions

In [22]:
def data_entropy(labeled_data):
    """Return the entropy of the labels in `labeled_data`.

    `labeled_data` is an iterable of (item, label) pairs; only the labels
    are used."""
    return entropy(class_probabilities([label for _, label in labeled_data]))

In [23]:
def partition_entropy(subsets):
    """Return the weighted-average entropy of a partition of labeled data.

    `subsets` is an iterable of subsets, each a sized collection of
    (item, label) pairs. Each subset's entropy (from data_entropy) is
    weighted by the fraction of all items that fall into that subset.

    Returns 0 when there are no items at all (no subsets, or only empty
    subsets)."""
    # Materialize once: the subsets are traversed twice below, so a one-shot
    # iterable (e.g. a generator) would otherwise be used up by the count
    # and contribute nothing to the weighted sum.
    subsets = list(subsets)
    total_count = sum(len(subset) for subset in subsets)

    if total_count == 0:
        return 0  # Nothing to weigh; also avoids dividing by zero.

    return sum(data_entropy(subset) * len(subset) / total_count
               for subset in subsets)

In [24]:
from collections import defaultdict

def partition_by(inputs, attribute):
    """Group labeled inputs by the value each one has for `attribute`.

    Every input is an (attribute_dict, label) pair. The result is a
    defaultdict(list) mapping attribute_value -> the inputs whose
    attribute_dict holds that value, in their original order."""
    by_value = defaultdict(list)
    for example in inputs:
        attribute_dict = example[0]
        # File the whole (attribute_dict, label) pair under its value.
        by_value[attribute_dict[attribute]].append(example)
    return by_value

In [25]:
def partition_entropy_by(inputs, attribute):
    """Return the entropy of the partition of `inputs` induced by `attribute`.

    The inputs are grouped with partition_by and the resulting groups are
    scored with partition_entropy."""
    return partition_entropy(partition_by(inputs, attribute).values())

In [26]:
# Print the entropy of the partition induced by each attribute over the
# full data set.
for key in ('level', 'lang', 'tweets', 'phd'):
    print(key, partition_entropy_by(inputs, attribute=key))

level 0.6935361388961919
lang 0.8601317128547441
tweets 0.7884504573082896
phd 0.8921589282623617


In [27]:
# Keep only the labeled inputs whose 'level' attribute is 'Senior'.
senior_inputs = [(attributes, label)
                 for attributes, label in inputs
                 if attributes['level'] == 'Senior']

In [28]:
# Print the entropy of the partition induced by each remaining attribute
# over the Senior-only subset.
for key in ('lang', 'tweets', 'phd'):
    print(key, partition_entropy_by(senior_inputs, attribute=key))

lang 0.4
tweets 0.0
phd 0.9509775004326938


In [43]:
def classify(tree, input):
    """Classify `input` by walking down the decision tree `tree`.

    A tree is either a leaf (True or False) or a pair
    (attribute, subtree_dict), where subtree_dict maps values of that
    attribute to the subtree to consider next. `input` must provide
    .get(attribute); when the attribute is missing from `input`, or its
    value has no subtree of its own, the subtree stored under the None
    key is followed instead."""
    node = tree

    # Descend until a leaf is reached.
    while node not in [True, False]:
        attribute, branches = node

        branch_key = input.get(attribute)  # None if the attribute is missing.
        if branch_key not in branches:
            # No subtree for this value: fall back to the None subtree.
            branch_key = None

        node = branches[branch_key]

    return node

In [44]:
from functools import partial

def build_tree_id3(inputs, split_candidates=None):
    """Build a decision tree by repeatedly splitting on the attribute whose
    partition has the lowest entropy.

    inputs: a sequence of (attribute_dict, label) pairs; labels are
        counted by truthiness.
    split_candidates: the attributes still available to split on. When
        None (the first pass), the keys of the first input's
        attribute_dict are used.

    Returns either a leaf (True or False) or a pair
    (best_attribute, subtrees), where subtrees maps each value of
    best_attribute seen in `inputs` to a subtree, plus a None key holding
    the default case.

    Empty `inputs` yield a False leaf, whether or not split_candidates
    was given.
    """
    # Count Trues and Falses in the inputs.
    num_inputs = len(inputs)
    num_trues = sum(1 for _, label in inputs if label)
    num_falses = num_inputs - num_trues

    # Settle pure (and empty) nodes before looking at inputs[0] below, so
    # that an empty data set gives a False leaf instead of an IndexError.
    if num_trues == 0:
        return False  # No Trues? Return a "False" leaf.
    if num_falses == 0:
        return True   # No Falses? Return a "True" leaf.

    # If this is our first pass, all keys of the first input are split
    # candidates.
    if split_candidates is None:
        split_candidates = inputs[0][0].keys()
    # Materialize so that any iterable (including a one-shot iterator) can
    # be tested for emptiness and then traversed more than once.
    split_candidates = list(split_candidates)

    if not split_candidates:  # No split candidates left.
        # Majority leaf; a tie between Trues and Falses resolves to True.
        return num_trues >= num_falses

    # Otherwise split on the attribute with the lowest partition entropy.
    best_attribute = min(split_candidates,
                         key=partial(partition_entropy_by, inputs))

    partitions = partition_by(inputs, best_attribute)
    new_candidates = [a for a in split_candidates
                      if a != best_attribute]

    # Recursively build subtrees.
    subtrees = {attribute_value: build_tree_id3(subset, new_candidates)
                for attribute_value, subset in partitions.items()}

    # Default case: strict majority, so a tie resolves to False here
    # (unlike the majority leaf above).
    subtrees[None] = num_trues > num_falses
    return (best_attribute, subtrees)

In [45]:
# Build the tree from the full data set, then classify a Junior Java
# candidate who tweets and has no PhD.
tree = build_tree_id3(inputs)
classify(tree, dict(level='Junior', lang='Java', tweets='yes', phd='no'))  # Expected: True

True

In [47]:
# Same candidate as before, but with a PhD.
classify(tree, dict(level='Junior', lang='Java', tweets='yes', phd='yes'))  # Expected: False

False

In [48]:
# 'Intern' is not a level present in the data set.
classify(tree, dict(level='Intern'))  # Expected: True

True

In [50]:
# Only the level is supplied; every other attribute is missing.
classify(tree, dict(level='Senior'))  # Expected: False

False

In [52]:
from collections import Counter

def forest_classify(trees, input):
    """Classify `input` with every tree in `trees` and return the most
    common result."""
    tally = Counter(classify(tree, input) for tree in trees)
    top = tally.most_common(1)  # [(winning_vote, count)]
    return top[0][0]