In [2]:
import math
from collections import Counter, defaultdict

def entropy(class_probabilities):
    """Return the base-2 entropy of a collection of class probabilities.

    Each non-zero probability p contributes -p * log2(p) to the total.
    Zero probabilities are filtered out before the logarithm is taken, so
    they contribute nothing and never reach math.log.
    """
    nonzero_probabilities = (p for p in class_probabilities if p != 0)
    return sum(-p * math.log(p, 2) for p in nonzero_probabilities)

def class_probabilities(labels):
    """Return the fraction of `labels` taken up by each distinct label.

    The fractions are listed in the order the Counter yields its counts.
    An empty `labels` gives an empty list.
    """
    occurrences = Counter(labels)
    n_labels = len(labels)

    fractions = []
    for occurrence_count in occurrences.values():
        fractions.append(occurrence_count / n_labels)
    return fractions

# Quick sanity check of class_probabilities: among these 7 labels,
# 1 occurs once, 2 occurs twice and 3 occurs four times.
ex_data = [1, 2, 2, 3, 3, 3, 3]

print(class_probabilities(ex_data))

def data_entropy(labeled_data):
    """Return the entropy of the labels found in `labeled_data`.

    `labeled_data` is an iterable of two-element (features, label) pairs;
    only the second element of each pair is used.
    """
    pair_labels = [pair_label for _features, pair_label in labeled_data]
    return entropy(class_probabilities(pair_labels))

def partition_entropy(subsets):
    """Return the entropy of a partition of labeled data into `subsets`.

    The result is the sum of each subset's `data_entropy`, weighted by the
    subset's share of the total number of items.

    Args:
        subsets: any iterable of sized collections of (features, label)
            pairs. It is materialized into a list up front because it is
            traversed twice (once to count items, once to weight the
            entropies); a one-shot iterable such as a generator would
            otherwise be exhausted by the count and yield a result of 0.

    Returns:
        The weighted entropy, or 0 when there are no items at all (no
        subsets, or only empty subsets) instead of dividing by zero.
    """
    subsets = list(subsets)
    total_count = sum(len(subset) for subset in subsets)

    # nothing to weight: avoid a ZeroDivisionError on all-empty partitions
    if total_count == 0:
        return 0

    return sum(data_entropy(subset) * len(subset) / total_count for subset in subsets)

[0.14285714285714285, 0.2857142857142857, 0.5714285714285714]


In [3]:
# Labeled training data. Each entry is a (features, label) pair: `features`
# maps the attribute names 'level', 'lang', 'tweets' and 'phd' to string
# values, and `label` is a bool.
inputs = [
    ({'level':'Senior', 'lang':'Java', 'tweets':'no', 'phd':'no'}, False),
    ({'level':'Senior', 'lang':'Java', 'tweets':'no', 'phd':'yes'}, False),
    ({'level':'Mid', 'lang':'Python', 'tweets':'no', 'phd':'no'}, True),
    ({'level':'Junior', 'lang':'Python', 'tweets':'no', 'phd':'no'}, True),
    ({'level':'Junior', 'lang':'R', 'tweets':'yes', 'phd':'no'}, True),
    ({'level':'Junior', 'lang':'R', 'tweets':'yes', 'phd':'yes'}, False),
    ({'level':'Mid', 'lang':'R', 'tweets':'yes', 'phd':'yes'}, True),
    ({'level':'Senior', 'lang':'Python', 'tweets':'no', 'phd':'no'}, False),
    ({'level':'Senior', 'lang':'R', 'tweets':'yes', 'phd':'no'}, True),
    ({'level':'Junior', 'lang':'Python', 'tweets':'yes', 'phd':'no'}, True),
    ({'level':'Senior', 'lang':'Python', 'tweets':'yes', 'phd':'yes'}, True),
    ({'level':'Mid', 'lang':'Python', 'tweets':'no', 'phd':'yes'}, True),
    ({'level':'Mid', 'lang':'Java', 'tweets':'yes', 'phd':'no'}, True),
    ({'level':'Junior', 'lang':'Python', 'tweets':'no', 'phd':'yes'}, False)
]

In [11]:
def partition_by(inputs, attribute):
    """Group (features, label) pairs by the value of one feature.

    Returns a defaultdict(list) that maps each value of `attribute` to the
    pairs from `inputs` whose features (first element) hold that value, in
    the order they appear in `inputs`.
    """
    groups = defaultdict(list)
    for pair in inputs:
        features = pair[0]
        groups[features[attribute]].append(pair)
    return groups

def partition_entropy_by(inputs, attribute):
    """Return the entropy of the partition obtained by grouping `inputs` on `attribute`."""
    return partition_entropy(partition_by(inputs, attribute).values())
 
# Print the partition entropy obtained by splitting the full data set on
# each attribute in turn; the lowest value marks the best first split.
for key in ("level", "lang", "tweets", "phd"):
    print(key, partition_entropy_by(inputs, key))

""" we are gonna select 'level' cuz it has the minimum entropy """

level 0.6935361388961919
lang 0.8601317128547441
tweets 0.7884504573082896
phd 0.8921589282623617


" we are gonna select 'level' cuz it has the minimum entropy "

In [5]:
# Keep only the pairs whose 'level' is 'Senior', then compare the partition
# entropy of the remaining attributes on that subset.
senior_inputs = [
    (features, label)
    for features, label in inputs
    if features["level"] == "Senior"
]

for key in ("lang", "tweets", "phd"):
    print(key, partition_entropy_by(senior_inputs, key))

""" now we are gonna select 'tweets' cuz it has the minimum entropy 
    and this will go on line this...
"""


lang 0.4
tweets 0.0
phd 0.9509775004326938


" now we are gonna select 'tweets' cuz it has the minimum entropy \n    and this will go on line this...\n"

In [6]:
from functools import partial

def classify(tree, input):
    """Classify the input using the given decision tree.

    A tree is either a leaf (True or False) or a pair
    (attribute, subtree_dict): the attribute to split on, and a dictionary
    whose keys are values of that attribute and whose values are the
    subtrees to consider next.
    """
    node = tree

    # walk down the tree until a leaf is reached
    while node not in [True, False]:
        attribute, branches = node
        branch_key = input.get(attribute)

        # a missing or unseen attribute value falls back to the None branch
        if branch_key not in branches:
            branch_key = None

        node = branches[branch_key]

    return node

def build_tree_id3(inputs, split_candidates=None):
    """Build a decision tree from labeled data using the ID3 algorithm.

    Args:
        inputs: list of (features, label) pairs, where `features` is a dict
            of attribute values and `label` is treated as a boolean.
        split_candidates: attribute names still available to split on.
            When None (the first pass), all keys of the first input's
            features dict are used.

    Returns:
        Either a leaf (True or False) or a pair (attribute, subtrees).
        `subtrees` maps each value of `attribute` seen in `inputs` to the
        subtree built from the matching inputs, plus a None key holding the
        default prediction for values not seen here.

        Empty `inputs` yields False. Note the tie-breaking differs by case:
        when no split candidates remain, a tie between Trues and Falses
        yields True, whereas the None default of a split node is False on
        a tie.
    """
    # count Trues and Falses in the inputs
    num_inputs = len(inputs)
    num_trues = len([label for _, label in inputs if label])
    num_falses = num_inputs - num_trues

    # all labels agree (or there is no data): this is a leaf. Checking this
    # before touching inputs[0] lets an empty `inputs` return False on the
    # first pass instead of raising IndexError.
    if num_trues == 0:
        return False
    if num_falses == 0:
        return True

    # if this is our first pass,
    # all keys of the first input are split candidates
    if split_candidates is None:
        split_candidates = inputs[0][0].keys()

    # no attributes left to split on: return the majority label
    if not split_candidates:
        return num_trues >= num_falses

    # otherwise, split on the attribute whose partition has the lowest entropy
    best_attribute = min(split_candidates, key=partial(partition_entropy_by, inputs))
    partitions = partition_by(inputs, best_attribute)
    new_candidates = [a for a in split_candidates if a != best_attribute]

    # recursively build subtrees
    subtrees = {
        attribute_value: build_tree_id3(subset, new_candidates)
        for attribute_value, subset in partitions.items()
    }

    subtrees[None] = num_trues > num_falses  # default case

    return (best_attribute, subtrees)

In [7]:
# Build the decision tree from the full data set; the bare `tree`
# expression on the last line displays it as the cell output.
tree = build_tree_id3(inputs)
tree

('level',
 {'Senior': ('tweets', {'no': False, 'yes': True, None: False}),
  'Mid': True,
  'Junior': ('phd', {'no': True, 'yes': False, None: True}),
  None: True})

In [8]:
# Classify two junior candidates that differ only in their 'phd' value.
print(classify(tree, {
    "level": "Junior",
    "lang": "Java",
    "tweets": "yes",
    "phd": "no",
}))

print(classify(tree, {
    "level": "Junior",
    "lang": "Java",
    "tweets": "yes",
    "phd": "yes",
}))

True
False


In [9]:
# random forests
def forest_classify(trees, input):
    """Classify `input` with every tree in `trees` and return the most common vote."""
    tally = Counter([classify(member_tree, input) for member_tree in trees])
    top_vote = tally.most_common(1)[0]
    return top_vote[0]