In [1]:
# Sample rows used throughout this notebook. Columns, by index:
#   0: Referrer, 1: Location, 2: ReadFAQ, 3: Pages Viewed, 4: Service Chosen.
# Columns 0-2 and 4 hold strings; column 3 (Pages Viewed) holds integers.
data = [['slashdot', 'USA', 'yes', 18, 'None'],
        ['google', 'France', 'yes', 23, 'Premium'],
        ['digg', 'USA', 'yes', 24, 'Basic'],
        ['kiwitobes', 'France', 'yes', 23, 'Basic'],
        ['google', 'UK', 'no', 21, 'Premium'],
        ['(direct)', 'New Zealand', 'no', 12, 'None'],
        ['(direct)', 'UK', 'no', 21, 'Basic'],
        ['google', 'USA', 'no', 24, 'Premium'],
        ['slashdot', 'France', 'yes', 19, 'None'],
        ['digg', 'USA', 'no', 18, 'None'],
        ['google', 'UK', 'no', 18, 'None'],
        ['kiwitobes', 'UK', 'no', 19, 'None'],
        ['digg', 'New Zealand', 'yes', 12, 'Basic'],
        ['slashdot', 'UK', 'no', 21, 'None'],
        ['google', 'UK', 'yes', 18, 'Basic'], 
        ['kiwitobes', 'France', 'yes', 19, 'Basic']]

In [3]:
class DecisionNode:
    """One node of a decision tree; it simply stores the five values it is given.

    NOTE(review): nothing in this section reads these attributes yet, so the
    meanings noted below are inferred from the names and the original inline
    comments -- confirm against the tree-building code.
    """

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        # Presumably the column index and the value that column is tested
        # against -- TODO confirm.
        self.col, self.value = col, value
        # Presumably the outcome counts held by a leaf node -- TODO confirm.
        self.results = results
        # Child decision nodes for the true branch and the false branch.
        self.tb, self.fb = tb, fb
    
# Divides a set on a specific column. Can handle numeric or nominal values.
def divide_set(rows, column, value):
    """Partition rows into two lists according to one column.

    Args:
        rows: Iterable of indexable rows. It is traversed exactly once, so a
            generator or other one-shot iterator is partitioned correctly.
        column: Index of the column to test in each row.
        value: Split value. An int or float is treated as a threshold
            (``row[column] >= value``); anything else, including bool, is
            treated as a nominal label (``row[column] == value``).

    Returns:
        A tuple ``(set1, set2)``: the rows that satisfy the test and the rows
        that do not, each in their original order.
    """
    # bool is a subclass of int, but True/False are labels, not thresholds,
    # so they must go through the equality test.
    is_numeric = isinstance(value, (int, float)) and not isinstance(value, bool)

    set1, set2 = [], []
    # Single pass: each row is tested once and appended to exactly one side.
    for row in rows:
        cell = row[column]
        matched = cell >= value if is_numeric else cell == value
        if matched:
            set1.append(row)
        else:
            set2.append(row)
    return (set1, set2)

In [4]:
# Split the sample rows on column 2 (ReadFAQ): rows equal to 'yes' vs. the rest.
divide_set(data, 2, 'yes')

([['slashdot', 'USA', 'yes', 18, 'None'],
  ['google', 'France', 'yes', 23, 'Premium'],
  ['digg', 'USA', 'yes', 24, 'Basic'],
  ['kiwitobes', 'France', 'yes', 23, 'Basic'],
  ['slashdot', 'France', 'yes', 19, 'None'],
  ['digg', 'New Zealand', 'yes', 12, 'Basic'],
  ['google', 'UK', 'yes', 18, 'Basic'],
  ['kiwitobes', 'France', 'yes', 19, 'Basic']],
 [['google', 'UK', 'no', 21, 'Premium'],
  ['(direct)', 'New Zealand', 'no', 12, 'None'],
  ['(direct)', 'UK', 'no', 21, 'Basic'],
  ['google', 'USA', 'no', 24, 'Premium'],
  ['digg', 'USA', 'no', 18, 'None'],
  ['google', 'UK', 'no', 18, 'None'],
  ['kiwitobes', 'UK', 'no', 19, 'None'],
  ['slashdot', 'UK', 'no', 21, 'None']])

## Choosing the Best Split

In [7]:
# Tally how often each result occurs; a row's result is the value in its
# final column.
def unique_count(rows):
    """Return a dict mapping each distinct last-column value to its row count."""
    tally = {}
    for row in rows:
        label = row[-1]  # the result lives in the last column
        tally[label] = tally.get(label, 0) + 1
    return tally

## Gini Impurity

The expected error rate if one of the results from a set is randomly applied to one of the items in the set. If every item in the set is in the same category, the guess will always be correct, so the error rate is 0.

In [8]:
# Probability that an item ends up in the wrong category when it is labelled
# at random according to the mix of results in the set.
def gini_impurity(rows):
    """Return the Gini impurity of rows.

    The value is the sum of p(a) * p(b) over every ordered pair of distinct
    results a and b, where p is a result's share of the rows as tallied by
    unique_count(). A set with fewer than two distinct results yields 0.
    """
    size = len(rows)
    tally = unique_count(rows)

    # Convert each result count into its share of the set once, up front.
    share = {}
    for label in tally:
        share[label] = float(tally[label]) / size

    impurity = 0
    for first in share:
        for second in share:
            # Only pairs of different results count as a wrong placement.
            if first != second:
                impurity += share[first] * share[second]
    return impurity

## Entropy

The amount of disorder in a set - basically, how mixed the set is.

In [9]:
# Entropy is the negated sum of p(x) * log2(p(x)) across all the different
# possible results.
def entropy(rows):
    """Return the entropy, in bits, of the result column of rows.

    Results are tallied with unique_count(); each result contributes
    -p * log2(p), where p is its share of the rows. With no results to
    tally, the function returns 0.0.
    """
    from math import log

    ln2 = log(2)  # divisor that turns a natural log into a base-2 log
    tally = unique_count(rows)

    total = 0.0
    for count in tally.values():
        p = float(count) / len(rows)
        total -= p * (log(p) / ln2)
    return total

In [10]:
# Gini impurity of the full, unsplit sample data.
gini_impurity(data)

0.6328125

In [11]:
# Entropy of the full, unsplit sample data.
entropy(data)

1.5052408149441479

In [12]:
# Split on column 2 (ReadFAQ) == 'yes', then measure the entropy of the
# matching subset (set1).
set1, set2 = divide_set(data, 2, 'yes')
entropy(set1)

1.2987949406953985

In [13]:
# Gini impurity of set1, the first group returned by the divide_set call in
# the previous cell.
gini_impurity(set1)

0.53125