# DS-SF-25 | Codealong 12 | Decision Trees and Random Forests

In [1]:
import os

import numpy as np
import pandas as pd
# Keep rendered DataFrames compact: show at most 10 rows and 10 columns, and
# use the HTML table representation in the notebook.
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import matplotlib.pyplot as plt
# Draw figures inline in the notebook and apply the 'ggplot' style sheet.
%matplotlib inline
plt.style.use('ggplot')

# math.log is used by the entropy computation in the Node class (Part B).
import math

## Part A - The 2008 Democratic Primaries

(dataset adapted from http://www.stat.ucla.edu/~cocteau/primaries.csv)

In [2]:
# Load the 2008 Democratic primaries data. The path is relative
# ('../datasets/...'), so it resolves against the kernel's working directory.
df = pd.read_csv(os.path.join('..', 'datasets', '2008-democrat-primaries.csv'))

In [3]:
# List every column name to see which variables are available for the splits.
df.columns

Index([u'fips', u'county_name', u'state_postal', u'region', u'election_date',
       u'racetype', u'tvotes', u'clinton', u'obama', u'edwards', u'margin',
       u'winner', u'POP05_SQMI', u'popUnder30_00', u'pop65up_00',
       u'presVote04', u'kerry04', u'Bush04', u'pres04margin', u'pres04winner',
       u'pop06', u'pop00', u'hisp06', u'white06', u'black06', u'indian06',
       u'asian06', u'hawaii06', u'mixed06', u'pct_less_30k', u'pct_more_100k',
       u'pct_hs_grad', u'pct_labor_force', u'pct_homeowner', u'unempFeb07',
       u'unempFeb08', u'unempChg', u'pctUnins00', u'subForPctHomes',
       u'poverty05', u'median_hhi05', u'Catholic', u'So.Bapt.Conv',
       u'Un.Methodist', u'E.L.C.A.', u'Construction', u'Manufacturing',
       u'FinancialActivities', u'GoodsProducing', u'ServiceProviding'],
      dtype='object')

In [4]:
# Preview the first five rows. Because display.max_columns is set to 10, the
# middle columns are elided as '...' in the rendered table.
df.head()

Unnamed: 0,fips,county_name,state_postal,region,election_date,...,Construction,Manufacturing,FinancialActivities,GoodsProducing,ServiceProviding
0,1001,Autauga,AL,S,2/5/08,...,6.797467,17.57751,5.366229,26.776236,73.223764
1,1003,Baldwin,AL,S,2/5/08,...,10.558143,9.230177,7.923872,21.282357,78.717643
2,1005,Barbour,AL,S,2/5/08,...,2.501616,45.067103,3.379843,51.27552,48.72448
3,1007,Bibb,AL,S,2/5/08,...,20.750603,15.723631,3.477562,42.557099,57.442901
4,1009,Blount,AL,S,2/5/08,...,9.754604,22.763883,4.300316,34.129339,65.870661


In [5]:
# Summary statistics for the numeric columns.
# NOTE(review): in the output below the right-hand columns (Construction ...
# ServiceProviding) report inf for mean/max, NaN for std and the quartiles, and
# counts below 2261 -- so they appear to contain infinite and missing values
# that need cleaning before these columns are used.
df.describe()



Unnamed: 0,fips,tvotes,clinton,obama,edwards,...,Construction,Manufacturing,FinancialActivities,GoodsProducing,ServiceProviding
count,2261.0,2261.0,2261.0,2261.0,2261.0,...,2258.0,2258.0,2260.0,2256.0,2259.0
mean,32494.121628,11522.77,5374.345865,5662.964617,329.61831,...,inf,inf,inf,inf,inf
std,16586.65425,46135.43,21682.477151,24456.840787,1189.247283,...,,,,,
min,1001.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0
25%,17147.0,758.0,309.0,249.0,13.0,...,,,,,
50%,36013.0,2405.0,1138.0,916.0,66.0,...,,,,,
75%,48223.0,6223.0,3213.0,2774.0,218.0,...,,,,,
max,56045.0,1271094.0,699743.0,728328.0,26896.0,...,inf,inf,inf,inf,inf


In [7]:
# How often each distinct Clinton vote total occurs across rows; the most
# frequent totals are listed first.
df.clinton.value_counts()

1        16
2        14
10       11
3        10
4         9
         ..
17277     1
4615      1
2568      1
523       1
4111      1
Name: clinton, dtype: int64

### First cut: Is a county more than 20% black?

In [None]:
# TODO

#### First cut/right node

In [None]:
# TODO

In [None]:
def obama_vs_clinton(df):
    """Print which candidate carries more of the counties in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a column ``c``. Rows where ``c == 1`` are tallied for Obama
        and rows where ``c == 0`` are tallied for Clinton; rows holding any
        other value are counted for neither.

    Returns
    -------
    None
        The outcome is printed, not returned.
    """
    outcome = df['c']

    # Cast to plain ints so the tallies always format as bare numbers.
    obama = int((outcome == 1).sum())
    clinton = int((outcome == 0).sum())

    # Single-argument print(...) calls produce identical output on Python 2
    # and Python 3, unlike the Python 2-only print statement.
    if obama > clinton:
        print('Obama wins these counties {} to {}.'.format(obama, clinton))
    elif clinton > obama:
        print('Clinton wins these counties {} to {}.'.format(clinton, obama))
    else:
        print('Obama and Clinton tie in these counties {} {}.'.format(obama, clinton))

In [None]:
# NOTE(review): right_child_df is not defined in any cell shown here;
# presumably the TODO cell for the first cut is meant to create it -- it must
# exist (with a column c) before this cell runs.
obama_vs_clinton(right_child_df)

### Second cut: Is high school graduation rate higher than 78%?

In [None]:
# TODO

In [None]:
# NOTE(review): left_child_df is not defined in any cell shown here;
# presumably the TODO cell for the second cut is meant to create it -- it must
# exist (with a column c) before this cell runs.
obama_vs_clinton(left_child_df)

### Third cut: Is high school graduation rate higher than 87%?

In [None]:
# TODO

In [None]:
# NOTE(review): right_child_df is expected to be (re)assigned by the TODO cell
# for the third cut; no cell shown here defines it -- confirm before running.
obama_vs_clinton(right_child_df)

## Part B - Building the 2008 Democratic Primaries Decision Tree by Hand

In [None]:
class Node(object):
    """A node of a hand-built binary decision tree over the class column ``c``.

    Each node holds the observations (``df``) that reached it, the per-class
    counts and probabilities of those observations, and their entropy
    (``before``). Calling :meth:`decision` splits the node into ``left`` and
    ``right`` children and records the weighted child entropy (``after``) and
    the resulting ``information_gain``.
    """

    @staticmethod
    def root(root_df):
        """Create the root node for ``root_df``.

        The list of classes is the sorted set of distinct values found in
        ``root_df['c']``; it is shared by every descendant node.
        """
        cs = sorted(set(root_df['c']))
        return Node(cs, root_df)

    def decision(self, left_filter):
        """Split this node in two and measure the information gain.

        Parameters
        ----------
        left_filter : callable
            Called with this node's DataFrame; must return one boolean per row.
            Rows flagged True go to the left child, all others to the right.

        Returns
        -------
        Node
            ``self``, so calls can be chained (e.g. ``node.decision(f).status()``).
        """
        # Evaluate the split once and coerce it to a positional boolean array.
        # Selecting both children from the same mask (and its negation) keeps
        # every row in exactly one child even when index labels repeat, which
        # dropping the left child's labels from the parent does not guarantee.
        mask = np.asarray(left_filter(self.df), dtype=bool)

        # Observations for which the decision split is true ...
        self.left = Node(self.cs, self.df[mask])

        # ... and the observations that don't satisfy it (the "else")
        self.right = Node(self.cs, self.df[~mask])

        # The entropy after the decision split is the weighted average of the
        # children entropy. An empty node has nothing to weight, so its
        # "after" entropy is zero (and this avoids dividing by zero).
        if self.samples == 0:
            self.after = 0.
        else:
            self.after = (self.left.samples * self.left.before
                          + self.right.samples * self.right.before) / self.samples

        # The information gain corresponds to the entropy lost between the
        # parent node (this node and the "before") and its children (the "after")
        self.information_gain = self.before - self.after

        return self

    def __init__(self, cs, df):
        """Store the subspace ``df`` and compute its class statistics.

        Parameters
        ----------
        cs : list
            The class values to count, in display order.
        df : pandas.DataFrame
            Observations in this subspace; must have a column ``c``.
        """
        self.cs = cs
        self.df = df

        # Children and split statistics only exist once decision() has run
        self.left = None
        self.right = None
        self.after = None
        self.information_gain = None

        # Counts of the remaining observations in the subspace per class
        # (plain ints so they print as bare numbers)
        labels = self.df['c']
        self.counts = [int((labels == c).sum()) for c in self.cs]

        # Number of observations in the subspace
        self.samples = sum(self.counts)

        # For empty subspaces, probabilities and entropy are set to zero
        if self.samples == 0:
            self.probabilities = [0. for _ in self.counts]
            self.before = 0.
        else:
            self.probabilities = [float(count) / self.samples for count in self.counts]
            # Shannon entropy in bits; zero-probability classes contribute
            # nothing. Summing the negated terms (rather than negating the
            # sum) makes a pure node report 0.0 instead of -0.0.
            self.before = sum(-p * math.log(p, 2)
                              for p in self.probabilities if p > 0.)

    @staticmethod
    def _emit(label, value):
        """Print ``label`` and ``value`` separated by one space."""
        # A single pre-formatted argument prints identically on Python 2 and 3
        print('{} {}'.format(label, value))

    @staticmethod
    def _emit_node(node):
        """Print the samples, counts, probabilities and entropy of ``node``."""
        Node._emit("\t\tsamples       =", node.samples)
        Node._emit("\t\tcounts        =", node.counts)
        Node._emit("\t\tprobabilities =", node.probabilities)
        Node._emit("\t\tentropy       =", node.before)

    def status(self):
        """Print a report of this node and, if it has been split, its children.

        Before :meth:`decision` has been called only the parent statistics and
        the "before" entropy are available; the child and gain lines are
        replaced by a short notice instead of raising an error.
        """
        Node._emit('classes                       =', self.cs)
        print('before:')
        print("\tparent:")
        Node._emit_node(self)
        print('after:')

        if self.left is None or self.right is None:
            print("\t(no decision split has been applied to this node yet)")
            print('')
            Node._emit('before entropy                =', self.before)
            return

        print("\tleft child:")
        Node._emit_node(self.left)
        print("\tright child:")
        Node._emit_node(self.right)
        print('')
        Node._emit('before entropy                =', self.before)
        Node._emit('after entropy                 =', self.after)
        Node._emit('information gain              =', self.information_gain)

In [None]:
# Use the primary winner as the class column for the tree.
# Item assignment is required here: attribute-style assignment (df.c = ...)
# only updates a column that already exists; otherwise it merely sets a Python
# attribute on this one object, which filtered/derived frames do not inherit.
df['c'] = df.winner

### First cut

In [None]:
# TODO

#### Candidate #1: Is a county more than 20% black?

In [None]:
# TODO

#### Candidate #2: Is high school graduation rate higher than 78%?

In [None]:
# TODO

#### Candidate #3: Is high school graduation rate higher than 87%?

In [None]:
# TODO

### Second cut

In [None]:
# TODO

### Third cut

In [None]:
# TODO