# SF-DAT-21 | Codealong 12

In [24]:
import os
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Keep DataFrame output compact: show at most 10 rows and 10 columns, and
# render frames with their HTML representation in the notebook.
pd.options.display.max_rows = 10
pd.options.display.notebook_repr_html = True
pd.options.display.max_columns = 10

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the 'ggplot' style sheet to all subsequent matplotlib figures.
plt.style.use('ggplot')

## Part A - The 2008 Democratic Primaries

(dataset adapted from http://www.stat.ucla.edu/~cocteau/primaries.csv)

In [25]:
# Load the primaries dataset from the `datasets` directory one level above the
# notebook's working directory.
df = pd.read_csv(os.path.join(os.pardir, 'datasets', '2008-democrat-primaries.csv'))

In [26]:
# Show the column names available in the loaded dataset.
df.columns

Index([u'fips', u'county_name', u'state_postal', u'region', u'election_date',
       u'racetype', u'tvotes', u'clinton', u'obama', u'edwards', u'margin',
       u'winner', u'POP05_SQMI', u'popUnder30_00', u'pop65up_00',
       u'presVote04', u'kerry04', u'Bush04', u'pres04margin', u'pres04winner',
       u'pop06', u'pop00', u'hisp06', u'white06', u'black06', u'indian06',
       u'asian06', u'hawaii06', u'mixed06', u'pct_less_30k', u'pct_more_100k',
       u'pct_hs_grad', u'pct_labor_force', u'pct_homeowner', u'unempFeb07',
       u'unempFeb08', u'unempChg', u'pctUnins00', u'subForPctHomes',
       u'poverty05', u'median_hhi05', u'Catholic', u'So.Bapt.Conv',
       u'Un.Methodist', u'E.L.C.A.', u'Construction', u'Manufacturing',
       u'FinancialActivities', u'GoodsProducing', u'ServiceProviding'],
      dtype='object')

In [27]:
# Summarize each column's dtype and non-null count; columns with fewer non-null
# entries than rows contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2261 entries, 0 to 2260
Data columns (total 50 columns):
fips                   2261 non-null int64
county_name            2261 non-null object
state_postal           2261 non-null object
region                 2261 non-null object
election_date          2261 non-null object
racetype               2261 non-null object
tvotes                 2261 non-null int64
clinton                2261 non-null int64
obama                  2261 non-null int64
edwards                2261 non-null int64
margin                 2261 non-null float64
winner                 2241 non-null object
POP05_SQMI             2260 non-null float64
popUnder30_00          2260 non-null float64
pop65up_00             2260 non-null float64
presVote04             2260 non-null float64
kerry04                2260 non-null float64
Bush04                 2260 non-null float64
pres04margin           2260 non-null float64
pres04winner           2260 non-null object
pop06     

In [28]:
# Derived feature: black06 divided by pop06, i.e. the fraction of each row's
# population counted in the `black06` column.
df['black06_fraction'] = df['black06'].div(df['pop06'])
df['black06_fraction'].head()

0    0.172109
1    0.096363
2    0.462710
3    0.219021
4    0.015451
Name: black06_fraction, dtype: float64

### First cut: Is a county more than 20% black?

In [29]:
# Binary target: True where the recorded winner is 'obama', False otherwise.
# NOTE(review): the df.info() output lists `winner` with 2241 non-null values out
# of 2261 rows; rows with a missing winner compare False here and are therefore
# grouped with the non-Obama class in everything downstream -- confirm intended.
df['y'] = df['winner'].eq('obama')

In [30]:
# First cut on the whole dataset: "left" holds rows with black06_fraction at or
# below 0.2, "right" holds rows above it. A row whose fraction is NaN satisfies
# neither comparison, so it ends up in neither branch.
root_df = df
left_df = root_df.loc[root_df['black06_fraction'] <= 0.2]
right_df = root_df.loc[root_df['black06_fraction'] > 0.2]

#### First cut/right node

In [31]:
# Number of right-branch rows where Obama is the recorded winner (y is True).
right_df['y'].eq(1).sum()

380

In [32]:
# Number of right-branch rows where y is False, i.e. Obama is not the recorded
# winner; this notebook attributes those rows to Clinton.
right_df['y'].eq(0).sum()

70

In [33]:
def obama_vs_clinton(df):
    """Print which candidate carries more of the rows (counties) in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a column `y`. Rows with `y == 1` are tallied for Obama and
        rows with `y == 0` are tallied for Clinton.

    Returns
    -------
    None
        The verdict is printed, not returned.
    """
    obama = (df.y == 1).sum()
    clinton = (df.y == 0).sum()

    # A parenthesised, single-argument print emits identical text whether it is
    # parsed as a Python 2 print statement or a Python 3 print() call.
    if obama > clinton:
        print('Obama wins these counties {} to {}.'.format(obama, clinton))
    elif clinton > obama:
        print('Clinton wins these counties {} to {}.'.format(clinton, obama))
    else:
        # Worded like the two messages above ("N to M").
        print('Obama and Clinton tie in these counties {} to {}.'.format(obama, clinton))

In [34]:
# Tally the right branch of the first cut (black06_fraction > 0.2).
obama_vs_clinton(right_df)

Obama wins these counties 380 to 70.


### Second cut: Is high school graduation rate higher than 78%?

In [35]:
# Second cut: take the first cut's left branch as the new root and split it on
# pct_hs_grad at 0.78.
# NOTE: this cell is not idempotent -- it overwrites left_df/right_df, so running
# it a second time would use the already-narrowed left_df as the root. Re-run
# from the first-cut cell instead.
root_df = left_df
left_df = root_df.loc[root_df['pct_hs_grad'] <= 0.78]
right_df = root_df.loc[root_df['pct_hs_grad'] > 0.78]

In [36]:
# Tally the left branch of the second cut (pct_hs_grad <= 0.78).
obama_vs_clinton(left_df)

Clinton wins these counties 714 to 93.


### Third cut: Is high school graduation rate higher than 87%?

In [37]:
# Third cut: take the second cut's right branch (pct_hs_grad > 0.78) as the new
# root and split it again on pct_hs_grad, this time at 0.87.
# NOTE: this cell is not idempotent -- it overwrites left_df/right_df, so running
# it a second time would use the already-narrowed right_df as the root. Re-run
# from the first-cut cell instead.
root_df = right_df
left_df = root_df.loc[root_df['pct_hs_grad'] <= 0.87]
right_df = root_df.loc[root_df['pct_hs_grad'] > 0.87]

In [38]:
# Tally the right branch of the third cut (pct_hs_grad > 0.87).
obama_vs_clinton(right_df)

Obama wins these counties 176 to 36.


## Part B - Building the 2008 Democratic Primaries Decision Tree by Hand

In [39]:
class Node:
    """One node of a hand-built binary decision tree over a DataFrame.

    A node keeps the observations that reach it (`df`, which must have a `y`
    column), the tree-wide class labels (`classes`), and, for those
    observations, the per-class `counts`, `probabilities` and entropy in bits
    (`before`). Calling `decision` splits the node into `left` and `right`
    children and records the weighted child entropy (`after`) and the
    `information_gain`. Until `decision` has been called those four attributes
    are None.
    """

    @staticmethod
    def root(df):
        """Build the root node for `df`.

        The class labels are the sorted distinct values of `df.y`.
        """
        classes = sorted(set(df.y))
        return Node(classes, df)

    def __init__(self, classes, df):
        """Compute counts, probabilities and entropy of `df` over `classes`.

        Parameters
        ----------
        classes : list
            Class labels to count; children reuse their parent's list so the
            positions in `counts`/`probabilities` line up across the tree.
        df : pandas.DataFrame
            Observations in this node's subspace; must have a `y` column.
        """
        self.classes = classes
        self.df = df

        # Filled in by decision(); None means "no split applied yet".
        self.left = None
        self.right = None
        self.after = None
        self.information_gain = None

        # counts of the remaining observations in the subspace per class
        # (vectorised sum; int() keeps plain Python ints in the list)
        self.counts = [int((self.df.y == y).sum()) for y in self.classes]

        # number of observations in the subspace
        self.samples = sum(self.counts)

        # for empty subspaces, probabilities and entropy are set to zero
        if self.samples == 0:
            self.probabilities = [0. for _ in self.counts]
            self.before = 0.
        else:
            self.probabilities = [float(count) / self.samples for count in self.counts]
            # Shannon entropy in bits. Zero-probability classes are skipped to
            # avoid log(0). Subtracting from 0. (rather than negating) yields
            # 0.0 instead of -0.0 for a pure node.
            self.before = 0. - sum(p * math.log(p, 2)
                                   for p in self.probabilities if p > 0.)

    def decision(self, left_filter):
        """Split this node with a predicate and record the information gain.

        Parameters
        ----------
        left_filter : callable
            Called with this node's DataFrame; must return one boolean per row
            (a boolean Series, array or list). Rows flagged True go to the left
            child; every other row goes to the right child (the "else").

        Returns
        -------
        Node
            This node, so calls can be chained (e.g. `.decision(...).status()`).

        Calling `decision` again replaces the children and statistics of any
        earlier split.
        """
        # Evaluate the predicate once; a plain boolean array gives a cheap,
        # vectorised complement for the right-hand side.
        left_mask = np.asarray(left_filter(self.df), dtype=bool)

        # observations for which the decision split is true -> left child
        self.left = Node(self.classes, self.df[left_mask])

        # observations that don't satisfy the decision split -> right child
        self.right = Node(self.classes, self.df[~left_mask])

        # the entropy after the decision split is the weighted average of the
        # children entropy; an empty node has nothing to average, so use 0
        # rather than dividing by zero
        if self.samples == 0:
            self.after = 0.
        else:
            self.after = (self.left.samples * self.left.before
                          + self.right.samples * self.right.before) / self.samples

        # the information gain corresponds to the entropy lost between the
        # parent node (this node and the "before") and its children (the "after")
        self.information_gain = self.before - self.after

        return self

    @staticmethod
    def _print_node(label, node):
        """Print the samples/counts/probabilities/entropy lines for one node."""
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("\t{}:".format(label))
        print("\t\tsamples       = {}".format(node.samples))
        print("\t\tcounts        = {}".format(node.counts))
        print("\t\tprobabilities = {}".format(node.probabilities))
        print("\t\tentropy       = {}".format(node.before))

    def status(self):
        """Print this node's statistics and, if split, its children's.

        Before `decision` has been called only the parent section is printed,
        followed by a note that no split exists yet.
        """
        print("before:")
        self._print_node("parent", self)
        print("after:")
        if self.left is None or self.right is None:
            print("\t(no decision split has been applied yet)")
            return
        self._print_node("left child", self.left)
        self._print_node("right child", self.right)
        print("")
        print("before entropy                = {}".format(self.before))
        print("after entropy                 = {}".format(self.after))
        print("information gain              = {}".format(self.information_gain))

### First cut

In [40]:
# Root node over the full dataset; its class labels are the distinct values of df.y.
node = Node.root(df)

#### Candidate #1: Is a county more than 20% black?

In [41]:
# Candidate split on the root: rows with black06_fraction <= 0.2 go left, all
# other rows go right. The call returns the node itself, which the notebook
# echoes as the cell output.
node.decision(lambda frame: frame['black06_fraction'] <= 0.2)

<__main__.Node instance at 0x0000000003E80188>

In [42]:
# Print the entropy before/after the split applied above and the resulting
# information gain.
node.status()

before:
	parent:
		samples       = 2261
		counts        = [1230, 1031]
		probabilities = [0.5440070765148165, 0.45599292348518355]
		entropy       = 0.994404850929
after:
	left child:
		samples       = 1810
		counts        = [1160, 650]
		probabilities = [0.6408839779005525, 0.35911602209944754]
		entropy       = 0.941946973925
	right child:
		samples       = 451
		counts        = [70, 381]
		probabilities = [0.15521064301552107, 0.844789356984479]
		entropy       = 0.622727770437

before entropy                = 0.994404850929
after entropy                 = 0.878272555184
information gain              = 0.116132295745


#### Candidate #2: Is high school graduation rate higher than 78%?

In [43]:
# Candidate split on the same root node: rows with pct_hs_grad <= 0.78 go left.
# This replaces the children recorded by the previous candidate, then prints
# the new statistics.
node.decision(lambda frame: frame['pct_hs_grad'] <= 0.78).status()

before:
	parent:
		samples       = 2261
		counts        = [1230, 1031]
		probabilities = [0.5440070765148165, 0.45599292348518355]
		entropy       = 0.994404850929
after:
	left child:
		samples       = 1174
		counts        = [781, 393]
		probabilities = [0.6652470187393527, 0.33475298126064734]
		entropy       = 0.919708944493
	right child:
		samples       = 1087
		counts        = [449, 638]
		probabilities = [0.41306347746090155, 0.5869365225390984]
		entropy       = 0.978081108578

before entropy                = 0.994404850929
after entropy                 = 0.947771988438
information gain              = 0.0466328624907


#### Candidate #3: Is high school graduation rate higher than 87%?

In [None]:
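# TODO: evaluate this candidate on `node` (split on pct_hs_grad at 0.87, as for
# candidate #2) and compare its information gain with candidates #1 and #2.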