In [1]:
from highcharts import Highmap
from highcharts import Highchart

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import requests
import seaborn as sns
sns.set_context("talk", font_scale=1.4)

from state_details import electoral_college_votes_by_state, hc_keys_by_state

### Get population by US county from the US Census API

In [2]:
# API key requested from www.census.gov.
# The `with` block closes the file even if the read fails, and strip() drops the
# trailing newline without cutting off the key's last character when the file
# has no final newline (the old `[:-1]` slice always removed one character).
with open('/Users/brianna/Dropbox/data_project/census_api_key.txt', 'r') as key_file:
    api_key = key_file.readline().strip()

In [3]:
# Once the data is downloaded, just save it to keep from having to continually re-request from the Census API.
#   True  -> read the saved 'population_by_us_county.csv'
#   False -> query the Census API and (re)write that csv
upload_saved_file = True

In [4]:
# Get population by US county (either from API or from saved file)
if upload_saved_file:
    print('Reading population data locally')
    # read_csv(index_col=0) replaces DataFrame.from_csv, which was deprecated and
    # later removed from pandas. The first csv column is the index written by to_csv.
    population = pd.read_csv('population_by_us_county.csv', index_col=0)
else:
    print('Grabbing population data from US Census')
    url = 'http://api.census.gov/data/2015/pep/population?get=POP,GEONAME&for=%s:*&DATE=8&key=%s' % ('county', api_key)
    response = requests.get(url)

    if response.status_code != 200:
        raise ValueError('Unexpected status code: %s' % response.status_code)

    # Parse the body as JSON instead of eval()-ing text that came over the network.
    # The first row of the payload holds the column names.
    data = response.json()
    headers = data.pop(0)
    population = pd.DataFrame(data, columns=headers)
    population['state'] = population['state'].astype(int)
    # Replace commas with semicolons and save as csv file
    population['GEONAME'] = population['GEONAME'].apply(lambda name: name.replace(',', ';'))
    population.to_csv('population_by_us_county.csv')

Reading population data locally


### Remove non-voting states

In [5]:
# Report the number of distinct state codes before and after dropping
# Puerto Rico (state code 72).
print('%s unique states' % population['state'].nunique())
print('Remove Puerto Rico (72)')
is_puerto_rico = population['state'].isin([72])
pop_df = population.drop(population.index[is_puerto_rico.values])
print('%s unique states' % pop_df['state'].nunique())

52 unique states
Remove Puerto Rico (72)
51 unique states


### Split GEONAME into County, State, Division, Region, Country

In [6]:
# Keep the numeric Census codes under *_no names so the plain 'state'/'county'
# names can hold the text pulled out of GEONAME below.
# Guarded so that re-running this cell does not rename the freshly created text
# columns on top of the numeric ones (which would leave duplicate column names).
if 'state_no' not in pop_df.columns:
    pop_df = pop_df.rename(columns={'state': 'state_no', 'county': 'county_no', 'POP': 'population'})

# GEONAME is '; '-separated; its first four pieces are county, state, division and region.
# Split once and reuse the pieces instead of re-splitting the string for every column.
geoname_parts = pop_df['GEONAME'].str.split('; ')
for position, level in enumerate(['county', 'state', 'division', 'region']):
    pop_df[level] = geoname_parts.str[position]

In [7]:
# Look up the Highcharts map key for every row's state name (needed to join data to the map)
pop_df['hc_key'] = [hc_keys_by_state[state_name] for state_name in pop_df['state']]

In [8]:
# Cast the population counts (the Census POP field) to integers
pop_df['population'] = pop_df['population'].map(int)

### Get population by state

In [9]:
# Total population per state, indexed by Highcharts map key.
# Only 'population' is summed: adding up the numeric state/county codes is
# meaningless, and a bare .sum() would also try to aggregate the text columns.
state_pop = pop_df.groupby('hc_key')[['population']].sum()
# Keep the key available as an ordinary column as well as the index
state_pop['hc_key'] = state_pop.index

In [10]:
# Invert the state-name -> hc-key mapping so each hc-key can be turned back into a state name.
# .items() works on both Python 2 and Python 3; .iteritems() only exists on Python 2 dicts.
state_by_hc_key = {hc_key: state_name for state_name, hc_key in hc_keys_by_state.items()}
state_pop['state'] = state_pop['hc_key'].apply(lambda hc_key: state_by_hc_key[hc_key])

### Include electoral college votes by state

In [11]:
# Electoral college votes for each state, and the share of one vote per person.
# Note the denominator is the state's total population, not a count of voters.
state_pop['ec_votes'] = [electoral_college_votes_by_state[state_name] for state_name in state_pop['state']]
state_pop['ec_votes_per_voter'] = state_pop['ec_votes'] / state_pop['population']

# The per-person shares are tiny and hard to read, so express each state's share as a
# percentage of the national average share (100 = exactly average), rounded to a whole number.
total_ec_votes = sum(state_pop['ec_votes'])
total_population = sum(state_pop['population'])
ave_ec_votes_per_voter = total_ec_votes * 1.0 / total_population
percent_of_average = state_pop['ec_votes_per_voter'] * 100.0 / ave_ec_votes_per_voter
state_pop['ec_votes_per_voter_percent_of_average'] = percent_of_average.apply(lambda share: round(share))

### Plot By State

In [12]:
# Chart title for each measure that can be mapped
plot_titles = {'population': 'US Population by State',
               'ec_votes_per_voter': 'Electoral College Vote Portion Per Voter',
               'ec_votes_per_voter_percent_of_average': 'Percent of National Average Representation Per Voter'}

# Measure to plot; the alternatives are 'population' and 'ec_votes_per_voter'
plot_measure = 'ec_votes_per_voter_percent_of_average'

# Fill in the data structure for plotting: one {'hc-key', 'value'} record per state
measure_by_hc_key = state_pop[plot_measure]
state_pop_data = [{'hc-key': hc_key, 'value': measure_by_hc_key[hc_key]}
                  for hc_key in state_pop.index]

In [18]:
# Highcharts options for the state map
options = {
    'chart': {'renderTo': 'container'},
    'title': {'text': plot_titles[plot_measure]},
    'mapNavigation': {
        'enabled': True,
        'buttonOptions': {'verticalAlign': 'bottom'},
    },
    'colorAxis': {
        'type': 'linear',
        # 'min': 5e5,
        'minColor': '#FFFFFF',  # white
        'maxColor': '#BC0AD1',  # dark magenta (alternative: '#699C02', dark green)
    },
}

In [None]:
H = Highmap(width=650, height=500)

H.set_dict_options(options)  # set options

# Name the series after the measure actually being plotted. It used to be hard-coded
# to 'Population', which mislabels the data whenever plot_measure is anything else.
# joinBy='hc-key' matches the 'hc-key' field of each record in state_pop_data.
H.add_data_set(state_pop_data, 'map', plot_titles[plot_measure], joinBy='hc-key',
               states={
                   'hover': {
                       'color': '#FFC300'
                   }
               },
               dataLabels={
                   'enabled': True,
                   'format': '{point.properties.postal}'
               })

# source is javascript link from http://code.highcharts.com/mapdata/
# (mainland-only alternative:
#  'http://code.highcharts.com/mapdata/countries/us/custom/us-all-mainland.js')
H.set_map_source('http://code.highcharts.com/mapdata/countries/us/us-all.js')

# Last expression of the cell, so the notebook displays the chart
H

In [None]:
# Write the chart to disk, using the plotted measure as the file name
# (NOTE(review): presumably produces '<plot_measure>.html' - confirm against the highcharts package)
H.save_file(plot_measure)

### Does this over-representation consistently benefit any particular demographic?

In [None]:
# Read in the white-alone population by state from the 2010 Census (SF1)
#
# Documentation here: http://api.census.gov/data/2010/sf1/variables.html
# People who are white alone:
#   'http://api.census.gov/data/2010/sf1?key=%(api_key)s&get=PCT012A001,NAME&for=state:*' % {'api_key': api_key}
# White men age 1 to 109
#   PCT012A[003, 105]
# White women age 1 to 109
#   PCT012A[107, 209]
#   http://api.census.gov/data/2010/sf1?key=%(api_key)s&get=PCT012A186,PCT012A187,NAME&for=state:*
#
# Rural H0020005

url = 'http://api.census.gov/data/2010/sf1?key=%(api_key)s&get=PCT012A001,NAME&for=state:*' % {'api_key': api_key}
response = requests.get(url)

# Fail loudly on an error response instead of trying to parse it
if response.status_code != 200:
    raise ValueError('Unexpected status code: %s' % response.status_code)

# Parse the body as JSON instead of eval()-ing text that came over the network.
# The first row of the payload holds the column names.
data = response.json()
headers = data.pop(0)
white = pd.DataFrame(data, columns=headers)
white['state'] = white['state'].astype(int)
white['PCT012A001'] = white['PCT012A001'].astype(int)
# Numeric state code -> 'state_no', state name -> 'state', so 'state' can be joined on by name
white = white.rename(columns={'PCT012A001': 'white_pop', 'state': 'state_no', 'NAME': 'state'})

In [None]:
# Attach each state's white population by state name.
# A left join keeps exactly the rows already in state_pop. `white` was fetched for every
# `state:*` without the Puerto Rico filter applied to the county data, so an outer join
# would add all-NaN rows for any area that only appears in `white`.
state_pop = state_pop.merge(white[['state', 'white_pop']], on='state', how='left')

In [None]:
# Share of each state's population that is white, as a fraction between 0 and 1
state_pop['percent_white'] = (state_pop['white_pop'] * 1.0).div(state_pop['population'])

In [None]:
# One point per state: white share of the population vs. relative electoral college representation
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(state_pop['percent_white'], state_pop['ec_votes_per_voter_percent_of_average'])
ax.set_xlabel('Percent White')
ax.set_ylabel('Percent of National Average Voter Representation')
plt.show()

In [None]:
# Names of the states where the white share is below one half
state_pop.loc[state_pop['percent_white'] < .5, 'state']

# Read in population info by age/gender/race/state

In [None]:
# Describe every Census variable that will be requested, as defined in the documentation:
# http://api.census.gov/data/2010/sf1/variables.html
#
# The plain race tables use the letters A-G (A white, B black, C native_american, D asian,
# E pacific_islander, F other, G two_or_more_races) plus H for hispanic. Using A-G would
# double count people who identify as, for example, "White, Hispanic".
# To avoid that, use the "[Race], non-hispanic" tables (I-O) together with H. Everyone who
# identifies as hispanic is then classified only as hispanic, not as any other race they
# specified. Otherwise, because of the way the census is constructed, about 50 million
# people end up double counted.
race_codes = {'I': 'white',
              'J': 'black',
              'K': 'native_american',
              'L': 'asian',
              'M': 'pacific_islander',
              'N': 'other',
              'O': 'two_or_more_races',
              'H': 'hispanic'}

# For each race table, variables 003-103 are recorded as males aged 0-100 and
# variables 107-207 as females aged 0-100 (offset from the first variable = age).
census_variables = {}
for race_letter, race_name in race_codes.items():
    for gender, first_variable in (('male', 3), ('female', 107)):
        for age in range(101):
            variable = 'PCT012%s%s' % (race_letter, str(first_variable + age).zfill(3))
            census_variables[variable] = {'age': age, 'gender': gender, 'race': race_name}

In [None]:
# True  -> read the saved 'population_by_gender_age_race_state_nh.csv'
# False -> re-download every age/gender/race variable from the Census API and rewrite that csv
upload_from_file = True

In [None]:
# Population per state for every age/gender/race group (from the saved file or the API).
# Note: this rebinds `population`, which earlier held the county-level table.
if upload_from_file:
    # read_csv(index_col=0) replaces DataFrame.from_csv, which was deprecated and
    # later removed from pandas. The first csv column is the index written by to_csv.
    population = pd.read_csv('population_by_gender_age_race_state_nh.csv', index_col=0)
else:
    # For each variable (ie. each combination of age/gender/race), get the population per state.
    # That is one API call per entry of census_variables, each returning one row per state.
    n_variables = len(census_variables)
    state_frames = []
    for v_count, v in enumerate(census_variables, 1):
        # Call the API to get populations by state for this age/gender/race group
        url = 'http://api.census.gov/data/2010/sf1?key=%(api_key)s&get=%(v)s,NAME&for=state:*' % {'v': v, 'api_key': api_key}
        response = requests.get(url)
        if response.status_code != 200:
            raise ValueError('Unexpected status code: %s' % response.status_code)
        # Parse the body as JSON instead of eval()-ing text that came over the network
        data = response.json()

        # Turn the response into a dataframe and make it pretty
        headers = data.pop(0)
        df = pd.DataFrame(data, columns=headers)
        df = df.drop('state', axis=1).rename(columns={v: 'population', 'NAME': 'state'})
        df['population'] = df['population'].astype(int)

        # Add columns defining the group
        df['age'] = census_variables[v]['age']
        df['gender'] = census_variables[v]['gender']
        df['race'] = census_variables[v]['race']
        state_frames.append(df)

        # Report progress against the real variable count rather than a hard-coded total
        if v_count % 20 == 0:
            print('%s of %s variables completed' % (v_count, n_variables))

    # Build the final frame with one concat instead of growing it with DataFrame.append
    # inside the loop (quadratic copying, and append was removed from pandas).
    columns = ['population', 'state', 'age', 'gender', 'race']
    if state_frames:
        population = pd.concat(state_frames)[columns]
    else:
        population = pd.DataFrame(columns=columns)

    population.to_csv('population_by_gender_age_race_state_nh.csv')

In [None]:
# Sanity check: total number of people across every state/age/gender/race row
population['population'].sum()


## Plot Age distribution by race

In [None]:
# Population per (race, age), summed over states and genders.
# Only 'population' is aggregated; a bare .sum() would also try to aggregate the
# text columns (state, gender), which is wasted work and produces junk columns.
gr = population.groupby(['race', 'age'])[['population']].sum()
# Copy the two index levels into ordinary columns for easy filtering and plotting
gr['race'] = gr.index.get_level_values(0)
gr['age'] = gr.index.get_level_values(1)

# Line colour used for each race in the age-distribution plots
race_colors = {'white': 'b',
               'black': 'm',
               'native_american': 'k',
               'asian': 'g',
               'pacific_islander': 'grey',
               'other': 'cyan',
               'two_or_more_races': 'r',
               'hispanic': 'orange'}

In [None]:
def plot_age_distribution_by_race(age_table, ylabel, normalize=False, min_age=None):
    """Draw one population-vs-age line per race on a new 12x8 figure and show it.

    age_table: frame with 'race', 'age' and 'population' columns, one row per race/age.
    ylabel: label for the y axis.
    normalize: if True, divide each race's counts by that race's total over the plotted ages.
    min_age: if given, only rows with age >= min_age are plotted (and used for the total).
    """
    plt.figure(figsize=(12, 8))
    for race in age_table.race.unique():
        selected = age_table.race == race
        if min_age is not None:
            selected = selected & (age_table.age >= min_age)
        rows = age_table[selected]
        counts = rows.population
        if normalize:
            counts = counts / np.sum(counts)
        plt.plot(rows.age, counts, label=race, color=race_colors[race])

    plt.xlabel('Age')
    plt.ylabel(ylabel)
    plt.legend(bbox_to_anchor=(.65, 1), loc=2, borderaxespad=0.)
    plt.show()


# Raw counts
plot_age_distribution_by_race(gr, 'Number of People')

# Plot normalized
plot_age_distribution_by_race(gr, 'Fraction of People', normalize=True)

# Plot normalized 18 and up (ie. eligible voters)
plot_age_distribution_by_race(gr, 'Fraction of People', normalize=True, min_age=18)


I had never realized this before, but white people are, on average, the oldest racial group.  So the idea that white people tend to vote more conservatively might be explained, at least in part, by the fact that they are older.

## Plot Electoral College Representation by Race and Age

In [None]:
# Get a weighted average of age by dividing age * population by the population of whatever bin you're interested in
population['age_x_population'] = population.age * population.population

# Get population by state
gr_by_state = population.groupby('state').sum()
gr_by_state['ave_age'] = gr_by_state.age_x_population / gr_by_state.population
gr_by_state.head()

In [None]:
# Population per state for rows whose race is 'white', as a single 'white_population' column.
# Only 'population' is summed so the text columns are not aggregated along with it.
# Note: this rebinds `white`, which earlier held the PCT012A001 download.
white = (population[population['race'] == 'white']
         .groupby('state')[['population']]
         .sum()
         .rename(columns={'population': 'white_population'}))

In [None]:
# Add the white population to the per-state totals, matching rows on the state index of both frames
gr_by_state = pd.merge(gr_by_state, white[['white_population']],
                       left_index=True, right_index=True, how='outer')

In [None]:
# Share of each state's population that is white, as a fraction between 0 and 1
gr_by_state['percent_white'] = gr_by_state['white_population'].div(gr_by_state['population'])

In [None]:
# One point per state: white share of the population vs. population-weighted average age.
# Axis labels added so the figure can be read on its own, like the earlier scatter plot.
plt.scatter(gr_by_state['percent_white'], gr_by_state['ave_age'])
plt.xlabel('Fraction White')
plt.ylabel('Average Age')
plt.show()

In [None]:
# Preview the two state_pop columns that get merged into gr_by_state
state_pop.loc[:, ['state', 'ec_votes_per_voter_percent_of_average']].head()

In [None]:
# Bring in each state's relative electoral college representation, matching gr_by_state's
# state-name index against state_pop's 'state' column; states missing from either side are dropped
representation_by_state = state_pop[['state', 'ec_votes_per_voter_percent_of_average']]
gr_by_state = pd.merge(gr_by_state, representation_by_state,
                       left_index=True, right_on='state', how='inner')

In [None]:
# Fill in the data structure for plotting
data = []

plot_titles = {'population': 'US Population by State',
               'ec_votes_per_voter': 'Electoral College Vote Portion Per Voter',
               'ec_votes_per_voter_percent_of_average': 'Percent of National Average Representation Per Voter'}

# plot_measure = 'population'
# plot_measure = 'ec_votes_per_voter'
plot_measure = 'ec_votes_per_voter_percent_of_average'

# Each bubble is labelled with the last two characters of its state's Highcharts key.
# The label is looked up by state name instead of slicing the index label (`s[-2:]`):
# the index only holds hc-key strings while state_pop is still indexed by hc_key, and
# merging state_pop resets that index to integers, which cannot be sliced.
states_with_key = state_pop.dropna(subset=['hc_key'])
label_by_state = dict(zip(states_with_key['state'], states_with_key['hc_key'].str[-2:]))

bubble_columns = zip(gr_by_state['state'],
                     gr_by_state['percent_white'],
                     gr_by_state['ave_age'],
                     gr_by_state['ec_votes_per_voter_percent_of_average'])
for state_name, percent_white, ave_age, representation in bubble_columns:
    data.append({'x': percent_white,
                 'y': ave_age,
                 'z': representation,
                 # Fall back to the full state name if no hc-key is known for it
                 'name': label_by_state.get(state_name, state_name)})

In [None]:
# Highcharts options for the bubble chart (rebinds `options` from the map section)
options = {
    'chart': {'renderTo': 'container'},
    'title': {'text': 'Electoral College Representation By % White and Ave Age'},
    'colorAxis': {
        'type': 'logarithmic',
        # 'min': 5e5,
        'minColor': '#FFFFFF',  # white
        'maxColor': '#BC0AD1',  # dark magenta (alternative: '#699C02', dark green)
    },
}

In [None]:
# Bubble chart: x = share white, y = average age, z (bubble size) = relative representation
hover_style = {'hover': {'color': '#FFC300'}}
point_labels = {'enabled': True, 'format': '{point.name}'}

H = Highchart(width=650, height=500)
H.set_dict_options(options)  # set options
# NOTE(review): the third argument is the literal string 'name' - confirm that is the intended series name
H.add_data_set(data, 'bubble', 'name', states=hover_style, dataLabels=point_labels)

# Last expression of the cell, so the notebook displays the chart
H

In [None]:
# TODO: Redo the proportions and counts above using only people aged 18 and over (eligible voters)