## Combine data from different sources to form our dataset
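Our dataset consists of several features collected at the county level. We are trying to predict which party won in each county from these features. The columns of the dataset are listed below; the last one, the party that won, is the target we want to predict rather than an input feature: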

* Per capita income
* Population density
* Percentage of population without access to the Internet
* Percentage of urban population without access to the Internet
* Percentage of rural population without access to the Internet
* Urban population density
* Rural population density
* Is there greater than 50% Internet access?
* Median download speed
* Median upload speed
* Upper quartile download speed
* Upper quartile upload speed
* Lower quartile download speed
* Lower quartile upload speed
* Party that won

We wish to build a predictor based on these features.

In [10]:
"""Class to hold information of each county in memory. Will write to a file later."""
class County(object):
    """In-memory record of the features gathered for one county.

    Every field except ``county_id`` is optional so that a record can be
    created from one data source and completed later from the others.
    """

    def __init__(self, county_id, per_capita_income=None, population_density=None, pop_without_a2i=None,
                 urban_pop_without_a2i=None, rural_pop_without_a2i=None, urban_pop_density=None,
                 rural_pop_density=None, gt_50_internet=0, median_dl_speed=None, median_ul_speed=None,
                 upper_quartile_dl=None, upper_quartile_ul=None, lower_quartile_dl=None,
                 lower_quartile_ul=None, party_won=None):
        """
        Store every parameter as an attribute of the same name. All features
        default to None, except gt_50_internet which defaults to 0.
        """
        # The order below is the order in which the attributes show up in
        # repr()/str(), so it mirrors the parameter list.
        initial_values = (
            ('county_id', county_id),
            ('per_capita_income', per_capita_income),
            ('population_density', population_density),
            ('pop_without_a2i', pop_without_a2i),
            ('urban_pop_without_a2i', urban_pop_without_a2i),
            ('rural_pop_without_a2i', rural_pop_without_a2i),
            ('urban_pop_density', urban_pop_density),
            ('rural_pop_density', rural_pop_density),
            ('gt_50_internet', gt_50_internet),
            ('median_dl_speed', median_dl_speed),
            ('median_ul_speed', median_ul_speed),
            ('upper_quartile_dl', upper_quartile_dl),
            ('upper_quartile_ul', upper_quartile_ul),
            ('lower_quartile_dl', lower_quartile_dl),
            ('lower_quartile_ul', lower_quartile_ul),
            ('party_won', party_won),
        )
        for attribute, value in initial_values:
            setattr(self, attribute, value)

    def __repr__(self):
        """Return the (attribute, value) pairs of the record as text."""
        return str(vars(self).items())

    # str() and repr() produce the same text.
    __str__ = __repr__

    def toCsv(self):
        """Return the record as one comma-separated line (no trailing newline).

        The urban/rural "without access" percentages are not part of the
        line; all other attributes are emitted in a fixed order.
        """
        columns = (
            self.county_id,
            self.per_capita_income,
            self.population_density,
            self.pop_without_a2i,
            self.urban_pop_density,
            self.rural_pop_density,
            self.gt_50_internet,
            self.median_dl_speed,
            self.median_ul_speed,
            self.upper_quartile_dl,
            self.upper_quartile_ul,
            self.lower_quartile_dl,
            self.lower_quartile_ul,
            self.party_won,
        )
        return ','.join(f'{value}' for value in columns)


# All County records, keyed by county_id; filled in by the cells that follow.
counties = dict()

In [11]:
import csv
import pandas as pd

from collections import namedtuple

# One parsed county row of the FCC population / Internet-access table.
DemographicsData = namedtuple('DemographicsData', ['state', 'county', 'friendly_key', 'pop_without_access',
                                                   'percent_total_pop', 'pop_density', 'per_capita_income',
                                                   'urban_pop_without_access', 'percent_urban_pop', 'urban_pop_density',
                                                   'rural_pop_without_access', 'percent_rural_pop', 'rural_pop_density'])


def _parse_number(text, cast=float, strip_chars=''):
    """Convert a formatted CSV cell such as '$12,345' or '45%' to a number.

    `strip_chars` are removed from both ends of `text`, thousands separators
    are dropped and the remainder is converted with `cast` (int or float).
    A cell that cannot be converted yields 0, so one bad value never aborts
    the whole load.
    """
    try:
        return cast(text.strip(strip_chars).replace(',', ''))
    except ValueError:
        return 0


county_demographics = []

# 'utf-8-sig' consumes the byte-order mark at the start of the file, so the
# first header is read as plain 'county' instead of '\ufeffcounty'. It also
# works when the file has no BOM. newline='' is what the csv module expects.
with open('../Data/FCC-2016-Population-Internet.csv', encoding='utf-8-sig', newline='') as freader:
    state = None
    for row in csv.DictReader(freader):
        raw_name = row['county']

        # Rows without the word 'County' are treated as state headings that
        # apply to the county rows after them.
        # NOTE(review): names such as parishes, boroughs or independent cities
        # would also land in this branch - confirm against the data file.
        if 'County' not in raw_name:
            state = raw_name.replace(' ', '_').lower()
            continue

        if state is None:
            raise ValueError('county row %r appears before any state row' % raw_name)

        county = raw_name.replace(' ', '_').lower()
        friendly_key = f'{state}_{county}'

        county_demographics.append(DemographicsData(
            state=state,
            county=county,
            friendly_key=friendly_key,
            pop_without_access=_parse_number(row['pop_without_access'], int),
            percent_total_pop=_parse_number(row['percent_total_pop'], strip_chars='%'),
            pop_density=_parse_number(row['pop_density']),
            per_capita_income=_parse_number(row['per_capita_income'], strip_chars='$'),
            urban_pop_without_access=_parse_number(row['urban_pop_without_access'], int),
            percent_urban_pop=_parse_number(row['percent_urban_pop'], strip_chars='%'),
            urban_pop_density=_parse_number(row['urban_pop_density']),
            # Parsed as float (not int) to keep the column type unchanged.
            rural_pop_without_access=_parse_number(row['rural_pop_without_access']),
            percent_rural_pop=_parse_number(row['percent_rural_pop'], strip_chars='%'),
            rural_pop_density=_parse_number(row['rural_pop_density']),
        ))

county_demographics_df = pd.DataFrame(county_demographics, columns=DemographicsData._fields)

In [12]:
"""Populate county information in the dictionary"""
# Create one County per demographics row, keyed by its friendly key. The
# three "percent_*" columns are stored as the County's "without access to the
# Internet" percentages.
for _, demographics in county_demographics_df.iterrows():
    counties[demographics.friendly_key] = County(
        county_id=demographics.friendly_key,
        per_capita_income=demographics.per_capita_income,
        population_density=demographics.pop_density,
        pop_without_a2i=demographics.percent_total_pop,
        urban_pop_without_a2i=demographics.percent_urban_pop,
        rural_pop_without_a2i=demographics.percent_rural_pop,
        urban_pop_density=demographics.urban_pop_density,
        rural_pop_density=demographics.rural_pop_density,
    )

print("Number of counties:", len(counties))

Number of counties: 3007


In [13]:
"""Create mapping between FIPS code and friendly name and reverse mapping"""
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv with index_col=0 keeps the first column as the index, as from_csv
# did. from_csv also tried to parse that index as dates; the index of this
# frame is never read, so that option is not carried over.
internet_availability_df = pd.read_csv('../Data/StateCountyInternetAvailability.csv', index_col=0)

# Lookup in both directions between a county's id and its friendly key.
# If an id or key occurs more than once, the last row wins.
fips2name = dict(zip(internet_availability_df['countyId'], internet_availability_df['friendlyKey']))
name2fips = dict(zip(internet_availability_df['friendlyKey'], internet_availability_df['countyId']))

print("Number of counties:", len(fips2name))

Number of counties: 3234


In [14]:
"""Populate internet availability in each county"""
# Copy the availability flag onto every county we already have a record for;
# rows for unknown counties are ignored.
for _, availability in internet_availability_df.iterrows():
    record = counties.get(availability.friendlyKey)
    if record is not None:
        record.gt_50_internet = int(bool(availability.gt50Available))

In [15]:
"""Calculate the average internet speed across different categories"""
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv with index_col=0 keeps the first column as the index, as from_csv did.
internet_speed_df = pd.read_csv('../Data/CountyInternetSpeed.csv', index_col=0)

class CountyInternetSpeed(object):
    """Running totals of the speed statistics reported for one county.

    Each add() call folds in the figures of one category; average() returns
    the plain (unweighted) mean of every statistic over the categories added.
    """

    def __init__(self):
        # Number of add() calls and the summed test count.
        self.num_categories = 0
        self.num_tests = 0
        # Sums of the per-category statistics.
        self.median_dl_speed = 0
        self.median_ul_speed = 0
        self.upper_quartile_dl_speed = 0
        self.upper_quartile_ul_speed = 0
        self.lower_quartile_dl_speed = 0
        self.lower_quartile_ul_speed = 0

    def add(self, num_tests, category, median_dl_speed, median_ul_speed, upper_quartile_dl, upper_quartile_ul,
            lower_quartile_dl, lower_quartile_ul):
        """Accumulate the statistics of one more category.

        `category` is accepted but not used by the computation.
        """
        increments = (
            ('num_tests', num_tests),
            ('median_dl_speed', median_dl_speed),
            ('median_ul_speed', median_ul_speed),
            ('upper_quartile_dl_speed', upper_quartile_dl),
            ('upper_quartile_ul_speed', upper_quartile_ul),
            ('lower_quartile_dl_speed', lower_quartile_dl),
            ('lower_quartile_ul_speed', lower_quartile_ul),
        )
        for attribute, amount in increments:
            setattr(self, attribute, getattr(self, attribute) + amount)
        self.num_categories += 1

    def average(self):
        """Return the per-category means as a 6-tuple.

        Order: median download, median upload, upper-quartile download,
        upper-quartile upload, lower-quartile download, lower-quartile upload.
        Raises ZeroDivisionError if add() has never been called.
        """
        count = self.num_categories
        totals = (
            self.median_dl_speed,
            self.median_ul_speed,
            self.upper_quartile_dl_speed,
            self.upper_quartile_ul_speed,
            self.lower_quartile_dl_speed,
            self.lower_quartile_ul_speed,
        )
        return tuple(total / count for total in totals)

# Group the speed rows by county: one CountyInternetSpeed accumulator per
# friendly key, fed with every row reported for that county.
countyInternetSpeeds = {}
for _, measurement in internet_speed_df.iterrows():
    speed_totals = countyInternetSpeeds.get(measurement.friendlyKey)
    if speed_totals is None:
        speed_totals = countyInternetSpeeds[measurement.friendlyKey] = CountyInternetSpeed()
    speed_totals.add(
        num_tests=measurement.numTests,
        category=measurement.category,
        median_dl_speed=measurement.medianDownload,
        median_ul_speed=measurement.medianUpload,
        upper_quartile_dl=measurement.upperQuartileDownload,
        upper_quartile_ul=measurement.upperQuartileUpload,
        lower_quartile_dl=measurement.lowerQuartileDownload,
        lower_quartile_ul=measurement.lowerQuartileUpload,
    )

# Store the averaged figures on the matching County records; counties that
# only appear in the speed data are skipped.
for county_id, speed_totals in countyInternetSpeeds.items():
    (median_dl, median_ul,
     upper_dl, upper_ul,
     lower_dl, lower_ul) = speed_totals.average()
    if county_id not in counties:
        continue
    record = counties[county_id]
    record.median_dl_speed = median_dl
    record.median_ul_speed = median_ul
    record.upper_quartile_dl = upper_dl
    record.upper_quartile_ul = upper_ul
    record.lower_quartile_dl = lower_dl
    record.lower_quartile_ul = lower_ul

In [19]:
"""Party that won"""
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv with index_col=0 keeps the first column as the index; the loop
# below uses that index as the county id. from_csv's date parsing of the
# index is not carried over, since the ids are numeric codes rather than dates.
us_voter_county_df = pd.read_csv('../Data/US_County_Level_Presidential_Results_08-16.csv', index_col=0)

# Label each county by the column with the most 2016 votes:
#   0 = dem, 1 = gop, 2 = oth.
# Ties fall to the lower code (dem over gop; oth must strictly beat both).
for _id, county in us_voter_county_df.iterrows():
    key = fips2name[_id]
    dem = county.dem_2016
    gop = county.gop_2016
    oth = county.oth_2016
    if oth > dem and oth > gop:
        party = 2
    elif gop > dem:
        party = 1
    else:
        party = 0
    # Only counties that already have a record are labelled.
    if key in counties:
        counties[key].party_won = party

"""Republican party percentage"""
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv with index_col=0 keeps the first column as the index; the loop
# below uses that index as the county id.
us_voter_county_df = pd.read_csv('../Data/US_County_Level_Presidential_Results_08-16.csv', index_col=0)

# Store the gop share of the 2016 vote (a fraction between 0 and 1) in
# party_won, replacing whatever value the record held before.
for _id, county in us_voter_county_df.iterrows():
    key = fips2name[_id]
    dem = county.dem_2016
    gop = county.gop_2016
    oth = county.oth_2016
    percentage = gop / (dem + gop + oth)
    if key in counties:
        counties[key].party_won = percentage

In [20]:
# Spot-check one merged record: the county with id 54001.
sample_county_key = fips2name[54001]
print(counties[sample_county_key])

dict_items([('county_id', 'west_virginia_barbour_county'), ('per_capita_income', 17909.0), ('population_density', 49.071), ('pop_without_a2i', 70.0), ('urban_pop_without_a2i', 50.0), ('rural_pop_without_a2i', 73.0), ('urban_pop_density', 1591.63), ('rural_pop_density', 41.429), ('gt_50_internet', 1), ('median_dl_speed', 0.9104699999999999), ('median_ul_speed', 0.893335), ('upper_quartile_dl', 1.2595675), ('upper_quartile_ul', 1.158595), ('lower_quartile_dl', 0.5719249999999999), ('lower_quartile_ul', 0.4939425), ('party_won', 0.7485059760956175)])


In [21]:
# Header line; the column order matches what County.toCsv() emits.
dataset_header = ('county,per_capita_income,population_density,pop_without_a2i,'
                  'urban_pop_density,rural_pop_density,gt_50_internet,median_dl_speed,'
                  'median_ul_speed,upper_quartile_dl,upper_quartile_ul,lower_quartile_dl,lower_quartile_ul,party_won\n')

# Write the header followed by one line per county record.
with open('../Data/Dataset2016-RepublicanPercentage.csv', 'w') as fwriter:
    fwriter.write(dataset_header)
    fwriter.writelines(record.toCsv() + '\n' for record in counties.values())

In [None]:
"""Republican party percentage"""
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv with index_col=0 keeps the first column as the index; the loop
# below uses that index as the county id.
us_voter_county_df = pd.read_csv('../Data/US_County_Level_Presidential_Results_08-16.csv', index_col=0)

# Store the gop share of the 2016 vote (a fraction between 0 and 1) in
# party_won for every county that has a record.
for _id, county in us_voter_county_df.iterrows():
    key = fips2name[_id]
    dem = county.dem_2016
    gop = county.gop_2016
    oth = county.oth_2016
    percentage = gop / (dem + gop + oth)
    if key in counties:
        # Assign the share computed above. The previous code assigned
        # `party`, a stale variable left over from another cell, so every
        # county received the same unrelated value.
        counties[key].party_won = percentage