In [1]:
# Data Management/Investigation
import pandas as pd
pd.set_option('max_columns', None)
pd.set_option('max_rows', None)
import numpy as np
import missingno as miss
from plotnine import *
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import sqlite3 as sql

Contents of this script:

-process the demographic data

-merge that data into the larger data frame

-export the final cleaned data as a new, separate .csv

### Cleaning the Demographic Data!

Notes:

-This data set is the largest of the three I am using, and it's not even close. Unaltered, it has almost three quarters of a million rows, and processing it takes time. For this project, I am investigating factors which predict a democratic or republican win for a county in the 2016 presidential election. To make this data set workable, I am going to drop all the non-2016 data (as much as I'd love to see the effects of different demographic makeups over time, I just don't have the processing power to make that happen right now).

-I am also going to drop the data for several age categories, specifically the groups where some or all of the included ages are less than 18. Minors cannot vote, and since my overall question pertains to voting, I do not think it makes sense to keep those rows in the data frame. (Plus, anything which helps trim this behemoth down seems like a good plan to me.)

-Additionally, I am dropping a bunch of columns for different racial categories. The codebook for this data set ("Race_Sex_Age_2010-2019_Codebook.pdf" in the folder "Codebooks") lists each of these columns. For the most part, they reference different combinations of races (i.e. the number of people in an age group-county-year who are white AND hispanic, rather than just white or just hispanic).

In [56]:
#Connect to the database
conn = sql.connect('aja149_ppol564__finalproject_sql_database.db')

In [57]:
pop_data = pd.read_sql_query("""SELECT *
                                FROM demographics;""",conn)

In [58]:
#Take the data from the first row and use it for the column names

new_header = pop_data.iloc[0] #grab the first row for the header
pop_data = pop_data[1:] #take the data less the header row
pop_data.columns = new_header #set the header row as the df header

In [59]:
pop_data.columns

Index(['SUMLEV', 'STATE', 'COUNTY', 'STNAME', 'CTYNAME', 'YEAR', 'AGEGRP',
       'TOT_POP', 'TOT_MALE', 'TOT_FEMALE', 'WA_MALE', 'WA_FEMALE', 'BA_MALE',
       'BA_FEMALE', 'IA_MALE', 'IA_FEMALE', 'AA_MALE', 'AA_FEMALE', 'NA_MALE',
       'NA_FEMALE', 'TOM_MALE', 'TOM_FEMALE', 'WAC_MALE', 'WAC_FEMALE',
       'BAC_MALE', 'BAC_FEMALE', 'IAC_MALE', 'IAC_FEMALE', 'AAC_MALE',
       'AAC_FEMALE', 'NAC_MALE', 'NAC_FEMALE', 'NH_MALE', 'NH_FEMALE',
       'NHWA_MALE', 'NHWA_FEMALE', 'NHBA_MALE', 'NHBA_FEMALE', 'NHIA_MALE',
       'NHIA_FEMALE', 'NHAA_MALE', 'NHAA_FEMALE', 'NHNA_MALE', 'NHNA_FEMALE',
       'NHTOM_MALE', 'NHTOM_FEMALE', 'NHWAC_MALE', 'NHWAC_FEMALE',
       'NHBAC_MALE', 'NHBAC_FEMALE', 'NHIAC_MALE', 'NHIAC_FEMALE',
       'NHAAC_MALE', 'NHAAC_FEMALE', 'NHNAC_MALE', 'NHNAC_FEMALE', 'H_MALE',
       'H_FEMALE', 'HWA_MALE', 'HWA_FEMALE', 'HBA_MALE', 'HBA_FEMALE',
       'HIA_MALE', 'HIA_FEMALE', 'HAA_MALE', 'HAA_FEMALE', 'HNA_MALE',
       'HNA_FEMALE', 'HTOM_MALE', 'HTOM_FEMALE

In [60]:
#These three lines will drop all the unneeded racial categories (there are almost 60)
pop_data=pop_data.drop(pop_data.filter(like='C_',axis=1).columns,axis=1)
pop_data=pop_data.drop(pop_data.filter(like='NH',axis=1).columns,axis=1)
pop_data=pop_data.drop(columns = ['HWA_MALE','HWA_FEMALE','HBA_MALE','HBA_FEMALE','HIA_MALE','HIA_FEMALE',
                                   'HAA_MALE','HAA_FEMALE','HNA_MALE','HNA_FEMALE','HTOM_MALE','HTOM_FEMALE'])

In [65]:
#Also, create a new column containing the FIPS for each county, to provide a common unit to make it easier to merge this
#data with the other data
pop_data['FIPS'] = pop_data['STATE']+pop_data['COUNTY']

In [66]:
#Keep only the rows where the codebook specifies the year is 2016
pop_data = pop_data[pop_data.YEAR.isin(['9'])]

#Drop the rows where age is less than 18 (including the group which spans 15-19 year olds)
pop_data = pop_data[~pop_data.AGEGRP.isin(['1','2','3','4'])]

In [67]:
#Convert the number data to numeric
cols_num = pop_data.drop(columns = ['SUMLEV', 'STATE', 'COUNTY','STNAME','CTYNAME','YEAR','AGEGRP','FIPS'])
cols_num = cols_num.apply(lambda x: x.str.replace(',', '').astype(float), axis=1)
cols_num = cols_num.apply(pd.to_numeric)

#Create a second slice with just the string columns
cols_str = pop_data[['SUMLEV', 'STATE', 'COUNTY','STNAME','CTYNAME','YEAR','AGEGRP']]
#Concatenate the two slices, first string, then num
pop_data = pd.concat([cols_str, cols_num], axis = 1)

AttributeError: Can only use .str accessor with string values!

In [69]:
cols_num.head()

Unnamed: 0,TOT_POP,TOT_MALE,TOT_FEMALE,WA_MALE,WA_FEMALE,BA_MALE,BA_FEMALE,IA_MALE,IA_FEMALE,AA_MALE,AA_FEMALE,NA_MALE,NA_FEMALE,TOM_MALE,TOM_FEMALE,H_MALE,H_FEMALE
153,55243.0,26970.0,28273.0,20979.0,21700.0,5040.0,5562.0,119.0,139.0,287.0,359.0,29.0,22.0,516.0,491.0,783.0,715.0
158,3361.0,1718.0,1643.0,1237.0,1194.0,397.0,388.0,7.0,9.0,34.0,19.0,3.0,1.0,40.0,32.0,55.0,36.0
159,3555.0,1780.0,1775.0,1329.0,1335.0,383.0,394.0,8.0,8.0,23.0,15.0,2.0,3.0,35.0,20.0,74.0,45.0
160,3464.0,1708.0,1756.0,1304.0,1328.0,352.0,364.0,7.0,8.0,20.0,33.0,4.0,4.0,21.0,19.0,91.0,72.0
161,3555.0,1696.0,1859.0,1311.0,1343.0,329.0,439.0,5.0,7.0,25.0,48.0,3.0,0.0,23.0,22.0,53.0,54.0


In [75]:
def youth_sum():
    '''Converts the data from age bands of 5 years to 3 categories roughly corresponding to 18-45, 45-65, and 65+.'''
    
    new_list = []
    
    age_df = pop_data.drop(columns =['SUMLEV','STATE','COUNTY','STNAME','CTYNAME','YEAR'])
    
    for ii in age_df.FIPS.unique():
        youth = age_df[age_df.AGEGRP.isin(['5','6','7','8','9'])]
        #midage = age_df[age_df.AGEGRP.isin(['10','11','12','13'])]
        #old = age_df[age_df.AGEGRP.isin(['14','15','16','17','18'])]
                                              
        youth_x = youth.sum()
        #midage_x = midage.sum()
        #old_x = old.sum()
        
        new_list.append(youth_x)
        #new_list.append(midage_x)
        #new_list.append(old_x)
        
    new_list = pd.DataFrame(new_list)
    
    return(new_list)

In [None]:
youth_sum()

In [70]:
pop_data.head(15)

Unnamed: 0,SUMLEV,STATE,COUNTY,STNAME,CTYNAME,YEAR,AGEGRP,TOT_POP,TOT_MALE,TOT_FEMALE,WA_MALE,WA_FEMALE,BA_MALE,BA_FEMALE,IA_MALE,IA_FEMALE,AA_MALE,AA_FEMALE,NA_MALE,NA_FEMALE,TOM_MALE,TOM_FEMALE,H_MALE,H_FEMALE,FIPS
153,50,1,1,Alabama,Autauga County,9,0,55243.0,26970.0,28273.0,20979.0,21700.0,5040.0,5562.0,119.0,139.0,287.0,359.0,29.0,22.0,516.0,491.0,783.0,715.0,1001
158,50,1,1,Alabama,Autauga County,9,5,3361.0,1718.0,1643.0,1237.0,1194.0,397.0,388.0,7.0,9.0,34.0,19.0,3.0,1.0,40.0,32.0,55.0,36.0,1001
159,50,1,1,Alabama,Autauga County,9,6,3555.0,1780.0,1775.0,1329.0,1335.0,383.0,394.0,8.0,8.0,23.0,15.0,2.0,3.0,35.0,20.0,74.0,45.0,1001
160,50,1,1,Alabama,Autauga County,9,7,3464.0,1708.0,1756.0,1304.0,1328.0,352.0,364.0,7.0,8.0,20.0,33.0,4.0,4.0,21.0,19.0,91.0,72.0,1001
161,50,1,1,Alabama,Autauga County,9,8,3555.0,1696.0,1859.0,1311.0,1343.0,329.0,439.0,5.0,7.0,25.0,48.0,3.0,0.0,23.0,22.0,53.0,54.0,1001
162,50,1,1,Alabama,Autauga County,9,9,3813.0,1842.0,1971.0,1407.0,1450.0,371.0,452.0,7.0,10.0,31.0,33.0,1.0,1.0,25.0,25.0,50.0,51.0,1001
163,50,1,1,Alabama,Autauga County,9,10,3949.0,1921.0,2028.0,1559.0,1568.0,312.0,395.0,10.0,11.0,15.0,36.0,1.0,3.0,24.0,15.0,57.0,35.0,1001
164,50,1,1,Alabama,Autauga County,9,11,3988.0,1947.0,2041.0,1576.0,1615.0,323.0,374.0,17.0,8.0,17.0,25.0,3.0,0.0,11.0,19.0,22.0,62.0,1001
165,50,1,1,Alabama,Autauga County,9,12,3775.0,1831.0,1944.0,1495.0,1538.0,299.0,370.0,10.0,11.0,12.0,17.0,0.0,0.0,15.0,8.0,31.0,21.0,1001
166,50,1,1,Alabama,Autauga County,9,13,3102.0,1525.0,1577.0,1208.0,1243.0,285.0,296.0,10.0,9.0,10.0,11.0,1.0,1.0,11.0,17.0,20.0,20.0,1001
