In [1]:
# Data Management/Investigation
import pandas as pd
# Show every column and row when a frame is displayed. The fully qualified
# "display." keys are used because the short forms ('max_columns', 'max_rows')
# are pattern-matched and become ambiguous in newer pandas releases, which also
# define styler.render.max_columns / styler.render.max_rows (OptionError).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np
import missingno as miss
from plotnine import *
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import sqlite3 as sql

Contents of this script:

-process the demographic data

-merge that data into the larger data frame

-export the final cleaned data as a new, separate .csv

### Cleaning the Demographic Data!

Notes:

-This data set is by far the largest of the three I am using. Unaltered, it has almost three-quarters of a million rows, and processing it takes time. For this project, I am investigating factors which predict a Democratic or Republican win for a county in the 2016 presidential election. To make this data set workable, I am going to drop all the non-2016 data (as much as I'd love to see the effects of different demographic makeups over time, I just don't have the processing power to make that happen right now).

-I am also going to drop the data for several age categories, specifically the groups where some or all of the included ages are less than 18. Minors cannot vote, and since my overall question pertains to voting, I do not think it makes sense to keep those rows in the data frame. (Plus, anything which helps trim this behemoth down seems like a good plan to me.)

-Additionally, I am dropping a bunch of columns for different racial categories. The codebook for this data set ("Race_Sex_Age_2010-2019_Codebook.pdf" in the folder "Codebooks") lists each of these columns. For the most part, they reference different combinations of races (e.g. the number of people in an age group-county-year who are white AND Hispanic, rather than just white or just Hispanic).

-Finally, I am not going to incorporate age as a variable in this analysis. Before I added this part to this initial notes section, I tried multiple times to write functions which would rearrange the data based on age category, and each time the function would run for upwards of 10 to 15 minutes and still not be finished. As much as I would love to use age in this analysis, I do not have the ability to properly process it for this project. 

In [2]:
# Connect to the project's SQLite database. The path is relative to the
# notebook's working directory; the connection is closed in the final cell.
conn = sql.connect('aja149_sql_database.db')

In [3]:
#con = sql.connect('aja149_sql_database.db')
#cursor = con.cursor()
#cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
#print(cursor.fetchall())

[('alaska_potus',), ('county_pres_2000-2016',), ('Unemployment',), ('demographics',)]


In [15]:
# There are almost 80 columns in this data set, so import the whole
# `demographics` table and drop the unwanted columns afterwards.
pop_data = pd.read_sql_query("""SELECT *
                                FROM demographics;""",conn)

In [16]:
# The table was stored with its real column names as the first record, so
# promote that record to the header and keep only the rows after it.
new_header = pop_data.iloc[0]    # first record holds the column labels
pop_data = pop_data.iloc[1:]     # everything after the label record
pop_data.columns = new_header    # install the labels as the header

In [21]:
# List the current column labels. The saved output was produced after the
# column-drop and FIPS cells further down had run (execution count 21), which
# is why it already shows the trimmed set plus FIPS.
pop_data.columns

Index(['SUMLEV', 'STATE', 'COUNTY', 'STNAME', 'CTYNAME', 'YEAR', 'AGEGRP',
       'TOT_POP', 'TOT_MALE', 'TOT_FEMALE', 'WA_MALE', 'WA_FEMALE', 'BA_MALE',
       'BA_FEMALE', 'IA_MALE', 'IA_FEMALE', 'AA_MALE', 'AA_FEMALE', 'NA_MALE',
       'NA_FEMALE', 'TOM_MALE', 'TOM_FEMALE', 'H_MALE', 'H_FEMALE', 'FIPS'],
      dtype='object', name=0)

In [18]:
# Drop the race-combination columns that are not needed (almost 60 in total).
# Step 1: every column whose name contains 'C_'.
combo_cols = pop_data.filter(like='C_', axis=1).columns
pop_data = pop_data.drop(combo_cols, axis=1)

# Step 2: every remaining column whose name contains 'NH'.
nh_cols = pop_data.filter(like='NH', axis=1).columns
pop_data = pop_data.drop(nh_cols, axis=1)

# Step 3: the twelve H<race>_<sex> columns, built from their race and sex parts.
h_race_cols = [
    'H' + race + '_' + sex
    for race in ('WA', 'BA', 'IA', 'AA', 'NA', 'TOM')
    for sex in ('MALE', 'FEMALE')
]
pop_data = pop_data.drop(columns=h_race_cols)

In [20]:
# Create a FIPS column for each county (state code followed by county code) to
# provide a common key for merging this data with the other data sets.
# NOTE(review): `+` only concatenates when STATE and COUNTY are zero-padded
# strings; if either were numeric it would add them instead — confirm dtypes.
pop_data['FIPS'] = pop_data['STATE']+pop_data['COUNTY']

In [22]:
# Keep only the rows whose YEAR code is '9', which the codebook maps to 2016.
is_2016 = pop_data['YEAR'].isin(['9'])
pop_data = pop_data[is_2016]

# Remove age-group codes '0' through '4': the groups containing anyone under 18
# (including the 15-19 bracket). Code '0' is presumably the all-ages total row —
# confirm against the codebook.
excluded_groups = ['0', '1', '2', '3', '4']
is_excluded = pop_data['AGEGRP'].isin(excluded_groups)
pop_data = pop_data[~is_excluded]

In [25]:
# Collapse the remaining age-group codes into three buckets. The buckets are
# applied in order; none of the new labels appears in a later code list, so a
# recoded row is never recoded again.
age_buckets = [
    ("1", ['5', '6', '7', '8', '9']),          # Young
    ("2", ['10', '11', '12', '13']),           # Mid-age
    ("3", ['14', '15', '16', '17', '18']),     # Old
]
for bucket_label, source_codes in age_buckets:
    in_bucket = pop_data['AGEGRP'].isin(source_codes)
    pop_data['AGEGRP'] = np.where(in_bucket, bucket_label, pop_data['AGEGRP'])

In [26]:
# Convert the count columns from text to floats.
# Everything except the identifier/label columns is treated as a count column.
cols_num = pop_data.drop(columns = ['SUMLEV', 'STATE', 'COUNTY','STNAME','CTYNAME','YEAR','AGEGRP','FIPS'])
# Strip thousands separators and cast one whole column at a time. Applying the
# lambda row by row (axis=1) gives the same values but runs the string cleanup
# once per row instead of once per column, which is far slower on a frame this
# size. astype(float) already yields numeric data, so no pd.to_numeric pass
# is needed afterwards.
cols_num = cols_num.apply(lambda col: col.str.replace(',', '', regex=False).astype(float))

# Create a second slice with just the desired string columns
cols_str = pop_data[['FIPS','AGEGRP']]
# Concatenate the two slices column-wise: string columns first, then numeric
pop_data = pd.concat([cols_str, cols_num], axis = 1)

In [52]:
# Preview the first ten rows: FIPS, the recoded AGEGRP, and the numeric counts.
pop_data.head(10)

Unnamed: 0,FIPS,AGEGRP,TOT_POP,TOT_MALE,TOT_FEMALE,WA_MALE,WA_FEMALE,BA_MALE,BA_FEMALE,IA_MALE,IA_FEMALE,AA_MALE,AA_FEMALE,NA_MALE,NA_FEMALE,TOM_MALE,TOM_FEMALE,H_MALE,H_FEMALE
158,1001,1,3361.0,1718.0,1643.0,1237.0,1194.0,397.0,388.0,7.0,9.0,34.0,19.0,3.0,1.0,40.0,32.0,55.0,36.0
159,1001,1,3555.0,1780.0,1775.0,1329.0,1335.0,383.0,394.0,8.0,8.0,23.0,15.0,2.0,3.0,35.0,20.0,74.0,45.0
160,1001,1,3464.0,1708.0,1756.0,1304.0,1328.0,352.0,364.0,7.0,8.0,20.0,33.0,4.0,4.0,21.0,19.0,91.0,72.0
161,1001,1,3555.0,1696.0,1859.0,1311.0,1343.0,329.0,439.0,5.0,7.0,25.0,48.0,3.0,0.0,23.0,22.0,53.0,54.0
162,1001,1,3813.0,1842.0,1971.0,1407.0,1450.0,371.0,452.0,7.0,10.0,31.0,33.0,1.0,1.0,25.0,25.0,50.0,51.0
163,1001,2,3949.0,1921.0,2028.0,1559.0,1568.0,312.0,395.0,10.0,11.0,15.0,36.0,1.0,3.0,24.0,15.0,57.0,35.0
164,1001,2,3988.0,1947.0,2041.0,1576.0,1615.0,323.0,374.0,17.0,8.0,17.0,25.0,3.0,0.0,11.0,19.0,22.0,62.0
165,1001,2,3775.0,1831.0,1944.0,1495.0,1538.0,299.0,370.0,10.0,11.0,12.0,17.0,0.0,0.0,15.0,8.0,31.0,21.0
166,1001,2,3102.0,1525.0,1577.0,1208.0,1243.0,285.0,296.0,10.0,9.0,10.0,11.0,1.0,1.0,11.0,17.0,20.0,20.0
167,1001,3,2649.0,1225.0,1424.0,1039.0,1157.0,169.0,232.0,4.0,13.0,5.0,10.0,0.0,1.0,8.0,11.0,7.0,17.0


In [55]:
def age_split(data=None):
    """Sum the count columns per county for each of the three age buckets.

    The input is expected to hold one row per county and original age group,
    with a 'FIPS' key, an 'AGEGRP' column already recoded to '1' (young),
    '2' (middle-aged) or '3' (elderly), and numeric count columns.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to aggregate. Defaults to the notebook-level ``pop_data`` so the
        existing ``age_split()`` call keeps working.

    Returns
    -------
    pandas.DataFrame
        One row per FIPS. Count columns for bucket '1' carry the suffix ``_x``,
        bucket '2' the suffix ``_y``, and bucket '3' no suffix (the naming that
        results from the two successive outer merges). A county missing from a
        bucket gets NaN in that bucket's columns. The input is not modified.
    """
    if data is None:
        data = pop_data

    def _county_totals(bucket):
        """Return per-FIPS sums of all count columns for one AGEGRP bucket."""
        subset = data[data['AGEGRP'] == bucket].drop(columns=['AGEGRP'])
        # groupby keeps FIPS as the real key, so it never has to be recovered
        # by truncating a concatenated string (which is only right when every
        # code is exactly five characters long). sort=False preserves
        # first-appearance order.
        totals = subset.groupby('FIPS', sort=False, as_index=False).sum()
        # Keep the original column order of the input frame.
        return totals[[col for col in subset.columns if col in totals.columns]]

    young = _county_totals('1')
    mid = _county_totals('2')
    old = _county_totals('3')

    # Outer merges keep counties that are absent from one of the buckets.
    demog = pd.merge(young, mid, how='outer', on='FIPS')
    demog = pd.merge(demog, old, how='outer', on='FIPS')

    # Return the completed data frame
    return demog

In [57]:
# Replace the per-age-group frame with the one-row-per-county result.
# NOTE: this overwrites pop_data with a frame that has no AGEGRP column, so the
# cell cannot be re-run without first re-running the cells above it.
pop_data = age_split()

In [61]:
# Preview the county-level frame. The saved output already shows proportions
# because this cell was executed (count 61) after the proportion cells below.
pop_data.head()

Unnamed: 0,FIPS,TOT_POP_x,TOT_MALE_x,TOT_FEMALE_x,WA_MALE_x,WA_FEMALE_x,BA_MALE_x,BA_FEMALE_x,IA_MALE_x,IA_FEMALE_x,AA_MALE_x,AA_FEMALE_x,NA_MALE_x,NA_FEMALE_x,TOM_MALE_x,TOM_FEMALE_x,H_MALE_x,H_FEMALE_x,TOT_POP_y,TOT_MALE_y,TOT_FEMALE_y,WA_MALE_y,WA_FEMALE_y,BA_MALE_y,BA_FEMALE_y,IA_MALE_y,IA_FEMALE_y,AA_MALE_y,AA_FEMALE_y,NA_MALE_y,NA_FEMALE_y,TOM_MALE_y,TOM_FEMALE_y,H_MALE_y,H_FEMALE_y,TOT_POP,TOT_MALE,TOT_FEMALE,WA_MALE,WA_FEMALE,BA_MALE,BA_FEMALE,IA_MALE,IA_FEMALE,AA_MALE,AA_FEMALE,NA_MALE,NA_FEMALE,TOM_MALE,TOM_FEMALE,H_MALE,H_FEMALE
0,1001,17748.0,0.492675,0.507325,0.371197,0.37469,0.103223,0.114773,0.001916,0.002366,0.007494,0.008339,0.000732,0.000507,0.008114,0.006649,0.018199,0.014537,14814.0,0.487647,0.512353,0.394087,0.402592,0.082287,0.096868,0.003173,0.002633,0.003645,0.006008,0.000338,0.00027,0.004118,0.003983,0.008775,0.009316,8093.0,0.434573,0.565427,0.37588,0.47226,0.052762,0.080193,0.001483,0.004448,0.00173,0.004819,0.000124,0.000124,0.002595,0.003583,0.003707,0.007414
1,1003,59850.0,0.493968,0.506032,0.419315,0.430175,0.05726,0.055923,0.004294,0.003776,0.005714,0.008287,0.000585,0.000301,0.0068,0.007569,0.031997,0.026717,57466.0,0.477082,0.522918,0.427209,0.46417,0.037779,0.044705,0.004403,0.003846,0.002993,0.005046,0.000209,0.000313,0.00449,0.004838,0.014757,0.013312,40494.0,0.4656,0.5344,0.435521,0.496963,0.022053,0.028745,0.003037,0.003013,0.001506,0.002272,4.9e-05,0.000148,0.003433,0.00326,0.005581,0.006124
2,1005,8437.0,0.604243,0.395757,0.267986,0.172455,0.319663,0.214531,0.004741,0.002963,0.002608,0.002133,0.002371,0.00083,0.006874,0.002845,0.036624,0.018846,6727.0,0.521778,0.478222,0.281849,0.252564,0.229077,0.218076,0.00327,0.002081,0.002081,0.002378,0.000446,0.000446,0.005054,0.002676,0.01219,0.009663,4674.0,0.434318,0.565682,0.292683,0.346812,0.136072,0.211168,0.001712,0.001712,0.001498,0.002781,0.000214,0.000214,0.002139,0.002995,0.004707,0.003851
3,1007,7777.0,0.584416,0.415584,0.37238,0.329947,0.203935,0.079979,0.001414,0.001929,0.001157,0.000386,0.002443,0.000257,0.003086,0.003086,0.022502,0.013244,6167.0,0.526026,0.473974,0.406681,0.381871,0.113507,0.083023,0.001459,0.002108,0.000486,0.002919,0.000162,0.0,0.00373,0.004054,0.007459,0.005838,3517.0,0.441854,0.558146,0.379016,0.478533,0.055161,0.07478,0.003412,0.000284,0.000853,0.000284,0.0,0.0,0.003412,0.004265,0.005971,0.006824
4,1009,17150.0,0.501808,0.498192,0.482624,0.479883,0.010379,0.006531,0.002915,0.004082,0.001108,0.001633,0.000933,0.000583,0.003848,0.005481,0.065364,0.051778,15493.0,0.49971,0.50029,0.482992,0.484219,0.007358,0.006648,0.002646,0.002324,0.001226,0.001485,0.000581,0.000258,0.004905,0.005357,0.027174,0.021623,10182.0,0.447456,0.552544,0.433707,0.537419,0.005402,0.00717,0.004027,0.002652,0.000491,0.001277,0.000196,0.000295,0.003634,0.003732,0.010803,0.008839


In [59]:
# The next step is to convert each demographic count column to a proportion of
# its age bucket's total population.
def proportion(data=None):
    """Divide every demographic count by the total population of its age bucket.

    Columns ending in ``_x`` are divided by ``TOT_POP_x``, columns ending in
    ``_y`` by ``TOT_POP_y``, and unsuffixed columns by ``TOT_POP``. The three
    ``TOT_POP*`` columns themselves are left as raw counts.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        County-level frame with the suffixed count columns. Defaults to the
        notebook-level ``pop_data`` so the existing ``proportion()`` call keeps
        working.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order. The input frame is
        not modified, so calling this without assigning the result has no side
        effect on ``pop_data``.

    Raises
    ------
    KeyError
        If any expected count or total column is missing.
    """
    if data is None:
        data = pop_data

    count_cols = ['TOT_MALE', 'TOT_FEMALE', 'WA_MALE', 'WA_FEMALE', 'BA_MALE',
                  'BA_FEMALE', 'IA_MALE', 'IA_FEMALE', 'AA_MALE', 'AA_FEMALE', 'NA_MALE',
                  'NA_FEMALE', 'TOM_MALE', 'TOM_FEMALE', 'H_MALE', 'H_FEMALE']

    # Work on a copy so the caller's frame is never altered in place.
    shares = data.copy()

    # '' / '_x' / '_y' are the suffixes of the three age buckets in the frame.
    for suffix in ('', '_x', '_y'):
        bucket_cols = [name + suffix for name in count_cols]
        bucket_total = shares['TOT_POP' + suffix]
        # Row-wise division of the whole bucket by its total in one step.
        shares[bucket_cols] = shares[bucket_cols].div(bucket_total, axis=0)

    return shares

In [60]:
# Convert the counts to proportions and keep the result as pop_data.
# NOTE: not idempotent — running this cell a second time would divide the
# already-computed proportions by the population totals again.
pop_data = proportion()

In [64]:
# Load the intermediate data set that the demographic data will be merged
# into (path is relative to the notebook's working directory).
final_df = pd.read_csv('../Datasets/intermediate1.csv')

In [67]:
# List the column labels of the intermediate frame. Several labels carry
# leading/trailing spaces, which matters when referring to them by name.
final_df.columns

Index(['FIPS', 'Stabr', 'area_name', 'Rural_urban_continuum_code_2013',
       'Urban_influence_code_2013', 'Metro_2013',
       ' Civilian_labor_force_2000 ', 'Unemployment_rate_2000',
       ' Civilian_labor_force_2001 ', 'Unemployment_rate_2001',
       ' Civilian_labor_force_2002 ', 'Unemployment_rate_2002',
       ' Civilian_labor_force_2003 ', 'Unemployment_rate_2003',
       ' Civilian_labor_force_2004 ', 'Unemployment_rate_2004',
       ' Civilian_labor_force_2005 ', 'Unemployment_rate_2005',
       ' Civilian_labor_force_2006 ', 'Unemployment_rate_2006',
       ' Civilian_labor_force_2007 ', 'Unemployment_rate_2007',
       ' Civilian_labor_force_2008 ', 'Unemployment_rate_2008',
       'Civilian_labor_force_2009', 'Unemployment_rate_2009',
       ' Civilian_labor_force_2010 ', 'Unemployment_rate_2010',
       ' Civilian_labor_force_2011 ', 'Unemployment_rate_2011',
       ' Civilian_labor_force_2012 ', 'Unemployment_rate_2012',
       ' Civilian_labor_force_2013 ', 'Unemploym

In [66]:
# Drop columns that should have been removed in the earlier processing stage:
# the raw Employed_/Unemployed_ counts for 2000-2016 plus three leftover
# state / county-equivalent label columns.
employment_count_cols = []
for year in range(2000, 2017):
    # Every year's labels are wrapped in single spaces except 2009's.
    pad = '' if year == 2009 else ' '
    employment_count_cols.append(pad + 'Employed_' + str(year) + pad)
    employment_count_cols.append(pad + 'Unemployed_' + str(year) + pad)

leftover_label_cols = ['state', 'state_po', 'County_equivalent']
final_df = final_df.drop(columns = employment_count_cols + leftover_label_cols)

In [68]:
# Make FIPS a five-character, zero-padded string so it matches the keys in
# pop_data and the merge joins on the correct values.
final_df['FIPS'] = final_df['FIPS'].astype(str).str.zfill(5)

# Outer-join the population proportions onto the other data, then show the
# first 5 rows as a quick check.
final_df = final_df.merge(pop_data, how = 'outer', on='FIPS')
final_df.head()

Unnamed: 0,FIPS,Stabr,area_name,Rural_urban_continuum_code_2013,Urban_influence_code_2013,Metro_2013,Civilian_labor_force_2000,Unemployment_rate_2000,Civilian_labor_force_2001,Unemployment_rate_2001,Civilian_labor_force_2002,Unemployment_rate_2002,Civilian_labor_force_2003,Unemployment_rate_2003,Civilian_labor_force_2004,Unemployment_rate_2004,Civilian_labor_force_2005,Unemployment_rate_2005,Civilian_labor_force_2006,Unemployment_rate_2006,Civilian_labor_force_2007,Unemployment_rate_2007,Civilian_labor_force_2008,Unemployment_rate_2008,Civilian_labor_force_2009,Unemployment_rate_2009,Civilian_labor_force_2010,Unemployment_rate_2010,Civilian_labor_force_2011,Unemployment_rate_2011,Civilian_labor_force_2012,Unemployment_rate_2012,Civilian_labor_force_2013,Unemployment_rate_2013,Civilian_labor_force_2014,Unemployment_rate_2014,Civilian_labor_force_2015,Unemployment_rate_2015,Civilian_labor_force_2016,Unemployment_rate_2016,Dvotes2000,Rvotes2000,tvotes2000,Dvotes2004,Rvotes2004,tvotes2004,Dvotes2008,Rvotes2008,tvotes2008,Dvotes2012,Rvotes2012,tvotes2012,Dvotes2016,Rvotes2016,tvotes2016,2P_2000,2P_2004,2P_2008,2P_2012,D_win2016,TOT_POP_x,TOT_MALE_x,TOT_FEMALE_x,WA_MALE_x,WA_FEMALE_x,BA_MALE_x,BA_FEMALE_x,IA_MALE_x,IA_FEMALE_x,AA_MALE_x,AA_FEMALE_x,NA_MALE_x,NA_FEMALE_x,TOM_MALE_x,TOM_FEMALE_x,H_MALE_x,H_FEMALE_x,TOT_POP_y,TOT_MALE_y,TOT_FEMALE_y,WA_MALE_y,WA_FEMALE_y,BA_MALE_y,BA_FEMALE_y,IA_MALE_y,IA_FEMALE_y,AA_MALE_y,AA_FEMALE_y,NA_MALE_y,NA_FEMALE_y,TOM_MALE_y,TOM_FEMALE_y,H_MALE_y,H_FEMALE_y,TOT_POP,TOT_MALE,TOT_FEMALE,WA_MALE,WA_FEMALE,BA_MALE,BA_FEMALE,IA_MALE,IA_FEMALE,AA_MALE,AA_FEMALE,NA_MALE,NA_FEMALE,TOM_MALE,TOM_FEMALE,H_MALE,H_FEMALE
0,1001,AL,"Autauga County, AL",2.0,2.0,1.0,21720.0,4.0,21955.0,4.1,22094.0,4.8,22604.0,5.1,23218.0,4.8,23949.0,3.8,24398.0,3.3,24383.0,3.3,24687.0,5.1,24703.0,9.7,25713.0,8.9,25836.0,8.4,25740.0,6.9,25810.0,6.2,25592.0,5.8,25652.0,5.0,26031.0,5.1,0.291822,0.708178,17208.0,0.238448,0.761552,20081.0,0.259321,0.740679,23641.0,0.268006,0.731994,23932.0,5936.0,18172.0,24973.0,16935.0,19954.0,23496.0,23742.0,0.0,17748.0,0.492675,0.507325,0.371197,0.37469,0.103223,0.114773,0.001916,0.002366,0.007494,0.008339,0.000732,0.000507,0.008114,0.006649,0.018199,0.014537,14814.0,0.487647,0.512353,0.394087,0.402592,0.082287,0.096868,0.003173,0.002633,0.003645,0.006008,0.000338,0.00027,0.004118,0.003983,0.008775,0.009316,8093.0,0.434573,0.565427,0.37588,0.47226,0.052762,0.080193,0.001483,0.004448,0.00173,0.004819,0.000124,0.000124,0.002595,0.003583,0.003707,0.007414
1,1003,AL,"Baldwin County, AL",3.0,2.0,1.0,69533.0,3.7,69161.0,4.3,69169.0,5.0,72299.0,5.0,74772.0,5.2,76804.0,4.0,79711.0,3.2,82659.0,3.1,83223.0,4.6,82451.0,9.8,83459.0,10.0,85045.0,9.0,84414.0,7.5,85280.0,6.6,86384.0,6.1,87872.0,6.0,90895.0,5.3,0.255099,0.744901,56480.0,0.22749,0.77251,69320.0,0.240351,0.759649,81413.0,0.21819,0.78181,85338.0,18458.0,72883.0,95215.0,54869.0,68570.0,80657.0,84440.0,0.0,59850.0,0.493968,0.506032,0.419315,0.430175,0.05726,0.055923,0.004294,0.003776,0.005714,0.008287,0.000585,0.000301,0.0068,0.007569,0.031997,0.026717,57466.0,0.477082,0.522918,0.427209,0.46417,0.037779,0.044705,0.004403,0.003846,0.002993,0.005046,0.000209,0.000313,0.00449,0.004838,0.014757,0.013312,40494.0,0.4656,0.5344,0.435521,0.496963,0.022053,0.028745,0.003037,0.003013,0.001506,0.002272,4.9e-05,0.000148,0.003433,0.00326,0.005581,0.006124
2,1005,AL,"Barbour County, AL",6.0,6.0,0.0,11373.0,5.5,11250.0,7.4,10971.0,7.7,10977.0,7.1,10633.0,7.2,10760.0,5.8,10705.0,5.7,10334.0,6.3,10161.0,8.8,10003.0,14.3,10221.0,12.3,9849.0,11.5,9362.0,11.5,9099.0,10.2,8845.0,10.5,8625.0,9.0,8436.0,8.3,0.504473,0.495527,10395.0,0.450284,0.549716,10777.0,0.492692,0.507308,11630.0,0.515791,0.484209,11509.0,4871.0,5454.0,10469.0,10284.0,10731.0,11563.0,11462.0,0.0,8437.0,0.604243,0.395757,0.267986,0.172455,0.319663,0.214531,0.004741,0.002963,0.002608,0.002133,0.002371,0.00083,0.006874,0.002845,0.036624,0.018846,6727.0,0.521778,0.478222,0.281849,0.252564,0.229077,0.218076,0.00327,0.002081,0.002081,0.002378,0.000446,0.000446,0.005054,0.002676,0.01219,0.009663,4674.0,0.434318,0.565682,0.292683,0.346812,0.136072,0.211168,0.001712,0.001712,0.001498,0.002781,0.000214,0.000214,0.002139,0.002995,0.004707,0.003851
3,1007,AL,"Bibb County, AL",1.0,1.0,1.0,8565.0,5.3,9081.0,6.8,8933.0,7.0,8836.0,6.0,8843.0,5.5,8861.0,4.5,8850.0,4.2,8791.0,4.1,8749.0,5.8,8742.0,13.3,8934.0,11.4,8933.0,10.5,8798.0,8.5,8705.0,7.9,8559.0,7.2,8589.0,7.0,8644.0,6.4,0.388085,0.611915,7101.0,0.276286,0.723714,7600.0,0.268543,0.731457,8644.0,0.264219,0.735781,8420.0,1874.0,6738.0,8819.0,6983.0,7561.0,8561.0,8334.0,0.0,7777.0,0.584416,0.415584,0.37238,0.329947,0.203935,0.079979,0.001414,0.001929,0.001157,0.000386,0.002443,0.000257,0.003086,0.003086,0.022502,0.013244,6167.0,0.526026,0.473974,0.406681,0.381871,0.113507,0.083023,0.001459,0.002108,0.000486,0.002919,0.000162,0.0,0.00373,0.004054,0.007459,0.005838,3517.0,0.441854,0.558146,0.379016,0.478533,0.055161,0.07478,0.003412,0.000284,0.000853,0.000284,0.0,0.0,0.003412,0.004265,0.005971,0.006824
4,1009,AL,"Blount County, AL",1.0,1.0,1.0,25106.0,3.5,25305.0,3.6,25757.0,5.4,25900.0,4.6,26208.0,4.2,26446.0,3.6,26770.0,3.2,26629.0,3.2,26698.0,4.7,26480.0,10.0,24906.0,9.8,25123.0,8.7,24960.0,6.9,24887.0,6.3,24527.0,6.1,24521.0,5.0,24684.0,5.4,0.282079,0.717921,17973.0,0.184675,0.815325,21504.0,0.147296,0.852704,24267.0,0.125174,0.874826,24006.0,2156.0,22859.0,25588.0,17644.0,21324.0,23911.0,23727.0,0.0,17150.0,0.501808,0.498192,0.482624,0.479883,0.010379,0.006531,0.002915,0.004082,0.001108,0.001633,0.000933,0.000583,0.003848,0.005481,0.065364,0.051778,15493.0,0.49971,0.50029,0.482992,0.484219,0.007358,0.006648,0.002646,0.002324,0.001226,0.001485,0.000581,0.000258,0.004905,0.005357,0.027174,0.021623,10182.0,0.447456,0.552544,0.433707,0.537419,0.005402,0.00717,0.004027,0.002652,0.000491,0.001277,0.000196,0.000295,0.003634,0.003732,0.010803,0.008839


In [69]:
# Check for missingness: count the missing values in each column after the
# outer merge.
final_df.isna().sum()

FIPS                                0
Stabr                               3
area_name                           3
Rural_urban_continuum_code_2013     3
Urban_influence_code_2013           3
Metro_2013                          3
 Civilian_labor_force_2000          3
Unemployment_rate_2000              3
 Civilian_labor_force_2001          3
Unemployment_rate_2001              3
 Civilian_labor_force_2002          3
Unemployment_rate_2002              3
 Civilian_labor_force_2003          3
Unemployment_rate_2003              3
 Civilian_labor_force_2004          3
Unemployment_rate_2004              3
 Civilian_labor_force_2005          3
Unemployment_rate_2005              3
 Civilian_labor_force_2006          3
Unemployment_rate_2006              3
 Civilian_labor_force_2007          3
Unemployment_rate_2007              3
 Civilian_labor_force_2008          3
Unemployment_rate_2008              3
Civilian_labor_force_2009           3
Unemployment_rate_2009              3
 Civilian_la

In [70]:
#Drop the rows with missing values (any row with at least one NaN is removed)
final_df = final_df.dropna()
#Final shape: 3139 rows, 111 columns
final_df.shape

(3139, 111)

I am now going to export this data frame as a .csv so it can be loaded into the visualization and modeling script.

In [71]:
# Write the merged frame to CSV without the index column.
# NOTE(review): this writes to the current working directory, while
# intermediate1.csv was read from '../Datasets/' — confirm the intended
# output location.
final_df.to_csv('intermediate2.csv', index=False)

In [72]:
# Close the SQLite connection opened at the top of the notebook.
conn.close()