## Importing and Cleaning Census Data Set

This notebook imports and cleans the census data set. 

Column names and values are changed in this data set to match the elected officials data set.

In [14]:
# Import statements

import pandas as pd

In [15]:
# For use in turning state names
# into corresponding two letter code
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

In [16]:
# Use dicts to convert values in columns to
# their corresponding meaning in the dataset
# This is taken from the key

region_dict = {
    1 : "Northeast",
    2 : "Midwest",
    3 : "South",
    4 : "West"
}

division_dict = {
    1 : "New England",
    2 : "Middle Atlantic",
    3 : "East North Central",
    4 : "West North Central",
    5 : "South Atlantic",
    6 : "East South Central",
    7 : "West South Central",
    8 : "Mountain",
    9 : "Pacific"
}

sex_dict = {
    0 : "Total",
    1 : "Male",
    2 : "Female",
}

origin_dict = {
    0 : "Total",
    1 : "Not Hispanic",
    2 : "Hispanic"
}

race_dict = {
    1 : "White",
    2 : "Black or African American Alone",
    3 : "American Indian or Alaska Native Alone",
    4 : "Asian Alone", 
    5 : "Native Hawaiian and Other Pacific Islander Alone",
    6 : "Two or more races"
}

In [17]:
# Read in Census data set
df_pop_raw = pd.read_csv("sc-est2019-alldata6.csv")

# List of labels in the csv file we aren't using and can drop
# from the population dataframe
labels_to_drop = ["SUMLEV", "STATE", "CENSUS2010POP",
                 "ESTIMATESBASE2010", "POPESTIMATE2010", "POPESTIMATE2011",
                 "POPESTIMATE2012", "POPESTIMATE2013", "POPESTIMATE2014",
                 "POPESTIMATE2015", "POPESTIMATE2016", "POPESTIMATE2017",
                 "POPESTIMATE2018"]

# Drops the labels
df_pop = df_pop_raw.drop(labels = labels_to_drop, axis = 1)

# List of dicts and column names in the dataframe
# to replace the original numerical encodings
dict_list = [us_state_abbrev, region_dict, division_dict, 
             sex_dict, origin_dict, race_dict]
column_list = ["NAME", "REGION", "DIVISION", "SEX", "ORIGIN", "RACE"]

# Replaces all the numerical encodings
for (dictionary, column) in zip(dict_list, column_list):
    for key in dictionary.keys():
        df_pop.loc[(df_pop[column] == key), column] = dictionary[key]

# Removes data on everyone younger than 18
df_pop = df_pop.drop(df_pop[df_pop["AGE"] < 18].index)

# Creates the groups that the data will be binned into
columns_to_groupby = ["NAME", "REGION", "DIVISION", "SEX", "ORIGIN", "RACE"]
df_pop = df_pop.groupby(by = columns_to_groupby)

# Unpacks the pandas groupby object and sums across columns which
# are not being grouped
df_pop = df_pop.sum().reset_index()

# Drops the age column
df_pop = df_pop.drop(labels = "AGE", axis = 1)
df_pop

Unnamed: 0,NAME,REGION,DIVISION,SEX,ORIGIN,RACE,POPESTIMATE2019
0,AK,West,Pacific,Female,Hispanic,American Indian or Alaska Native Alone,1226
1,AK,West,Pacific,Female,Hispanic,Asian Alone,350
2,AK,West,Pacific,Female,Hispanic,Black or African American Alone,933
3,AK,West,Pacific,Female,Hispanic,Native Hawaiian and Other Pacific Islander Alone,163
4,AK,West,Pacific,Female,Hispanic,Two or more races,1176
...,...,...,...,...,...,...,...
2749,WY,West,Mountain,Total,Total,Asian Alone,5365
2750,WY,West,Mountain,Total,Total,Black or African American Alone,5552
2751,WY,West,Mountain,Total,Total,Native Hawaiian and Other Pacific Islander Alone,456
2752,WY,West,Mountain,Total,Total,Two or more races,7235


In [18]:
# This section focuses on transforming
# the origin and race values in this dataset
# to match that of the elected officials data

# Dict to convert race from current dataset to
# match the elected officials data
second_race_dict = {
    "Black or African American Alone" : "Black or African American",
    "American Indian or Alaska Native Alone" : "American Indian or Alaska Native",
    "Asian Alone" : "Asian American or Pacific Islander",
    "Native Hawaiian and Other Pacific Islander Alone" : "Asian American or Pacific Islander",
    "Two or more races" : "Multiracial"
}

# Converts race values using same method call as above
for race in second_race_dict.keys():
    df_pop.loc[(df_pop["RACE"] == race), "RACE"] = second_race_dict[race]
    
# Overwrites race with Hispanic or Latino if applicable
df_pop.loc[(df_pop["ORIGIN"] == "Hispanic"), "RACE"] = "Hispanic or Latino"
# Drops the Hispanic Origin column
df_pop = df_pop.drop(labels = "ORIGIN", axis = 1)

# Creates the groups that the data will be binned into
columns_to_groupby = ["NAME", "REGION", "DIVISION", "SEX", "RACE"]
df_pop = df_pop.groupby(by = columns_to_groupby)

# Unpacks the pandas groupby object and sums across columns which
# are not being grouped
df_pop = df_pop.sum().reset_index()
df_pop

Unnamed: 0,NAME,REGION,DIVISION,SEX,RACE,POPESTIMATE2019
0,AK,West,Pacific,Female,American Indian or Alaska Native,75564
1,AK,West,Pacific,Female,Asian American or Pacific Islander,46857
2,AK,West,Pacific,Female,Black or African American,16345
3,AK,West,Pacific,Female,Hispanic or Latino,16368
4,AK,West,Pacific,Female,Multiracial,28676
...,...,...,...,...,...,...
913,WY,West,Mountain,Total,Asian American or Pacific Islander,11246
914,WY,West,Mountain,Total,Black or African American,10542
915,WY,West,Mountain,Total,Hispanic or Latino,38580
916,WY,West,Mountain,Total,Multiracial,13177


In [19]:
# This section focuses on converting
# Census population estimates by sex and race
# into percentages of the total state population

# Get all unique states in dataset
states = df_pop["NAME"].unique()

# Creates the groups that the data will be binned into
# for calculating population for race = Total
columns_to_groupby = ["NAME", "SEX", "REGION", "DIVISION"]

# Temporary dataframe to hold the sums of population
# when race = Total
df_race_total = df_pop.groupby(by = columns_to_groupby)

# Unpacks the pandas groupby object and sums across columns which
# are not being grouped
df_race_total = df_race_total.sum().reset_index()

# Adds race column so it matches the original dataframe
df_race_total["RACE"] = "Total"

# Add the total race dataframe back to the full dataframe
df_pop = df_pop.merge(df_race_total, how = "outer")

# Sorts dataframe by state again
df_pop = df_pop.sort_values("NAME")

for state in states:
    # Gets the total state population by summing across
    # all race categories when the state is fixed and the
    # sex category is set to total
    state_total = sum(df_pop["POPESTIMATE2019"].
                   loc[(df_pop["NAME"] == state)
                      & (df_pop["SEX"] == "Total")
                      & (df_pop["RACE"] == "Total")])
    # Creates new column with the percentage of the state population
    df_pop.loc[df_pop["NAME"] == state, "PERCENTSTATE_POP"] = df_pop["POPESTIMATE2019"].div(state_total)
    
# Turns all columns into lowercase  
df_pop.columns = df_pop.columns.str.replace('\s+', '_').str.lower()

# dict to convert current variables to the name in
# the elected officials dataset
variable_dict = {
    "name" : "state",
    "popestimate2019" : "pop_count"
}

# renames the variables
df_pop = df_pop.rename(columns = variable_dict)

# outputs new file to csv
df_pop.to_csv("census_data_cleaned.csv")

In [20]:
df_pop

Unnamed: 0,state,region,division,sex,race,pop_count,percentstate_pop
0,AK,West,Pacific,Female,American Indian or Alaska Native,75564,0.068500
920,AK,West,Pacific,Total,Total,1103124,1.000000
919,AK,West,Pacific,Male,Total,578074,0.524034
17,AK,West,Pacific,Total,White,731440,0.663062
16,AK,West,Pacific,Total,Multiracial,57411,0.052044
...,...,...,...,...,...,...,...
902,WY,West,Mountain,Female,Black or African American,4092,0.004597
901,WY,West,Mountain,Female,Asian American or Pacific Islander,6391,0.007180
900,WY,West,Mountain,Female,American Indian or Alaska Native,9387,0.010547
907,WY,West,Mountain,Male,Asian American or Pacific Islander,4855,0.005455
