## Importing and Cleaning the Elected Officials Data Set

This notebook imports and cleans the elected officials data set.

In [8]:
# Import statements

import pandas as pd

In [9]:
# Reads in the ELOF dataset
df_elof = pd.read_csv('RDC-Americas-Cities-2020-Raw-Data - Raw Data- Top 100 Cities.csv')

# Converts all variables to lowercase
df_elof.columns = df_elof.columns.str.replace('\s+', '_').str.lower()

df_elof.head()

Unnamed: 0,state,office_uuid,office_name,seat,office_level,office_role,office_category,body_name,jurisdiction,jurisdiction_ocdid,...,person_uuid,official_name,official_party,party,wnw,race,sex,level,include,state.1
0,AL,c13e6096-e0aa-4c64-b9d6-1e1ebe328f0f,Birmingham City Council Member,District 1,locality,,,,Birmingham city,,...,07802561-7ea9-4b7b-b49c-6ec401a0d683,Clinton Woods,Nonpartisan,Independent,Non-White,Black or African American,Male,City,1,AL
1,AL,6b865770-e2cd-423e-bfa0-b0a96b463ab8,Birmingham City Council Member,District 4,locality,,,,Birmingham city,,...,8a058100-7fd8-41c6-b8c6-8d46f555c382,William Parker,Nonpartisan,Independent,Non-White,Black or African American,Male,City,1,AL
2,AL,c28f6b10-c58a-418a-8281-40384263eac5,Birmingham City Council Member,District 6,locality,,,,Birmingham city,,...,2d875891-b877-43b4-b678-362c2fbbe2d7,Crystal Smitherman,Nonpartisan,Independent,Non-White,Black or African American,Female,City,1,AL
3,AL,0dda83cc-014e-41ee-98f2-28b9ef5c35c2,Birmingham City Council Member,District 7,locality,,,,Birmingham city,,...,fba7f72c-6df1-4725-a973-0c337ae6e57d,Wardine Alexander,Nonpartisan,Independent,Non-White,Black or African American,Female,City,1,AL
4,AL,cc8204ba-8ba8-4240-8f29-34a80ade9fd3,Birmingham City Council Member,District 8,locality,,,,Birmingham city,,...,bbde59bd-12be-4f70-a298-f10200926ab8,Steven W. Hoyt,Nonpartisan,Independent,Non-White,Black or African American,Male,City,1,AL


In [10]:
# Now we move on to restricting the data set to keep
# the relevant columns for our research questions

# Keeps only columns of relevant variables
df_elof = df_elof[["state", "race", "sex"]]

# Racial categories which will be dropped from the dataset
# due to not having an equivalent in the census data
dropped_categories = ["Other", "Unknown"]

# Drops rows which have race value in a dropped category
df_elof = df_elof[~df_elof["race"].isin(dropped_categories)]

# Placeholder, assigns a count value of one to every official
df_elof["elof_count"] = 1

# Creates the groups that the data will be binned into
columns_to_groupby = ["state", "race", "sex"]
df_elof = df_elof.groupby(by = columns_to_groupby)

# Unpacks the pandas groupby object and sums across columns which
# are not being grouped
df_elof = df_elof.sum().reset_index()

# Creates the groups that the data will be binned into
# for calculating elof_count for sex = Total
columns_to_groupby = ["state", "race"]

# Temporary df to hold the counts of elected officials
# when sex = Total
df_sex_total = df_elof.groupby(by = columns_to_groupby)

# Unpacks the pandas groupby object and sums across columns which
# are not being grouped
df_sex_total = df_sex_total.sum().reset_index()

# Adds sex column so it matches the original dataframe
df_sex_total["sex"] = "Total"

# Adds the total sex df back to the full dataframe
df_elof = df_elof.merge(df_sex_total, how = "outer")

# Sorts dataframe by state again
df_elof = df_elof.sort_values("state")

# creates the groups that the data will be binned into
# for calculating elof_count for race = Total
columns_to_groupby = ["state", "sex"]
# temporary df to hold the counts of elected officials
# when race = Total
df_race_total = df_elof.groupby(by = columns_to_groupby)
# unpacks the pandas groupby object and sums across columns which
# are not being grouped
df_race_total = df_race_total.sum().reset_index()
# adds race column so it matches the original dataframe
df_race_total["race"] = "Total"
# add the total race df back to the full dataframe
df_elof = df_elof.merge(df_race_total, how = "outer")
# sorts dataframe by state again
df_elof = df_elof.sort_values("state")

# Gets iterable of states which are in the dataset
states = df_elof["state"].unique()

for state in states:
    # Gets the total state population by summing across
    # all race categories when the state is fixed and the
    # sex category is set to total
    state_total = sum(df_elof["elof_count"].
                   loc[(df_elof["state"] == state)
                      & (df_elof["sex"] == "Total")
                      & (df_elof["race"] == "Total")])
    # Creates new column with the percentage of the state population
    df_elof.loc[df_elof["state"] == state, "percentstate_elof"] = df_elof["elof_count"].div(state_total)
    
# Outputs new file to csv
df_elof.to_csv("elof_data_cleaned.csv")

In [11]:
df_elof[df_elof.state == "AL"]

Unnamed: 0,state,race,sex,elof_count,percentstate_elof
0,AL,Black or African American,Female,2,0.2
290,AL,Total,Total,10,1.0
289,AL,Total,Male,7,0.7
5,AL,White,Female,1,0.1
288,AL,Total,Female,3,0.3
3,AL,Black or African American,Total,7,0.7
2,AL,White,Male,2,0.2
4,AL,Black or African American,Male,5,0.5
1,AL,White,Total,3,0.3
