![](../additional_materials/logos/darden_rice_logo_SM.png)

### VBM Demographic Analysis - DEV ENVIRONMENT
#### 2021 Municipal Primary, Ballots Mailed

---

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../data/raw/2021_VBM_mailed_ballots.csv')
# df.head(3)

In [3]:
# Define columns to be renamed based on headers in the first row (index 0) of df
column_names = {'Age': 'Age 18-24', 'Unnamed: 2': 'Age 25-34', 'Unnamed: 3': 'Age 35-49', 'Unnamed: 4': 'Age 50-64',
                'Unnamed: 5': 'Age 65+', 'Unnamed: 6': 'Age Unknown', 'Gender': 'Male', 'Unnamed: 8': 'Female', 
                'Unnamed: 9': 'Gender Unknown', 'Ethnicity': 'African American', 'Unnamed: 11': 'Hispanic', 
                'Unnamed: 12': 'White', 'Unnamed: 13': 'Other Ethnicity', 'Unnamed: 14': 'Ethnicity Unknown', 
                'Party': 'Democrat', 'Unnamed: 16': 'Independent', 'Unnamed: 17': 'Republican', 'Unnamed: 18': 'Other Party',
                'Unnamed: 19': 'Party Unknown'}

# Rename columns based on dictionary above
df.rename(column_names, axis=1, inplace=True)

In [4]:
# Drop the first row of the df, containing headers
df.drop(labels=0, axis=0, inplace=True)
# Drop final row of df, containing column totals
df.drop(df.tail(1).index, inplace=True)

# Reset df index
df.reset_index(drop = True, inplace = True)

In [5]:
# Remove commas from numeric entries and converting all columns to numeric values
df = df.apply(lambda x: x.str.replace(',', ''))

df = df.apply(pd.to_numeric)

In [6]:
df.dtypes

Precinct             int64
Age 18-24            int64
Age 25-34            int64
Age 35-49            int64
Age 50-64            int64
Age 65+              int64
Age Unknown          int64
Male                 int64
Female               int64
Gender Unknown       int64
African American     int64
Hispanic             int64
White                int64
Other Ethnicity      int64
Ethnicity Unknown    int64
Democrat             int64
Independent          int64
Republican           int64
Other Party          int64
Total People         int64
dtype: object

In [7]:
# Set index to precinct number
df.set_index(df['Precinct'], inplace = True)
df.drop('Precinct', axis = 1, inplace = True)

# df.head(3)

In [8]:
# Use this function to remove all columns that sum to 0 people, i.e., columns that don't represent any voters
def remove_zero_sum_cols(df):
    features = []
    columns = list(df.columns.values)
    for c in columns:
        if df[c].sum() != 0:
            features.append(c)
    df = df[features]
    return df

In [9]:
df = remove_zero_sum_cols(df)

In [10]:
df.head(3)

Unnamed: 0_level_0,Age 18-24,Age 25-34,Age 35-49,Age 50-64,Age 65+,Male,Female,Gender Unknown,African American,Hispanic,White,Other Ethnicity,Democrat,Independent,Republican,Total People
Precinct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
101,123,222,374,666,621,829,1167,10,1052,42,781,131,1395,356,255,2006
102,33,93,163,258,418,378,583,4,466,17,428,54,707,134,124,965
103,25,41,103,136,180,210,275,0,33,9,413,30,258,108,119,485


---
---
#### Save clean master csv here

In [11]:
df.to_csv('../data/processed/2021_VBM_mailed_ballots_master.csv')

---
#### Generate and Export Subset Dataframes

In [14]:
def subset_dfs(df):
    age = []
    gen = []
    eth = []
    pty = []
    columns = list(df.columns.values)
    for c in columns:
        if 'Age' in c:
            age.append(c)
        elif 'M' or 'F' or 'G' in c:
            gen.append(c)
        elif 'A' or 'H' or 'W' or 'E' in c:
            eth.append(c)
        elif 'D' or 'I' or 'R' or 'P' in c:
            pty.append(c)
    return age, gen, eth, pty

In [15]:
subset_dfs(df)

(['Age 18-24', 'Age 25-34', 'Age 35-49', 'Age 50-64', 'Age 65+'],
 ['Male',
  'Female',
  'Gender Unknown',
  'African American',
  'Hispanic',
  'White',
  'Other Ethnicity',
  'Democrat',
  'Independent',
  'Republican',
  'Total People'],
 [],
 [])

In [18]:
def subset_dfs(df):
    age_cols = ['Age 18-24', 'Age 25-34', 'Age 35-49', 'Age 50-64', 'Age 65+', 'Age Unknown']
    gender_cols = ['Male', 'Female', 'Gender Unknown']
    ethnicity_cols = ['African American', 'Hispanic', 'White', 'Other Ethnicity', 'Ethnicity Unknown']
    party_cols = ['Democrat', 'Independent', 'Republican', 'Other Party']
    age = []
    gen = []
    eth = []
    pty = []
    columns = list(df.columns.values)
    for c in columns:
        if age_cols in c:
            age.append(c)
        elif gender_cols in c:
            gen.append(c)
        elif ethnicity_cols in c:
            eth.append(c)
        elif party_cols in c:
            pty.append(c)
    return df

# Subset dataframes containing age, gender, ethnicity, and party affiliation breakdowns by precinct
age_df = df[['Precinct', 'Age 18-24', 'Age 25-34', 'Age 35-49', 'Age 50-64', 'Age 65+', 'Age Unknown', 'Total People']].copy()
gen_df = df[['Precinct', 'Male', 'Female', 'Gender Unknown', 'Total People']].copy()
eth_df = df[['Precinct', 'African American', 'Hispanic', 'White', 'Other Ethnicity', 'Ethnicity Unknown', 'Total People']].copy()
party_df = df[['Precinct', 'Democrat', 'Independent', 'Republican', 'Other Party', 'Total People']].copy()

In [19]:
# Saving all master dataframe subsets to individual csv files
age_df.to_csv('../data/processed/2021_VBM_age_breakdown', index = False)
gen_df.to_csv('../data/processed/2021_VBM_gender_breakdown', index = False)
eth_df.to_csv('../data/processed/2021_VBM_ethnicity_breakdown', index = False)
party_df.to_csv('../data/processed/2021_VBM_party_breakdown', index = False)

---
---

### Analysis

In [12]:
df = pd.read_csv('../data/processed/2021_VBM_mailed_ballots_master.csv')

In [13]:
df.head(3)

Unnamed: 0,Precinct,Age 18-24,Age 25-34,Age 35-49,Age 50-64,Age 65+,Male,Female,Gender Unknown,African American,Hispanic,White,Other Ethnicity,Democrat,Independent,Republican,Total People
0,101,123,222,374,666,621,829,1167,10,1052,42,781,131,1395,356,255,2006
1,102,33,93,163,258,418,378,583,4,466,17,428,54,707,134,124,965
2,103,25,41,103,136,180,210,275,0,33,9,413,30,258,108,119,485


In [14]:
df.shape

(91, 20)

In [15]:
# df.isnull().sum()

In [16]:
# df.dtypes

---

In [None]:
# Set index to precinct number
df.set_index(df['Precinct'], inplace = True)
df.drop('Precinct', axis = 1, inplace = True)

# Drop 'Total People'
df.drop('Total People', axis = 1, inplace = True)

---

In [None]:
# age_cols = ['Age 18-24', 'Age 25-34', 'Age 35-49', 'Age 50-64', 'Age 65+', 'Age Unknown']
# gender_cols = ['Male', 'Female', 'Gender Unknown']
# ethnicity_cols = ['African American', 'Hispanic', 'White', 'Other Ethnicity', 'Ethnicity Unknown']
# party_cols = ['Democrat', 'Independent', 'Republican', 'Other Party']

**BELOW:** This table shows the mean, standard deviation, minimum and maximum values, and quartile values for all voters in Pinellas County (all precincts combined) who requested and wwere mailed ballots for the 2021 Municipal Primary.

In [19]:
df.describe()

Unnamed: 0,Age 18-24,Age 25-34,Age 35-49,Age 50-64,Age 65+,Age Unknown,Male,Female,Gender Unknown,African American,Hispanic,White,Other Ethnicity,Ethnicity Unknown,Democrat,Independent,Republican,Other Party,Total People
count,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0,91.0
mean,49.56044,135.978022,208.098901,304.758242,363.67033,0.0,455.263736,603.21978,3.582418,171.725275,43.362637,774.318681,72.659341,0.0,563.857143,231.824176,266.384615,0.0,1062.065934
std,36.356631,105.843908,140.642182,179.644732,248.161778,0.0,275.703578,348.777318,3.389742,250.057019,35.669008,574.249537,49.419794,0.0,325.32256,154.477801,226.309413,0.0,624.066695
min,0.0,1.0,0.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0
25%,23.5,67.5,106.5,183.0,209.5,0.0,257.0,365.5,1.0,21.5,20.5,342.5,33.0,0.0,351.5,113.5,80.5,0.0,633.5
50%,41.0,107.0,180.0,266.0,317.0,0.0,412.0,542.0,3.0,52.0,37.0,701.0,65.0,0.0,544.0,206.0,202.0,0.0,956.0
75%,59.5,176.5,279.5,403.0,420.0,0.0,608.5,826.5,6.0,197.0,51.0,1164.5,93.0,0.0,706.0,326.0,378.5,0.0,1461.0
max,191.0,532.0,687.0,836.0,1109.0,0.0,1296.0,1604.0,16.0,1106.0,183.0,2489.0,217.0,0.0,1449.0,686.0,954.0,0.0,2907.0


---

In [18]:
df.columns

Index(['Precinct', 'Age 18-24', 'Age 25-34', 'Age 35-49', 'Age 50-64',
       'Age 65+', 'Age Unknown', 'Male', 'Female', 'Gender Unknown',
       'African American', 'Hispanic', 'White', 'Other Ethnicity',
       'Ethnicity Unknown', 'Democrat', 'Independent', 'Republican',
       'Other Party', 'Total People'],
      dtype='object')

---
#### Age

In [None]:
age_features = ()

---
#### Gender

---
#### Ethnicity

---
#### Party Affiliation