In [1]:
import pandas as pd
import os

# **<font color = 'DarkRed'> Congressional District Level </font>**

## **<font color='Black'> Obtain congressional district demographics: population age, race, and economic status, plus the Cook Partisan Voting Index (PVI) for each congressional district </font>**

In [2]:
def Get_Districts_PVIs(State = None):
    """Load the 2017 and 2022 PVI labels per congressional district.

    Reads 'data/data-5vPn3.csv' (columns 'Dist', 'PVI') and
    'data/PVIs_2022.csv' (columns 'Dist', '2022 PVI') and inner-joins them on
    the district identifier.

    Parameters
    ----------
    State : str or None
        Optional state abbreviation. When given, only districts whose
        identifier starts with ``State + '-'`` are kept and that prefix is
        removed from the identifier (e.g. 'VA-01' -> '01').

    Returns
    -------
    pandas.DataFrame
        Indexed by 'CDistrict' with columns 'PVI_2017', 'PVI_2022' (labels,
        with 'EVEN' rewritten as 'R+0'), 'PVI_2017_R', 'PVI_2017_D',
        'PVI_2022_R' and 'PVI_2022_D' (signed integers; the '_R' columns are
        positive for Republican-leaning districts and '_D' is the negation).
    """
    def _read_pvi_table(filename, pvi_col, new_name):
        # usecols does not control column order, so select explicitly before
        # the positional rename.
        table = pd.read_csv(filename, usecols = ['Dist', pvi_col])[['Dist', pvi_col]]
        table.columns = ['CDistrict', new_name]
        return table

    data = pd.merge(_read_pvi_table('data/data-5vPn3.csv', 'PVI', 'PVI_2017'),
                    _read_pvi_table('data/PVIs_2022.csv', '2022 PVI', 'PVI_2022'),
                    on = 'CDistrict')

    if State is not None:
        # Match the state as a prefix only: a substring match would let 'AL'
        # pick up at-large districts of other states such as 'AK-AL'.
        prefix = State + '-'
        mask = data['CDistrict'].str.startswith(prefix, na = False)
        # Copy so the assignment below does not write into a view of the merge.
        data = data[mask].copy()
        data['CDistrict'] = data['CDistrict'].str[len(prefix):]

    for year in ('2017', '2022'):
        label_col = 'PVI_' + year
        data[label_col] = data[label_col].str.replace('EVEN', 'R+0', regex = False)
        # '+' must be replaced literally: as a regular expression it is
        # invalid and raises on pandas versions that honour regex = True for
        # single-character patterns.
        data[label_col + '_R'] = (data[label_col].str.replace('+', '', regex = False)
                                  .replace({"R": "", "D": "-"}, regex = True)
                                  .astype(int))
        data[label_col + '_D'] = -data[label_col + '_R']

    data.set_index('CDistrict', inplace = True)

    return data

In [3]:
def Load_Districts_Pop19_Age(State_Code):
    """Download age-bracket population counts per congressional district.

    Issues one request per variable to the 2019 ACS 5-year subject endpoint
    (variables S0101_C01_xxxE) for every congressional district of the state
    identified by `State_Code`. Entries of `IDs_Dict` keyed by a tuple are the
    sum of the listed variables. Each label is printed as it is fetched.

    Returns a DataFrame indexed by 'CDistrict' holding one column per label in
    `IDs_Dict`, followed by the derived columns '45 to 64 years',
    '25 to 44 years', 'Under 45 years' and 'Under 35 years'.
    """
    IP = 'https://api.census.gov/data/2019/acs/acs5/subject?get=NAME,S0101_C01_{}E&for=congressional%20district:*&in=state:{}'

    IDs_Dict = {'001':'Total Population', '002':'Under 5 years', ('003', '004'):'5 to 14 years', 
                ('005', '006'):'15 to 24 years', '022':'Under 18 years', ('007', '008'):'25 to 34 years', 
                ('009', '010'):'35 to 44 years', ('011', '012'):'45 to 54 years', ('013', '014'):'55 to 64 years', 
                ('015', '016'):'65 to 74 years', ('017', '018'):'75 to 84 years', '019':'85 years and over', 
                '030':'65 years and over'}

    def fetch_column(variable_id, label):
        # Row 0 is dropped as the header row; columns 0 and 2 are discarded
        # (presumably NAME and state, given the query string).
        raw = pd.read_json(IP.format(variable_id, State_Code))
        table = raw.drop([0]).drop([0, 2], axis = 1)
        table.columns = [label, 'CDistrict']
        table = table.set_index('CDistrict').sort_index()
        table[label] = pd.to_numeric(table[label], errors='coerce')
        return table

    Districts_Pop_Age = pd.DataFrame({'CDistrict' : []})

    for key, label in IDs_Dict.items():
        print(label)
        variable_ids = key if type(key) == tuple else (key,)
        column = None
        for variable_id in variable_ids:
            part = fetch_column(variable_id, label)
            column = part if column is None else column + part

        # The placeholder frame is empty until the first column arrives.
        if Districts_Pop_Age.empty:
            Districts_Pop_Age = column
        else:
            Districts_Pop_Age = Districts_Pop_Age.join(column)

    ages_45_to_64 = Districts_Pop_Age['45 to 54 years'] + Districts_Pop_Age['55 to 64 years']
    Districts_Pop_Age['45 to 64 years'] = ages_45_to_64
    ages_25_to_44 = Districts_Pop_Age['25 to 34 years'] + Districts_Pop_Age['35 to 44 years']
    Districts_Pop_Age['25 to 44 years'] = ages_25_to_44
    ages_45_and_over = Districts_Pop_Age['45 to 64 years'] + Districts_Pop_Age['65 years and over']
    Districts_Pop_Age['Under 45 years'] = Districts_Pop_Age['Total Population'] - ages_45_and_over
    Districts_Pop_Age['Under 35 years'] = Districts_Pop_Age['Under 45 years'] - Districts_Pop_Age['35 to 44 years']

    return Districts_Pop_Age

def Get_Districts_Pop19_Age(State_Code):
    """Return the district age table for `State_Code`, using a JSON cache.

    If 'data/<State_Code>_Districts_Pop19_Age.json' exists it is read back
    (table orient); otherwise the table is built with
    Load_Districts_Pop19_Age, written to that path and returned.
    """
    cache_path = 'data/' + State_Code + '_Districts_Pop19_Age.json'

    if not os.path.isfile(cache_path):
        fresh = Load_Districts_Pop19_Age(State_Code)
        fresh.to_json(cache_path, orient = 'table')
        return fresh

    return pd.read_json(cache_path, orient = 'table')

In [4]:
def Load_Districts_Pop19_Race(State_Code):
    """Download race/ethnicity population counts per congressional district.

    Issues one request per variable to the 2019 ACS 5-year endpoint
    (variables B03002_xxxE) for every congressional district of the state
    identified by `State_Code`, printing each label as it is fetched.

    Returns a DataFrame indexed by 'CDistrict' with one numeric column per
    label in `IDs_Dict`.
    """
    IP = 'https://api.census.gov/data/2019/acs/acs5?get=NAME,B03002_{}E&for=congressional%20district:*&in=state:{}'
    IDs_Dict = {'003':'White, Not Hispanic', '004':'Black or African American, Not Hispanic', 
                '005':'American Indian & Alaska Native, Not Hispanic', '006':'Asian, Not Hispanic',
                '007':'Native Hawaiin & Other Pacific Islander, Not Hispanic', '008':'Other, Not Hispanic', 
                '009':'Two or More Races, Not Hispanic', '012':'Hispanic or Latino'}

    def fetch_column(variable_id):
        label = IDs_Dict[variable_id]
        # Row 0 is dropped as the header row; columns 0 and 2 are discarded
        # (presumably NAME and state, given the query string).
        raw = pd.read_json(IP.format(variable_id, State_Code))
        table = raw.drop([0]).drop([0, 2], axis = 1)
        table.columns = [label, 'CDistrict']
        table = table.set_index('CDistrict').sort_index()
        table[label] = pd.to_numeric(table[label], errors='coerce')
        return table

    Districts_Pop_Race = pd.DataFrame({'CDistrict' : []})

    for variable_id, label in IDs_Dict.items():
        print(label)
        column = fetch_column(variable_id)

        # The placeholder frame is empty until the first column arrives.
        if Districts_Pop_Race.empty:
            Districts_Pop_Race = column
        else:
            Districts_Pop_Race = Districts_Pop_Race.join(column)

    return Districts_Pop_Race

def Get_Districts_Pop19_Race(State_Code):
    """Return the district race table for `State_Code`, using a JSON cache.

    If 'data/<State_Code>_Districts_Pop19_Race.json' exists it is read back
    (table orient); otherwise the table is built with
    Load_Districts_Pop19_Race, written to that path and returned.
    """
    cache_path = 'data/' + State_Code + '_Districts_Pop19_Race.json'

    if not os.path.isfile(cache_path):
        fresh = Load_Districts_Pop19_Race(State_Code)
        fresh.to_json(cache_path, orient = 'table')
        return fresh

    return pd.read_json(cache_path, orient = 'table')

In [5]:
def Load_Districts_Econ(State_Code):
    """Download income statistics per congressional district.

    Issues one request per variable to the 2019 ACS 5-year endpoint
    (variables B19326_001/002/005, B19301_001, B19013_001) for every
    congressional district of the state identified by `State_Code`, printing
    each label as it is fetched.

    Returns a DataFrame indexed by 'CDistrict' with one numeric column per
    label in `IDs_Dict`, plus 'Median Income Female/Male' (the female median
    divided by the male median).
    """
    IP = 'https://api.census.gov/data/2019/acs/acs5?get=NAME,B{}E&for=congressional%20district:*&in=state:{}'
    IDs_Dict = {'19326_001':'Median Income', '19326_002':'Median Income, Male', '19326_005':'Median Income, Female', 
                '19301_001':'Per Capita Income', '19013_001':'Median Household Income'}

    def fetch_column(variable_id):
        label = IDs_Dict[variable_id]
        # Row 0 is dropped as the header row; columns 0 and 2 are discarded
        # (presumably NAME and state, given the query string).
        raw = pd.read_json(IP.format(variable_id, State_Code))
        table = raw.drop([0]).drop([0, 2], axis = 1)
        table.columns = [label, 'CDistrict']
        table = table.set_index('CDistrict').sort_index()
        table[label] = pd.to_numeric(table[label], errors='coerce')
        return table

    Districts_Econ = pd.DataFrame({'CDistrict' : []})

    for variable_id, label in IDs_Dict.items():
        print(label)
        column = fetch_column(variable_id)

        # The placeholder frame is empty until the first column arrives.
        if Districts_Econ.empty:
            Districts_Econ = column
        else:
            Districts_Econ = Districts_Econ.join(column)

    female_median = Districts_Econ['Median Income, Female']
    male_median = Districts_Econ['Median Income, Male']
    Districts_Econ['Median Income Female/Male'] = female_median / male_median

    return Districts_Econ

def Get_Districts_Econ(State_Code):
    """Return the district income table for `State_Code`, using a JSON cache.

    If 'data/<State_Code>_Districts_Econ.json' exists it is read back (table
    orient); otherwise the table is built with Load_Districts_Econ, written
    to that path and returned.
    """
    cache_path = 'data/' + State_Code + '_Districts_Econ.json'

    if not os.path.isfile(cache_path):
        fresh = Load_Districts_Econ(State_Code)
        fresh.to_json(cache_path, orient = 'table')
        return fresh

    return pd.read_json(cache_path, orient = 'table')

In [6]:
def Get_Districts_Demographics(State, State_Code):
    """Assemble PVI, age, race and income data per congressional district.

    Concatenates, column-wise on the shared index, the results of
    Get_Districts_PVIs(State), Get_Districts_Pop19_Age(State_Code),
    Get_Districts_Pop19_Race(State_Code) and Get_Districts_Econ(State_Code).
    For every column from 'Under 5 years' through 'Hispanic or Latino' a
    'Percent <column>' column is added holding that column divided by
    'Total Population' (a fraction, not multiplied by 100).
    """
    frames = [Get_Districts_PVIs(State),
              Get_Districts_Pop19_Age(State_Code),
              Get_Districts_Pop19_Race(State_Code),
              Get_Districts_Econ(State_Code)]
    Districts_demo = pd.concat(frames, axis = 1)

    count_cols = Districts_demo.loc[:, 'Under 5 years' : 'Hispanic or Latino'].columns
    share_cols = 'Percent ' + count_cols

    shares = Districts_demo[count_cols].div(Districts_demo['Total Population'], axis = 0)
    Districts_demo[share_cols] = shares

    return Districts_demo

# **<font color = 'DarkRed'> County Level </font>**

## **<font color='Black'> Obtain county demographics: population age, race, and economic status, plus the Partisan Voting Index for each county </font>**

In [7]:
# PVIs for the Counties are obtained from https://www.zipdatamaps.com/
def Get_Counties_PVIs(State = None):
    """Load the 2020 county-level PVI table (Virginia only).

    Parameters
    ----------
    State : str or None
        State abbreviation. Only 'VA' is supported.

    Returns
    -------
    pandas.DataFrame or str
        For 'VA': a DataFrame read from 'data/PVI_VA_Counties_2020.csv',
        indexed by 'County Code' (kept as strings) and sorted by it, with
        columns 'PVI' (original label), 'PVI_R' (signed integer, positive for
        Republican-leaning counties) and 'PVI_D' (its negation).
        For any other value: the message string
        'Only VA counties PVIs are available at the moment'. Callers that
        expect a DataFrame must check for this.
    """
    if State != 'VA':
        return 'Only VA counties PVIs are available at the moment'

    filename = 'data/PVI_VA_Counties_2020.csv'
    pvi_col = 'Partisan Voting Index (2020)'
    # usecols does not control column order, so select explicitly before the
    # positional rename.
    data = pd.read_csv(filename, dtype = str, usecols = ['Code', pvi_col])[['Code', pvi_col]]
    data.columns = ['County Code', 'PVI']

    # '+' must be replaced literally: as a regular expression it is invalid
    # and raises on pandas versions that honour regex = True for
    # single-character patterns.
    signed = (data['PVI'].str.replace('+', '', regex = False)
              .replace({"Republican ": "", "Democrat ": "-", "Even ": ""}, regex = True)
              .str.strip()
              # A bare 'Even' label (or one reduced to nothing) counts as 0.
              .replace({'Even': '0', '': '0'}))
    data['PVI_R'] = signed.astype(int)
    data['PVI_D'] = - data['PVI_R']

    data.set_index('County Code', inplace = True)
    data.sort_index(inplace = True)

    return data

In [8]:
def Load_Counties_Pop19_Age(State_Code):
    """Download age-bracket population counts per county.

    Issues one request per variable to the 2019 ACS 5-year subject endpoint
    (variables S0101_C01_xxxE) for every county of the state identified by
    `State_Code`. Entries of `IDs_Dict` keyed by a tuple are the sum of the
    listed variables. Each label is printed as it is fetched.

    Returns a DataFrame indexed by 'County Code' with 'County Name' (the text
    before the first comma of the returned name), one column per label in
    `IDs_Dict`, and the derived columns '45 to 64 years', '25 to 44 years',
    'Under 45 years' and 'Under 35 years'.
    """
    IP = 'https://api.census.gov/data/2019/acs/acs5/subject?get=NAME,S0101_C01_{}E&for=county:*&in=state:{}'

    IDs_Dict = {'001':'Total Population', '002':'Under 5 years', ('003', '004'):'5 to 14 years', 
                ('005', '006'):'15 to 24 years', '022':'Under 18 years', ('007', '008'):'25 to 34 years', 
                ('009', '010'):'35 to 44 years', ('011', '012'):'45 to 54 years', ('013', '014'):'55 to 64 years', 
                ('015', '016'):'65 to 74 years', ('017', '018'):'75 to 84 years', '019':'85 years and over', 
                '030':'65 years and over'}

    def fetch_column(variable_id, label):
        # Row 0 is dropped as the header row; column 2 is discarded
        # (presumably the state code, given the query string).
        raw = pd.read_json(IP.format(variable_id, State_Code))
        table = raw.drop([0]).drop([2], axis = 1)
        table.columns = ['County Name', label, 'County Code']
        table['County Name'] = table['County Name'].str.rsplit(',').str[0]
        table = table.set_index('County Code').sort_index()
        table[label] = pd.to_numeric(table[label], errors='coerce')
        return table

    Counties_Pop_Age = pd.DataFrame({'County Code': []})

    for key, label in IDs_Dict.items():
        print(label)
        variable_ids = key if type(key) == tuple else (key,)
        column = None
        for variable_id in variable_ids:
            part = fetch_column(variable_id, label)
            column = part if column is None else column + part

        # Only the first frame contributes 'County Name'; later ones drop it
        # before joining.
        if Counties_Pop_Age.empty:
            Counties_Pop_Age = column
        else:
            Counties_Pop_Age = Counties_Pop_Age.join(column.drop('County Name', axis = 1))

    ages_45_to_64 = Counties_Pop_Age['45 to 54 years'] + Counties_Pop_Age['55 to 64 years']
    Counties_Pop_Age['45 to 64 years'] = ages_45_to_64
    ages_25_to_44 = Counties_Pop_Age['25 to 34 years'] + Counties_Pop_Age['35 to 44 years']
    Counties_Pop_Age['25 to 44 years'] = ages_25_to_44
    ages_45_and_over = Counties_Pop_Age['45 to 64 years'] + Counties_Pop_Age['65 years and over']
    Counties_Pop_Age['Under 45 years'] = Counties_Pop_Age['Total Population'] - ages_45_and_over
    Counties_Pop_Age['Under 35 years'] = Counties_Pop_Age['Under 45 years'] - Counties_Pop_Age['35 to 44 years']

    return Counties_Pop_Age

def Get_Counties_Pop19_Age(State_Code):
    """Return the county age table for `State_Code`, using a JSON cache.

    If 'data/<State_Code>_Counties_Pop19_Age.json' exists it is read back
    (table orient); otherwise the table is built with
    Load_Counties_Pop19_Age, written to that path and returned.
    """
    cache_path = 'data/' + State_Code + '_Counties_Pop19_Age.json'

    if not os.path.isfile(cache_path):
        fresh = Load_Counties_Pop19_Age(State_Code)
        fresh.to_json(cache_path, orient = 'table')
        return fresh

    return pd.read_json(cache_path, orient = 'table')

In [9]:
def Load_Counties_Pop19_Race(State_Code):
    """Download race/ethnicity population counts per county.

    Issues one request per variable to the 2019 ACS 5-year endpoint
    (variables B03002_xxxE) for every county of the state identified by
    `State_Code`, printing each label as it is fetched.

    Returns a DataFrame indexed by 'County Code' with 'County Name' (the text
    before the first comma of the returned name) and one numeric column per
    label in `IDs_Dict`.
    """
    IP = 'https://api.census.gov/data/2019/acs/acs5?get=NAME,B03002_{}E&for=county:*&in=state:{}'
    IDs_Dict = {'003':'White, Not Hispanic', '004':'Black or African American, Not Hispanic', 
                '005':'American Indian & Alaska Native, Not Hispanic', '006':'Asian, Not Hispanic',
                '007':'Native Hawaiin & Other Pacific Islander, Not Hispanic', '008':'Other, Not Hispanic', 
                '009':'Two or More Races, Not Hispanic', '012':'Hispanic or Latino'}

    def fetch_column(variable_id):
        label = IDs_Dict[variable_id]
        # Row 0 is dropped as the header row; column 2 is discarded
        # (presumably the state code, given the query string).
        raw = pd.read_json(IP.format(variable_id, State_Code))
        table = raw.drop([0]).drop([2], axis = 1)
        table.columns = ['County Name', label, 'County Code']
        table['County Name'] = table['County Name'].str.rsplit(',').str[0]
        table = table.set_index('County Code').sort_index()
        table[label] = pd.to_numeric(table[label], errors='coerce')
        return table

    Counties_Pop_Race = pd.DataFrame({'County Code' : []})

    for variable_id, label in IDs_Dict.items():
        print(label)
        column = fetch_column(variable_id)

        # Only the first frame contributes 'County Name'; later ones drop it
        # before joining.
        if Counties_Pop_Race.empty:
            Counties_Pop_Race = column
        else:
            Counties_Pop_Race = Counties_Pop_Race.join(column.drop('County Name', axis = 1))

    return Counties_Pop_Race

def Get_Counties_Pop19_Race(State_Code):
    """Return the county race table for `State_Code`, using a JSON cache.

    If 'data/<State_Code>_Counties_Pop19_Race.json' exists it is read back
    (table orient); otherwise the table is built with
    Load_Counties_Pop19_Race, written to that path and returned.
    """
    cache_path = 'data/' + State_Code + '_Counties_Pop19_Race.json'

    if not os.path.isfile(cache_path):
        fresh = Load_Counties_Pop19_Race(State_Code)
        fresh.to_json(cache_path, orient = 'table')
        return fresh

    return pd.read_json(cache_path, orient = 'table')

In [10]:
def Load_Counties_Econ(State_Code):
    """Download income statistics per county.

    Issues one request per variable to the 2019 ACS 5-year endpoint
    (variables B19326_001/002/005, B19301_001, B19013_001) for every county
    of the state identified by `State_Code`, printing each label as it is
    fetched.

    Returns a DataFrame indexed by 'County Code' with 'County Name' (the text
    before the first comma of the returned name), one numeric column per
    label in `IDs_Dict`, and 'Median Income Female/Male' (the female median
    divided by the male median).
    """
    IP = 'https://api.census.gov/data/2019/acs/acs5?get=NAME,B{}E&for=county:*&in=state:{}'
    IDs_Dict = {'19326_001':'Median Income', '19326_002':'Median Income, Male', '19326_005':'Median Income, Female', 
                '19301_001':'Per Capita Income', '19013_001':'Median Household Income'}

    def fetch_column(variable_id):
        label = IDs_Dict[variable_id]
        # Row 0 is dropped as the header row; column 2 is discarded
        # (presumably the state code, given the query string).
        raw = pd.read_json(IP.format(variable_id, State_Code))
        table = raw.drop([0]).drop([2], axis = 1)
        table.columns = ['County Name', label, 'County Code']
        table['County Name'] = table['County Name'].str.rsplit(',').str[0]
        table = table.set_index('County Code').sort_index()
        table[label] = pd.to_numeric(table[label], errors='coerce')
        return table

    Counties_Econ = pd.DataFrame({'County Code' : []})

    for variable_id, label in IDs_Dict.items():
        print(label)
        column = fetch_column(variable_id)

        # Only the first frame contributes 'County Name'; later ones drop it
        # before joining.
        if Counties_Econ.empty:
            Counties_Econ = column
        else:
            Counties_Econ = Counties_Econ.join(column.drop('County Name', axis = 1))

    female_median = Counties_Econ['Median Income, Female']
    male_median = Counties_Econ['Median Income, Male']
    Counties_Econ['Median Income Female/Male'] = female_median / male_median

    return Counties_Econ

def Get_Counties_Econ(State_Code):
    """Return the county income table for `State_Code`, using a JSON cache.

    If 'data/<State_Code>_Counties_Econ.json' exists it is read back (table
    orient); otherwise the table is built with Load_Counties_Econ, written to
    that path and returned.
    """
    cache_path = 'data/' + State_Code + '_Counties_Econ.json'

    if not os.path.isfile(cache_path):
        fresh = Load_Counties_Econ(State_Code)
        fresh.to_json(cache_path, orient = 'table')
        return fresh

    return pd.read_json(cache_path, orient = 'table')

In [11]:
def Get_Counties_Demographics(State, State_Code):
    """Assemble PVI, age, race and income data per county.

    Concatenates, column-wise on the shared index, the results of
    Get_Counties_PVIs(State), Get_Counties_Pop19_Age(State_Code),
    Get_Counties_Pop19_Race(State_Code) and Get_Counties_Econ(State_Code);
    'County Name' is kept from the age table only. For every column from
    'Under 5 years' through 'Hispanic or Latino' a 'Percent <column>' column
    is added holding that column divided by 'Total Population' (a fraction,
    not multiplied by 100).
    """
    pvi = Get_Counties_PVIs(State)
    age = Get_Counties_Pop19_Age(State_Code)
    # The race and income tables carry their own 'County Name'; drop the
    # duplicates so the name appears once.
    race = Get_Counties_Pop19_Race(State_Code).drop('County Name', axis = 1)
    econ = Get_Counties_Econ(State_Code).drop('County Name', axis = 1)
    Counties_demo = pd.concat([pvi, age, race, econ], axis = 1)

    count_cols = Counties_demo.loc[:, 'Under 5 years' : 'Hispanic or Latino'].columns
    share_cols = 'Percent ' + count_cols

    shares = Counties_demo[count_cols].div(Counties_demo['Total Population'], axis = 0)
    Counties_demo[share_cols] = shares

    return Counties_demo