In [132]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from jqi_functions import *

## Load Data

In [139]:
county_info = pd.read_csv('data/county_to_regions_key.csv')

In [3]:
ipums_2019 = cleaned_ipums_demo('2019')
ipums_2018 = cleaned_ipums_demo('2018')
ipums_2017 = cleaned_ipums_demo('2017')
ipums_2016 = cleaned_ipums_demo('2016')
ipums_2015 = cleaned_ipums_demo('2015')

In [134]:
%run jqi_functions
ipums_2020 = cleaned_ipums_demo('2020')

In [4]:
col_2019 = pd.read_csv('data/cost_of_living/united-way-col-1A1PS1C2019.csv')
col_2018 = pd.read_csv('data/cost_of_living/united-way-col-1A1PS1C2018.csv')
col_2017 = pd.read_csv('data/cost_of_living/united-way-col-1A1PS1C2017.csv')
col_2016 = pd.read_csv('data/cost_of_living/united-way-col-1A1PS1C2016.csv')
col_2015 = pd.read_csv('data/cost_of_living/united-way-col-1A1PS1C2015.csv')

In [5]:
def merge_ipums_col(df, col, county_info):
    df = pd.merge(df, county_info, on = 'COUNTYFIP')
    df = df[['INDNAICS', 'PERWT', 'INCWAGE', 'RACE', 'HISPAN',
           'NAICS Code', 'Industry Title_x', 'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code', 'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions']]
    df = pd.merge(df, col, left_on = 'CDI Regions', right_on = 'Regions')
    df = df.rename(columns = {'Cost of Living':'Regional COL'})
    df = pd.merge(df, col, left_on = 'Rural/Urban', right_on = 'Regions')
    df = df.rename(columns = {'Cost of Living':'Rural/Urban COL'})
    df = pd.merge(df, col, left_on = 'County', right_on = 'Regions')
    df = df.rename(columns = {'Cost of Living':'County COL'})
    df = df[['INDNAICS', 'PERWT', 'INCWAGE', 'RACE', 'HISPAN',
           'NAICS Code', 'Industry Title_x', 'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code', 'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions',
                        'Regional COL', 'Rural/Urban COL', 'County COL']]
    df['Regional Rural/Urban'] = df['CDI Regions'] + ' ' + df['Rural/Urban']
    df = pd.merge(df, col, left_on = 'Regional Rural/Urban', right_on = 'Regions')
    df = df.rename(columns = {'Cost of Living':'Regional Rural/Urban COL'})
    df = df.rename(columns = {'Industry Title_x':'Industry Title'})
    df = df[['INDNAICS', 'PERWT', 'INCWAGE', 'RACE', 'HISPAN',
           'NAICS Code', 'Industry Title', 'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code',
           'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions', 'Regional Rural/Urban',
                        'Regional COL', 'Rural/Urban COL', 'County COL', 'Regional Rural/Urban COL']]
    df['Industry Title'] = normalize_titles(df['Industry Title'])
    return df

In [6]:
ipums_2019 = merge_ipums_col(ipums_2019, col_2019, county_info)
ipums_2018 = merge_ipums_col(ipums_2018, col_2018, county_info)
ipums_2017 = merge_ipums_col(ipums_2017, col_2017, county_info)
ipums_2016 = merge_ipums_col(ipums_2016, col_2016, county_info)
ipums_2015 = merge_ipums_col(ipums_2015, col_2015, county_info)

In [140]:
ipums_2020 = merge_ipums_col(ipums_2020, col_2019, county_info)

In [7]:
ca_ipums_5year = [ipums_2019, ipums_2018, ipums_2017, ipums_2016, ipums_2015]
col_5year = [col_2019, col_2018, col_2017, col_2016, col_2015]
years = ['2019', '2018', '2017', '2016', '2015']

In [8]:
naics = pd.read_csv('data/naics_parsed_crosswalk.csv').drop_duplicates(subset='INDNAICS').reset_index().iloc[:,1:]
naics['Industry Title'] = normalize_titles(naics['Industry Title'])
naics['Sub_1_Code'] = [str(x) for x in naics['Sub_1_Code']]
naics['Main_Code'] = [str(x) for x in naics['Main_Code']]

## County Lookup Dataframe

In [9]:
county_info = county_info[['County', 'Rural/Urban', 'CDI Regions']]
county_info['Regional Rural/Urban'] = county_info['CDI Regions'] + ' ' + county_info['Rural/Urban']
for col, year in zip(col_5year, years):
    county_info = pd.merge(county_info, col, left_on = 'County', right_on = 'Regions')

    county_info = county_info.rename(columns = {'Cost of Living':f'County COL {year}'})
    county_info = county_info.drop(columns=['Regions'])

    county_info = pd.merge(county_info, col, left_on = 'Regional Rural/Urban', right_on = 'Regions')

    county_info = county_info.rename(columns = {'Cost of Living':f'Regional Rural/Urban COL {year}'})
    county_info = county_info.drop(columns=['Regions'])

    county_info = pd.merge(county_info, col, left_on = 'CDI Regions', right_on = 'Regions')

    county_info = county_info.rename(columns = {'Cost of Living':f'Regional COL {year}'})
    county_info = county_info.drop(columns=['Regions'])

    county_info = pd.merge(county_info, col, left_on = 'Rural/Urban', right_on = 'Regions')

    county_info = county_info.rename(columns = {'Cost of Living':f'Rural/Urban COL {year}'})
    county_info = county_info.drop(columns=['Regions'])

    county_info[f'State COL {year}'] = col.iloc[11][1]

In [10]:
county_info = county_info[['County', 'Rural/Urban', 'CDI Regions', 'Regional Rural/Urban',
       'County COL 2019', 'Regional Rural/Urban COL 2019', 'Regional COL 2019',
       'Rural/Urban COL 2019', 'State COL 2019', 'County COL 2018',
       'Regional Rural/Urban COL 2018', 'Regional COL 2018',
       'Rural/Urban COL 2018', 'State COL 2018', 'County COL 2017',
       'Regional Rural/Urban COL 2017', 'Regional COL 2017',
       'Rural/Urban COL 2017', 'State COL 2017', 'County COL 2016',
       'Regional Rural/Urban COL 2016',
       'Regional COL 2016', 'Rural/Urban COL 2016',
       'State COL 2016', 'County COL 2015',
       'Regional Rural/Urban COL 2015', 'Regional COL 2015',
       'Rural/Urban COL 2015', 'State COL 2015']]

## EDD Data

In [11]:
edd_2019 = pd.read_csv('data/edd_2019_parsed.csv')
edd_2018 = pd.read_csv('data/edd_2018_parsed.csv')
edd_2017 = pd.read_csv('data/edd_2017_parsed.csv')
edd_2016 = pd.read_csv('data/edd_2016_parsed.csv')
edd_2015 = pd.read_csv('data/edd_2015_parsed.csv')

In [143]:
edd_2020 = pd.read_csv('data/edd/edd_2020_parsed.csv')

In [12]:
edd_dfs = [edd_2019, edd_2018, edd_2017, edd_2016, edd_2015]

In [13]:
def clean_edd(edd):
    edd['Area Name'] = edd['Area Name'].str.replace(' County', '')
    edd = edd.loc[edd['Area Type'] == 'County']
    edd = edd.drop(columns=['Industry Title'])
    edd = edd.rename(columns={"LMID Industry Title": "Industry Title"})
    edd['Sub_1_Code'] = [str(x) for x in edd['Sub_1_Code']]
    edd['Main_Code'] = [str(x) for x in edd['Main_Code']]
    return edd

In [14]:
edd_2019 = clean_edd(edd_2019)
edd_2018 = clean_edd(edd_2018)
edd_2017 = clean_edd(edd_2017)
edd_2016 = clean_edd(edd_2016)
edd_2015 = clean_edd(edd_2015)

In [144]:
edd_2020 = clean_edd(edd_2020)

## Breakdown dataframes by race

In [16]:
def append_race_ratio(df, wt_counts):
    df['race_ratio'] = 0
    for title in df['Industry Title'].unique():
        perwt = df['PERWT'][df['Industry Title'] == title].sum()
        df['race_ratio'][df['Industry Title'] == title] = perwt / wt_counts[title]
    return df

****2020****

In [145]:
ca_ipums_latino_2020 = ipums_2020.loc[ipums_2020['HISPAN'] != 0]
ca_ipums_no_latino_2020 = ipums_2020.loc[ipums_2020['HISPAN'] == 0]
ca_ipums_white_2020 = ca_ipums_no_latino_2020.loc[ca_ipums_no_latino_2020['RACE'] == 1]
ca_ipums_black_2020 = ca_ipums_no_latino_2020.loc[ca_ipums_no_latino_2020['RACE'] == 2]
ca_ipums_native_2020 = ca_ipums_no_latino_2020.loc[ca_ipums_no_latino_2020['RACE'] == 3]
ca_ipums_asian_2020 = ca_ipums_no_latino_2020.loc[(ca_ipums_no_latino_2020['RACE'] == 4) |
                                   (ca_ipums_no_latino_2020['RACE'] == 5) |
                                   (ca_ipums_no_latino_2020['RACE'] == 6)]
ca_ipums_other_2020 = ca_ipums_no_latino_2020.loc[ca_ipums_no_latino_2020['RACE'] == 7]
ca_ipums_multi_2020 = ca_ipums_no_latino_2020.loc[(ca_ipums_no_latino_2020['RACE'] == 8) |
                                   (ca_ipums_no_latino_2020['RACE'] == 9)]

In [146]:
wt_counts = {}
for title in ipums_2020['Industry Title'].unique():
    title_copy = ipums_2020.loc[ipums_2020['Industry Title'] == title].copy()
    wt_counts[title] = title_copy['PERWT'].sum()

In [149]:
ca_ipums_white_2020 = append_race_ratio(ca_ipums_white_2020, wt_counts)
ca_ipums_latino_2020 = append_race_ratio(ca_ipums_latino_2020, wt_counts)
ca_ipums_black_2020 = append_race_ratio(ca_ipums_black_2020, wt_counts)
ca_ipums_native_2020 = append_race_ratio(ca_ipums_native_2020, wt_counts)
ca_ipums_asian_2020 = append_race_ratio(ca_ipums_asian_2020, wt_counts)
ca_ipums_other_2020 = append_race_ratio(ca_ipums_other_2020, wt_counts)
ca_ipums_multi_2020 = append_race_ratio(ca_ipums_multi_2020, wt_counts)

In [150]:
ca_ipums_hw_white_2020 = add_geo_high_wages(ca_ipums_white_2020)
ca_ipums_hw_latino_2020 = add_geo_high_wages(ca_ipums_latino_2020)
ca_ipums_hw_black_2020 = add_geo_high_wages(ca_ipums_black_2020)
ca_ipums_hw_native_2020 = add_geo_high_wages(ca_ipums_native_2020)
ca_ipums_hw_asian_2020 = add_geo_high_wages(ca_ipums_asian_2020)
ca_ipums_hw_other_2020 = add_geo_high_wages(ca_ipums_other_2020)
ca_ipums_hw_multi_2020 = add_geo_high_wages(ca_ipums_multi_2020)

In [151]:
race_ipums_dfs_2020 = [ca_ipums_hw_white_2020, 
                  ca_ipums_hw_latino_2020, 
                  ca_ipums_hw_black_2020, 
                  ca_ipums_hw_native_2020, 
                  ca_ipums_hw_asian_2020, 
                  ca_ipums_hw_other_2020, 
                  ca_ipums_hw_multi_2020]

In [152]:
for df in race_ipums_dfs_2020:
    df['Sub_1_Code'] = [str(x) for x in df['Sub_1_Code']]
    df['Main_Code'] = [str(x) for x in df['Main_Code']]

**2019**

In [15]:
ca_ipums_latino_2019 = ipums_2019.loc[ipums_2019['HISPAN'] != 0]
ca_ipums_no_latino_2019 = ipums_2019.loc[ipums_2019['HISPAN'] == 0]
ca_ipums_white_2019 = ca_ipums_no_latino_2019.loc[ca_ipums_no_latino_2019['RACE'] == 1]
ca_ipums_black_2019 = ca_ipums_no_latino_2019.loc[ca_ipums_no_latino_2019['RACE'] == 2]
ca_ipums_native_2019 = ca_ipums_no_latino_2019.loc[ca_ipums_no_latino_2019['RACE'] == 3]
ca_ipums_asian_2019 = ca_ipums_no_latino_2019.loc[(ca_ipums_no_latino_2019['RACE'] == 4) |
                                   (ca_ipums_no_latino_2019['RACE'] == 5) |
                                   (ca_ipums_no_latino_2019['RACE'] == 6)]
ca_ipums_other_2019 = ca_ipums_no_latino_2019.loc[ca_ipums_no_latino_2019['RACE'] == 7]
ca_ipums_multi_2019 = ca_ipums_no_latino_2019.loc[(ca_ipums_no_latino_2019['RACE'] == 8) |
                                   (ca_ipums_no_latino_2019['RACE'] == 9)]

In [17]:
wt_counts = {}
for title in ipums_2019['Industry Title'].unique():
    title_copy = ipums_2019.loc[ipums_2019['Industry Title'] == title].copy()
    wt_counts[title] = title_copy['PERWT'].sum()

In [18]:
ca_ipums_white_2019 = append_race_ratio(ca_ipums_white_2019, wt_counts)
ca_ipums_latino_2019 = append_race_ratio(ca_ipums_latino_2019, wt_counts)
ca_ipums_black_2019 = append_race_ratio(ca_ipums_black_2019, wt_counts)
ca_ipums_native_2019 = append_race_ratio(ca_ipums_native_2019, wt_counts)
ca_ipums_asian_2019 = append_race_ratio(ca_ipums_asian_2019, wt_counts)
ca_ipums_other_2019 = append_race_ratio(ca_ipums_other_2019, wt_counts)
ca_ipums_multi_2019 = append_race_ratio(ca_ipums_multi_2019, wt_counts)

In [37]:
ca_ipums_hw_white_2019 = add_geo_high_wages(ca_ipums_white_2019)
ca_ipums_hw_latino_2019 = add_geo_high_wages(ca_ipums_latino_2019)
ca_ipums_hw_black_2019 = add_geo_high_wages(ca_ipums_black_2019)
ca_ipums_hw_native_2019 = add_geo_high_wages(ca_ipums_native_2019)
ca_ipums_hw_asian_2019 = add_geo_high_wages(ca_ipums_asian_2019)
ca_ipums_hw_other_2019 = add_geo_high_wages(ca_ipums_other_2019)
ca_ipums_hw_multi_2019 = add_geo_high_wages(ca_ipums_multi_2019)

In [38]:
race_ipums_dfs_2019 = [ca_ipums_hw_white_2019, 
                  ca_ipums_hw_latino_2019, 
                  ca_ipums_hw_black_2019, 
                  ca_ipums_hw_native_2019, 
                  ca_ipums_hw_asian_2019, 
                  ca_ipums_hw_other_2019, 
                  ca_ipums_hw_multi_2019]

In [39]:
for df in race_ipums_dfs_2019:
    df['Sub_1_Code'] = [str(x) for x in df['Sub_1_Code']]
    df['Main_Code'] = [str(x) for x in df['Main_Code']]

**2018**

In [31]:
ca_ipums_latino_2018 = ipums_2018.loc[ipums_2018['HISPAN'] != 0]
ca_ipums_no_latino_2018 = ipums_2018.loc[ipums_2018['HISPAN'] == 0]
ca_ipums_white_2018 = ca_ipums_no_latino_2018.loc[ca_ipums_no_latino_2018['RACE'] == 1]
ca_ipums_black_2018 = ca_ipums_no_latino_2018.loc[ca_ipums_no_latino_2018['RACE'] == 2]
ca_ipums_native_2018 = ca_ipums_no_latino_2018.loc[ca_ipums_no_latino_2018['RACE'] == 3]
ca_ipums_asian_2018 = ca_ipums_no_latino_2018.loc[(ca_ipums_no_latino_2018['RACE'] == 4) |
                                   (ca_ipums_no_latino_2018['RACE'] == 5) |
                                   (ca_ipums_no_latino_2018['RACE'] == 6)]
ca_ipums_other_2018 = ca_ipums_no_latino_2018.loc[ca_ipums_no_latino_2018['RACE'] == 7]
ca_ipums_multi_2018 = ca_ipums_no_latino_2018.loc[(ca_ipums_no_latino_2018['RACE'] == 8) |
                                   (ca_ipums_no_latino_2018['RACE'] == 9)]

In [32]:
wt_counts = {}
for title in ipums_2018['Industry Title'].unique():
    title_copy = ipums_2018.loc[ipums_2018['Industry Title'] == title].copy()
    wt_counts[title] = title_copy['PERWT'].sum()

In [33]:
ca_ipums_white_2018 = append_race_ratio(ca_ipums_white_2018, wt_counts)
ca_ipums_latino_2018 = append_race_ratio(ca_ipums_latino_2018, wt_counts)
ca_ipums_black_2018 = append_race_ratio(ca_ipums_black_2018, wt_counts)
ca_ipums_native_2018 = append_race_ratio(ca_ipums_native_2018, wt_counts)
ca_ipums_asian_2018 = append_race_ratio(ca_ipums_asian_2018, wt_counts)
ca_ipums_other_2018 = append_race_ratio(ca_ipums_other_2018, wt_counts)
ca_ipums_multi_2018 = append_race_ratio(ca_ipums_multi_2018, wt_counts)

In [40]:
ca_ipums_hw_white_2018 = add_geo_high_wages(ca_ipums_white_2018)
ca_ipums_hw_latino_2018 = add_geo_high_wages(ca_ipums_latino_2018)
ca_ipums_hw_black_2018 = add_geo_high_wages(ca_ipums_black_2018)
ca_ipums_hw_native_2018 = add_geo_high_wages(ca_ipums_native_2018)
ca_ipums_hw_asian_2018 = add_geo_high_wages(ca_ipums_asian_2018)
ca_ipums_hw_other_2018 = add_geo_high_wages(ca_ipums_other_2018)
ca_ipums_hw_multi_2018 = add_geo_high_wages(ca_ipums_multi_2018)

In [41]:
race_ipums_dfs_2018 = [ca_ipums_hw_white_2018, 
                  ca_ipums_hw_latino_2018, 
                  ca_ipums_hw_black_2018, 
                  ca_ipums_hw_native_2018, 
                  ca_ipums_hw_asian_2018, 
                  ca_ipums_hw_other_2018, 
                  ca_ipums_hw_multi_2018]

In [42]:
for df in race_ipums_dfs_2018:
    df['Sub_1_Code'] = [str(x) for x in df['Sub_1_Code']]
    df['Main_Code'] = [str(x) for x in df['Main_Code']]

**2017**

In [22]:
ca_ipums_latino_2017 = ipums_2017.loc[ipums_2017['HISPAN'] != 0]
ca_ipums_no_latino_2017 = ipums_2017.loc[ipums_2017['HISPAN'] == 0]
ca_ipums_white_2017 = ca_ipums_no_latino_2017.loc[ca_ipums_no_latino_2017['RACE'] == 1]
ca_ipums_black_2017 = ca_ipums_no_latino_2017.loc[ca_ipums_no_latino_2017['RACE'] == 2]
ca_ipums_native_2017 = ca_ipums_no_latino_2017.loc[ca_ipums_no_latino_2017['RACE'] == 3]
ca_ipums_asian_2017 = ca_ipums_no_latino_2017.loc[(ca_ipums_no_latino_2017['RACE'] == 4) |
                                   (ca_ipums_no_latino_2017['RACE'] == 5) |
                                   (ca_ipums_no_latino_2017['RACE'] == 6)]
ca_ipums_other_2017 = ca_ipums_no_latino_2017.loc[ca_ipums_no_latino_2017['RACE'] == 7]
ca_ipums_multi_2017 = ca_ipums_no_latino_2017.loc[(ca_ipums_no_latino_2017['RACE'] == 8) |
                                   (ca_ipums_no_latino_2017['RACE'] == 9)]

In [23]:
wt_counts = {}
for title in ipums_2017['Industry Title'].unique():
    title_copy = ipums_2017.loc[ipums_2017['Industry Title'] == title].copy()
    wt_counts[title] = title_copy['PERWT'].sum()

In [24]:
ca_ipums_white_2017 = append_race_ratio(ca_ipums_white_2017, wt_counts)
ca_ipums_latino_2017 = append_race_ratio(ca_ipums_latino_2017, wt_counts)
ca_ipums_black_2017 = append_race_ratio(ca_ipums_black_2017, wt_counts)
ca_ipums_native_2017 = append_race_ratio(ca_ipums_native_2017, wt_counts)
ca_ipums_asian_2017 = append_race_ratio(ca_ipums_asian_2017, wt_counts)
ca_ipums_other_2017 = append_race_ratio(ca_ipums_other_2017, wt_counts)
ca_ipums_multi_2017 = append_race_ratio(ca_ipums_multi_2017, wt_counts)

In [43]:
ca_ipums_hw_white_2017 = add_geo_high_wages(ca_ipums_white_2017)
ca_ipums_hw_latino_2017 = add_geo_high_wages(ca_ipums_latino_2017)
ca_ipums_hw_black_2017 = add_geo_high_wages(ca_ipums_black_2017)
ca_ipums_hw_native_2017 = add_geo_high_wages(ca_ipums_native_2017)
ca_ipums_hw_asian_2017 = add_geo_high_wages(ca_ipums_asian_2017)
ca_ipums_hw_other_2017 = add_geo_high_wages(ca_ipums_other_2017)
ca_ipums_hw_multi_2017 = add_geo_high_wages(ca_ipums_multi_2017)

In [44]:
race_ipums_dfs_2017 = [ca_ipums_hw_white_2017, 
                  ca_ipums_hw_latino_2017, 
                  ca_ipums_hw_black_2017, 
                  ca_ipums_hw_native_2017, 
                  ca_ipums_hw_asian_2017, 
                  ca_ipums_hw_other_2017, 
                  ca_ipums_hw_multi_2017]

In [45]:
for df in race_ipums_dfs_2017:
    df['Sub_1_Code'] = [str(x) for x in df['Sub_1_Code']]
    df['Main_Code'] = [str(x) for x in df['Main_Code']]

**2016**

In [25]:
ca_ipums_latino_2016 = ipums_2016.loc[ipums_2016['HISPAN'] != 0]
ca_ipums_no_latino_2016 = ipums_2016.loc[ipums_2016['HISPAN'] == 0]
ca_ipums_white_2016 = ca_ipums_no_latino_2016.loc[ca_ipums_no_latino_2016['RACE'] == 1]
ca_ipums_black_2016 = ca_ipums_no_latino_2016.loc[ca_ipums_no_latino_2016['RACE'] == 2]
ca_ipums_native_2016 = ca_ipums_no_latino_2016.loc[ca_ipums_no_latino_2016['RACE'] == 3]
ca_ipums_asian_2016 = ca_ipums_no_latino_2016.loc[(ca_ipums_no_latino_2016['RACE'] == 4) |
                                   (ca_ipums_no_latino_2016['RACE'] == 5) |
                                   (ca_ipums_no_latino_2016['RACE'] == 6)]
ca_ipums_other_2016 = ca_ipums_no_latino_2016.loc[ca_ipums_no_latino_2016['RACE'] == 7]
ca_ipums_multi_2016 = ca_ipums_no_latino_2016.loc[(ca_ipums_no_latino_2016['RACE'] == 8) |
                                   (ca_ipums_no_latino_2016['RACE'] == 9)]

In [26]:
wt_counts = {}
for title in ipums_2016['Industry Title'].unique():
    title_copy = ipums_2016.loc[ipums_2016['Industry Title'] == title].copy()
    wt_counts[title] = title_copy['PERWT'].sum()

In [27]:
ca_ipums_white_2016 = append_race_ratio(ca_ipums_white_2016, wt_counts)
ca_ipums_latino_2016 = append_race_ratio(ca_ipums_latino_2016, wt_counts)
ca_ipums_black_2016 = append_race_ratio(ca_ipums_black_2016, wt_counts)
ca_ipums_native_2016 = append_race_ratio(ca_ipums_native_2016, wt_counts)
ca_ipums_asian_2016 = append_race_ratio(ca_ipums_asian_2016, wt_counts)
ca_ipums_other_2016 = append_race_ratio(ca_ipums_other_2016, wt_counts)
ca_ipums_multi_2016 = append_race_ratio(ca_ipums_multi_2016, wt_counts)

In [46]:
ca_ipums_hw_white_2016 = add_geo_high_wages(ca_ipums_white_2016)
ca_ipums_hw_latino_2016 = add_geo_high_wages(ca_ipums_latino_2016)
ca_ipums_hw_black_2016 = add_geo_high_wages(ca_ipums_black_2016)
ca_ipums_hw_native_2016 = add_geo_high_wages(ca_ipums_native_2016)
ca_ipums_hw_asian_2016 = add_geo_high_wages(ca_ipums_asian_2016)
ca_ipums_hw_other_2016 = add_geo_high_wages(ca_ipums_other_2016)
ca_ipums_hw_multi_2016 = add_geo_high_wages(ca_ipums_multi_2016)

In [47]:
race_ipums_dfs_2016 = [ca_ipums_hw_white_2016, 
                  ca_ipums_hw_latino_2016, 
                  ca_ipums_hw_black_2016, 
                  ca_ipums_hw_native_2016, 
                  ca_ipums_hw_asian_2016, 
                  ca_ipums_hw_other_2016, 
                  ca_ipums_hw_multi_2016]

In [48]:
for df in race_ipums_dfs_2016:
    df['Sub_1_Code'] = [str(x) for x in df['Sub_1_Code']]
    df['Main_Code'] = [str(x) for x in df['Main_Code']]

**2015**

In [34]:
ca_ipums_latino_2015 = ipums_2015.loc[ipums_2015['HISPAN'] != 0]
ca_ipums_no_latino_2015 = ipums_2015.loc[ipums_2015['HISPAN'] == 0]
ca_ipums_white_2015 = ca_ipums_no_latino_2015.loc[ca_ipums_no_latino_2015['RACE'] == 1]
ca_ipums_black_2015 = ca_ipums_no_latino_2015.loc[ca_ipums_no_latino_2015['RACE'] == 2]
ca_ipums_native_2015 = ca_ipums_no_latino_2015.loc[ca_ipums_no_latino_2015['RACE'] == 3]
ca_ipums_asian_2015 = ca_ipums_no_latino_2015.loc[(ca_ipums_no_latino_2015['RACE'] == 4) |
                                   (ca_ipums_no_latino_2015['RACE'] == 5) |
                                   (ca_ipums_no_latino_2015['RACE'] == 6)]
ca_ipums_other_2015 = ca_ipums_no_latino_2015.loc[ca_ipums_no_latino_2015['RACE'] == 7]
ca_ipums_multi_2015 = ca_ipums_no_latino_2015.loc[(ca_ipums_no_latino_2015['RACE'] == 8) |
                                   (ca_ipums_no_latino_2015['RACE'] == 9)]

In [35]:
wt_counts = {}
for title in ipums_2015['Industry Title'].unique():
    title_copy = ipums_2015.loc[ipums_2015['Industry Title'] == title].copy()
    wt_counts[title] = title_copy['PERWT'].sum()

In [36]:
ca_ipums_white_2015 = append_race_ratio(ca_ipums_white_2015, wt_counts)
ca_ipums_latino_2015 = append_race_ratio(ca_ipums_latino_2015, wt_counts)
ca_ipums_black_2015 = append_race_ratio(ca_ipums_black_2015, wt_counts)
ca_ipums_native_2015 = append_race_ratio(ca_ipums_native_2015, wt_counts)
ca_ipums_asian_2015 = append_race_ratio(ca_ipums_asian_2015, wt_counts)
ca_ipums_other_2015 = append_race_ratio(ca_ipums_other_2015, wt_counts)
ca_ipums_multi_2015 = append_race_ratio(ca_ipums_multi_2015, wt_counts)

In [49]:
ca_ipums_hw_white_2015 = add_geo_high_wages(ca_ipums_white_2015)
ca_ipums_hw_latino_2015 = add_geo_high_wages(ca_ipums_latino_2015)
ca_ipums_hw_black_2015 = add_geo_high_wages(ca_ipums_black_2015)
ca_ipums_hw_native_2015 = add_geo_high_wages(ca_ipums_native_2015)
ca_ipums_hw_asian_2015 = add_geo_high_wages(ca_ipums_asian_2015)
ca_ipums_hw_other_2015 = add_geo_high_wages(ca_ipums_other_2015)
ca_ipums_hw_multi_2015 = add_geo_high_wages(ca_ipums_multi_2015)

In [50]:
race_ipums_dfs_2015 = [ca_ipums_hw_white_2015, 
                  ca_ipums_hw_latino_2015, 
                  ca_ipums_hw_black_2015, 
                  ca_ipums_hw_native_2015, 
                  ca_ipums_hw_asian_2015, 
                  ca_ipums_hw_other_2015, 
                  ca_ipums_hw_multi_2015]

In [51]:
for df in race_ipums_dfs_2015:
    df['Sub_1_Code'] = [str(x) for x in df['Sub_1_Code']]
    df['Main_Code'] = [str(x) for x in df['Main_Code']]

### High Wage Output Functions

In [147]:
def edd_to_hw(edd_df, ipums_df_hw, naics_df, county_df, county: str, parsed_code: str, date: str, sample_size: int):
    # filter edd by date, edd county, and industry via parsed code
    edd_df = edd_df.loc[edd_df['Date'] == date].copy()
    if len(edd_df) == 0:
        return "Date not valid or found", np.nan, np.nan, np.nan
    edd_df = edd_df.loc[(edd_df['Sub_4_Code'] == parsed_code) | 
                        (edd_df['Sub_3_Code'] == parsed_code) | 
                        (edd_df['Sub_2_Code'] == parsed_code) | 
                        (edd_df['Sub_1_Code'] == parsed_code) | 
                        (edd_df['Main_Code'] == parsed_code)].copy()
    edd_df = edd_df.loc[edd_df['Area Name'] == county] # this is possible because all counties are in EDD data
    edd_df = edd_df.drop_duplicates(subset='Main_EDD').reset_index().iloc[:,1:]
    
    # merge naics with edd
    edd_df_ = pd.merge(edd_df, naics_df, on='Sub_4_Code')
    if len(edd_df_) == 0:
        edd_df_ = pd.merge(edd_df, naics_df, on='Sub_3_Code')
        if len(edd_df_) == 0:
            edd_df_ = pd.merge(edd_df, naics_df, on='Sub_2_Code')
            if len(edd_df_) == 0:
                edd_df_ = pd.merge(edd_df, naics_df, on='Sub_1_Code')
                if len(edd_df_) == 0:
                    edd_df_ = pd.merge(edd_df, naics_df, on='Main_Code')
                    if len(edd_df_) == 0:
                        return "No parsed code of input industry found within input county", np.nan, np.nan, np.nan
    edd_df = edd_df_.rename(columns = {'Industry Title_x':'EDD Industry',
                                 'Industry Title_y': 'IPUMS Industry'})
    edd_df = edd_df[['EDD Industry', 'Area Name', 'IPUMS Industry', 'INDNAICS', 'Current Employment']]
    employment_count = int(edd_df['Current Employment'].values[0])
    naics_code = edd_df['INDNAICS'].values[0]
    output, hw_perc, industry, ratio = ca_ipums_filter(ipums_df_hw, county_df, county, naics_code, sample_size)
    race_ind_count = employment_count * ratio
    hw_count = (race_ind_count * hw_perc) / 100
    output += f", High wage count: {hw_count}"
    return output, hw_count, industry, race_ind_count

In [148]:
def ca_ipums_filter(df, county_df, county: str, NAICS: str, n: int):
    df = df.loc[df['County'] == county].copy()
    level = 'county'
    if len(df) == 0:
        county_df = county_df.loc[county_df['County'] == county].copy() # narrow down county df
        reg_rural_urban = county_df['Regional Rural/Urban'].values[0]
        region = county_df['CDI Regions'].values[0]
        rural_urban = county_df['Rural/Urban'].values[0]
        df = df.loc[df['Regional Rural/Urban'] == reg_rural_urban].copy()
        level = 'reg_rural_urban'
        if len(df) == 0:
            df = df.loc[df['CDI Regions'] == region].copy()
            level = 'region'
            if len(df) == 0:
                df = df.loc[df['Rural/Urban'] == rural_urban].copy()
                level = 'rural_urban'
                if len(df) == 0:
                    level = 'state'
    
    df = df.loc[df['INDNAICS'] == NAICS].copy()
    try:
        industry = df['Industry Title'].values[0]
    except:
        return "Industry not valid or found", np.nan, np.nan, np.nan
    if len(df) == 0:
        return "NAICS Code not valid or found", np.nan, np.nan, np.nan
    df = df.drop_duplicates(subset='INDNAICS').reset_index().iloc[:,1:]
    
    # incorporate tracked levels for conditions
    if level == 'county':
        if df['unwt_county_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: County, Industry: {industry}, High wage percentage: {df['wt_county_hw_perc'].values[0]}", df['wt_county_hw_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_regcomm_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Regional Rural/Urban, Industry: {industry}, High wage percentage: {df['wt_regcomm_hw_perc'].values[0]}", df['wt_regcomm_hw_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_reg_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Regional, Industry: {industry}, High wage percentage: {df['wt_reg_high_wage_perc'].values[0]}", df['wt_reg_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_comm_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Rural/Urban, Industry: {industry}, High wage percentage: {df['wt_comm_high_wage_perc'].values[0]}", df['wt_comm_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: State, Industry: {industry}, High wage percentage: {df['wt_CA_high_wage_perc'].values[0]}", df['wt_CA_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        else:
            return "Not enough records available to satisfy sample size request", np.nan, np.nan, np.nan
        
    elif level == 'reg_rural_urban':
        if df['unwt_regcomm_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Regional Rural/Urban, Industry: {industry}, High wage percentage: {df['wt_regcomm_hw_perc'].values[0]}", df['wt_regcomm_hw_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_reg_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Regional, Industry: {industry}, High wage percentage: {df['wt_reg_high_wage_perc'].values[0]}", df['wt_reg_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_comm_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Rural/Urban, Industry: {industry}, High wage percentage: {df['wt_comm_high_wage_perc'].values[0]}", df['wt_comm_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: State, Industry: {industry}, High wage percentage: {df['wt_CA_high_wage_perc'].values[0]}", df['wt_CA_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        else:
            return "Not enough records available to satisfy sample size request", np.nan, np.nan, np.nan
        
    elif level == 'region':
        if df['unwt_reg_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Regional, Industry: {industry}, High wage percentage: {df['wt_reg_high_wage_perc'].values[0]}", df['wt_reg_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_comm_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Rural/Urban, Industry: {industry}, High wage percentage: {df['wt_comm_high_wage_perc'].values[0]}", df['wt_comm_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: State, Industry: {industry}, High wage percentage: {df['wt_CA_high_wage_perc'].values[0]}", df['wt_CA_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        else:
            return "Not enough records available to satisfy sample size request", np.nan, np.nan, np.nan
        
    elif level == 'rural_urban':
        if df['unwt_comm_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Rural/Urban, Industry: {industry}, High wage percentage: {df['wt_comm_high_wage_perc'].values[0]}", df['wt_comm_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: State, Industry: {industry}, High wage percentage: {df['wt_CA_high_wage_perc'].values[0]}", df['wt_CA_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        else:
            return "Not enough records available to satisfy sample size request", np.nan, np.nan, np.nan
        
    elif level == 'state':
        if df['unwt_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: State, Industry: {industry}, High wage percentage: {df['wt_CA_high_wage_perc'].values[0]}", df['wt_CA_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        else:
            return "Not enough records available to satisfy sample size request", np.nan, np.nan, np.nan

### High Wage Outputs - 2020

In [153]:
counties_edd = edd_2020['Area Name'].unique()
parsed_codes = set(list(edd_2020['Main_Code'].unique()) + 
                   list(edd_2020['Sub_1_Code'].unique()) + 
                   list(edd_2020['Sub_2_Code'].unique()) + 
                   list(edd_2020['Sub_3_Code'].unique()) + 
                   list(edd_2020['Sub_4_Code'].unique()))
dates_edd = edd_2020['Date'].unique()
total_iterations = len(counties_edd) * len(parsed_codes) * len(dates_edd)

White

In [154]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2020, ca_ipums_hw_white_2020, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

KeyError: 'Regional Rural/Urban'

In [None]:
df_dict_white_2020 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_white_2020 = pd.DataFrame(df_dict_white_2020)
tidy_df_white_2020 = tidy_df_white_2020[tidy_df_white_2020['Industry'].notna()]
tidy_df_white_2020['Date']= pd.to_datetime(tidy_df_white_2020['Date'])
tidy_df_white_2020['High Wage Count'] = tidy_df_white_2020['High Wage Count'].astype(int)
tidy_df_white_2020 = tidy_df_white_2020.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_white_2020['Output Race'] = 'White'

Latino

In [None]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2020, ca_ipums_hw_latino_2020, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

In [None]:
df_dict_latino_2020 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_latino_2020 = pd.DataFrame(df_dict_latino_2020)
tidy_df_latino_2020 = tidy_df_latino_2020[tidy_df_latino_2020['Industry'].notna()]
tidy_df_latino_2020['Date']= pd.to_datetime(tidy_df_latino_2020['Date'])
tidy_df_latino_2020['High Wage Count'] = tidy_df_latino_2020['High Wage Count'].astype(int)
tidy_df_latino_2020 = tidy_df_latino_2020.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_latino_2020['Output Race'] = 'Latino'

Black

In [None]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2020, ca_ipums_hw_black_2020, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

In [None]:
df_dict_black_2020 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_black_2020 = pd.DataFrame(df_dict_black_2020)
tidy_df_black_2020 = tidy_df_black_2020[tidy_df_black_2020['Industry'].notna()]
tidy_df_black_2020['Date']= pd.to_datetime(tidy_df_black_2020['Date'])
tidy_df_black_2020['High Wage Count'] = tidy_df_black_2020['High Wage Count'].astype(int)
tidy_df_black_2020 = tidy_df_black_2020.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_black_2020['Output Race'] = 'Black'

American Indian/Alaska Native

In [None]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2020, ca_ipums_hw_native_2020, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

In [155]:
df_dict_native_2020 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_native_2020 = pd.DataFrame(df_dict_native_2020)
tidy_df_native_2020 = tidy_df_native_2020[tidy_df_native_2020['Industry'].notna()]
tidy_df_native_2020['Date']= pd.to_datetime(tidy_df_native_2020['Date'])
tidy_df_native_2020['High Wage Count'] = tidy_df_native_2020['High Wage Count'].astype(int)
tidy_df_native_2020 = tidy_df_native_2020.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_native_2020['Output Race'] = 'American Indian/Alaska Native'

Asian

In [None]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2018, ca_ipums_hw_asian_2018, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

df_dict_asian_2018 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_asian_2018 = pd.DataFrame(df_dict_asian_2018)
tidy_df_asian_2018 = tidy_df_asian_2018[tidy_df_asian_2018['Industry'].notna()]
tidy_df_asian_2018['Date']= pd.to_datetime(tidy_df_asian_2018['Date'])
tidy_df_asian_2018['High Wage Count'] = tidy_df_asian_2018['High Wage Count'].astype(int)
tidy_df_asian_2018 = tidy_df_asian_2018.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_asian_2018['Output Race'] = 'Asian'

Other

industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2018, ca_ipums_hw_other_2018, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

df_dict_other_2018 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_other_2018 = pd.DataFrame(df_dict_other_2018)
tidy_df_other_2018 = tidy_df_other_2018[tidy_df_other_2018['Industry'].notna()]
tidy_df_other_2018['Date']= pd.to_datetime(tidy_df_other_2018['Date'])
tidy_df_other_2018['High Wage Count'] = tidy_df_other_2018['High Wage Count'].astype(int)
tidy_df_other_2018 = tidy_df_other_2018.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_other_2018['Output Race'] = 'Some other race'

Multi

industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2018, ca_ipums_hw_multi_2018, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

df_dict_multi_2018 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_multi_2018 = pd.DataFrame(df_dict_multi_2018)
tidy_df_multi_2018 = tidy_df_multi_2018[tidy_df_multi_2018['Industry'].notna()]
tidy_df_multi_2018['Date']= pd.to_datetime(tidy_df_multi_2018['Date'])
tidy_df_multi_2018['High Wage Count'] = tidy_df_multi_2018['High Wage Count'].astype(int)
tidy_df_multi_2018 = tidy_df_multi_2018.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_multi_2018['Output Race'] = 'Multiracial'

Concatenate

df_race_2018 = pd.concat([tidy_df_white_2018, 
                          tidy_df_black_2018, 
                          tidy_df_latino_2018, 
                          tidy_df_asian_2018, 
                          tidy_df_native_2018, 
                          tidy_df_other_2018, 
                          tidy_df_multi_2018], 
                         ignore_index=True)
df_race_2018 = pd.merge(df_race_2018, col_2018, left_on='County', right_on='Regions')
df_race_2018['Year'] = 2018
df_race_2018 = df_race_2018[['Industry', 'Date', 'County', 'High Wage Count', 'Cost of Living', 'Employment Count', 'Output Race', 'Year']]

df_race_2018.to_csv('hw_outputs_w_race_2018.csv', encoding='utf-8', index=False)

### High Wage Outputs - 2019

Previously computed in `jqi-race-breakdown`

### High Wage Outputs - 2018

In [54]:
counties_edd = edd_2018['Area Name'].unique()
parsed_codes = set(list(edd_2018['Main_Code'].unique()) + 
                   list(edd_2018['Sub_1_Code'].unique()) + 
                   list(edd_2018['Sub_2_Code'].unique()) + 
                   list(edd_2018['Sub_3_Code'].unique()) + 
                   list(edd_2018['Sub_4_Code'].unique()))
dates_edd = edd_2018['Date'].unique()
total_iterations = len(counties_edd) * len(parsed_codes) * len(dates_edd)

White

In [58]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2018, ca_ipums_hw_white_2018, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [59]:
df_dict_white_2018 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_white_2018 = pd.DataFrame(df_dict_white_2018)
tidy_df_white_2018 = tidy_df_white_2018[tidy_df_white_2018['Industry'].notna()]
tidy_df_white_2018['Date']= pd.to_datetime(tidy_df_white_2018['Date'])
tidy_df_white_2018['High Wage Count'] = tidy_df_white_2018['High Wage Count'].astype(int)
tidy_df_white_2018 = tidy_df_white_2018.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_white_2018['Output Race'] = 'White'

Latino

In [60]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2018, ca_ipums_hw_latino_2018, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [61]:
df_dict_latino_2018 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_latino_2018 = pd.DataFrame(df_dict_latino_2018)
tidy_df_latino_2018 = tidy_df_latino_2018[tidy_df_latino_2018['Industry'].notna()]
tidy_df_latino_2018['Date']= pd.to_datetime(tidy_df_latino_2018['Date'])
tidy_df_latino_2018['High Wage Count'] = tidy_df_latino_2018['High Wage Count'].astype(int)
tidy_df_latino_2018 = tidy_df_latino_2018.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_latino_2018['Output Race'] = 'Latino'

Black

In [62]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2018, ca_ipums_hw_black_2018, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [63]:
df_dict_black_2018 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_black_2018 = pd.DataFrame(df_dict_black_2018)
tidy_df_black_2018 = tidy_df_black_2018[tidy_df_black_2018['Industry'].notna()]
tidy_df_black_2018['Date']= pd.to_datetime(tidy_df_black_2018['Date'])
tidy_df_black_2018['High Wage Count'] = tidy_df_black_2018['High Wage Count'].astype(int)
tidy_df_black_2018 = tidy_df_black_2018.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_black_2018['Output Race'] = 'Black'

American Indian/Alaska Native

In [64]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2018, ca_ipums_hw_native_2018, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [65]:
df_dict_native_2018 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_native_2018 = pd.DataFrame(df_dict_native_2018)
tidy_df_native_2018 = tidy_df_native_2018[tidy_df_native_2018['Industry'].notna()]
tidy_df_native_2018['Date']= pd.to_datetime(tidy_df_native_2018['Date'])
tidy_df_native_2018['High Wage Count'] = tidy_df_native_2018['High Wage Count'].astype(int)
tidy_df_native_2018 = tidy_df_native_2018.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_native_2018['Output Race'] = 'American Indian/Alaska Native'

Asian

In [66]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2018, ca_ipums_hw_asian_2018, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [67]:
df_dict_asian_2018 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_asian_2018 = pd.DataFrame(df_dict_asian_2018)
tidy_df_asian_2018 = tidy_df_asian_2018[tidy_df_asian_2018['Industry'].notna()]
tidy_df_asian_2018['Date']= pd.to_datetime(tidy_df_asian_2018['Date'])
tidy_df_asian_2018['High Wage Count'] = tidy_df_asian_2018['High Wage Count'].astype(int)
tidy_df_asian_2018 = tidy_df_asian_2018.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_asian_2018['Output Race'] = 'Asian'

Other

In [68]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2018, ca_ipums_hw_other_2018, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [69]:
df_dict_other_2018 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_other_2018 = pd.DataFrame(df_dict_other_2018)
tidy_df_other_2018 = tidy_df_other_2018[tidy_df_other_2018['Industry'].notna()]
tidy_df_other_2018['Date']= pd.to_datetime(tidy_df_other_2018['Date'])
tidy_df_other_2018['High Wage Count'] = tidy_df_other_2018['High Wage Count'].astype(int)
tidy_df_other_2018 = tidy_df_other_2018.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_other_2018['Output Race'] = 'Some other race'

Multi

In [70]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2018, ca_ipums_hw_multi_2018, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [71]:
df_dict_multi_2018 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_multi_2018 = pd.DataFrame(df_dict_multi_2018)
tidy_df_multi_2018 = tidy_df_multi_2018[tidy_df_multi_2018['Industry'].notna()]
tidy_df_multi_2018['Date']= pd.to_datetime(tidy_df_multi_2018['Date'])
tidy_df_multi_2018['High Wage Count'] = tidy_df_multi_2018['High Wage Count'].astype(int)
tidy_df_multi_2018 = tidy_df_multi_2018.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_multi_2018['Output Race'] = 'Multiracial'

Concatenate

In [72]:
df_race_2018 = pd.concat([tidy_df_white_2018, 
                          tidy_df_black_2018, 
                          tidy_df_latino_2018, 
                          tidy_df_asian_2018, 
                          tidy_df_native_2018, 
                          tidy_df_other_2018, 
                          tidy_df_multi_2018], 
                         ignore_index=True)
df_race_2018 = pd.merge(df_race_2018, col_2018, left_on='County', right_on='Regions')
df_race_2018['Year'] = 2018
df_race_2018 = df_race_2018[['Industry', 'Date', 'County', 'High Wage Count', 'Cost of Living', 'Employment Count', 'Output Race', 'Year']]

In [89]:
df_race_2018.to_csv('hw_outputs_w_race_2018.csv', encoding='utf-8', index=False)

### High Wage Outputs - 2017

In [73]:
counties_edd = edd_2017['Area Name'].unique()
parsed_codes = set(list(edd_2017['Main_Code'].unique()) + 
                   list(edd_2017['Sub_1_Code'].unique()) + 
                   list(edd_2017['Sub_2_Code'].unique()) + 
                   list(edd_2017['Sub_3_Code'].unique()) + 
                   list(edd_2017['Sub_4_Code'].unique()))
dates_edd = edd_2017['Date'].unique()
total_iterations = len(counties_edd) * len(parsed_codes) * len(dates_edd)

White

In [74]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2017, ca_ipums_hw_white_2017, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [75]:
df_dict_white_2017 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_white_2017 = pd.DataFrame(df_dict_white_2017)
tidy_df_white_2017 = tidy_df_white_2017[tidy_df_white_2017['Industry'].notna()]
tidy_df_white_2017['Date']= pd.to_datetime(tidy_df_white_2017['Date'])
tidy_df_white_2017['High Wage Count'] = tidy_df_white_2017['High Wage Count'].astype(int)
tidy_df_white_2017 = tidy_df_white_2017.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_white_2017['Output Race'] = 'White'

Latino

In [76]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2017, ca_ipums_hw_latino_2017, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [77]:
df_dict_latino_2017 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_latino_2017 = pd.DataFrame(df_dict_latino_2017)
tidy_df_latino_2017 = tidy_df_latino_2017[tidy_df_latino_2017['Industry'].notna()]
tidy_df_latino_2017['Date']= pd.to_datetime(tidy_df_latino_2017['Date'])
tidy_df_latino_2017['High Wage Count'] = tidy_df_latino_2017['High Wage Count'].astype(int)
tidy_df_latino_2017 = tidy_df_latino_2017.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_latino_2017['Output Race'] = 'Latino'

Black

In [78]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2017, ca_ipums_hw_black_2017, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [79]:
df_dict_black_2017 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_black_2017 = pd.DataFrame(df_dict_black_2017)
tidy_df_black_2017 = tidy_df_black_2017[tidy_df_black_2017['Industry'].notna()]
tidy_df_black_2017['Date']= pd.to_datetime(tidy_df_black_2017['Date'])
tidy_df_black_2017['High Wage Count'] = tidy_df_black_2017['High Wage Count'].astype(int)
tidy_df_black_2017 = tidy_df_black_2017.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_black_2017['Output Race'] = 'Black'

American Indian/Alaska Native

In [80]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2017, ca_ipums_hw_native_2017, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [81]:
df_dict_native_2017 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_native_2017 = pd.DataFrame(df_dict_native_2017)
tidy_df_native_2017 = tidy_df_native_2017[tidy_df_native_2017['Industry'].notna()]
tidy_df_native_2017['Date']= pd.to_datetime(tidy_df_native_2017['Date'])
tidy_df_native_2017['High Wage Count'] = tidy_df_native_2017['High Wage Count'].astype(int)
tidy_df_native_2017 = tidy_df_native_2017.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_native_2017['Output Race'] = 'American Indian/Alaska Native'

Asian

In [82]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2017, ca_ipums_hw_asian_2017, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [83]:
df_dict_asian_2017 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_asian_2017 = pd.DataFrame(df_dict_asian_2017)
tidy_df_asian_2017 = tidy_df_asian_2017[tidy_df_asian_2017['Industry'].notna()]
tidy_df_asian_2017['Date']= pd.to_datetime(tidy_df_asian_2017['Date'])
tidy_df_asian_2017['High Wage Count'] = tidy_df_asian_2017['High Wage Count'].astype(int)
tidy_df_asian_2017 = tidy_df_asian_2017.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_asian_2017['Output Race'] = 'Asian'

Other

In [84]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2017, ca_ipums_hw_other_2017, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [85]:
df_dict_other_2017 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_other_2017 = pd.DataFrame(df_dict_other_2017)
tidy_df_other_2017 = tidy_df_other_2017[tidy_df_other_2017['Industry'].notna()]
tidy_df_other_2017['Date']= pd.to_datetime(tidy_df_other_2017['Date'])
tidy_df_other_2017['High Wage Count'] = tidy_df_other_2017['High Wage Count'].astype(int)
tidy_df_other_2017 = tidy_df_other_2017.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_other_2017['Output Race'] = 'Some other race'

Multi

In [86]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2017, ca_ipums_hw_multi_2017, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [87]:
df_dict_multi_2017 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_multi_2017 = pd.DataFrame(df_dict_multi_2017)
tidy_df_multi_2017 = tidy_df_multi_2017[tidy_df_multi_2017['Industry'].notna()]
tidy_df_multi_2017['Date']= pd.to_datetime(tidy_df_multi_2017['Date'])
tidy_df_multi_2017['High Wage Count'] = tidy_df_multi_2017['High Wage Count'].astype(int)
tidy_df_multi_2017 = tidy_df_multi_2017.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_multi_2017['Output Race'] = 'Multiracial'

Concatenate

In [88]:
df_race_2017 = pd.concat([tidy_df_white_2017, 
                          tidy_df_black_2017, 
                          tidy_df_latino_2017, 
                          tidy_df_asian_2017, 
                          tidy_df_native_2017, 
                          tidy_df_other_2017, 
                          tidy_df_multi_2017], 
                         ignore_index=True)
df_race_2017 = pd.merge(df_race_2017, col_2017, left_on='County', right_on='Regions')
df_race_2017['Year'] = 2017
df_race_2017 = df_race_2017[['Industry', 'Date', 'County', 'High Wage Count', 'Cost of Living', 'Employment Count', 'Output Race', 'Year']]

In [90]:
df_race_2017.to_csv('hw_outputs_w_race_2017.csv', encoding='utf-8', index=False)

### High Wage Outputs - 2016

In [91]:
counties_edd = edd_2016['Area Name'].unique()
parsed_codes = set(list(edd_2016['Main_Code'].unique()) + 
                   list(edd_2016['Sub_1_Code'].unique()) + 
                   list(edd_2016['Sub_2_Code'].unique()) + 
                   list(edd_2016['Sub_3_Code'].unique()) + 
                   list(edd_2016['Sub_4_Code'].unique()))
dates_edd = edd_2016['Date'].unique()
total_iterations = len(counties_edd) * len(parsed_codes) * len(dates_edd)

White

In [92]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2016, ca_ipums_hw_white_2016, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [93]:
df_dict_white_2016 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_white_2016 = pd.DataFrame(df_dict_white_2016)
tidy_df_white_2016 = tidy_df_white_2016[tidy_df_white_2016['Industry'].notna()]
tidy_df_white_2016['Date']= pd.to_datetime(tidy_df_white_2016['Date'])
tidy_df_white_2016['High Wage Count'] = tidy_df_white_2016['High Wage Count'].astype(int)
tidy_df_white_2016 = tidy_df_white_2016.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_white_2016['Output Race'] = 'White'

Latino

In [94]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2016, ca_ipums_hw_latino_2016, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [95]:
df_dict_latino_2016 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_latino_2016 = pd.DataFrame(df_dict_latino_2016)
tidy_df_latino_2016 = tidy_df_latino_2016[tidy_df_latino_2016['Industry'].notna()]
tidy_df_latino_2016['Date']= pd.to_datetime(tidy_df_latino_2016['Date'])
tidy_df_latino_2016['High Wage Count'] = tidy_df_latino_2016['High Wage Count'].astype(int)
tidy_df_latino_2016 = tidy_df_latino_2016.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_latino_2016['Output Race'] = 'Latino'

Black

In [96]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2016, ca_ipums_hw_black_2016, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [97]:
df_dict_black_2016 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_black_2016 = pd.DataFrame(df_dict_black_2016)
tidy_df_black_2016 = tidy_df_black_2016[tidy_df_black_2016['Industry'].notna()]
tidy_df_black_2016['Date']= pd.to_datetime(tidy_df_black_2016['Date'])
tidy_df_black_2016['High Wage Count'] = tidy_df_black_2016['High Wage Count'].astype(int)
tidy_df_black_2016 = tidy_df_black_2016.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_black_2016['Output Race'] = 'Black'

American Indian/Alaska Native

In [98]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2016, ca_ipums_hw_native_2016, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [99]:
df_dict_native_2016 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_native_2016 = pd.DataFrame(df_dict_native_2016)
tidy_df_native_2016 = tidy_df_native_2016[tidy_df_native_2016['Industry'].notna()]
tidy_df_native_2016['Date']= pd.to_datetime(tidy_df_native_2016['Date'])
tidy_df_native_2016['High Wage Count'] = tidy_df_native_2016['High Wage Count'].astype(int)
tidy_df_native_2016 = tidy_df_native_2016.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_native_2016['Output Race'] = 'American Indian/Alaska Native'

Asian

In [100]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2016, ca_ipums_hw_asian_2016, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [101]:
df_dict_asian_2016 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_asian_2016 = pd.DataFrame(df_dict_asian_2016)
tidy_df_asian_2016 = tidy_df_asian_2016[tidy_df_asian_2016['Industry'].notna()]
tidy_df_asian_2016['Date']= pd.to_datetime(tidy_df_asian_2016['Date'])
tidy_df_asian_2016['High Wage Count'] = tidy_df_asian_2016['High Wage Count'].astype(int)
tidy_df_asian_2016 = tidy_df_asian_2016.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_asian_2016['Output Race'] = 'Asian'

Other

In [102]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2016, ca_ipums_hw_other_2016, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [103]:
df_dict_other_2016 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_other_2016 = pd.DataFrame(df_dict_other_2016)
tidy_df_other_2016 = tidy_df_other_2016[tidy_df_other_2016['Industry'].notna()]
tidy_df_other_2016['Date']= pd.to_datetime(tidy_df_other_2016['Date'])
tidy_df_other_2016['High Wage Count'] = tidy_df_other_2016['High Wage Count'].astype(int)
tidy_df_other_2016 = tidy_df_other_2016.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_other_2016['Output Race'] = 'Some other race'

Multi

In [104]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2016, ca_ipums_hw_multi_2016, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [105]:
df_dict_multi_2016 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_multi_2016 = pd.DataFrame(df_dict_multi_2016)
tidy_df_multi_2016 = tidy_df_multi_2016[tidy_df_multi_2016['Industry'].notna()]
tidy_df_multi_2016['Date']= pd.to_datetime(tidy_df_multi_2016['Date'])
tidy_df_multi_2016['High Wage Count'] = tidy_df_multi_2016['High Wage Count'].astype(int)
tidy_df_multi_2016 = tidy_df_multi_2016.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_multi_2016['Output Race'] = 'Multiracial'

Concatenate

In [106]:
df_race_2016 = pd.concat([tidy_df_white_2016, 
                          tidy_df_black_2016, 
                          tidy_df_latino_2016, 
                          tidy_df_asian_2016, 
                          tidy_df_native_2016, 
                          tidy_df_other_2016, 
                          tidy_df_multi_2016], 
                         ignore_index=True)
df_race_2016 = pd.merge(df_race_2016, col_2016, left_on='County', right_on='Regions')
df_race_2016['Year'] = 2016
df_race_2016 = df_race_2016[['Industry', 'Date', 'County', 'High Wage Count', 'Cost of Living', 'Employment Count', 'Output Race', 'Year']]

In [124]:
df_race_2016.to_csv('hw_outputs_w_race_2016.csv', encoding='utf-8', index=False)

### High Wage Outputs - 2015

In [108]:
counties_edd = edd_2015['Area Name'].unique()
parsed_codes = set(list(edd_2015['Main_Code'].unique()) + 
                   list(edd_2015['Sub_1_Code'].unique()) + 
                   list(edd_2015['Sub_2_Code'].unique()) + 
                   list(edd_2015['Sub_3_Code'].unique()) + 
                   list(edd_2015['Sub_4_Code'].unique()))
dates_edd = edd_2015['Date'].unique()
total_iterations = len(counties_edd) * len(parsed_codes) * len(dates_edd)

White

In [109]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2015, ca_ipums_hw_white_2015, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [110]:
df_dict_white_2015 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_white_2015 = pd.DataFrame(df_dict_white_2015)
tidy_df_white_2015 = tidy_df_white_2015[tidy_df_white_2015['Industry'].notna()]
tidy_df_white_2015['Date']= pd.to_datetime(tidy_df_white_2015['Date'])
tidy_df_white_2015['High Wage Count'] = tidy_df_white_2015['High Wage Count'].astype(int)
tidy_df_white_2015 = tidy_df_white_2015.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_white_2015['Output Race'] = 'White'

Latino

In [111]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2015, ca_ipums_hw_latino_2015, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [112]:
df_dict_latino_2015 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_latino_2015 = pd.DataFrame(df_dict_latino_2015)
tidy_df_latino_2015 = tidy_df_latino_2015[tidy_df_latino_2015['Industry'].notna()]
tidy_df_latino_2015['Date']= pd.to_datetime(tidy_df_latino_2015['Date'])
tidy_df_latino_2015['High Wage Count'] = tidy_df_latino_2015['High Wage Count'].astype(int)
tidy_df_latino_2015 = tidy_df_latino_2015.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_latino_2015['Output Race'] = 'Latino'

Black

In [113]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2015, ca_ipums_hw_black_2015, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [114]:
df_dict_black_2015 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_black_2015 = pd.DataFrame(df_dict_black_2015)
tidy_df_black_2015 = tidy_df_black_2015[tidy_df_black_2015['Industry'].notna()]
tidy_df_black_2015['Date']= pd.to_datetime(tidy_df_black_2015['Date'])
tidy_df_black_2015['High Wage Count'] = tidy_df_black_2015['High Wage Count'].astype(int)
tidy_df_black_2015 = tidy_df_black_2015.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_black_2015['Output Race'] = 'Black'

American Indian/Alaska Native

In [115]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2015, ca_ipums_hw_native_2015, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [116]:
df_dict_native_2015 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_native_2015 = pd.DataFrame(df_dict_native_2015)
tidy_df_native_2015 = tidy_df_native_2015[tidy_df_native_2015['Industry'].notna()]
tidy_df_native_2015['Date']= pd.to_datetime(tidy_df_native_2015['Date'])
tidy_df_native_2015['High Wage Count'] = tidy_df_native_2015['High Wage Count'].astype(int)
tidy_df_native_2015 = tidy_df_native_2015.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_native_2015['Output Race'] = 'American Indian/Alaska Native'

Asian

In [117]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2015, ca_ipums_hw_asian_2015, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [118]:
df_dict_asian_2015 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_asian_2015 = pd.DataFrame(df_dict_asian_2015)
tidy_df_asian_2015 = tidy_df_asian_2015[tidy_df_asian_2015['Industry'].notna()]
tidy_df_asian_2015['Date']= pd.to_datetime(tidy_df_asian_2015['Date'])
tidy_df_asian_2015['High Wage Count'] = tidy_df_asian_2015['High Wage Count'].astype(int)
tidy_df_asian_2015 = tidy_df_asian_2015.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_asian_2015['Output Race'] = 'Asian'

Other

In [119]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2015, ca_ipums_hw_other_2015, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [120]:
df_dict_other_2015 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_other_2015 = pd.DataFrame(df_dict_other_2015)
tidy_df_other_2015 = tidy_df_other_2015[tidy_df_other_2015['Industry'].notna()]
tidy_df_other_2015['Date']= pd.to_datetime(tidy_df_other_2015['Date'])
tidy_df_other_2015['High Wage Count'] = tidy_df_other_2015['High Wage Count'].astype(int)
tidy_df_other_2015 = tidy_df_other_2015.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_other_2015['Output Race'] = 'Some other race'

Multi

In [121]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd_2015, ca_ipums_hw_multi_2015, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10440 == 0:
                percent_done = int((progress_count / total_iterations) * 100)
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete
Progress: 70% Complete
Progress: 80% Complete
Progress: 90% Complete
Progress: 100% Complete


In [122]:
df_dict_multi_2015 = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_multi_2015 = pd.DataFrame(df_dict_multi_2015)
tidy_df_multi_2015 = tidy_df_multi_2015[tidy_df_multi_2015['Industry'].notna()]
tidy_df_multi_2015['Date']= pd.to_datetime(tidy_df_multi_2015['Date'])
tidy_df_multi_2015['High Wage Count'] = tidy_df_multi_2015['High Wage Count'].astype(int)
tidy_df_multi_2015 = tidy_df_multi_2015.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_multi_2015['Output Race'] = 'Multiracial'

Concatenate

In [123]:
df_race_2015 = pd.concat([tidy_df_white_2015, 
                          tidy_df_black_2015, 
                          tidy_df_latino_2015, 
                          tidy_df_asian_2015, 
                          tidy_df_native_2015, 
                          tidy_df_other_2015, 
                          tidy_df_multi_2015], 
                         ignore_index=True)
df_race_2015 = pd.merge(df_race_2015, col_2015, left_on='County', right_on='Regions')
df_race_2015['Year'] = 2015
df_race_2015 = df_race_2015[['Industry', 'Date', 'County', 'High Wage Count', 'Cost of Living', 'Employment Count', 'Output Race', 'Year']]

In [125]:
df_race_2015.to_csv('hw_outputs_w_race_2015.csv', encoding='utf-8', index=False)

### Concatenate all dataframes & export

In [128]:
df_race_2019 = pd.read_csv('data/hw_outputs_w_race_2019.csv')
df_race_2019['Year'] = 2019

In [129]:
hw_outputs_w_race_5year = pd.concat([df_race_2019, 
                          df_race_2018, 
                          df_race_2017, 
                          df_race_2016, 
                          df_race_2015], 
                         ignore_index=True)

In [130]:
hw_outputs_w_race_5year.to_csv('hw_outputs_w_race_5year.csv', encoding='utf-8', index=False)