# Notebook for Creating High Wage Outputs with Racial Demographics Breakdown

#### This notebook is currently configured to create high wage outputs from 2022 EDD data, using 2019 IPUMS and cost of living data (see the year settings below).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import warnings
import os
import re
from tqdm.notebook import tqdm # for progress bar
from jqi_functions import *
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format='retina'

## Ensure that IPUMS data is in the proper file location

The IPUMS extract for the desired year should be saved in the `data/ipums` folder and named `IPUMS_{YEAR}.csv`, where `{YEAR}` matches the IPUMS year entered below.

A full example for the file path would be `data/ipums/IPUMS_2020.csv`.

## Set the desired EDD year, IPUMS year, and corresponding cost of living year.

In [2]:
# Year of the EDD file to load (data/edd/edd_{edd_year}.csv); also used in the exported file name.
edd_year = '2022'
# Year passed (as a string) to cleaned_ipums() to select the IPUMS records.
ipums_year = '2019'
# Year suffix of the cost of living file (data/cost_of_living/united-way-col-1A1PS1C{col_year}.csv).
col_year = '2019'

## Creating IPUMS dataframe

#### IPUMS Data
`cleaned_ipums` is a function to generate a cleaned pandas dataframe using IPUMS data, filtering it down to California only and the desired year. By setting the `demo` parameter to True, the dataframe will also include the racial demographic information for each record in the dataset. The year needs to be entered in string format as a parameter.

In [3]:
# demo=True keeps the racial demographic columns that the race breakdown later in the notebook filters on.
ca_ipums = cleaned_ipums(ipums_year, demo=True)

#### Cost of living needs to be updated each year.

In [4]:
# Columns used later in this notebook: 'Regions' (merge key) and 'Cost of Living'.
cost_of_living = pd.read_csv(f'data/cost_of_living/united-way-col-1A1PS1C{col_year}.csv')

### Create county lookup dataframe

Expanding the `county_info` dataframe to include cost of living metrics. This dataframe is used when industry information in a geographic area is too sparse and the next largest geographic area needs to be used instead.

In [5]:
# County-to-region key; its COUNTYFIP, CERF Regions and Population columns are used further down.
county_info = pd.read_csv('data/county_to_regions_key.csv')

In [6]:
# Keep only the county identifiers, urban/rural flag, region and population columns.
county_info = county_info.loc[
    :,
    ['County', 'COUNTYFIP', 'Rural/Urban', 'CERF Regions', 'Population'],
]

In [7]:
# Attach the cost of living row of each county's region (inner join: CERF Regions == Regions).
county_info = county_info.merge(
    cost_of_living,
    left_on='CERF Regions',
    right_on='Regions',
)

In [8]:
# Label the merged value as the regional cost of living and drop the duplicate join key.
county_info = (
    county_info
    .rename(columns={'Cost of Living': 'Regional COL'})
    .drop(columns=['Regions'])
)

In [9]:
# Broadcast a single cost of living value to every county row as 'State COL'.
# NOTE(review): row 13 / column 1 of the cost of living file is assumed to hold the
# statewide 'Cost of Living' figure - confirm whenever the file is updated.
# A single .iloc[row, col] lookup is used instead of .iloc[13][1]: the second [1]
# indexed a string-labelled Series with an integer, which relies on the deprecated
# positional fallback of Series.__getitem__.
county_info['State COL'] = cost_of_living.iloc[13, 1]

View of final `county_info` dataframe.

In [10]:
county_info.head()

Unnamed: 0,County,COUNTYFIP,Rural/Urban,CERF Regions,Population,Regional COL,State COL
0,Alameda,1,Urban,Bay Area,1656754,97249,77555
1,Contra Costa,13,Urban,Bay Area,1142251,97249,77555
2,Solano,95,Urban,Bay Area,441829,97249,77555
3,San Mateo,81,Urban,Bay Area,767423,97249,77555
4,Santa Clara,85,Urban,Bay Area,1927470,97249,77555


In [11]:
# Inner join on COUNTYFIP adds the county, region and cost of living columns to each IPUMS record.
ca_ipums = ca_ipums.merge(county_info, on='COUNTYFIP')

View of final `ca_ipums` dataframe.

In [12]:
ca_ipums.head()

Unnamed: 0,YEAR,COUNTYFIP,INDNAICS,PERWT,RACE,HISPAN,INCWAGE,NAICS Code,Industry Title,Industry,Crosswalk Value,County,Rural/Urban,CERF Regions,Population,Regional COL,State COL
0,2019,37,4853,21.0,2,0,23100,4853,taxi and limousine service,taxi and limousine service,11,Los Angeles,Urban,Los Angeles,10081570,80216,77555
1,2019,37,4853,21.0,2,0,23100,4853,taxi and limousine service,taxi and limousine service,14,Los Angeles,Urban,Los Angeles,10081570,80216,77555
2,2019,37,4853,11.0,9,0,28000,4853,taxi and limousine service,taxi and limousine service,11,Los Angeles,Urban,Los Angeles,10081570,80216,77555
3,2019,37,4853,11.0,9,0,28000,4853,taxi and limousine service,taxi and limousine service,14,Los Angeles,Urban,Los Angeles,10081570,80216,77555
4,2019,37,4853,35.0,9,0,28000,4853,taxi and limousine service,taxi and limousine service,11,Los Angeles,Urban,Los Angeles,10081570,80216,77555


## Create EDD Dataframe

#### EDD Data
The year for EDD data must be specified.

These CSV files are filtered and cleaned versions of the raw EDD Current Employment Statistics dataset. These CSV files can be created for upcoming years with the notebook `multiyear-edd-data-creation.ipynb`

In [13]:
# Pre-cleaned EDD extract for edd_year; the 'Crosswalk Value' and 'Date' columns drive the loops below.
edd = pd.read_csv(f'data/edd/edd_{edd_year}.csv')

View of final `edd` dataframe.

In [14]:
edd.head()

Unnamed: 0,Area Type,Area Name,Year,Month,Date,Series Code,Seasonally Adjusted,Current Employment,Industry Title,COUNTYFIP,County,Rural/Urban,CERF Regions,Crosswalk Value
0,County,Napa,2022,January,01/01/2022,60560000,N,3800,administrative and support and waste services,55,Napa,Rural,Bay Area,25
1,County,Napa,2022,February,02/01/2022,60560000,N,3900,administrative and support and waste services,55,Napa,Rural,Bay Area,25
2,County,Napa,2022,March,03/01/2022,60560000,N,3900,administrative and support and waste services,55,Napa,Rural,Bay Area,25
3,County,Napa,2022,April,04/01/2022,60560000,N,3900,administrative and support and waste services,55,Napa,Rural,Bay Area,25
4,County,Solano,2022,January,01/01/2022,60560000,N,5200,administrative and support and waste services,95,Solano,Urban,Bay Area,25


## Breakdown dataframes by race

In [15]:
# Split the IPUMS records into mutually exclusive groups: Latino first (any HISPAN
# code other than 0), then the remaining records by RACE code.
# NOTE(review): the code-to-group mapping follows the variable names and the
# 'Output Race' labels used later in this notebook - confirm the codes against
# the IPUMS codebook.
# Every per-race subset is an explicit copy because append_race_ratio() later adds
# a column to it in place.
ca_ipums_latino = ca_ipums.loc[ca_ipums['HISPAN'] != 0].copy()
ca_ipums_no_latino = ca_ipums.loc[ca_ipums['HISPAN'] == 0]
ca_ipums_white = ca_ipums_no_latino.loc[ca_ipums_no_latino['RACE'] == 1].copy()
ca_ipums_black = ca_ipums_no_latino.loc[ca_ipums_no_latino['RACE'] == 2].copy()
ca_ipums_native = ca_ipums_no_latino.loc[ca_ipums_no_latino['RACE'] == 3].copy()
ca_ipums_asian = ca_ipums_no_latino.loc[ca_ipums_no_latino['RACE'].isin([4, 5, 6])].copy()
ca_ipums_other = ca_ipums_no_latino.loc[ca_ipums_no_latino['RACE'] == 7].copy()
ca_ipums_multi = ca_ipums_no_latino.loc[ca_ipums_no_latino['RACE'].isin([8, 9])].copy()

In [16]:
# Total person weight (PERWT) across all records for each crosswalk value;
# used as the denominator of the race ratios.
wt_counts = {
    crosswalk_value: ca_ipums.loc[ca_ipums['Crosswalk Value'] == crosswalk_value, 'PERWT'].sum()
    for crosswalk_value in ca_ipums['Crosswalk Value'].unique()
}

In [17]:
def append_race_ratio(df, wt_counts):
    """Add a ``race_ratio`` column giving this subset's share of each industry's weight.

    For every crosswalk value present in ``df``, the ratio is the sum of ``PERWT``
    within ``df`` for that value divided by ``wt_counts[value]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Subset of the IPUMS records with ``'Crosswalk Value'`` and ``'PERWT'`` columns.
        The column is added to this frame in place.
    wt_counts : dict
        Maps each crosswalk value to the total ``PERWT`` used as the denominator.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying a float ``race_ratio`` column.

    Raises
    ------
    KeyError
        If a crosswalk value present in ``df`` is missing from ``wt_counts``.
    """
    subset_weights = df.groupby('Crosswalk Value')['PERWT'].sum()
    ratio_by_code = {
        code: weight / wt_counts[code]
        for code, weight in subset_weights.items()
    }
    # One direct column assignment replaces the former chained assignment
    # (df['race_ratio'][mask] = ...), which does not write back to df under
    # pandas Copy-on-Write. Rows without a crosswalk value keep a ratio of 0.
    df['race_ratio'] = df['Crosswalk Value'].map(ratio_by_code).fillna(0).astype(float)
    return df

In [18]:
ca_ipums_white = append_race_ratio(ca_ipums_white, wt_counts)

In [19]:
ca_ipums_latino = append_race_ratio(ca_ipums_latino, wt_counts)

In [20]:
ca_ipums_black = append_race_ratio(ca_ipums_black, wt_counts)

In [21]:
ca_ipums_native = append_race_ratio(ca_ipums_native, wt_counts)

In [22]:
ca_ipums_asian = append_race_ratio(ca_ipums_asian, wt_counts)

In [23]:
ca_ipums_other = append_race_ratio(ca_ipums_other, wt_counts)

In [24]:
ca_ipums_multi = append_race_ratio(ca_ipums_multi, wt_counts)

## Add High Wage Features

`add_geo_high_wages` is a function that adds the following engineered features:
- Above Threshold (Number of records above respective cost of living threshold)
- Weighted above threshold (Above Threshold multiplied by person weight variable)
- Unweighted industry counts (Number of records in that industry)
- Weighted industry counts (Sum of person weight values in that industry)
- Weighted high wage percentage (Weighted Above Threshold divided by Weighted Industry Counts as a percentage)

The features are created for the following geographical levels:
- Region
- California

In [25]:
# Add the high wage features to each race subset, in the same order as before.
(
    ca_ipums_hw_white,
    ca_ipums_hw_latino,
    ca_ipums_hw_black,
    ca_ipums_hw_native,
    ca_ipums_hw_asian,
    ca_ipums_hw_other,
    ca_ipums_hw_multi,
) = [
    add_geo_high_wages(race_frame)
    for race_frame in (
        ca_ipums_white,
        ca_ipums_latino,
        ca_ipums_black,
        ca_ipums_native,
        ca_ipums_asian,
        ca_ipums_other,
        ca_ipums_multi,
    )
]

## Create High Wage Outputs Dataframe

`edd_to_hw` is the function that outputs the values needed to create the high wage output dataframe. This portion of the notebook runs through every unique combination of region, industry, and date, to get that respective output and add it to the dataframe.

Because of the nested structure of the EDD industries, only a small selection of EDD industries can be used to ensure that individuals in nested industries are not counted twice. The selection of these industries differs by region, so the series code of each selected industry is documented in the `region_series_codes` global variable in the `jqi_functions.py` library. Each of these codes was then assigned its own crosswalk value, which aligns with the crosswalk value assigned to each IPUMS industry. For generating high wage outputs, we only iterate through the EDD industries that have been selected and have a designated crosswalk value.

Getting unique values for each region, industry, and date.

In [26]:
# Regions are taken from the IPUMS frame (after the county_info merge), not from the EDD data.
regions_ipums = ca_ipums['CERF Regions'].unique()

In [27]:
# Sorted so the EDD crosswalk values are iterated in a fixed, ascending order.
crosswalk_vals = sorted(edd['Crosswalk Value'].unique())

In [28]:
# Date values exactly as stored in the EDD file; they are converted with pd.to_datetime only after the loops.
dates_edd = edd['Date'].unique()

#### White

In [29]:
# Fresh accumulators for the White run; the loop below appends one entry to each per combination.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

For loop to populate lists for the high wage outputs. This will take some time to finish running.

In [30]:
# Walk every (region, crosswalk value, date) combination and record the White results.
for region in tqdm(regions_ipums):
    for code in crosswalk_vals:
        for date in dates_edd:
            # NOTE(review): the trailing 10 is passed straight to edd_to_hw; its meaning is defined in jqi_functions - confirm.
            hw_count, hw_perc, employment_count, industry = edd_to_hw(edd, ca_ipums_hw_white, region, code, date, 10)
            # hw_perc is returned but not stored; the five lists are appended together so they stay aligned.
            industries.append(industry)
            dates.append(date)
            regions.append(region)
            counts.append(hw_count)
            emp_counts.append(employment_count)

  0%|          | 0/12 [00:00<?, ?it/s]

Creating a cleaned dataframe from the output lists.

In [31]:
# Assemble the White results into the final output layout.
df_dict_white = {
    'Industry': industries,
    'Date': dates,
    'Region': regions,
    'High Wage Count': counts,
    'Employment Count': emp_counts,
}
hw_output_white = (
    pd.DataFrame(df_dict_white)
    # drop combinations whose Industry is missing
    .loc[lambda frame: frame['Industry'].notna()]
    .assign(**{
        'Date': lambda frame: pd.to_datetime(frame['Date']),
        'High Wage Count': lambda frame: frame['High Wage Count'].astype(int),
    })
    .sort_values(by=['Industry', 'Region', 'Date'])
    # inner join brings in each region's 'Cost of Living'
    .merge(cost_of_living, left_on='Region', right_on='Regions')
    .loc[:, ['Industry', 'Date', 'Region', 'High Wage Count', 'Employment Count', 'Cost of Living']]
    .drop_duplicates()
    .assign(**{'Output Race': 'White'})
)

View of final `hw_output_white` dataframe.

In [32]:
hw_output_white.head()

Unnamed: 0,Industry,Date,Region,High Wage Count,Employment Count,Cost of Living,Output Race
0,accommodation and food services,2022-01-01,Kern,968,24200.0,54862,White
1,accommodation and food services,2022-02-01,Kern,980,24500.0,54862,White
2,accommodation and food services,2022-03-01,Kern,992,24800.0,54862,White
3,accommodation and food services,2022-04-01,Kern,992,24800.0,54862,White
4,administrative and support and waste services,2022-01-01,Kern,6065,12000.0,54862,White


#### Latino

In [33]:
# Fresh accumulators for the Latino run; the loop below appends one entry to each per combination.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

For loop to populate lists for the high wage outputs. This will take some time to finish running.

In [34]:
# Walk every (region, crosswalk value, date) combination and record the Latino results.
for region in tqdm(regions_ipums):
    for code in crosswalk_vals:
        for date in dates_edd:
            # NOTE(review): the trailing 10 is passed straight to edd_to_hw; its meaning is defined in jqi_functions - confirm.
            hw_count, hw_perc, employment_count, industry = edd_to_hw(edd, ca_ipums_hw_latino, region, code, date, 10)
            # hw_perc is returned but not stored; the five lists are appended together so they stay aligned.
            industries.append(industry)
            dates.append(date)
            regions.append(region)
            counts.append(hw_count)
            emp_counts.append(employment_count)

  0%|          | 0/12 [00:00<?, ?it/s]

Creating a cleaned dataframe from the output lists.

In [35]:
# Assemble the Latino results into the final output layout.
df_dict_latino = {
    'Industry': industries,
    'Date': dates,
    'Region': regions,
    'High Wage Count': counts,
    'Employment Count': emp_counts,
}
hw_output_latino = (
    pd.DataFrame(df_dict_latino)
    # drop combinations whose Industry is missing
    .loc[lambda frame: frame['Industry'].notna()]
    .assign(**{
        'Date': lambda frame: pd.to_datetime(frame['Date']),
        'High Wage Count': lambda frame: frame['High Wage Count'].astype(int),
    })
    .sort_values(by=['Industry', 'Region', 'Date'])
    # inner join brings in each region's 'Cost of Living'
    .merge(cost_of_living, left_on='Region', right_on='Regions')
    .loc[:, ['Industry', 'Date', 'Region', 'High Wage Count', 'Employment Count', 'Cost of Living']]
    .drop_duplicates()
    .assign(**{'Output Race': 'Latino'})
)

View of final `hw_output_latino` dataframe.

In [36]:
hw_output_latino.head()

Unnamed: 0,Industry,Date,Region,High Wage Count,Employment Count,Cost of Living,Output Race
0,accommodation and food services,2022-01-01,Kern,0,24200.0,54862,Latino
1,accommodation and food services,2022-02-01,Kern,0,24500.0,54862,Latino
2,accommodation and food services,2022-03-01,Kern,0,24800.0,54862,Latino
3,accommodation and food services,2022-04-01,Kern,0,24800.0,54862,Latino
4,administrative and support and waste services,2022-01-01,Kern,773,12000.0,54862,Latino


#### Black

In [37]:
# Fresh accumulators for the Black run; the loop below appends one entry to each per combination.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

For loop to populate lists for the high wage outputs. This will take some time to finish running.

In [38]:
# Walk every (region, crosswalk value, date) combination and record the Black results.
for region in tqdm(regions_ipums):
    for code in crosswalk_vals:
        for date in dates_edd:
            # NOTE(review): the trailing 10 is passed straight to edd_to_hw; its meaning is defined in jqi_functions - confirm.
            hw_count, hw_perc, employment_count, industry = edd_to_hw(edd, ca_ipums_hw_black, region, code, date, 10)
            # hw_perc is returned but not stored; the five lists are appended together so they stay aligned.
            industries.append(industry)
            dates.append(date)
            regions.append(region)
            counts.append(hw_count)
            emp_counts.append(employment_count)

  0%|          | 0/12 [00:00<?, ?it/s]

Creating a cleaned dataframe from the output lists.

In [39]:
# Assemble the Black results into the final output layout.
df_dict_black = {
    'Industry': industries,
    'Date': dates,
    'Region': regions,
    'High Wage Count': counts,
    'Employment Count': emp_counts,
}
hw_output_black = (
    pd.DataFrame(df_dict_black)
    # drop combinations whose Industry is missing
    .loc[lambda frame: frame['Industry'].notna()]
    .assign(**{
        'Date': lambda frame: pd.to_datetime(frame['Date']),
        'High Wage Count': lambda frame: frame['High Wage Count'].astype(int),
    })
    .sort_values(by=['Industry', 'Region', 'Date'])
    # inner join brings in each region's 'Cost of Living'
    .merge(cost_of_living, left_on='Region', right_on='Regions')
    .loc[:, ['Industry', 'Date', 'Region', 'High Wage Count', 'Employment Count', 'Cost of Living']]
    .drop_duplicates()
    .assign(**{'Output Race': 'Black'})
)

View of final `hw_output_black` dataframe.

In [40]:
hw_output_black.head()

Unnamed: 0,Industry,Date,Region,High Wage Count,Employment Count,Cost of Living,Output Race
0,accommodation and food services,2022-01-01,Kern,1162,24200.0,54862,Black
1,accommodation and food services,2022-02-01,Kern,1177,24500.0,54862,Black
2,accommodation and food services,2022-03-01,Kern,1191,24800.0,54862,Black
3,accommodation and food services,2022-04-01,Kern,1191,24800.0,54862,Black
4,administrative and support and waste services,2022-01-01,Kern,1731,12000.0,54862,Black


#### American Indian/Alaska Native

In [41]:
# Fresh accumulators for the American Indian/Alaska Native run; the loop below appends one entry to each per combination.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

For loop to populate lists for the high wage outputs. This will take some time to finish running.

In [42]:
# Walk every (region, crosswalk value, date) combination and record the American Indian/Alaska Native results.
for region in tqdm(regions_ipums):
    for code in crosswalk_vals:
        for date in dates_edd:
            # NOTE(review): the trailing 10 is passed straight to edd_to_hw; its meaning is defined in jqi_functions - confirm.
            hw_count, hw_perc, employment_count, industry = edd_to_hw(edd, ca_ipums_hw_native, region, code, date, 10)
            # hw_perc is returned but not stored; the five lists are appended together so they stay aligned.
            industries.append(industry)
            dates.append(date)
            regions.append(region)
            counts.append(hw_count)
            emp_counts.append(employment_count)

  0%|          | 0/12 [00:00<?, ?it/s]

Creating a cleaned dataframe from the output lists.

In [43]:
# Assemble the American Indian/Alaska Native results into the final output layout.
df_dict_native = {
    'Industry': industries,
    'Date': dates,
    'Region': regions,
    'High Wage Count': counts,
    'Employment Count': emp_counts,
}
hw_output_native = (
    pd.DataFrame(df_dict_native)
    # drop combinations whose Industry is missing
    .loc[lambda frame: frame['Industry'].notna()]
    .assign(**{
        'Date': lambda frame: pd.to_datetime(frame['Date']),
        'High Wage Count': lambda frame: frame['High Wage Count'].astype(int),
    })
    .sort_values(by=['Industry', 'Region', 'Date'])
    # inner join brings in each region's 'Cost of Living'
    .merge(cost_of_living, left_on='Region', right_on='Regions')
    .loc[:, ['Industry', 'Date', 'Region', 'High Wage Count', 'Employment Count', 'Cost of Living']]
    .drop_duplicates()
    .assign(**{'Output Race': 'American Indian/Alaska Native'})
)

View of final `hw_output_native` dataframe.

In [44]:
hw_output_native.head()

Unnamed: 0,Industry,Date,Region,High Wage Count,Employment Count,Cost of Living,Output Race
0,accommodation and food services,2022-01-01,Kern,2975,24200.0,54862,American Indian/Alaska Native
1,accommodation and food services,2022-02-01,Kern,3012,24500.0,54862,American Indian/Alaska Native
2,accommodation and food services,2022-03-01,Kern,3049,24800.0,54862,American Indian/Alaska Native
3,accommodation and food services,2022-04-01,Kern,3049,24800.0,54862,American Indian/Alaska Native
4,educational services,2022-01-01,Kern,795,1900.0,54862,American Indian/Alaska Native


#### Asian

In [45]:
# Fresh accumulators for the Asian run; the loop below appends one entry to each per combination.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

For loop to populate lists for the high wage outputs. This will take some time to finish running.

In [46]:
# Walk every (region, crosswalk value, date) combination and record the Asian results.
for region in tqdm(regions_ipums):
    for code in crosswalk_vals:
        for date in dates_edd:
            # NOTE(review): the trailing 10 is passed straight to edd_to_hw; its meaning is defined in jqi_functions - confirm.
            hw_count, hw_perc, employment_count, industry = edd_to_hw(edd, ca_ipums_hw_asian, region, code, date, 10)
            # hw_perc is returned but not stored; the five lists are appended together so they stay aligned.
            industries.append(industry)
            dates.append(date)
            regions.append(region)
            counts.append(hw_count)
            emp_counts.append(employment_count)

  0%|          | 0/12 [00:00<?, ?it/s]

Creating a cleaned dataframe from the output lists.

In [47]:
# Assemble the Asian results into the final output layout.
df_dict_asian = {
    'Industry': industries,
    'Date': dates,
    'Region': regions,
    'High Wage Count': counts,
    'Employment Count': emp_counts,
}
hw_output_asian = (
    pd.DataFrame(df_dict_asian)
    # drop combinations whose Industry is missing
    .loc[lambda frame: frame['Industry'].notna()]
    .assign(**{
        'Date': lambda frame: pd.to_datetime(frame['Date']),
        'High Wage Count': lambda frame: frame['High Wage Count'].astype(int),
    })
    .sort_values(by=['Industry', 'Region', 'Date'])
    # inner join brings in each region's 'Cost of Living'
    .merge(cost_of_living, left_on='Region', right_on='Regions')
    .loc[:, ['Industry', 'Date', 'Region', 'High Wage Count', 'Employment Count', 'Cost of Living']]
    .drop_duplicates()
    .assign(**{'Output Race': 'Asian'})
)

View of final `hw_output_asian` dataframe.

In [48]:
hw_output_asian.head()

Unnamed: 0,Industry,Date,Region,High Wage Count,Employment Count,Cost of Living,Output Race
0,accommodation and food services,2022-01-01,Kern,4534,24200.0,54862,Asian
1,accommodation and food services,2022-02-01,Kern,4590,24500.0,54862,Asian
2,accommodation and food services,2022-03-01,Kern,4646,24800.0,54862,Asian
3,accommodation and food services,2022-04-01,Kern,4646,24800.0,54862,Asian
4,construction,2022-01-01,Kern,5505,14800.0,54862,Asian


#### Some other race

In [49]:
# Fresh accumulators for the "Some other race" run; the loop below appends one entry to each per combination.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

For loop to populate lists for the high wage outputs. This will take some time to finish running.

In [50]:
# Walk every (region, crosswalk value, date) combination and record the "Some other race" results.
# NOTE(review): the saved output for this group contains a negative High Wage Count (-795),
# so edd_to_hw may be returning a negative placeholder for some combinations - confirm
# and decide whether such rows should be filtered before export.
for region in tqdm(regions_ipums):
    for code in crosswalk_vals:
        for date in dates_edd:
            # NOTE(review): the trailing 10 is passed straight to edd_to_hw; its meaning is defined in jqi_functions - confirm.
            hw_count, hw_perc, employment_count, industry = edd_to_hw(edd, ca_ipums_hw_other, region, code, date, 10)
            # hw_perc is returned but not stored; the five lists are appended together so they stay aligned.
            industries.append(industry)
            dates.append(date)
            regions.append(region)
            counts.append(hw_count)
            emp_counts.append(employment_count)

  0%|          | 0/12 [00:00<?, ?it/s]

Creating a cleaned dataframe from the output lists.

In [51]:
# Assemble the "Some other race" results into the final output layout.
df_dict_other = {
    'Industry': industries,
    'Date': dates,
    'Region': regions,
    'High Wage Count': counts,
    'Employment Count': emp_counts,
}
hw_output_other = (
    pd.DataFrame(df_dict_other)
    # drop combinations whose Industry is missing
    .loc[lambda frame: frame['Industry'].notna()]
    .assign(**{
        'Date': lambda frame: pd.to_datetime(frame['Date']),
        'High Wage Count': lambda frame: frame['High Wage Count'].astype(int),
    })
    .sort_values(by=['Industry', 'Region', 'Date'])
    # inner join brings in each region's 'Cost of Living'
    .merge(cost_of_living, left_on='Region', right_on='Regions')
    .loc[:, ['Industry', 'Date', 'Region', 'High Wage Count', 'Employment Count', 'Cost of Living']]
    .drop_duplicates()
    .assign(**{'Output Race': 'Some other race'})
)

View of final `hw_output_other` dataframe.

In [52]:
hw_output_other.head()

Unnamed: 0,Industry,Date,Region,High Wage Count,Employment Count,Cost of Living,Output Race
0,accommodation and food services,2022-01-01,Los Angeles,34775,384500.0,80216,Some other race
1,accommodation and food services,2022-02-01,Los Angeles,35535,392900.0,80216,Some other race
2,accommodation and food services,2022-03-01,Los Angeles,36015,398200.0,80216,Some other race
3,accommodation and food services,2022-04-01,Los Angeles,36648,405200.0,80216,Some other race
4,arts entertainment and recreation,2022-01-01,Los Angeles,-795,79500.0,80216,Some other race


#### Multiracial

In [53]:
# Fresh accumulators for the Multiracial run; the loop below appends one entry to each per combination.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

For loop to populate lists for the high wage outputs. This will take some time to finish running.

In [54]:
# Walk every (region, crosswalk value, date) combination and record the Multiracial results.
for region in tqdm(regions_ipums):
    for code in crosswalk_vals:
        for date in dates_edd:
            # NOTE(review): the trailing 10 is passed straight to edd_to_hw; its meaning is defined in jqi_functions - confirm.
            hw_count, hw_perc, employment_count, industry = edd_to_hw(edd, ca_ipums_hw_multi, region, code, date, 10)
            # hw_perc is returned but not stored; the five lists are appended together so they stay aligned.
            industries.append(industry)
            dates.append(date)
            regions.append(region)
            counts.append(hw_count)
            emp_counts.append(employment_count)

  0%|          | 0/12 [00:00<?, ?it/s]

Creating a cleaned dataframe from the output lists.

In [55]:
# Assemble the Multiracial results into the final output layout.
df_dict_multi = {
    'Industry': industries,
    'Date': dates,
    'Region': regions,
    'High Wage Count': counts,
    'Employment Count': emp_counts,
}
hw_output_multi = (
    pd.DataFrame(df_dict_multi)
    # drop combinations whose Industry is missing
    .loc[lambda frame: frame['Industry'].notna()]
    .assign(**{
        'Date': lambda frame: pd.to_datetime(frame['Date']),
        'High Wage Count': lambda frame: frame['High Wage Count'].astype(int),
    })
    .sort_values(by=['Industry', 'Region', 'Date'])
    # inner join brings in each region's 'Cost of Living'
    .merge(cost_of_living, left_on='Region', right_on='Regions')
    .loc[:, ['Industry', 'Date', 'Region', 'High Wage Count', 'Employment Count', 'Cost of Living']]
    .drop_duplicates()
    .assign(**{'Output Race': 'Multiracial'})
)

View of final `hw_output_multi` dataframe.

In [56]:
hw_output_multi.head()

Unnamed: 0,Industry,Date,Region,High Wage Count,Employment Count,Cost of Living,Output Race
0,accommodation and food services,2022-01-01,Kern,2465,24200.0,54862,Multiracial
1,accommodation and food services,2022-02-01,Kern,2495,24500.0,54862,Multiracial
2,accommodation and food services,2022-03-01,Kern,2526,24800.0,54862,Multiracial
3,accommodation and food services,2022-04-01,Kern,2526,24800.0,54862,Multiracial
4,administrative and support and waste services,2022-01-01,Kern,1797,12000.0,54862,Multiracial


### Concatenate all dataframes and export

The cells below concatenate the per-race dataframes, add each region's population, and export the result as a CSV file - change the file path in the export cell if needed.

In [57]:
# Stack the seven per-race outputs into one frame with a fresh 0..n-1 index.
hw_output_demo = pd.concat(
    [
        hw_output_white,
        hw_output_black,
        hw_output_latino,
        hw_output_asian,
        hw_output_native,
        hw_output_other,
        hw_output_multi,
    ],
    ignore_index=True,
)

In [58]:
hw_output_demo.head()

Unnamed: 0,Industry,Date,Region,High Wage Count,Employment Count,Cost of Living,Output Race
0,accommodation and food services,2022-01-01,Kern,968,24200.0,54862,White
1,accommodation and food services,2022-02-01,Kern,980,24500.0,54862,White
2,accommodation and food services,2022-03-01,Kern,992,24800.0,54862,White
3,accommodation and food services,2022-04-01,Kern,992,24800.0,54862,White
4,administrative and support and waste services,2022-01-01,Kern,6065,12000.0,54862,White


#### Add Region Population to Dataframe

In [59]:
# Total population per CERF region, summed over the counties in county_info.
# 'Population' is selected before summing so the aggregation never touches the
# string columns (County, Rural/Urban); the result is the same two-column frame.
reg_pop = county_info.groupby(by='CERF Regions')[['Population']].sum().reset_index()
reg_pop

Unnamed: 0,CERF Regions,Population
0,Bay Area,7710026
1,Central Coast,2342005
2,Central San Joaquin,1752543
3,Eastern Sierra,188734
4,Inland Empire,4560470
5,Kern,887641
6,Los Angeles,10081570
7,North State,713754
8,Northern San Joaquin,1557179
9,Orange,3168044


In [60]:
# Attach each region's total population (inner join: Region == CERF Regions),
# then drop the duplicate key and give the new column its output name.
hw_output_demo = (
    hw_output_demo
    .merge(reg_pop, left_on='Region', right_on='CERF Regions')
    .drop(columns=['CERF Regions'])
    .rename(columns={"Population": "Region Population"})
)
hw_output_demo.head()

Unnamed: 0,Industry,Date,Region,High Wage Count,Employment Count,Cost of Living,Output Race,Region Population
0,accommodation and food services,2022-01-01,Kern,968,24200.0,54862,White,887641
1,accommodation and food services,2022-02-01,Kern,980,24500.0,54862,White,887641
2,accommodation and food services,2022-03-01,Kern,992,24800.0,54862,White,887641
3,accommodation and food services,2022-04-01,Kern,992,24800.0,54862,White,887641
4,administrative and support and waste services,2022-01-01,Kern,6065,12000.0,54862,White,887641


In [61]:
# The output file is named after edd_year (not ipums_year); an existing file for that year is overwritten.
hw_output_demo.to_csv(f'data/outputs/hw_outputs_w_race_{edd_year}.csv', encoding='utf-8', index=False)

## Code for concatenating multiple years (2010-2022)

In [66]:
# hw_2010 = pd.read_csv('data/outputs/hw_outputs_w_race_2010.csv')
# hw_2011 = pd.read_csv('data/outputs/hw_outputs_w_race_2011.csv')
# hw_2012 = pd.read_csv('data/outputs/hw_outputs_w_race_2012.csv')
# hw_2013 = pd.read_csv('data/outputs/hw_outputs_w_race_2013.csv')
# hw_2014 = pd.read_csv('data/outputs/hw_outputs_w_race_2014.csv')
# hw_2015 = pd.read_csv('data/outputs/hw_outputs_w_race_2015.csv')
# hw_2016 = pd.read_csv('data/outputs/hw_outputs_w_race_2016.csv')
# hw_2017 = pd.read_csv('data/outputs/hw_outputs_w_race_2017.csv')
# hw_2018 = pd.read_csv('data/outputs/hw_outputs_w_race_2018.csv')
# hw_2019 = pd.read_csv('data/outputs/hw_outputs_w_race_2019.csv')
# hw_2020 = pd.read_csv('data/outputs/hw_outputs_w_race_2020.csv')
# hw_2021 = pd.read_csv('data/outputs/hw_outputs_w_race_2021.csv')
# hw_2022 = pd.read_csv('data/outputs/hw_outputs_w_race_2022.csv')

In [67]:
# hw_2010['Year'] = 2010
# hw_2011['Year'] = 2011
# hw_2012['Year'] = 2012
# hw_2013['Year'] = 2013
# hw_2014['Year'] = 2014
# hw_2015['Year'] = 2015
# hw_2016['Year'] = 2016
# hw_2017['Year'] = 2017
# hw_2018['Year'] = 2018
# hw_2019['Year'] = 2019
# hw_2020['Year'] = 2020
# hw_2021['Year'] = 2021
# hw_2022['Year'] = 2022

In [68]:
# hw_output_concat = pd.concat([hw_2010, hw_2011, hw_2012, hw_2013, 
#                               hw_2014, hw_2015, hw_2016, hw_2017, 
#                               hw_2018, hw_2019, hw_2020, hw_2021, 
#                               hw_2022])

In [69]:
# hw_output_concat.to_csv('data/outputs/hw_outputs_w_race_multiyear.csv', encoding='utf-8', index=False)