# Notebook for Creating High Wage Outputs with Racial Demographics Breakdown

#### This notebook is currently written to create high wage outputs for 2020.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import warnings
import os
import re
from jqi_functions import *
# NOTE(review): this silences every warning for the rest of the notebook, including
# pandas' SettingWithCopyWarning and deprecation notices - consider narrowing the filter.
warnings.filterwarnings('ignore')
# Render inline matplotlib figures in the 'retina' (high-resolution) format.
%config InlineBackend.figure_format='retina'

## Set the desired year and the corresponding cost of living year

In [2]:
# Data year to process and the cost-of-living year paired with it.
# Both are strings: they are interpolated into CSV file paths, and `year` is
# also passed to cleaned_ipums_demo, which expects a string.
year, col_year = '2020', '2019'

## Creating IPUMS dataframe

#### IPUMS Data
`cleaned_ipums_demo` is a function to generate a cleaned pandas dataframe using IPUMS data, filtering it down to California only and the desired year. The dataframe will also include the racial demographic information for each record in the dataset. The year needs to be entered in string format as a parameter.

In [3]:
# cleaned_ipums_demo is presumably supplied by the `jqi_functions` star import.
# The year is passed as a string, as the function description requires.
ca_ipums = cleaned_ipums_demo(year)

#### Cost of living needs to be updated each year.

In this case, the 2020 United Way Real Cost Measure has not been published, so I will continue using the data from 2019.

In [4]:
# Load the United Way cost-of-living table for `col_year`. Later cells rely on its
# 'Regions' and 'Cost of Living' columns.
cost_of_living = pd.read_csv(f'data/cost_of_living/united-way-col-1A1PS1C{col_year}.csv')

### Create county lookup dataframe

Expanding the `county_info` dataframe to include cost of living metrics. This dataframe is used when industry information in a geographic area is too sparse and the next largest geographic area needs to be used instead.

In [5]:
# County lookup table; only County, COUNTYFIP, Rural/Urban and CERF Regions are kept afterwards.
county_info = pd.read_csv('data/county_to_regions_key.csv')

In [6]:
# Keep only the county identifiers and the two geographic groupings.
county_info = county_info.loc[:, ['County', 'COUNTYFIP', 'Rural/Urban', 'CERF Regions']]

In [7]:
# Inner join: attach the cost-of-living row whose 'Regions' value matches each
# county's 'CERF Regions' value.
county_info = county_info.merge(cost_of_living, left_on='CERF Regions', right_on='Regions')

In [8]:
# Label the merged cost-of-living value as regional and drop the now-redundant
# join key that came from the cost-of-living table.
county_info = (
    county_info
    .rename(columns={'Cost of Living': 'Regional COL'})
    .drop(columns=['Regions'])
)

In [9]:
# county_info = pd.merge(county_info, cost_of_living, left_on = 'Rural/Urban', right_on = 'Regions')

In [10]:
# county_info = county_info.rename(columns = {'Cost of Living':'Rural/Urban COL', 'Regions_x':'Regions'})
# county_info = county_info.drop(columns=['Regions_y'])

In [11]:
# Broadcast a single cost-of-living value to every county row as the state-level figure.
# The value is read purely by position (row 13, column 1) with one .iloc call, which
# avoids integer-key lookup on a label-indexed row Series (deprecated in recent pandas).
# NOTE(review): presumably row 13 is the statewide row of the cost-of-living CSV -
# confirm whenever the file is updated, since a reordered file would silently change this.
county_info['State COL'] = cost_of_living.iloc[13, 1]

View of final `county_info` dataframe.

In [12]:
# Preview the first rows to check that 'Regional COL' and 'State COL' are present.
county_info.head()

Unnamed: 0,County,COUNTYFIP,Rural/Urban,CERF Regions,Regional COL,State COL
0,Alameda,1,Urban,Bay Area,97249,77555
1,Contra Costa,13,Urban,Bay Area,97249,77555
2,Solano,95,Urban,Bay Area,97249,77555
3,San Mateo,81,Urban,Bay Area,97249,77555
4,Santa Clara,85,Urban,Bay Area,97249,77555


In [13]:
# Inner join on COUNTYFIP: give every IPUMS record its county name, geographic
# groupings and cost-of-living thresholds.
ca_ipums = ca_ipums.merge(county_info, on='COUNTYFIP')

View of final `ca_ipums` dataframe.

In [14]:
# Preview the first rows of the merged IPUMS records.
ca_ipums.head()

Unnamed: 0,YEAR,COUNTYFIP,INDNAICS,PERWT,RACE,HISPAN,INCWAGE,NAICS Code,Industry Title,Industry,Crosswalk Value,County,Rural/Urban,CERF Regions,Regional COL,State COL
0,2020,73,928110p4,79.0,1,0,62000,928110p4,us marines,us marines,36,San Diego,Urban,San Diego-Imperial,77956,77555
1,2020,73,928110p4,64.0,1,0,24000,928110p4,us marines,us marines,36,San Diego,Urban,San Diego-Imperial,77956,77555
2,2020,73,928110p4,66.0,1,0,30000,928110p4,us marines,us marines,36,San Diego,Urban,San Diego-Imperial,77956,77555
3,2020,73,928110p4,58.0,1,0,24000,928110p4,us marines,us marines,36,San Diego,Urban,San Diego-Imperial,77956,77555
4,2020,73,928110p4,75.0,1,0,20000,928110p4,us marines,us marines,36,San Diego,Urban,San Diego-Imperial,77956,77555


## Create EDD Dataframe

#### EDD Data
The year for EDD data must be specified.

These CSV files are filtered and cleaned versions of the raw EDD Current Employment Statistics dataset. These CSV files can be created for upcoming years with the notebook `multiyear-edd-data-creation.ipynb`.

In [15]:
# Load the pre-cleaned EDD employment file for the selected `year`.
edd = pd.read_csv(f'data/edd/edd_{year}.csv')

View of final `edd` dataframe.

In [16]:
# Preview the first rows of the EDD employment data.
edd.head()

Unnamed: 0,Area Type,Area Name,Year,Month,Date,Series Code,Seasonally Adjusted,Current Employment,Industry Title,COUNTYFIP,County,Rural/Urban,CERF Regions,Crosswalk Value
0,County,Alameda,2020,January,01/01/2020,80000000,N,27400,other services,1,Alameda,Urban,Bay Area,32
1,County,Alameda,2020,February,02/01/2020,80000000,N,27700,other services,1,Alameda,Urban,Bay Area,32
2,County,Alameda,2020,March,03/01/2020,80000000,N,27200,other services,1,Alameda,Urban,Bay Area,32
3,County,Alameda,2020,April,04/01/2020,80000000,N,19300,other services,1,Alameda,Urban,Bay Area,32
4,County,Alameda,2020,May,05/01/2020,80000000,N,20100,other services,1,Alameda,Urban,Bay Area,32


## Breakdown dataframes by race

In [17]:
# Split the IPUMS records into mutually exclusive groups. Any record with a non-zero
# HISPAN value goes to the Latino group regardless of RACE; the remaining records are
# split by RACE code (1, 2, 3, 4-6, 7, 8-9).
# Each group is an explicit copy because later cells add columns to it. Without the
# copy, pandas treats the group as a slice of `ca_ipums` and raises SettingWithCopyWarning.
ca_ipums_latino = ca_ipums.loc[ca_ipums['HISPAN'] != 0].copy()
ca_ipums_no_latino = ca_ipums.loc[ca_ipums['HISPAN'] == 0]
ca_ipums_white = ca_ipums_no_latino.loc[ca_ipums_no_latino['RACE'] == 1].copy()
ca_ipums_black = ca_ipums_no_latino.loc[ca_ipums_no_latino['RACE'] == 2].copy()
ca_ipums_native = ca_ipums_no_latino.loc[ca_ipums_no_latino['RACE'] == 3].copy()
ca_ipums_asian = ca_ipums_no_latino.loc[ca_ipums_no_latino['RACE'].isin([4, 5, 6])].copy()
ca_ipums_other = ca_ipums_no_latino.loc[ca_ipums_no_latino['RACE'] == 7].copy()
ca_ipums_multi = ca_ipums_no_latino.loc[ca_ipums_no_latino['RACE'].isin([8, 9])].copy()

In [18]:
# Total person weight (PERWT) per crosswalk value across all records. It serves as
# the denominator when computing each group's share of an industry.
# One grouped sum replaces a full-frame scan and copy per crosswalk value.
wt_counts = ca_ipums.groupby('Crosswalk Value')['PERWT'].sum().to_dict()

In [19]:
def append_race_ratio(df, wt_counts):
    """Return a copy of ``df`` with a ``race_ratio`` column added.

    For each ``Crosswalk Value`` present in ``df``, ``race_ratio`` is the sum of
    ``PERWT`` over the rows of ``df`` with that value, divided by
    ``wt_counts[value]``. Every row sharing a crosswalk value gets the same ratio.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'Crosswalk Value'`` and ``'PERWT'`` columns.
    wt_counts : mapping
        Maps each crosswalk value found in ``df`` to its denominator.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched, so passing a filtered slice of
        another frame is safe.

    Raises
    ------
    KeyError
        If a crosswalk value in ``df`` has no entry in ``wt_counts``.
    """
    result = df.copy()
    codes = result['Crosswalk Value']

    # Fail loudly on unknown codes instead of silently producing NaN ratios.
    for code in codes.dropna().unique():
        if code not in wt_counts:
            raise KeyError(code)

    # Vectorised form of "sum PERWT per code, then divide by that code's total".
    # It avoids chained-indexing assignment, which pandas does not guarantee to
    # write back to the frame.
    group_weight = result['PERWT'].groupby(codes).transform('sum')
    result['race_ratio'] = group_weight / codes.map(wt_counts)
    return result

In [20]:
# Add `race_ratio`: this group's share of each crosswalk value's total PERWT.
ca_ipums_white = append_race_ratio(ca_ipums_white, wt_counts)

In [21]:
# Add `race_ratio`: this group's share of each crosswalk value's total PERWT.
ca_ipums_latino = append_race_ratio(ca_ipums_latino, wt_counts)

In [22]:
# Add `race_ratio`: this group's share of each crosswalk value's total PERWT.
ca_ipums_black = append_race_ratio(ca_ipums_black, wt_counts)

In [23]:
# Add `race_ratio`: this group's share of each crosswalk value's total PERWT.
ca_ipums_native = append_race_ratio(ca_ipums_native, wt_counts)

In [24]:
# Add `race_ratio`: this group's share of each crosswalk value's total PERWT.
ca_ipums_asian = append_race_ratio(ca_ipums_asian, wt_counts)

In [25]:
# Add `race_ratio`: this group's share of each crosswalk value's total PERWT.
ca_ipums_other = append_race_ratio(ca_ipums_other, wt_counts)

In [26]:
# Add `race_ratio`: this group's share of each crosswalk value's total PERWT.
ca_ipums_multi = append_race_ratio(ca_ipums_multi, wt_counts)

## Add High Wage Features

`add_geo_high_wages` is a function that adds the following engineered features:
- Above Threshold (Number of records above respective cost of living threshold)
- Weighted above threshold (Above Threshold multiplied by person weight variable)
- Unweighted industry counts (Number of records in that industry)
- Weighted industry counts (Sum of person weight values in that industry)
- Weighted high wage percentage (Weighted Above Threshold divided by Weighted Industry Counts as a percentage)

The features are created for the following geographical levels:
- Region
- California

In [27]:
# Apply add_geo_high_wages to each group's frame, in the same order as before,
# and bind each result to its own `ca_ipums_hw_*` name.
(
    ca_ipums_hw_white,
    ca_ipums_hw_latino,
    ca_ipums_hw_black,
    ca_ipums_hw_native,
    ca_ipums_hw_asian,
    ca_ipums_hw_other,
    ca_ipums_hw_multi,
) = [
    add_geo_high_wages(group_frame)
    for group_frame in (
        ca_ipums_white,
        ca_ipums_latino,
        ca_ipums_black,
        ca_ipums_native,
        ca_ipums_asian,
        ca_ipums_other,
        ca_ipums_multi,
    )
]

## Create High Wage Outputs Dataframe

`edd_to_hw` is the function that outputs the values needed to create the high wage output dataframe. This portion of the notebook runs through every unique combination of region, industry, and date, to get that respective output and add it to the dataframe.

Because of the nested structure of the EDD industries, only a small selection of EDD industries can be used to ensure that individuals in nested industries are not counted twice. The selection of these industries differs by region, so the series code of each industry is documented in the `region_series_codes` global variable in the `jqi_functions.py` library. Each of these codes was then assigned its own crosswalk value, which aligns with the crosswalk value assigned to each IPUMS industry. For generating high wage outputs, we only iterate through the EDD industries that have been selected and have a designated crosswalk value.

Getting unique values for each region, industry, and date.

In [28]:
# Distinct CERF regions present in the IPUMS records; drives the outer loop below.
regions_ipums = ca_ipums['CERF Regions'].unique()

In [29]:
# Sorted list of the distinct crosswalk values present in the EDD data.
crosswalk_vals = sorted(edd['Crosswalk Value'].unique())

In [30]:
# Distinct values of the EDD 'Date' column, in order of first appearance.
dates_edd = edd['Date'].unique()

In [31]:
# Number of (region, crosswalk value, date) combinations each per-race loop visits;
# used only for progress reporting.
total_iterations = len(regions_ipums) * len(crosswalk_vals) * len(dates_edd)

#### White

In [32]:
# Fresh, independent accumulators for this group's loop results.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

For loop to populate lists for the high wage outputs. This will take some time to finish running.

In [None]:
# Reset the accumulators here so that re-running this cell cannot append a second
# copy of the results onto lists left over from an earlier run.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

progress_count = 0
last_reported = 0  # highest multiple of 10 percent already printed
for region in regions_ipums:
    for code in crosswalk_vals:
        for date in dates_edd:
            # hw_perc is returned by edd_to_hw but is not used in this notebook.
            # NOTE(review): the final argument (10) is passed straight through to
            # edd_to_hw; its meaning is defined in jqi_functions - confirm before changing.
            hw_count, hw_perc, employment_count, industry = edd_to_hw(edd, ca_ipums_hw_white, region, code, date, 10)
            industries.append(industry)
            dates.append(date)
            regions.append(region)
            counts.append(hw_count)
            emp_counts.append(employment_count)
            progress_count += 1
            # Integer arithmetic reports each 10% step exactly once, even when
            # total_iterations is not a multiple of 10.
            percent_done = (progress_count * 10 // total_iterations) * 10
            if percent_done > last_reported:
                last_reported = percent_done
                print(f'Progress: {percent_done}% Complete')

Progress: 10% Complete
Progress: 20% Complete
Progress: 30% Complete
Progress: 40% Complete
Progress: 50% Complete
Progress: 60% Complete


Creating a cleaned dataframe from the output lists.

In [None]:
# Column-wise view of the loop results for this group.
df_dict_white = {
    'Industry': industries,
    'Date': dates,
    'Region': regions,
    'High Wage Count': counts,
    'Employment Count': emp_counts,
}
hw_output_white = (
    pd.DataFrame(df_dict_white)
    # Keep only the combinations for which an industry was returned.
    .loc[lambda frame: frame['Industry'].notna()]
    .assign(**{
        'Date': lambda frame: pd.to_datetime(frame['Date']),
        'High Wage Count': lambda frame: frame['High Wage Count'].astype(int),
    })
    .sort_values(by=['Industry', 'Region', 'Date'])
    # Attach the cost of living for each region.
    .merge(cost_of_living, left_on='Region', right_on='Regions')
    [['Industry', 'Date', 'Region', 'High Wage Count', 'Employment Count', 'Cost of Living']]
    .drop_duplicates()
    .assign(**{'Output Race': 'White'})
)

View of final `hw_output_white` dataframe.

In [None]:
# Preview the first rows of this group's high wage output.
hw_output_white.head()

#### Latino

In [None]:
# Fresh, independent accumulators for this group's loop results.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

For loop to populate lists for the high wage outputs. This will take some time to finish running.

In [None]:
# Reset the accumulators here so that re-running this cell cannot append a second
# copy of the results onto lists left over from an earlier run.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

progress_count = 0
last_reported = 0  # highest multiple of 10 percent already printed
for region in regions_ipums:
    for code in crosswalk_vals:
        for date in dates_edd:
            # hw_perc is returned by edd_to_hw but is not used in this notebook.
            # NOTE(review): the final argument (10) is passed straight through to
            # edd_to_hw; its meaning is defined in jqi_functions - confirm before changing.
            hw_count, hw_perc, employment_count, industry = edd_to_hw(edd, ca_ipums_hw_latino, region, code, date, 10)
            industries.append(industry)
            dates.append(date)
            regions.append(region)
            counts.append(hw_count)
            emp_counts.append(employment_count)
            progress_count += 1
            # Integer arithmetic reports each 10% step exactly once, even when
            # total_iterations is not a multiple of 10.
            percent_done = (progress_count * 10 // total_iterations) * 10
            if percent_done > last_reported:
                last_reported = percent_done
                print(f'Progress: {percent_done}% Complete')

Creating a cleaned dataframe from the output lists.

In [None]:
# Column-wise view of the loop results for this group.
df_dict_latino = {
    'Industry': industries,
    'Date': dates,
    'Region': regions,
    'High Wage Count': counts,
    'Employment Count': emp_counts,
}
hw_output_latino = (
    pd.DataFrame(df_dict_latino)
    # Keep only the combinations for which an industry was returned.
    .loc[lambda frame: frame['Industry'].notna()]
    .assign(**{
        'Date': lambda frame: pd.to_datetime(frame['Date']),
        'High Wage Count': lambda frame: frame['High Wage Count'].astype(int),
    })
    .sort_values(by=['Industry', 'Region', 'Date'])
    # Attach the cost of living for each region.
    .merge(cost_of_living, left_on='Region', right_on='Regions')
    [['Industry', 'Date', 'Region', 'High Wage Count', 'Employment Count', 'Cost of Living']]
    .drop_duplicates()
    .assign(**{'Output Race': 'Latino'})
)

View of final `hw_output_latino` dataframe.

In [None]:
# Preview the first rows of this group's high wage output.
hw_output_latino.head()

#### Black

In [None]:
# Fresh, independent accumulators for this group's loop results.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

For loop to populate lists for the high wage outputs. This will take some time to finish running.

In [None]:
# Reset the accumulators here so that re-running this cell cannot append a second
# copy of the results onto lists left over from an earlier run.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

progress_count = 0
last_reported = 0  # highest multiple of 10 percent already printed
for region in regions_ipums:
    for code in crosswalk_vals:
        for date in dates_edd:
            # hw_perc is returned by edd_to_hw but is not used in this notebook.
            # NOTE(review): the final argument (10) is passed straight through to
            # edd_to_hw; its meaning is defined in jqi_functions - confirm before changing.
            hw_count, hw_perc, employment_count, industry = edd_to_hw(edd, ca_ipums_hw_black, region, code, date, 10)
            industries.append(industry)
            dates.append(date)
            regions.append(region)
            counts.append(hw_count)
            emp_counts.append(employment_count)
            progress_count += 1
            # Integer arithmetic reports each 10% step exactly once, even when
            # total_iterations is not a multiple of 10.
            percent_done = (progress_count * 10 // total_iterations) * 10
            if percent_done > last_reported:
                last_reported = percent_done
                print(f'Progress: {percent_done}% Complete')

Creating a cleaned dataframe from the output lists.

In [None]:
# Column-wise view of the loop results for this group.
df_dict_black = {
    'Industry': industries,
    'Date': dates,
    'Region': regions,
    'High Wage Count': counts,
    'Employment Count': emp_counts,
}
hw_output_black = (
    pd.DataFrame(df_dict_black)
    # Keep only the combinations for which an industry was returned.
    .loc[lambda frame: frame['Industry'].notna()]
    .assign(**{
        'Date': lambda frame: pd.to_datetime(frame['Date']),
        'High Wage Count': lambda frame: frame['High Wage Count'].astype(int),
    })
    .sort_values(by=['Industry', 'Region', 'Date'])
    # Attach the cost of living for each region.
    .merge(cost_of_living, left_on='Region', right_on='Regions')
    [['Industry', 'Date', 'Region', 'High Wage Count', 'Employment Count', 'Cost of Living']]
    .drop_duplicates()
    .assign(**{'Output Race': 'Black'})
)

View of final `hw_output_black` dataframe.

In [None]:
# Preview the first rows of this group's high wage output.
hw_output_black.head()

#### American Indian/Alaska Native

In [None]:
# Fresh, independent accumulators for this group's loop results.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

For loop to populate lists for the high wage outputs. This will take some time to finish running.

In [None]:
# Reset the accumulators here so that re-running this cell cannot append a second
# copy of the results onto lists left over from an earlier run.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

progress_count = 0
last_reported = 0  # highest multiple of 10 percent already printed
for region in regions_ipums:
    for code in crosswalk_vals:
        for date in dates_edd:
            # hw_perc is returned by edd_to_hw but is not used in this notebook.
            # NOTE(review): the final argument (10) is passed straight through to
            # edd_to_hw; its meaning is defined in jqi_functions - confirm before changing.
            hw_count, hw_perc, employment_count, industry = edd_to_hw(edd, ca_ipums_hw_native, region, code, date, 10)
            industries.append(industry)
            dates.append(date)
            regions.append(region)
            counts.append(hw_count)
            emp_counts.append(employment_count)
            progress_count += 1
            # Integer arithmetic reports each 10% step exactly once, even when
            # total_iterations is not a multiple of 10.
            percent_done = (progress_count * 10 // total_iterations) * 10
            if percent_done > last_reported:
                last_reported = percent_done
                print(f'Progress: {percent_done}% Complete')

Creating a cleaned dataframe from the output lists.

In [None]:
# Column-wise view of the loop results for this group.
df_dict_native = {
    'Industry': industries,
    'Date': dates,
    'Region': regions,
    'High Wage Count': counts,
    'Employment Count': emp_counts,
}
hw_output_native = (
    pd.DataFrame(df_dict_native)
    # Keep only the combinations for which an industry was returned.
    .loc[lambda frame: frame['Industry'].notna()]
    .assign(**{
        'Date': lambda frame: pd.to_datetime(frame['Date']),
        'High Wage Count': lambda frame: frame['High Wage Count'].astype(int),
    })
    .sort_values(by=['Industry', 'Region', 'Date'])
    # Attach the cost of living for each region.
    .merge(cost_of_living, left_on='Region', right_on='Regions')
    [['Industry', 'Date', 'Region', 'High Wage Count', 'Employment Count', 'Cost of Living']]
    .drop_duplicates()
    .assign(**{'Output Race': 'American Indian/Alaska Native'})
)

View of final `hw_output_native` dataframe.

In [None]:
# Preview the first rows of this group's high wage output.
hw_output_native.head()

#### Asian

In [None]:
# Fresh, independent accumulators for this group's loop results.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

For loop to populate lists for the high wage outputs. This will take some time to finish running.

In [None]:
# Reset the accumulators here so that re-running this cell cannot append a second
# copy of the results onto lists left over from an earlier run.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

progress_count = 0
last_reported = 0  # highest multiple of 10 percent already printed
for region in regions_ipums:
    for code in crosswalk_vals:
        for date in dates_edd:
            # hw_perc is returned by edd_to_hw but is not used in this notebook.
            # NOTE(review): the final argument (10) is passed straight through to
            # edd_to_hw; its meaning is defined in jqi_functions - confirm before changing.
            hw_count, hw_perc, employment_count, industry = edd_to_hw(edd, ca_ipums_hw_asian, region, code, date, 10)
            industries.append(industry)
            dates.append(date)
            regions.append(region)
            counts.append(hw_count)
            emp_counts.append(employment_count)
            progress_count += 1
            # Integer arithmetic reports each 10% step exactly once, even when
            # total_iterations is not a multiple of 10.
            percent_done = (progress_count * 10 // total_iterations) * 10
            if percent_done > last_reported:
                last_reported = percent_done
                print(f'Progress: {percent_done}% Complete')

Creating a cleaned dataframe from the output lists.

In [None]:
# Column-wise view of the loop results for this group.
df_dict_asian = {
    'Industry': industries,
    'Date': dates,
    'Region': regions,
    'High Wage Count': counts,
    'Employment Count': emp_counts,
}
hw_output_asian = (
    pd.DataFrame(df_dict_asian)
    # Keep only the combinations for which an industry was returned.
    .loc[lambda frame: frame['Industry'].notna()]
    .assign(**{
        'Date': lambda frame: pd.to_datetime(frame['Date']),
        'High Wage Count': lambda frame: frame['High Wage Count'].astype(int),
    })
    .sort_values(by=['Industry', 'Region', 'Date'])
    # Attach the cost of living for each region.
    .merge(cost_of_living, left_on='Region', right_on='Regions')
    [['Industry', 'Date', 'Region', 'High Wage Count', 'Employment Count', 'Cost of Living']]
    .drop_duplicates()
    .assign(**{'Output Race': 'Asian'})
)

View of final `hw_output_asian` dataframe.

In [None]:
# Preview the first rows of this group's high wage output.
hw_output_asian.head()

#### Some other race

In [None]:
# Fresh, independent accumulators for this group's loop results.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

For loop to populate lists for the high wage outputs. This will take some time to finish running.

In [None]:
# Reset the accumulators here so that re-running this cell cannot append a second
# copy of the results onto lists left over from an earlier run.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

progress_count = 0
last_reported = 0  # highest multiple of 10 percent already printed
for region in regions_ipums:
    for code in crosswalk_vals:
        for date in dates_edd:
            # hw_perc is returned by edd_to_hw but is not used in this notebook.
            # NOTE(review): the final argument (10) is passed straight through to
            # edd_to_hw; its meaning is defined in jqi_functions - confirm before changing.
            hw_count, hw_perc, employment_count, industry = edd_to_hw(edd, ca_ipums_hw_other, region, code, date, 10)
            industries.append(industry)
            dates.append(date)
            regions.append(region)
            counts.append(hw_count)
            emp_counts.append(employment_count)
            progress_count += 1
            # Integer arithmetic reports each 10% step exactly once, even when
            # total_iterations is not a multiple of 10.
            percent_done = (progress_count * 10 // total_iterations) * 10
            if percent_done > last_reported:
                last_reported = percent_done
                print(f'Progress: {percent_done}% Complete')

Creating a cleaned dataframe from the output lists.

In [None]:
# Column-wise view of the loop results for this group.
df_dict_other = {
    'Industry': industries,
    'Date': dates,
    'Region': regions,
    'High Wage Count': counts,
    'Employment Count': emp_counts,
}
hw_output_other = (
    pd.DataFrame(df_dict_other)
    # Keep only the combinations for which an industry was returned.
    .loc[lambda frame: frame['Industry'].notna()]
    .assign(**{
        'Date': lambda frame: pd.to_datetime(frame['Date']),
        'High Wage Count': lambda frame: frame['High Wage Count'].astype(int),
    })
    .sort_values(by=['Industry', 'Region', 'Date'])
    # Attach the cost of living for each region.
    .merge(cost_of_living, left_on='Region', right_on='Regions')
    [['Industry', 'Date', 'Region', 'High Wage Count', 'Employment Count', 'Cost of Living']]
    .drop_duplicates()
    .assign(**{'Output Race': 'Some other race'})
)

View of final `hw_output_other` dataframe.

In [None]:
# Preview the first rows of this group's high wage output.
hw_output_other.head()

#### Multiracial

In [None]:
# Fresh, independent accumulators for this group's loop results.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

For loop to populate lists for the high wage outputs. This will take some time to finish running.

In [None]:
# Reset the accumulators here so that re-running this cell cannot append a second
# copy of the results onto lists left over from an earlier run.
industries, dates, regions, counts, emp_counts = [], [], [], [], []

progress_count = 0
last_reported = 0  # highest multiple of 10 percent already printed
for region in regions_ipums:
    for code in crosswalk_vals:
        for date in dates_edd:
            # hw_perc is returned by edd_to_hw but is not used in this notebook.
            # NOTE(review): the final argument (10) is passed straight through to
            # edd_to_hw; its meaning is defined in jqi_functions - confirm before changing.
            hw_count, hw_perc, employment_count, industry = edd_to_hw(edd, ca_ipums_hw_multi, region, code, date, 10)
            industries.append(industry)
            dates.append(date)
            regions.append(region)
            counts.append(hw_count)
            emp_counts.append(employment_count)
            progress_count += 1
            # Integer arithmetic reports each 10% step exactly once, even when
            # total_iterations is not a multiple of 10.
            percent_done = (progress_count * 10 // total_iterations) * 10
            if percent_done > last_reported:
                last_reported = percent_done
                print(f'Progress: {percent_done}% Complete')

Creating a cleaned dataframe from the output lists.

In [None]:
# Column-wise view of the loop results for this group.
df_dict_multi = {
    'Industry': industries,
    'Date': dates,
    'Region': regions,
    'High Wage Count': counts,
    'Employment Count': emp_counts,
}
hw_output_multi = (
    pd.DataFrame(df_dict_multi)
    # Keep only the combinations for which an industry was returned.
    .loc[lambda frame: frame['Industry'].notna()]
    .assign(**{
        'Date': lambda frame: pd.to_datetime(frame['Date']),
        'High Wage Count': lambda frame: frame['High Wage Count'].astype(int),
    })
    .sort_values(by=['Industry', 'Region', 'Date'])
    # Attach the cost of living for each region.
    .merge(cost_of_living, left_on='Region', right_on='Regions')
    [['Industry', 'Date', 'Region', 'High Wage Count', 'Employment Count', 'Cost of Living']]
    .drop_duplicates()
    .assign(**{'Output Race': 'Multiracial'})
)

View of final `hw_output_multi` dataframe.

In [None]:
# Preview the first rows of this group's high wage output.
hw_output_multi.head()

### Concatenate all dataframes and export

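The cells below concatenate the per-race outputs into a single dataframe and preview it. The final cell exports the dataframe as a CSV file - change the file path if needed and uncomment it to run.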

In [None]:
# Stack the seven per-race outputs into one frame. ignore_index=True renumbers the rows
# so index values from the individual frames do not repeat.
hw_output_demo = pd.concat([hw_output_white, hw_output_black, hw_output_latino, hw_output_asian, hw_output_native, hw_output_other, hw_output_multi], ignore_index=True)

In [None]:
# Preview the first rows of the combined output.
hw_output_demo.head()

In [None]:
# hw_output_demo.to_csv(f'data/outputs/hw_outputs_w_race_{year}.csv', encoding='utf-8', index=False)