# JQI Outputs with Demographics Breakdown

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import warnings
import os
import re
from jqi_functions import *
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format='retina'

## Creating CSV/dataframe

In [2]:
def cleaned_ipums_demo(year: str):
    cwd = os.getcwd()
    if int(year) < 2015:
        print('Invalid year')
        return None
    ipums = pd.read_csv(f'{cwd}/data/IPUMS_{year}.csv')
    ipums = ipums[['YEAR','STATEFIP', 'COUNTYFIP', 'INDNAICS','PERWT','RACE','HISPAN','INCWAGE']]
    ca_ipums = ipums.loc[ipums['STATEFIP'] == 6].copy().reset_index().iloc[:,1:]
    ipums_titles = pd.read_csv(f'{cwd}/data/ind_indnaics_crosswalk_2000_onward_without_code_descriptions.csv')
    ipums_titles = ipums_titles.iloc[2:]
    ipums_titles = ipums_titles.iloc[:,9:]
    ipums_titles['2018 Onward ACS/PRCS INDNAICS CODE'] = normalize_titles(ipums_titles['2018 Onward ACS/PRCS INDNAICS CODE'])
    ipums_titles['2013-2017 ACS/PRCS INDNAICS CODE'] = normalize_titles(ipums_titles['2013-2017 ACS/PRCS INDNAICS CODE'])
    ca_ipums['INDNAICS'] = normalize_titles(ca_ipums['INDNAICS'])
    if int(year) >= 2018:
        ipums_titles = ipums_titles[['2018 Onward ACS/PRCS INDNAICS CODE', 'Industry Title']]
        merged_ipums = pd.merge(ca_ipums, ipums_titles, left_on = 'INDNAICS', right_on = '2018 Onward ACS/PRCS INDNAICS CODE')
        merged_ipums = merged_ipums.rename(columns={"2018 Onward ACS/PRCS INDNAICS CODE": "NAICS Code"})
    else:
        ipums_titles = ipums_titles[['2013-2017 ACS/PRCS INDNAICS CODE', 'Industry Title']]
        merged_ipums = pd.merge(ca_ipums, ipums_titles, left_on = 'INDNAICS', right_on = '2013-2017 ACS/PRCS INDNAICS CODE')
        merged_ipums = merged_ipums.rename(columns={"2013-2017 ACS/PRCS INDNAICS CODE": "NAICS Code"})
    naics_parsed_crosswalk = pd.read_csv(f'{cwd}/data/naics_parsed_crosswalk.csv').drop_duplicates(subset='INDNAICS').reset_index().iloc[:,1:]
    merged_ipums = pd.merge(merged_ipums, naics_parsed_crosswalk, on='INDNAICS')
    return merged_ipums

In [3]:
ca_ipums = cleaned_ipums_demo('2019')

In [5]:
county_info = pd.read_csv('data/county_to_regions_key - Sheet1.csv')

In [6]:
cost_of_living = pd.read_csv('data/united-way-col-1A1PS1C2019.csv') # 1 adult, 1 preschooler, 1 child

In [7]:
ca_ipums = pd.merge(ca_ipums, county_info, on = 'COUNTYFIP')

In [8]:
len(ca_ipums['County'].unique()) # only 34 California counties present in the IPUMS data

34

In [9]:
ca_ipums = ca_ipums[['INDNAICS', 'PERWT', 'INCWAGE', 'RACE', 'HISPAN',
       'NAICS Code', 'Industry Title_x', 'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code', 'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions']]

In [10]:
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'CDI Regions', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'Regional COL'})

In [11]:
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'Rural/Urban', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'Rural/Urban COL'})

In [12]:
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'County', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'County COL'})

In [13]:
ca_ipums = ca_ipums[['INDNAICS', 'PERWT', 'INCWAGE', 'RACE', 'HISPAN',
       'NAICS Code', 'Industry Title_x', 'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code', 'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions',
                    'Regional COL', 'Rural/Urban COL', 'County COL']]

In [14]:
ca_ipums['Regional Rural/Urban'] = ca_ipums['CDI Regions'] + ' ' + ca_ipums['Rural/Urban']

In [15]:
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'Regional Rural/Urban', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'Regional Rural/Urban COL'})

In [16]:
ca_ipums = ca_ipums.rename(columns = {'Industry Title_x':'Industry Title'})
ca_ipums = ca_ipums[['INDNAICS', 'PERWT', 'INCWAGE', 'RACE', 'HISPAN',
       'NAICS Code', 'Industry Title', 'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code',
       'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions', 'Regional Rural/Urban',
                    'Regional COL', 'Rural/Urban COL', 'County COL', 'Regional Rural/Urban COL']]

In [17]:
ca_ipums['Industry Title'] = normalize_titles(ca_ipums['Industry Title'])

In [95]:
ca_ipums_test = pd.read_csv('data/IPUMS_2019.csv')
ca_ipums_test = ca_ipums_test[['STATEFIP', 
               'COUNTYFIP',
              'INDNAICS',
              'PERWT',
              'INCWAGE']]
ca_ipums_test = ca_ipums_test.loc[ca_ipums_test['STATEFIP'] == 6].copy()

In [97]:
ca_ipums_test['PERWT'].sum()

39512223.0

In [98]:
ca_ipums['PERWT'].sum()

49249536.0

## Create county lookup dataframe

In [19]:
county_info = county_info[['County', 'Rural/Urban', 'CDI Regions']]

In [20]:
county_info['Regional Rural/Urban'] = county_info['CDI Regions'] + ' ' + county_info['Rural/Urban']

In [21]:
county_info = pd.merge(county_info, cost_of_living, left_on = 'County', right_on = 'Regions')

In [22]:
county_info = county_info.rename(columns = {'Cost of Living':'County COL'})
county_info = county_info.drop(columns=['Regions'])

In [23]:
county_info = pd.merge(county_info, cost_of_living, left_on = 'Regional Rural/Urban', right_on = 'Regions')

In [24]:
county_info = county_info.rename(columns = {'Cost of Living':'Regional Rural/Urban COL'})
county_info = county_info.drop(columns=['Regions'])

In [25]:
county_info = pd.merge(county_info, cost_of_living, left_on = 'CDI Regions', right_on = 'Regions')

In [26]:
county_info = county_info.rename(columns = {'Cost of Living':'Regional COL'})
county_info = county_info.drop(columns=['Regions'])

In [27]:
county_info = pd.merge(county_info, cost_of_living, left_on = 'Rural/Urban', right_on = 'Regions')

In [28]:
county_info = county_info.rename(columns = {'Cost of Living':'Rural/Urban COL'})
county_info = county_info.drop(columns=['Regions'])

In [29]:
county_info['State COL'] = cost_of_living.iloc[11][1]

In [30]:
county_info

Unnamed: 0,County,Rural/Urban,CDI Regions,Regional Rural/Urban,County COL,Regional Rural/Urban COL,Regional COL,Rural/Urban COL,State COL
0,Alameda,Urban,Bay Area,Bay Area Urban,88296,94329,93392,79472,74448
1,Contra Costa,Urban,Bay Area,Bay Area Urban,86284,94329,93392,79472,74448
2,Solano,Urban,Bay Area,Bay Area Urban,66751,94329,93392,79472,74448
3,San Mateo,Urban,Bay Area,Bay Area Urban,112606,94329,93392,79472,74448
4,Santa Clara,Urban,Bay Area,Bay Area Urban,107879,94329,93392,79472,74448
...,...,...,...,...,...,...,...,...,...
67,Placer,Rural,Sacramento,Sacramento Rural,63983,64653,64520,58812,74448
68,Placer,Rural,Sacramento,Sacramento Rural,63983,64653,63126,58812,74448
69,Sutter,Rural,Sacramento,Sacramento Rural,55477,64653,64520,58812,74448
70,Sutter,Rural,Sacramento,Sacramento Rural,55477,64653,63126,58812,74448


## Merge with EDD Data

In [31]:
edd = pd.read_csv('data/edd_merged.csv')

In [32]:
edd['Area Name'] = edd['Area Name'].str.replace(' County', '')
edd = edd.loc[edd['Area Type'] == 'County']
edd = edd.drop(columns=['Industry Title'])
edd = edd.rename(columns={"LMID Industry Title": "Industry Title"})

In [33]:
edd['Sub_1_Code'] = [str(x) for x in edd['Sub_1_Code']]
edd['Main_Code'] = [str(x) for x in edd['Main_Code']]

In [34]:
edd.head()

Unnamed: 0,Industry Title,Parsed_Code,Area Type,Area Name,Date,Seasonally Adjusted,Current Employment,Main_EDD,Main_Code,Sub_1,Sub_1_Code,Sub_2,Sub_2_Code,Sub_3,Sub_3_Code,Sub_4,Sub_4_Code
0,county,939,County,Madera,3/1/19,N,1600,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
1,county,939,County,Fresno,1/1/19,N,7800,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
2,county,939,County,Kern,1/1/19,N,9900,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
3,county,939,County,Los Angeles,1/1/19,N,106800,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
4,county,939,County,Madera,1/1/19,N,1600,government,900,state and local government,940,local government,930,local government excluding education,932,county,939


## Load NAICS Crosswalk

In [35]:
naics = pd.read_csv('data/naics_parsed_crosswalk.csv').drop_duplicates(subset='INDNAICS').reset_index().iloc[:,1:]

In [36]:
naics['Industry Title'] = normalize_titles(naics['Industry Title'])
naics['Sub_1_Code'] = [str(x) for x in naics['Sub_1_Code']]
naics['Main_Code'] = [str(x) for x in naics['Main_Code']]

## Breakdown dataframes by race

In [135]:
ca_ipums_latino = ca_ipums.loc[ca_ipums['HISPAN'] != 0]
ca_ipums_no_latino = ca_ipums.loc[ca_ipums['HISPAN'] == 0]
ca_ipums_white = ca_ipums_no_latino.loc[ca_ipums_no_latino['RACE'] == 1]
ca_ipums_black = ca_ipums_no_latino.loc[ca_ipums_no_latino['RACE'] == 2]
ca_ipums_native = ca_ipums_no_latino.loc[ca_ipums_no_latino['RACE'] == 3]
ca_ipums_asian = ca_ipums_no_latino.loc[(ca_ipums_no_latino['RACE'] == 4) |
                                   (ca_ipums_no_latino['RACE'] == 5) |
                                   (ca_ipums_no_latino['RACE'] == 6)]
ca_ipums_other = ca_ipums_no_latino.loc[ca_ipums_no_latino['RACE'] == 7]
ca_ipums_multi = ca_ipums_no_latino.loc[(ca_ipums_no_latino['RACE'] == 8) |
                                   (ca_ipums_no_latino['RACE'] == 9)]

In [136]:
wt_counts = {}
for title in ca_ipums['Industry Title'].unique():
    title_copy = ca_ipums.loc[ca_ipums['Industry Title'] == title].copy()
    wt_counts[title] = title_copy['PERWT'].sum()

In [173]:
def append_race_ratio(df, wt_counts):
    df['race_ratio'] = 0
    for title in df['Industry Title'].unique():
        perwt = df['PERWT'][df['Industry Title'] == title].sum()
        df['race_ratio'][df['Industry Title'] == title] = perwt / wt_counts[title]
    return df

In [174]:
ca_ipums_white = append_race_ratio(ca_ipums_white, wt_counts)

In [176]:
ca_ipums_latino = append_race_ratio(ca_ipums_latino, wt_counts)

In [177]:
ca_ipums_black = append_race_ratio(ca_ipums_black, wt_counts)

In [178]:
ca_ipums_native = append_race_ratio(ca_ipums_native, wt_counts)

In [179]:
ca_ipums_asian = append_race_ratio(ca_ipums_asian, wt_counts)

In [180]:
ca_ipums_other = append_race_ratio(ca_ipums_other, wt_counts)

In [181]:
ca_ipums_multi = append_race_ratio(ca_ipums_multi, wt_counts)

## Sample size IPUMS function

In [148]:
def add_to_state_df(df):
    df['Above CA Threshold'] = df['INCWAGE'] > 74448
    df['wt_ind_counts'] = df['PERWT'].groupby(df['INDNAICS']).transform('sum')
    df["Above CA Threshold"] = df["Above CA Threshold"].astype(int)
    df["wt_CA_above_thresh"] = df["Above CA Threshold"] * df['PERWT']
    df['wt_CA_high_wage_count'] = df['wt_CA_above_thresh'].groupby(
        df['INDNAICS']).transform('sum')
    df['wt_CA_high_wage_perc'] = (df['wt_CA_high_wage_count'] / df['wt_ind_counts']) * 100
    df['unwt_ind_counts'] = df['INDNAICS'].groupby(df['INDNAICS']).transform('count')
    return df

#### County Level

In [149]:
def add_to_county_df(df):
    df['above_county_thresh'] = df['INCWAGE'] > df['County COL']
    df["above_county_thresh"] = df["above_county_thresh"].astype(int)
    df["wt_county_above_thresh"] = df["above_county_thresh"] * df['PERWT']
    df_agg = df.groupby(['INDNAICS','County']).agg(wt_county_ind_counts = ('PERWT','sum'),
                                                     wt_county_hw_count = ('wt_county_above_thresh','sum'),
                                                     unwt_county_ind_counts = ('INDNAICS','count')).reset_index()    
    
    df = pd.merge(df, df_agg, on=['INDNAICS', 'County'])
    df['wt_county_hw_perc'] = (df['wt_county_hw_count'] / df['wt_county_ind_counts']) * 100
    df = df.rename(columns={"County_x": "County", 'wt_county_hw_count_x':'wt_county_hw_count','wt_county_ind_counts_x':'wt_county_ind_counts',
                           'unwt_county_ind_counts_x':'unwt_county_ind_counts'})
    return df

#### Regional Level

In [150]:
def add_to_region_df(df): 
    df['above_region_thresh'] = df['INCWAGE'] > df['Regional COL']
    df["above_region_thresh"] = df["above_region_thresh"].astype(int)
    df["wt_reg_above_thresh"] = df["above_region_thresh"] * df['PERWT']
    df_agg = df.groupby(['INDNAICS','CDI Regions']).agg(wt_reg_ind_counts = ('PERWT','sum'),
                                                     wt_reg_high_wage_count = ('wt_reg_above_thresh','sum'),
                                                     unwt_reg_ind_counts = ('INDNAICS','count')).reset_index()    
    
    df = pd.merge(df, df_agg, on=['INDNAICS', 'CDI Regions'])
    df['wt_reg_high_wage_perc'] = (df['wt_reg_high_wage_count'] / df['wt_reg_ind_counts']) * 100
    df = df.rename(columns={"CDI Regions_x": "CDI Regions", 'wt_reg_high_wage_count_x':'wt_reg_high_wage_count','wt_reg_ind_counts_x':'wt_reg_ind_counts',
                           'unwt_reg_ind_counts_x':'unwt_reg_ind_counts'})
    return df

#### Regional Rural/Urban Level

In [151]:
def add_to_regioncomm_df(df):
    df['above_regcomm_thresh'] = df['INCWAGE'] > df['Regional Rural/Urban COL']
    df["above_regcomm_thresh"] = df["above_regcomm_thresh"].astype(int)
    df["wt_regcomm_above_thresh"] = df["above_regcomm_thresh"] * df['PERWT']
    df_agg = df.groupby(['INDNAICS','Regional Rural/Urban']).agg(wt_regcomm_ind_counts = ('PERWT','sum'),
                                                     wt_regcomm_hw_count = ('wt_regcomm_above_thresh','sum'),
                                                     unwt_regcomm_ind_counts = ('INDNAICS','count')).reset_index()    
    
    df = pd.merge(df, df_agg, on=['INDNAICS', 'Regional Rural/Urban'])
    df['wt_regcomm_hw_perc'] = (df['wt_regcomm_hw_count'] / df['wt_regcomm_ind_counts']) * 100
    df = df.rename(columns={"Regional Rural/Urban_x": "Regional Rural/Urban", 'wt_regcomm_hw_count_x':'wt_regcomm_hw_count','wt_regcomm_ind_counts_x':'wt_regcomm_ind_counts',
                           'unwt_regcomm_ind_counts_x':'unwt_regcomm_ind_counts'})
    return df

#### Rural/Urban level

In [152]:
def add_to_community_df(df):
    df['above_comm_thresh'] = df['INCWAGE'] > df['Rural/Urban COL']
    df["above_comm_thresh"] = df["above_comm_thresh"].astype(int)
    df["wt_comm_above_thresh"] = df["above_comm_thresh"] * df['PERWT']
    df_agg = df.groupby(['INDNAICS','Rural/Urban']).agg(wt_comm_ind_counts = ('PERWT','sum'),
                                                     wt_comm_high_wage_count = ('wt_comm_above_thresh','sum'),
                                                     unwt_comm_ind_counts = ('INDNAICS','count')).reset_index()    
    
    df = pd.merge(df, df_agg, on=['INDNAICS', 'Rural/Urban'])
    df['wt_comm_high_wage_perc'] = (df['wt_comm_high_wage_count'] / df['wt_comm_ind_counts']) * 100
    df = df.rename(columns={"Rural/Urban_x": "Rural/Urban", 'wt_comm_high_wage_count_x':'wt_comm_high_wage_count','wt_comm_ind_counts_x':'wt_comm_ind_counts',
                           'unwt_comm_ind_counts_x':'unwt_comm_ind_counts'})
    return df

#### Add all levels

In [153]:
def add_geo_high_wages(df):
    df_new = df.copy() # initialize new dataframe
    df_new = add_to_state_df(df_new) # creating state level counts
    df_new = add_to_community_df(df_new) # creating rural/urban level counts
    df_new = add_to_region_df(df_new) # creating regional level counts
    df_new = add_to_regioncomm_df(df_new) # creating regional rural/urban level counts
    df_new = add_to_county_df(df_new) # creating county level counts
    return df_new

In [182]:
ca_ipums_hw_white = add_geo_high_wages(ca_ipums_white)
ca_ipums_hw_latino = add_geo_high_wages(ca_ipums_latino)
ca_ipums_hw_black = add_geo_high_wages(ca_ipums_black)
ca_ipums_hw_native = add_geo_high_wages(ca_ipums_native)
ca_ipums_hw_asian = add_geo_high_wages(ca_ipums_asian)
ca_ipums_hw_other = add_geo_high_wages(ca_ipums_other)
ca_ipums_hw_multi = add_geo_high_wages(ca_ipums_multi)

In [183]:
race_ipums_dfs = [ca_ipums_hw_white, ca_ipums_hw_latino, ca_ipums_hw_black, ca_ipums_hw_native, ca_ipums_hw_asian, ca_ipums_hw_other, ca_ipums_hw_multi]

In [184]:
for df in race_ipums_dfs:
    df['Sub_1_Code'] = [str(x) for x in df['Sub_1_Code']]
    df['Main_Code'] = [str(x) for x in df['Main_Code']]

## EDD Hierarchy Function

In [157]:
ca_ipums_hw_white.head().T

Unnamed: 0,0,1,2,3,4
INDNAICS,4853,4853,4853,4853,4853
PERWT,35.0,35.0,35.0,35.0,58.0
INCWAGE,0,0,0,0,0
RACE,1,1,1,1,1
HISPAN,0,0,0,0,0
NAICS Code,4853,4853,4853,4853,4853
Industry Title,taxi and limousine service,taxi and limousine service,taxi and limousine service,taxi and limousine service,taxi and limousine service
Main_Code,400,400,400,400,400
Sub_1_Code,430,430,430,430,430
Sub_2_Code,480,480,480,480,480


#### Incorporate county look up

In [195]:
def edd_to_hw(edd_df, ipums_df_hw, naics_df, county_df, county: str, parsed_code: str, date: str, sample_size: int):
    # filter edd by date, edd county, and industry via parsed code
    edd_df = edd_df.loc[edd_df['Date'] == date].copy()
    if len(edd_df) == 0:
        return "Date not valid or found", np.nan, np.nan, np.nan
    edd_df = edd_df.loc[(edd_df['Sub_4_Code'] == parsed_code) | 
                        (edd_df['Sub_3_Code'] == parsed_code) | 
                        (edd_df['Sub_2_Code'] == parsed_code) | 
                        (edd_df['Sub_1_Code'] == parsed_code) | 
                        (edd_df['Main_Code'] == parsed_code)].copy()
    edd_df = edd_df.loc[edd_df['Area Name'] == county] # this is possible because all counties are in EDD data
    edd_df = edd_df.drop_duplicates(subset='Main_EDD').reset_index().iloc[:,1:]
    
    # merge naics with edd
    edd_df_ = pd.merge(edd_df, naics_df, on='Sub_4_Code')
    if len(edd_df_) == 0:
        edd_df_ = pd.merge(edd_df, naics_df, on='Sub_3_Code')
        if len(edd_df_) == 0:
            edd_df_ = pd.merge(edd_df, naics_df, on='Sub_2_Code')
            if len(edd_df_) == 0:
                edd_df_ = pd.merge(edd_df, naics_df, on='Sub_1_Code')
                if len(edd_df_) == 0:
                    edd_df_ = pd.merge(edd_df, naics_df, on='Main_Code')
                    if len(edd_df_) == 0:
                        return "No parsed code of input industry found within input county", np.nan, np.nan, np.nan
    edd_df = edd_df_.rename(columns = {'Industry Title_x':'EDD Industry',
                                 'Industry Title_y': 'IPUMS Industry'})
    edd_df = edd_df[['EDD Industry', 'Area Name', 'IPUMS Industry', 'INDNAICS', 'Current Employment']]
    employment_count = int(edd_df['Current Employment'].values[0])
    naics_code = edd_df['INDNAICS'].values[0]
    output, hw_perc, industry, ratio = ca_ipums_filter(ipums_df_hw, county_df, county, naics_code, sample_size)
    race_ind_count = employment_count * ratio
    hw_count = (race_ind_count * hw_perc) / 100
    output += f", High wage count: {hw_count}"
    return output, hw_count, industry, race_ind_count

In [196]:
def ca_ipums_filter(df, county_df, county: str, NAICS: str, n: int):
    df = df.loc[df['County'] == county].copy()
    level = 'county'
    if len(df) == 0:
        county_df = county_df.loc[county_df['County'] == county].copy() # narrow down county df
        reg_rural_urban = county_df['Regional Rural/Urban'].values[0]
        region = county_df['CDI Regions'].values[0]
        rural_urban = county_df['Rural/Urban'].values[0]
        df = df.loc[df['Regional Rural/Urban'] == reg_rural_urban].copy()
        level = 'reg_rural_urban'
        if len(df) == 0:
            df = df.loc[df['CDI Regions'] == region].copy()
            level = 'region'
            if len(df) == 0:
                df = df.loc[df['Rural/Urban'] == rural_urban].copy()
                level = 'rural_urban'
                if len(df) == 0:
                    level = 'state'
    
    df = df.loc[df['INDNAICS'] == NAICS].copy()
    try:
        industry = df['Industry Title'].values[0]
    except:
        return "Industry not valid or found", np.nan, np.nan, np.nan
    if len(df) == 0:
        return "NAICS Code not valid or found", np.nan, np.nan, np.nan
    df = df.drop_duplicates(subset='INDNAICS').reset_index().iloc[:,1:]
    
    # incorporate tracked levels for conditions
    if level == 'county':
        if df['unwt_county_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: County, Industry: {industry}, High wage percentage: {df['wt_county_hw_perc'].values[0]}", df['wt_county_hw_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_regcomm_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Regional Rural/Urban, Industry: {industry}, High wage percentage: {df['wt_regcomm_hw_perc'].values[0]}", df['wt_regcomm_hw_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_reg_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Regional, Industry: {industry}, High wage percentage: {df['wt_reg_high_wage_perc'].values[0]}", df['wt_reg_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_comm_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Rural/Urban, Industry: {industry}, High wage percentage: {df['wt_comm_high_wage_perc'].values[0]}", df['wt_comm_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: State, Industry: {industry}, High wage percentage: {df['wt_CA_high_wage_perc'].values[0]}", df['wt_CA_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        else:
            return "Not enough records available to satisfy sample size request", np.nan, np.nan, np.nan
        
    elif level == 'reg_rural_urban':
        if df['unwt_regcomm_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Regional Rural/Urban, Industry: {industry}, High wage percentage: {df['wt_regcomm_hw_perc'].values[0]}", df['wt_regcomm_hw_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_reg_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Regional, Industry: {industry}, High wage percentage: {df['wt_reg_high_wage_perc'].values[0]}", df['wt_reg_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_comm_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Rural/Urban, Industry: {industry}, High wage percentage: {df['wt_comm_high_wage_perc'].values[0]}", df['wt_comm_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: State, Industry: {industry}, High wage percentage: {df['wt_CA_high_wage_perc'].values[0]}", df['wt_CA_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        else:
            return "Not enough records available to satisfy sample size request", np.nan, np.nan, np.nan
        
    elif level == 'region':
        if df['unwt_reg_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Regional, Industry: {industry}, High wage percentage: {df['wt_reg_high_wage_perc'].values[0]}", df['wt_reg_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_comm_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Rural/Urban, Industry: {industry}, High wage percentage: {df['wt_comm_high_wage_perc'].values[0]}", df['wt_comm_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: State, Industry: {industry}, High wage percentage: {df['wt_CA_high_wage_perc'].values[0]}", df['wt_CA_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        else:
            return "Not enough records available to satisfy sample size request", np.nan, np.nan, np.nan
        
    elif level == 'rural_urban':
        if df['unwt_comm_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Rural/Urban, Industry: {industry}, High wage percentage: {df['wt_comm_high_wage_perc'].values[0]}", df['wt_comm_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        elif df['unwt_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: State, Industry: {industry}, High wage percentage: {df['wt_CA_high_wage_perc'].values[0]}", df['wt_CA_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        else:
            return "Not enough records available to satisfy sample size request", np.nan, np.nan, np.nan
        
    elif level == 'state':
        if df['unwt_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: State, Industry: {industry}, High wage percentage: {df['wt_CA_high_wage_perc'].values[0]}", df['wt_CA_high_wage_perc'].values[0], industry, df['race_ratio'].values[0]
        else:
            return "Not enough records available to satisfy sample size request", np.nan, np.nan, np.nan

### Create tidy dataframe of high wage counts by race breakdown

In [197]:
counties_edd = edd['Area Name'].unique()

In [198]:
parsed_codes = set(list(edd['Main_Code'].unique()) + list(edd['Sub_1_Code'].unique()) + list(edd['Sub_2_Code'].unique()) + list(edd['Sub_3_Code'].unique()) + list(edd['Sub_4_Code'].unique()))

In [199]:
dates_edd = edd['Date'].unique()

White

In [200]:
total_iterations = len(counties_edd) * len(parsed_codes) * len(dates_edd)

In [201]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []

In [202]:
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd, ca_ipums_hw_white, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10000 == 0:
                print(f'Progress: {progress_count} / {total_iterations}')

Progress: 10000 / 104400
Progress: 20000 / 104400
Progress: 30000 / 104400
Progress: 40000 / 104400
Progress: 50000 / 104400
Progress: 60000 / 104400
Progress: 70000 / 104400
Progress: 80000 / 104400
Progress: 90000 / 104400
Progress: 100000 / 104400


In [203]:
df_dict_white = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_white = pd.DataFrame(df_dict_white)
tidy_df_white = tidy_df_white[tidy_df_white['Industry'].notna()]
tidy_df_white['Date']= pd.to_datetime(tidy_df_white['Date'])
tidy_df_white['High Wage Count'] = tidy_df_white['High Wage Count'].astype(int)
tidy_df_white = tidy_df_white.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_white

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count
6433,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,5394,19421.782564
6434,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,5946,21409.127757
6432,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,6047,21770.463247
6435,accounting tax preparation bookkeeping and pay...,2019-04-01,Los Angeles,5971,21499.461629
6436,accounting tax preparation bookkeeping and pay...,2019-05-01,Los Angeles,5407,19466.949501
...,...,...,...,...,...
14314,wholesale electronic markets agents and brokers,2019-10-01,Orange,723,1889.565626
13439,wholesale electronic markets agents and brokers,2019-11-01,Orange,723,1889.565626
14315,wholesale electronic markets agents and brokers,2019-11-01,Orange,723,1889.565626
13437,wholesale electronic markets agents and brokers,2019-12-01,Orange,723,1889.565626


In [204]:
tidy_df_white['Output Race'] = 'White'
tidy_df_white

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count,Output Race
6433,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,5394,19421.782564,White
6434,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,5946,21409.127757,White
6432,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,6047,21770.463247,White
6435,accounting tax preparation bookkeeping and pay...,2019-04-01,Los Angeles,5971,21499.461629,White
6436,accounting tax preparation bookkeeping and pay...,2019-05-01,Los Angeles,5407,19466.949501,White
...,...,...,...,...,...,...
14314,wholesale electronic markets agents and brokers,2019-10-01,Orange,723,1889.565626,White
13439,wholesale electronic markets agents and brokers,2019-11-01,Orange,723,1889.565626,White
14315,wholesale electronic markets agents and brokers,2019-11-01,Orange,723,1889.565626,White
13437,wholesale electronic markets agents and brokers,2019-12-01,Orange,723,1889.565626,White


Latino

In [205]:
industries = []
dates = []
counties = []
counts = []
emp_counts = [] 
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd, ca_ipums_hw_latino, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10000 == 0:
                print(f'Progress: {progress_count} / {total_iterations}')

Progress: 10000 / 104400
Progress: 20000 / 104400
Progress: 30000 / 104400
Progress: 40000 / 104400
Progress: 50000 / 104400
Progress: 60000 / 104400
Progress: 70000 / 104400
Progress: 80000 / 104400
Progress: 90000 / 104400
Progress: 100000 / 104400


In [206]:
df_dict_latino = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_latino = pd.DataFrame(df_dict_latino)
tidy_df_latino = tidy_df_latino[tidy_df_latino['Industry'].notna()]
tidy_df_latino['Date']= pd.to_datetime(tidy_df_latino['Date'])
tidy_df_latino['High Wage Count'] = tidy_df_latino['High Wage Count'].astype(int)
tidy_df_latino = tidy_df_latino.sort_values(by=['Industry', 'County', 'Date'])

In [207]:
tidy_df_latino['Output Race'] = 'Latino'
tidy_df_latino

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count,Output Race
6433,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,878,10200.176628,Latino
6434,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,968,11243.915632,Latino
6432,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,984,11433.686360,Latino
6435,accounting tax preparation bookkeeping and pay...,2019-04-01,Los Angeles,972,11291.358314,Latino
6436,accounting tax preparation bookkeeping and pay...,2019-05-01,Los Angeles,880,10223.897969,Latino
...,...,...,...,...,...,...
14314,wholesale electronic markets agents and brokers,2019-10-01,Orange,39,660.603062,Latino
13439,wholesale electronic markets agents and brokers,2019-11-01,Orange,39,660.603062,Latino
14315,wholesale electronic markets agents and brokers,2019-11-01,Orange,39,660.603062,Latino
13437,wholesale electronic markets agents and brokers,2019-12-01,Orange,39,660.603062,Latino


Black

In [208]:
industries = []
dates = []
counties = []
counts = []
emp_counts = [] 
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd, ca_ipums_hw_black, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10000 == 0:
                print(f'Progress: {progress_count} / {total_iterations}')

Progress: 10000 / 104400
Progress: 20000 / 104400
Progress: 30000 / 104400
Progress: 40000 / 104400
Progress: 50000 / 104400
Progress: 60000 / 104400
Progress: 70000 / 104400
Progress: 80000 / 104400
Progress: 90000 / 104400
Progress: 100000 / 104400


In [209]:
df_dict_black = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_black = pd.DataFrame(df_dict_black)
tidy_df_black = tidy_df_black[tidy_df_black['Industry'].notna()]
tidy_df_black['Date']= pd.to_datetime(tidy_df_black['Date'])
tidy_df_black['High Wage Count'] = tidy_df_black['High Wage Count'].astype(int)
tidy_df_black = tidy_df_black.sort_values(by=['Industry', 'County', 'Date'])

In [210]:
tidy_df_black['Output Race'] = 'Black'
tidy_df_black

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count,Output Race
6433,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,193,1833.846013,Black
6434,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,213,2021.495372,Black
6432,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,217,2055.613437,Black
6435,accounting tax preparation bookkeeping and pay...,2019-04-01,Los Angeles,214,2030.024888,Black
6436,accounting tax preparation bookkeeping and pay...,2019-05-01,Los Angeles,194,1838.110771,Black
...,...,...,...,...,...,...
7111,wholesale electronic markets agents and brokers,2019-08-01,Los Angeles,24,131.343671,Black
7112,wholesale electronic markets agents and brokers,2019-09-01,Los Angeles,24,128.998248,Black
7114,wholesale electronic markets agents and brokers,2019-10-01,Los Angeles,24,130.170960,Black
7115,wholesale electronic markets agents and brokers,2019-11-01,Los Angeles,24,131.343671,Black


American Indian/Alaska Native

In [211]:
industries = []
dates = []
counties = []
counts = []
emp_counts = [] 
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd, ca_ipums_hw_native, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10000 == 0:
                print(f'Progress: {progress_count} / {total_iterations}')

Progress: 10000 / 104400
Progress: 20000 / 104400
Progress: 30000 / 104400
Progress: 40000 / 104400
Progress: 50000 / 104400
Progress: 60000 / 104400
Progress: 70000 / 104400
Progress: 80000 / 104400
Progress: 90000 / 104400
Progress: 100000 / 104400


In [212]:
df_dict_native = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_native = pd.DataFrame(df_dict_native)
tidy_df_native = tidy_df_native[tidy_df_native['Industry'].notna()]
tidy_df_native['Date']= pd.to_datetime(tidy_df_native['Date'])
tidy_df_native['High Wage Count'] = tidy_df_native['High Wage Count'].astype(int)
tidy_df_native = tidy_df_native.sort_values(by=['Industry', 'County', 'Date'])

In [213]:
tidy_df_native['Output Race'] = 'American Indian/Alaska Native'
tidy_df_native

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count,Output Race
5713,architectural engineering and related services,2019-01-01,Los Angeles,33,68.990093,American Indian/Alaska Native
6049,architectural engineering and related services,2019-01-01,Los Angeles,33,68.990093,American Indian/Alaska Native
5714,architectural engineering and related services,2019-02-01,Los Angeles,33,69.190064,American Indian/Alaska Native
6050,architectural engineering and related services,2019-02-01,Los Angeles,33,69.190064,American Indian/Alaska Native
5712,architectural engineering and related services,2019-03-01,Los Angeles,33,68.990093,American Indian/Alaska Native
...,...,...,...,...,...,...
12741,restaurants and other food services,2019-12-01,Orange,18,151.532039,American Indian/Alaska Native
12885,restaurants and other food services,2019-12-01,Orange,18,151.532039,American Indian/Alaska Native
12969,restaurants and other food services,2019-12-01,Orange,18,151.532039,American Indian/Alaska Native
13161,restaurants and other food services,2019-12-01,Orange,18,151.532039,American Indian/Alaska Native


Asian

In [214]:
industries = []
dates = []
counties = []
counts = []
emp_counts = [] 
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd, ca_ipums_hw_asian, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10000 == 0:
                print(f'Progress: {progress_count} / {total_iterations}')

Progress: 10000 / 104400
Progress: 20000 / 104400
Progress: 30000 / 104400
Progress: 40000 / 104400
Progress: 50000 / 104400
Progress: 60000 / 104400
Progress: 70000 / 104400
Progress: 80000 / 104400
Progress: 90000 / 104400
Progress: 100000 / 104400


In [215]:
df_dict_asian = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_asian = pd.DataFrame(df_dict_asian)
tidy_df_asian = tidy_df_asian[tidy_df_asian['Industry'].notna()]
tidy_df_asian['Date']= pd.to_datetime(tidy_df_asian['Date'])
tidy_df_asian['High Wage Count'] = tidy_df_asian['High Wage Count'].astype(int)
tidy_df_asian = tidy_df_asian.sort_values(by=['Industry', 'County', 'Date'])

In [216]:
tidy_df_asian['Output Race'] = 'Asian'
tidy_df_asian

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count,Output Race
6433,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,2156,9773.180103,Asian
6434,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,2377,10773.226439,Asian
6432,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,2417,10955.053046,Asian
6435,accounting tax preparation bookkeeping and pay...,2019-04-01,Los Angeles,2387,10818.683091,Asian
6436,accounting tax preparation bookkeeping and pay...,2019-05-01,Los Angeles,2161,9795.908429,Asian
...,...,...,...,...,...,...
14314,wholesale electronic markets agents and brokers,2019-10-01,Orange,203,2823.598586,Asian
13439,wholesale electronic markets agents and brokers,2019-11-01,Orange,203,2823.598586,Asian
14315,wholesale electronic markets agents and brokers,2019-11-01,Orange,203,2823.598586,Asian
13437,wholesale electronic markets agents and brokers,2019-12-01,Orange,203,2823.598586,Asian


Some other race

In [217]:
industries = []
dates = []
counties = []
counts = []
emp_counts = [] 
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd, ca_ipums_hw_other, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10000 == 0:
                print(f'Progress: {progress_count} / {total_iterations}')

Progress: 10000 / 104400
Progress: 20000 / 104400
Progress: 30000 / 104400
Progress: 40000 / 104400
Progress: 50000 / 104400
Progress: 60000 / 104400
Progress: 70000 / 104400
Progress: 80000 / 104400
Progress: 90000 / 104400
Progress: 100000 / 104400


In [218]:
df_dict_other = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_other = pd.DataFrame(df_dict_other)
tidy_df_other = tidy_df_other[tidy_df_other['Industry'].notna()]
tidy_df_other['Date']= pd.to_datetime(tidy_df_other['Date'])
tidy_df_other['High Wage Count'] = tidy_df_other['High Wage Count'].astype(int)
tidy_df_other = tidy_df_other.sort_values(by=['Industry', 'County', 'Date'])

In [219]:
tidy_df_other['Output Race'] = 'Some other race'
tidy_df_other

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count,Output Race
5713,architectural engineering and related services,2019-01-01,Los Angeles,120,135.898279,Some other race
6049,architectural engineering and related services,2019-01-01,Los Angeles,120,135.898279,Some other race
5714,architectural engineering and related services,2019-02-01,Los Angeles,120,136.292187,Some other race
6050,architectural engineering and related services,2019-02-01,Los Angeles,120,136.292187,Some other race
5712,architectural engineering and related services,2019-03-01,Los Angeles,120,135.898279,Some other race
...,...,...,...,...,...,...
7159,taxi and limousine service,2019-08-01,Los Angeles,0,83.720463,Some other race
7160,taxi and limousine service,2019-09-01,Los Angeles,0,84.429959,Some other race
7162,taxi and limousine service,2019-10-01,Los Angeles,0,85.139454,Some other race
7163,taxi and limousine service,2019-11-01,Los Angeles,0,86.558445,Some other race


Multiracial

In [220]:
industries = []
dates = []
counties = []
counts = []
emp_counts = []
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd, ca_ipums_hw_multi, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10000 == 0:
                print(f'Progress: {progress_count} / {total_iterations}')

Progress: 10000 / 104400
Progress: 20000 / 104400
Progress: 30000 / 104400
Progress: 40000 / 104400
Progress: 50000 / 104400
Progress: 60000 / 104400
Progress: 70000 / 104400
Progress: 80000 / 104400
Progress: 90000 / 104400
Progress: 100000 / 104400


In [221]:
df_dict_multi = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_multi = pd.DataFrame(df_dict_multi)
tidy_df_multi = tidy_df_multi[tidy_df_multi['Industry'].notna()]
tidy_df_multi['Date']= pd.to_datetime(tidy_df_multi['Date'])
tidy_df_multi['High Wage Count'] = tidy_df_multi['High Wage Count'].astype(int)
tidy_df_multi = tidy_df_multi.sort_values(by=['Industry', 'County', 'Date'])

In [222]:
tidy_df_multi['Output Race'] = 'Multiracial'
tidy_df_multi

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count,Output Race
6433,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,302,1680.565209,Multiracial
6434,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,333,1852.530021,Multiracial
6432,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,339,1883.796350,Multiracial
6435,accounting tax preparation bookkeeping and pay...,2019-04-01,Los Angeles,334,1860.346603,Multiracial
6436,accounting tax preparation bookkeeping and pay...,2019-05-01,Los Angeles,303,1684.473500,Multiracial
...,...,...,...,...,...,...
14314,wholesale electronic markets agents and brokers,2019-10-01,Orange,18,61.733602,Multiracial
13439,wholesale electronic markets agents and brokers,2019-11-01,Orange,18,61.733602,Multiracial
14315,wholesale electronic markets agents and brokers,2019-11-01,Orange,18,61.733602,Multiracial
13437,wholesale electronic markets agents and brokers,2019-12-01,Orange,18,61.733602,Multiracial


In [232]:
tidy_df_demo = pd.concat([tidy_df_white, tidy_df_black, tidy_df_latino, tidy_df_asian, tidy_df_native, tidy_df_other, tidy_df_multi], ignore_index=True)

In [233]:
tidy_df_demo_col = pd.merge(tidy_df_demo, cost_of_living, left_on='County', right_on='Regions')

In [234]:
tidy_df_demo_col = tidy_df_demo_col[['Industry', 'Date', 'County', 'High Wage Count', 'Cost of Living', 'Employment Count', 'Output Race']]

In [235]:
tidy_df_demo_col

Unnamed: 0,Industry,Date,County,High Wage Count,Cost of Living,Employment Count,Output Race
0,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,5394,80216,19421.782564,White
1,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,5394,80216,19421.782564,White
2,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,5946,80216,21409.127757,White
3,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,5946,80216,21409.127757,White
4,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,6047,80216,21770.463247,White
...,...,...,...,...,...,...,...
33031,management of companies and enterprises,2019-10-01,San Francisco,2601,114808,3365.126538,Asian
33032,management of companies and enterprises,2019-11-01,San Francisco,2470,114808,3195.170652,Asian
33033,management of companies and enterprises,2019-11-01,San Francisco,2470,114808,3195.170652,Asian
33034,management of companies and enterprises,2019-12-01,San Francisco,2457,114808,3178.175064,Asian


In [236]:
# tidy_df_demo_col.to_csv('high_wage_outputs_w_race_v2.csv', encoding='utf-8', index=False)