# JQI Outputs with Demographics Breakdown

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import warnings
import os
import re
from jqi_functions import *
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format='retina'

## Creating CSV/dataframe

In [2]:
def cleaned_ipums_demo(year: str):
    cwd = os.getcwd()
    if int(year) < 2015:
        print('Invalid year')
        return None
    ipums = pd.read_csv(f'{cwd}/data/IPUMS_{year}.csv')
    ipums = ipums[['YEAR','STATEFIP', 'COUNTYFIP', 'INDNAICS','PERWT','RACE','INCWAGE']]
    ca_ipums = ipums.loc[ipums['STATEFIP'] == 6].copy().reset_index().iloc[:,1:]
    ipums_titles = pd.read_csv(f'{cwd}/data/ind_indnaics_crosswalk_2000_onward_without_code_descriptions.csv')
    ipums_titles = ipums_titles.iloc[2:]
    ipums_titles = ipums_titles.iloc[:,9:]
    ipums_titles['2018 Onward ACS/PRCS INDNAICS CODE'] = normalize_titles(ipums_titles['2018 Onward ACS/PRCS INDNAICS CODE'])
    ipums_titles['2013-2017 ACS/PRCS INDNAICS CODE'] = normalize_titles(ipums_titles['2013-2017 ACS/PRCS INDNAICS CODE'])
    ca_ipums['INDNAICS'] = normalize_titles(ca_ipums['INDNAICS'])
    if int(year) >= 2018:
        ipums_titles = ipums_titles[['2018 Onward ACS/PRCS INDNAICS CODE', 'Industry Title']]
        merged_ipums = pd.merge(ca_ipums, ipums_titles, left_on = 'INDNAICS', right_on = '2018 Onward ACS/PRCS INDNAICS CODE')
        merged_ipums = merged_ipums.rename(columns={"2018 Onward ACS/PRCS INDNAICS CODE": "NAICS Code"})
    else:
        ipums_titles = ipums_titles[['2013-2017 ACS/PRCS INDNAICS CODE', 'Industry Title']]
        merged_ipums = pd.merge(ca_ipums, ipums_titles, left_on = 'INDNAICS', right_on = '2013-2017 ACS/PRCS INDNAICS CODE')
        merged_ipums = merged_ipums.rename(columns={"2013-2017 ACS/PRCS INDNAICS CODE": "NAICS Code"})
    naics_parsed_crosswalk = pd.read_csv(f'{cwd}/data/naics_parsed_crosswalk.csv').drop_duplicates(subset='INDNAICS').reset_index().iloc[:,1:]
    merged_ipums = pd.merge(merged_ipums, naics_parsed_crosswalk, on='INDNAICS')
    return merged_ipums

In [3]:
ca_ipums = cleaned_ipums_demo('2019')

In [4]:
county_info = pd.read_csv('data/county_to_regions_key - Sheet1.csv')

In [5]:
cost_of_living = pd.read_csv('data/united-way-col-1A1PS1C2019.csv') # 1 adult, 1 preschooler, 1 child

In [6]:
ca_ipums = pd.merge(ca_ipums, county_info, on = 'COUNTYFIP')

In [7]:
len(ca_ipums['County'].unique()) # only 34 California counties present in the IPUMS data

34

In [8]:
ca_ipums = ca_ipums[['INDNAICS', 'PERWT', 'INCWAGE', 'RACE',
       'NAICS Code', 'Industry Title_x', 'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code', 'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions']]

In [9]:
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'CDI Regions', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'Regional COL'})

In [10]:
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'Rural/Urban', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'Rural/Urban COL'})

In [11]:
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'County', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'County COL'})

In [12]:
ca_ipums = ca_ipums[['INDNAICS', 'PERWT', 'INCWAGE', 'RACE',
       'NAICS Code', 'Industry Title_x', 'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code', 'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions',
                    'Regional COL', 'Rural/Urban COL', 'County COL']]

In [13]:
ca_ipums['Regional Rural/Urban'] = ca_ipums['CDI Regions'] + ' ' + ca_ipums['Rural/Urban']

In [14]:
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'Regional Rural/Urban', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'Regional Rural/Urban COL'})

In [15]:
ca_ipums = ca_ipums.rename(columns = {'Industry Title_x':'Industry Title'})
ca_ipums = ca_ipums[['INDNAICS', 'PERWT', 'INCWAGE', 'RACE',
       'NAICS Code', 'Industry Title', 'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code',
       'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions', 'Regional Rural/Urban',
                    'Regional COL', 'Rural/Urban COL', 'County COL', 'Regional Rural/Urban COL']]

In [16]:
ca_ipums['Industry Title'] = normalize_titles(ca_ipums['Industry Title'])

In [17]:
ca_ipums

Unnamed: 0,INDNAICS,PERWT,INCWAGE,RACE,NAICS Code,Industry Title,Main_Code,Sub_1_Code,Sub_2_Code,Sub_3_Code,Sub_4_Code,County,Rural/Urban,CDI Regions,Regional Rural/Urban,Regional COL,Rural/Urban COL,County COL,Regional Rural/Urban COL
0,4853,21.0,23100,2,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,Los Angeles Urban,80216,79472,80216,80216
1,4853,21.0,23100,2,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,Los Angeles Urban,80216,79472,80216,80216
2,4853,21.0,23100,2,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,Los Angeles Urban,80216,79472,80216,80216
3,4853,21.0,23100,2,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,Los Angeles Urban,80216,79472,80216,80216
4,4853,11.0,28000,9,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,Los Angeles Urban,80216,79472,80216,80216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488572,3113,558.0,5300,1,3113,sugar and confectionery products,300,310,311,311,311,Humboldt,Rural,Redwood Coast,Redwood Coast Rural,58625,58812,59608,58625
488573,3113,244.0,5300,1,3113,sugar and confectionery products,300,310,311,311,311,Humboldt,Rural,Redwood Coast,Redwood Coast Rural,58625,58812,59608,58625
488574,4247,117.0,50000,1,4247,petroleum and petroleum products merchant whol...,400,420,424,424d,424d,Humboldt,Rural,Redwood Coast,Redwood Coast Rural,58625,58812,59608,58625
488575,42491,148.0,40000,1,42491,farm supplies merchant wholesalers,400,420,424,424,424,Humboldt,Rural,Redwood Coast,Redwood Coast Rural,58625,58812,59608,58625


## Create county lookup dataframe

In [18]:
county_info = county_info[['County', 'Rural/Urban', 'CDI Regions']]

In [19]:
county_info['Regional Rural/Urban'] = county_info['CDI Regions'] + ' ' + county_info['Rural/Urban']

In [20]:
county_info = pd.merge(county_info, cost_of_living, left_on = 'County', right_on = 'Regions')

In [21]:
county_info = county_info.rename(columns = {'Cost of Living':'County COL'})
county_info = county_info.drop(columns=['Regions'])

In [22]:
county_info = pd.merge(county_info, cost_of_living, left_on = 'Regional Rural/Urban', right_on = 'Regions')

In [23]:
county_info = county_info.rename(columns = {'Cost of Living':'Regional Rural/Urban COL'})
county_info = county_info.drop(columns=['Regions'])

In [24]:
county_info = pd.merge(county_info, cost_of_living, left_on = 'CDI Regions', right_on = 'Regions')

In [25]:
county_info = county_info.rename(columns = {'Cost of Living':'Regional COL'})
county_info = county_info.drop(columns=['Regions'])

In [26]:
county_info = pd.merge(county_info, cost_of_living, left_on = 'Rural/Urban', right_on = 'Regions')

In [27]:
county_info = county_info.rename(columns = {'Cost of Living':'Rural/Urban COL'})
county_info = county_info.drop(columns=['Regions'])

In [28]:
county_info['State COL'] = cost_of_living.iloc[11][1]

In [29]:
county_info

Unnamed: 0,County,Rural/Urban,CDI Regions,Regional Rural/Urban,County COL,Regional Rural/Urban COL,Regional COL,Rural/Urban COL,State COL
0,Alameda,Urban,Bay Area,Bay Area Urban,88296,94329,93392,79472,74448
1,Contra Costa,Urban,Bay Area,Bay Area Urban,86284,94329,93392,79472,74448
2,Solano,Urban,Bay Area,Bay Area Urban,66751,94329,93392,79472,74448
3,San Mateo,Urban,Bay Area,Bay Area Urban,112606,94329,93392,79472,74448
4,Santa Clara,Urban,Bay Area,Bay Area Urban,107879,94329,93392,79472,74448
...,...,...,...,...,...,...,...,...,...
67,Placer,Rural,Sacramento,Sacramento Rural,63983,64653,64520,58812,74448
68,Placer,Rural,Sacramento,Sacramento Rural,63983,64653,63126,58812,74448
69,Sutter,Rural,Sacramento,Sacramento Rural,55477,64653,64520,58812,74448
70,Sutter,Rural,Sacramento,Sacramento Rural,55477,64653,63126,58812,74448


## Merge with EDD Data

In [30]:
edd = pd.read_csv('data/edd_merged.csv')

In [31]:
edd['Area Name'] = edd['Area Name'].str.replace(' County', '')
edd = edd.loc[edd['Area Type'] == 'County']
edd = edd.drop(columns=['Industry Title'])
edd = edd.rename(columns={"LMID Industry Title": "Industry Title"})

In [32]:
edd['Sub_1_Code'] = [str(x) for x in edd['Sub_1_Code']]
edd['Main_Code'] = [str(x) for x in edd['Main_Code']]

In [33]:
edd.head()

Unnamed: 0,Industry Title,Parsed_Code,Area Type,Area Name,Date,Seasonally Adjusted,Current Employment,Main_EDD,Main_Code,Sub_1,Sub_1_Code,Sub_2,Sub_2_Code,Sub_3,Sub_3_Code,Sub_4,Sub_4_Code
0,county,939,County,Madera,3/1/19,N,1600,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
1,county,939,County,Fresno,1/1/19,N,7800,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
2,county,939,County,Kern,1/1/19,N,9900,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
3,county,939,County,Los Angeles,1/1/19,N,106800,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
4,county,939,County,Madera,1/1/19,N,1600,government,900,state and local government,940,local government,930,local government excluding education,932,county,939


## Load NAICS Crosswalk

In [34]:
naics = pd.read_csv('data/naics_parsed_crosswalk.csv').drop_duplicates(subset='INDNAICS').reset_index().iloc[:,1:]

In [35]:
naics['Industry Title'] = normalize_titles(naics['Industry Title'])
naics['Sub_1_Code'] = [str(x) for x in naics['Sub_1_Code']]
naics['Main_Code'] = [str(x) for x in naics['Main_Code']]

## Breakdown dataframes by race

In [77]:
ca_ipums_white = ca_ipums.loc[ca_ipums['RACE'] == 1]
ca_ipums_black = ca_ipums.loc[ca_ipums['RACE'] == 2]
ca_ipums_native = ca_ipums.loc[ca_ipums['RACE'] == 3]
ca_ipums_asian = ca_ipums.loc[(ca_ipums['RACE'] == 4) |
                                   (ca_ipums['RACE'] == 5) |
                                   (ca_ipums['RACE'] == 6)]
ca_ipums_other = ca_ipums.loc[ca_ipums['RACE'] == 7]
ca_ipums_multi = ca_ipums.loc[(ca_ipums['RACE'] == 8) |
                                   (ca_ipums['RACE'] == 9)]

## Sample size IPUMS function

In [36]:
def add_to_state_df(df):
    df['Above CA Threshold'] = df['INCWAGE'] > 74448
    df['wt_ind_counts'] = df['PERWT'].groupby(df['INDNAICS']).transform('sum')
    df["Above CA Threshold"] = df["Above CA Threshold"].astype(int)
    df["wt_CA_above_thresh"] = df["Above CA Threshold"] * df['PERWT']
    df['wt_CA_high_wage_count'] = df['wt_CA_above_thresh'].groupby(
        df['INDNAICS']).transform('sum')
    df['wt_CA_high_wage_perc'] = (df['wt_CA_high_wage_count'] / df['wt_ind_counts']) * 100
    df['unwt_ind_counts'] = df['INDNAICS'].groupby(df['INDNAICS']).transform('count')
    return df

#### County Level

In [38]:
def add_to_county_df(df):
    df['above_county_thresh'] = df['INCWAGE'] > df['County COL']
    df["above_county_thresh"] = df["above_county_thresh"].astype(int)
    df["wt_county_above_thresh"] = df["above_county_thresh"] * df['PERWT']
    df_agg = df.groupby(['INDNAICS','County']).agg(wt_county_ind_counts = ('PERWT','sum'),
                                                     wt_county_hw_count = ('wt_county_above_thresh','sum'),
                                                     unwt_county_ind_counts = ('INDNAICS','count')).reset_index()    
    
    df = pd.merge(df, df_agg, on=['INDNAICS', 'County'])
    df['wt_county_hw_perc'] = (df['wt_county_hw_count'] / df['wt_county_ind_counts']) * 100
    df = df.rename(columns={"County_x": "County", 'wt_county_hw_count_x':'wt_county_hw_count','wt_county_ind_counts_x':'wt_county_ind_counts',
                           'unwt_county_ind_counts_x':'unwt_county_ind_counts'})
    return df

#### Regional Level

In [40]:
def add_to_region_df(df): 
    df['above_region_thresh'] = df['INCWAGE'] > df['Regional COL']
    df["above_region_thresh"] = df["above_region_thresh"].astype(int)
    df["wt_reg_above_thresh"] = df["above_region_thresh"] * df['PERWT']
    df_agg = df.groupby(['INDNAICS','CDI Regions']).agg(wt_reg_ind_counts = ('PERWT','sum'),
                                                     wt_reg_high_wage_count = ('wt_reg_above_thresh','sum'),
                                                     unwt_reg_ind_counts = ('INDNAICS','count')).reset_index()    
    
    df = pd.merge(df, df_agg, on=['INDNAICS', 'CDI Regions'])
    df['wt_reg_high_wage_perc'] = (df['wt_reg_high_wage_count'] / df['wt_reg_ind_counts']) * 100
    df = df.rename(columns={"CDI Regions_x": "CDI Regions", 'wt_reg_high_wage_count_x':'wt_reg_high_wage_count','wt_reg_ind_counts_x':'wt_reg_ind_counts',
                           'unwt_reg_ind_counts_x':'unwt_reg_ind_counts'})
    return df

#### Regional Rural/Urban Level

In [42]:
def add_to_regioncomm_df(df):
    df['above_regcomm_thresh'] = df['INCWAGE'] > df['Regional Rural/Urban COL']
    df["above_regcomm_thresh"] = df["above_regcomm_thresh"].astype(int)
    df["wt_regcomm_above_thresh"] = df["above_regcomm_thresh"] * df['PERWT']
    df_agg = df.groupby(['INDNAICS','Regional Rural/Urban']).agg(wt_regcomm_ind_counts = ('PERWT','sum'),
                                                     wt_regcomm_hw_count = ('wt_regcomm_above_thresh','sum'),
                                                     unwt_regcomm_ind_counts = ('INDNAICS','count')).reset_index()    
    
    df = pd.merge(df, df_agg, on=['INDNAICS', 'Regional Rural/Urban'])
    df['wt_regcomm_hw_perc'] = (df['wt_regcomm_hw_count'] / df['wt_regcomm_ind_counts']) * 100
    df = df.rename(columns={"Regional Rural/Urban_x": "Regional Rural/Urban", 'wt_regcomm_hw_count_x':'wt_regcomm_hw_count','wt_regcomm_ind_counts_x':'wt_regcomm_ind_counts',
                           'unwt_regcomm_ind_counts_x':'unwt_regcomm_ind_counts'})
    return df

#### Rural/Urban level

In [44]:
def add_to_community_df(df):
    df['above_comm_thresh'] = df['INCWAGE'] > df['Rural/Urban COL']
    df["above_comm_thresh"] = df["above_comm_thresh"].astype(int)
    df["wt_comm_above_thresh"] = df["above_comm_thresh"] * df['PERWT']
    df_agg = df.groupby(['INDNAICS','Rural/Urban']).agg(wt_comm_ind_counts = ('PERWT','sum'),
                                                     wt_comm_high_wage_count = ('wt_comm_above_thresh','sum'),
                                                     unwt_comm_ind_counts = ('INDNAICS','count')).reset_index()    
    
    df = pd.merge(df, df_agg, on=['INDNAICS', 'Rural/Urban'])
    df['wt_comm_high_wage_perc'] = (df['wt_comm_high_wage_count'] / df['wt_comm_ind_counts']) * 100
    df = df.rename(columns={"Rural/Urban_x": "Rural/Urban", 'wt_comm_high_wage_count_x':'wt_comm_high_wage_count','wt_comm_ind_counts_x':'wt_comm_ind_counts',
                           'unwt_comm_ind_counts_x':'unwt_comm_ind_counts'})
    return df

#### Add all levels

In [46]:
def add_geo_high_wages(df):
    df_new = df.copy() # initialize new dataframe
    df_new = add_to_state_df(df_new) # creating state level counts
    df_new = add_to_community_df(df_new) # creating rural/urban level counts
    df_new = add_to_region_df(df_new) # creating regional level counts
    df_new = add_to_regioncomm_df(df_new) # creating regional rural/urban level counts
    df_new = add_to_county_df(df_new) # creating county level counts
    return df_new

In [81]:
ca_ipums_hw_white = add_geo_high_wages(ca_ipums_white)
ca_ipums_hw_black = add_geo_high_wages(ca_ipums_black)
ca_ipums_hw_native = add_geo_high_wages(ca_ipums_native)
ca_ipums_hw_asian = add_geo_high_wages(ca_ipums_asian)
ca_ipums_hw_other = add_geo_high_wages(ca_ipums_other)
ca_ipums_hw_multi = add_geo_high_wages(ca_ipums_multi)

In [82]:
race_ipums_dfs = [ca_ipums_hw_white, ca_ipums_hw_black, ca_ipums_hw_native, ca_ipums_hw_asian, ca_ipums_hw_other, ca_ipums_hw_multi]

In [84]:
for df in race_ipums_dfs:
    df['Sub_1_Code'] = [str(x) for x in df['Sub_1_Code']]
    df['Main_Code'] = [str(x) for x in df['Main_Code']]

## EDD Hierarchy Function

In [90]:
ca_ipums_hw_white.head().T

Unnamed: 0,0,1,2,3,4
INDNAICS,4853,4853,4853,4853,4853
PERWT,35.0,35.0,35.0,35.0,58.0
INCWAGE,0,0,0,0,0
RACE,1,1,1,1,1
NAICS Code,4853,4853,4853,4853,4853
Industry Title,taxi and limousine service,taxi and limousine service,taxi and limousine service,taxi and limousine service,taxi and limousine service
Main_Code,400,400,400,400,400
Sub_1_Code,430,430,430,430,430
Sub_2_Code,480,480,480,480,480
Sub_3_Code,483,483,483,483,483


#### Incorporate county look up

In [91]:
def edd_to_hw(edd_df, ipums_df_hw, naics_df, county_df, county: str, parsed_code: str, date: str, sample_size: int):
    # filter edd by date, edd county, and industry via parsed code
    edd_df = edd_df.loc[edd_df['Date'] == date].copy()
    if len(edd_df) == 0:
        return "Date not valid or found", np.nan, np.nan, np.nan
    edd_df = edd_df.loc[(edd_df['Sub_4_Code'] == parsed_code) | 
                        (edd_df['Sub_3_Code'] == parsed_code) | 
                        (edd_df['Sub_2_Code'] == parsed_code) | 
                        (edd_df['Sub_1_Code'] == parsed_code) | 
                        (edd_df['Main_Code'] == parsed_code)].copy()
    edd_df = edd_df.loc[edd_df['Area Name'] == county] # this is possible because all counties are in EDD data
    edd_df = edd_df.drop_duplicates(subset='Main_EDD').reset_index().iloc[:,1:]
    
    # merge naics with edd
    edd_df_ = pd.merge(edd_df, naics_df, on='Sub_4_Code')
    if len(edd_df_) == 0:
        edd_df_ = pd.merge(edd_df, naics_df, on='Sub_3_Code')
        if len(edd_df_) == 0:
            edd_df_ = pd.merge(edd_df, naics_df, on='Sub_2_Code')
            if len(edd_df_) == 0:
                edd_df_ = pd.merge(edd_df, naics_df, on='Sub_1_Code')
                if len(edd_df_) == 0:
                    edd_df_ = pd.merge(edd_df, naics_df, on='Main_Code')
                    if len(edd_df_) == 0:
                        return "No parsed code of input industry found within input county", np.nan, np.nan, np.nan
    edd_df = edd_df_.rename(columns = {'Industry Title_x':'EDD Industry',
                                 'Industry Title_y': 'IPUMS Industry'})
    edd_df = edd_df[['EDD Industry', 'Area Name', 'IPUMS Industry', 'INDNAICS', 'Current Employment']]
    employment_count = int(edd_df['Current Employment'].values[0])
    naics_code = edd_df['INDNAICS'].values[0]
    output, hw_perc, industry, wt_ind_count = ca_ipums_filter(ipums_df_hw, county_df, county, naics_code, sample_size)
    hw_count = (employment_count * hw_perc) / 100
    output += f", High wage count: {hw_count}"
    return output, hw_count, industry, wt_ind_count

In [92]:
def ca_ipums_filter(df, county_df, county: str, NAICS: str, n: int):
    df = df.loc[df['County'] == county].copy()
    level = 'county'
    if len(df) == 0:
        county_df = county_df.loc[county_df['County'] == county].copy() # narrow down county df
        reg_rural_urban = county_df['Regional Rural/Urban'].values[0]
        region = county_df['CDI Regions'].values[0]
        rural_urban = county_df['Rural/Urban'].values[0]
        df = df.loc[df['Regional Rural/Urban'] == reg_rural_urban].copy()
        level = 'reg_rural_urban'
        if len(df) == 0:
            df = df.loc[df['CDI Regions'] == region].copy()
            level = 'region'
            if len(df) == 0:
                df = df.loc[df['Rural/Urban'] == rural_urban].copy()
                level = 'rural_urban'
                if len(df) == 0:
                    level = 'state'
    
    df = df.loc[df['INDNAICS'] == NAICS].copy()
    try:
        industry = df['Industry Title'].values[0]
    except:
        return "Industry not valid or found", np.nan, np.nan, np.nan
    if len(df) == 0:
        return "NAICS Code not valid or found", np.nan, np.nan, np.nan
    df = df.drop_duplicates(subset='INDNAICS').reset_index().iloc[:,1:]
    
    # incorporate tracked levels for conditions
    if level == 'county':
        if df['unwt_county_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: County, Industry: {industry}, High wage percentage: {df['wt_county_hw_perc'].values[0]}", df['wt_county_hw_perc'].values[0], industry, df['wt_county_ind_counts'].values[0]
        elif df['unwt_regcomm_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Regional Rural/Urban, Industry: {industry}, High wage percentage: {df['wt_regcomm_hw_perc'].values[0]}", df['wt_regcomm_hw_perc'].values[0], industry, df['wt_regcomm_ind_counts'].values[0]
        elif df['unwt_reg_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Regional, Industry: {industry}, High wage percentage: {df['wt_reg_high_wage_perc'].values[0]}", df['wt_reg_high_wage_perc'].values[0], industry, df['wt_reg_ind_counts'].values[0]
        elif df['unwt_comm_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Rural/Urban, Industry: {industry}, High wage percentage: {df['wt_comm_high_wage_perc'].values[0]}", df['wt_comm_high_wage_perc'].values[0], industry, df['wt_comm_ind_counts'].values[0]
        elif df['unwt_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: State, Industry: {industry}, High wage percentage: {df['wt_CA_high_wage_perc'].values[0]}", df['wt_CA_high_wage_perc'].values[0], industry, df['wt_ind_counts'].values[0]
        else:
            return "Not enough records available to satisfy sample size request", np.nan, np.nan, np.nan
        
    elif level == 'reg_rural_urban':
        if df['unwt_regcomm_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Regional Rural/Urban, Industry: {industry}, High wage percentage: {df['wt_regcomm_hw_perc'].values[0]}", df['wt_regcomm_hw_perc'].values[0], industry, df['wt_regcomm_ind_counts'].values[0]
        elif df['unwt_reg_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Regional, Industry: {industry}, High wage percentage: {df['wt_reg_high_wage_perc'].values[0]}", df['wt_reg_high_wage_perc'].values[0], industry, df['wt_reg_ind_counts'].values[0]
        elif df['unwt_comm_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Rural/Urban, Industry: {industry}, High wage percentage: {df['wt_comm_high_wage_perc'].values[0]}", df['wt_comm_high_wage_perc'].values[0], industry, df['wt_comm_ind_counts'].values[0]
        elif df['unwt_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: State, Industry: {industry}, High wage percentage: {df['wt_CA_high_wage_perc'].values[0]}", df['wt_CA_high_wage_perc'].values[0], industry, df['wt_ind_counts'].values[0]
        else:
            return "Not enough records available to satisfy sample size request", np.nan, np.nan, np.nan
        
    elif level == 'region':
        if df['unwt_reg_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Regional, Industry: {industry}, High wage percentage: {df['wt_reg_high_wage_perc'].values[0]}", df['wt_reg_high_wage_perc'].values[0], industry, df['wt_reg_ind_counts'].values[0]
        elif df['unwt_comm_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Rural/Urban, Industry: {industry}, High wage percentage: {df['wt_comm_high_wage_perc'].values[0]}", df['wt_comm_high_wage_perc'].values[0], industry, df['wt_comm_ind_counts'].values[0]
        elif df['unwt_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: State, Industry: {industry}, High wage percentage: {df['wt_CA_high_wage_perc'].values[0]}", df['wt_CA_high_wage_perc'].values[0], industry, df['wt_ind_counts'].values[0]
        else:
            return "Not enough records available to satisfy sample size request", np.nan, np.nan, np.nan
        
    elif level == 'rural_urban':
        if df['unwt_comm_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: Rural/Urban, Industry: {industry}, High wage percentage: {df['wt_comm_high_wage_perc'].values[0]}", df['wt_comm_high_wage_perc'].values[0], industry, df['wt_comm_ind_counts'].values[0]
        elif df['unwt_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: State, Industry: {industry}, High wage percentage: {df['wt_CA_high_wage_perc'].values[0]}", df['wt_CA_high_wage_perc'].values[0], industry, df['wt_ind_counts'].values[0]
        else:
            return "Not enough records available to satisfy sample size request", np.nan, np.nan, np.nan
        
    elif level == 'state':
        if df['unwt_ind_counts'].values[0] >= n:
            return f"County: {county}, Geographical level used: State, Industry: {industry}, High wage percentage: {df['wt_CA_high_wage_perc'].values[0]}", df['wt_CA_high_wage_perc'].values[0], industry, df['wt_ind_counts'].values[0]
        else:
            return "Not enough records available to satisfy sample size request", np.nan, np.nan, np.nan

### Create tidy dataframe of high wage counts by race breakdown

In [93]:
counties_edd = edd['Area Name'].unique()

In [94]:
parsed_codes = set(list(edd['Main_Code'].unique()) + list(edd['Sub_1_Code'].unique()) + list(edd['Sub_2_Code'].unique()) + list(edd['Sub_3_Code'].unique()) + list(edd['Sub_4_Code'].unique()))

In [95]:
dates_edd = edd['Date'].unique()

White (Code = 1)

In [97]:
total_iterations = len(counties_edd) * len(parsed_codes) * len(dates_edd)

In [96]:
industries = []
dates = []
counties = []
counts = []
emp_counts = [] # currently using weighted industry counts

In [98]:
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd, ca_ipums_hw_white, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10000 == 0:
                print(f'Progress: {progress_count} / {total_iterations}')

Progress: 10000 / 104400
Progress: 20000 / 104400
Progress: 30000 / 104400
Progress: 40000 / 104400
Progress: 50000 / 104400
Progress: 60000 / 104400
Progress: 70000 / 104400
Progress: 80000 / 104400
Progress: 90000 / 104400
Progress: 100000 / 104400


In [99]:
df_dict_white = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_white = pd.DataFrame(df_dict_white)
tidy_df_white = tidy_df_white[tidy_df_white['Industry'].notna()]
tidy_df_white['Date']= pd.to_datetime(tidy_df_white['Date'])
tidy_df_white['High Wage Count'] = tidy_df_white['High Wage Count'].astype(int)
tidy_df_white = tidy_df_white.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_white

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count
6409,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,9336,118652.0
6410,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,10292,118652.0
6408,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,10466,118652.0
6411,accounting tax preparation bookkeeping and pay...,2019-04-01,Los Angeles,10335,118652.0
6412,accounting tax preparation bookkeeping and pay...,2019-05-01,Los Angeles,9358,118652.0
...,...,...,...,...,...
13006,wholesale electronic markets agents and brokers,2019-10-01,Orange,2105,3364.0
12851,wholesale electronic markets agents and brokers,2019-11-01,Orange,2105,3364.0
13007,wholesale electronic markets agents and brokers,2019-11-01,Orange,2105,3364.0
12849,wholesale electronic markets agents and brokers,2019-12-01,Orange,2105,3364.0


In [110]:
tidy_df_white['Output Race'] = 1 # white
tidy_df_white

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count,Output Race
6409,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,9336,118652.0,1
6410,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,10292,118652.0,1
6408,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,10466,118652.0,1
6411,accounting tax preparation bookkeeping and pay...,2019-04-01,Los Angeles,10335,118652.0,1
6412,accounting tax preparation bookkeeping and pay...,2019-05-01,Los Angeles,9358,118652.0,1
...,...,...,...,...,...,...
13006,wholesale electronic markets agents and brokers,2019-10-01,Orange,2105,3364.0,1
12851,wholesale electronic markets agents and brokers,2019-11-01,Orange,2105,3364.0,1
13007,wholesale electronic markets agents and brokers,2019-11-01,Orange,2105,3364.0,1
12849,wholesale electronic markets agents and brokers,2019-12-01,Orange,2105,3364.0,1


Black (Code = 2)

In [100]:
industries = []
dates = []
counties = []
counts = []
emp_counts = [] # currently using weighted industry counts
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd, ca_ipums_hw_black, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10000 == 0:
                print(f'Progress: {progress_count} / {total_iterations}')

Progress: 10000 / 104400
Progress: 20000 / 104400
Progress: 30000 / 104400
Progress: 40000 / 104400
Progress: 50000 / 104400
Progress: 60000 / 104400
Progress: 70000 / 104400
Progress: 80000 / 104400
Progress: 90000 / 104400
Progress: 100000 / 104400


In [101]:
df_dict_black = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_black = pd.DataFrame(df_dict_black)
tidy_df_black = tidy_df_black[tidy_df_black['Industry'].notna()]
tidy_df_black['Date']= pd.to_datetime(tidy_df_black['Date'])
tidy_df_black['High Wage Count'] = tidy_df_black['High Wage Count'].astype(int)
tidy_df_black = tidy_df_black.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_black

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count
6409,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,4545,14908.0
6410,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,5010,14908.0
6408,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,5095,14908.0
6411,accounting tax preparation bookkeeping and pay...,2019-04-01,Los Angeles,5032,14908.0
6412,accounting tax preparation bookkeeping and pay...,2019-05-01,Los Angeles,4556,14908.0
...,...,...,...,...,...
13006,wholesale electronic markets agents and brokers,2019-10-01,Orange,655,1150.0
12851,wholesale electronic markets agents and brokers,2019-11-01,Orange,655,1150.0
13007,wholesale electronic markets agents and brokers,2019-11-01,Orange,655,1150.0
12849,wholesale electronic markets agents and brokers,2019-12-01,Orange,655,1150.0


In [111]:
tidy_df_black['Output Race'] = 2 # black
tidy_df_black

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count,Output Race
6409,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,4545,14908.0,2
6410,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,5010,14908.0,2
6408,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,5095,14908.0,2
6411,accounting tax preparation bookkeeping and pay...,2019-04-01,Los Angeles,5032,14908.0,2
6412,accounting tax preparation bookkeeping and pay...,2019-05-01,Los Angeles,4556,14908.0,2
...,...,...,...,...,...,...
13006,wholesale electronic markets agents and brokers,2019-10-01,Orange,655,1150.0,2
12851,wholesale electronic markets agents and brokers,2019-11-01,Orange,655,1150.0,2
13007,wholesale electronic markets agents and brokers,2019-11-01,Orange,655,1150.0,2
12849,wholesale electronic markets agents and brokers,2019-12-01,Orange,655,1150.0,2


Native American (Code = 3)

In [102]:
industries = []
dates = []
counties = []
counts = []
emp_counts = [] # currently using weighted industry counts
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd, ca_ipums_hw_native, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10000 == 0:
                print(f'Progress: {progress_count} / {total_iterations}')

Progress: 10000 / 104400
Progress: 20000 / 104400
Progress: 30000 / 104400
Progress: 40000 / 104400
Progress: 50000 / 104400
Progress: 60000 / 104400
Progress: 70000 / 104400
Progress: 80000 / 104400
Progress: 90000 / 104400
Progress: 100000 / 104400


In [103]:
df_dict_native = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_native = pd.DataFrame(df_dict_native)
tidy_df_native = tidy_df_native[tidy_df_native['Industry'].notna()]
tidy_df_native['Date']= pd.to_datetime(tidy_df_native['Date'])
tidy_df_native['High Wage Count'] = tidy_df_native['High Wage Count'].astype(int)
tidy_df_native = tidy_df_native.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_native

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count
6409,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,0,2196.0
6410,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,0,2196.0
6408,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,0,2196.0
6411,accounting tax preparation bookkeeping and pay...,2019-04-01,Los Angeles,0,2196.0
6412,accounting tax preparation bookkeeping and pay...,2019-05-01,Los Angeles,0,2196.0
...,...,...,...,...,...
17459,warehousing and storage,2019-11-01,San Joaquin,0,4378.0
17891,warehousing and storage,2019-11-01,San Joaquin,0,4378.0
16737,warehousing and storage,2019-12-01,San Joaquin,0,4378.0
17457,warehousing and storage,2019-12-01,San Joaquin,0,4378.0


In [112]:
tidy_df_native['Output Race'] = 3 # native
tidy_df_native

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count,Output Race
6409,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,0,2196.0,3
6410,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,0,2196.0,3
6408,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,0,2196.0,3
6411,accounting tax preparation bookkeeping and pay...,2019-04-01,Los Angeles,0,2196.0,3
6412,accounting tax preparation bookkeeping and pay...,2019-05-01,Los Angeles,0,2196.0,3
...,...,...,...,...,...,...
17459,warehousing and storage,2019-11-01,San Joaquin,0,4378.0,3
17891,warehousing and storage,2019-11-01,San Joaquin,0,4378.0,3
16737,warehousing and storage,2019-12-01,San Joaquin,0,4378.0,3
17457,warehousing and storage,2019-12-01,San Joaquin,0,4378.0,3


Asian (Code = 4, 5, 6)

In [104]:
industries = []
dates = []
counties = []
counts = []
emp_counts = [] # currently using weighted industry counts
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd, ca_ipums_hw_asian, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10000 == 0:
                print(f'Progress: {progress_count} / {total_iterations}')

Progress: 10000 / 104400
Progress: 20000 / 104400
Progress: 30000 / 104400
Progress: 40000 / 104400
Progress: 50000 / 104400
Progress: 60000 / 104400
Progress: 70000 / 104400
Progress: 80000 / 104400
Progress: 90000 / 104400
Progress: 100000 / 104400


In [105]:
df_dict_asian = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_asian = pd.DataFrame(df_dict_asian)
tidy_df_asian = tidy_df_asian[tidy_df_asian['Industry'].notna()]
tidy_df_asian['Date']= pd.to_datetime(tidy_df_asian['Date'])
tidy_df_asian['High Wage Count'] = tidy_df_asian['High Wage Count'].astype(int)
tidy_df_asian = tidy_df_asian.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_asian

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count
6409,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,9832,56396.0
6410,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,10838,56396.0
6408,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,11021,56396.0
6411,accounting tax preparation bookkeeping and pay...,2019-04-01,Los Angeles,10884,56396.0
6412,accounting tax preparation bookkeeping and pay...,2019-05-01,Los Angeles,9855,56396.0
...,...,...,...,...,...
13006,wholesale electronic markets agents and brokers,2019-10-01,Orange,396,12248.0
12851,wholesale electronic markets agents and brokers,2019-11-01,Orange,396,12248.0
13007,wholesale electronic markets agents and brokers,2019-11-01,Orange,396,12248.0
12849,wholesale electronic markets agents and brokers,2019-12-01,Orange,396,12248.0


In [113]:
tidy_df_asian['Output Race'] = 4 # asian
tidy_df_asian

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count,Output Race
6409,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,9832,56396.0,4
6410,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,10838,56396.0,4
6408,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,11021,56396.0,4
6411,accounting tax preparation bookkeeping and pay...,2019-04-01,Los Angeles,10884,56396.0,4
6412,accounting tax preparation bookkeeping and pay...,2019-05-01,Los Angeles,9855,56396.0,4
...,...,...,...,...,...,...
13006,wholesale electronic markets agents and brokers,2019-10-01,Orange,396,12248.0,4
12851,wholesale electronic markets agents and brokers,2019-11-01,Orange,396,12248.0,4
13007,wholesale electronic markets agents and brokers,2019-11-01,Orange,396,12248.0,4
12849,wholesale electronic markets agents and brokers,2019-12-01,Orange,396,12248.0,4


Other (Code = 7)

In [106]:
industries = []
dates = []
counties = []
counts = []
emp_counts = [] # currently using weighted industry counts
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd, ca_ipums_hw_other, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10000 == 0:
                print(f'Progress: {progress_count} / {total_iterations}')

Progress: 10000 / 104400
Progress: 20000 / 104400
Progress: 30000 / 104400
Progress: 40000 / 104400
Progress: 50000 / 104400
Progress: 60000 / 104400
Progress: 70000 / 104400
Progress: 80000 / 104400
Progress: 90000 / 104400
Progress: 100000 / 104400


In [107]:
df_dict_other = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_other = pd.DataFrame(df_dict_other)
tidy_df_other = tidy_df_other[tidy_df_other['Industry'].notna()]
tidy_df_other['Date']= pd.to_datetime(tidy_df_other['Date'])
tidy_df_other['High Wage Count'] = tidy_df_other['High Wage Count'].astype(int)
tidy_df_other = tidy_df_other.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_other

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count
6409,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,3504,17472.0
6410,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,3863,17472.0
6408,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,3928,17472.0
6411,accounting tax preparation bookkeeping and pay...,2019-04-01,Los Angeles,3879,17472.0
6412,accounting tax preparation bookkeeping and pay...,2019-05-01,Los Angeles,3512,17472.0
...,...,...,...,...,...
5803,wholesale electronic markets agents and brokers,2019-08-01,Los Angeles,0,2068.0
5804,wholesale electronic markets agents and brokers,2019-09-01,Los Angeles,0,2068.0
5806,wholesale electronic markets agents and brokers,2019-10-01,Los Angeles,0,2068.0
5807,wholesale electronic markets agents and brokers,2019-11-01,Los Angeles,0,2068.0


In [114]:
tidy_df_other['Output Race'] = 5 # other
tidy_df_other

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count,Output Race
6409,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,3504,17472.0,5
6410,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,3863,17472.0,5
6408,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,3928,17472.0,5
6411,accounting tax preparation bookkeeping and pay...,2019-04-01,Los Angeles,3879,17472.0,5
6412,accounting tax preparation bookkeeping and pay...,2019-05-01,Los Angeles,3512,17472.0,5
...,...,...,...,...,...,...
5803,wholesale electronic markets agents and brokers,2019-08-01,Los Angeles,0,2068.0,5
5804,wholesale electronic markets agents and brokers,2019-09-01,Los Angeles,0,2068.0,5
5806,wholesale electronic markets agents and brokers,2019-10-01,Los Angeles,0,2068.0,5
5807,wholesale electronic markets agents and brokers,2019-11-01,Los Angeles,0,2068.0,5


Multiracial (Code = 8, 9)

In [108]:
industries = []
dates = []
counties = []
counts = []
emp_counts = [] # currently using weighted industry counts
progress_count = 0
for county in counties_edd:
    for code in parsed_codes:
        for date in dates_edd:
            output, hw, industry, emp_count = edd_to_hw(edd, ca_ipums_hw_multi, naics, county_info, county, str(code), date, 10)
            industries.append(industry)
            dates.append(date)
            counties.append(county)
            counts.append(hw)
            emp_counts.append(emp_count)
            progress_count += 1
            if progress_count % 10000 == 0:
                print(f'Progress: {progress_count} / {total_iterations}')

Progress: 10000 / 104400
Progress: 20000 / 104400
Progress: 30000 / 104400
Progress: 40000 / 104400
Progress: 50000 / 104400
Progress: 60000 / 104400
Progress: 70000 / 104400
Progress: 80000 / 104400
Progress: 90000 / 104400
Progress: 100000 / 104400


In [109]:
df_dict_multi = {'Industry':industries, 'Date':dates, 'County':counties, 'High Wage Count':counts, 'Employment Count':emp_counts}
tidy_df_multi = pd.DataFrame(df_dict_multi)
tidy_df_multi = tidy_df_multi[tidy_df_multi['Industry'].notna()]
tidy_df_multi['Date']= pd.to_datetime(tidy_df_multi['Date'])
tidy_df_multi['High Wage Count'] = tidy_df_multi['High Wage Count'].astype(int)
tidy_df_multi = tidy_df_multi.sort_values(by=['Industry', 'County', 'Date'])
tidy_df_multi

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count
6409,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,7682,14216.0
6410,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,8469,14216.0
6408,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,8611,14216.0
6411,accounting tax preparation bookkeeping and pay...,2019-04-01,Los Angeles,8504,14216.0
6412,accounting tax preparation bookkeeping and pay...,2019-05-01,Los Angeles,7700,14216.0
...,...,...,...,...,...
13006,wholesale electronic markets agents and brokers,2019-10-01,Orange,1087,1072.0
12851,wholesale electronic markets agents and brokers,2019-11-01,Orange,1087,1072.0
13007,wholesale electronic markets agents and brokers,2019-11-01,Orange,1087,1072.0
12849,wholesale electronic markets agents and brokers,2019-12-01,Orange,1087,1072.0


In [115]:
tidy_df_multi['Output Race'] = 6 # multi
tidy_df_multi

Unnamed: 0,Industry,Date,County,High Wage Count,Employment Count,Output Race
6409,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,7682,14216.0,6
6410,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,8469,14216.0,6
6408,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,8611,14216.0,6
6411,accounting tax preparation bookkeeping and pay...,2019-04-01,Los Angeles,8504,14216.0,6
6412,accounting tax preparation bookkeeping and pay...,2019-05-01,Los Angeles,7700,14216.0,6
...,...,...,...,...,...,...
13006,wholesale electronic markets agents and brokers,2019-10-01,Orange,1087,1072.0,6
12851,wholesale electronic markets agents and brokers,2019-11-01,Orange,1087,1072.0,6
13007,wholesale electronic markets agents and brokers,2019-11-01,Orange,1087,1072.0,6
12849,wholesale electronic markets agents and brokers,2019-12-01,Orange,1087,1072.0,6


In [116]:
tidy_df_demo = pd.concat([tidy_df_white, tidy_df_black, tidy_df_asian, tidy_df_native, tidy_df_other, tidy_df_multi], ignore_index=True)

In [117]:
tidy_df_demo_col = pd.merge(tidy_df_demo, cost_of_living, left_on='County', right_on='Regions')

In [119]:
tidy_df_demo_col = tidy_df_demo_col[['Industry', 'Date', 'County', 'High Wage Count', 'Cost of Living', 'Employment Count', 'Output Race']]

In [121]:
# tidy_df_demo_col.to_csv('high_wage_outputs_w_race.csv', encoding='utf-8', index=False)

In [122]:
tidy_df_demo_col

Unnamed: 0,Industry,Date,County,High Wage Count,Cost of Living,Employment Count,Output Race
0,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,9336,80216,118652.0,1
1,accounting tax preparation bookkeeping and pay...,2019-01-01,Los Angeles,9336,80216,118652.0,1
2,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,10292,80216,118652.0,1
3,accounting tax preparation bookkeeping and pay...,2019-02-01,Los Angeles,10292,80216,118652.0,1
4,accounting tax preparation bookkeeping and pay...,2019-03-01,Los Angeles,10466,80216,118652.0,1
...,...,...,...,...,...,...,...
33343,crop production,2019-08-01,Yolo,1098,66339,2244.0,6
33344,crop production,2019-09-01,Yolo,1066,66339,2244.0,6
33345,crop production,2019-10-01,Yolo,1003,66339,2244.0,6
33346,crop production,2019-11-01,Yolo,737,66339,2244.0,6
