# Create hierarchical structure for IPUMS and EDD data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
import warnings
import os
import re
from jqi_functions import *
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format='retina'

## Creating CSV/dataframe

In [2]:
ca_ipums = cleaned_ipums('2019')

In [3]:
ca_ipums.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 224526 entries, 0 to 224525
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   YEAR              224526 non-null  int64  
 1   STATEFIP          224526 non-null  int64  
 2   COUNTYFIP         224526 non-null  int64  
 3   INDNAICS          224526 non-null  object 
 4   PERWT             224526 non-null  float64
 5   INCWAGE           224526 non-null  int64  
 6   NAICS Code        224526 non-null  object 
 7   Industry Title_x  224526 non-null  object 
 8   Industry Title_y  224526 non-null  object 
 9   Main_Code         224526 non-null  int64  
 10  Sub_1_Code        224526 non-null  int64  
 11  Sub_2_Code        224526 non-null  object 
 12  Sub_3_Code        224526 non-null  object 
 13  Sub_4_Code        224526 non-null  object 
dtypes: float64(1), int64(6), object(7)
memory usage: 25.7+ MB


In [4]:
county_info = pd.read_csv('data/county_to_regions_key - Sheet1.csv')

In [5]:
county_info.head()

Unnamed: 0,FIPS,COUNTYFIP,County,State,"County, State",EDD County,Census County,Population - Households,Rural/Urban,Redstone Regions,WF Regions,CDI Regions,Population
0,6001,1,Alameda,California,"Alameda, California",Alameda County,"Alameda County, California",383468,Urban,Bay Area,Bay Area,Bay Area,1656754
1,6013,13,Contra Costa,California,"Contra Costa, California",Contra Costa County,"Contra Costa County, California",282085,Urban,Bay Area,Bay Area,Bay Area,1142251
2,6095,95,Solano,California,"Solano, California",Solano County,"Solano County, California",107267,Urban,Bay Area,Bay Area,Bay Area,441829
3,6081,81,San Mateo,California,"San Mateo, California",San Mateo County,"San Mateo County, California",183144,Urban,Bay Area,Bay Area,Bay Area,767423
4,6085,85,Santa Clara,California,"Santa Clara, California",Santa Clara County,"Santa Clara County, California",456440,Urban,Bay Area,Bay Area,Bay Area,1927470


In [6]:
cost_of_living = pd.read_csv('data/united-way-col-1A1PS1C2019.csv') # 1 adult, 1 preschooler, 1 child

In [7]:
cost_of_living.head()

Unnamed: 0,Regions,Cost of Living
0,Bay Area,93392
1,Central Coast,76493
2,Central Valley,56747
3,Inland Empire,63170
4,Los Angeles,80216


In [8]:
# Attach county names and region labels to every IPUMS record via COUNTYFIP.
# pd.merge defaults to an inner join, so records whose COUNTYFIP has no row in
# county_info are dropped at this step.
ca_ipums = pd.merge(ca_ipums, county_info, on = 'COUNTYFIP')

In [9]:
len(ca_ipums['County'].unique())

34

In [10]:
# Keep only the IPUMS fields and the geography labels used below.
ca_ipums = ca_ipums[['INDNAICS', 'PERWT', 'INCWAGE',
       'NAICS Code', 'Industry Title_x', 'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code', 'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions']]

In [11]:
# Look up the cost of living for the record's CDI region.
# NOTE(review): ca_ipums.info() reported 224,526 rows before these merges, but
# the frame displayed after them ends at index 488,576 and starts with exact
# repeats of the same record. That suggests some 'Regions' labels in
# cost_of_living are not unique (e.g. one name used for both a county and a
# region), so each lookup can multiply rows. Duplicated person records inflate
# every PERWT-weighted total downstream - TODO confirm and de-duplicate.
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'CDI Regions', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'Regional COL'})

In [12]:
# Same lookup keyed by the Rural/Urban label.
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'Rural/Urban', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'Rural/Urban COL'})

In [13]:
# Same lookup keyed by county name.
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'County', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'County COL'})

In [14]:
# Drop the leftover 'Regions' key columns produced by the three merges above.
ca_ipums = ca_ipums[['INDNAICS', 'PERWT', 'INCWAGE',
       'NAICS Code', 'Industry Title_x', 'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code', 'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions',
                    'Regional COL', 'Rural/Urban COL', 'County COL']]

In [15]:
# Combined label, e.g. "Los Angeles Urban", used as the key for the fourth lookup.
ca_ipums['Regional Rural/Urban'] = ca_ipums['CDI Regions'] + ' ' + ca_ipums['Rural/Urban']

In [16]:
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'Regional Rural/Urban', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'Regional Rural/Urban COL'})

In [17]:
# Final column set: industry fields, geography labels, then the four COL thresholds.
ca_ipums = ca_ipums.rename(columns = {'Industry Title_x':'Industry Title'})
ca_ipums = ca_ipums[['INDNAICS', 'PERWT', 'INCWAGE',
       'NAICS Code', 'Industry Title', 'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code',
       'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions', 'Regional Rural/Urban',
                    'Regional COL', 'Rural/Urban COL', 'County COL', 'Regional Rural/Urban COL']]

In [18]:
# normalize_titles comes from jqi_functions (star import at the top of the notebook).
ca_ipums['Industry Title'] = normalize_titles(ca_ipums['Industry Title'])

In [19]:
ca_ipums

Unnamed: 0,INDNAICS,PERWT,INCWAGE,NAICS Code,Industry Title,Main_Code,Sub_1_Code,Sub_2_Code,Sub_3_Code,Sub_4_Code,County,Rural/Urban,CDI Regions,Regional Rural/Urban,Regional COL,Rural/Urban COL,County COL,Regional Rural/Urban COL
0,4853,21.0,23100,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,Los Angeles Urban,80216,79472,80216,80216
1,4853,21.0,23100,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,Los Angeles Urban,80216,79472,80216,80216
2,4853,21.0,23100,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,Los Angeles Urban,80216,79472,80216,80216
3,4853,21.0,23100,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,Los Angeles Urban,80216,79472,80216,80216
4,4853,11.0,28000,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,Los Angeles Urban,80216,79472,80216,80216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488572,3113,558.0,5300,3113,sugar and confectionery products,300,310,311,311,311,Humboldt,Rural,Redwood Coast,Redwood Coast Rural,58625,58812,59608,58625
488573,3113,244.0,5300,3113,sugar and confectionery products,300,310,311,311,311,Humboldt,Rural,Redwood Coast,Redwood Coast Rural,58625,58812,59608,58625
488574,4247,117.0,50000,4247,petroleum and petroleum products merchant whol...,400,420,424,424d,424d,Humboldt,Rural,Redwood Coast,Redwood Coast Rural,58625,58812,59608,58625
488575,42491,148.0,40000,42491,farm supplies merchant wholesalers,400,420,424,424,424,Humboldt,Rural,Redwood Coast,Redwood Coast Rural,58625,58812,59608,58625


## Create county lookup dataframe

In [20]:
county_info = county_info[['County', 'Rural/Urban', 'CDI Regions']]

In [21]:
county_info['Regional Rural/Urban'] = county_info['CDI Regions'] + ' ' + county_info['Rural/Urban']

In [22]:
county_info = pd.merge(county_info, cost_of_living, left_on = 'County', right_on = 'Regions')

In [23]:
county_info = county_info.rename(columns = {'Cost of Living':'County COL'})
county_info = county_info.drop(columns=['Regions'])

In [24]:
county_info = pd.merge(county_info, cost_of_living, left_on = 'Regional Rural/Urban', right_on = 'Regions')

In [25]:
county_info = county_info.rename(columns = {'Cost of Living':'Regional Rural/Urban COL'})
county_info = county_info.drop(columns=['Regions'])

In [26]:
county_info = pd.merge(county_info, cost_of_living, left_on = 'CDI Regions', right_on = 'Regions')

In [27]:
county_info = county_info.rename(columns = {'Cost of Living':'Regional COL'})
county_info = county_info.drop(columns=['Regions'])

In [28]:
county_info = pd.merge(county_info, cost_of_living, left_on = 'Rural/Urban', right_on = 'Regions')

In [29]:
county_info = county_info.rename(columns = {'Cost of Living':'Rural/Urban COL'})
county_info = county_info.drop(columns=['Regions'])

In [30]:
county_info['State COL'] = cost_of_living.iloc[11][1]

In [31]:
county_info

Unnamed: 0,County,Rural/Urban,CDI Regions,Regional Rural/Urban,County COL,Regional Rural/Urban COL,Regional COL,Rural/Urban COL,State COL
0,Alameda,Urban,Bay Area,Bay Area Urban,88296,94329,93392,79472,74448
1,Contra Costa,Urban,Bay Area,Bay Area Urban,86284,94329,93392,79472,74448
2,Solano,Urban,Bay Area,Bay Area Urban,66751,94329,93392,79472,74448
3,San Mateo,Urban,Bay Area,Bay Area Urban,112606,94329,93392,79472,74448
4,Santa Clara,Urban,Bay Area,Bay Area Urban,107879,94329,93392,79472,74448
...,...,...,...,...,...,...,...,...,...
67,Placer,Rural,Sacramento,Sacramento Rural,63983,64653,64520,58812,74448
68,Placer,Rural,Sacramento,Sacramento Rural,63983,64653,63126,58812,74448
69,Sutter,Rural,Sacramento,Sacramento Rural,55477,64653,64520,58812,74448
70,Sutter,Rural,Sacramento,Sacramento Rural,55477,64653,63126,58812,74448


## Merge with EDD Data

In [32]:
edd = pd.read_csv('data/edd_merged.csv')

In [33]:
# Strip the " County" text from area names, keep county-level rows only, and
# promote the LMID title to be the single 'Industry Title' column.
edd = (
    edd.assign(**{'Area Name': edd['Area Name'].str.replace(' County', '')})
    .loc[lambda frame: frame['Area Type'] == 'County']
    .drop(columns=['Industry Title'])
    .rename(columns={"LMID Industry Title": "Industry Title"})
)

In [34]:
edd.head()

Unnamed: 0,Industry Title,Parsed_Code,Area Type,Area Name,Date,Seasonally Adjusted,Current Employment,Main_EDD,Main_Code,Sub_1,Sub_1_Code,Sub_2,Sub_2_Code,Sub_3,Sub_3_Code,Sub_4,Sub_4_Code
0,county,939,County,Madera,3/1/19,N,1600,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
1,county,939,County,Fresno,1/1/19,N,7800,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
2,county,939,County,Kern,1/1/19,N,9900,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
3,county,939,County,Los Angeles,1/1/19,N,106800,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
4,county,939,County,Madera,1/1/19,N,1600,government,900,state and local government,940,local government,930,local government excluding education,932,county,939


#### TODO: fix this merge — joining on county and `Sub_4_Code` loses too many counties

In [35]:
# Inner join of IPUMS records to EDD rows on county name and Sub_4_Code; only
# (county, code) pairs present in both frames survive. Columns that exist in
# both inputs come back with _x (IPUMS) / _y (EDD) suffixes.
ca_ipums_edd = pd.merge(ca_ipums, edd, left_on=['County', 'Sub_4_Code'], right_on=['Area Name', 'Sub_4_Code'])

In [36]:
ca_ipums_edd.columns

Index(['INDNAICS', 'PERWT', 'INCWAGE', 'NAICS Code', 'Industry Title_x',
       'Main_Code_x', 'Sub_1_Code_x', 'Sub_2_Code_x', 'Sub_3_Code_x',
       'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions',
       'Regional Rural/Urban', 'Regional COL', 'Rural/Urban COL', 'County COL',
       'Regional Rural/Urban COL', 'Industry Title_y', 'Parsed_Code',
       'Area Type', 'Area Name', 'Date', 'Seasonally Adjusted',
       'Current Employment', 'Main_EDD', 'Main_Code_y', 'Sub_1',
       'Sub_1_Code_y', 'Sub_2', 'Sub_2_Code_y', 'Sub_3', 'Sub_3_Code_y',
       'Sub_4'],
      dtype='object')

In [38]:
ca_ipums_edd = ca_ipums_edd[['INDNAICS', 'PERWT', 'INCWAGE', 'NAICS Code',
                            'County', 'Rural/Urban', 'CDI Regions', 'Regional Rural/Urban',
                           'Regional COL', 'Rural/Urban COL', 'County COL',
                           'Regional Rural/Urban COL', 'Industry Title_y', 'Parsed_Code', 'Area Type',
                           'Date', 'Seasonally Adjusted', 'Current Employment', 'Main_EDD',
                           'Main_Code_y', 'Sub_1', 'Sub_1_Code_y', 'Sub_2', 'Sub_2_Code_y',
                           'Sub_3', 'Sub_3_Code_y', 'Sub_4', 'Sub_4_Code']]

In [39]:
ca_ipums_edd = ca_ipums_edd.rename(columns={"Main_Code_y": "Main_Code",
                                           "Sub_1_Code_y": "Sub_1_Code",
                                           "Sub_2_Code_y": "Sub_2_Code",
                                           "Sub_3_Code_y": "Sub_3_Code",
                                           "Industry Title_y": "Industry Title"})

In [41]:
ca_ipums_edd['Main_Code'] = ca_ipums_edd['Main_Code'].astype(object)
ca_ipums_edd['Sub_1_Code'] = ca_ipums_edd['Sub_1_Code'].astype(object)

In [44]:
# ca_ipums_edd.to_csv('data/ca_ipums_full_hierarchy.csv', encoding='utf-8', index=False)

### Checking what industries from original EDD data were dropped

In [45]:
# EDD rows whose Sub_4_Code never appears in the merged frame. The check is on
# the code alone (across all counties), so a code that matched in any one
# county is not reported here even if it was dropped in others.
edd_antijoin = edd[~edd['Sub_4_Code'].isin(ca_ipums_edd['Sub_4_Code'].unique())]

In [46]:
len(edd_antijoin)

1308

In [47]:
edd_antijoin['Sub_4_Code'].unique()

array(['939', '921', '931', '922', '912', '238c', '236a', '238e', '722d',
       '515b', '238d', '337a', '722c', '713b', '236b', '522c', '531c',
       '613', '238f', '722a', '423e', '515a'], dtype=object)

## Sample size IPUMS function

In [48]:
def add_to_state_df(df, threshold=74448):
    """Add statewide high-wage statistics for each industry (``INDNAICS``).

    A record is "high wage" when ``INCWAGE`` is strictly greater than
    ``threshold``. The columns are added to ``df`` in place, and the same
    frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``INCWAGE``, ``PERWT`` and ``INDNAICS``.
    threshold : int or float, default 74448
        Wage cut-off. The default is the value previously hard-coded in this
        function; it equals the 'State COL' figure shown for the county
        lookup table in this notebook.

    Returns
    -------
    pandas.DataFrame
        ``df`` with these columns added: ``Above CA Threshold`` (0/1),
        ``wt_ind_counts`` (sum of PERWT per industry), ``wt_CA_above_thresh``,
        ``wt_CA_high_wage_count``, ``wt_CA_high_wage_perc`` (0-100) and
        ``unwt_ind_counts`` (number of records per industry).
    """
    industry = df['INDNAICS']
    df['Above CA Threshold'] = (df['INCWAGE'] > threshold).astype(int)
    df['wt_ind_counts'] = df['PERWT'].groupby(industry).transform('sum')
    # Person weight counted only for records above the threshold.
    df['wt_CA_above_thresh'] = df['Above CA Threshold'] * df['PERWT']
    df['wt_CA_high_wage_count'] = df['wt_CA_above_thresh'].groupby(industry).transform('sum')
    df['wt_CA_high_wage_perc'] = (df['wt_CA_high_wage_count'] / df['wt_ind_counts']) * 100
    # Unweighted sample size, used later for minimum-sample checks.
    df['unwt_ind_counts'] = industry.groupby(industry).transform('count')
    return df

In [49]:
ca_ipums_hw = add_to_state_df(ca_ipums)

#### County Level

In [50]:
def add_to_county_df(df):
    """Add county-level high-wage statistics per (``INDNAICS``, ``County``).

    A record is "high wage" when ``INCWAGE`` is strictly greater than its
    ``County COL``. The caller's frame is left untouched. Calling the function
    again on its own output recomputes the columns instead of failing on
    suffixed duplicates.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``INDNAICS``, ``County``, ``PERWT``, ``INCWAGE`` and
        ``County COL``.

    Returns
    -------
    pandas.DataFrame
        A new frame (fresh integer index from the merge) with
        ``above_county_thresh``, ``wt_county_above_thresh``,
        ``wt_county_ind_counts``, ``wt_county_hw_count``,
        ``unwt_county_ind_counts`` and ``wt_county_hw_perc`` (0-100).
    """
    keys = ['INDNAICS', 'County']
    derived = ['wt_county_ind_counts', 'wt_county_hw_count',
               'unwt_county_ind_counts', 'wt_county_hw_perc']
    # drop() returns a new frame, so the assignments below never reach the
    # caller's object; removing stale outputs keeps the merge free of _x/_y.
    df = df.drop(columns=derived, errors='ignore')
    df['above_county_thresh'] = (df['INCWAGE'] > df['County COL']).astype(int)
    df['wt_county_above_thresh'] = df['above_county_thresh'] * df['PERWT']
    df_agg = df.groupby(keys).agg(
        wt_county_ind_counts=('PERWT', 'sum'),
        wt_county_hw_count=('wt_county_above_thresh', 'sum'),
        unwt_county_ind_counts=('INDNAICS', 'count'),
    ).reset_index()
    # Broadcast the per-group totals back onto every record.
    df = pd.merge(df, df_agg, on=keys)
    df['wt_county_hw_perc'] = (df['wt_county_hw_count'] / df['wt_county_ind_counts']) * 100
    return df

In [51]:
ca_ipums_hw = add_to_county_df(ca_ipums_hw)

#### Regional Level

In [52]:
def add_to_region_df(df):
    """Add regional high-wage statistics per (``INDNAICS``, ``CDI Regions``).

    A record is "high wage" when ``INCWAGE`` is strictly greater than its
    ``Regional COL``. The caller's frame is left untouched. Calling the
    function again on its own output recomputes the columns instead of
    failing on suffixed duplicates.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``INDNAICS``, ``CDI Regions``, ``PERWT``, ``INCWAGE`` and
        ``Regional COL``.

    Returns
    -------
    pandas.DataFrame
        A new frame (fresh integer index from the merge) with
        ``above_region_thresh``, ``wt_reg_above_thresh``,
        ``wt_reg_ind_counts``, ``wt_reg_high_wage_count``,
        ``unwt_reg_ind_counts`` and ``wt_reg_high_wage_perc`` (0-100).
    """
    keys = ['INDNAICS', 'CDI Regions']
    derived = ['wt_reg_ind_counts', 'wt_reg_high_wage_count',
               'unwt_reg_ind_counts', 'wt_reg_high_wage_perc']
    # drop() returns a new frame, so the assignments below never reach the
    # caller's object; removing stale outputs keeps the merge free of _x/_y.
    df = df.drop(columns=derived, errors='ignore')
    df['above_region_thresh'] = (df['INCWAGE'] > df['Regional COL']).astype(int)
    df['wt_reg_above_thresh'] = df['above_region_thresh'] * df['PERWT']
    df_agg = df.groupby(keys).agg(
        wt_reg_ind_counts=('PERWT', 'sum'),
        wt_reg_high_wage_count=('wt_reg_above_thresh', 'sum'),
        unwt_reg_ind_counts=('INDNAICS', 'count'),
    ).reset_index()
    # Broadcast the per-group totals back onto every record.
    df = pd.merge(df, df_agg, on=keys)
    df['wt_reg_high_wage_perc'] = (df['wt_reg_high_wage_count'] / df['wt_reg_ind_counts']) * 100
    return df

In [53]:
ca_ipums_hw = add_to_region_df(ca_ipums_hw)

#### Regional Rural/Urban Level

In [54]:
def add_to_regioncomm_df(df):
    """Add high-wage statistics per (``INDNAICS``, ``Regional Rural/Urban``).

    A record is "high wage" when ``INCWAGE`` is strictly greater than its
    ``Regional Rural/Urban COL``. The caller's frame is left untouched.
    Calling the function again on its own output recomputes the columns
    instead of failing on suffixed duplicates.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``INDNAICS``, ``Regional Rural/Urban``, ``PERWT``,
        ``INCWAGE`` and ``Regional Rural/Urban COL``.

    Returns
    -------
    pandas.DataFrame
        A new frame (fresh integer index from the merge) with
        ``above_regcomm_thresh``, ``wt_regcomm_above_thresh``,
        ``wt_regcomm_ind_counts``, ``wt_regcomm_hw_count``,
        ``unwt_regcomm_ind_counts`` and ``wt_regcomm_hw_perc`` (0-100).
    """
    keys = ['INDNAICS', 'Regional Rural/Urban']
    derived = ['wt_regcomm_ind_counts', 'wt_regcomm_hw_count',
               'unwt_regcomm_ind_counts', 'wt_regcomm_hw_perc']
    # drop() returns a new frame, so the assignments below never reach the
    # caller's object; removing stale outputs keeps the merge free of _x/_y.
    df = df.drop(columns=derived, errors='ignore')
    df['above_regcomm_thresh'] = (df['INCWAGE'] > df['Regional Rural/Urban COL']).astype(int)
    df['wt_regcomm_above_thresh'] = df['above_regcomm_thresh'] * df['PERWT']
    df_agg = df.groupby(keys).agg(
        wt_regcomm_ind_counts=('PERWT', 'sum'),
        wt_regcomm_hw_count=('wt_regcomm_above_thresh', 'sum'),
        unwt_regcomm_ind_counts=('INDNAICS', 'count'),
    ).reset_index()
    # Broadcast the per-group totals back onto every record.
    df = pd.merge(df, df_agg, on=keys)
    df['wt_regcomm_hw_perc'] = (df['wt_regcomm_hw_count'] / df['wt_regcomm_ind_counts']) * 100
    return df

In [55]:
# Continue from ca_ipums_hw (not the base ca_ipums) so the county and regional
# columns added in the two preceding steps are not thrown away.
ca_ipums_hw = add_to_regioncomm_df(ca_ipums_hw)

#### Rural/Urban level

In [56]:
def add_to_community_df(df):
    """Add rural/urban high-wage statistics per (``INDNAICS``, ``Rural/Urban``).

    A record is "high wage" when ``INCWAGE`` is strictly greater than its
    ``Rural/Urban COL``. The caller's frame is left untouched. Calling the
    function again on its own output recomputes the columns instead of
    failing on suffixed duplicates.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``INDNAICS``, ``Rural/Urban``, ``PERWT``, ``INCWAGE`` and
        ``Rural/Urban COL``.

    Returns
    -------
    pandas.DataFrame
        A new frame (fresh integer index from the merge) with
        ``above_comm_thresh``, ``wt_comm_above_thresh``,
        ``wt_comm_ind_counts``, ``wt_comm_high_wage_count``,
        ``unwt_comm_ind_counts`` and ``wt_comm_high_wage_perc`` (0-100).
    """
    keys = ['INDNAICS', 'Rural/Urban']
    derived = ['wt_comm_ind_counts', 'wt_comm_high_wage_count',
               'unwt_comm_ind_counts', 'wt_comm_high_wage_perc']
    # drop() returns a new frame, so the assignments below never reach the
    # caller's object; removing stale outputs keeps the merge free of _x/_y.
    df = df.drop(columns=derived, errors='ignore')
    df['above_comm_thresh'] = (df['INCWAGE'] > df['Rural/Urban COL']).astype(int)
    df['wt_comm_above_thresh'] = df['above_comm_thresh'] * df['PERWT']
    df_agg = df.groupby(keys).agg(
        wt_comm_ind_counts=('PERWT', 'sum'),
        wt_comm_high_wage_count=('wt_comm_above_thresh', 'sum'),
        unwt_comm_ind_counts=('INDNAICS', 'count'),
    ).reset_index()
    # Broadcast the per-group totals back onto every record.
    df = pd.merge(df, df_agg, on=keys)
    df['wt_comm_high_wage_perc'] = (df['wt_comm_high_wage_count'] / df['wt_comm_ind_counts']) * 100
    return df

In [57]:
ca_ipums_hw = add_to_community_df(ca_ipums_hw)

#### Add all levels

In [58]:
def add_geo_high_wages(df):
    """Run every geographic high-wage step on a copy of ``df``.

    The steps run in this order: state, rural/urban, regional, regional
    rural/urban, county. The input frame itself is copied first, and the
    fully augmented copy is returned.
    """
    return (
        df.copy()
        .pipe(add_to_state_df)        # state level counts
        .pipe(add_to_community_df)    # rural/urban level counts
        .pipe(add_to_region_df)       # regional level counts
        .pipe(add_to_regioncomm_df)   # regional rural/urban level counts
        .pipe(add_to_county_df)       # county level counts
    )

In [59]:
ca_ipums_hw = add_geo_high_wages(ca_ipums)

In [60]:
ca_ipums_hw.shape

(488577, 48)

In [61]:
# Store the two top hierarchy codes as object dtype.
ca_ipums_hw = ca_ipums_hw.astype({'Main_Code': object, 'Sub_1_Code': object})

In [62]:
ca_ipums_hw.head().T

Unnamed: 0,0,1,2,3,4
INDNAICS,4853,4853,4853,4853,4853
PERWT,21.0,21.0,21.0,21.0,11.0
INCWAGE,23100,23100,23100,23100,28000
NAICS Code,4853,4853,4853,4853,4853
Industry Title,taxi and limousine service,taxi and limousine service,taxi and limousine service,taxi and limousine service,taxi and limousine service
Main_Code,400,400,400,400,400
Sub_1_Code,430,430,430,430,430
Sub_2_Code,480,480,480,480,480
Sub_3_Code,483,483,483,483,483
Sub_4_Code,483,483,483,483,483


### Filtering for high wage percentage function

In [63]:
def ca_ipums_filter(df, county: str, NAICS: str, n: int):
    """Return the high-wage percentage for an industry in a county.

    The percentage is taken from the most local geography whose unweighted
    sample size is at least ``n``. The order tried is county, regional
    rural/urban, regional, rural/urban, then state.

    Parameters
    ----------
    df : pandas.DataFrame
        Output of ``add_geo_high_wages`` (needs ``County``, ``INDNAICS``,
        ``Industry Title`` and the count/percentage columns of every level).
    county : str
        Value to match in the ``County`` column.
    NAICS : str
        Value to match in the ``INDNAICS`` column.
    n : int
        Minimum number of unweighted records required to trust a level.

    Returns
    -------
    tuple
        ``(message, percentage)``. On failure (unknown county, unknown NAICS
        code, or no level with enough records) the percentage is ``np.nan``
        and the message explains why.
    """
    df = df.loc[df['County'] == county]
    if len(df) == 0:
        return "County not valid or found", np.nan
    df = df.loc[df['INDNAICS'] == NAICS]
    # Check for an empty result BEFORE reading .values[0]; otherwise an
    # unknown NAICS code raises IndexError instead of returning the message.
    if len(df) == 0:
        return "NAICS Code not valid or found", np.nan
    industry = df['Industry Title'].values[0]

    # (label, unweighted-count column, percentage column), most local first.
    levels = [
        ('County', 'unwt_county_ind_counts', 'wt_county_hw_perc'),
        ('Regional Rural/Urban', 'unwt_regcomm_ind_counts', 'wt_regcomm_hw_perc'),
        ('Regional', 'unwt_reg_ind_counts', 'wt_reg_high_wage_perc'),
        ('Rural/Urban', 'unwt_comm_ind_counts', 'wt_comm_high_wage_perc'),
        ('State', 'unwt_ind_counts', 'wt_CA_high_wage_perc'),
    ]
    for label, count_col, perc_col in levels:
        # Every remaining row shares county and industry, so the first row is
        # representative for all of these group-level columns.
        if df[count_col].values[0] >= n:
            perc = df[perc_col].values[0]
            return f"County: {county}, Geographical level used: {label}, Industry: {industry}, High wage percentage: {perc}", perc
    return "Not enough records available to satisfy sample size request", np.nan

#### Testing function

In [64]:
ca_ipums_filter(ca_ipums_hw, 'San Francisco', '51913', 10)

('County: San Francisco, Geographical level used: County, Industry: internet publishing and broadcasting and web search portals, High wage percentage: 79.44013490725126',
 79.44013490725126)

In [65]:
ca_ipums_filter(ca_ipums_hw, 'San Francisco', '814', 50)

('County: San Francisco, Geographical level used: Regional Rural/Urban, Industry: private households, High wage percentage: 1.5553977272727273',
 1.5553977272727273)

## EDD Hierarchy Function

#### Attempt 1:

In [66]:
def edd_to_naics(edd_df, county: str, industry_title: str):
    """List the IPUMS/NAICS industries that correspond to an EDD industry title.

    The EDD rows for ``county`` are filtered to those whose title at any
    hierarchy level (``Sub_4`` ... ``Sub_1``, ``Main_EDD``) equals the
    normalised ``industry_title``. The first such row per ``Main_EDD`` is then
    joined to the NAICS crosswalk on the most specific parsed code that
    yields a match.

    Parameters
    ----------
    edd_df : pandas.DataFrame
        EDD data with ``Area Name``, ``Current Employment``, the ``Sub_*`` /
        ``Main_EDD`` title columns and the matching ``*_Code`` columns.
    county : str
        Value to match in ``Area Name``.
    industry_title : str
        Industry title; it is stripped, lower-cased, has ``&`` replaced by
        ``and`` and punctuation removed before matching.

    Returns
    -------
    pandas.DataFrame or str
        Columns ``EDD Industry``, ``Area Name``, ``IPUMS Industry``,
        ``INDNAICS`` and ``year_avg_employment_ct`` on success, otherwise a
        message string describing what could not be found.
    """
    # filter edd by county and industry
    edd_df = edd_df.loc[edd_df['Area Name'] == county].copy()
    if len(edd_df) == 0:
        return "County not valid or found"
    industry_title = industry_title.strip().lower().replace('&', 'and')
    industry_title = re.sub(r'[^\w\s]', '', industry_title)
    # Titles live in Main_EDD at the top level; Main_Code holds the numeric
    # code, so comparing it with a title could never match.
    title_cols = ['Sub_4', 'Sub_3', 'Sub_2', 'Sub_1', 'Main_EDD']
    edd_df = edd_df.loc[edd_df[title_cols].eq(industry_title).any(axis=1)].copy()
    # Mean employment over every matching row (all dates and sub-industries).
    edd_df['year_avg_employment_ct'] = edd_df['Current Employment'].mean()
    edd_df = edd_df.drop_duplicates(subset='Main_EDD').reset_index(drop=True)

    # load naics crosswalk
    crosswalk_path = os.path.join(os.getcwd(), 'data', 'naics_parsed_crosswalk.csv')
    naics = pd.read_csv(crosswalk_path).drop_duplicates(subset='INDNAICS').reset_index(drop=True)
    naics['Industry Title'] = normalize_titles(naics['Industry Title'])

    # merge naics with edd: most specific code first, widening until rows match
    for code_col in ('Sub_4_Code', 'Sub_3_Code', 'Sub_2_Code', 'Sub_1_Code', 'Main_Code'):
        merged = pd.merge(edd_df, naics, on=code_col)
        if len(merged) > 0:
            break
    else:
        return "No parsed code of input industry found within input county"

    merged = merged.rename(columns={'Industry Title_x': 'EDD Industry',
                                    'Industry Title_y': 'IPUMS Industry'})
    return merged[['EDD Industry', 'Area Name', 'IPUMS Industry', 'INDNAICS', 'year_avg_employment_ct']]

In [67]:
def edd_naics_to_hw(filtered_edd_df, ipums_df_hw, naics: str, sample_size: int): # decide appropriate naics from edd_to_naics
    """Estimate the high-wage job count for one NAICS code of an ``edd_to_naics`` result.

    Parameters
    ----------
    filtered_edd_df : pandas.DataFrame or str
        Result of ``edd_to_naics``. When that call failed it is a message
        string, which is passed straight through.
    ipums_df_hw : pandas.DataFrame
        IPUMS frame with high-wage columns, as expected by ``ca_ipums_filter``.
    naics : str
        ``INDNAICS`` value chosen from ``filtered_edd_df``.
    sample_size : int
        Minimum unweighted sample size forwarded to ``ca_ipums_filter``.

    Returns
    -------
    tuple
        ``(message, high_wage_count)``. The count is ``np.nan`` when the NAICS
        code is missing or no percentage could be determined.
    """
    if isinstance(filtered_edd_df, str):
        # edd_to_naics reports failures as a plain message string.
        return filtered_edd_df, np.nan
    matches = filtered_edd_df.loc[filtered_edd_df['INDNAICS'] == naics]
    if len(matches) == 0:
        return "NAICS Code not valid or found", np.nan
    county = matches['Area Name'].values[0]
    employment_count = int(matches['year_avg_employment_ct'].values[0])
    output, hw_perc = ca_ipums_filter(ipums_df_hw, county, naics, sample_size)
    if pd.isna(hw_perc):
        # ca_ipums_filter signals failure with NaN; int(NaN) would raise.
        return output, np.nan
    hw_count = int((employment_count * hw_perc) / 100)
    output += f", High wage count: {hw_count}"
    return output, hw_count

Comment: I don't think we need to pick a specific NAICS code here, since we are simply extracting from one level of the EDD data.

#### Attempt 2

In [68]:
# NOTE(review): a second function named edd_to_hw is defined later in this
# notebook (the "Attempt 3" cell). Once that cell runs it replaces this
# definition, so this version is only kept as development history.
def edd_to_hw(edd_df, ipums_df_hw, county: str, parsed_code: str, date: str, sample_size: int):
    """Estimate the high-wage job count for an EDD industry in a county on a date.

    EDD rows are filtered by county, date and parsed code (matched against
    every hierarchy-level code column). The first remaining row per
    ``Main_EDD`` is joined to the NAICS crosswalk, and its
    ``Current Employment`` is scaled by the percentage returned by
    ``ca_ipums_filter``.

    Returns ``(message, high_wage_count)`` on success. The early-exit paths
    return a bare message string instead of a tuple.

    NOTE(review): ``ca_ipums_filter`` returns ``np.nan`` as its second value
    on its failure paths; ``int(...)`` of that product raises ValueError here.
    """
    # filter edd by county, date, and industry via parsed code
    edd_df = edd_df.loc[edd_df['Area Name'] == county].copy()
    if len(edd_df) == 0:
        return "County not valid or found"
    
    edd_df = edd_df.loc[edd_df['Date'] == date].copy()
    if len(edd_df) == 0:
        return "Date not valid or found"
    
    # Earlier title-based matching, superseded by the code-based filter below.
#     industry_title = industry_title.strip().lower().replace('&', 'and')
#     industry_title = re.sub(r'[^\w\s]','',industry_title)
#     edd_df = edd_df.loc[(edd_df['Sub_4'] == industry_title) | 
#                         (edd_df['Sub_3'] == industry_title) | 
#                         (edd_df['Sub_2'] == industry_title) | 
#                         (edd_df['Sub_1'] == industry_title) | 
#                         (edd_df['Main_EDD'] == industry_title)].copy()
    edd_df = edd_df.loc[(edd_df['Sub_4_Code'] == parsed_code) | 
                        (edd_df['Sub_3_Code'] == parsed_code) | 
                        (edd_df['Sub_2_Code'] == parsed_code) | 
                        (edd_df['Sub_1_Code'] == parsed_code) | 
                        (edd_df['Main_Code'] == parsed_code)].copy()
#     edd_df['year_avg_employment_ct'] = edd_df['Current Employment'].mean()
    # Keep the first row per Main_EDD and discard the old index column.
    edd_df = edd_df.drop_duplicates(subset='Main_EDD').reset_index().iloc[:,1:]
    
    # load naics crosswalk
    cwd = os.getcwd()
    naics = pd.read_csv(f'{cwd}/data/naics_parsed_crosswalk.csv').drop_duplicates(subset='INDNAICS').reset_index().iloc[:,1:]
    naics['Industry Title'] = normalize_titles(naics['Industry Title'])
    naics['Main_Code'] = naics['Main_Code'].astype(object)
    naics['Sub_1_Code'] = naics['Sub_1_Code'].astype(object)
    # merge naics with edd
    # Try the most specific code first and widen one level at a time until
    # the join returns at least one row.
    edd_df_ = pd.merge(edd_df, naics, on='Sub_4_Code')
    if len(edd_df_) == 0:
        edd_df_ = pd.merge(edd_df, naics, on='Sub_3_Code')
        if len(edd_df_) == 0:
            edd_df_ = pd.merge(edd_df, naics, on='Sub_2_Code')
            if len(edd_df_) == 0:
                edd_df_ = pd.merge(edd_df, naics, on='Sub_1_Code')
                if len(edd_df_) == 0:
                    edd_df_ = pd.merge(edd_df, naics, on='Main_Code')
                    if len(edd_df_) == 0:
                        return "No parsed code of input industry found within input county"
    edd_df = edd_df_.rename(columns = {'Industry Title_x':'EDD Industry',
                                 'Industry Title_y': 'IPUMS Industry'})
#     edd_df = edd_df[['EDD Industry', 'Area Name', 'IPUMS Industry', 'INDNAICS', 'year_avg_employment_ct']]
#     edd_df = edd_df.drop_duplicates(subset='year_avg_employment_ct').reset_index().iloc[:,1:]
    edd_df = edd_df[['EDD Industry', 'Area Name', 'IPUMS Industry', 'INDNAICS', 'Current Employment']]
    # Only the first joined row is used for the employment count and NAICS code.
    employment_count = int(edd_df['Current Employment'].values[0])
    naics_code = edd_df['INDNAICS'].values[0]
    output, hw_perc = ca_ipums_filter(ipums_df_hw, county, naics_code, sample_size)
    hw_count = int((employment_count * hw_perc) / 100)
    output += f", High wage count: {hw_count}"
    return output, hw_count

#### Attempt 3 - final as of 2/9/22

In [69]:
def edd_to_hw(edd_df, ipums_df_hw, county: str, parsed_code: str, date: str, sample_size: int):
    """Estimate the high-wage job count for an EDD industry in a county on a date.

    EDD rows are filtered by county, date and ``parsed_code`` (matched against
    every hierarchy-level code column). The first remaining row per
    ``Main_EDD`` is joined to the NAICS crosswalk on the most specific code
    that matches. That row's ``Current Employment`` is then scaled by the
    high-wage percentage from ``ca_ipums_filter``.

    Parameters
    ----------
    edd_df : pandas.DataFrame
        EDD data with ``Area Name``, ``Date``, ``Current Employment``,
        ``Main_EDD`` and the ``*_Code`` hierarchy columns.
    ipums_df_hw : pandas.DataFrame
        IPUMS frame with high-wage columns, as expected by ``ca_ipums_filter``.
    county : str
        Value to match in ``Area Name``.
    parsed_code : str
        Parsed industry code to look for at any hierarchy level.
    date : str
        Value to match in ``Date`` (same text format as the column).
    sample_size : int
        Minimum unweighted sample size forwarded to ``ca_ipums_filter``.

    Returns
    -------
    tuple or str
        ``(message, high_wage_count)`` on success. On any failure a bare
        message string is returned. That now also covers the case where
        ``ca_ipums_filter`` cannot supply a percentage, which previously
        crashed on ``int(nan)``.
    """
    # filter edd by county, date, and industry via parsed code
    edd_df = edd_df.loc[edd_df['Area Name'] == county]
    if len(edd_df) == 0:
        return "County not valid or found"
    edd_df = edd_df.loc[edd_df['Date'] == date]
    if len(edd_df) == 0:
        return "Date not valid or found"
    code_cols = ['Sub_4_Code', 'Sub_3_Code', 'Sub_2_Code', 'Sub_1_Code', 'Main_Code']
    edd_df = edd_df.loc[edd_df[code_cols].eq(parsed_code).any(axis=1)]
    # NOTE(review): this keeps whichever matching row comes first per Main_EDD,
    # which is not necessarily the row whose own Parsed_Code equals
    # parsed_code - confirm that is the intended employment figure.
    edd_df = edd_df.drop_duplicates(subset='Main_EDD').reset_index(drop=True)

    # load naics crosswalk
    crosswalk_path = os.path.join(os.getcwd(), 'data', 'naics_parsed_crosswalk.csv')
    naics = pd.read_csv(crosswalk_path).drop_duplicates(subset='INDNAICS').reset_index(drop=True)
    naics['Industry Title'] = normalize_titles(naics['Industry Title'])
    naics['Main_Code'] = naics['Main_Code'].astype(object)
    naics['Sub_1_Code'] = naics['Sub_1_Code'].astype(object)

    # merge naics with edd: most specific code first, widening until rows match
    for code_col in code_cols:
        merged = pd.merge(edd_df, naics, on=code_col)
        if len(merged) > 0:
            break
    else:
        return "No parsed code of input industry found within input county"

    merged = merged.rename(columns={'Industry Title_x': 'EDD Industry',
                                    'Industry Title_y': 'IPUMS Industry'})
    merged = merged[['EDD Industry', 'Area Name', 'IPUMS Industry', 'INDNAICS', 'Current Employment']]
    # Only the first joined row is used for the employment count and NAICS code.
    employment_count = int(merged['Current Employment'].values[0])
    naics_code = merged['INDNAICS'].values[0]
    output, hw_perc = ca_ipums_filter(ipums_df_hw, county, naics_code, sample_size)
    if pd.isna(hw_perc):
        # ca_ipums_filter signals failure with NaN; report its message rather
        # than crashing on int(NaN).
        return output
    hw_count = int((employment_count * hw_perc) / 100)
    output += f", High wage count: {hw_count}"
    return output, hw_count

## Test example:

In [70]:
edd_to_hw(edd, ca_ipums_hw, 'Los Angeles', '311', '1/1/19', 100)

('County: Los Angeles, Geographical level used: County, Industry: retail bakeries, High wage percentage: 1.5729109776078645, High wage count: 243',
 243)