# Create hierarchical structure for IPUMS and EDD data

In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import warnings
import os
import re
from jqi_functions import *
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format='retina'

## Creating CSV/dataframe

In [2]:
ca_ipums = cleaned_ipums('2019')

In [3]:
ca_ipums.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 224526 entries, 0 to 224525
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   YEAR              224526 non-null  int64  
 1   STATEFIP          224526 non-null  int64  
 2   COUNTYFIP         224526 non-null  int64  
 3   INDNAICS          224526 non-null  object 
 4   PERWT             224526 non-null  float64
 5   INCWAGE           224526 non-null  int64  
 6   NAICS Code        224526 non-null  object 
 7   Industry Title_x  224526 non-null  object 
 8   Industry Title_y  224526 non-null  object 
 9   Main_Code         224526 non-null  int64  
 10  Sub_1_Code        224526 non-null  int64  
 11  Sub_2_Code        224526 non-null  object 
 12  Sub_3_Code        224526 non-null  object 
 13  Sub_4_Code        224526 non-null  object 
dtypes: float64(1), int64(6), object(7)
memory usage: 25.7+ MB


In [4]:
county_info = pd.read_csv('data/county_to_regions_key - Sheet1.csv')

In [5]:
county_info.head()

Unnamed: 0,FIPS,COUNTYFIP,County,State,"County, State",EDD County,Census County,Population - Households,Rural/Urban,Redstone Regions,WF Regions,CDI Regions,Population
0,6001,1,Alameda,California,"Alameda, California",Alameda County,"Alameda County, California",383468,Urban,Bay Area,Bay Area,Bay Area,1656754
1,6013,13,Contra Costa,California,"Contra Costa, California",Contra Costa County,"Contra Costa County, California",282085,Urban,Bay Area,Bay Area,Bay Area,1142251
2,6095,95,Solano,California,"Solano, California",Solano County,"Solano County, California",107267,Urban,Bay Area,Bay Area,Bay Area,441829
3,6081,81,San Mateo,California,"San Mateo, California",San Mateo County,"San Mateo County, California",183144,Urban,Bay Area,Bay Area,Bay Area,767423
4,6085,85,Santa Clara,California,"Santa Clara, California",Santa Clara County,"Santa Clara County, California",456440,Urban,Bay Area,Bay Area,Bay Area,1927470


In [6]:
cost_of_living = pd.read_csv('data/united-way-col-1A1PS1C2019.csv') # 1 adult, 1 preschooler, 1 child

In [7]:
cost_of_living.head()

Unnamed: 0,Regions,Cost of Living
0,Bay Area,93392
1,Central Coast,76493
2,Central Valley,56747
3,Inland Empire,63170
4,Los Angeles,80216


In [8]:
ca_ipums = pd.merge(ca_ipums, county_info, on = 'COUNTYFIP')

In [9]:
len(ca_ipums['County'].unique())

34

In [10]:
ca_ipums = ca_ipums[['INDNAICS', 'PERWT', 'INCWAGE',
       'NAICS Code', 'Industry Title_x', 'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code', 'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions']]

In [11]:
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'CDI Regions', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'Regional COL'})

In [12]:
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'Rural/Urban', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'Rural/Urban COL'})

In [13]:
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'County', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'County COL'})

In [14]:
ca_ipums = ca_ipums[['INDNAICS', 'PERWT', 'INCWAGE',
       'NAICS Code', 'Industry Title_x', 'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code', 'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions',
                    'Regional COL', 'Rural/Urban COL', 'County COL']]

In [15]:
ca_ipums['Regional Rural/Urban'] = ca_ipums['CDI Regions'] + ' ' + ca_ipums['Rural/Urban']

In [16]:
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'Regional Rural/Urban', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'Regional Rural/Urban COL'})

In [17]:
ca_ipums = ca_ipums.rename(columns = {'Industry Title_x':'Industry Title'})
ca_ipums = ca_ipums[['INDNAICS', 'PERWT', 'INCWAGE',
       'NAICS Code', 'Industry Title', 'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code',
       'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions', 'Regional Rural/Urban',
                    'Regional COL', 'Rural/Urban COL', 'County COL', 'Regional Rural/Urban COL']]

In [18]:
ca_ipums['Industry Title'] = normalize_titles(ca_ipums['Industry Title'])

In [19]:
ca_ipums

Unnamed: 0,INDNAICS,PERWT,INCWAGE,NAICS Code,Industry Title,Main_Code,Sub_1_Code,Sub_2_Code,Sub_3_Code,Sub_4_Code,County,Rural/Urban,CDI Regions,Regional Rural/Urban,Regional COL,Rural/Urban COL,County COL,Regional Rural/Urban COL
0,4853,21.0,23100,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,Los Angeles Urban,80216,79472,80216,80216
1,4853,21.0,23100,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,Los Angeles Urban,80216,79472,80216,80216
2,4853,21.0,23100,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,Los Angeles Urban,80216,79472,80216,80216
3,4853,21.0,23100,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,Los Angeles Urban,80216,79472,80216,80216
4,4853,11.0,28000,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,Los Angeles Urban,80216,79472,80216,80216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488572,3113,558.0,5300,3113,sugar and confectionery products,300,310,311,311,311,Humboldt,Rural,Redwood Coast,Redwood Coast Rural,58625,58812,59608,58625
488573,3113,244.0,5300,3113,sugar and confectionery products,300,310,311,311,311,Humboldt,Rural,Redwood Coast,Redwood Coast Rural,58625,58812,59608,58625
488574,4247,117.0,50000,4247,petroleum and petroleum products merchant whol...,400,420,424,424d,424d,Humboldt,Rural,Redwood Coast,Redwood Coast Rural,58625,58812,59608,58625
488575,42491,148.0,40000,42491,farm supplies merchant wholesalers,400,420,424,424,424,Humboldt,Rural,Redwood Coast,Redwood Coast Rural,58625,58812,59608,58625


## Merge with EDD Data

In [54]:
edd = pd.read_csv('data/edd_merged.csv')

In [56]:
edd['Area Name'] = edd['Area Name'].str.replace(' County', '')
edd = edd.loc[edd['Area Type'] == 'County']
edd = edd.drop(columns=['Industry Title'])
edd = edd.rename(columns={"LMID Industry Title": "Industry Title"})

In [57]:
edd.head()

Unnamed: 0,Industry Title,Parsed_Code,Area Type,Area Name,Date,Seasonally Adjusted,Current Employment,Main_EDD,Main_Code,Sub_1,Sub_1_Code,Sub_2,Sub_2_Code,Sub_3,Sub_3_Code,Sub_4,Sub_4_Code
0,county,939,County,Madera,3/1/19,N,1600,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
1,county,939,County,Fresno,1/1/19,N,7800,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
2,county,939,County,Kern,1/1/19,N,9900,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
3,county,939,County,Los Angeles,1/1/19,N,106800,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
4,county,939,County,Madera,1/1/19,N,1600,government,900,state and local government,940,local government,930,local government excluding education,932,county,939


#### Fix merge - losing too many counties this way

In [23]:
ca_ipums_edd = pd.merge(ca_ipums, edd, left_on=['County', 'Industry Title'], right_on=['Area Name', 'Sub_4'])

In [27]:
ca_ipums_edd = ca_ipums_edd[['INDNAICS', 'PERWT', 'INCWAGE', 'NAICS Code',
                            'County', 'Rural/Urban', 'CDI Regions', 'Regional Rural/Urban',
                           'Regional COL', 'Rural/Urban COL', 'County COL',
                           'Regional Rural/Urban COL', 'LMID Industry Title', 'Area Type',
                           'Date', 'Seasonally Adjusted', 'Current Employment', 'Main_EDD',
                           'Main_Code_y', 'Sub_1', 'Sub_1_Code_y', 'Sub_2', 'Sub_2_Code_y',
                           'Sub_3', 'Sub_3_Code_y', 'Sub_4', 'Sub_4_Code_y']]

In [28]:
ca_ipums_edd = ca_ipums_edd.rename(columns={"Main_Code_y": "Main_Code",
                                           "Sub_1_Code_y": "Sub_1_Code",
                                           "Sub_2_Code_y": "Sub_2_Code",
                                           "Sub_3_Code_y": "Sub_3_Code",
                                           "Sub_4_Code_y": "Sub_4_Code"})

In [29]:
ca_ipums_edd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 741060 entries, 0 to 741059
Data columns (total 27 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   INDNAICS                  741060 non-null  object 
 1   PERWT                     741060 non-null  float64
 2   INCWAGE                   741060 non-null  int64  
 3   NAICS Code                741060 non-null  object 
 4   County                    741060 non-null  object 
 5   Rural/Urban               741060 non-null  object 
 6   CDI Regions               741060 non-null  object 
 7   Regional Rural/Urban      741060 non-null  object 
 8   Regional COL              741060 non-null  int64  
 9   Rural/Urban COL           741060 non-null  int64  
 10  County COL                741060 non-null  int64  
 11  Regional Rural/Urban COL  741060 non-null  int64  
 12  LMID Industry Title       741060 non-null  object 
 13  Area Type                 741060 non-null  o

In [30]:
# ca_ipums_edd.to_csv('data/ca_ipums_full_hierarchy.csv', encoding='utf-8', index=False)

### Checking what industries from original EDD data were dropped

In [31]:
edd_antijoin = edd[~edd['LMID Industry Title'].isin(ca_ipums_edd['Sub_4'].unique())]

In [32]:
len(edd_antijoin)

3144

In [33]:
edd_antijoin['LMID Industry Title'].unique()

array(['county', 'total farm', 'state government education', 'city',
       'local government education', 'general merchandise stores',
       'state government excluding education', 'department of defense',
       'federal government excluding department of defense',
       'building foundation and exterior contractors',
       'residential building construction',
       'building finishing contractors',
       'travel arrangement and reservation services',
       'health and personal care stores',
       'bakeries and tortilla manufacturing',
       'primary metal manufacturing', 'lessors of real estate',
       'prof and commercial equip merchant wholesalers',
       'mgmt scientific and technical consulting services',
       'transit and ground passenger transportation',
       'limitedservice eating places', 'grocery stores',
       'sporting goods hobby and musical instrument store',
       'textile mills', 'offices of real estate agents and brokers',
       'agencies broker and 

## Sample size IPUMS function

In [34]:
def add_to_state_df(df):
    df['Above CA Threshold'] = df['INCWAGE'] > 74448
    df['wt_ind_counts'] = df['PERWT'].groupby(df['INDNAICS']).transform('sum')
    df["Above CA Threshold"] = df["Above CA Threshold"].astype(int)
    df["wt_CA_above_thresh"] = df["Above CA Threshold"] * df['PERWT']
    df['wt_CA_high_wage_count'] = df['wt_CA_above_thresh'].groupby(
        df['INDNAICS']).transform('sum')
    df['wt_CA_high_wage_perc'] = (df['wt_CA_high_wage_count'] / df['wt_ind_counts']) * 100
    df['unwt_ind_counts'] = df['INDNAICS'].groupby(df['INDNAICS']).transform('count')
    return df

In [35]:
ca_ipums_hw = add_to_state_df(ca_ipums)
ca_ipums_hw.columns.values

array(['INDNAICS', 'PERWT', 'INCWAGE', 'NAICS Code', 'Industry Title',
       'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code',
       'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions',
       'Regional Rural/Urban', 'Regional COL', 'Rural/Urban COL',
       'County COL', 'Regional Rural/Urban COL', 'Above CA Threshold',
       'wt_ind_counts', 'wt_CA_above_thresh', 'wt_CA_high_wage_count',
       'wt_CA_high_wage_perc', 'unwt_ind_counts'], dtype=object)

#### County Level

In [36]:
def add_to_county_df(df):
    df['above_county_thresh'] = df['INCWAGE'] > df['County COL']
    df["above_county_thresh"] = df["above_county_thresh"].astype(int)
    df["wt_county_above_thresh"] = df["above_county_thresh"] * df['PERWT']
    df_agg = df.groupby(['INDNAICS','County']).agg(wt_county_ind_counts = ('PERWT','sum'),
                                                     wt_county_hw_count = ('wt_county_above_thresh','sum'),
                                                     unwt_county_ind_counts = ('INDNAICS','count')).reset_index()    
    
    df = pd.merge(df, df_agg, on=['INDNAICS', 'County'])
    df['wt_county_hw_perc'] = (df['wt_county_hw_count'] / df['wt_county_ind_counts']) * 100
    df = df.rename(columns={"County_x": "County", 'wt_county_hw_count_x':'wt_county_hw_count','wt_county_ind_counts_x':'wt_county_ind_counts',
                           'unwt_county_ind_counts_x':'unwt_county_ind_counts'})
    return df

In [37]:
ca_ipums_hw = add_to_county_df(ca_ipums_hw)
ca_ipums_hw.columns.values

array(['INDNAICS', 'PERWT', 'INCWAGE', 'NAICS Code', 'Industry Title',
       'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code',
       'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions',
       'Regional Rural/Urban', 'Regional COL', 'Rural/Urban COL',
       'County COL', 'Regional Rural/Urban COL', 'Above CA Threshold',
       'wt_ind_counts', 'wt_CA_above_thresh', 'wt_CA_high_wage_count',
       'wt_CA_high_wage_perc', 'unwt_ind_counts', 'above_county_thresh',
       'wt_county_above_thresh', 'wt_county_ind_counts',
       'wt_county_hw_count', 'unwt_county_ind_counts',
       'wt_county_hw_perc'], dtype=object)

#### Regional Level

In [38]:
def add_to_region_df(df): 
    df['above_region_thresh'] = df['INCWAGE'] > df['Regional COL']
    df["above_region_thresh"] = df["above_region_thresh"].astype(int)
    df["wt_reg_above_thresh"] = df["above_region_thresh"] * df['PERWT']
    df_agg = df.groupby(['INDNAICS','CDI Regions']).agg(wt_reg_ind_counts = ('PERWT','sum'),
                                                     wt_reg_high_wage_count = ('wt_reg_above_thresh','sum'),
                                                     unwt_reg_ind_counts = ('INDNAICS','count')).reset_index()    
    
    df = pd.merge(df, df_agg, on=['INDNAICS', 'CDI Regions'])
    df['wt_reg_high_wage_perc'] = (df['wt_reg_high_wage_count'] / df['wt_reg_ind_counts']) * 100
    df = df.rename(columns={"CDI Regions_x": "CDI Regions", 'wt_reg_high_wage_count_x':'wt_reg_high_wage_count','wt_reg_ind_counts_x':'wt_reg_ind_counts',
                           'unwt_reg_ind_counts_x':'unwt_reg_ind_counts'})
    return df

In [39]:
ca_ipums_hw = add_to_region_df(ca_ipums_hw)
ca_ipums_hw.columns.values

array(['INDNAICS', 'PERWT', 'INCWAGE', 'NAICS Code', 'Industry Title',
       'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code',
       'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions',
       'Regional Rural/Urban', 'Regional COL', 'Rural/Urban COL',
       'County COL', 'Regional Rural/Urban COL', 'Above CA Threshold',
       'wt_ind_counts', 'wt_CA_above_thresh', 'wt_CA_high_wage_count',
       'wt_CA_high_wage_perc', 'unwt_ind_counts', 'above_county_thresh',
       'wt_county_above_thresh', 'wt_county_ind_counts',
       'wt_county_hw_count', 'unwt_county_ind_counts',
       'wt_county_hw_perc', 'above_region_thresh', 'wt_reg_above_thresh',
       'wt_reg_ind_counts', 'wt_reg_high_wage_count',
       'unwt_reg_ind_counts', 'wt_reg_high_wage_perc'], dtype=object)

#### Regional Rural/Urban Level

In [40]:
def add_to_regioncomm_df(df):
    df['above_regcomm_thresh'] = df['INCWAGE'] > df['Regional Rural/Urban COL']
    df["above_regcomm_thresh"] = df["above_regcomm_thresh"].astype(int)
    df["wt_regcomm_above_thresh"] = df["above_regcomm_thresh"] * df['PERWT']
    df_agg = df.groupby(['INDNAICS','Regional Rural/Urban']).agg(wt_regcomm_ind_counts = ('PERWT','sum'),
                                                     wt_regcomm_hw_count = ('wt_regcomm_above_thresh','sum'),
                                                     unwt_regcomm_ind_counts = ('INDNAICS','count')).reset_index()    
    
    df = pd.merge(df, df_agg, on=['INDNAICS', 'Regional Rural/Urban'])
    df['wt_regcomm_hw_perc'] = (df['wt_regcomm_hw_count'] / df['wt_regcomm_ind_counts']) * 100
    df = df.rename(columns={"Regional Rural/Urban_x": "Regional Rural/Urban", 'wt_regcomm_hw_count_x':'wt_regcomm_hw_count','wt_regcomm_ind_counts_x':'wt_regcomm_ind_counts',
                           'unwt_regcomm_ind_counts_x':'unwt_regcomm_ind_counts'})
    return df

In [41]:
ca_ipums_hw = add_to_regioncomm_df(ca_ipums)
ca_ipums_hw.columns.values

array(['INDNAICS', 'PERWT', 'INCWAGE', 'NAICS Code', 'Industry Title',
       'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code',
       'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions',
       'Regional Rural/Urban', 'Regional COL', 'Rural/Urban COL',
       'County COL', 'Regional Rural/Urban COL', 'Above CA Threshold',
       'wt_ind_counts', 'wt_CA_above_thresh', 'wt_CA_high_wage_count',
       'wt_CA_high_wage_perc', 'unwt_ind_counts', 'above_county_thresh',
       'wt_county_above_thresh', 'above_regcomm_thresh',
       'wt_regcomm_above_thresh', 'wt_regcomm_ind_counts',
       'wt_regcomm_hw_count', 'unwt_regcomm_ind_counts',
       'wt_regcomm_hw_perc'], dtype=object)

#### Rural/Urban level

In [42]:
def add_to_community_df(df):
    df['above_comm_thresh'] = df['INCWAGE'] > df['Rural/Urban COL']
    df["above_comm_thresh"] = df["above_comm_thresh"].astype(int)
    df["wt_comm_above_thresh"] = df["above_comm_thresh"] * df['PERWT']
    df_agg = df.groupby(['INDNAICS','Rural/Urban']).agg(wt_comm_ind_counts = ('PERWT','sum'),
                                                     wt_comm_high_wage_count = ('wt_comm_above_thresh','sum'),
                                                     unwt_comm_ind_counts = ('INDNAICS','count')).reset_index()    
    
    df = pd.merge(df, df_agg, on=['INDNAICS', 'Rural/Urban'])
    df['wt_comm_high_wage_perc'] = (df['wt_comm_high_wage_count'] / df['wt_comm_ind_counts']) * 100
    df = df.rename(columns={"Rural/Urban_x": "Rural/Urban", 'wt_comm_high_wage_count_x':'wt_comm_high_wage_count','wt_comm_ind_counts_x':'wt_comm_ind_counts',
                           'unwt_comm_ind_counts_x':'unwt_comm_ind_counts'})
    return df

In [43]:
ca_ipums_hw = add_to_community_df(ca_ipums_hw)
ca_ipums_hw.columns.values

array(['INDNAICS', 'PERWT', 'INCWAGE', 'NAICS Code', 'Industry Title',
       'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code',
       'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions',
       'Regional Rural/Urban', 'Regional COL', 'Rural/Urban COL',
       'County COL', 'Regional Rural/Urban COL', 'Above CA Threshold',
       'wt_ind_counts', 'wt_CA_above_thresh', 'wt_CA_high_wage_count',
       'wt_CA_high_wage_perc', 'unwt_ind_counts', 'above_county_thresh',
       'wt_county_above_thresh', 'above_regcomm_thresh',
       'wt_regcomm_above_thresh', 'wt_regcomm_ind_counts',
       'wt_regcomm_hw_count', 'unwt_regcomm_ind_counts',
       'wt_regcomm_hw_perc', 'above_comm_thresh', 'wt_comm_above_thresh',
       'wt_comm_ind_counts', 'wt_comm_high_wage_count',
       'unwt_comm_ind_counts', 'wt_comm_high_wage_perc'], dtype=object)

In [44]:
def add_geo_high_wages(df):
    df_new = df.copy() # initialize new dataframe
    df_new = add_to_state_df(df_new) # creating state level counts
    df_new = add_to_community_df(df_new) # creating rural/urban level counts
    df_new = add_to_region_df(df_new) # creating regional level counts
    df_new = add_to_regioncomm_df(df_new) # creating regional rural/urban level counts
    df_new = add_to_county_df(df_new) # creating county level counts
    return df_new

In [45]:
ca_ipums_hw = add_geo_high_wages(ca_ipums)

In [161]:
ca_ipums_hw.shape

(488577, 48)

In [46]:
ca_ipums_hw.head().T

Unnamed: 0,0,1,2,3,4
INDNAICS,4853,4853,4853,4853,4853
PERWT,21.0,21.0,21.0,21.0,11.0
INCWAGE,23100,23100,23100,23100,28000
NAICS Code,4853,4853,4853,4853,4853
Industry Title,taxi and limousine service,taxi and limousine service,taxi and limousine service,taxi and limousine service,taxi and limousine service
Main_Code,400,400,400,400,400
Sub_1_Code,430,430,430,430,430
Sub_2_Code,480,480,480,480,480
Sub_3_Code,483,483,483,483,483
Sub_4_Code,483,483,483,483,483


In [106]:
def ca_ipums_filter(df, county: str, NAICS: str, n: int):
    df = df.loc[df['County'] == county].copy()
    if len(df) == 0:
        return "County not valid or found", None
    df = df.loc[df['INDNAICS'] == NAICS].copy()
    if len(df) == 0:
        return "NAICS Code not valid or found", None
    df = df.drop_duplicates(subset='INDNAICS').reset_index().iloc[:,1:]
    if df['unwt_county_ind_counts'].values[0] >= n:
        return f"County: {county}, Geographical level used: County, High wage percentage: {df['wt_county_hw_perc'].values[0]}", df['wt_county_hw_perc'].values[0]
    elif df['unwt_regcomm_ind_counts'].values[0] >= n:
        return f"County: {county}, Geographical level used: Regional Rural/Urban, High wage percentage: {df['wt_regcomm_hw_perc'].values[0]}", df['wt_regcomm_hw_perc'].values[0]
    elif df['unwt_reg_ind_counts'].values[0] >= n:
        return f"County: {county}, Geographical level used: Regional, High wage percentage: {df['wt_reg_high_wage_perc'].values[0]}", df['wt_reg_high_wage_perc'].values[0]
    elif df['unwt_comm_ind_counts'].values[0] >= n:
        return f"County: {county}, Geographical level used: Rural/Urban, High wage percentage: {df['wt_comm_high_wage_perc'].values[0]}", df['wt_comm_high_wage_perc'].values[0]
    elif df['unwt_ind_counts'].values[0] >= n:
        return f"County: {county}, Geographical level used: State, High wage percentage: {df['wt_CA_high_wage_perc'].values[0]}", df['wt_CA_high_wage_perc'].values[0]
    else:
        return "Not enough records available to satisfy sample size request", None

#### Testing function

In [48]:
ca_ipums_filter(ca_ipums_hw, 'San Francisco', '51913', 10)

'County: San Francisco, Geographical level used: County, High wage percentage: 79.44013490725126'

In [49]:
ca_ipums_filter(ca_ipums_hw, 'San Francisco', '814', 50)

'County: San Francisco, Geographical level used: Regional Rural/Urban, High wage percentage: 1.5553977272727273'

## EDD Hierarchy Function

In [194]:
def edd_to_naics(edd_df, county: str, industry_title: str):
    # filter edd by county and industry
    edd_df = edd_df.loc[edd_df['Area Name'] == county].copy()
    if len(edd_df) == 0:
        return "County not valid or found"
    industry_title = industry_title.strip().lower().replace('&', 'and')
    industry_title = re.sub(r'[^\w\s]','',industry_title)
    edd_df = edd_df.loc[(edd_df['Sub_4'] == industry_title) | 
                        (edd_df['Sub_3'] == industry_title) | 
                        (edd_df['Sub_2'] == industry_title) | 
                        (edd_df['Sub_1'] == industry_title) | 
                        (edd_df['Main_Code'] == industry_title)].copy()
    edd_df['year_avg_employment_ct'] = edd_df['Current Employment'].mean()
    edd_df = edd_df.drop_duplicates(subset='Main_EDD').reset_index().iloc[:,1:]
    # load naics crosswalk
    cwd = os.getcwd()
    naics = pd.read_csv(f'{cwd}/data/naics_parsed_crosswalk.csv').drop_duplicates(subset='INDNAICS').reset_index().iloc[:,1:]
    naics['Industry Title'] = normalize_titles(naics['Industry Title'])
    # merge naics with edd
    edd_df_ = pd.merge(edd_df, naics, on='Sub_4_Code')
    if len(edd_df_) == 0:
        edd_df_ = pd.merge(edd_df, naics, on='Sub_3_Code')
        if len(edd_df_) == 0:
            edd_df_ = pd.merge(edd_df, naics, on='Sub_2_Code')
            if len(edd_df_) == 0:
                edd_df_ = pd.merge(edd_df, naics, on='Sub_1_Code')
                if len(edd_df_) == 0:
                    edd_df_ = pd.merge(edd_df, naics, on='Main_Code')
                    if len(edd_df_) == 0:
                        return "No parsed code of input industry found within input county"
    edd_df = edd_df_.rename(columns = {'Industry Title_x':'EDD Industry',
                                 'Industry Title_y': 'IPUMS Industry'})
    edd_df = edd_df[['EDD Industry', 'Area Name', 'IPUMS Industry', 'INDNAICS', 'year_avg_employment_ct']]
    return edd_df

In [195]:
def edd_naics_to_hw(filtered_edd_df, ipums_df_hw, naics: str, sample_size: int): # decide appropriate naics from edd_function1 output
    filtered_edd_df = filtered_edd_df.loc[filtered_edd_df['INDNAICS'] == naics].copy()
    county = filtered_edd_df['Area Name'].values[0]
    employment_count = int(filtered_edd_df['year_avg_employment_ct'].values[0])
    output, hw_perc = ca_ipums_filter(ipums_df_hw, county, naics, sample_size)
    hw_count = int((employment_count * hw_perc) / 100)
    output += f", High wage count: {hw_count}"
    return output, hw_count

Comment: don't think we need to pick a specific NAICS code, since we are simply extracting from a level of EDD data

In [196]:
def edd_to_hw(edd_df, ipums_df_hw, county: str, industry_title: str, sample_size: int):
    # filter edd by county and industry
    edd_df = edd_df.loc[edd_df['Area Name'] == county].copy()
    if len(edd_df) == 0:
        return "County not valid or found"
    industry_title = industry_title.strip().lower().replace('&', 'and')
    industry_title = re.sub(r'[^\w\s]','',industry_title)
    edd_df = edd_df.loc[(edd_df['Sub_4'] == industry_title) | 
                        (edd_df['Sub_3'] == industry_title) | 
                        (edd_df['Sub_2'] == industry_title) | 
                        (edd_df['Sub_1'] == industry_title) | 
                        (edd_df['Main_Code'] == industry_title)].copy()
    edd_df['year_avg_employment_ct'] = edd_df['Current Employment'].mean()
    edd_df = edd_df.drop_duplicates(subset='Main_EDD').reset_index().iloc[:,1:]
    # load naics crosswalk
    cwd = os.getcwd()
    naics = pd.read_csv(f'{cwd}/data/naics_parsed_crosswalk.csv').drop_duplicates(subset='INDNAICS').reset_index().iloc[:,1:]
    naics['Industry Title'] = normalize_titles(naics['Industry Title'])
    # merge naics with edd
    edd_df_ = pd.merge(edd_df, naics, on='Sub_4_Code')
    if len(edd_df_) == 0:
        edd_df_ = pd.merge(edd_df, naics, on='Sub_3_Code')
        if len(edd_df_) == 0:
            edd_df_ = pd.merge(edd_df, naics, on='Sub_2_Code')
            if len(edd_df_) == 0:
                edd_df_ = pd.merge(edd_df, naics, on='Sub_1_Code')
                if len(edd_df_) == 0:
                    edd_df_ = pd.merge(edd_df, naics, on='Main_Code')
                    if len(edd_df_) == 0:
                        return "No parsed code of input industry found within input county"
    edd_df = edd_df_.rename(columns = {'Industry Title_x':'EDD Industry',
                                 'Industry Title_y': 'IPUMS Industry'})
    edd_df = edd_df[['EDD Industry', 'Area Name', 'IPUMS Industry', 'INDNAICS', 'year_avg_employment_ct']]
    edd_df = edd_df.drop_duplicates(subset='year_avg_employment_ct').reset_index().iloc[:,1:]
    employment_count = int(edd_df['year_avg_employment_ct'].values[0])
    naics_code = edd_df['INDNAICS'].values[0]
    output, hw_perc = ca_ipums_filter(ipums_df_hw, county, naics_code, sample_size)
    hw_count = int((employment_count * hw_perc) / 100)
    output += f", High wage count: {hw_count}"
    return output, hw_count

## Test example:

In [206]:
edd_to_hw(edd, ca_ipums_hw, 'San Francisco', 'software publishers', 30)

'No parsed code of input industry found within input county'

In [193]:
edd_to_naics(edd, 'Los Angeles', 'nondurable goods')

Unnamed: 0,EDD Industry,Area Name,IPUMS Industry,INDNAICS,year_avg_employment_ct
0,bakeries and tortilla manufacturing,Los Angeles,retail bakeries,311811,10012.037037
1,bakeries and tortilla manufacturing,Los Angeles,bakeries and tortilla manufacturing except ret...,3118z,10012.037037


In [207]:
edd.shape

(3960, 17)

In [186]:
cwd = os.getcwd()
naics = pd.read_csv(f'{cwd}/data/naics_parsed_crosswalk.csv').drop_duplicates(subset='INDNAICS').reset_index().iloc[:,1:]

In [187]:
naics

Unnamed: 0,Industry Title,INDNAICS,Main_Code,Sub_1_Code,Sub_2_Code,Sub_3_Code,Sub_4_Code
0,Crop production,111,111,111,111,111,111
1,Support activities for agriculture and forestry,115,111,111,111,111,111
2,Animal production and aquaculture,112,111,111,111,111,111
3,"Fishing, hunting, and trapping",114,111,111,111,111,111
4,"Forestry, except logging",113m,100,113,113,113,113
...,...,...,...,...,...,...,...
264,Other general government and support,92119,900,900,900,900,900
265,Executive offices and legislative bodies,9211mp,900,900,900,900,900
266,Public finance activities,92113,900,900,900,900,900
267,"Justice, public order, and safety activities",92mp,900,900,900,900,900
