In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import warnings
from jqi_functions import *
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format='retina'

## Loading Data

In [2]:
ipums = pd.read_csv('data/IPUMS_2019.csv')

In [3]:
# Load the IPUMS industry crosswalk and trim it with a single positional slice:
# skip the first two rows and keep only the columns from position 10 onward.
ipums_titles = pd.read_csv(
    'data/ind_indnaics_crosswalk_2000_onward_without_code_descriptions.csv'
).iloc[2:, 10:]

In [4]:
# NOTE(review): `ca_ipums` is reassigned from `ipums` in the "Cleaning Data" section before
# anything reads this frame, so this load looks redundant — confirm before removing.
ca_ipums = pd.read_csv('data/ca_ipums_merged.csv') # previously merged, manually parsed/encoded

In [5]:
county_info = pd.read_csv('data/county_to_regions_key - Sheet1.csv')

In [6]:
# United Way real cost measure table; only the first 11 rows and the first two
# columns are kept.
cost_of_living = pd.read_csv('data/united-way-reg-col.csv').iloc[:11, :2]

In [7]:
# Keep one row per INDNAICS code, then rebuild a clean positional index
# (reset_index() inserts the old index as the first column; iloc[:, 1:] discards it).
naics_parsed_crosswalk = pd.read_csv('data/naics_parsed_crosswalk.csv')
naics_parsed_crosswalk = naics_parsed_crosswalk.drop_duplicates(subset='INDNAICS')
naics_parsed_crosswalk = naics_parsed_crosswalk.reset_index().iloc[:, 1:]

## Cleaning Data

In [8]:
# Narrow the extract to the five columns referenced later in the notebook.
ipums = ipums.loc[:, ['STATEFIP', 'COUNTYFIP', 'INDNAICS', 'PERWT', 'INCWAGE']]

In [9]:
# Keep only the rows with STATEFIP == 6. reset_index() is called without drop, so the
# original row labels survive as an 'index' column in the result.
ca_ipums = ipums[ipums['STATEFIP'] == 6].copy().reset_index()

In [10]:
# normalize_titles is not defined in this notebook (presumably it comes from the
# `jqi_functions` star import — confirm). This column is the left merge key used below.
ca_ipums['INDNAICS'] = normalize_titles(ca_ipums['INDNAICS'])

In [11]:
# Apply the same normalize_titles helper to the crosswalk's code column, which is the
# right-hand key of the merge below (helper is defined outside this notebook — verify its behavior).
ipums_titles['2018 Onward ACS/PRCS INDNAICS CODE'] = normalize_titles(ipums_titles['2018 Onward ACS/PRCS INDNAICS CODE'])

## Merging IPUMS Data

In [12]:
# Inner-join the California records to the crosswalk on the industry code.
ipums_merged = ca_ipums.merge(
    ipums_titles,
    left_on='INDNAICS',
    right_on='2018 Onward ACS/PRCS INDNAICS CODE',
)

In [13]:
ipums_merged = pd.merge(ipums_merged, naics_parsed_crosswalk, on='INDNAICS')

In [14]:
show_null(ipums_merged)

TOTAL ROWS: 224526
index: 0 null values, 0.00%
STATEFIP: 0 null values, 0.00%
COUNTYFIP: 0 null values, 0.00%
INDNAICS: 0 null values, 0.00%
PERWT: 0 null values, 0.00%
INCWAGE: 0 null values, 0.00%
2018 Onward ACS/PRCS INDNAICS CODE: 0 null values, 0.00%
Industry Title: 0 null values, 0.00%
Parsed_Code: 0 null values, 0.00%


## Merging on counties

In [15]:
# Left joins keep every IPUMS row even when no county / region match exists;
# unmatched rows get NaN in the joined columns.
ipums_w_counties = ipums_merged.merge(county_info, how='left', on='COUNTYFIP')
ipums_w_col = ipums_w_counties.merge(
    cost_of_living,
    how='left',
    left_on='CDI Regions',
    right_on='Regions',
)

In [16]:
# Select the analysis columns and drop rows that did not pick up a cost-of-living
# value in the left joins above.
ipums_regions = ipums_w_col[[
    'INDNAICS',
    'Parsed_Code',
    'INCWAGE',
    'Industry Title',
    'County',
    'CDI Regions',
    'PERWT',
    'Cost of Living',
]].dropna(subset=['Cost of Living'])

## Cost of Living Threshold - State Level

In [17]:
ca_cost_of_living = 32466 # from united way real cost measure - weighted average

In [18]:
# 1/0 flag: wage income strictly above the statewide cost-of-living threshold.
ipums_regions['Above CA Threshold'] = (ipums_regions['INCWAGE'] > ca_cost_of_living).astype(int)

# Sum of PERWT per industry, broadcast back onto every row of that industry.
ipums_regions['wt_ind_counts'] = ipums_regions.groupby('Industry Title')['PERWT'].transform('sum')

# PERWT counted only for rows above the threshold, then summed per industry.
ipums_regions['wt_CA_above_thresh'] = ipums_regions['Above CA Threshold'].mul(ipums_regions['PERWT'])
ipums_regions['wt_CA_high_wage_count'] = (
    ipums_regions.groupby('Industry Title')['wt_CA_above_thresh'].transform('sum')
)

# Weighted share (in percent) of each industry that is above the threshold.
ipums_regions['wt_CA_high_wage_perc'] = (
    ipums_regions['wt_CA_high_wage_count'].div(ipums_regions['wt_ind_counts']).mul(100)
)

# Raw (unweighted) number of rows per industry.
ipums_regions['unweighted_ind_counts'] = (
    ipums_regions['Industry Title'].groupby(ipums_regions['Industry Title']).transform('count')
)

In [19]:
# Collapse to one row per industry (the per-industry columns are constant within an
# industry), rank by weighted high-wage share, and renumber the rows from 0.
high_wage_ca_wt = (
    ipums_regions
    .drop_duplicates(subset='Industry Title')
    .loc[:, ['Industry Title', 'INDNAICS', 'Parsed_Code', 'unweighted_ind_counts',
             'wt_ind_counts', 'wt_CA_high_wage_count', 'wt_CA_high_wage_perc']]
    .sort_values(by='wt_CA_high_wage_perc', ascending=False)
    .reset_index()
    .iloc[:, 1:]
)

In [20]:
high_wage_ca_wt.head(10)

Unnamed: 0,Industry Title,INDNAICS,Parsed_Code,unweighted_ind_counts,wt_ind_counts,wt_CA_high_wage_count,wt_CA_high_wage_perc
0,Coal mining,2121,212,3,713.0,713.0,100.0
1,Tobacco,3122,312,1,55.0,55.0,100.0
2,Pipeline transportation,486,400,16,1672.0,1672.0,100.0
3,"Engine, turbine, and power transmission equipm...",3336,333,52,6383.0,5722.0,89.644368
4,Software publishers,5112,511,403,40200.0,35362.0,87.965174
5,Sewage treatment facilities,22132,221,100,9362.0,8090.0,86.41316
6,"Electric and gas, and other combinations",221mp,221,340,29657.0,25471.0,85.885288
7,Internet publishing and broadcasting and web s...,51913,519,890,83933.0,70992.0,84.58175
8,Not specified utilities,22s,220,40,3206.0,2711.0,84.5602
9,Nonmetallic mineral mining and quarrying,2123,212,58,4867.0,4038.0,82.96692


## Cost of Living Threshold - Regional Level

In [21]:
# Distinct region labels in order of first appearance in ipums_regions (unique() does not sort).
# NOTE(review): later cells index this array by fixed position (regions[0] .. regions[9]) and
# bind the results to region-named variables, so which region each variable actually holds
# depends on the row order of the data — confirm the order matches the variable names.
regions = ipums_regions['CDI Regions'].unique()

In [22]:
# One independent copy of ipums_regions per region, taken by position in `regions`.
# NOTE(review): the variable names assume a particular ordering of `regions` — confirm.
(
    df_LA, df_Bay_Area, df_Cent_Coast, df_Cent_Valley, df_IE,
    df_OC, df_Sac, df_SD, df_Shasta, df_Redwood_Coast,
) = (
    ipums_regions.loc[ipums_regions['CDI Regions'] == regions[slot]].copy()
    for slot in range(10)
)

In [23]:
region_dfs = [df_LA, df_Bay_Area, df_Cent_Coast, df_Cent_Valley, df_IE, df_OC,
              df_Sac, df_SD, df_Shasta, df_Redwood_Coast]

In [24]:
def add_to_region_df(df):
    """Add regional high-wage columns to ``df`` in place and return the same frame.

    Reads the columns 'INCWAGE', 'Cost of Living', 'PERWT' and 'Industry Title'.
    Adds, in this order:

    * ``above_region_thresh``    -- 1 if INCWAGE > Cost of Living on that row, else 0
    * ``wt_reg_ind_counts``      -- sum of PERWT over the row's industry
    * ``wt_reg_above_thresh``    -- PERWT on rows above the threshold, 0 otherwise
    * ``wt_reg_high_wage_count`` -- sum of ``wt_reg_above_thresh`` over the industry
    * ``wt_reg_high_wage_perc``  -- high-wage count / industry count, times 100

    The input frame is mutated; callers may rely on that and ignore the return value.
    """
    df['above_region_thresh'] = (df['INCWAGE'] > df['Cost of Living']).astype(int)
    df['wt_reg_ind_counts'] = df.groupby('Industry Title')['PERWT'].transform('sum')

    df['wt_reg_above_thresh'] = df['above_region_thresh'].mul(df['PERWT'])
    df['wt_reg_high_wage_count'] = (
        df.groupby('Industry Title')['wt_reg_above_thresh'].transform('sum')
    )

    df['wt_reg_high_wage_perc'] = df['wt_reg_high_wage_count'].div(df['wt_reg_ind_counts']).mul(100)
    return df

In [25]:
# add_to_region_df adds its columns to each frame in place (and returns that same
# frame), so the return value does not need to be captured.
for df in region_dfs:
    add_to_region_df(df)

In [26]:
def get_region_high_wage(df, new_df=None):
    """Return a one-row-per-industry summary of a regional frame, best share first.

    Parameters
    ----------
    df : pandas.DataFrame
        Regional frame that already carries the ``wt_reg_*`` columns and
        ``unweighted_ind_counts``. It is not modified.
    new_df : ignored, optional
        Never read. Kept (now optional) only so existing two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The first row seen for each 'Industry Title', restricted to the summary
        columns, sorted by ``wt_reg_high_wage_perc`` descending, with a fresh
        0..n-1 index.

    Notes
    -----
    A later cell of this notebook defines another function with this same name and a
    different meaning for the second argument; whichever ``def`` ran last wins.
    """
    summary_cols = [
        'CDI Regions', 'Industry Title', 'INDNAICS', 'Parsed_Code', 'Cost of Living',
        'unweighted_ind_counts', 'wt_reg_ind_counts', 'wt_reg_high_wage_count',
        'wt_reg_high_wage_perc',
    ]
    return (
        df.drop_duplicates(subset='Industry Title')[summary_cols]
        .sort_values(by='wt_reg_high_wage_perc', ascending=False)
        # drop=True discards the old row labels directly instead of materialising
        # them as a column and slicing that column off again.
        .reset_index(drop=True)
    )

In [27]:
# initialize new dfs
df_LA_high_wage = df_LA.copy()
df_Bay_Area_high_wage = df_Bay_Area.copy()
df_Cent_Coast_high_wage = df_Cent_Coast.copy()
df_Cent_Valley_high_wage = df_Cent_Valley.copy()
df_IE_high_wage = df_IE.copy()
df_OC_high_wage = df_OC.copy()
df_Sac_high_wage = df_Sac.copy()
df_SD_high_wage = df_SD.copy()
df_Shasta_high_wage = df_Shasta.copy()
df_Redwood_Coast_high_wage = df_Redwood_Coast.copy()

In [28]:
region_high_wage_dfs = [df_LA_high_wage, df_Bay_Area_high_wage, df_Cent_Coast_high_wage, 
                        df_Cent_Valley_high_wage, df_IE_high_wage, df_OC_high_wage,
              df_Sac_high_wage, df_SD_high_wage, df_Shasta_high_wage, df_Redwood_Coast_high_wage]

In [29]:
# Replace each placeholder with the per-industry summary of the matching regional frame.
# get_region_high_wage never reads its second argument (it rebinds the name on its first
# line), so pass None instead of deep-copying every regional frame just to discard it.
for i in range(len(region_dfs)):
    region_high_wage_dfs[i] = get_region_high_wage(region_dfs[i], None)

In [30]:
regions_high_wage_merged = pd.concat(region_high_wage_dfs)

In [31]:
# Index the combined table by (region, industry). Each regional summary holds one row per
# industry, so the pair is expected to be unique; verify_integrity=True raises if it is not,
# instead of relying on a per-group Python lambda (`groupby(...).agg(lambda x: x)`) that is
# slow and only works when every group has exactly one row.
# sort_index() first reproduces the key ordering a groupby would give, so ties on the
# percentage keep an alphabetical industry order after the final sort.
regions_high_wage_merged_ = (
    regions_high_wage_merged
    .set_index(['CDI Regions', 'Industry Title'], verify_integrity=True)
    .sort_index()
    .sort_values(['CDI Regions', 'wt_reg_high_wage_perc'], ascending=[True, False])
)

In [32]:
regions_high_wage_merged_.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,INDNAICS,Parsed_Code,Cost of Living,unweighted_ind_counts,wt_reg_ind_counts,wt_reg_high_wage_count,wt_reg_high_wage_perc
CDI Regions,Industry Title,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bay Area,"Engine, turbine, and power transmission equipment manufacturing",3336,333,40083.0,52,522.0,522.0,100.0
Bay Area,"Knitting fabric mills, and apparel knitting mills",31m,314,40083.0,13,197.0,197.0,100.0
Bay Area,Metal ore mining,2122,212,40083.0,11,82.0,82.0,100.0
Bay Area,Miscellaneous paper and pulp products,3222m,322,40083.0,38,206.0,206.0,100.0
Bay Area,Miscellaneous petroleum and coal products,3241m,324,40083.0,21,1048.0,1048.0,100.0
Bay Area,"Nonferrous metal, except aluminum, production and processing",3314,331,40083.0,25,20.0,20.0,100.0
Bay Area,Not specified metal industries,33ms,331,40083.0,22,425.0,425.0,100.0
Bay Area,Railroad rolling stock manufacturing,3365,336,40083.0,14,53.0,53.0,100.0
Bay Area,Tires,32621,326,40083.0,23,67.0,67.0,100.0
Bay Area,"Veneer, plywood, and engineered wood products",3212,321,40083.0,12,66.0,66.0,100.0


## EDD Employment Counts

### Loading manually parsed data

In [33]:
edd_merged = pd.read_csv('data/edd_merged.csv')
ca_ipums_merged = pd.read_csv('data/ca_ipums_merged.csv')

In [34]:
ca_ipums_merged['Parsed_Code'] = ca_ipums_merged['Parsed_Code'].astype(int)

In [35]:
edd_ipums_merged = pd.merge(ca_ipums_merged, edd_merged, on='Parsed_Code')

In [36]:
# NOTE(review): no visible cell writes 'data/high_wage_ca_wt.fea'; it is presumably a saved
# copy of the `high_wage_ca_wt` table built earlier — confirm the file is current before
# relying on it, or reuse the in-memory frame.
thresholds = pd.read_feather('data/high_wage_ca_wt.fea')

In [37]:
# Attach the statewide threshold table by industry code, then keep the first row
# encountered for each industry title.
thresholds_edd = (
    edd_ipums_merged
    .merge(thresholds, on='INDNAICS')
    .drop_duplicates(subset='Industry Title')
)

### High wage employment per industry in California

#### Employment counts as of 12/01/2019; seasonally adjusted counts are excluded

In [38]:
thresholds_edd = thresholds_edd.loc[:, [
    'Industry Title', 'INDNAICS', 'Parsed_Code', 'COUNTYFIP', 'unweighted_ind_counts',
    'wt_ind_counts', 'wt_CA_high_wage_count', 'wt_CA_high_wage_perc', 'Current Employment',
]]

# High-wage head count = weighted high-wage share (as a fraction) x current employment,
# rounded to whole jobs.
thresholds_edd['High Wage Employment'] = (
    thresholds_edd['wt_CA_high_wage_perc']
    .div(100)
    .mul(thresholds_edd['Current Employment'])
    .round()
)

# Rank by share and renumber rows from 0 (the old index column is sliced away).
thresholds_edd = (
    thresholds_edd
    .sort_values(by='wt_CA_high_wage_perc', ascending=False)
    .reset_index()
    .iloc[:, 1:]
)

In [39]:
thresholds_edd.head(10)

Unnamed: 0,Industry Title,INDNAICS,Parsed_Code,COUNTYFIP,unweighted_ind_counts,wt_ind_counts,wt_CA_high_wage_count,wt_CA_high_wage_perc,Current Employment,High Wage Employment
0,Tobacco,3122,312,0,1,55.0,55.0,100.0,64000,64000.0
1,Coal mining,2121,212,19,3,713.0,713.0,100.0,5900,5900.0
2,Pipeline transportation,486,400,67,16,1672.0,1672.0,100.0,3939400,3939400.0
3,"Engine, turbine, and power transmission equipm...",3336,333,73,52,6383.0,5722.0,89.644368,137400,123171.0
4,Software publishers,5112,511,59,403,40200.0,35362.0,87.965174,251800,221496.0
5,Sewage treatment facilities,22132,221,41,100,9362.0,8090.0,86.41316,52200,45108.0
6,"Electric and gas, and other combinations",221mp,221,37,340,29657.0,25471.0,85.885288,52200,44832.0
7,Internet publishing and broadcasting and web s...,51913,519,37,890,83933.0,70992.0,84.58175,125100,105812.0
8,Not specified utilities,22s,220,37,40,3206.0,2711.0,84.5602,57200,48368.0
9,Nonmetallic mineral mining and quarrying,2123,212,99,58,4867.0,4038.0,82.96692,5900,4895.0


### High wage employment per industry per region

### Loading data by county

In [40]:
full_edd = pd.read_csv('data/Current_EDD_1121.csv')
edd_crosswalk = pd.read_excel('data/Industry_Title_Crosswalk.xlsx')

# County-level rows for 12/01/2019 that are not seasonally adjusted.
county_edd = full_edd.loc[
    (full_edd['Area Type'] == 'County')
    & (full_edd['Date'] == '12/01/2019')
    & (full_edd['Seasonally Adjusted'] == 'N')
].copy()

# Strip the ' County' suffix from the area names, which are the join key against
# county_info['County'] in a later cell.
county_edd['Area Name'] = county_edd['Area Name'].str.replace(' County', '')

# reset_index() without drop keeps the original row labels as an 'index' column.
county_edd = county_edd.reset_index()

In [41]:
# Inner-join county EDD rows to the title crosswalk on the EDD industry title.
county_edd_merged = county_edd.merge(
    edd_crosswalk,
    left_on='Industry Title',
    right_on='EDD Industry Title',
)

In [42]:
# Attach region information by matching the cleaned area name to the county name.
regions_edd = county_edd_merged.merge(
    county_info,
    left_on='Area Name',
    right_on='County',
)

In [43]:
regions_edd = regions_edd[['Industry Title', 'Current Employment', 'Parsed_Code', 'COUNTYFIP', 'County', 'CDI Regions']]

In [44]:
# One independent copy of regions_edd per region, using the same positions in `regions`
# as the IPUMS split so that matching variables refer to the same region.
(
    edd_LA, edd_Bay_Area, edd_Cent_Coast, edd_Cent_Valley, edd_IE,
    edd_OC, edd_Sac, edd_SD, edd_Shasta, edd_Redwood_Coast,
) = (
    regions_edd.loc[regions_edd['CDI Regions'] == regions[slot]].copy()
    for slot in range(10)
)

In [45]:
def get_region_edd(df):
    """Collapse a region's EDD rows to one row per industry with summed employment.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single region with at least the columns 'CDI Regions',
        'Industry Title', 'Current Employment' and 'Parsed_Code'. The frame is
        left untouched (the totals are written to a copy, not to the caller's frame).

    Returns
    -------
    pandas.DataFrame
        Columns ['CDI Regions', 'Industry Title', 'Current Employment', 'Parsed_Code'],
        one row per 'Industry Title' (first occurrence kept), where
        'Current Employment' is the sum over all input rows of that industry.
        The index is a fresh 0..n-1 range.
    """
    # Per-industry total, aligned to the original rows.
    industry_totals = df.groupby('Industry Title')['Current Employment'].transform('sum')

    summary = df[['CDI Regions', 'Industry Title', 'Current Employment', 'Parsed_Code']].copy()
    summary['Current Employment'] = industry_totals

    return summary.drop_duplicates(subset='Industry Title').reset_index(drop=True)

In [46]:
# Replace every regional EDD frame with its one-row-per-industry summary.
(
    edd_LA, edd_Bay_Area, edd_Cent_Coast, edd_Cent_Valley, edd_IE,
    edd_OC, edd_Sac, edd_SD, edd_Shasta, edd_Redwood_Coast,
) = [
    get_region_edd(region_edd)
    for region_edd in (
        edd_LA, edd_Bay_Area, edd_Cent_Coast, edd_Cent_Valley, edd_IE,
        edd_OC, edd_Sac, edd_SD, edd_Shasta, edd_Redwood_Coast,
    )
]

In [47]:
# Split the combined regional summary back into one frame per region, by the same
# positions in `regions` that the EDD split uses.
(
    high_wage_LA, high_wage_Bay_Area, high_wage_Cent_Coast, high_wage_Cent_Valley,
    high_wage_IE, high_wage_OC, high_wage_Sac, high_wage_SD, high_wage_Shasta,
    high_wage_Redwood_Coast,
) = (
    regions_high_wage_merged.loc[regions_high_wage_merged['CDI Regions'] == regions[slot]].copy()
    for slot in range(10)
)

In [48]:
def get_region_high_wage(df, edd_df):
    """Estimate high-wage employment for one region by joining IPUMS shares to EDD counts.

    Note: this definition replaces an earlier function of the same name in this
    notebook (which took ``(df, new_df)`` and built the per-industry summary).

    Parameters
    ----------
    df : pandas.DataFrame
        Regional per-industry summary with 'Parsed_Code', 'wt_reg_high_wage_perc' and
        the other columns selected at the end. It is not modified; the helper columns
        are added to an internal copy.
    edd_df : pandas.DataFrame
        Regional EDD summary with 'Parsed_Code' and 'Current Employment'. It must share
        the 'CDI Regions' and 'Industry Title' column names with ``df`` so that the
        merge produces the ``_x`` suffixed columns selected below.

    Returns
    -------
    pandas.DataFrame
        One row per (df row, edd_df row) pair sharing a Parsed_Code, with
        ``final_high_wage_perc`` (unweighted mean of ``wt_reg_high_wage_perc`` over the
        df rows with that Parsed_Code, computed as sum / count) and
        ``High Wage Count`` = Current Employment * final_high_wage_perc / 100.

    NOTE(review): the merge is many-to-many on Parsed_Code. When several rows on either
    side share a code, rows repeat and the same employment figure is attributed to each
    of them — confirm this is intended before summing 'High Wage Count'.
    """
    work = df.copy()
    code = work['Parsed_Code']

    # Unweighted average of the industry shares within each Parsed_Code.
    work['parsed_ind_counts'] = code.groupby(code).transform('count')
    work['parsed_perc_sum'] = work['wt_reg_high_wage_perc'].groupby(code).transform('sum')
    work['final_high_wage_perc'] = work['parsed_perc_sum'] / work['parsed_ind_counts']

    work = work.sort_values(by='final_high_wage_perc', ascending=False)
    merged = pd.merge(work, edd_df, on='Parsed_Code')
    merged['High Wage Count'] = merged['Current Employment'] * (merged['final_high_wage_perc'] / 100)

    output_cols = [
        'CDI Regions_x', 'Industry Title_x', 'Parsed_Code', 'INDNAICS', 'Cost of Living',
        'unweighted_ind_counts', 'wt_reg_ind_counts', 'wt_reg_high_wage_perc',
        'final_high_wage_perc', 'Current Employment', 'High Wage Count',
    ]
    return merged[output_cols].reset_index(drop=True)

In [49]:
# Pair each region's high-wage summary with that region's EDD summary and replace the
# summary with the joined result.
(
    high_wage_LA, high_wage_Bay_Area, high_wage_Cent_Coast, high_wage_Cent_Valley,
    high_wage_IE, high_wage_OC, high_wage_Sac, high_wage_SD, high_wage_Shasta,
    high_wage_Redwood_Coast,
) = [
    get_region_high_wage(wage_frame, edd_frame)
    for wage_frame, edd_frame in (
        (high_wage_LA, edd_LA),
        (high_wage_Bay_Area, edd_Bay_Area),
        (high_wage_Cent_Coast, edd_Cent_Coast),
        (high_wage_Cent_Valley, edd_Cent_Valley),
        (high_wage_IE, edd_IE),
        (high_wage_OC, edd_OC),
        (high_wage_Sac, edd_Sac),
        (high_wage_SD, edd_SD),
        (high_wage_Shasta, edd_Shasta),
        (high_wage_Redwood_Coast, edd_Redwood_Coast),
    )
]

In [50]:
region_high_wage_dfs = [high_wage_LA, high_wage_Bay_Area, high_wage_Cent_Coast, 
                        high_wage_Cent_Valley, high_wage_IE, high_wage_OC,
              high_wage_Sac, high_wage_SD, high_wage_Shasta, high_wage_Redwood_Coast]

In [51]:
regions_high_wage_concat = pd.concat(region_high_wage_dfs)

In [52]:
regions_high_wage_concat = regions_high_wage_concat.sort_values(['CDI Regions_x', 'final_high_wage_perc'], ascending=(True, False))

In [53]:
regions_high_wage_concat.head(10)

Unnamed: 0,CDI Regions_x,Industry Title_x,Parsed_Code,INDNAICS,Cost of Living,unweighted_ind_counts,wt_reg_ind_counts,wt_reg_high_wage_perc,final_high_wage_perc,Current Employment,High Wage Count
0,Bay Area,Management of companies and enterprises,550,55,40083.0,320,8825.0,82.787535,82.787535,69700,57702.912181
1,Bay Area,"Justice, public order, and safety activities",922,92mp,40083.0,3485,55406.0,73.353066,73.353066,8200,6014.951449
2,Bay Area,Public finance activities,921,92113,40083.0,454,6063.0,84.265215,72.51549,3200,2320.495693
3,Bay Area,Executive offices and legislative bodies,921,9211mp,40083.0,1764,38797.0,64.247236,72.51549,3200,2320.495693
4,Bay Area,Other general government and support,921,92119,40083.0,248,4027.0,69.03402,72.51549,3200,2320.495693
5,Bay Area,Administration of environmental quality and ho...,920,92m1,40083.0,392,6915.0,70.802603,67.585444,486100,328532.843177
6,Bay Area,Administration of environmental quality and ho...,920,92m1,40083.0,392,6915.0,70.802603,67.585444,101600,68666.811082
7,Bay Area,Administration of human resource programs,920,923,40083.0,1939,37205.0,52.697218,67.585444,486100,328532.843177
8,Bay Area,Administration of human resource programs,920,923,40083.0,1939,37205.0,52.697218,67.585444,101600,68666.811082
9,Bay Area,Administration of economic programs and space ...,920,92m2,40083.0,657,14284.0,79.256511,67.585444,486100,328532.843177
