In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including pandas warnings raised by later cells.
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format='retina'

### Loading & Exploring Data

In [2]:
# Load the IPUMS 2019 CSV from a path relative to the notebook's working directory.
ipums = pd.read_csv('data/IPUMS_2019.csv')

In [3]:
# Show which columns the raw file provides.
ipums.columns

Index(['YEAR', 'SAMPLE', 'SERIAL', 'CBSERIAL', 'HHWT', 'CLUSTER', 'CPI99',
       'STATEFIP', 'COUNTYFIP', 'CITY', 'STRATA', 'GQ', 'OWNERSHP',
       'OWNERSHPD', 'OWNCOST', 'RENTGRS', 'HHINCOME', 'CILAPTOP', 'CIHISPEED',
       'PERNUM', 'PERWT', 'FAMSIZE', 'AGE', 'RACE', 'RACED', 'HISPAN',
       'HISPAND', 'EDUC', 'EDUCD', 'OCCSOC', 'INDNAICS', 'INCTOT', 'FTOTINC',
       'INCWAGE', 'MIGCOUNTY1', 'MOVEDIN', 'TRANTIME'],
      dtype='object')

In [4]:
# Load the IND/INDNAICS crosswalk file (used later to map INDNAICS codes to industry titles).
ipums_titles = pd.read_csv('data/ind_indnaics_crosswalk_2000_onward_without_code_descriptions.csv')

In [5]:
# Drop the first two rows of the crosswalk (presumably preamble/header rows — TODO confirm).
# NOTE(review): reassigns the same name, so re-running this cell drops two more rows.
ipums_titles = ipums_titles.iloc[2:]

In [6]:
# Keep only the columns from position 10 onward (the code and title columns used below).
# NOTE(review): positional slice on the same name — re-running this cell removes the remaining columns.
ipums_titles = ipums_titles.iloc[:,10:]

In [7]:
# Load the county-to-region key (joined on COUNTYFIP further down).
county_info = pd.read_csv('data/county_to_regions_key - Sheet1.csv')

In [8]:
# Load the regional cost-of-living table.
cost_of_living = pd.read_csv('data/regional-cost-of-living.csv')

In [9]:
# Keep the first 11 rows and the first two columns ('Regions' and 'Cost of Living' in the output below).
cost_of_living = cost_of_living.iloc[0:11, :2]

In [10]:
# Display the trimmed cost-of-living table.
cost_of_living

Unnamed: 0,Regions,Cost of Living
0,Bay Area,48273.0
1,Central Coast,41093.0
2,Central Valley,32388.0
3,Inland Empire,34188.0
4,Los Angeles,40244.0
5,Orange,46673.0
6,Redwood Coast,30984.0
7,Sacramento,35358.0
8,San Diego-Imperial,30389.0
9,Shasta / Cascades,29418.0


### Cleaning

In [11]:
# Keep only the state/county identifiers, industry code, person weight and wage columns.
ipums = ipums[['STATEFIP', 'COUNTYFIP', 'INDNAICS', 'PERWT', 'INCWAGE']]

In [12]:
# Restrict to rows with STATEFIP == 6 and take an independent copy so that
# later column assignments do not write into `ipums`.
ca_ipums = ipums[ipums['STATEFIP'].eq(6)].copy()

In [13]:
# Renumber rows from 0; the previous row labels are kept as a new 'index' column.
# NOTE(review): not idempotent — re-running this cell inserts a further index column.
ca_ipums = ca_ipums.reset_index()

In [14]:
# Number of person records kept after the state filter.
len(ca_ipums)

380091

In [15]:
def normalize_titles(col):
    """Return a normalized string version of a pandas Series.

    Every value is cast to ``str``, stripped of surrounding whitespace,
    lower-cased, has ``&`` replaced by ``and``, and finally has every
    character listed in ``string.punctuation`` removed.

    Parameters
    ----------
    col : pandas.Series
        Values to normalize (any dtype; they are converted to strings).

    Returns
    -------
    pandas.Series
        The normalized strings, aligned with the input's index.
    """
    # Translation table that deletes all ASCII punctuation characters.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = col.astype(str).str.strip().str.lower().str.replace('&', 'and')
    return cleaned.str.translate(drop_punctuation)

In [16]:
# Normalize the person-level industry codes so they can be matched to the crosswalk.
ca_ipums['INDNAICS'] = ca_ipums['INDNAICS'].pipe(normalize_titles)

In [17]:
# Apply the same normalization to the crosswalk's code column (the merge key on the right side).
ipums_titles['2018 Onward ACS/PRCS INDNAICS CODE'] = (
    ipums_titles['2018 Onward ACS/PRCS INDNAICS CODE'].pipe(normalize_titles)
)

### Merging on NAICS Codes

In [18]:
# Columns on the left side of the upcoming merge.
ca_ipums.columns.values

array(['index', 'STATEFIP', 'COUNTYFIP', 'INDNAICS', 'PERWT', 'INCWAGE'],
      dtype=object)

In [19]:
# Columns on the right side of the upcoming merge.
ipums_titles.columns.values

array(['2018 Onward ACS/PRCS INDNAICS CODE', 'Industry Title'],
      dtype=object)

In [20]:
# Attach the crosswalk's industry title to each person record by matching the
# normalized INDNAICS code. This is an inner join, so person records whose code
# has no match in the crosswalk are dropped.
ipums_merged = ca_ipums.merge(
    ipums_titles,
    how='inner',
    left_on='INDNAICS',
    right_on='2018 Onward ACS/PRCS INDNAICS CODE',
)

In [21]:
# Compare row counts before and after the merge to see how many records the inner join kept.
print(len(ca_ipums), len(ipums_titles), ipums_merged.shape)

380091 402 (224526, 8)


In [22]:
# Report, for every column of the merged frame, how many values are missing
# and what share of all rows that represents.
row_total = len(ipums_merged)
print(f'TOTAL ROWS: {row_total}')
print('\n')
missing_by_column = ipums_merged.isna().sum()
for col_name, n_missing in missing_by_column.items():
    share = (n_missing / row_total) * 100
    print(f'{col_name}: {n_missing} null values, {share:.2f}%')

TOTAL ROWS: 224526


index: 0 null values, 0.00%
STATEFIP: 0 null values, 0.00%
COUNTYFIP: 0 null values, 0.00%
INDNAICS: 0 null values, 0.00%
PERWT: 0 null values, 0.00%
INCWAGE: 0 null values, 0.00%
2018 Onward ACS/PRCS INDNAICS CODE: 0 null values, 0.00%
Industry Title: 0 null values, 0.00%


In [23]:
# Number of distinct values per column (a missing value counts as one distinct value).
for col_name in ipums_merged.columns:
    n_distinct = ipums_merged[col_name].nunique(dropna=False)
    print(f'{col_name}: {n_distinct} unique values')

index: 224526 unique values
STATEFIP: 1 unique values
COUNTYFIP: 35 unique values
INDNAICS: 269 unique values
PERWT: 804 unique values
INCWAGE: 889 unique values
2018 Onward ACS/PRCS INDNAICS CODE: 269 unique values
Industry Title: 269 unique values


### Merging on counties

In [24]:
# Left join keeps every person record and adds the county/region attributes
# wherever COUNTYFIP matches a row in the county key.
ipums_w_counties = ipums_merged.merge(county_info, how='left', on='COUNTYFIP')

In [25]:
# Row count should be unchanged by the left join; only columns are added.
print(len(ipums_merged), len(county_info), ipums_w_counties.shape)

224526 58 (224526, 20)


In [26]:
# Left join the regional cost of living onto each record via its CDI region name.
ipums_w_col = ipums_w_counties.merge(
    cost_of_living,
    how='left',
    left_on='CDI Regions',
    right_on='Regions',
)

In [27]:
# Shapes before/after the cost-of-living join: rows unchanged, two columns added.
print(ipums_w_counties.shape, cost_of_living.shape, ipums_w_col.shape)

(224526, 20) (11, 2) (224526, 22)


In [28]:
# Preview the fully merged frame.
ipums_w_col.head()

Unnamed: 0,index,STATEFIP,COUNTYFIP,INDNAICS,PERWT,INCWAGE,2018 Onward ACS/PRCS INDNAICS CODE,Industry Title,FIPS,County,...,EDD County,Census County,Population - Households,Rural/Urban,Redstone Regions,WF Regions,CDI Regions,Population,Regions,Cost of Living
0,156866,6,37,4853,21.0,23100,4853,Taxi and limousine service,6037.0,Los Angeles,...,Los Angeles County,"Los Angeles County, California",2207265,Urban,Los Angeles,Greater Los Angeles,Los Angeles,10081570,Los Angeles,40244.0
1,157639,6,37,4853,11.0,28000,4853,Taxi and limousine service,6037.0,Los Angeles,...,Los Angeles County,"Los Angeles County, California",2207265,Urban,Los Angeles,Greater Los Angeles,Los Angeles,10081570,Los Angeles,40244.0
2,157919,6,75,4853,4.0,1000,4853,Taxi and limousine service,6075.0,San Francisco,...,San Francisco County,"San Francisco County, California",172190,Urban,Bay Area,Bay Area,Bay Area,874961,Bay Area,48273.0
3,158355,6,75,4853,147.0,1000,4853,Taxi and limousine service,6075.0,San Francisco,...,San Francisco County,"San Francisco County, California",172190,Urban,Bay Area,Bay Area,Bay Area,874961,Bay Area,48273.0
4,158357,6,37,4853,35.0,28000,4853,Taxi and limousine service,6037.0,Los Angeles,...,Los Angeles County,"Los Angeles County, California",2207265,Urban,Los Angeles,Greater Los Angeles,Los Angeles,10081570,Los Angeles,40244.0


In [29]:
# Narrow the merged frame to the columns the wage analysis needs.
ipums_regions = ipums_w_col[[
    'INDNAICS', 'INCWAGE', 'Industry Title', 'County',
    'CDI Regions', 'PERWT', 'Cost of Living',
]]

In [30]:
# Drop records without a regional cost of living (no region match in the joins above).
# The explicit .copy() makes the result an independent frame, so the new columns
# assigned in the following cells are written to it directly rather than to an object
# pandas still tracks as derived from `ipums_w_col`.
ipums_regions = ipums_regions.dropna(subset=['Cost of Living']).copy()

### Adjust for cost of living

In [31]:
# Statewide cost-of-living figure used as the denominator of the regional factor below.
# NOTE(review): hard-coded; presumably in the same units as the 'Cost of Living' column — source not shown, TODO confirm.
ca_cost_of_living = 38823

In [32]:
# Regional cost of living expressed relative to the statewide figure
# (values above 1 mean the region costs more than the state figure).
ipums_regions['col_factor'] = ipums_regions['Cost of Living'].div(ca_cost_of_living)

In [33]:
# Scale each wage by its region's cost-of-living factor.
# NOTE(review): this multiplies, so wages in higher-cost regions are scaled UP;
# a purchasing-power adjustment would normally divide — confirm the intended direction.
ipums_regions['norm_wage'] = ipums_regions['col_factor'].mul(ipums_regions['INCWAGE'])

In [34]:
# Inspect the frame with the new col_factor and norm_wage columns.
ipums_regions

Unnamed: 0,INDNAICS,INCWAGE,Industry Title,County,CDI Regions,PERWT,Cost of Living,col_factor,norm_wage
0,4853,23100,Taxi and limousine service,Los Angeles,Los Angeles,21.0,40244.0,1.036602,23945.506530
1,4853,28000,Taxi and limousine service,Los Angeles,Los Angeles,11.0,40244.0,1.036602,29024.856400
2,4853,1000,Taxi and limousine service,San Francisco,Bay Area,4.0,48273.0,1.243412,1243.412410
3,4853,1000,Taxi and limousine service,San Francisco,Bay Area,147.0,48273.0,1.243412,1243.412410
4,4853,28000,Taxi and limousine service,Los Angeles,Los Angeles,35.0,40244.0,1.036602,29024.856400
...,...,...,...,...,...,...,...,...,...
224521,3241m,65000,Miscellaneous petroleum and coal products,Sonoma,Bay Area,301.0,48273.0,1.243412,80821.806661
224522,3241m,9600,Miscellaneous petroleum and coal products,San Diego,San Diego-Imperial,111.0,30389.0,0.782758,7514.473379
224523,3241m,53000,Miscellaneous petroleum and coal products,Los Angeles,Los Angeles,122.0,40244.0,1.036602,54939.906756
224524,3241m,98000,Miscellaneous petroleum and coal products,Santa Clara,Bay Area,42.0,48273.0,1.243412,121854.416197


### Statewide Industry % - wage > state mean

In [35]:
# Statewide threshold: person-weighted mean of the cost-of-living-scaled wage,
#   sum(norm_wage * PERWT) / sum(PERWT).
# Computed with vectorized column arithmetic instead of a Python-level iterrows() loop,
# which is far slower on a frame of this size and yields the same mean.
ca_wage_thresh = (
    (ipums_regions['norm_wage'] * ipums_regions['PERWT']).sum()
    / ipums_regions['PERWT'].sum()
)
ca_wage_thresh

53224.35558134873

Person-weighted mean wage without the cost-of-living normalization, for comparison:

In [36]:
# Person-weighted mean of the raw wage, sum(INCWAGE * PERWT) / sum(PERWT),
# computed with vectorized column arithmetic rather than an iterrows() loop.
ca_wt_mean_wage = (
    (ipums_regions['INCWAGE'] * ipums_regions['PERWT']).sum()
    / ipums_regions['PERWT'].sum()
)
ca_wt_mean_wage

50408.48252383878

Unweighted median and mean wage for comparison:

In [37]:
# Unweighted median of the raw wage (every record counts once, PERWT is ignored).
ca_median_wage = ipums_regions['INCWAGE'].median()
ca_median_wage

30000.0

In [38]:
# Unweighted mean of the raw wage (every record counts once, PERWT is ignored).
ca_mean_wage = ipums_regions['INCWAGE'].mean()
ca_mean_wage

53080.0238519243

In [39]:
# Flag records whose raw INCWAGE exceeds the statewide threshold.
# Person weights enter through ca_wage_thresh (a PERWT-weighted mean of norm_wage),
# not through INCWAGE itself.
# NOTE(review): the threshold is built from norm_wage but compared against the
# unscaled INCWAGE — confirm this is intended.
ipums_regions['Above CA Threshold'] = ipums_regions['INCWAGE'] > ca_wage_thresh

In [40]:
# Total person weight per industry, broadcast back onto every record of that industry.
ipums_regions['wt_ind_counts'] = (
    ipums_regions.groupby('Industry Title')['PERWT'].transform('sum')
)

In [41]:
# Cast the boolean flag to 1/0 so it can be multiplied by the person weight in the next cell.
ipums_regions["Above CA Threshold"] = ipums_regions["Above CA Threshold"].astype(int)

In [42]:
# Person weight of each record if it is above the statewide threshold, otherwise 0.
ipums_regions['wt_CA_above_thresh'] = ipums_regions['Above CA Threshold'].mul(ipums_regions['PERWT'])

In [43]:
# Weighted number of above-threshold people per industry, broadcast to every record.
ipums_regions['wt_CA_high_wage_count'] = (
    ipums_regions.groupby('Industry Title')['wt_CA_above_thresh'].transform('sum')
)

In [44]:
# Share (in percent) of each industry's weighted headcount that is above the statewide threshold.
ipums_regions['wt_CA_high_wage_perc'] = (
    ipums_regions['wt_CA_high_wage_count'].div(ipums_regions['wt_ind_counts']).mul(100)
)

In [45]:
# Raw (unweighted) number of records per industry, broadcast to every record.
ipums_regions['unweighted_ind_counts'] = (
    ipums_regions.groupby('Industry Title')['Industry Title'].transform('count')
)

In [46]:
# Inspect the frame with the statewide per-industry columns added.
ipums_regions

Unnamed: 0,INDNAICS,INCWAGE,Industry Title,County,CDI Regions,PERWT,Cost of Living,col_factor,norm_wage,Above CA Threshold,wt_ind_counts,wt_CA_above_thresh,wt_CA_high_wage_count,wt_CA_high_wage_perc,unweighted_ind_counts
0,4853,23100,Taxi and limousine service,Los Angeles,Los Angeles,21.0,40244.0,1.036602,23945.506530,0,130256.0,0.0,12073.0,9.268671,1136
1,4853,28000,Taxi and limousine service,Los Angeles,Los Angeles,11.0,40244.0,1.036602,29024.856400,0,130256.0,0.0,12073.0,9.268671,1136
2,4853,1000,Taxi and limousine service,San Francisco,Bay Area,4.0,48273.0,1.243412,1243.412410,0,130256.0,0.0,12073.0,9.268671,1136
3,4853,1000,Taxi and limousine service,San Francisco,Bay Area,147.0,48273.0,1.243412,1243.412410,0,130256.0,0.0,12073.0,9.268671,1136
4,4853,28000,Taxi and limousine service,Los Angeles,Los Angeles,35.0,40244.0,1.036602,29024.856400,0,130256.0,0.0,12073.0,9.268671,1136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224521,3241m,65000,Miscellaneous petroleum and coal products,Sonoma,Bay Area,301.0,48273.0,1.243412,80821.806661,1,2227.0,301.0,1146.0,51.459362,21
224522,3241m,9600,Miscellaneous petroleum and coal products,San Diego,San Diego-Imperial,111.0,30389.0,0.782758,7514.473379,0,2227.0,0.0,1146.0,51.459362,21
224523,3241m,53000,Miscellaneous petroleum and coal products,Los Angeles,Los Angeles,122.0,40244.0,1.036602,54939.906756,0,2227.0,0.0,1146.0,51.459362,21
224524,3241m,98000,Miscellaneous petroleum and coal products,Santa Clara,Bay Area,42.0,48273.0,1.243412,121854.416197,1,2227.0,42.0,1146.0,51.459362,21


In [47]:
# One row per industry (the per-industry columns are identical on every record of an
# industry, so the first record is representative), ranked by high-wage share.
high_wage_ca_wt = (
    ipums_regions
    .drop_duplicates(subset='Industry Title')
    .loc[:, ['Industry Title', 'unweighted_ind_counts', 'wt_ind_counts',
             'wt_CA_high_wage_count', 'wt_CA_high_wage_perc']]
    .sort_values(by='wt_CA_high_wage_perc', ascending=False)
)

In [48]:
# Renumber the ranked rows from 0 and discard the old row labels.
high_wage_ca_wt = high_wage_ca_wt.reset_index(drop=True)

In [49]:
# high_wage_ca_wt.to_feather('data/high_wage_ca_wt.fea')

In [50]:
# high_wage_ca_wt.to_csv('data/high_wage_ca_wt.csv', index=False)

In [51]:
# Remove the row-display limit so the whole ranking is rendered.
# NOTE(review): set_option changes the setting globally, so every later cell also renders all rows.
pd.set_option('display.max_rows', None)
high_wage_ca_wt

Unnamed: 0,Industry Title,unweighted_ind_counts,wt_ind_counts,wt_CA_high_wage_count,wt_CA_high_wage_perc
0,Coal mining,3,713.0,713.0,100.0
1,Tobacco,1,55.0,55.0,100.0
2,Software publishers,403,40200.0,33769.0,84.002488
3,Metal ore mining,11,1542.0,1220.0,79.118029
4,Internet publishing and broadcasting and web s...,890,83933.0,65500.0,78.038435
5,"Electric and gas, and other combinations",340,29657.0,23036.0,77.674748
6,Not specified utilities,40,3206.0,2430.0,75.795384
7,Computer systems design and related services,6168,606792.0,451647.0,74.431931
8,"Engine, turbine, and power transmission equipm...",52,6383.0,4655.0,72.92809
9,"Other information services, except libraries a...",74,6872.0,4892.0,71.187427


### Breakdown by Regions Pipeline

In [52]:
# Distinct CDI region names, in order of first appearance in the frame.
regions = ipums_regions['CDI Regions'].unique()
regions

array(['Los Angeles', 'Bay Area', 'Central Coast', 'Central Valley',
       'Inland Empire', 'Orange', 'Sacramento', 'San Diego-Imperial',
       'Shasta / Cascades', 'Redwood Coast'], dtype=object)

In [53]:
# Split the statewide frame into one independent copy per CDI region.
# Regions are looked up by NAME rather than by their position in `regions`:
# `unique()` returns values in order of first appearance, so positional lookups
# would silently attach the wrong region to a variable if the row order changed.
# A region missing from the data now raises a KeyError instead.
region_frames = {
    region_name: region_rows.copy()
    for region_name, region_rows in ipums_regions.groupby('CDI Regions', sort=False)
}
df_LA = region_frames['Los Angeles']
df_Bay_Area = region_frames['Bay Area']
df_Cent_Coast = region_frames['Central Coast']
df_Cent_Valley = region_frames['Central Valley']
df_IE = region_frames['Inland Empire']
df_OC = region_frames['Orange']
df_Sac = region_frames['Sacramento']
df_SD = region_frames['San Diego-Imperial']
df_Shasta = region_frames['Shasta / Cascades']
df_Redwood_Coast = region_frames['Redwood Coast']

In [54]:
# All per-region frames in one list so the same processing can be applied to each in a loop.
region_dfs = [df_LA, df_Bay_Area, df_Cent_Coast, df_Cent_Valley, df_IE, df_OC,
              df_Sac, df_SD, df_Shasta, df_Redwood_Coast]

In [55]:
def add_to_region_df(df):
    """Add region-level high-wage columns to ``df`` in place and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Person records for a single region. Must contain the columns
        'norm_wage', 'PERWT', 'INCWAGE' and 'Industry Title'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with these columns added:

        - 'region_wage_thresh': PERWT-weighted mean of 'norm_wage' over all rows.
        - 'above_region_thresh': 1 if 'INCWAGE' exceeds the threshold, else 0.
        - 'wt_reg_ind_counts': sum of PERWT per industry.
        - 'wt_reg_above_thresh': PERWT for above-threshold rows, 0 otherwise.
        - 'wt_reg_high_wage_count': sum of 'wt_reg_above_thresh' per industry.
        - 'wt_reg_high_wage_perc': high-wage count as a percentage of the
          industry's weighted count.

    Notes
    -----
    The frame is modified in place; callers in this notebook rely on that.
    The threshold is derived from 'norm_wage' but compared against the raw
    'INCWAGE' (NOTE(review): confirm this is intended).
    """
    weights = df['PERWT']
    industry = df['Industry Title']

    # Vectorized weighted mean; replaces a row-by-row iterrows() sum.
    df['region_wage_thresh'] = (df['norm_wage'] * weights).sum() / weights.sum()
    # Boolean flag stored as 1/0 so it can be multiplied by the weight.
    df['above_region_thresh'] = (df['INCWAGE'] > df['region_wage_thresh']).astype(int)
    df['wt_reg_ind_counts'] = weights.groupby(industry).transform('sum')
    df['wt_reg_above_thresh'] = df['above_region_thresh'] * weights
    df['wt_reg_high_wage_count'] = df['wt_reg_above_thresh'].groupby(industry).transform('sum')
    df['wt_reg_high_wage_perc'] = (df['wt_reg_high_wage_count'] / df['wt_reg_ind_counts']) * 100
    return df

In [56]:
# add_to_region_df mutates the frame it is given, so the return value is not needed here.
for df in region_dfs:
    add_to_region_df(df)

### Region Industry % - wage > region mean

In [57]:
def get_region_high_wage(df, new_df=None):
    """Build the per-industry high-wage summary for one region.

    Parameters
    ----------
    df : pandas.DataFrame
        Region frame that already carries the region-level columns
        ('region_wage_thresh', 'wt_reg_ind_counts', 'wt_reg_high_wage_count',
        'wt_reg_high_wage_perc') plus 'CDI Regions', 'Industry Title' and
        'unweighted_ind_counts'.
    new_df : optional
        Ignored. Kept only so existing two-argument calls keep working; the
        summary is always derived from ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per industry (the first record of each industry), restricted
        to the summary columns, sorted by 'wt_reg_high_wage_perc' descending,
        with a fresh 0..n-1 index. ``df`` itself is not modified.
    """
    summary_columns = [
        'CDI Regions', 'Industry Title', 'region_wage_thresh', 'unweighted_ind_counts',
        'wt_reg_ind_counts', 'wt_reg_high_wage_count', 'wt_reg_high_wage_perc',
    ]
    return (
        df.drop_duplicates(subset='Industry Title')[summary_columns]
        .sort_values(by='wt_reg_high_wage_perc', ascending=False)
        # drop=True discards the old row labels directly instead of adding them
        # as a column and slicing that column off again.
        .reset_index(drop=True)
    )

In [58]:
# Create one placeholder frame per region as a full copy of the region's row-level frame.
# NOTE(review): these names are only used to seed the list in the next cell; the loop
# further down replaces the list entries, not these variables, so the names themselves
# keep pointing at the row-level copies unless they are rebound.
df_LA_high_wage = df_LA.copy()
df_Bay_Area_high_wage = df_Bay_Area.copy()
df_Cent_Coast_high_wage = df_Cent_Coast.copy()
df_Cent_Valley_high_wage = df_Cent_Valley.copy()
df_IE_high_wage = df_IE.copy()
df_OC_high_wage = df_OC.copy()
df_Sac_high_wage = df_Sac.copy()
df_SD_high_wage = df_SD.copy()
df_Shasta_high_wage = df_Shasta.copy()
# df_nan_high_wage = df_nan.copy()
df_Redwood_Coast_high_wage = df_Redwood_Coast.copy()

In [59]:
# List of per-region frames, in the same region order as `region_dfs`.
region_high_wage_dfs = [df_LA_high_wage, df_Bay_Area_high_wage, df_Cent_Coast_high_wage, 
                        df_Cent_Valley_high_wage, df_IE_high_wage, df_OC_high_wage,
              df_Sac_high_wage, df_SD_high_wage, df_Shasta_high_wage, df_Redwood_Coast_high_wage]

In [60]:
# Build one industry-level summary per region, in the same order as `region_dfs`.
# get_region_high_wage never reads its second argument, so the frame itself is
# passed instead of making a throw-away copy.
region_high_wage_dfs = [
    get_region_high_wage(region_df, region_df) for region_df in region_dfs
]

# Rebind the named per-region variables to the summaries as well. Replacing list
# entries alone leaves these names pointing at the row-level copies they were
# initialised with.
(df_LA_high_wage, df_Bay_Area_high_wage, df_Cent_Coast_high_wage,
 df_Cent_Valley_high_wage, df_IE_high_wage, df_OC_high_wage,
 df_Sac_high_wage, df_SD_high_wage, df_Shasta_high_wage,
 df_Redwood_Coast_high_wage) = region_high_wage_dfs

In [61]:
# Stack the per-region summaries into one long frame (row labels repeat across regions).
regions_high_wage_merged = pd.concat(region_high_wage_dfs)

In [62]:
# regions_high_wage_merged.reset_index().iloc[:,1:].to_feather('data/regions_high_wage_merged.fea')

In [63]:
# regions_high_wage_merged.reset_index().iloc[:,1:].to_csv('data/regions_high_wage_merged.csv', index=False)

In [64]:
# Index the stacked summaries by (region, industry).
# The earlier form, groupby(...).agg(lambda x: x), is not an aggregation: it only
# works when every (region, industry) pair has exactly one row and runs a Python
# call per group and column. set_index states the intent directly, and
# verify_integrity=True raises if a pair is duplicated instead of relying on that
# assumption implicitly. sort_index reproduces the key ordering groupby gave.
# Rebuilding from `region_high_wage_dfs` keeps this cell safe to re-run.
regions_high_wage_merged = (
    pd.concat(region_high_wage_dfs)
    .set_index(['CDI Regions', 'Industry Title'], verify_integrity=True)
    .sort_index()
)

In [65]:
# Within each region (ascending by name), rank industries by high-wage share, highest first.
regions_high_wage_merged = regions_high_wage_merged.sort_values(
    by=['CDI Regions', 'wt_reg_high_wage_perc'],
    ascending=[True, False],
)

In [66]:
# Remove the row-display limit (a global pandas setting) and render the full regional ranking.
pd.set_option('display.max_rows', None)
regions_high_wage_merged

Unnamed: 0_level_0,Unnamed: 1_level_0,region_wage_thresh,unweighted_ind_counts,wt_reg_ind_counts,wt_reg_high_wage_count,wt_reg_high_wage_perc
CDI Regions,Industry Title,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bay Area,Metal ore mining,89015.964501,11,82.0,82.0,100.0
Bay Area,Tires,89015.964501,23,67.0,67.0,100.0
Bay Area,"Veneer, plywood, and engineered wood products",89015.964501,12,66.0,66.0,100.0
Bay Area,Nonmetallic mineral mining and quarrying,89015.964501,58,444.0,361.0,81.306306
Bay Area,Internet publishing and broadcasting and web search portals,89015.964501,890,61429.0,49880.0,81.199433
Bay Area,Software publishers,89015.964501,403,22083.0,17250.0,78.114387
Bay Area,Footwear Manufacturing,89015.964501,28,633.0,459.0,72.511848
Bay Area,Agricultural implements,89015.964501,25,403.0,292.0,72.456576
Bay Area,Computer systems design and related services,89015.964501,6168,346075.0,248452.0,71.791375
Bay Area,"Paint, coating, and adhesives",89015.964501,59,646.0,462.0,71.517028
