In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
from jqi_functions import *
pd.options.display.max_rows = 4000
%config InlineBackend.figure_format='retina'

### Loading Data

In [64]:
edd = pd.read_csv('data/Current_EDD_1121.csv')
ipums = pd.read_csv('data/IPUMS_2019.csv')

In [71]:
edd

Unnamed: 0,Area Type,Area Name,Year,Month,Date,Series Code,Industry Title,Seasonally Adjusted,Current Employment
0,County,Alameda County,2014,January,01/01/2014,6000000,Goods Producing,N,100000
1,County,Alameda County,2014,January,01/01/2014,70710000,"Arts, Entertainment, and Recreation",N,8800
2,County,Alameda County,2014,January,01/01/2014,80000000,Other Services,N,24400
3,County,Alameda County,2014,January,01/01/2014,65000000,Educational and Health Services,N,110700
4,County,Alameda County,2014,January,01/01/2014,50000000,Information,N,13900
...,...,...,...,...,...,...,...,...,...
1315851,County,San Diego County,1997,May,05/01/1997,90932994,Special Districts plus Indian Tribes,N,14200
1315852,County,San Diego County,1997,May,05/01/1997,90920000,State Government,N,32600
1315853,County,San Diego County,1997,May,05/01/1997,90940000,State and Local Government,N,151400
1315854,County,San Diego County,1997,May,05/01/1997,42453000,Miscellaneous Store Retailers,N,9300


In [69]:
ipums_titles = pd.read_csv('data/ind_indnaics_crosswalk_2000_onward_without_code_descriptions.csv')

In [29]:
ipums_titles = ipums_titles.iloc[2:]

In [30]:
ipums_titles = ipums_titles.iloc[:,10:]
ipums_titles

Unnamed: 0,2018 Onward ACS/PRCS INDNAICS CODE,Industry Title
2,111,Crop production
3,,Animal production
4,112,Animal production and aquaculture
5,113M,"Forestry, except logging"
6,1133,Logging
7,114,"Fishing, hunting, and trapping"
8,115,Support activities for agriculture and forestry
9,,
10,211,Oil and gas extraction
11,2121,Coal mining


In [70]:
edd_crosswalk = pd.read_excel('data/Industry_Title_Crosswalk.xlsx')
edd_crosswalk

Unnamed: 0,Series Code,EDD Industry Title,SS-NAICS,LMID Industry Title
0,10000000,Mining and Logging,10000000,Mining and Logging
1,10113300,Logging,10113300,Logging
2,10210000,Mining,10210000,Mining
3,10211000,Oil & Gas Extraction,10211000,Oil & Gas Extraction
4,10212000,"Mining, except Oil & Gas",10212000,"Mining, except Oil & Gas"
5,10213000,Support Activities for Mining,10213000,Support Activities for Mining
6,11000000,Total Farm,11000000,Total Farm
7,20000000,Construction,20000000,Construction
8,20236000,Construction of Buildings,20236000,Construction of Buildings
9,20236100,Residential Building Construction,20236100,Residential Building Construction


### Cleaning

In [32]:
ipums = ipums[['STATEFIP', 
               'COUNTYFIP',
              'INDNAICS',
              'INCWAGE']]

In [72]:
ca_ipums = ipums.loc[ipums['STATEFIP'] == 6].copy()

In [34]:
len(ca_ipums)

380091

In [35]:
edd['Industry Title'] = normalize_titles(edd['Industry Title'])

In [36]:
ipums_titles['Industry Title'] = normalize_titles(ipums_titles['Industry Title'])

In [37]:
ipums_titles['2018 Onward ACS/PRCS INDNAICS CODE'] = normalize_titles(ipums_titles['2018 Onward ACS/PRCS INDNAICS CODE'])

In [38]:
edd_crosswalk['EDD Industry Title'] = normalize_titles(edd_crosswalk['EDD Industry Title'])

In [39]:
edd_crosswalk['LMID Industry Title'] = normalize_titles(edd_crosswalk['LMID Industry Title'])

In [40]:
ca_ipums['INDNAICS'] = normalize_titles(ca_ipums['INDNAICS'])

Only looking at state level, no seasonal adjustments, during December 2019 (Later: take average across months in 2019)

In [41]:
edd = edd.loc[edd['Area Type'] == 'State']

In [42]:
edd = edd.loc[edd['Date'] == '12/01/2019']

In [43]:
edd = edd.loc[edd['Seasonally Adjusted'] == 'N']

In [44]:
edd

Unnamed: 0,Area Type,Area Name,Year,Month,Date,Series Code,Industry Title,Seasonally Adjusted,Current Employment
278604,State,California,2019,December,12/01/2019,80000000,other services,N,577900
291347,State,California,2019,December,12/01/2019,41000000,wholesale trade,N,689700
291495,State,California,2019,December,12/01/2019,50000000,information,N,573800
291666,State,California,2019,December,12/01/2019,42453300,used merchandise stores,N,18600
291675,State,California,2019,December,12/01/2019,32315100,apparel knitting mills,N,1000
291676,State,California,2019,December,12/01/2019,32311600,animal slaughtering and processing,N,22400
291677,State,California,2019,December,12/01/2019,32312000,beverage and tobacco product mfg,N,64000
291678,State,California,2019,December,12/01/2019,32311000,food manufacturing,N,161300
291679,State,California,2019,December,12/01/2019,32311500,dairy product manufacturing,N,17300
291680,State,California,2019,December,12/01/2019,32315200,cut and sew apparel manufacturing,N,29500


### Merge IPUMS with IPUMS Titles

In [45]:
ipums_merged = pd.merge(ca_ipums, ipums_titles,
                              left_on = 'INDNAICS', right_on = '2018 Onward ACS/PRCS INDNAICS CODE')

In [46]:
ipums_merged.head()

Unnamed: 0,STATEFIP,COUNTYFIP,INDNAICS,INCWAGE,2018 Onward ACS/PRCS INDNAICS CODE,Industry Title
0,6,37,4853,23100,4853,taxi and limousine service
1,6,37,4853,28000,4853,taxi and limousine service
2,6,75,4853,1000,4853,taxi and limousine service
3,6,75,4853,1000,4853,taxi and limousine service
4,6,37,4853,28000,4853,taxi and limousine service


In [47]:
show_null(ipums_merged)

TOTAL ROWS: 224526
STATEFIP: 0 null values, 0.00%
COUNTYFIP: 0 null values, 0.00%
INDNAICS: 0 null values, 0.00%
INCWAGE: 0 null values, 0.00%
2018 Onward ACS/PRCS INDNAICS CODE: 0 null values, 0.00%
Industry Title: 0 null values, 0.00%


### Merge EDD with EDD Titles & Code Parsing

In [48]:
edd_crosswalk['Parsed_Code'] = edd_crosswalk['Series Code'].astype(str)

In [49]:
parsed_codes = []
for i in range(len(edd_crosswalk['Parsed_Code'])):
    parsed_codes.append(edd_crosswalk['Parsed_Code'][i][2:5])

In [50]:
edd_crosswalk['Parsed_Code'] = parsed_codes

In [51]:
# edd_crosswalk = edd_crosswalk.loc[edd_crosswalk['Parsed_Code'] != '000'].copy()

In [52]:
edd_crosswalk

Unnamed: 0,Series Code,EDD Industry Title,SS-NAICS,LMID Industry Title,Parsed_Code
0,10000000,mining and logging,10000000,mining and logging,0
1,10113300,logging,10113300,logging,113
2,10210000,mining,10210000,mining,210
3,10211000,oil and gas extraction,10211000,oil and gas extraction,211
4,10212000,mining except oil and gas,10212000,mining except oil and gas,212
5,10213000,support activities for mining,10213000,support activities for mining,213
6,11000000,total farm,11000000,total farm,0
7,20000000,construction,20000000,construction,0
8,20236000,construction of buildings,20236000,construction of buildings,236
9,20236100,residential building construction,20236100,residential building construction,236


In [54]:
edd_merged = pd.merge(edd, edd_crosswalk,
                              left_on = 'Industry Title', right_on = 'EDD Industry Title')

In [55]:
edd_merged = edd_merged[['Industry Title', 'LMID Industry Title', 'Parsed_Code', 'Current Employment']]

In [56]:
show_null(edd_merged)

TOTAL ROWS: 276
Industry Title: 0 null values, 0.00%
LMID Industry Title: 0 null values, 0.00%
Parsed_Code: 0 null values, 0.00%
Current Employment: 0 null values, 0.00%


In [57]:
edd_merged

Unnamed: 0,Industry Title,LMID Industry Title,Parsed_Code,Current Employment
0,other services,other services,0,577900
1,wholesale trade,wholesale trade,0,689700
2,information,information,0,573800
3,used merchandise stores,used merchandise stores,453,18600
4,apparel knitting mills,apparel knitting mills,315,1000
5,animal slaughtering and processing,animal slaughtering and processing,311,22400
6,beverage and tobacco product mfg,beverage and tobacco product manufacturing,312,64000
7,food manufacturing,food manufacturing,311,161300
8,dairy product manufacturing,dairy product manufacturing,311,17300
9,cut and sew apparel manufacturing,cut and sew apparel manufacturing,315,29500


In [59]:
# edd_merged.to_csv('edd_merged.csv', index=False)

In [60]:
# unique_codes = edd_merged.groupby(['Parsed_Code'], as_index=False)['Current Employment'].sum()

In [79]:
# edd_merged_test = pd.merge(edd_merged, unique_codes, on='Parsed_Code')

In [62]:
# edd_merged_test.sort_values(by='Parsed_Code', ascending=True)

In [36]:
len(edd_crosswalk['Series Code'].unique())

277

In [37]:
edd_merged = edd_merged_test.drop_duplicates(subset='Parsed_Code')

In [38]:
edd_merged = edd_merged[['Industry Title', 'LMID Industry Title', 'Parsed_Code', 'Current Employment_y']]

In [70]:
ipums_merged['Parsed_Code'] = ipums_merged['INDNAICS'].astype(str)
parsed_codes = []
for i in range(len(ipums_merged['Parsed_Code'])):
    parsed_codes.append(ipums_merged['Parsed_Code'][i][:3])
ipums_merged['Parsed_Code'] = parsed_codes

In [71]:
ipums_merged

Unnamed: 0,STATEFIP,COUNTYFIP,INDNAICS,INCWAGE,2018 Onward ACS/PRCS INDNAICS CODE,Industry Title,Parsed_Code
0,6,37,4853,23100,4853,taxi and limousine service,485
1,6,37,4853,28000,4853,taxi and limousine service,485
2,6,75,4853,1000,4853,taxi and limousine service,485
3,6,75,4853,1000,4853,taxi and limousine service,485
4,6,37,4853,28000,4853,taxi and limousine service,485
...,...,...,...,...,...,...,...
224521,6,97,3241m,65000,3241m,miscellaneous petroleum and coal products,324
224522,6,73,3241m,9600,3241m,miscellaneous petroleum and coal products,324
224523,6,37,3241m,53000,3241m,miscellaneous petroleum and coal products,324
224524,6,85,3241m,98000,3241m,miscellaneous petroleum and coal products,324


In [72]:
# ipums_merged.to_csv('ca_ipums_merged.csv', index=False)

### Merging on Industry Titles

In [68]:
title_merge = pd.merge(ipums_merged, edd_merged,
                              left_on = 'Industry Title', right_on = 'LMID Industry Title', how='left')

In [69]:
show_null(title_merge)

TOTAL ROWS: 224526
STATEFIP: 0 null values, 0.00%
COUNTYFIP: 0 null values, 0.00%
INDNAICS: 0 null values, 0.00%
INCWAGE: 0 null values, 0.00%
2018 Onward ACS/PRCS INDNAICS CODE: 0 null values, 0.00%
Industry Title_x: 0 null values, 0.00%
Parsed_Code_x: 0 null values, 0.00%
Industry Title_y: 198964 null values, 88.62%
LMID Industry Title: 198964 null values, 88.62%
Parsed_Code_y: 198964 null values, 88.62%
Current Employment_y: 198964 null values, 88.62%


In [70]:
ipums_merged[~ipums_merged['Industry Title'].isin(edd_merged['LMID Industry Title'])]

Unnamed: 0,STATEFIP,COUNTYFIP,INDNAICS,INCWAGE,2018 Onward ACS/PRCS INDNAICS CODE,Industry Title,Parsed_Code
0,6,37,4853,23100,4853,taxi and limousine service,485
1,6,37,4853,28000,4853,taxi and limousine service,485
2,6,75,4853,1000,4853,taxi and limousine service,485
3,6,75,4853,1000,4853,taxi and limousine service,485
4,6,37,4853,28000,4853,taxi and limousine service,485
...,...,...,...,...,...,...,...
224521,6,97,3241m,65000,3241m,miscellaneous petroleum and coal products,324
224522,6,73,3241m,9600,3241m,miscellaneous petroleum and coal products,324
224523,6,37,3241m,53000,3241m,miscellaneous petroleum and coal products,324
224524,6,85,3241m,98000,3241m,miscellaneous petroleum and coal products,324


In [71]:
ipums_merged[~ipums_merged['Industry Title'].isin(edd_merged['Industry Title'])] # checking EDD Industry Title

Unnamed: 0,STATEFIP,COUNTYFIP,INDNAICS,INCWAGE,2018 Onward ACS/PRCS INDNAICS CODE,Industry Title,Parsed_Code
0,6,37,4853,23100,4853,taxi and limousine service,485
1,6,37,4853,28000,4853,taxi and limousine service,485
2,6,75,4853,1000,4853,taxi and limousine service,485
3,6,75,4853,1000,4853,taxi and limousine service,485
4,6,37,4853,28000,4853,taxi and limousine service,485
...,...,...,...,...,...,...,...
224521,6,97,3241m,65000,3241m,miscellaneous petroleum and coal products,324
224522,6,73,3241m,9600,3241m,miscellaneous petroleum and coal products,324
224523,6,37,3241m,53000,3241m,miscellaneous petroleum and coal products,324
224524,6,85,3241m,98000,3241m,miscellaneous petroleum and coal products,324


In [72]:
# even more mismatches with EDD Industry Title

In [73]:
224526 - 198964

25562

### Merging on Parsed Codes

In [81]:
code_merge = pd.merge(ipums_merged, edd_merged, on='Parsed_Code')

In [82]:
show_null(code_merge)

TOTAL ROWS: 908369
STATEFIP: 0 null values, 0.00%
COUNTYFIP: 0 null values, 0.00%
INDNAICS: 0 null values, 0.00%
INCWAGE: 0 null values, 0.00%
2018 Onward ACS/PRCS INDNAICS CODE: 0 null values, 0.00%
Industry Title_x: 0 null values, 0.00%
Parsed_Code: 0 null values, 0.00%
Industry Title_y: 0 null values, 0.00%
LMID Industry Title: 0 null values, 0.00%
Current Employment: 0 null values, 0.00%


In [83]:
code_merge

Unnamed: 0,STATEFIP,COUNTYFIP,INDNAICS,INCWAGE,2018 Onward ACS/PRCS INDNAICS CODE,Industry Title_x,Parsed_Code,Industry Title_y,LMID Industry Title,Current Employment
0,6,37,4853,23100,4853,taxi and limousine service,485,transit and ground passenger transportat,transit and ground passenger transportation,52400
1,6,37,4853,28000,4853,taxi and limousine service,485,transit and ground passenger transportat,transit and ground passenger transportation,52400
2,6,75,4853,1000,4853,taxi and limousine service,485,transit and ground passenger transportat,transit and ground passenger transportation,52400
3,6,75,4853,1000,4853,taxi and limousine service,485,transit and ground passenger transportat,transit and ground passenger transportation,52400
4,6,37,4853,28000,4853,taxi and limousine service,485,transit and ground passenger transportat,transit and ground passenger transportation,52400
...,...,...,...,...,...,...,...,...,...,...
908364,6,41,3133,0,3133,textile and fabric finishing and fabric coatin...,313,residualtextile mills,textile mills,5600
908365,6,59,3133,15100,3133,textile and fabric finishing and fabric coatin...,313,residualtextile mills,textile mills,5600
908366,6,37,3133,3600,3133,textile and fabric finishing and fabric coatin...,313,residualtextile mills,textile mills,5600
908367,6,65,3133,70000,3133,textile and fabric finishing and fabric coatin...,313,residualtextile mills,textile mills,5600


In [84]:
pd.set_option('display.max_rows', None)
code_merge_antijoin = ipums_merged[~ipums_merged['Parsed_Code'].isin(edd_merged['Parsed_Code'])]
len(code_merge_antijoin)

37467

In [85]:
antijoin_unique = code_merge_antijoin.drop_duplicates(subset='Parsed_Code')
antijoin_unique

Unnamed: 0,STATEFIP,COUNTYFIP,INDNAICS,INCWAGE,2018 Onward ACS/PRCS INDNAICS CODE,Industry Title,Parsed_Code
14768,6,111,92m1,9800,92m1,administration of environmental quality and ho...,92m
15193,6,73,928110p4,30000,928110p4,us marines,928
15693,6,99,4ms,0,4ms,not specified retail trade,4ms
57183,6,107,111,6000,111,crop production,111
59958,6,0,23,13000,23,construction the cleaning of buildings and dwe...,23
74093,6,0,115,28000,115,support activities for agriculture and forestry,115
81319,6,19,112,1500,112,animal production and aquaculture,112
81798,6,37,999920,0,999920,unemployed with no work experience in past 5 y...,999
83811,6,85,52m2,6000,52m2,securities commodities funds trusts and other ...,52m
156668,6,83,814,0,814,private households,814
