In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
from jqi_functions import *
# Let pandas render up to 4000 rows when a DataFrame is displayed in a cell.
pd.options.display.max_rows = 4000
# Ask the inline matplotlib backend to emit figures in 'retina' format.
%config InlineBackend.figure_format='retina'

### Loading Data

In [2]:
# Read the two raw CSV inputs in one step: the EDD file first, then the IPUMS file.
edd, ipums = (
    pd.read_csv('data/Current_EDD_1121.csv'),
    pd.read_csv('data/IPUMS_2019.csv'),
)

In [3]:
# Read the industry-code crosswalk CSV; it is trimmed by row and column in the next two cells.
ipums_titles = pd.read_csv('data/ind_indnaics_crosswalk_2000_onward_without_code_descriptions.csv')

In [4]:
# Drop the first two rows by position (presumably non-data header rows — confirm against the CSV).
ipums_titles = ipums_titles.iloc[2:, :]

In [5]:
# Keep only the two columns used downstream (the INDNAICS code and its title).
# They are selected by name rather than by position, so the cell still picks the
# right columns if the CSV layout shifts and can be re-run without emptying the frame.
# .copy() yields an independent frame, so later column assignments do not write into a slice.
ipums_titles = ipums_titles[['2018 Onward ACS/PRCS INDNAICS CODE', 'Industry Title']].copy()
ipums_titles

Unnamed: 0,2018 Onward ACS/PRCS INDNAICS CODE,Industry Title
2,111,Crop production
3,,Animal production
4,112,Animal production and aquaculture
5,113M,"Forestry, except logging"
6,1133,Logging
7,114,"Fishing, hunting, and trapping"
8,115,Support activities for agriculture and forestry
9,,
10,211,Oil and gas extraction
11,2121,Coal mining


In [6]:
# Read the EDD title crosswalk workbook and preview its first rows.
# The preview shows a numeric 'Series Code' plus 'EDD Industry Title' and 'LMID Industry Title' columns.
edd_crosswalk = pd.read_excel('data/Industry_Title_Crosswalk.xlsx')
edd_crosswalk.head()

Unnamed: 0,Series Code,EDD Industry Title,SS-NAICS,LMID Industry Title
0,10000000,Mining and Logging,10000000,Mining and Logging
1,10113300,Logging,10113300,Logging
2,10210000,Mining,10210000,Mining
3,10211000,Oil & Gas Extraction,10211000,Oil & Gas Extraction
4,10212000,"Mining, except Oil & Gas",10212000,"Mining, except Oil & Gas"


### Cleaning

In [7]:
# Narrow the IPUMS extract to the four columns referenced later in this notebook.
ipums = ipums.loc[:, ['STATEFIP', 'COUNTYFIP', 'INDNAICS', 'INCWAGE']]

In [8]:
# Keep rows whose state FIPS code is 6 (presumably California, given the `ca_` prefix — confirm).
# The copy makes later column assignments on ca_ipums independent of `ipums`.
ca_ipums = ipums[ipums['STATEFIP'].eq(6)].copy()

In [9]:
# Number of rows remaining after the state filter.
ca_ipums.shape[0]

380091

In [10]:
# Normalize EDD titles with the jqi_functions helper; this column is the left key of the EDD merge below.
# (The helper's body is not shown here; the EDD rows displayed later are lowercase.)
edd['Industry Title'] = normalize_titles(edd['Industry Title'])

In [11]:
# Apply the same normalization to the IPUMS titles, which are later compared against LMID titles.
ipums_titles['Industry Title'] = normalize_titles(ipums_titles['Industry Title'])

In [12]:
# Run the code column through the same helper as ca_ipums['INDNAICS'] so both merge keys share one format.
# NOTE(review): the helper is named for titles; confirm it is safe for codes such as '113M'.
ipums_titles['2018 Onward ACS/PRCS INDNAICS CODE'] = normalize_titles(ipums_titles['2018 Onward ACS/PRCS INDNAICS CODE'])

In [13]:
# Normalize the crosswalk's EDD titles; this column is the right key of the EDD merge below.
edd_crosswalk['EDD Industry Title'] = normalize_titles(edd_crosswalk['EDD Industry Title'])

In [14]:
# Normalize the crosswalk's LMID titles; they are later matched against the IPUMS industry titles.
edd_crosswalk['LMID Industry Title'] = normalize_titles(edd_crosswalk['LMID Industry Title'])

In [15]:
# Normalize the respondent-level INDNAICS codes the same way as the crosswalk codes they are merged with.
# (Codes displayed after the merge, e.g. '3241m', are lowercase.)
ca_ipums['INDNAICS'] = normalize_titles(ca_ipums['INDNAICS'])

Restrict the EDD data to state-level records that are not seasonally adjusted, for December 2019. (TODO: later, take the average across all months of 2019 instead.)

In [16]:
# Keep only rows reported at the 'State' area type.
edd = edd[edd['Area Type'].eq('State')]

In [17]:
# Keep only the rows whose Date string is exactly '12/01/2019'.
edd = edd[edd['Date'].eq('12/01/2019')]

In [18]:
# Keep only rows flagged 'N' in the 'Seasonally Adjusted' column.
edd = edd[edd['Seasonally Adjusted'].eq('N')]

In [19]:
# Display the EDD frame after the three filters above.
edd

Unnamed: 0,Area Type,Area Name,Year,Month,Date,Series Code,Industry Title,Seasonally Adjusted,Current Employment
278604,State,California,2019,December,12/01/2019,80000000,other services,N,577900
291347,State,California,2019,December,12/01/2019,41000000,wholesale trade,N,689700
291495,State,California,2019,December,12/01/2019,50000000,information,N,573800
291666,State,California,2019,December,12/01/2019,42453300,used merchandise stores,N,18600
291675,State,California,2019,December,12/01/2019,32315100,apparel knitting mills,N,1000
291676,State,California,2019,December,12/01/2019,32311600,animal slaughtering and processing,N,22400
291677,State,California,2019,December,12/01/2019,32312000,beverage and tobacco product mfg,N,64000
291678,State,California,2019,December,12/01/2019,32311000,food manufacturing,N,161300
291679,State,California,2019,December,12/01/2019,32311500,dairy product manufacturing,N,17300
291680,State,California,2019,December,12/01/2019,32315200,cut and sew apparel manufacturing,N,29500


### Merge IPUMS with IPUMS Titles

In [20]:
# Attach an industry title to every respondent row by matching INDNAICS codes.
# This uses pandas' default inner join, so rows whose code has no crosswalk entry are dropped
# (the displayed counts go from 380091 rows in ca_ipums to 224526 after this merge).
ipums_merged = ca_ipums.merge(
    ipums_titles,
    left_on='INDNAICS',
    right_on='2018 Onward ACS/PRCS INDNAICS CODE',
)

In [21]:
# Preview the first rows of the merged IPUMS frame.
ipums_merged.head()

Unnamed: 0,STATEFIP,COUNTYFIP,INDNAICS,INCWAGE,2018 Onward ACS/PRCS INDNAICS CODE,Industry Title
0,6,37,4853,23100,4853,taxi and limousine service
1,6,37,4853,28000,4853,taxi and limousine service
2,6,75,4853,1000,4853,taxi and limousine service
3,6,75,4853,1000,4853,taxi and limousine service
4,6,37,4853,28000,4853,taxi and limousine service


In [22]:
# Report the total row count and per-column null counts (helper from jqi_functions).
show_null(ipums_merged)

TOTAL ROWS: 224526
STATEFIP: 0 null values, 0.00%
COUNTYFIP: 0 null values, 0.00%
INDNAICS: 0 null values, 0.00%
INCWAGE: 0 null values, 0.00%
2018 Onward ACS/PRCS INDNAICS CODE: 0 null values, 0.00%
Industry Title: 0 null values, 0.00%


### Merge EDD with EDD Titles & Code Parsing

In [23]:
# Store the Series Code as text so that a fixed character range can be sliced out of it next.
edd_crosswalk['Parsed_Code'] = edd_crosswalk['Series Code'].astype(str)

In [24]:
# Take characters 3-5 of each Series Code (e.g. 10113300 -> '113', 10211000 -> '211').
# The slice is vectorised and derived directly from 'Series Code', so it does not rely on
# the frame having a 0..n-1 index and gives the same result if this cell is re-run
# after 'Parsed_Code' has already been overwritten with the 3-character values.
# The result stays a plain list because the next cell assigns `parsed_codes` to the column.
parsed_codes = edd_crosswalk['Series Code'].astype(str).str[2:5].tolist()

In [25]:
# Overwrite the full-length code strings with the 3-character slices built in the previous cell.
edd_crosswalk['Parsed_Code'] = parsed_codes

In [26]:
# Discard series whose parsed code is '000' (e.g. 10000000 'Mining and Logging' in the preview above).
# The copy makes the filtered frame independent of the original.
edd_crosswalk = edd_crosswalk[edd_crosswalk['Parsed_Code'].ne('000')].copy()

In [27]:
# Join the filtered EDD rows to the crosswalk on the normalized title text (default inner join).
edd_merged = edd.merge(
    edd_crosswalk,
    left_on='Industry Title',
    right_on='EDD Industry Title',
)

In [28]:
# Keep only the title, LMID title, parsed code and employment columns.
edd_merged = edd_merged.loc[
    :, ['Industry Title', 'LMID Industry Title', 'Parsed_Code', 'Current Employment']
]

In [29]:
# Report the row count and per-column null counts of the merged EDD frame.
show_null(edd_merged)

TOTAL ROWS: 259
Industry Title: 0 null values, 0.00%
LMID Industry Title: 0 null values, 0.00%
Parsed_Code: 0 null values, 0.00%
Current Employment: 0 null values, 0.00%


In [30]:
# Count the distinct parsed codes; fewer codes than rows means several series share one code.
edd_merged['Parsed_Code'].nunique()

99

In [33]:
# Total 'Current Employment' per parsed code, returned as a two-column frame.
# NOTE(review): in the output below, code 236 shows rows of 122200 and 83300 but a total of
# 411000, so other series share this code — confirm parent and child series are not both summed.
unique_codes = (
    edd_merged
    .groupby('Parsed_Code', as_index=False)['Current Employment']
    .agg('sum')
)

In [34]:
# Attach the per-code total to every series row. Both frames have 'Current Employment',
# so pandas suffixes them: _x is the row's own value, _y is the per-code total.
edd_merged_test = edd_merged.merge(unique_codes, on='Parsed_Code')

In [49]:
# Show the rows ordered by parsed code (ascending) to compare per-row and per-code employment.
edd_merged_test.sort_values('Parsed_Code')

Unnamed: 0,Industry Title,LMID Industry Title,Parsed_Code,Current Employment_x,Current Employment_y
236,logging,logging,113,1700,1700
199,mining,mining,210,20300,20300
200,oil and gas extraction,oil and gas extraction,211,4500,4500
251,mining except oil and gas,mining except oil and gas,212,5900,5900
237,support activities for mining,support activities for mining,213,9900,9900
231,utilities,utilities,220,57200,57200
166,natural gas distribution,natural gas distribution,221,33400,52200
165,electric power gen trans and distribution,electric power generation trans and distribution,221,18800,52200
129,residential building construction,residential building construction,236,122200,411000
127,nonresidential building construction,nonresidential building construction,236,83300,411000


In [36]:
# Collapse to one row per parsed code. The first row encountered is kept, so the
# LMID title retained for a shared code is whichever series happened to come first.
edd_merged = edd_merged_test.drop_duplicates(subset=['Parsed_Code'], keep='first')

In [38]:
# Keep the LMID title, the parsed code and the per-code employment total (the _y column).
edd_merged = edd_merged.loc[:, ['LMID Industry Title', 'Parsed_Code', 'Current Employment_y']]

In [39]:
# Parsed code = first three characters of the INDNAICS code (e.g. '4853' -> '485',
# '3241m' -> '324'), matching the 3-character codes parsed from the EDD Series Code.
# The vectorised string slice replaces a per-row Python loop that looked values up by
# label and therefore required a 0..n-1 index.
ipums_merged['Parsed_Code'] = ipums_merged['INDNAICS'].astype(str).str[:3]
# The original cell left `parsed_codes` bound to these values as a list; keep that binding.
parsed_codes = ipums_merged['Parsed_Code'].tolist()

In [40]:
# Display the IPUMS frame with the new Parsed_Code column.
ipums_merged

Unnamed: 0,STATEFIP,COUNTYFIP,INDNAICS,INCWAGE,2018 Onward ACS/PRCS INDNAICS CODE,Industry Title,Parsed_Code
0,6,37,4853,23100,4853,taxi and limousine service,485
1,6,37,4853,28000,4853,taxi and limousine service,485
2,6,75,4853,1000,4853,taxi and limousine service,485
3,6,75,4853,1000,4853,taxi and limousine service,485
4,6,37,4853,28000,4853,taxi and limousine service,485
...,...,...,...,...,...,...,...
224521,6,97,3241m,65000,3241m,miscellaneous petroleum and coal products,324
224522,6,73,3241m,9600,3241m,miscellaneous petroleum and coal products,324
224523,6,37,3241m,53000,3241m,miscellaneous petroleum and coal products,324
224524,6,85,3241m,98000,3241m,miscellaneous petroleum and coal products,324


### Merging on Industry Titles

In [41]:
# Left-join on title text so every IPUMS row is kept and unmatched rows get nulls in the EDD columns.
# Both frames have 'Parsed_Code', so the result carries Parsed_Code_x (IPUMS) and Parsed_Code_y (EDD).
title_merge = ipums_merged.merge(
    edd_merged,
    how='left',
    left_on='Industry Title',
    right_on='LMID Industry Title',
)

In [42]:
# Null counts in the EDD-side columns show how many IPUMS rows found no title match.
show_null(title_merge)

TOTAL ROWS: 224526
STATEFIP: 0 null values, 0.00%
COUNTYFIP: 0 null values, 0.00%
INDNAICS: 0 null values, 0.00%
INCWAGE: 0 null values, 0.00%
2018 Onward ACS/PRCS INDNAICS CODE: 0 null values, 0.00%
Industry Title: 0 null values, 0.00%
Parsed_Code_x: 0 null values, 0.00%
LMID Industry Title: 198964 null values, 88.62%
Parsed_Code_y: 198964 null values, 88.62%
Current Employment_y: 198964 null values, 88.62%


In [46]:
# Number of IPUMS rows that found a matching LMID title in the left merge
# (total rows minus rows with a null 'LMID Industry Title'). It is computed from the
# frame, so it stays correct if the inputs change, instead of using numbers copied
# by hand from the previous cell's output.
int(title_merge['LMID Industry Title'].notna().sum())

25562

### Merging on Parsed Codes

In [50]:
# Inner-join on the 3-character parsed code instead of the title text;
# IPUMS rows whose code has no EDD counterpart are dropped.
code_merge = ipums_merged.merge(edd_merged, on='Parsed_Code')

In [51]:
# Report the row count and per-column null counts after the code-based merge.
show_null(code_merge)

TOTAL ROWS: 187059
STATEFIP: 0 null values, 0.00%
COUNTYFIP: 0 null values, 0.00%
INDNAICS: 0 null values, 0.00%
INCWAGE: 0 null values, 0.00%
2018 Onward ACS/PRCS INDNAICS CODE: 0 null values, 0.00%
Industry Title: 0 null values, 0.00%
Parsed_Code: 0 null values, 0.00%
LMID Industry Title: 0 null values, 0.00%
Current Employment_y: 0 null values, 0.00%


In [52]:
# Display the code-merged frame: each IPUMS row with its LMID title and per-code employment total.
code_merge

Unnamed: 0,STATEFIP,COUNTYFIP,INDNAICS,INCWAGE,2018 Onward ACS/PRCS INDNAICS CODE,Industry Title,Parsed_Code,LMID Industry Title,Current Employment_y
0,6,37,4853,23100,4853,taxi and limousine service,485,transit and ground passenger transportation,52400
1,6,37,4853,28000,4853,taxi and limousine service,485,transit and ground passenger transportation,52400
2,6,75,4853,1000,4853,taxi and limousine service,485,transit and ground passenger transportation,52400
3,6,75,4853,1000,4853,taxi and limousine service,485,transit and ground passenger transportation,52400
4,6,37,4853,28000,4853,taxi and limousine service,485,transit and ground passenger transportation,52400
...,...,...,...,...,...,...,...,...,...
187054,6,41,3133,0,3133,textile and fabric finishing and fabric coatin...,313,textile mills,5600
187055,6,59,3133,15100,3133,textile and fabric finishing and fabric coatin...,313,textile mills,5600
187056,6,37,3133,3600,3133,textile and fabric finishing and fabric coatin...,313,textile mills,5600
187057,6,65,3133,70000,3133,textile and fabric finishing and fabric coatin...,313,textile mills,5600
