In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import warnings
# NOTE(review): with no category or module argument this silences every warning
# for the rest of the session, including pandas diagnostics raised by later
# cells. Consider narrowing the filter to the specific warnings that are noise.
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format='retina'

### Loading Data

In [50]:
# Raw EDD table read from CSV. Later cells filter it on its 'Year' and
# 'Area Name' columns and use its 'Industry Title' / 'Current Employment' columns.
edd = pd.read_csv('data/Current_EDD_0917.csv')

In [51]:
# Per-industry high-wage table read from a feather file. It supplies the
# 'Industry Title', count and percentage columns joined onto the EDD rows below.
high_wage_ca_wt = pd.read_feather('data/high_wage_ca_wt.fea') 

In [52]:
# Regional table read from a feather file.
# NOTE(review): not referenced by any of the cells shown in this part of the notebook.
regions_high_wage_merged = pd.read_feather('data/regions_high_wage_merged.fea') 

In [53]:
# Keep only the rows whose 'Year' equals 2019. The copy detaches the result
# from `edd`, so later edits to edd_2019 never write back into the raw table.
is_2019 = edd['Year'].eq(2019)
edd_2019 = edd[is_2019].copy()

In [54]:
# Keep only the rows whose 'Area Name' is 'California' (copied, so this frame
# can be modified independently of edd_2019).
# NOTE(review): only the area is filtered here; the merged output further down
# still shows one row per month for an industry, so each title can appear many
# times in this frame.
is_california = edd_2019['Area Name'].eq('California')
ca_edd_2019 = edd_2019[is_california].copy()

### Cleaning Data

In [55]:
def normalize_titles(col):
    """Normalize a Series of titles so they can be used as exact-match join keys.

    Each value is converted to text, lower-cased, has '&' spelled out as
    'and', has every character in ``string.punctuation`` removed, and has runs
    of whitespace collapsed to a single space with both ends trimmed.
    Missing values stay missing instead of turning into the text 'nan'/'none'.

    Parameters
    ----------
    col : pandas.Series
        Titles to normalize.

    Returns
    -------
    pandas.Series
        Normalized titles, aligned to the index of ``col``.
    """
    missing = col.isna()
    drop_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = (
        col.astype(str)
        .str.lower()
        # '&' is itself punctuation, so spell it out before punctuation is removed.
        # regex=False keeps this a literal replacement on every pandas version.
        .str.replace('&', 'and', regex=False)
        .str.translate(drop_punctuation)
        # Collapse whitespace only after punctuation is gone: deleting e.g. ' - '
        # otherwise leaves double or trailing spaces that break exact matches.
        .str.split()
        .str.join(' ')
    )
    # Restore missing entries that astype(str) turned into ordinary text.
    return cleaned.where(~missing)

In [56]:
# Overwrite the EDD titles with their normalized form so they can be matched
# against the high-wage table's titles in the merge below.
ca_edd_2019['Industry Title'] = normalize_titles(ca_edd_2019['Industry Title'])

In [57]:
# Apply the same normalization to the high-wage table so both sides of the
# merge use identically formatted 'Industry Title' keys.
high_wage_ca_wt['Industry Title'] = normalize_titles(high_wage_ca_wt['Industry Title'])

In [58]:
# Right join: keep every row of high_wage_ca_wt and attach the California 2019
# EDD rows that share its normalized 'Industry Title'. Titles with no EDD match
# keep NaN in the EDD columns; titles with several EDD rows are repeated.
# validate='m:1' makes pandas raise MergeError if a title occurs more than once
# in high_wage_ca_wt, which would otherwise silently multiply rows.
edd_merged = pd.merge(
    ca_edd_2019,
    high_wage_ca_wt,
    on='Industry Title',
    how='right',
    validate='m:1',
)

In [59]:
# Row counts of the merged frame, the raw EDD table, and the high-wage table.
# NOTE(review): len(edd) counts the unfiltered table; the frame actually merged
# is ca_edd_2019, so its length is probably the more useful comparison here.
print(len(edd_merged), len(edd), len(high_wage_ca_wt))

787 1308590 269


In [60]:
# Display the merged frame to inspect matched and unmatched industries.
edd_merged

Unnamed: 0,Area Type,Area Name,Year,Month,Date,Series Code,Industry Title,Seasonally Adjusted,Current Employment,unweighted_ind_counts,wt_ind_counts,wt_CA_high_wage_count,wt_CA_high_wage_perc
0,,,,,,,coal mining,,,3,713.0,713.0,100.000000
1,,,,,,,tobacco,,,2,62.0,55.0,88.709677
2,State,California,2019.0,January,01/01/2019,50511200.0,software publishers,N,92700.0,405,40383.0,34271.0,84.864918
3,State,California,2019.0,February,02/01/2019,50511200.0,software publishers,N,93800.0,405,40383.0,34271.0,84.864918
4,State,California,2019.0,March,03/01/2019,50511200.0,software publishers,N,94500.0,405,40383.0,34271.0,84.864918
...,...,...,...,...,...,...,...,...,...,...,...,...,...
782,,,,,,,florists,,,172,18010.0,921.0,5.113826
783,,,,,,,book stores and news dealers,,,162,15111.0,697.0,4.612534
784,,,,,,,sewing needlework and piece goods stores,,,83,8011.0,316.0,3.944576
785,,,,,,,private households,,,1529,180264.0,6958.0,3.859894


In [63]:
# Keep only the columns used by the analysis. The explicit .copy() makes the
# result an independent frame, so adding columns to edd_merged in later cells
# does not write into a slice of the merged frame (pandas reports that as
# SettingWithCopyWarning, which this notebook's blanket warning filter hides).
edd_merged = edd_merged[['Industry Title', 'Seasonally Adjusted', 'Current Employment', 'unweighted_ind_counts',
                        'wt_ind_counts', 'wt_CA_high_wage_count', 'wt_CA_high_wage_perc']].copy()

In [64]:
# Display the frame again to confirm which columns remain after the selection.
edd_merged

Unnamed: 0,Industry Title,Seasonally Adjusted,Current Employment,unweighted_ind_counts,wt_ind_counts,wt_CA_high_wage_count,wt_CA_high_wage_perc
0,coal mining,,,3,713.0,713.0,100.000000
1,tobacco,,,2,62.0,55.0,88.709677
2,software publishers,N,92700.0,405,40383.0,34271.0,84.864918
3,software publishers,N,93800.0,405,40383.0,34271.0,84.864918
4,software publishers,N,94500.0,405,40383.0,34271.0,84.864918
...,...,...,...,...,...,...,...
782,florists,,,172,18010.0,921.0,5.113826
783,book stores and news dealers,,,162,15111.0,697.0,4.612534
784,sewing needlework and piece goods stores,,,83,8011.0,316.0,3.944576
785,private households,,,1529,180264.0,6958.0,3.859894


In [30]:
# Print the total row count, then the number and share of missing values per column.
# NOTE(review): the saved output under this cell omits 'unweighted_ind_counts'
# and its execution count is lower than the cells above it, so it looks like it
# was produced by an earlier run; re-run after Restart & Run All.
row_total = len(edd_merged)
print(f'''TOTAL ROWS: {row_total}''')
print('\n')
null_counts = edd_merged.isna().sum()
for column, nulls in null_counts.items():
    percentage = (nulls / row_total) * 100
    print(f'''{column}: {nulls} null values, {percentage:.2f}%''')

TOTAL ROWS: 787


Industry Title: 0 null values, 0.00%
Seasonally Adjusted: 223 null values, 28.34%
Current Employment: 223 null values, 28.34%
wt_ind_counts: 0 null values, 0.00%
wt_CA_high_wage_count: 0 null values, 0.00%
wt_CA_high_wage_perc: 0 null values, 0.00%


In [31]:
# Print how many distinct values each column holds; Series.unique() keeps a
# missing value as an entry, so columns with NaN count it once.
for column, values in edd_merged.items():
    print(f'''{column}: {len(values.unique())} unique values''')

Industry Title: 269 unique values
Seasonally Adjusted: 3 unique values
Current Employment: 421 unique values
wt_ind_counts: 269 unique values
wt_CA_high_wage_count: 269 unique values
wt_CA_high_wage_perc: 269 unique values


### EDD Analysis

In [69]:
# Express the weighted and unweighted sample counts as a percentage of each
# row's 'Current Employment'. Rows without an employment figure yield NaN.
employment = edd_merged['Current Employment']
edd_merged['weighted_sample_perc'] = edd_merged['wt_ind_counts'].div(employment).mul(100)
edd_merged['unweighted_sample_perc'] = edd_merged['unweighted_ind_counts'].div(employment).mul(100)

In [70]:
# Display the frame with the new percentage columns.
# NOTE(review): the saved output also lists 'weighted_sample' and
# 'unweighted_sample', which no cell shown here creates. They are probably
# leftover kernel state from a removed cell; confirm with Restart & Run All.
edd_merged

Unnamed: 0,Industry Title,Seasonally Adjusted,Current Employment,unweighted_ind_counts,wt_ind_counts,wt_CA_high_wage_count,wt_CA_high_wage_perc,weighted_sample,unweighted_sample,weighted_sample_perc,unweighted_sample_perc
0,coal mining,,,3,713.0,713.0,100.000000,,,,
1,tobacco,,,2,62.0,55.0,88.709677,,,,
2,software publishers,N,92700.0,405,40383.0,34271.0,84.864918,43.563107,0.436893,43.563107,0.436893
3,software publishers,N,93800.0,405,40383.0,34271.0,84.864918,43.052239,0.431770,43.052239,0.431770
4,software publishers,N,94500.0,405,40383.0,34271.0,84.864918,42.733333,0.428571,42.733333,0.428571
...,...,...,...,...,...,...,...,...,...,...,...
782,florists,,,172,18010.0,921.0,5.113826,,,,
783,book stores and news dealers,,,162,15111.0,697.0,4.612534,,,,
784,sewing needlework and piece goods stores,,,83,8011.0,316.0,3.944576,,,,
785,private households,,,1529,180264.0,6958.0,3.859894,,,,
