In [155]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import warnings
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format='retina'

### Loading Data

In [35]:
ipums = pd.read_csv('IPUMS_2019.csv')

In [36]:
ipums_titles = pd.read_csv('ind_indnaics_crosswalk_2000_onward_without_code_descriptions.csv')

In [37]:
ipums_titles = ipums_titles.iloc[2:]

In [38]:
ipums_titles = ipums_titles.iloc[:,10:]

In [39]:
county_info = pd.read_csv('county_to_regions_key - Sheet1.csv')

### Cleaning

In [6]:
ipums = ipums[['STATEFIP', 
               'COUNTYFIP',
              'CITY',
              'INDNAICS',
              'INCTOT',
              'INCWAGE']]

In [7]:
ca_ipums = ipums.loc[ipums['STATEFIP'] == 6].copy()

In [114]:
ca_ipums = ca_ipums.reset_index()

In [115]:
len(ca_ipums)

380091

In [116]:
def normalize_titles(col):
    col = col.astype(str)
    col = col.str.strip()
    col = col.str.lower()
    col = col.str.replace('&', 'and')
    col = col.apply(lambda x:''.join([i for i in x if i not in string.punctuation]))
    return col

In [117]:
ca_ipums['INDNAICS'] = normalize_titles(ca_ipums['INDNAICS'])

In [118]:
ipums_titles['2018 Onward ACS/PRCS INDNAICS CODE'] = normalize_titles(ipums_titles['2018 Onward ACS/PRCS INDNAICS CODE'])

### Merging on NAICS Codes

In [119]:
ca_ipums.columns.values

array(['level_0', 'index', 'STATEFIP', 'COUNTYFIP', 'CITY', 'INDNAICS',
       'INCTOT', 'INCWAGE'], dtype=object)

In [120]:
ipums_titles.columns.values

array(['2018 Onward ACS/PRCS INDNAICS CODE', 'Industry Title'],
      dtype=object)

In [121]:
ipums_merged = pd.merge(ca_ipums, ipums_titles,
                              left_on = 'INDNAICS', right_on = '2018 Onward ACS/PRCS INDNAICS CODE')

In [122]:
print(len(ca_ipums), len(ipums_titles), ipums_merged.shape)

380091 402 (224526, 10)


In [123]:
ipums_merged

Unnamed: 0,level_0,index,STATEFIP,COUNTYFIP,CITY,INDNAICS,INCTOT,INCWAGE,2018 Onward ACS/PRCS INDNAICS CODE,Industry Title
0,0,156866,6,37,5140,4853,23100,23100,4853,Taxi and limousine service
1,773,157639,6,37,3730,4853,53500,28000,4853,Taxi and limousine service
2,1053,157919,6,75,6290,4853,35800,1000,4853,Taxi and limousine service
3,1489,158355,6,75,6290,4853,2800,1000,4853,Taxi and limousine service
4,1491,158357,6,37,3730,4853,53500,28000,4853,Taxi and limousine service
...,...,...,...,...,...,...,...,...,...,...
224521,323705,480571,6,97,0,3241m,80000,65000,3241m,Miscellaneous petroleum and coal products
224522,363693,520559,6,73,0,3241m,9600,9600,3241m,Miscellaneous petroleum and coal products
224523,366539,523405,6,37,0,3241m,53000,53000,3241m,Miscellaneous petroleum and coal products
224524,369272,526138,6,85,0,3241m,98000,98000,3241m,Miscellaneous petroleum and coal products


In [124]:
print(f'''TOTAL ROWS: {len(ipums_merged)}''')
print('\n')
for column in ipums_merged.columns.values:
    total = len(ipums_merged)
    nulls = ipums_merged[column].isna().sum()
    percentage = (nulls / total) * 100
    print(f'''{column}: {nulls} null values, {percentage:.2f}%''')

TOTAL ROWS: 224526


level_0: 0 null values, 0.00%
index: 0 null values, 0.00%
STATEFIP: 0 null values, 0.00%
COUNTYFIP: 0 null values, 0.00%
CITY: 0 null values, 0.00%
INDNAICS: 0 null values, 0.00%
INCTOT: 0 null values, 0.00%
INCWAGE: 0 null values, 0.00%
2018 Onward ACS/PRCS INDNAICS CODE: 0 null values, 0.00%
Industry Title: 0 null values, 0.00%


In [125]:
for column in ipums_merged.columns.values:
    print(f'''{column}: {len(ipums_merged[column].unique())} unique values''')

level_0: 224526 unique values
index: 224526 unique values
STATEFIP: 1 unique values
COUNTYFIP: 35 unique values
CITY: 23 unique values
INDNAICS: 269 unique values
INCTOT: 7610 unique values
INCWAGE: 889 unique values
2018 Onward ACS/PRCS INDNAICS CODE: 269 unique values
Industry Title: 269 unique values


### Merging on counties

In [126]:
ipums_w_counties = pd.merge(ipums_merged, county_info, on='COUNTYFIP', how='left')

In [127]:
print(len(ipums_merged), len(county_info), ipums_w_counties.shape)

224526 58 (224526, 22)


In [128]:
ipums_w_counties.head()

Unnamed: 0,level_0,index,STATEFIP,COUNTYFIP,CITY,INDNAICS,INCTOT,INCWAGE,2018 Onward ACS/PRCS INDNAICS CODE,Industry Title,...,State,"County, State",EDD County,Census County,Population - Households,Rural/Urban,Redstone Regions,WF Regions,CDI Regions,Population
0,0,156866,6,37,5140,4853,23100,23100,4853,Taxi and limousine service,...,California,"Los Angeles, California",Los Angeles County,"Los Angeles County, California",2207265,Urban,Los Angeles,Greater Los Angeles,Los Angeles,10081570
1,773,157639,6,37,3730,4853,53500,28000,4853,Taxi and limousine service,...,California,"Los Angeles, California",Los Angeles County,"Los Angeles County, California",2207265,Urban,Los Angeles,Greater Los Angeles,Los Angeles,10081570
2,1053,157919,6,75,6290,4853,35800,1000,4853,Taxi and limousine service,...,California,"San Francisco, California",San Francisco County,"San Francisco County, California",172190,Urban,Bay Area,Bay Area,Bay Area,874961
3,1489,158355,6,75,6290,4853,2800,1000,4853,Taxi and limousine service,...,California,"San Francisco, California",San Francisco County,"San Francisco County, California",172190,Urban,Bay Area,Bay Area,Bay Area,874961
4,1491,158357,6,37,3730,4853,53500,28000,4853,Taxi and limousine service,...,California,"Los Angeles, California",Los Angeles County,"Los Angeles County, California",2207265,Urban,Los Angeles,Greater Los Angeles,Los Angeles,10081570


In [156]:
ipums_w_counties_short = ipums_w_counties[['INDNAICS',
                                          'INCTOT',
                                          'INCWAGE',
                                          'Industry Title',
                                          'County',
                                          'CDI Regions',
                                          'Population']]

In [157]:
ipums_w_counties_short

Unnamed: 0,INDNAICS,INCTOT,INCWAGE,Industry Title,County,CDI Regions,Population
0,4853,23100,23100,Taxi and limousine service,Los Angeles,Los Angeles,10081570
1,4853,53500,28000,Taxi and limousine service,Los Angeles,Los Angeles,10081570
2,4853,35800,1000,Taxi and limousine service,San Francisco,Bay Area,874961
3,4853,2800,1000,Taxi and limousine service,San Francisco,Bay Area,874961
4,4853,53500,28000,Taxi and limousine service,Los Angeles,Los Angeles,10081570
...,...,...,...,...,...,...,...
224521,3241m,80000,65000,Miscellaneous petroleum and coal products,Sonoma,Bay Area,499772
224522,3241m,9600,9600,Miscellaneous petroleum and coal products,San Diego,San Diego-Imperial,3316073
224523,3241m,53000,53000,Miscellaneous petroleum and coal products,Los Angeles,Los Angeles,10081570
224524,3241m,98000,98000,Miscellaneous petroleum and coal products,Santa Clara,Bay Area,1927470


### Statewide Industry % - wage > state median

In [158]:
ca_median_income = ipums_w_counties_short['INCTOT'].median()
ca_median_income

40000.0

In [159]:
ca_median_wage = ipums_w_counties_short['INCWAGE'].median()
ca_median_wage

30000.0

In [160]:
ca_mean_income = ipums_w_counties_short['INCTOT'].mean()
ca_mean_income

64166.89874223921

In [161]:
ca_mean_wage = ipums_w_counties_short['INCWAGE'].mean()
ca_mean_wage

52498.9479614833

In [162]:
ipums_w_counties_short

Unnamed: 0,INDNAICS,INCTOT,INCWAGE,Industry Title,County,CDI Regions,Population
0,4853,23100,23100,Taxi and limousine service,Los Angeles,Los Angeles,10081570
1,4853,53500,28000,Taxi and limousine service,Los Angeles,Los Angeles,10081570
2,4853,35800,1000,Taxi and limousine service,San Francisco,Bay Area,874961
3,4853,2800,1000,Taxi and limousine service,San Francisco,Bay Area,874961
4,4853,53500,28000,Taxi and limousine service,Los Angeles,Los Angeles,10081570
...,...,...,...,...,...,...,...
224521,3241m,80000,65000,Miscellaneous petroleum and coal products,Sonoma,Bay Area,499772
224522,3241m,9600,9600,Miscellaneous petroleum and coal products,San Diego,San Diego-Imperial,3316073
224523,3241m,53000,53000,Miscellaneous petroleum and coal products,Los Angeles,Los Angeles,10081570
224524,3241m,98000,98000,Miscellaneous petroleum and coal products,Santa Clara,Bay Area,1927470


In [163]:
ipums_w_counties_short['Above CA Median'] = ipums_w_counties_short['INCWAGE'] > ca_median_wage

In [164]:
ipums_w_counties_short['Industry Counts'] = ipums_w_counties_short['Industry Title'].groupby(ipums_w_counties_short['Industry Title']).transform('count')

In [165]:
ipums_w_counties_short['High wage count'] = ipums_w_counties_short['Above CA Median'].groupby(
    ipums_w_counties_short['Industry Title']).transform('sum')

In [166]:
ipums_w_counties_short['High wage percentage'] = (ipums_w_counties_short['High wage count'] / ipums_w_counties_short['Industry Counts']) * 100

In [170]:
high_wage_ca = ipums_w_counties_short.drop_duplicates(subset='Industry Title')
high_wage_ca = high_wage_ca[['Industry Title', 'Industry Counts', 'High wage count', 'High wage percentage']]
high_wage_ca = high_wage_ca.sort_values(by='High wage percentage', ascending=False)

In [171]:
high_wage_ca = high_wage_ca.reset_index().iloc[:,1:]

In [172]:
high_wage_ca

Unnamed: 0,Industry Title,Industry Counts,High wage count,High wage percentage
0,Pipeline transportation,16,16,100.0
1,Coal mining,3,3,100.0
2,"Engine, turbine, and power transmission equipm...",53,49,92.45283
3,Software publishers,405,356,87.901235
4,Internet publishing and broadcasting and web s...,898,776,86.414254
5,"Electric and gas, and other combinations",364,309,84.89011
6,Not specified utilities,41,34,82.926829
7,Computer and peripheral equipment manufacturing,524,434,82.824427
8,"Other information services, except libraries a...",74,61,82.432432
9,Sewage treatment facilities,108,89,82.407407


### County Industry % - wage > county median

In [173]:
ipums_w_counties_short['County Median Wage'] = ipums_w_counties_short['INCWAGE'].groupby(ipums_w_counties_short['County']).transform('median')

In [175]:
ipums_w_counties_short['Above County Median'] = ipums_w_counties_short['INCWAGE'] > ipums_w_counties_short['County Median Wage']

In [180]:
df = ipums_w_counties_short.groupby(['County','Industry Title']).size().reset_index().rename(columns={0:'County Industry Count'})

In [181]:
df

Unnamed: 0,County,Industry Title,County Industry Count
0,Alameda,"Accounting, tax preparation, bookkeeping and p...",86
1,Alameda,Administration of economic programs and space ...,33
2,Alameda,Administration of environmental quality and ho...,11
3,Alameda,Administration of human resource programs,82
4,Alameda,"Advertising, public relations, and related ser...",54
...,...,...,...
6936,Yolo,Vocational rehabilitation services,2
6937,Yolo,Warehousing and storage,4
6938,Yolo,Waste management and remediation services,1
6939,Yolo,"Water, steam, air conditioning, and irrigation...",7
