In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import warnings
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format='retina'

### Loading Data

In [132]:
ipums = pd.read_csv('IPUMS_2019.csv')

In [147]:
ipums.columns

Index(['YEAR', 'SAMPLE', 'SERIAL', 'CBSERIAL', 'HHWT', 'CLUSTER', 'CPI99',
       'STATEFIP', 'COUNTYFIP', 'CITY', 'STRATA', 'GQ', 'OWNERSHP',
       'OWNERSHPD', 'OWNCOST', 'RENTGRS', 'HHINCOME', 'CILAPTOP', 'CIHISPEED',
       'PERNUM', 'PERWT', 'FAMSIZE', 'AGE', 'RACE', 'RACED', 'HISPAN',
       'HISPAND', 'EDUC', 'EDUCD', 'OCCSOC', 'INDNAICS', 'INCTOT', 'FTOTINC',
       'INCWAGE', 'MIGCOUNTY1', 'MOVEDIN', 'TRANTIME'],
      dtype='object')

In [142]:
# Household/person weight and income fields pulled out for a quick look at the raw values.
ipums_eda = ipums.loc[:, [
    'HHWT',
    'HHINCOME',
    'PERNUM',
    'PERWT',
    'FAMSIZE',
    'INCTOT',
    'FTOTINC',
    'INCWAGE',
]]

In [144]:
ipums_eda.head(20)

Unnamed: 0,HHWT,HHINCOME,PERNUM,PERWT,FAMSIZE,INCTOT,FTOTINC,INCWAGE
0,11.0,9999999,1,11.0,1,9000,9999999,0
1,70.0,9999999,1,70.0,1,150,9999999,0
2,20.0,9999999,1,20.0,1,1400,9999999,1400
3,79.0,9999999,1,79.0,1,22700,9999999,0
4,53.0,9999999,1,53.0,1,0,9999999,0
5,77.0,9999999,1,77.0,1,0,9999999,0
6,8.0,9999999,1,8.0,1,36000,9999999,0
7,15.0,9999999,1,15.0,1,9300,9999999,0
8,61.0,9999999,1,61.0,1,60000,9999999,60000
9,152.0,9999999,1,152.0,1,0,9999999,0


In [146]:
# Number of distinct HHINCOME values (missing counted as a value), followed by the values themselves.
print(ipums_eda['HHINCOME'].nunique(dropna=False), ipums_eda['HHINCOME'].unique())

31854 [9999999  297000   16700 ...  501250   68724  431180]


In [3]:
ipums_titles = pd.read_csv('ind_indnaics_crosswalk_2000_onward_without_code_descriptions.csv')

In [4]:
# Discard the first two rows of the crosswalk.
# This reassigns ipums_titles from itself, so every re-run removes two more rows;
# reload the CSV before running this cell again.
ipums_titles = ipums_titles.iloc[2:, :]

In [5]:
# Keep only the two crosswalk columns used downstream (code + title).
# Selecting by name instead of by position (iloc[:, 10:]) makes the cell safe
# to re-run: a second positional slice on the already-trimmed frame would
# drop every remaining column. .copy() yields an independent frame, so the
# later in-place normalisation of the code column does not write into a slice.
ipums_titles = ipums_titles[['2018 Onward ACS/PRCS INDNAICS CODE', 'Industry Title']].copy()

In [6]:
county_info = pd.read_csv('county_to_regions_key - Sheet1.csv')

### Cleaning

In [7]:
# Narrow the extract to the geography, industry and income fields used below.
# The name `ipums` is rebound, so the remaining raw columns are no longer reachable from it.
ipums = ipums.loc[:, ['STATEFIP', 'COUNTYFIP', 'CITY', 'INDNAICS', 'INCTOT', 'INCWAGE']]

In [8]:
# Independent copy of the rows whose STATEFIP equals 6.
ca_ipums = ipums[ipums['STATEFIP'].eq(6)].copy()

In [9]:
# Renumber the rows from zero; the previous row labels are kept as an 'index' column.
ca_ipums = ca_ipums.reset_index(drop=False)

In [10]:
len(ca_ipums)

380091

In [11]:
def normalize_titles(col):
    """Normalise a Series of codes/titles so both sides of a join compare equal.

    Each non-missing value is converted to ``str``, stripped of surrounding
    whitespace, lower-cased, has ``&`` replaced by ``and``, and then has every
    character in ``string.punctuation`` removed.

    Parameters
    ----------
    col : pandas.Series
        Values to normalise (strings, numbers, or a mix).

    Returns
    -------
    pandas.Series
        Normalised strings with the same index as ``col``. Missing values stay
        missing instead of becoming the literal text ``'nan'`` / ``'None'``,
        which would otherwise look like a real code.
    """
    import string  # local import keeps the function self-contained

    # Remember which entries are missing before astype(str) stringifies them.
    missing = col.isna().to_numpy()

    # One translation table that deletes every punctuation character.
    strip_punct = str.maketrans('', '', string.punctuation)

    cleaned = (
        col.astype(str)
        .str.strip()
        .str.lower()
        # regex=False: '&' is a literal; do not depend on the pandas-version default.
        .str.replace('&', 'and', regex=False)
        .str.translate(strip_punct)
    )

    # Restore the missing entries (positional mask, so no index alignment is involved).
    return cleaned.mask(missing)

In [12]:
# Put the survey-side industry codes into the shared normalised form used as the join key.
ca_ipums['INDNAICS'] = ca_ipums['INDNAICS'].pipe(normalize_titles)

In [13]:
# Apply the same normalisation to the crosswalk-side codes so the two key columns are comparable.
ipums_titles['2018 Onward ACS/PRCS INDNAICS CODE'] = (
    ipums_titles['2018 Onward ACS/PRCS INDNAICS CODE'].pipe(normalize_titles)
)

### Merging on NAICS Codes

In [14]:
ca_ipums.columns.values

array(['index', 'STATEFIP', 'COUNTYFIP', 'CITY', 'INDNAICS', 'INCTOT',
       'INCWAGE'], dtype=object)

In [15]:
ipums_titles.columns.values

array(['2018 Onward ACS/PRCS INDNAICS CODE', 'Industry Title'],
      dtype=object)

In [16]:
# Attach industry titles to each person record.
# Inner join: records whose INDNAICS has no counterpart in the crosswalk are dropped.
ipums_merged = ca_ipums.merge(
    ipums_titles,
    how='inner',
    left_on='INDNAICS',
    right_on='2018 Onward ACS/PRCS INDNAICS CODE',
)

In [17]:
print(len(ca_ipums), len(ipums_titles), ipums_merged.shape)

380091 402 (224526, 9)


In [18]:
# Records whose industry code does not appear in the crosswalk (these are lost by the inner join).
ca_ipums.loc[
    ~ca_ipums['INDNAICS'].isin(ipums_titles['2018 Onward ACS/PRCS INDNAICS CODE'])
]

Unnamed: 0,index,STATEFIP,COUNTYFIP,CITY,INDNAICS,INCTOT,INCWAGE
1,156867,6,73,0,0,0,0
3,156869,6,71,0,0,0,0
5,156871,6,37,3730,0,36700,0
7,156873,6,37,0,0,7400,0
8,156874,6,111,0,0,8500,0
...,...,...,...,...,...,...,...
380080,536946,6,97,0,0,9999999,999999
380082,536948,6,85,0,0,0,0
380085,536951,6,59,0,0,7000,0
380087,536953,6,59,0,0,0,0


In [19]:
ipums_merged

Unnamed: 0,index,STATEFIP,COUNTYFIP,CITY,INDNAICS,INCTOT,INCWAGE,2018 Onward ACS/PRCS INDNAICS CODE,Industry Title
0,156866,6,37,5140,4853,23100,23100,4853,Taxi and limousine service
1,157639,6,37,3730,4853,53500,28000,4853,Taxi and limousine service
2,157919,6,75,6290,4853,35800,1000,4853,Taxi and limousine service
3,158355,6,75,6290,4853,2800,1000,4853,Taxi and limousine service
4,158357,6,37,3730,4853,53500,28000,4853,Taxi and limousine service
...,...,...,...,...,...,...,...,...,...
224521,480571,6,97,0,3241m,80000,65000,3241m,Miscellaneous petroleum and coal products
224522,520559,6,73,0,3241m,9600,9600,3241m,Miscellaneous petroleum and coal products
224523,523405,6,37,0,3241m,53000,53000,3241m,Miscellaneous petroleum and coal products
224524,526138,6,85,0,3241m,98000,98000,3241m,Miscellaneous petroleum and coal products


In [20]:
# Per-column null audit of the merged frame: count and share of missing values.
n_rows = len(ipums_merged)
print(f'''TOTAL ROWS: {n_rows}''')
print('\n')
for column, nulls in ipums_merged.isna().sum().items():
    percentage = (nulls / n_rows) * 100
    print(f'''{column}: {nulls} null values, {percentage:.2f}%''')

TOTAL ROWS: 224526


index: 0 null values, 0.00%
STATEFIP: 0 null values, 0.00%
COUNTYFIP: 0 null values, 0.00%
CITY: 0 null values, 0.00%
INDNAICS: 0 null values, 0.00%
INCTOT: 0 null values, 0.00%
INCWAGE: 0 null values, 0.00%
2018 Onward ACS/PRCS INDNAICS CODE: 0 null values, 0.00%
Industry Title: 0 null values, 0.00%


In [21]:
# Cardinality of every column in the merged frame (a missing value counts as one distinct value).
for column, n_unique in ipums_merged.nunique(dropna=False).items():
    print(f'''{column}: {n_unique} unique values''')

index: 224526 unique values
STATEFIP: 1 unique values
COUNTYFIP: 35 unique values
CITY: 23 unique values
INDNAICS: 269 unique values
INCTOT: 7610 unique values
INCWAGE: 889 unique values
2018 Onward ACS/PRCS INDNAICS CODE: 269 unique values
Industry Title: 269 unique values


### Merging on counties

In [22]:
# Add county/region attributes; a left join keeps every person record even without a county match.
ipums_w_counties = ipums_merged.merge(county_info, how='left', on='COUNTYFIP')

In [23]:
print(len(ipums_merged), len(county_info), ipums_w_counties.shape)

224526 58 (224526, 21)


In [24]:
ipums_w_counties.head()

Unnamed: 0,index,STATEFIP,COUNTYFIP,CITY,INDNAICS,INCTOT,INCWAGE,2018 Onward ACS/PRCS INDNAICS CODE,Industry Title,FIPS,...,State,"County, State",EDD County,Census County,Population - Households,Rural/Urban,Redstone Regions,WF Regions,CDI Regions,Population
0,156866,6,37,5140,4853,23100,23100,4853,Taxi and limousine service,6037.0,...,California,"Los Angeles, California",Los Angeles County,"Los Angeles County, California",2207265,Urban,Los Angeles,Greater Los Angeles,Los Angeles,10081570
1,157639,6,37,3730,4853,53500,28000,4853,Taxi and limousine service,6037.0,...,California,"Los Angeles, California",Los Angeles County,"Los Angeles County, California",2207265,Urban,Los Angeles,Greater Los Angeles,Los Angeles,10081570
2,157919,6,75,6290,4853,35800,1000,4853,Taxi and limousine service,6075.0,...,California,"San Francisco, California",San Francisco County,"San Francisco County, California",172190,Urban,Bay Area,Bay Area,Bay Area,874961
3,158355,6,75,6290,4853,2800,1000,4853,Taxi and limousine service,6075.0,...,California,"San Francisco, California",San Francisco County,"San Francisco County, California",172190,Urban,Bay Area,Bay Area,Bay Area,874961
4,158357,6,37,3730,4853,53500,28000,4853,Taxi and limousine service,6037.0,...,California,"Los Angeles, California",Los Angeles County,"Los Angeles County, California",2207265,Urban,Los Angeles,Greater Los Angeles,Los Angeles,10081570


In [68]:
# Working frame for the high-wage analysis: industry, income and region fields only.
# .copy() makes this an independent DataFrame rather than a slice of
# ipums_w_counties. Later cells add columns to it in place; on a slice that
# triggers pandas' SettingWithCopyWarning (silenced by the global warnings
# filter in this notebook) and the writes are not guaranteed to behave as intended.
ipums_w_counties_short = ipums_w_counties[['INDNAICS',
                                          'INCTOT',
                                          'INCWAGE',
                                          'Industry Title',
                                          'County',
                                          'CDI Regions',
                                          'Population']].copy()

In [69]:
ipums_w_counties_short

Unnamed: 0,INDNAICS,INCTOT,INCWAGE,Industry Title,County,CDI Regions,Population
0,4853,23100,23100,Taxi and limousine service,Los Angeles,Los Angeles,10081570
1,4853,53500,28000,Taxi and limousine service,Los Angeles,Los Angeles,10081570
2,4853,35800,1000,Taxi and limousine service,San Francisco,Bay Area,874961
3,4853,2800,1000,Taxi and limousine service,San Francisco,Bay Area,874961
4,4853,53500,28000,Taxi and limousine service,Los Angeles,Los Angeles,10081570
...,...,...,...,...,...,...,...
224521,3241m,80000,65000,Miscellaneous petroleum and coal products,Sonoma,Bay Area,499772
224522,3241m,9600,9600,Miscellaneous petroleum and coal products,San Diego,San Diego-Imperial,3316073
224523,3241m,53000,53000,Miscellaneous petroleum and coal products,Los Angeles,Los Angeles,10081570
224524,3241m,98000,98000,Miscellaneous petroleum and coal products,Santa Clara,Bay Area,1927470


### Statewide: % of records in each industry with wage above the state mean

In [54]:
# Median of INCTOT over all rows of the working frame.
ca_median_income = ipums_w_counties_short.loc[:, 'INCTOT'].median()
ca_median_income

40000.0

In [55]:
# Median of INCWAGE over all rows of the working frame.
ca_median_wage = ipums_w_counties_short.loc[:, 'INCWAGE'].median()
ca_median_wage

30000.0

In [56]:
# Arithmetic mean of INCTOT over all rows of the working frame.
ca_mean_income = ipums_w_counties_short.loc[:, 'INCTOT'].mean()
ca_mean_income

64166.89874223921

In [57]:
# Plain arithmetic mean of INCWAGE over every row of the working frame. This
# value is the threshold used to build the 'Above CA Mean' flag.
# NOTE(review): person weights (PERWT) were not kept in the column subset, so
# this is an unweighted sample mean, not a population estimate; confirm intended.
# NOTE(review): the raw extract contains INCWAGE values of 999999 that look like
# a not-applicable code; verify none survive into this frame before averaging.
ca_mean_wage = ipums_w_counties_short['INCWAGE'].mean()
ca_mean_wage

52498.9479614833

In [70]:
# Flag each record whose wage income is strictly greater than the statewide mean wage.
ipums_w_counties_short['Above CA Mean'] = ipums_w_counties_short['INCWAGE'].gt(ca_mean_wage)

In [71]:
# Number of records in each industry, broadcast back onto every row of that industry.
ipums_w_counties_short['Industry Counts'] = (
    ipums_w_counties_short
    .groupby('Industry Title')['Industry Title']
    .transform('count')
)

In [72]:
# Number of above-state-mean records in each industry, broadcast back onto every row.
ipums_w_counties_short['CA High wage count'] = (
    ipums_w_counties_short
    .groupby('Industry Title')['Above CA Mean']
    .transform('sum')
)

In [73]:
# Share (in percent) of each industry's records that earn above the statewide mean wage.
ipums_w_counties_short['CA High wage percentage'] = (
    ipums_w_counties_short['CA High wage count']
    .div(ipums_w_counties_short['Industry Counts'])
    .mul(100)
)

In [74]:
# One row per industry (the per-industry columns are identical on every row of an
# industry, so the first occurrence is enough), ranked by high-wage share, highest first.
high_wage_ca = (
    ipums_w_counties_short
    .drop_duplicates(subset='Industry Title')
    .loc[:, ['Industry Title', 'Industry Counts', 'CA High wage count', 'CA High wage percentage']]
    .sort_values(by='CA High wage percentage', ascending=False)
)

In [75]:
# Renumber the ranked rows 0..n-1 and discard the old row labels.
# drop=True does this directly; the previous reset_index().iloc[:, 1:] form
# inserted the old labels as a column and then sliced off "the first column",
# which only works when exactly one index column is inserted.
high_wage_ca = high_wage_ca.reset_index(drop=True)

In [76]:
high_wage_ca

Unnamed: 0,Industry Title,Industry Counts,CA High wage count,CA High wage percentage
0,Coal mining,3,3,100.000000
1,Pipeline transportation,16,14,87.500000
2,Software publishers,405,338,83.456790
3,Internet publishing and broadcasting and web s...,898,718,79.955457
4,"Engine, turbine, and power transmission equipm...",53,42,79.245283
...,...,...,...,...
264,Florists,172,8,4.651163
265,Nail salons and other personal care services,839,32,3.814064
266,Book stores and news dealers,162,6,3.703704
267,Private households,1529,49,3.204709


### By region: % of records in each industry with wage above the region mean

In [77]:
# Mean INCWAGE of each CDI region, broadcast back onto every record in that region.
ipums_w_counties_short['Region Mean Wage'] = (
    ipums_w_counties_short
    .groupby('CDI Regions')['INCWAGE']
    .transform('mean')
)

In [78]:
# Flag each record whose wage income is strictly greater than its own region's mean wage.
ipums_w_counties_short['Above Region Mean'] = (
    ipums_w_counties_short['INCWAGE'].gt(ipums_w_counties_short['Region Mean Wage'])
)

In [119]:
# Record count for every (region, industry) pair, as a flat frame.
df = (
    ipums_w_counties_short
    .groupby(['CDI Regions', 'Industry Title'])
    .size()
    .reset_index(name='Region Industry Count')
)

In [127]:
# Per (region, industry): how many records earn above their region's mean wage,
# how many records there are in total, and the resulting percentage.
# Both aggregates come from a single groupby, so the count is matched to each
# group by its key. The previous version pasted the counts in by position from
# a separately built frame (`df`, created in another cell), which relied on
# that cell having been run and on both results sharing the same row order.
high_wage_regions = ipums_w_counties_short.groupby(['CDI Regions', 'Industry Title']).agg(
    **{
        'Above Region Mean': ('Above Region Mean', 'sum'),
        'Region Industry Count': ('Above Region Mean', 'size'),
    }
)
high_wage_regions['Region High Wage Percentage'] = (
    high_wage_regions['Above Region Mean'] / high_wage_regions['Region Industry Count']) * 100

In [130]:
# Order by region name (A-Z), then by high-wage share within each region, highest first.
high_wage_regions = high_wage_regions.sort_values(
    by=['CDI Regions', 'Region High Wage Percentage'],
    ascending=[True, False],
)

In [131]:
high_wage_regions

Unnamed: 0_level_0,Unnamed: 1_level_0,Above Region Mean,Region Industry Count,Region High Wage Percentage
CDI Regions,Industry Title,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bay Area,"Engine, turbine, and power transmission equipment manufacturing",5,5,100.0
Bay Area,Metal ore mining,1,1,100.0
Bay Area,"Nonferrous metal, except aluminum, production and processing",1,1,100.0
Bay Area,Tires,1,1,100.0
Bay Area,"Veneer, plywood, and engineered wood products",1,1,100.0
...,...,...,...,...
Shasta / Cascades,"Telecommunications, except wired telecommunications carriers",0,1,0.0
Shasta / Cascades,U.S. Army,0,1,0.0
Shasta / Cascades,U.S. Navy,0,1,0.0
Shasta / Cascades,"Unemployed, with no work experience in past 5 years",0,22,0.0
