# Create hierarchical structure for IPUMS and EDD data

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import warnings
from jqi_functions import *
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format='retina'

## Creating CSV/dataframe

In [21]:
ca_ipums = cleaned_ipums('2019')

In [22]:
ca_ipums.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 224526 entries, 0 to 224525
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   YEAR              224526 non-null  int64  
 1   STATEFIP          224526 non-null  int64  
 2   COUNTYFIP         224526 non-null  int64  
 3   INDNAICS          224526 non-null  object 
 4   PERWT             224526 non-null  float64
 5   INCWAGE           224526 non-null  int64  
 6   NAICS Code        224526 non-null  object 
 7   Industry Title_x  224526 non-null  object 
 8   Industry Title_y  224526 non-null  object 
 9   Main_Code         224526 non-null  int64  
 10  Sub_1_Code        224526 non-null  int64  
 11  Sub_2_Code        224526 non-null  object 
 12  Sub_3_Code        224526 non-null  object 
 13  Sub_4_Code        224526 non-null  object 
dtypes: float64(1), int64(6), object(7)
memory usage: 25.7+ MB


In [23]:
county_info = pd.read_csv('data/county_to_regions_key - Sheet1.csv')

In [24]:
county_info.head()

Unnamed: 0,FIPS,COUNTYFIP,County,State,"County, State",EDD County,Census County,Population - Households,Rural/Urban,Redstone Regions,WF Regions,CDI Regions,Population
0,6001,1,Alameda,California,"Alameda, California",Alameda County,"Alameda County, California",383468,Urban,Bay Area,Bay Area,Bay Area,1656754
1,6013,13,Contra Costa,California,"Contra Costa, California",Contra Costa County,"Contra Costa County, California",282085,Urban,Bay Area,Bay Area,Bay Area,1142251
2,6095,95,Solano,California,"Solano, California",Solano County,"Solano County, California",107267,Urban,Bay Area,Bay Area,Bay Area,441829
3,6081,81,San Mateo,California,"San Mateo, California",San Mateo County,"San Mateo County, California",183144,Urban,Bay Area,Bay Area,Bay Area,767423
4,6085,85,Santa Clara,California,"Santa Clara, California",Santa Clara County,"Santa Clara County, California",456440,Urban,Bay Area,Bay Area,Bay Area,1927470


In [25]:
cost_of_living = pd.read_csv('data/united-way-col-1A1PS1C2019.csv') # 1 adult, 1 preschooler, 1 child

In [26]:
cost_of_living.head()

Unnamed: 0,Regions,Cost of Living
0,Bay Area,93392
1,Central Coast,76493
2,Central Valley,56747
3,Inland Empire,63170
4,Los Angeles,80216


In [27]:
ca_ipums = pd.merge(ca_ipums, county_info, on = 'COUNTYFIP')

In [31]:
ca_ipums = ca_ipums[['INDNAICS', 'PERWT', 'INCWAGE',
       'NAICS Code', 'Industry Title_x', 'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code', 'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions']]

In [32]:
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'CDI Regions', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'Regional COL'})

In [33]:
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'Rural/Urban', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'Rural/Urban COL'})

In [34]:
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'County', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'County COL'})

In [36]:
ca_ipums = ca_ipums[['INDNAICS', 'PERWT', 'INCWAGE',
       'NAICS Code', 'Industry Title_x', 'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code', 'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions',
                    'Regional COL', 'Rural/Urban COL', 'County COL']]

In [37]:
ca_ipums['Regional Rural/Urban'] = ca_ipums['CDI Regions'] + ' ' + ca_ipums['Rural/Urban']

In [38]:
ca_ipums = pd.merge(ca_ipums, cost_of_living, left_on = 'Regional Rural/Urban', right_on = 'Regions')
ca_ipums = ca_ipums.rename(columns = {'Cost of Living':'Regional Rural/Urban COL'})

In [41]:
ca_ipums = ca_ipums[['INDNAICS', 'PERWT', 'INCWAGE',
       'NAICS Code', 'Industry Title_x', 'Main_Code', 'Sub_1_Code', 'Sub_2_Code', 'Sub_3_Code',
       'Sub_4_Code', 'County', 'Rural/Urban', 'CDI Regions',
                    'Regional COL', 'Rural/Urban COL', 'County COL', 'Regional Rural/Urban COL']]

In [50]:
ca_ipums['Industry Title_x'] = normalize_titles(ca_ipums['Industry Title_x'])

In [51]:
ca_ipums

Unnamed: 0,INDNAICS,PERWT,INCWAGE,NAICS Code,Industry Title_x,Main_Code,Sub_1_Code,Sub_2_Code,Sub_3_Code,Sub_4_Code,County,Rural/Urban,CDI Regions,Regional COL,Rural/Urban COL,County COL,Regional Rural/Urban COL
0,4853,21.0,23100,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,80216,79472,80216,80216
1,4853,21.0,23100,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,80216,79472,80216,80216
2,4853,21.0,23100,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,80216,79472,80216,80216
3,4853,21.0,23100,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,80216,79472,80216,80216
4,4853,11.0,28000,4853,taxi and limousine service,400,430,480,483,483,Los Angeles,Urban,Los Angeles,80216,79472,80216,80216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488572,3113,558.0,5300,3113,sugar and confectionery products,300,310,311,311,311,Humboldt,Rural,Redwood Coast,58625,58812,59608,58625
488573,3113,244.0,5300,3113,sugar and confectionery products,300,310,311,311,311,Humboldt,Rural,Redwood Coast,58625,58812,59608,58625
488574,4247,117.0,50000,4247,petroleum and petroleum products merchant whol...,400,420,424,424d,424d,Humboldt,Rural,Redwood Coast,58625,58812,59608,58625
488575,42491,148.0,40000,42491,farm supplies merchant wholesalers,400,420,424,424,424,Humboldt,Rural,Redwood Coast,58625,58812,59608,58625


## Merge with EDD Data

In [47]:
edd = pd.read_csv('data/edd_merged.csv')

In [48]:
edd['Area Name'] = edd['Area Name'].str.replace(' County', '')
edd = edd.loc[edd['Area Type'] == 'County']

In [49]:
edd.head()

Unnamed: 0,Industry Title,LMID Industry Title,Parsed_Code,Area Type,Area Name,Date,Seasonally Adjusted,Current Employment,Main_EDD,Main_Code,Sub_1,Sub_1_Code,Sub_2,Sub_2_Code,Sub_3,Sub_3_Code,Sub_4,Sub_4_Code
0,county,county,939,County,Madera,3/1/19,N,1600,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
1,county,county,939,County,Fresno,1/1/19,N,7800,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
2,county,county,939,County,Kern,1/1/19,N,9900,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
3,county,county,939,County,Los Angeles,1/1/19,N,106800,government,900,state and local government,940,local government,930,local government excluding education,932,county,939
4,county,county,939,County,Madera,1/1/19,N,1600,government,900,state and local government,940,local government,930,local government excluding education,932,county,939


In [56]:
ca_ipums_edd = pd.merge(ca_ipums, edd, left_on=['County', 'Industry Title_x'], right_on=['Area Name', 'Sub_4'])

In [61]:
ca_ipums_edd.head().T

Unnamed: 0,0,1,2,3,4
INDNAICS,5241,5241,5241,5241,5241
PERWT,36.0,36.0,36.0,36.0,36.0
INCWAGE,110000,110000,110000,110000,110000
NAICS Code,5241,5241,5241,5241,5241
Industry Title_x,insurance carriers,insurance carriers,insurance carriers,insurance carriers,insurance carriers
Main_Code_x,520,520,520,520,520
Sub_1_Code_x,521,521,521,521,521
Sub_2_Code_x,524,524,524,524,524
Sub_3_Code_x,524a,524a,524a,524a,524a
Sub_4_Code_x,524a,524a,524a,524a,524a


In [59]:
ca_ipums_edd.columns.values

array(['INDNAICS', 'PERWT', 'INCWAGE', 'NAICS Code', 'Industry Title_x',
       'Main_Code_x', 'Sub_1_Code_x', 'Sub_2_Code_x', 'Sub_3_Code_x',
       'Sub_4_Code_x', 'County', 'Rural/Urban', 'CDI Regions',
       'Regional COL', 'Rural/Urban COL', 'County COL',
       'Regional Rural/Urban COL', 'Industry Title',
       'LMID Industry Title', 'Parsed_Code', 'Area Type', 'Area Name',
       'Date', 'Seasonally Adjusted', 'Current Employment', 'Main_EDD',
       'Main_Code_y', 'Sub_1', 'Sub_1_Code_y', 'Sub_2', 'Sub_2_Code_y',
       'Sub_3', 'Sub_3_Code_y', 'Sub_4', 'Sub_4_Code_y'], dtype=object)

In [63]:
ca_ipums_edd = ca_ipums_edd[['INDNAICS', 'PERWT', 'INCWAGE', 'NAICS Code',
                            'County', 'Rural/Urban', 'CDI Regions',
                           'Regional COL', 'Rural/Urban COL', 'County COL',
                           'Regional Rural/Urban COL', 'LMID Industry Title', 'Area Type',
                           'Date', 'Seasonally Adjusted', 'Current Employment', 'Main_EDD',
                           'Main_Code_y', 'Sub_1', 'Sub_1_Code_y', 'Sub_2', 'Sub_2_Code_y',
                           'Sub_3', 'Sub_3_Code_y', 'Sub_4', 'Sub_4_Code_y']]

In [64]:
ca_ipums_edd = ca_ipums_edd.rename(columns={"Main_Code_y": "Main_Code",
                                           "Sub_1_Code_y": "Sub_1_Code",
                                           "Sub_2_Code_y": "Sub_2_Code",
                                           "Sub_3_Code_y": "Sub_3_Code",
                                           "Sub_4_Code_y": "Sub_4_Code"})

In [66]:
ca_ipums_edd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 741060 entries, 0 to 741059
Data columns (total 26 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   INDNAICS                  741060 non-null  object 
 1   PERWT                     741060 non-null  float64
 2   INCWAGE                   741060 non-null  int64  
 3   NAICS Code                741060 non-null  object 
 4   County                    741060 non-null  object 
 5   Rural/Urban               741060 non-null  object 
 6   CDI Regions               741060 non-null  object 
 7   Regional COL              741060 non-null  int64  
 8   Rural/Urban COL           741060 non-null  int64  
 9   County COL                741060 non-null  int64  
 10  Regional Rural/Urban COL  741060 non-null  int64  
 11  LMID Industry Title       741060 non-null  object 
 12  Area Type                 741060 non-null  object 
 13  Date                      741060 non-null  o

In [68]:
# ca_ipums_edd.to_csv('data/ca_ipums_full_hierarchy.csv', encoding='utf-8', index=False)

### Checking what industries from original EDD data were dropped

In [72]:
edd_antijoin = edd[~edd['LMID Industry Title'].isin(ca_ipums_edd['Sub_4'].unique())]

In [73]:
len(edd_antijoin)

3144

In [75]:
edd_antijoin['LMID Industry Title'].unique()

array(['county', 'total farm', 'state government education', 'city',
       'local government education', 'general merchandise stores',
       'state government excluding education', 'department of defense',
       'federal government excluding department of defense',
       'building foundation and exterior contractors',
       'residential building construction',
       'building finishing contractors',
       'travel arrangement and reservation services',
       'health and personal care stores',
       'bakeries and tortilla manufacturing',
       'primary metal manufacturing', 'lessors of real estate',
       'prof and commercial equip merchant wholesalers',
       'mgmt scientific and technical consulting services',
       'transit and ground passenger transportation',
       'limitedservice eating places', 'grocery stores',
       'sporting goods hobby and musical instrument store',
       'textile mills', 'offices of real estate agents and brokers',
       'agencies broker and 

## Attempting sample size function

In [77]:
ca_ipums_edd.head().T

Unnamed: 0,0,1,2,3,4
INDNAICS,5241,5241,5241,5241,5241
PERWT,36.0,36.0,36.0,36.0,36.0
INCWAGE,110000,110000,110000,110000,110000
NAICS Code,5241,5241,5241,5241,5241
County,Los Angeles,Los Angeles,Los Angeles,Los Angeles,Los Angeles
Rural/Urban,Urban,Urban,Urban,Urban,Urban
CDI Regions,Los Angeles,Los Angeles,Los Angeles,Los Angeles,Los Angeles
Regional COL,80216,80216,80216,80216,80216
Rural/Urban COL,79472,79472,79472,79472,79472
County COL,80216,80216,80216,80216,80216


In [96]:
def add_to_state_df(df):
    df['Above CA Threshold'] = df['INCWAGE'] > 74448
    df['wt_ind_counts'] = df['PERWT'].groupby(df['INDNAICS']).transform('sum')
    df["Above CA Threshold"] = df["Above CA Threshold"].astype(int)
    df["wt_CA_above_thresh"] = df["Above CA Threshold"] * df['PERWT']
    df['wt_CA_high_wage_count'] = df['wt_CA_above_thresh'].groupby(
        df['INDNAICS']).transform('sum')
    df['wt_CA_high_wage_perc'] = (df['wt_CA_high_wage_count'] / df['wt_ind_counts']) * 100
    df['unwt_ind_counts'] = df['INDNAICS'].groupby(df['INDNAICS']).transform('count')
    return df

#### Rural/Urban level

In [145]:
def add_to_community_df(df):
    df['above_comm_thresh'] = df['INCWAGE'] > df['Rural/Urban COL']
    df_agg = df.groupby(['INDNAICS','Rural/Urban']).agg(wt_comm_ind_counts = ('PERWT','sum')).reset_index()
    df = pd.merge(df, df_agg, on='INDNAICS')
    df["above_comm_thresh"] = df["above_comm_thresh"].astype(int)
    df["wt_comm_above_thresh"] = df["above_comm_thresh"] * df['PERWT']
    df_agg = df.groupby(['INDNAICS','Rural/Urban_x']).agg(wt_comm_high_wage_count = ('wt_comm_above_thresh','sum')).reset_index()
    df = pd.merge(df, df_agg, on='INDNAICS')
    df['wt_comm_high_wage_perc'] = (df['wt_comm_high_wage_count'] / df['wt_comm_ind_counts']) * 100
    df_agg = df.groupby(['INDNAICS','Rural/Urban_x_x']).agg(unwt_comm_ind_counts = ('INDNAICS','count')).reset_index()
    df = pd.merge(df, df_agg, on='INDNAICS')
    return df

#### Regional Level

In [149]:
def add_to_region_df(df):
    df['above_region_thresh'] = df['INCWAGE'] > df['Regional COL']
    df_agg = df.groupby(['INDNAICS','CDI Regions']).agg(wt_reg_ind_counts = ('PERWT','sum')).reset_index()
    df = pd.merge(df, df_agg, on='INDNAICS')
    df["above_region_thresh"] = df["above_region_thresh"].astype(int)
    df["wt_reg_above_thresh"] = df["above_region_thresh"] * df['PERWT']
    df_agg = df.groupby(['INDNAICS','CDI Regions_x']).agg(wt_reg_high_wage_count = ('wt_reg_above_thresh','sum')).reset_index()
    df = pd.merge(df, df_agg, on='INDNAICS')
    df['wt_reg_high_wage_perc'] = (df['wt_reg_high_wage_count'] / df['wt_reg_ind_counts']) * 100
    df_agg = df.groupby(['INDNAICS','CDI Regions_x_x']).agg(unwt_comm_ind_counts = ('INDNAICS','count')).reset_index()
    df = pd.merge(df, df_agg, on='INDNAICS')
    return df

#### Regional Rural/Urban Level

In [99]:
def add_to_regioncomm_df(df):
    df['above_regcomm_thresh'] = df['INCWAGE'] > df['Regional Rural/Urban COL']
    df['wt_regcomm_ind_counts'] = df['PERWT'].groupby(df['INDNAICS']).transform('sum')
    df["above_regcomm_thresh"] = df["above_regcomm_thresh"].astype(int)
    df["wt_regcomm_above_thresh"] = df["above_regcomm_thresh"] * df['PERWT']
    df['wt_regcomm_hw_count'] = df['wt_regcomm_above_thresh'].groupby(
        df['INDNAICS']).transform('sum')
    df['wt_regcomm_hw_perc'] = (df['wt_regcomm_hw_count'] / df['wt_regcomm_ind_counts']) * 100
    df['unwt_regcomm_ind_counts'] = df['INDNAICS'].groupby(df['INDNAICS']).transform('count')
    return df

#### County Level

In [100]:
def add_to_county_df(df):
    df['above_county_thresh'] = df['INCWAGE'] > df['County COL']
    df['wt_county_ind_counts'] = df['PERWT'].groupby(df['INDNAICS']).transform('sum')
    df["above_county_thresh"] = df["above_county_thresh"].astype(int)
    df["wt_county_above_thresh"] = df["above_county_thresh"] * df['PERWT']
    df['wt_county_hw_count'] = df['wt_county_above_thresh'].groupby(
        df['INDNAICS']).transform('sum')
    df['wt_county_hw_perc'] = (df['wt_county_hw_count'] / df['wt_county_ind_counts']) * 100
    df['unwt_county_ind_counts'] = df['INDNAICS'].groupby(df['INDNAICS']).transform('count')
    return df

In [150]:
def add_geo_high_wages(df): # need to fix - giving same numbers for each geo level
    df_new = df.copy()
    # creating state level counts
    df_new = add_to_state_df(df_new)
    
    # creating rural/urban level counts
    df_new = add_to_community_df(df_new)
    
    # creating regional level counts
    df_new = add_to_region_df(df_new)
    
    # creating regional rural/urban level counts
    df_new = add_to_regioncomm_df(df_new)
    
    # creating county level counts
    df_new = add_to_county_df(df_new)
    
    # filter out dead columns
    pass
    return df_new

In [151]:
ca_ipums_edd_hw = add_geo_high_wages(ca_ipums_edd)

In [153]:
ca_ipums_edd_hw.tail().T

Unnamed: 0,44796631,44796632,44796633,44796634,44796635
INDNAICS,5112,5112,5112,5112,5112
PERWT,105.0,105.0,105.0,105.0,105.0
INCWAGE,200000,200000,200000,200000,200000
NAICS Code,5112,5112,5112,5112,5112
County,San Diego,San Diego,San Diego,San Diego,San Diego
Rural/Urban_x_x_x,Urban,Urban,Urban,Urban,Urban
CDI Regions_x_x_x,San Diego-Imperial,San Diego-Imperial,San Diego-Imperial,San Diego-Imperial,San Diego-Imperial
Regional COL,51652,51652,51652,51652,51652
Rural/Urban COL,79472,79472,79472,79472,79472
County COL,79472,79472,79472,79472,79472


In [None]:
def ca_ipums_edd_filter(df, county, NAICS, n):
    df = df.loc[df['County'] == county].copy()
    if len(df) == 0:
        return "County not valid"
    df = df.loc[df['INDNAICS'] == NAICS].copy()
    if len(df) == 0:
        return "NAICS Code not valid"
    df = df.drop_duplicates(subset='INDNAICS').reset_index().iloc[:,1:]
    if df['unwt_county_ind_counts'].values[0] >= n:
        return f"County: {county}, Geographical level used: County, High wage percentage: {df['wt_county_hw_perc'].values[0]}"
    elif df['unwt_regcomm_ind_counts'].values[0] >= n:
        return f"County: {county}, Geographical level used: Regional Rural/Urban, High wage percentage: {df['wt_regcomm_hw_perc'].values[0]}"
    elif df['unwt_reg_ind_counts'].values[0] >= n:
        return f"County: {county}, Geographical level used: Regional, High wage percentage: {df['wt_reg_high_wage_perc'].values[0]}"
    elif df['unwt_comm_ind_counts'].values[0] >= n:
        return f"County: {county}, Geographical level used: Rural/Urban, High wage percentage: {df['wt_comm_high_wage_perc'].values[0]}"
    elif df['unwt_ind_counts'].values[0] >= n:
        return f"County: {county}, Geographical level used: State, High wage percentage: {df['wt_CA_high_wage_perc'].values[0]}"
    else:
        return "Not enough records available to satisfy sample size request"

In [71]:
def check_sample_size(n, df, geo_level: str):
    df_new = df.copy()
    # creating state level counts
    df_new = add_to_state_df(df_new)
    
    # creating rural/urban level counts
    df_new = add_to_community_df(df_new)
    
    # creating regional level counts
    df_new = add_to_region_df(df_new)
    
    # creating regional rural/urban level counts
    df_new = add_to_regioncomm_df(df_new)
    
    # creating county level counts
    df_new = add_to_county_df(df_new)
    
    # geo_level conditions
    if geo_level == 'state':
        # drop everything besides state
        df_new = df_new[df_new.unwt_ind_counts > n]
    elif geo_level == 'rural/urban':
        # drop everything besides state, rural/urban
        df_new = df_new[df_new.unwt_comm_ind_counts > n]
    elif geo_level == 'regional':
        # ...
        df_new = df_new[df_new.unwt_reg_ind_counts > n]
    elif geo_level == 'regional rural/urban':
        df_new = df_new[df_new.unwt_regcomm_ind_counts > n]
    elif geo_level == 'county':
        df_new = df_new[df_new.unwt_county_ind_counts > n]
    else:
        return 'Invalid Geographical Level'
    
#     antijoin = df[~df['Industry Title'].isin(df_new['Industry Title'])]
#     antijoin = antijoin['Industry Title'].unique()
    return df_new, antijoin

In [85]:
# for a given N, what industries are missing in california that appear in the EDD data?
# go off EDD data for list of industries
# do EDD merge before sample size check - merge on parsed codes
# double check EDD parsing - accounting for duplicate industries properly?
# add EDD industry title/code columns, and geo level counts for each, differentiate from ipums industry

In [82]:
ca_ipums_new, ca_ipums_aj = check_sample_size(50, ca_ipums, 'rural/urban')

In [86]:
ca_ipums_new

Unnamed: 0,INDNAICS,PERWT,INCWAGE,NAICS Code,Industry Title,Parsed_Code,County,Rural/Urban,CDI Regions,Regional COL,...,wt_regcomm_above_thresh,wt_regcomm_hw_count,wt_regcomm_hw_perc,unwt_regcomm_ind_counts,above_county_thresh,wt_county_ind_counts,wt_county_above_thresh,wt_county_hw_count,wt_county_hw_perc,unwt_county_ind_counts
0,4853,21.0,23100,4853,Taxi and limousine service,485,Los Angeles,Urban,Los Angeles,80216,...,0.0,9408.0,3.046524,2786,0,308811.0,0.0,9206.0,2.981111,2786
1,4853,21.0,23100,4853,Taxi and limousine service,485,Los Angeles,Urban,Los Angeles,80216,...,0.0,9408.0,3.046524,2786,0,308811.0,0.0,9206.0,2.981111,2786
2,4853,21.0,23100,4853,Taxi and limousine service,485,Los Angeles,Urban,Los Angeles,80216,...,0.0,9408.0,3.046524,2786,0,308811.0,0.0,9206.0,2.981111,2786
3,4853,21.0,23100,4853,Taxi and limousine service,485,Los Angeles,Urban,Los Angeles,80216,...,0.0,9408.0,3.046524,2786,0,308811.0,0.0,9206.0,2.981111,2786
4,4853,11.0,28000,4853,Taxi and limousine service,485,Los Angeles,Urban,Los Angeles,80216,...,0.0,9408.0,3.046524,2786,0,308811.0,0.0,9206.0,2.981111,2786
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488571,3211,102.0,28800,3211,Sawmills and wood preservation,321,Humboldt,Rural,Redwood Coast,58625,...,0.0,1470.0,15.291792,95,0,9613.0,0.0,1470.0,15.291792,95
488572,3113,558.0,5300,3113,Sugar and confectionery products,311,Humboldt,Rural,Redwood Coast,58625,...,0.0,1512.0,10.114389,145,0,14949.0,0.0,1512.0,10.114389,145
488573,3113,244.0,5300,3113,Sugar and confectionery products,311,Humboldt,Rural,Redwood Coast,58625,...,0.0,1512.0,10.114389,145,0,14949.0,0.0,1512.0,10.114389,145
488574,4247,117.0,50000,4247,Petroleum and petroleum products merchant whol...,424,Humboldt,Rural,Redwood Coast,58625,...,0.0,4594.0,30.886110,169,0,14874.0,0.0,4705.0,31.632379,169


In [83]:
ca_ipums_aj, len(ca_ipums_aj)

(array(['Carpet and rug mills',
        'Veneer, plywood, and engineered wood products',
        'Military reserves or national guard', 'Agricultural chemicals',
        'Pottery, ceramics, and plumbing fixture manufacturing',
        'Prefabricated wood buildings and mobile homes', 'Logging',
        'Clay building material and refractories manufacturing',
        'Knitting fabric mills, and apparel knitting mills',
        'Farm product raw materials, merchant wholesalers ', 'Ordnance',
        'Agricultural implements', 'Railroad rolling stock manufacturing',
        'Pipeline transportation', 'Tobacco', 'Coal mining',
        'Miscellaneous petroleum and coal products', 'Metal ore mining'],
       dtype=object),
 18)

In [84]:
ns = [5, 10, 15, 20, 30, 40, 50, 60, 70]
levels = ['state', 'rural/urban', 'regional', 'regional rural/urban', 'county']

In [None]:
for level in levels:
    for n in ns:
        ca_ipums_new, ca_ipums_aj = check_sample_size(n, ca_ipums, level)
        print(f'Level)
        print(ca_ipums_aj, len(ca_ipums_aj))