In [29]:
import pandas as pd
import seaborn as sns

## SUSB Tables
SUSB is an annual series that provides national and subnational data on the distribution of economic data by establishment industry & enterprise size.

https://www.census.gov/programs-surveys/susb.html

The county information is not in easily extractable tables, which makes the data aggregation irritating. 

In [109]:
susb = pd.DataFrame()

rename_map = {'COUNTY_DESCRIPTION':'County_Name',
              'NUMBER_OF_FIRMS':'Firms',
              'NUMBER_OF_ESTABLISHMENTS':'Establishments',
              'EMPLOYMENT':'Employment',
              'ANNUAL_PAYROLL_($1,000)':'Payroll_$1000',
              'Annual_Payroll\n($1,000)':'Payroll_$1000',
              'Annual_Payroll_($1,000)':'Payroll_$1000',
              'ENTERPRISE_EMPLOYMENT_SIZE':'Enterprise_Size',
              'STATE_DESCRIPTION':'State_Name',
              'FIPS_COUNTY_CODE':'FIPS_County',
              'County':'FIPS_County'
             }

urls = ['https://www2.census.gov/programs-surveys/susb/tables/2010/county_totals_2010.xls',
        'https://www2.census.gov/programs-surveys/susb/tables/2011/county_totals_2011.xls',
        'https://www2.census.gov/programs-surveys/susb/tables/2012/county_totals_2012.xls',
        'https://www2.census.gov/programs-surveys/susb/tables/2013/county_totals_2013.xlsx',
        'https://www2.census.gov/programs-surveys/susb/tables/2014/county_totals_2014.xlsx',
        'https://www2.census.gov/programs-surveys/susb/tables/2015/county_totals_2015.xlsx',
        'https://www2.census.gov/programs-surveys/susb/tables/2016/county_totals_2016.xlsx',
        'https://www2.census.gov/programs-surveys/susb/tables/2017/county_totals_2017.xlsx',
        'https://www2.census.gov/programs-surveys/susb/tables/2018/county_3digitnaics_2018.xlsx',
        'https://www2.census.gov/programs-surveys/susb/tables/2019/county_3digitnaics_2019.xlsx']

urls2 = ['https://www2.census.gov/programs-surveys/susb/tables/2010/county_totals_2010.xls',
        'https://www2.census.gov/programs-surveys/susb/tables/2011/county_totals_2011.xls',
        'https://www2.census.gov/programs-surveys/susb/tables/2012/county_totals_2012.xls',
        'https://www2.census.gov/programs-surveys/susb/tables/2013/county_totals_2013.xlsx',
        'https://www2.census.gov/programs-surveys/susb/tables/2014/county_totals_2014.xlsx',
        'https://www2.census.gov/programs-surveys/susb/tables/2015/county_totals_2015.xlsx',
        'https://www2.census.gov/programs-surveys/susb/tables/2016/county_totals_2016.xlsx',
        'https://www2.census.gov/programs-surveys/susb/tables/2017/county_totals_2017.xlsx']
j = 2010

def get_skiprows(n):
    """
    Likely not the best way to do this, but returns the right skiprows for each dataset"""
    if n < 2012 or n ==2013:
        return [0,1,2,3,5,6]
    elif n == 2012:
        return [0,1,2,3,4,6,7]
    # elif n == 2013:
    #     return [0,1,2,3,5,6]
    elif n < 2016:
        return [0,1,2,3,4,6,7]
    elif n < 2017:
        return [0,1,2,3,4,5,7,8]
    elif n < 2018:
        return 2
    else:
        return 2

for i in urls:
    df = pd.read_excel(i,skiprows=get_skiprows(j)) # bad excel tables need custom skipped row
    df.columns = df.columns.str.strip() # remove leading/tailing whitespace
    df.columns = df.columns.str.replace(r'\s+','_',regex=True) # regex removes multiple spaces with _
    df = df.rename(rename_map,axis=1)
    df.Enterprise_Size = df.Enterprise_Size.replace(r'\s+', ' ', regex=True)
    df = df.replace(dict.fromkeys(['1: Total','01: Total'], 'Total'))
    if 'NAICS_Description' in df.columns:
        df = df.query('State_Name=="Virginia"&Enterprise_Size=="Total"&NAICS_Description=="Total"')\
               .loc[:,['FIPS_County','County_Name','Firms','Establishments','Employment','Payroll_$1000']]
    else:
        df = df.query('State_Name=="Virginia"&Enterprise_Size=="Total"')\
               .loc[:,['FIPS_County','County_Name','Firms','Establishments','Employment','Payroll_$1000']]
    df['year']=j
    j+=1
    susb = pd.concat([susb, df])

In [105]:
susb

Unnamed: 0,FIPS_County,County_Name,Firms,Establishments,Employment,Payroll_$1000,year
14192,1,Accomack,734,826,9443,256121,2010
14197,3,Albemarle,2297,2514,36318,1611944,2010
14202,5,Alleghany,238,253,2365,64972,2010
14207,7,Amelia,280,286,1866,52307,2010
14212,9,Amherst,567,601,6697,193779,2010
...,...,...,...,...,...,...,...
467241,810,Virginia Beach city,9538,11316,160538,6931230,2019
467586,820,Waynesboro city,569,611,9103,336617,2019
467738,830,Williamsburg city,456,524,9416,292213,2019
467865,840,Winchester city,1231,1354,25529,1063623,2019


In [103]:
len(susb.FIPS_County.unique())

135

In [133]:
susb.dtypes

FIPS_County        int64
County_Name       object
Firms              int64
Establishments     int64
Employment         int64
Payroll_$1000      int64
year               int64
dtype: object

## Small Area Poverty Income Estimates
https://www.census.gov/programs-surveys/saipe.html

These tables use a regression to estimate poverty percentage per county with 90% intervals. I haven't gone into the regression estimates. 

In [104]:
pov_tot = pd.DataFrame()

for i in range(10,20):
    url = 'https://www2.census.gov/programs-surveys/saipe/datasets/20'+str(i)+'/20'+str(i)+'-state-and-county/est'+str(i)+'-va.txt'
    pov = pd.read_fwf(url, usecols=[1,5,20,29],names = ['FIPS_County','Percent_in_Poverty','Median_Income','County_Name'])
    pov['year'] = '20'+str(i)
    pov_tot = pd.concat([pov_tot, pov])

In [56]:
pov_tot.Percent_in_Poverty = pov_tot.Percent_in_Poverty.astype(float)
pov_tot.year = pov_tot.year.astype(int)

In [132]:
len(pov_tot.FIPS_County.unique())

135

In [130]:
pov_tot.dtypes

FIPS_County             int64
Percent_in_Poverty    float64
Median_Income           int64
County_Name            object
year                   object
dtype: object

## VA Economic Indicators Data

### Brian to add info

Is there anyway to get this data with the FIPS code? Would make the merge much easier

In [111]:
econ = pd.read_csv('va_economic_indicators.csv')

In [115]:
econ = econ.rename({'GeoName':'County_Name'},axis=1)
econ_year = econ.loc[(econ['year'] > 2009) & (econ['year'] < 2020)]
econ_year.County_Name = econ.County_Name.str.split(',').str[0]

econreplace = {
    'Albemarle + Charlottesville':'Albemarle County',
    'Alexandria (Independent City)':'Alexandria city',
    'Alleghany + Covington':'Alleghany County',
    'Campbell + Lynchburg':'Campbell County',
    'Carroll + Galax':'Carroll County',
    'Chesapeake (Independent City)':'Chesapeake city',
    'Frederick + Winchester':'Frederick County',
    'Greensville + Emporia':'Greensville County',
    'Hampton (Independent City)':'Hampton city',
    'Henry + Martinsville':'Henry County',
    'James City + Williamsburg':'James City County',
    'Montgomery + Radford':'Montgomery County',
    'Newport News (Independent City)':'Newport News city',
    'Norfolk (Independent City)':'Norfolk city',
    'Pittsylvania + Danville':'Pittsylvania County',
    'Portsmouth (Independent City)':'Portsmouth city',
    'Prince George + Hopewell':'Prince George County',
    'Richmond (Independent City)':'Richmond city',
    'Roanoke (Independent City)':'Roanoke city',
    'Roanoke + Salem':'Roanoke County',
    'Rockingham + Harrisonburg':'Rockingham County',
    'Southampton + Franklin':'Southampton County',
    'Spotsylvania + Fredericksburg':'Spotsylvania County',
    'Suffolk (Independent City)':'Suffolk city',
    'Virginia Beach (Independent City)':'Virginia Beach city',
    'Washington + Bristol':'Washington County',
    'Wise + Norton':'Wise County',
    'York + Poquoson':'York County'
}
econ_year = econ_year.replace(econreplace)

econ_year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  econ_year.County_Name = econ.County_Name.str.split(',').str[0]


Unnamed: 0.1,Unnamed: 0,County_Name,Earnings by place of work,Employee and self-employed contributions for government social insurance,Employer contributions for employee pension and insurance funds 8,Employer contributions for government social insurance,Equals: Net earnings by place of residence,Farm income 2,Farm proprietors' income,Less: Contributions for government social insurance 5,...,"Plus: Dividends, interest, and rent 7",Plus: Personal current transfer receipts,Population (persons) 3,Proprietors employment,Proprietors' income 9,Supplements to wages and salaries,Total employment,Wage and salary employment,Wages and salaries,year
4346,0,Accomack,669368,42187,95329,69258,603275,30526,23563,76816,...,222749,296704,33150,3731,74730,129958,18051,14320,464680,2010
4347,1,Albemarle County,5972951,341821,735779,581248,4301638,-1140,-10408,632445,...,2180543,786560,142663,22196,844150,1026403,112944,90748,4102398,2010
4348,2,Alexandria city,10033478,549450,1045199,1056772,8022104,0,0,1077836,...,2015709,632530,140737,19658,1340703,1573585,123544,103886,7119190,2010
4349,3,Alleghany County,451606,31611,57444,51702,381859,-2042,-2322,57462,...,102743,219003,22144,1352,16895,83295,9903,8551,351416,2010
4350,4,Amelia,143066,9696,15259,13136,267116,16485,12806,16264,...,59809,98657,12741,1662,34267,21827,4364,2702,86972,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5401,101,Washington County,1670965,124175,221479,176628,1428899,-703,-3802,212489,...,588026,830196,70888,8106,152990,309793,39157,31051,1208182,2019
5402,102,Westmoreland,208451,18313,26153,18944,412151,23645,19017,27785,...,193234,207575,18029,2015,43838,35625,5906,3891,128988,2019
5403,103,Wise County,839644,65536,129250,91456,631798,-239,-338,111264,...,175647,561252,41410,2681,43386,174978,19026,16345,621280,2019
5404,104,Wythe,612425,46401,86564,66536,533289,-4515,-6986,79669,...,150210,336554,28618,3180,39163,119832,15203,12023,453430,2019


In [113]:
len(econ_year.County_Name.unique())

106

We lose some counties/cities compared to the previous data sets. I'm not sure if there is anything we can do since some cities are grouped with counties. 

In [124]:
df = pd.merge(pov_tot, susb, how='inner', on=['FIPS_County'])

In [122]:
df

Unnamed: 0,FIPS_County,Percent_in_Poverty,Median_Income,County_Name_x,year_x,County_Name_y,Firms,Establishments,Employment,Payroll_$1000,year_y
0,1,20.5,37312,Accomack County,2010,Accomack,734,826,9443,256121,2010
1,1,20.5,37312,Accomack County,2010,Accomack,699,786,9030,253150,2011
2,1,20.5,37312,Accomack County,2010,Accomack,658,742,8422,249829,2014
3,1,20.5,37312,Accomack County,2010,Accomack,653,738,8344,241161,2015
4,1,20.5,37312,Accomack County,2010,Accomack,647,733,8679,247464,2016
...,...,...,...,...,...,...,...,...,...,...,...
10655,840,13.3,60254,Winchester city,2019,Winchester city,1230,1350,24843,982521,2015
10656,840,13.3,60254,Winchester city,2019,Winchester city,1233,1361,25540,986630,2016
10657,840,13.3,60254,Winchester city,2019,Winchester city,1250,1372,25971,1045843,2017
10658,840,13.3,60254,Winchester city,2019,Winchester city,1259,1380,26565,1064589,2018
