In [1]:
# Clear all user-defined names from the kernel namespace before anything runs;
# -f skips the interactive confirmation prompt.
%reset -f

In [2]:
# Data
import pandas as pd
import numpy as np
import datetime

# Notebook Settings
import os

In [3]:
# Set the working directory so the relative './clean/...' paths used by the
# read_csv cells below resolve.
# The location can be overridden with the MSBA_320_PROJECT_DIR environment
# variable; when it is unset, the original project path is used unchanged.
# (The former bare `os.getcwd()` call was dropped: it was not the last
# expression of the cell, so its result was neither displayed nor stored.)
os.chdir(os.environ.get(
    'MSBA_320_PROJECT_DIR',
    '/home/ad-frazier/data_science/MSBA_320/final_project',
))

# Display settings: show every column of wide frames (no truncation) and
# render floats with exactly two decimal places.
pd.options.display.max_columns = None
pd.options.display.float_format = lambda value: '%.2f' % value

## Join Operations

#### Import GDP Data (the core dataset that the other tables are joined onto)

In [4]:
# Load the cleaned state-level annual GDP table; it is the left side of the first join.
df_gdp = pd.read_csv(filepath_or_buffer='./clean/gdp_state_annual_clean.csv')

In [5]:
# Peek at the first two GDP rows to confirm the column layout.
df_gdp.head(n=2)

Unnamed: 0,state,year,subsidies,taxes_on_production_and_imports_(topi),compensation,current-dollar_gdp,gross_operating_surplus
0,AK,1997,-62900000.0,2464700000.0,12347800000.0,25810800000.0,11061300000.0
1,AK,1998,-54700000.0,2003900000.0,12889600000.0,24227500000.0,9388700000.0


In [6]:
# Join-integrity check for the GDP frame: number of distinct states and the
# year range covered. These are compared against the other tables before merging.
print('unique states:', df_gdp.state.unique().shape[0])
print('earliest year:', df_gdp.year.min())
# Label fixed: this line reports the maximum year, so it is the latest, not the earliest.
print('latest year:', df_gdp.year.max())

unique states: 51
earliest year: 1997
earliest year: 2020


In [7]:
# (rows, columns) of the GDP frame; the row count is the baseline the joins should preserve.
df_gdp.shape

(1224, 7)

#### Import Population Data to join

In [8]:
# Load the cleaned state-level annual population table.
df_pop = pd.read_csv(filepath_or_buffer='./clean/population_state_annual_clean.csv')

In [9]:
# Peek at the first two population rows to confirm the column layout.
df_pop.head(n=2)

Unnamed: 0,state,year,population
0,AL,1990,4050055
1,AK,1990,553290


In [10]:
# Join-integrity check for the population frame: number of distinct states
# and the year range covered.
print('unique states:', df_pop.state.unique().shape[0])
print('earliest year:', df_pop.year.min())
# Label fixed: this line reports the maximum year, so it is the latest, not the earliest.
print('latest year:', df_pop.year.max())

unique states: 51
earliest year: 1990
earliest year: 2020


In [11]:
# (rows, columns) of the population frame before it is joined to GDP.
df_pop.shape

(1581, 3)

## JOIN 1

GDP + Population

In [12]:
# MERGE 1: attach population to GDP on (state, year).
# An inner join silently drops unmatched keys and silently multiplies rows on
# duplicate keys, so both failure modes are checked explicitly:
#   - validate='one_to_one' raises pandas.errors.MergeError if (state, year)
#     is duplicated on either side;
#   - the assert turns the "expect" note into an enforced check.
df_core = df_gdp.merge(
    df_pop,
    how='inner',
    on=['state', 'year'],
    validate='one_to_one',
)

# Expect 1224/8 (every GDP row kept, one new column: population)
assert df_core.shape == (1224, 8), df_core.shape

In [13]:
# Compare against the expectation noted in the merge cell (1224 rows, 8 columns).
df_core.shape

(1224, 8)

In [14]:
# Peek at the merged frame; population is appended as the last column at this point.
df_core.head(n=2)

Unnamed: 0,state,year,subsidies,taxes_on_production_and_imports_(topi),compensation,current-dollar_gdp,gross_operating_surplus,population
0,AK,1997,-62900000.0,2464700000.0,12347800000.0,25810800000.0,11061300000.0,612968
1,AK,1998,-54700000.0,2003900000.0,12889600000.0,24227500000.0,9388700000.0,619932


In [15]:
# Reposition 'population' so it sits right after 'year' (column position 2).
# The column is detached from df_core in place and then re-inserted.
pop = df_core['population']
del df_core['population']

df_core.insert(loc=2, column='population', value=pop)

#### Import PCE

In [16]:
# Load the cleaned state-level annual personal consumption expenditures (PCE) table.
df_pce = pd.read_csv(filepath_or_buffer='./clean/pce_state_annual_clean.csv')

In [17]:
# Peek at the first two PCE rows to confirm the column layout.
df_pce.head(n=2)

Unnamed: 0,year,state,pce_clothing_and_footwear,pce_financial_services_and_insurance,pce_food_and_beverages_purchased_for_off-premises_consumption,pce_food_services_and_accommodations,pce_furnishings_and_durable_household_equipment,pce_gasoline_and_other_energy_goods,pce_gross_output_of_nonprofit_institutions,pce_health_care,pce_housing_and_utilities,pce_less:_receipts_from_sales_of_goods_and_services_by_nonprofit_institutions,pce_motor_vehicles_and_parts,pce_other_durable_goods,pce_other_nondurable_goods,pce_other_services,pce_recreation_services,pce_recreational_goods_and_vehicles,pce_transportation_services,pce_durable_goods,pce_final_consumption_expenditures_of_nonprofit_institutions_serving_households_(npishs),pce_household_consumption_expenditures_(for_services),pce_nondurable_goods,pce_goods,pce_services,pce_personal_consumption_expenditures
0,1997,AK,515600000.0,848300000.0,1450000000.0,1091000000.0,376100000.0,432700000.0,1396000000.0,1719200000.0,2378300000.0,962600000.0,795000000.0,235200000.0,941800000.0,1221800000.0,646700000.0,557300000.0,563100000.0,1963600000.0,433400000.0,8468500000.0,3340100000.0,5303700000.0,8901900000.0,14205600000.0
1,1997,AL,3249700000.0,5026600000.0,7747300000.0,4278700000.0,2164500000.0,2485300000.0,6320000000.0,12481000000.0,13167000000.0,5506100000.0,4890300000.0,1088900000.0,6523000000.0,6196300000.0,2839000000.0,1846500000.0,2272800000.0,9990100000.0,813900000.0,46261500000.0,20005500000.0,29995600000.0,47075400000.0,77070900000.0


In [18]:
# Join-integrity check for the PCE frame: number of distinct states and the
# year range covered.
print('unique states:', df_pce.state.unique().shape[0])
print('earliest year:', df_pce.year.min())
# Label fixed: this line reports the maximum year, so it is the latest, not the earliest.
print('latest year:', df_pce.year.max())

unique states: 51
earliest year: 1997
earliest year: 2020


In [19]:
# (rows, columns) of the PCE frame before it is joined to the core frame.
df_pce.shape

(1224, 26)

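Expect a 1:1 match on (state, year): PCE covers the same 51 states and the same 1997–2020 span as the core frame, so the inner join should keep all 1,224 rows.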

## JOIN 2

CORE + PCE

In [21]:
# MERGE 2: attach the PCE columns to the core frame on (state, year).
# validate='one_to_one' raises pandas.errors.MergeError if (state, year) is
# duplicated on either side. The assert enforces the expected shape, and it
# also fails if this cell is re-run on an already-merged df_core (the PCE
# columns would then come back with _x/_y suffixes).
df_core = df_core.merge(
    df_pce,
    how='inner',
    on=['state', 'year'],
    validate='one_to_one',
)

# expect 1224/32
assert df_core.shape == (1224, 32), df_core.shape

In [30]:
# Compare against the expectation noted in the merge cell (1224 rows, 32 columns).
df_core.shape

(1224, 32)

In [22]:
# Peek at the core frame now that the PCE columns have been appended.
df_core.head(n=2)

Unnamed: 0,state,year,population,subsidies,taxes_on_production_and_imports_(topi),compensation,current-dollar_gdp,gross_operating_surplus,pce_clothing_and_footwear,pce_financial_services_and_insurance,pce_food_and_beverages_purchased_for_off-premises_consumption,pce_food_services_and_accommodations,pce_furnishings_and_durable_household_equipment,pce_gasoline_and_other_energy_goods,pce_gross_output_of_nonprofit_institutions,pce_health_care,pce_housing_and_utilities,pce_less:_receipts_from_sales_of_goods_and_services_by_nonprofit_institutions,pce_motor_vehicles_and_parts,pce_other_durable_goods,pce_other_nondurable_goods,pce_other_services,pce_recreation_services,pce_recreational_goods_and_vehicles,pce_transportation_services,pce_durable_goods,pce_final_consumption_expenditures_of_nonprofit_institutions_serving_households_(npishs),pce_household_consumption_expenditures_(for_services),pce_nondurable_goods,pce_goods,pce_services,pce_personal_consumption_expenditures
0,AK,1997,612968,-62900000.0,2464700000.0,12347800000.0,25810800000.0,11061300000.0,515600000.0,848300000.0,1450000000.0,1091000000.0,376100000.0,432700000.0,1396000000.0,1719200000.0,2378300000.0,962600000.0,795000000.0,235200000.0,941800000.0,1221800000.0,646700000.0,557300000.0,563100000.0,1963600000.0,433400000.0,8468500000.0,3340100000.0,5303700000.0,8901900000.0,14205600000.0
1,AK,1998,619932,-54700000.0,2003900000.0,12889600000.0,24227500000.0,9388700000.0,520800000.0,883800000.0,1512400000.0,1114300000.0,391100000.0,375700000.0,1551000000.0,1861100000.0,2513600000.0,1033000000.0,871400000.0,244900000.0,971600000.0,1306200000.0,664900000.0,582900000.0,589600000.0,2090200000.0,518000000.0,8933600000.0,3380600000.0,5470800000.0,9451600000.0,14922500000.0


In [23]:
# Re-check state count and year range on the merged core frame to confirm
# that the joins so far did not drop any states or years.
print('unique states:', df_core.state.unique().shape[0])
print('earliest year:', df_core.year.min())
# Label fixed: this line reports the maximum year, so it is the latest, not the earliest.
print('latest year:', df_core.year.max())

unique states: 51
earliest year: 1997
earliest year: 2020


#### Import Poverty Rate

In [25]:
# Load the cleaned state-level annual poverty-rate table.
df_poverty = pd.read_csv(filepath_or_buffer='./clean/poverty_rate_state_annual_clean.csv')

In [26]:
# Peek at the first two poverty-rate rows to confirm the column layout.
df_poverty.head(n=2)

Unnamed: 0,state,poverty_rate,year
0,AL,0.15,2020
1,AK,0.13,2020


In [27]:
# Join-integrity check for the poverty-rate frame: number of distinct states
# and the year range covered.
print('unique states:', df_poverty.state.unique().shape[0])
print('earliest year:', df_poverty.year.min())
# Label fixed: this line reports the maximum year, so it is the latest, not the earliest.
print('latest year:', df_poverty.year.max())

unique states: 51
earliest year: 1997
earliest year: 2020


In [28]:
# (rows, columns) of the poverty-rate frame before it is joined to the core frame.
df_poverty.shape

(1224, 3)

## JOIN 3

Core + Poverty

In [31]:
# MERGE 3: attach poverty_rate to the core frame on (state, year).
# validate='one_to_one' raises pandas.errors.MergeError if (state, year) is
# duplicated on either side; the assert enforces the expected shape so that
# rows dropped by the inner join cannot go unnoticed.
df_core = df_core.merge(
    df_poverty,
    how='inner',
    on=['state', 'year'],
    validate='one_to_one',
)

# expect 1224/33 (the earlier note said 1223, which was a typo: no rows should be lost)
assert df_core.shape == (1224, 33), df_core.shape

In [32]:
# Confirm the row count is unchanged and exactly one column (poverty_rate) was added.
df_core.shape

(1224, 33)

#### Import Per Capita Disposable Personal Income

In [33]:
# Load the cleaned state-level annual per-capita disposable personal income table.
df_dis_inc = pd.read_csv(
    filepath_or_buffer='./clean/per_capita_disposable_personal_income_state_annual_clean.csv'
)

In [34]:
# Peek at the first two disposable-income rows to confirm the column layout.
df_dis_inc.head(n=2)

Unnamed: 0,year,per_capita_disposable_personal_income,state
0,1997,19087,AL
1,1997,25176,AK


In [35]:
# Join-integrity check for the disposable-income frame: number of distinct
# states and the year range covered.
print('unique states:', df_dis_inc.state.unique().shape[0])
print('earliest year:', df_dis_inc.year.min())
# Label fixed: this line reports the maximum year, so it is the latest, not the earliest.
print('latest year:', df_dis_inc.year.max())

unique states: 51
earliest year: 1997
earliest year: 2020


In [36]:
# (rows, columns) of the disposable-income frame before it is joined to the core frame.
df_dis_inc.shape

(1224, 3)

## JOIN 4

Core + Per Capita Disposable Personal Income

In [37]:
# MERGE 4: attach per_capita_disposable_personal_income to the core frame on
# (state, year).
# validate='one_to_one' raises pandas.errors.MergeError if (state, year) is
# duplicated on either side; the assert enforces the expected shape so that
# rows dropped by the inner join cannot go unnoticed.
df_core = df_core.merge(
    df_dis_inc,
    how='inner',
    on=['state', 'year'],
    validate='one_to_one',
)

# expect 1224/34
assert df_core.shape == (1224, 34), df_core.shape

In [38]:
# Compare against the expectation noted in the merge cell (1224 rows, 34 columns).
df_core.shape

(1224, 34)

In [39]:
# Peek at the core frame after the last join; poverty_rate and
# per_capita_disposable_personal_income should be the last two columns.
df_core.head(n=2)

Unnamed: 0,state,year,population,subsidies,taxes_on_production_and_imports_(topi),compensation,current-dollar_gdp,gross_operating_surplus,pce_clothing_and_footwear,pce_financial_services_and_insurance,pce_food_and_beverages_purchased_for_off-premises_consumption,pce_food_services_and_accommodations,pce_furnishings_and_durable_household_equipment,pce_gasoline_and_other_energy_goods,pce_gross_output_of_nonprofit_institutions,pce_health_care,pce_housing_and_utilities,pce_less:_receipts_from_sales_of_goods_and_services_by_nonprofit_institutions,pce_motor_vehicles_and_parts,pce_other_durable_goods,pce_other_nondurable_goods,pce_other_services,pce_recreation_services,pce_recreational_goods_and_vehicles,pce_transportation_services,pce_durable_goods,pce_final_consumption_expenditures_of_nonprofit_institutions_serving_households_(npishs),pce_household_consumption_expenditures_(for_services),pce_nondurable_goods,pce_goods,pce_services,pce_personal_consumption_expenditures,poverty_rate,per_capita_disposable_personal_income
0,AK,1997,612968,-62900000.0,2464700000.0,12347800000.0,25810800000.0,11061300000.0,515600000.0,848300000.0,1450000000.0,1091000000.0,376100000.0,432700000.0,1396000000.0,1719200000.0,2378300000.0,962600000.0,795000000.0,235200000.0,941800000.0,1221800000.0,646700000.0,557300000.0,563100000.0,1963600000.0,433400000.0,8468500000.0,3340100000.0,5303700000.0,8901900000.0,14205600000.0,0.09,25176
1,AK,1998,619932,-54700000.0,2003900000.0,12889600000.0,24227500000.0,9388700000.0,520800000.0,883800000.0,1512400000.0,1114300000.0,391100000.0,375700000.0,1551000000.0,1861100000.0,2513600000.0,1033000000.0,871400000.0,244900000.0,971600000.0,1306200000.0,664900000.0,582900000.0,589600000.0,2090200000.0,518000000.0,8933600000.0,3380600000.0,5470800000.0,9451600000.0,14922500000.0,0.09,26080
