In [130]:
# clear variable cache
# IPython magic: wipe the interactive namespace; -f skips the confirmation
# prompt so the cell can run unattended.
%reset -f

In [131]:
# Data
import pandas as pd
import numpy as np
import datetime

# Notebook Settings
import os

In [132]:
# set wd
# The './clean/...' paths used by the read_csv cells are relative, so the
# working directory must be the project root. It can be overridden with the
# MSBA_320_PROJECT_DIR environment variable; the default is the original
# hard-coded location, so existing behaviour is unchanged.
PROJECT_DIR = os.environ.get(
    'MSBA_320_PROJECT_DIR',
    '/home/ad-frazier/data_science/MSBA_320/final_project',
)
os.chdir(PROJECT_DIR)

# set pandas options
# None = never truncate columns when displaying wide frames
pd.set_option('display.max_columns', None)
# render floats with two decimals instead of scientific notation
pd.set_option('display.float_format', lambda x: '%.2f' % x)

#### Import GDP Data -- Dataset Core

In [133]:
# Load the cleaned state-level annual GDP table; the path is relative to the
# working directory set in the settings cell.
df_gdp = pd.read_csv('./clean/gdp_state_annual_clean.csv')

In [134]:
# Preview the first two rows to check the loaded columns
df_gdp.head(2)

Unnamed: 0,state,year,subsidies,taxes_on_production_and_imports_(topi),compensation,current-dollar_gdp,gross_operating_surplus
0,AK,1997,-62900000.0,2464700000.0,12347800000.0,25810800000.0,11061300000.0
1,AK,1998,-54700000.0,2003900000.0,12889600000.0,24227500000.0,9388700000.0


In [135]:
# Get unique states and year coverage for join integrity confirmation
print('unique states:', df_gdp['state'].nunique(dropna=False))
print('earliest year:', df_gdp['year'].min())
# was mislabelled 'earliest year:' although it prints the maximum
print('latest year:', df_gdp['year'].max())

unique states: 51
earliest year: 1997
earliest year: 2020


In [136]:
# Display (rows, columns) of the GDP table
df_gdp.shape

(1224, 7)

#### Import Personal Income + Population

In [137]:
# Load the cleaned per-capita personal income + population table; the path is
# relative to the working directory set in the settings cell.
df_pop = pd.read_csv('./clean/per_capita_personal_income-population_state_annual_clean.csv')

In [138]:
# Preview the first two rows to check the loaded columns
df_pop.head(2)

Unnamed: 0,year,state,Per capita personal income,Population
0,1929,AL,319.0,2644000.0
1,1929,AR,303.0,1852000.0


In [139]:
# Get unique states and year coverage for join integrity confirmation
print('unique states:', df_pop['state'].nunique(dropna=False))
print('earliest year:', df_pop['year'].min())
# was mislabelled 'earliest year:' although it prints the maximum
print('latest year:', df_pop['year'].max())

unique states: 49
earliest year: 1929
earliest year: 2021


In [140]:
# Display (rows, columns) of the income/population table
df_pop.shape

(4557, 4)

## JOIN 1

GDP + Personal Income and Population

In [141]:
# MERGE 1: GDP + per-capita personal income / population, keyed on (state, year).
# validate='one_to_one' raises pandas.errors.MergeError if either frame repeats
# a (state, year) key, which would otherwise silently duplicate rows.
df_core = df_gdp.merge(
    df_pop, how='inner', on=['state', 'year'], validate='one_to_one'
)

# The inner join drops every GDP row whose (state, year) is absent from df_pop;
# report the affected states instead of losing them silently.
dropped_states = df_gdp.loc[~df_gdp['state'].isin(df_core['state']), 'state'].unique()
if len(dropped_states):
    print('states in df_gdp dropped by MERGE 1:', list(dropped_states))

# Expected columns: 7 (GDP) + 4 (income/population) - 2 shared keys = 9.
# Expected rows: 1224 only if df_pop covers every (state, year) in df_gdp;
# fewer rows mean keys were dropped (see the report above).

In [142]:
# Display (rows, columns) of the merged frame to compare with the expectation
# noted in the merge cell
df_core.shape

(1176, 9)

In [143]:
# Preview the first two merged rows
df_core.head(2)

Unnamed: 0,state,year,subsidies,taxes_on_production_and_imports_(topi),compensation,current-dollar_gdp,gross_operating_surplus,Per capita personal income,Population
0,AL,1997,-274500000.0,6754800000.0,61083800000.0,104811900000.0,37247900000.0,21516.0,4367935.0
1,AL,1998,-292900000.0,6968300000.0,64168600000.0,110212000000.0,39368100000.0,22668.0,4404701.0


In [144]:
# Shift population to follow year (column position 2) and lower-case its name.
# Guarded so that re-running the cell is a no-op: after the first run the
# 'Population' column no longer exists and an unguarded pop() raises KeyError.
if 'Population' in df_core.columns:
    pop = df_core.pop('Population')
    df_core.insert(2, 'population', pop)

#### Import PCE

In [145]:
# Load the cleaned state-level annual PCE table; the path is relative to the
# working directory set in the settings cell.
df_pce = pd.read_csv('./clean/pce_state_annual_clean.csv')

In [146]:
# Preview the first two rows to check the loaded columns
df_pce.head(2)

Unnamed: 0,year,state,"pce_clothing,_footwear,_and_related_services",pce_communication,pce_education,pce_financial_services_and_insurance,pce_food_and_beverages_purchased_for_off-premises_consumption,pce_food_services_and_accommodations,"pce_furnishings,_household_equipment,_and_routine_household_maintenance",pce_health,"pce_housing,_utilities,_and_fuels",pce_other_goods_and_services,pce_total,pce_recreation,pce_transportation
0,1997,AK,552000000.0,193700000.0,255200000.0,848300000.0,1450000000.0,1091000000.0,701300000.0,1965500000.0,2457500000.0,1146800000.0,14205600000.0,1451200000.0,1711600000.0
1,1997,AL,3457300000.0,1875300000.0,1156500000.0,5026600000.0,7747300000.0,4278700000.0,3993800000.0,14574600000.0,13361000000.0,4375700000.0,77070900000.0,6744600000.0,9454500000.0


In [147]:
# Get unique states and year coverage for join integrity confirmation
print('unique states:', df_pce['state'].nunique(dropna=False))
print('earliest year:', df_pce['year'].min())
# was mislabelled 'earliest year:' although it prints the maximum
print('latest year:', df_pce['year'].max())

unique states: 51
earliest year: 1997
earliest year: 2020


In [110]:
# Display (rows, columns) of the PCE table
df_pce.shape

(1224, 15)

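GDP and PCE should match 1:1 across states and years (both cover 51 states, 1997–2020). Note that `df_core` already holds only 49 states after JOIN 1, so the inner join below is limited to those 49 states.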

## JOIN 2

CORE + PCE

In [111]:
## MERGE 2: CORE + PCE, keyed on (state, year).
# validate='one_to_one' raises pandas.errors.MergeError if either frame repeats
# a (state, year) key, which would otherwise silently duplicate rows.
df_core = df_core.merge(
    df_pce, how='inner', on=['state', 'year'], validate='one_to_one'
)

# Expected columns: 9 (core) + 15 (PCE) - 2 shared keys = 22.
# Expected rows: the inner join cannot return more rows than df_core holds, so
# 1224 is only reachable if no rows were dropped by MERGE 1.

In [112]:
# Display (rows, columns) of the merged frame to compare with the expectation
# noted in the merge cell
df_core.shape

(1176, 22)

In [113]:
# Preview the first two merged rows
df_core.head(2)

Unnamed: 0,state,year,population,subsidies,taxes_on_production_and_imports_(topi),compensation,current-dollar_gdp,gross_operating_surplus,Per capita personal income,"pce_clothing,_footwear,_and_related_services",pce_communication,pce_education,pce_financial_services_and_insurance,pce_food_and_beverages_purchased_for_off-premises_consumption,pce_food_services_and_accommodations,"pce_furnishings,_household_equipment,_and_routine_household_maintenance",pce_health,"pce_housing,_utilities,_and_fuels",pce_other_goods_and_services,pce_total,pce_recreation,pce_transportation
0,AL,1997,4367935.0,-274500000.0,6754800000.0,61083800000.0,104811900000.0,37247900000.0,21516.0,3457300000.0,1875300000.0,1156500000.0,5026600000.0,7747300000.0,4278700000.0,3993800000.0,14574600000.0,13361000000.0,4375700000.0,77070900000.0,6744600000.0,9454500000.0
1,AL,1998,4404701.0,-292900000.0,6968300000.0,64168600000.0,110212000000.0,39368100000.0,22668.0,3636500000.0,1947300000.0,1244200000.0,5399400000.0,7866400000.0,4464600000.0,4152500000.0,15206600000.0,14006400000.0,4600700000.0,80481000000.0,7014500000.0,9709400000.0


In [114]:
# Get unique states and year coverage for join integrity confirmation
print('unique states:', df_core['state'].nunique(dropna=False))
print('earliest year:', df_core['year'].min())
# was mislabelled 'earliest year:' although it prints the maximum
print('latest year:', df_core['year'].max())

unique states: 49
earliest year: 1997
earliest year: 2020


In [115]:
# Get Per Capita PCE
# Select the expenditure columns by their 'pce_' prefix instead of by position
# (previously iloc[:, 9:]), so the result does not depend on column order or on
# where 'population' was inserted.
pce_cols = [col for col in df_core.columns if col.startswith('pce_')]

# Row-wise division: each state/year total is divided by that row's population.
# NOTE: this overwrites the totals in place, so running the cell a second time
# divides by population again -- re-run MERGE 2 first if you need to repeat it.
df_core[pce_cols] = df_core[pce_cols].div(df_core['population'], axis=0)

# Verify output
df_core.head(2)

Unnamed: 0,state,year,population,subsidies,taxes_on_production_and_imports_(topi),compensation,current-dollar_gdp,gross_operating_surplus,Per capita personal income,"pce_clothing,_footwear,_and_related_services",pce_communication,pce_education,pce_financial_services_and_insurance,pce_food_and_beverages_purchased_for_off-premises_consumption,pce_food_services_and_accommodations,"pce_furnishings,_household_equipment,_and_routine_household_maintenance",pce_health,"pce_housing,_utilities,_and_fuels",pce_other_goods_and_services,pce_total,pce_recreation,pce_transportation
0,AL,1997,4367935.0,-274500000.0,6754800000.0,61083800000.0,104811900000.0,37247900000.0,21516.0,791.52,429.33,264.77,1150.8,1773.68,979.57,914.35,3336.73,3058.88,1001.78,17644.7,1544.12,2164.52
1,AL,1998,4404701.0,-292900000.0,6968300000.0,64168600000.0,110212000000.0,39368100000.0,22668.0,825.6,442.1,282.47,1225.83,1785.91,1013.6,942.74,3452.36,3179.88,1044.5,18271.61,1592.5,2204.33


#### Import Poverty Rate

In [116]:
# Load the cleaned state-level annual poverty-rate table; the path is relative
# to the working directory set in the settings cell.
df_poverty = pd.read_csv('./clean/poverty_rate_state_annual_clean.csv')

In [117]:
# Preview the first two rows to check the loaded columns
df_poverty.head(2)

Unnamed: 0,state,poverty_rate,year
0,AL,0.15,2020
1,AK,0.13,2020


In [118]:
# Get unique states and year coverage for join integrity confirmation
print('unique states:', df_poverty['state'].nunique(dropna=False))
print('earliest year:', df_poverty['year'].min())
# was mislabelled 'earliest year:' although it prints the maximum
print('latest year:', df_poverty['year'].max())

unique states: 51
earliest year: 1997
earliest year: 2020


## JOIN 3

CORE + Poverty Rate

In [119]:
# Get unique states and year coverage for join integrity confirmation
# (state of df_core BEFORE the poverty-rate join)
print('unique states:', df_core['state'].nunique(dropna=False))
print('earliest year:', df_core['year'].min())
# was mislabelled 'earliest year:' although it prints the maximum
print('latest year:', df_core['year'].max())

unique states: 49
earliest year: 1997
earliest year: 2020


In [120]:
# Attach the poverty rate to the core table; the inner join on (state, year)
# keeps only the keys present in both frames.
df_core = pd.merge(df_core, df_poverty, on=['state','year'], how='inner')

In [121]:
# Get unique states and year coverage for join integrity confirmation
# (state of df_core AFTER the poverty-rate join)
print('unique states:', df_core['state'].nunique(dropna=False))
print('earliest year:', df_core['year'].min())
# was mislabelled 'earliest year:' although it prints the maximum
print('latest year:', df_core['year'].max())

unique states: 49
earliest year: 1997
earliest year: 2020


In [122]:
# Preview the first five rows, now including the joined poverty_rate column
df_core.head(5)

Unnamed: 0,state,year,population,subsidies,taxes_on_production_and_imports_(topi),compensation,current-dollar_gdp,gross_operating_surplus,Per capita personal income,"pce_clothing,_footwear,_and_related_services",pce_communication,pce_education,pce_financial_services_and_insurance,pce_food_and_beverages_purchased_for_off-premises_consumption,pce_food_services_and_accommodations,"pce_furnishings,_household_equipment,_and_routine_household_maintenance",pce_health,"pce_housing,_utilities,_and_fuels",pce_other_goods_and_services,pce_total,pce_recreation,pce_transportation,poverty_rate
0,AL,1997,4367935.0,-274500000.0,6754800000.0,61083800000.0,104811900000.0,37247900000.0,21516.0,791.52,429.33,264.77,1150.8,1773.68,979.57,914.35,3336.73,3058.88,1001.78,17644.7,1544.12,2164.52,0.16
1,AL,1998,4404701.0,-292900000.0,6968300000.0,64168600000.0,110212000000.0,39368100000.0,22668.0,825.6,442.1,282.47,1225.83,1785.91,1013.6,942.74,3452.36,3179.88,1044.5,18271.61,1592.5,2204.33,0.14
2,AL,1999,4430141.0,-388400000.0,7329600000.0,67225100000.0,115680100000.0,41513700000.0,23333.0,858.71,460.96,300.35,1312.78,1843.37,1064.26,977.24,3561.44,3319.65,1124.68,19194.47,1667.71,2385.66,0.15
3,AL,2000,4452173.0,-364700000.0,7868600000.0,69764400000.0,119851700000.0,42583400000.0,24306.0,878.27,485.99,324.31,1445.23,1872.3,1094.95,1001.35,3759.89,3531.02,1173.99,20198.9,1722.22,2551.47,0.13
4,AL,2001,4467634.0,-475800000.0,8004300000.0,72038400000.0,122915500000.0,43348600000.0,25057.0,875.12,492.95,334.29,1423.42,1968.4,1111.8,1002.34,4053.55,3771.53,1220.27,20911.81,1738.33,2543.76,0.16


#### Import Spending Per Student

In [123]:
# Load the cleaned per-student spending table; the path is relative to the
# working directory set in the settings cell.
df_stud_spend = pd.read_csv('./clean/tot_spend_student_state_annual_clean.csv')

In [124]:
# Align the key name with the other tables so this frame can be merged on 'year'.
# NOTE(review): 'start_year' presumably marks the start of a school year --
# confirm that joining it to calendar-year data is intended.
df_stud_spend = df_stud_spend.rename({'start_year': 'year'}, axis='columns')

In [125]:
# Get unique states and year coverage for join integrity confirmation
print('unique states:', df_stud_spend['state'].nunique(dropna=False))
print('earliest year:', df_stud_spend['year'].min())
# was mislabelled 'earliest year:' although it prints the maximum
print('latest year:', df_stud_spend['year'].max())

unique states: 51
earliest year: 1997
earliest year: 2018


In [126]:
# Preview the first five rows to check the loaded columns
df_stud_spend.head(5)

Unnamed: 0,state,year,per_pupil_expenditure
0,AL,2018,10107
1,AK,2018,18393
2,AZ,2018,8773
3,AR,2018,10412
4,CA,2018,13831


## JOIN 4

CORE + Spending Per Student

In [127]:
# Get unique states and year coverage for join integrity confirmation
# (state of df_core BEFORE the per-student spending join)
print('unique states:', df_core['state'].nunique(dropna=False))
print('earliest year:', df_core['year'].min())
# was mislabelled 'earliest year:' although it prints the maximum
print('latest year:', df_core['year'].max())

unique states: 49
earliest year: 1997
earliest year: 2020


In [128]:
# Inner-join per-pupil spending onto the core table (keys present in both
# frames only), then show the resulting (rows, columns) as the cell output.
df_core = pd.merge(df_core, df_stud_spend, on=['year','state'], how='inner')
df_core.shape

(1078, 24)

In [129]:
# Get unique states and year coverage for join integrity confirmation
# (state of df_core AFTER the per-student spending join)
print('unique states:', df_core['state'].nunique(dropna=False))
print('earliest year:', df_core['year'].min())
# was mislabelled 'earliest year:' although it prints the maximum
print('latest year:', df_core['year'].max())

unique states: 49
earliest year: 1997
earliest year: 2018
