In [1]:
# clear variable cache
%reset -f

In [2]:
# Data
import pandas as pd
import numpy as np
import datetime

# Notebook Settings
import os

In [3]:
# set wd
os.getcwd()
os.chdir('/home/ad-frazier/data_science/MSBA_320/final_project')

# set pandas options
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [123]:
# Read in helper files

# Enables State Full Name to Abbreviation
state_dict = pd.read_csv('https://raw.githubusercontent.com/adfrisealach/List-of-US-States/master/states.csv', header=None, skiprows=1 , index_col=0, squeeze=True).to_dict()

# Enable Verification of correct number of states
state_list = pd.read_csv('https://raw.githubusercontent.com/adfrisealach/List-of-US-States/master/states.csv', header=0)

# Filter to PCE top Level Categores
pce_levels = pd.read_csv('./supplemental_data/pce_categories.csv', header=None)
pce_levels = pce_levels[0].to_list()

Files cleaned in order of perceived importance

# Financial Data

### GDP by State and Year

In [5]:
df = pd.read_csv('./dirty/gdp_state_annual.csv', engine='python')
# verified correct head/tail skip against manual excel inspections

In [6]:
df.head(2)

Unnamed: 0,GeoFips,GeoName,LineCode,Description,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,0,United States,3,Current-dollar GDP (millions of current dollars),8577552.0,9062817.0,9631172.0,10250952.0,10581929.0,10929108.0,11456450.0,12217196.0,13039197.0,13815583.0,14474228.0,14769862.0,14478067.0,15048970.0,15599731.0,16253970.0,16843196.0,17550687.0,18206023.0,18695106.0,19479623.0,20527159.0,21372582.0,20893746.0
1,0,United States,4,Compensation (millions of dollars),4713220.0,5075701.0,5409937.0,5854634.0,6046346.0,6143370.0,6362298.0,6729306.0,7077722.0,7491260.0,7889371.0,8068682.0,7767191.0,7932970.0,8234017.0,8575362.0,8843637.0,9259654.0,9709535.0,9977096.0,10436745.0,10969807.0,11459449.0,11580088.0


In [7]:
# reshape
df = pd.melt(df, id_vars=['GeoFips','GeoName','LineCode','Description'])

In [8]:
df.sample(5)

Unnamed: 0,GeoFips,GeoName,LineCode,Description,variable,value
493,41000,Oregon,7,Taxes on production and imports (TOPI) (mill...,1998,5082.2
677,18000,Indiana,5,Gross operating surplus (millions of dollars),1999,70630.2
278,94000,Plains,7,Taxes on production and imports (TOPI) (mill...,1997,39332.7
2057,56000,Wyoming,5,Gross operating surplus (millions of dollars),2003,9065.6
1779,94000,Plains,8,Subsidies (millions of dollars),2002,-4891.3


In [9]:
# rename
df = df.rename(columns={'variable':'year', 'value':'dollars'}) 

# Map full state names to Abbreviated form
df['state'] = df['GeoName'].map(state_dict)

In [10]:
# drop uneeded columns
df = df.drop(columns=['GeoFips','GeoName','LineCode'])

# Turn GDP Values into 1:1 dollars -- Multiply by 1 million
df['dollars'] = df['dollars']*1000000

# clean up column names a little bit
df['Description'] = df['Description'].str.replace("(millions of dollars)", "", regex=False)
df['Description'] = df['Description'].str.replace("(millions of current dollars)", "", regex=False)

In [11]:
df.sample(5)

Unnamed: 0,Description,year,dollars,state
412,Gross operating surplus,1998,91037900000.0,MA
1901,Compensation,2003,24462800000.0,ME
6185,Current-dollar GDP,2017,190674500000.0,OK
4159,Subsidies,2010,-82300000.0,WY
5030,Current-dollar GDP,2013,29289600000.0,VT


In [12]:
df = df.pivot_table(index=['state','year'], # columns to keep
                    columns='Description', # unstack this column
                    values='dollars', # populate this this column
                    aggfunc='first'
                    ).reset_index()

df.head(5)

Description,state,year,Subsidies,Taxes on production and imports (TOPI),Compensation,Current-dollar GDP,Gross operating surplus
0,AK,1997,-62900000.0,2464700000.0,12347800000.0,25810800000.0,11061300000.0
1,AK,1998,-54700000.0,2003900000.0,12889600000.0,24227500000.0,9388700000.0
2,AK,1999,-56400000.0,1983100000.0,13127500000.0,24744300000.0,9690100000.0
3,AK,2000,-57500000.0,2515400000.0,13893000000.0,26806600000.0,10455700000.0
4,AK,2001,-112300000.0,2385900000.0,14883300000.0,28494100000.0,11337100000.0


In [13]:
# Strip WhiteSpace
df.columns = df.columns.str.strip()

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1224 entries, 0 to 1223
Data columns (total 7 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   state                                   1224 non-null   object 
 1   year                                    1224 non-null   object 
 2   Subsidies                               1224 non-null   float64
 3   Taxes on production and imports (TOPI)  1224 non-null   float64
 4   Compensation                            1224 non-null   float64
 5   Current-dollar GDP                      1224 non-null   float64
 6   Gross operating surplus                 1224 non-null   float64
dtypes: float64(5), object(2)
memory usage: 67.1+ KB


In [15]:
df.columns = df.columns.str.lower().str.replace(' ','_')

In [16]:
df.head(5)

Description,state,year,subsidies,taxes_on_production_and_imports_(topi),compensation,current-dollar_gdp,gross_operating_surplus
0,AK,1997,-62900000.0,2464700000.0,12347800000.0,25810800000.0,11061300000.0
1,AK,1998,-54700000.0,2003900000.0,12889600000.0,24227500000.0,9388700000.0
2,AK,1999,-56400000.0,1983100000.0,13127500000.0,24744300000.0,9690100000.0
3,AK,2000,-57500000.0,2515400000.0,13893000000.0,26806600000.0,10455700000.0
4,AK,2001,-112300000.0,2385900000.0,14883300000.0,28494100000.0,11337100000.0


In [17]:
# including `index=False` removes unwanted description level
df.to_csv('./clean/gdp_state_annual_clean.csv', index=False)

### Per Capita Personal Income + State Population

In [130]:
df = pd.read_csv('./dirty/personal_income_per_capita_state_annual.csv',skiprows=5, skipfooter=7, engine='python')
# verified correct head/tail skip against manual excel inspections

In [131]:
df.head(2)

Unnamed: 0,GeoFips,GeoName,LineCode,Description,1929,1930,1931,1932,1933,1934,1935,1936,1937,1938,1939,1940,1941,1942,1943,1944,1945,1946,1947,1948,1949,1950,1951,1952,1953,1954,1955,1956,1957,1958,1959,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,0,United States,1,Personal income (millions of dollars),85151,76394,65531,50162,47114,53967,60704,69063,74556,68924,73456,79178,97407,125269,152637,163275,168186,179710,192525,211495,209009,232835.6,262330.7,280335.8,297142.9,300302.0,322134.3,346083.5,366021.2,376151.3,400328.8,417700.0,436047.0,464000.0,487780.0,522633.0,564444.0,612713.0,656828.0,720877.0,791229.0,855525.0,924613.0,1016408.0,1133468.0,1244912.0,1362505.0,1495704.0,1651632.0,1855849.0,2073257.0,2313160.0,2592915.0,2779794.0,2968676.0,3279488.0,3510471.0,3719647.0,3946593.0,4267813.0,4609667.0,4897821.0,5067291.0,5409920.0,5648732.0,5940128.0,6286143.0,6673186.0,7086935.0,7601594.0,8006585.0,8654561.0,9009842.0,9157682.0,9491393.0,10037313.0,10599603.0,11374142.0,12014107.0,12475898.0,12073407.0,12586509.0,13330436.0,14003346.0,14189228.0,14969527.0,15681233.0,16092713.0,16845028.0,17681159.0,18402004.0,19607447.0,21056621.9
1,0,United States,2,Population (persons) 1/,121769000,123075000,124038000,124839000,125580000,126372000,127251000,128054000,128822000,129825000,130884000,131955000,133417000,134670000,134697000,134075000,133387000,140638000,143665000,146091000,148666000,151871000.0,153970000.0,156369000.0,158946000.0,161881000.0,165058000.0,168078000.0,171178000.0,174153000.0,177136000.0,179972000.0,182976000.0,185739000.0,188434000.0,191085000.0,193460000.0,195499000.0,197375000.0,199312000.0,201298000.0,203798722.0,206817509.0,209274882.0,211349205.0,213333635.0,215456585.0,217553859.0,219760875.0,222098244.0,224568579.0,227224719.0,229465744.0,231664432.0,233792014.0,235824907.0,237923734.0,240132831.0,242288936.0,244499004.0,246819222.0,249622814.0,252980941.0,256514224.0,259918588.0,263125821.0,266278393.0,269394284.0,272646925.0,275854104.0,279040168.0,282162411.0,284968955.0,287625193.0,290107933.0,292805298.0,295516599.0,298379912.0,301231207.0,304093966.0,306771529.0,309378433.0,311841632.0,314344331.0,316735375.0,319270047.0,321829327.0,324367742.0,326623063.0,328542157.0,330233102.0,331501080.0,331893745.0


In [132]:
# drop un-needed columns
df = df.drop(columns=['LineCode','GeoFips'])

# reshape column structure
df = pd.melt(df, id_vars=['GeoName','Description'])

# rename columns
df = df.rename(columns={'variable':'year'})

# map state name to state abbreviation
df['state'] = df['GeoName'].map(state_dict)

In [133]:
df.sample(2)

Unnamed: 0,GeoName,Description,year,value,state
13599,New York,Personal income (millions of dollars),2004,754007.4,NY
1358,New Mexico,Per capita personal income (dollars) 2/,1936,343.0,NM


In [134]:
# Get unique geo areas
df.GeoName.unique().shape

(60,)

In [135]:
df.Description.value_counts()

Personal income (millions of dollars)      5580
Population (persons) 1/                    5580
Per capita personal income (dollars) 2/    5580
Name: Description, dtype: int64

In [136]:
# filter to only Per Capita Personal Income
df = df.loc[(df['Description']=='Per capita personal income (dollars) 2/') | (df['Description']=='Population (persons) 1/')]

# Clean up categorical values
df['Description'] = df['Description'].str.replace(' (persons) 1/','', regex=False)
df['Description'] = df['Description'].str.replace(' (dollars) 2/','', regex=False)

# drop uneeded columns

df = df.drop(columns='GeoName')

In [137]:
df.state.unique().shape

(50,)

In [138]:
df = df.pivot_table(index=['year','state'], # columns to keep
                    columns='Description', # unstack this column
                    values='value', # populate this this column
                    aggfunc='first'
                    ).reset_index()

In [139]:
df.state.unique().shape

(49,)

In [140]:
# verify correct number of states
df[~df['state'].isin(state_list.Abbreviation)]

Description,year,state,Per capita personal income,Population


In [141]:
df.to_csv('./clean/per_capita_personal_income-population_state_annual_clean.csv', index=False)

### PCE Total by Function -  State Annual

Will divide by population to get Per Capita PCE

In [18]:
df = pd.read_csv('./dirty/pce_function_state_annual.csv', skipfooter=4, engine='python')
# verified correct head/tail skip against manual excel inspections

In [19]:
df.tail(2)

Unnamed: 0,GeoFIPS,GeoName,Region,TableName,LineCode,IndustryClassification,Description,Unit,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
7979,"""98000""",Far West,8,SAPCE4,133,...,Gross output of nonprofit institutions,Millions of current dollars,79226.1,83550.3,89980.1,98840.6,109305.9,121766.1,126678.0,131308.3,139575.4,150443.3,167124.2,180409.1,188687.4,197754.7,210008.4,227811.9,235697.0,245181.6,262357.3,278990.5,291572.1,308248.2,322748.7,321992.0
7980,"""98000""",Far West,8,SAPCE4,134,...,Less: Receipts from sales of goods and servi...,Millions of current dollars,63483.1,66529.7,70254.0,75358.2,81743.5,89280.5,94280.2,100390.9,108251.8,113378.6,123961.4,131613.1,139806.4,147270.1,155808.8,166870.2,171478.0,178397.7,190636.7,200273.2,208230.3,220005.4,233618.0,218594.3


In [20]:
# drop initially unnecessary columns
df = df.drop(columns=['Region','TableName','LineCode','IndustryClassification'])

# reshape column structure
df = pd.melt(df, id_vars=['GeoFIPS','GeoName','Description','Unit'])

# map state name to state abbreviation
df['state'] = df['GeoName'].map(state_dict)

# rename columns
df = df.rename(columns={'variable':'year', 'Description':'pce_description','value':'pce_value'})

In [21]:
df.head(2)

Unnamed: 0,GeoFIPS,GeoName,pce_description,Unit,year,pce_value,state
0,"""00000""",United States,Personal consumption expenditures,Millions of current dollars,1997,5536790.0,
1,"""00000""",United States,Household consumption expenditures,Millions of current dollars,1997,5431202.0,


In [22]:
# final drop of unnecessary columns
df = df.drop(columns=['GeoName','Unit','GeoFIPS'])

In [23]:
# Strip whitespace
df['pce_description'] = df['pce_description'].str.strip()

In [24]:
df['pce_description'].value_counts().shape

(134,)

In [25]:
len(pce_levels)

13

In [26]:
# remove any categories not in the desired list
df = df[df['pce_description'].isin(pce_levels)]

In [27]:
df['pce_description'].value_counts()

Personal consumption expenditures                                      1440
Food and beverages purchased for off-premises consumption              1440
Clothing, footwear, and related services                               1440
Housing, utilities, and fuels                                          1440
Furnishings, household equipment, and routine household maintenance    1440
Health                                                                 1440
Transportation                                                         1440
Communication                                                          1440
Recreation                                                             1440
Education                                                              1440
Food services and accommodations                                       1440
Financial services and insurance                                       1440
Other goods and services                                               1440
Name: pce_de

In [28]:
# convert pce_value to numeric in order to conver to millions
df['pce_value'] = df['pce_value'].astype('float64')

# Convert to 1:1 dollars
df['pce_value'] = df['pce_value'].multiply(1000000)

In [29]:
df = df.pivot_table(index=['year','state'], # columns to keep
                    columns='pce_description', # unstack this column
                    values='pce_value', # populate this this column
                    aggfunc='first'
                    ).reset_index()

In [30]:
df.columns = df.columns.str.strip().str.lower().str.replace(' ','_')

In [31]:
df.head(2)

pce_description,year,state,"clothing,_footwear,_and_related_services",communication,education,financial_services_and_insurance,food_and_beverages_purchased_for_off-premises_consumption,food_services_and_accommodations,"furnishings,_household_equipment,_and_routine_household_maintenance",health,"housing,_utilities,_and_fuels",other_goods_and_services,personal_consumption_expenditures,recreation,transportation
0,1997,AK,552000000.0,193700000.0,255200000.0,848300000.0,1450000000.0,1091000000.0,701300000.0,1965500000.0,2457500000.0,1146800000.0,14205600000.0,1451200000.0,1711600000.0
1,1997,AL,3457300000.0,1875300000.0,1156500000.0,5026600000.0,7747300000.0,4278700000.0,3993800000.0,14574600000.0,13361000000.0,4375700000.0,77070900000.0,6744600000.0,9454500000.0


In [32]:
# add PCE to value columns
cols = df.columns[~df.columns.str.contains('year|state')]

df.rename(columns = dict(zip(cols, 'pce_' + cols)), inplace=True)

df = df.rename(columns={'pce_personal_consumption_expenditures':'pce_total'})

In [33]:
df.to_csv('./clean/pce_state_annual_clean.csv', index=False)

### Business Applications State Annual

In [45]:
df = pd.read_csv('./dirty/business_apps_state_annual.csv', engine='python')
# verified correct head/tail skip against manual excel inspections

# Data exists from 2005-2021

In [46]:
# NAIC Sector Codes
naic_sector_codes = pd.read_csv('./supplemental_data/naic_sectors_codes.csv', skiprows=0 , index_col=0, squeeze=True).to_dict()

# NAIC Series Codes
naic_series_codes = pd.read_csv('./supplemental_data/naic_series_codes.csv', skiprows=0 , index_col=0, squeeze=True).to_dict()

In [47]:
# Filter out incomplete years
df = df[df['year']!=2022]

df = df[df['year']!=2004]

# filter out unwanted series
df = df[~df['series'].isin(['BF_PBF4Q','BF_PBF8Q','BF_SBF4Q','BF_SBF8Q'])]

# filter to only seasonally adjusted data
df = df[df['sa']=='A']

In [48]:
df.tail(5)

Unnamed: 0,sa,naics_sector,series,geo,year,jan,feb,mar,apr,may,jun,jul,aug,sep,oct,nov,dec
27275,A,TOTAL,BF_BF8Q,VA,2005,1070,1090,1043,1054,1036,1088,1009,1021,1044,1130,1103,1084
27277,A,TOTAL,BF_BF8Q,WA,2005,1161,1063,1078,1060,993,1109,1162,1156,840,1230,1197,1159
27279,A,TOTAL,BF_BF8Q,WV,2005,D,161,D,148,160,187,167,D,D,201,202,172
27281,A,TOTAL,BF_BF8Q,WI,2005,525,620,609,645,606,624,622,598,467,620,593,553
27283,A,TOTAL,BF_BF8Q,WY,2005,D,D,121,120,119,125,D,D,D,D,D,D


In [49]:
# convert selected columns by index(months) to numeric
df.iloc[:,5:17] = df.iloc[:,5:17].apply(pd.to_numeric, errors='coerce')

# create annual total
df['annual_total'] =  df.iloc[:,5:17].sum(axis=1, min_count=12)

# drop monthly columns after creating annual total
df = df.drop(df.columns[5:17], axis=1)

In [50]:
df.sample(5)

Unnamed: 0,sa,naics_sector,series,geo,year,annual_total
23273,A,TOTAL,BA_CBA,MA,2007,12222.0
21396,A,NAICS81,BA_BA,US,2008,221334.0
18923,A,TOTAL,BF_BF8Q,WY,2010,
13483,A,NAICS52,BA_WBA,US,2013,15281.0
3925,A,TOTAL,BA_CBA,OR,2019,3915.0


In [51]:
# Map supplemental NAIC Data to columns
df['naics_sector'] = df['naics_sector'].map(naic_sector_codes)

df['series'] = df['series'].map(naic_series_codes)

In [52]:
df = df.pivot_table(index=['sa','geo','year','naics_sector'], # columns to keep
                    columns='series', # unstack this column
                    values='annual_total', # populate this this column
                    aggfunc='first'
                    ).reset_index()

df.head(5)

series,sa,geo,year,naics_sector,Business Applications,Business Applications from Corporations,Business Formations within Eight Quarters,Business Formations within Four Quarters,High‐Propensity Business Applications BA_WBA – Business Applications with Planned Wages
0,A,AK,2005,Total for All NAICS,4770.0,1028.0,,1152.0,2708.0
1,A,AK,2006,Total for All NAICS,5218.0,1142.0,,1028.0,2961.0
2,A,AK,2007,Total for All NAICS,5197.0,1024.0,,953.0,2855.0
3,A,AK,2008,Total for All NAICS,4902.0,749.0,,824.0,2428.0
4,A,AK,2009,Total for All NAICS,4452.0,607.0,,742.0,2122.0


In [53]:
df.head(5)

series,sa,geo,year,naics_sector,Business Applications,Business Applications from Corporations,Business Formations within Eight Quarters,Business Formations within Four Quarters,High‐Propensity Business Applications BA_WBA – Business Applications with Planned Wages
0,A,AK,2005,Total for All NAICS,4770.0,1028.0,,1152.0,2708.0
1,A,AK,2006,Total for All NAICS,5218.0,1142.0,,1028.0,2961.0
2,A,AK,2007,Total for All NAICS,5197.0,1024.0,,953.0,2855.0
3,A,AK,2008,Total for All NAICS,4902.0,749.0,,824.0,2428.0
4,A,AK,2009,Total for All NAICS,4452.0,607.0,,742.0,2122.0


In [54]:
df.to_csv('./clean/business_apps_state_annual_clean.csv', index=False)

### Poverty Rate

In [55]:
df = pd.read_csv('./dirty/poverty_rate_state_annual.csv', engine='python')
# verified correct head/tail skip against manual excel inspections

In [56]:
df.sample(2)

Unnamed: 0,STATE,Percent,Year,Unnamed: 3
239,Ohio,13.7,2016,
1057,Oregon,10.9,2000,


In [57]:
# drop unused columns
df = df.drop(columns='Unnamed: 3')

# convert to numeric percent
df['Percent'] = (df['Percent']/100)

# Map states to abbreviated form
df['STATE'] = df['STATE'].map(state_dict)

# convert column names to lower case
df.columns = df.columns.str.lower()

df = df.rename(columns={'percent':'poverty_rate'})

In [58]:
df.sample(2)

Unnamed: 0,state,poverty_rate,year
794,NH,0.06,2005
405,WV,0.2,2013


In [59]:
df.to_csv('./clean/poverty_rate_state_annual_clean.csv', index=False)

### Median Income

In [142]:
df = pd.read_csv('./dirty/median_income_state_annual.csv')

In [144]:
df.head(2)

Unnamed: 0,state,year,education_level,median_income
0,Alabama,2008,Regular high school diploma,801711
1,Alabama,2009,Regular high school diploma,797598


In [147]:
df.state.unique().shape

(52,)

In [148]:
df.state = df.state.map(state_dict)

In [149]:
df.state.unique().shape

(52,)

In [150]:
df.head(2)

Unnamed: 0,state,year,education_level,median_income
0,AL,2008,Regular high school diploma,801711
1,AL,2009,Regular high school diploma,797598


In [151]:
# Write out
df.to_csv('./clean/median_income_state_annual_clean.csv', index=False)

###  Median Age

In [152]:
df = pd.read_csv('./dirty/median_age_state_annual.csv')

In [153]:
df.head(2)

Unnamed: 0,state,year,median_age
0,Alabama,2005,37.4
1,Alabama,2006,37.2


In [154]:
df.state.unique().shape

(52,)

In [155]:
df.state = df.state.map(state_dict)

In [156]:
df.state.unique().shape

(52,)

In [157]:
df.head(2)

Unnamed: 0,state,year,median_age
0,AL,2005,37.4
1,AL,2006,37.2


In [158]:
# Write out
df.to_csv('./clean/median_age_state_annual_clean.csv', index=False)

# Education Data

### Total Spending per Student

In [60]:
df = pd.read_csv('./dirty/tot_spend_student_state_annual.csv', skiprows=6, skipfooter=7, engine='python')
# verified correct head/tail skip against manual excel inspections

In [61]:
# Create String
string = 'Total Current Expenditures for Public El-Sec (TE5) per Pupil (MEMBR) [State Finance]'

# Left Strip String
df.columns = df.columns.str.replace(string, '', regex=False)

In [62]:
df.sample(1)

Unnamed: 0,State Name,2018-19,2017-18,2016-17,2015-16,2014-15,2013-14,2012-13,2011-12,2010-11,2009-10,2008-09,2007-08,2006-07,2005-06,2004-05,2003-04,2002-03,2001-02,2000-01,1999-00,1998-99,1997-98
44,UTAH,7950,7576,7206,7006,6751,6546,6432,6441,6326,6452,6612,5978,5709,5464,5216,4991,4838,4900,4674,4378,4210,3969


In [63]:
df['State Name'].unique().shape

(51,)

In [64]:
# reshape dataframe
df = pd.melt(df, id_vars=['State Name'])

#rename columns
df = df.rename(columns={'State Name':'state', 'variable':'start_year','value':'per_pupil_expenditure'})

# trim to desired year digits
df['start_year'] = df['start_year'].str[:5]

# convert state dictionary to uppercase values - match dataframe format
state_dict_upper = {k.upper():v.upper() for k,v in state_dict.items()}

# map state name to state abbreviation
df['state'] = df['state'].map(state_dict_upper)

In [65]:
# Check output
df.head(5)

Unnamed: 0,state,start_year,per_pupil_expenditure
0,AL,2018,10107
1,AK,2018,18393
2,AZ,2018,8773
3,AR,2018,10412
4,CA,2018,13831


In [66]:
df.state.unique().shape

(51,)

In [67]:
df.to_csv('./clean/tot_spend_student_state_annual_clean.csv', index=False)

### Total Revenue Per Student

In [68]:
df = pd.read_csv('./dirty/tot_rev_per_student_state_annual.csv', skiprows=0, skipfooter=7, engine='python')
# verified correct head/tail skip against manual excel inspections

In [69]:
df.head(5)

Unnamed: 0,State Name,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2018-19,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2017-18,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2016-17,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2015-16,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2014-15,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2013-14,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2012-13,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2011-12,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2010-11,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2009-10,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2008-09,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2007-08,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2006-07,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2005-06,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2004-05,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2003-04,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2002-03,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2001-02,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2000-01,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 1999-00,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 1998-99,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 1997-98
0,ALABAMA,11415,10792,10590,10200,9992,9913,9653,9534,9776,9667,9708,10356,9548,8555,8028,7349,6971,6956,6503,6523,5975,5535
1,ALASKA,19553,19038,18897,18851,22379,20447,20312,19034,18699,17759,17319,17471,14304,12849,12632,11576,10928,10717,10275,10118,9532,9222
2,ARIZONA,10396,9697,9214,9293,8995,8703,8616,8613,9111,9343,9002,9457,9023,8071,7814,7550,7791,7214,7071,6455,5988,5812
3,ARKANSAS,11733,11592,11388,10939,10762,10478,10391,10939,10939,10738,10072,9758,9362,9031,8712,7542,7243,7112,6250,6054,5772,5697
4,CALIFORNIA,16014,14535,13965,13606,11786,10985,10481,10590,10790,10239,11180,11228,10857,9909,9234,8980,8975,8363,8306,7465,6750,6572


In [70]:
# Create String
string = 'Total Revenues (TR) per Pupil (MEMBR) [State Finance] '

# Left Strip String
df.columns = df.columns.str.replace(string, '', regex=False)

# Reshape df
df = pd.melt(df, id_vars=['State Name'])

# Rename columns
df = df.rename(columns={'State Name':'state', 'variable':'start_year','value':'per_pupil_revenue'})

# remap state name to state abbreviations
df['state'] = df['state'].map(state_dict_upper)

# Use Start year instead of year-span
df['start_year'] = df['start_year'].str[:4]

In [71]:
# Sample output
df.sample(10)

Unnamed: 0,state,start_year,per_pupil_revenue
800,OH,2003,10249
619,DE,2006,13345
340,ND,2012,13396
504,VT,2009,17916
789,MS,2003,7058
862,VA,2002,8735
302,WA,2013,12213
561,AL,2007,10356
1006,OR,1999,7952
1032,ID,1998,5806


In [72]:
df.to_csv('./clean/tot_revenue_student_state_annual_clean.csv', index=False)