In [1]:
# Data
import pandas as pd
import numpy as np
import datetime

# Notebook Settings
import os

In [2]:
# Set the working directory so the relative './dirty', './clean' and
# './supplemental_data' paths used by the cells below resolve.
# The location can be overridden through the MSBA_320_PROJECT_DIR environment
# variable; when it is unset the original author's path is used, so existing
# runs behave exactly as before.
project_dir = os.environ.get(
    'MSBA_320_PROJECT_DIR',
    '/home/ad-frazier/data_science/MSBA_320/final_project',
)
os.chdir(project_dir)

# set pandas options
pd.set_option('display.max_columns', None)  # show every column of wide frames
pd.set_option('display.float_format', '{:.2f}'.format)  # display floats with 2 decimals

In [3]:
# Read in helper files

# Lookup used throughout the notebook to turn full state names into
# abbreviations. The first CSV row is skipped, the first column becomes the
# index, and the single remaining column is squeezed to a Series before being
# converted to a dict.
# `DataFrame.squeeze('columns')` replaces the `squeeze=True` keyword of
# `read_csv`, which is deprecated and no longer accepted by pandas 2.x.
state_dict = (
    pd.read_csv(
        'https://raw.githubusercontent.com/adfrisealach/List-of-US-States/master/states.csv',
        header=None,
        skiprows=1,
        index_col=0,
    )
    .squeeze('columns')
    .to_dict()
)

Files are cleaned in order of perceived importance.

# Financial Data

### GDP by State and Year

In [4]:
# Load the raw GDP-by-state extract. No leading or trailing rows are skipped
# for this file; per the original note, the layout was checked by hand in Excel.
gdp_raw_path = './dirty/1_gdp_state_annual.csv'
df = pd.read_csv(gdp_raw_path, engine='python')

In [5]:
df.head(2)

Unnamed: 0,GeoFips,GeoName,LineCode,Description,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,0,United States,3,Current-dollar GDP (millions of current dollars),8577552.0,9062817.0,9631172.0,10250952.0,10581929.0,10929108.0,11456450.0,12217196.0,13039197.0,13815583.0,14474228.0,14769862.0,14478067.0,15048970.0,15599731.0,16253970.0,16843196.0,17550687.0,18206023.0,18695106.0,19479623.0,20527159.0,21372582.0,20893746.0
1,0,United States,4,Compensation (millions of dollars),4713220.0,5075701.0,5409937.0,5854634.0,6046346.0,6143370.0,6362298.0,6729306.0,7077722.0,7491260.0,7889371.0,8068682.0,7767191.0,7932970.0,8234017.0,8575362.0,8843637.0,9259654.0,9709535.0,9977096.0,10436745.0,10969807.0,11459449.0,11580088.0


In [6]:
# Reshape wide -> long: the four identifier columns are kept and every other
# column (the year columns) is stacked into 'variable' / 'value' pairs.
gdp_id_columns = ['GeoFips', 'GeoName', 'LineCode', 'Description']
df = df.melt(id_vars=gdp_id_columns)

In [7]:
df.sample(5)

Unnamed: 0,GeoFips,GeoName,LineCode,Description,variable,value
3163,35000,New Mexico,7,Taxes on production and imports (TOPI) (mill...,2007,5876.8
2808,24000,Maryland,7,Taxes on production and imports (TOPI) (mill...,2006,18428.5
1822,5000,Arkansas,5,Gross operating surplus (millions of dollars),2003,30737.5
3958,13000,Georgia,7,Taxes on production and imports (TOPI) (mill...,2010,26133.3
1721,48000,Texas,4,Compensation (millions of dollars),2002,426501.3


In [8]:
# Give the melted columns meaningful names, then add a `state` column holding
# the abbreviation looked up from the full name in `GeoName`.
df = df.rename(columns={'variable': 'year', 'value': 'dollars'}).assign(
    state=lambda frame: frame['GeoName'].map(state_dict)
)

In [9]:
# Drop the identifier columns that are no longer needed
df = df.drop(columns=['GeoFips', 'GeoName', 'LineCode'])

# Source values are quoted in millions; scale them to plain dollars
df['dollars'] = df['dollars'].mul(1000000)

# Remove the unit suffixes from the descriptions (literal match, not regex)
for unit_suffix in ("(millions of dollars)", "(millions of current dollars)"):
    df['Description'] = df['Description'].str.replace(unit_suffix, "", regex=False)

In [10]:
df.sample(5)

Unnamed: 0,Description,year,dollars,state
4555,Current-dollar GDP,2012,447764700000.0,GA
3401,Compensation,2008,29678500000.0,ME
2878,Taxes on production and imports (TOPI),2006,2063500000.0,ND
1528,Taxes on production and imports (TOPI),2002,92987000000.0,CA
116,Compensation,1997,178618100000.0,MI


In [11]:
# Long -> wide: one row per (state, year) and one column per Description.
# aggfunc='first' keeps the first value found for each cell.
df = df.pivot_table(
    values='dollars',
    index=['state', 'year'],
    columns='Description',
    aggfunc='first',
).reset_index()

df.head(5)

Description,state,year,Subsidies,Taxes on production and imports (TOPI),Compensation,Current-dollar GDP,Gross operating surplus
0,AK,1997,-62900000.0,2464700000.0,12347800000.0,25810800000.0,11061300000.0
1,AK,1998,-54700000.0,2003900000.0,12889600000.0,24227500000.0,9388700000.0
2,AK,1999,-56400000.0,1983100000.0,13127500000.0,24744300000.0,9690100000.0
3,AK,2000,-57500000.0,2515400000.0,13893000000.0,26806600000.0,10455700000.0
4,AK,2001,-112300000.0,2385900000.0,14883300000.0,28494100000.0,11337100000.0


In [12]:
# Trim leading/trailing whitespace from every column label
# (removing the unit suffixes earlier leaves a trailing space behind)
df = df.rename(columns=str.strip)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1224 entries, 0 to 1223
Data columns (total 7 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   state                                   1224 non-null   object 
 1   year                                    1224 non-null   object 
 2   Subsidies                               1224 non-null   float64
 3   Taxes on production and imports (TOPI)  1224 non-null   float64
 4   Compensation                            1224 non-null   float64
 5   Current-dollar GDP                      1224 non-null   float64
 6   Gross operating surplus                 1224 non-null   float64
dtypes: float64(5), object(2)
memory usage: 67.1+ KB


In [14]:
# Lower-case the column labels and use underscores instead of spaces
df = df.rename(columns=lambda label: label.lower().replace(' ', '_'))

In [15]:
df.head(5)

Description,state,year,subsidies,taxes_on_production_and_imports_(topi),compensation,current-dollar_gdp,gross_operating_surplus
0,AK,1997,-62900000.0,2464700000.0,12347800000.0,25810800000.0,11061300000.0
1,AK,1998,-54700000.0,2003900000.0,12889600000.0,24227500000.0,9388700000.0
2,AK,1999,-56400000.0,1983100000.0,13127500000.0,24744300000.0,9690100000.0
3,AK,2000,-57500000.0,2515400000.0,13893000000.0,26806600000.0,10455700000.0
4,AK,2001,-112300000.0,2385900000.0,14883300000.0,28494100000.0,11337100000.0


In [16]:
# Write the cleaned GDP table. `index=False` leaves the row index (the
# RangeIndex created by `reset_index()` after the pivot) out of the CSV.
df.to_csv('./clean/gdp_state_annual_clean.csv', index=False)

### PCE State Annual

In [17]:
# Load the raw PCE-by-state extract, discarding the last 4 lines of the file
# (skipfooter is only supported by the python parsing engine). Per the
# original note, the skip count was checked by hand in Excel.
pce_raw_path = './dirty/pce_state_annual.csv'
df = pd.read_csv(pce_raw_path, engine='python', skipfooter=4)

In [18]:
df.head(2)

Unnamed: 0,GeoFIPS,GeoName,Region,TableName,LineCode,IndustryClassification,Description,Unit,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,"""00000""",United States,,SAPCE1,1,...,Personal consumption expenditures,Millions of current dollars,5536790.0,5877248.0,6283758.0,6767179.0,7073801.0,7348941.0,7740749.0,8231960.0,8769066.0,9277236.0,9746594.0,10050083.0,9891218.0,10260256.0,10698857.0,11047363.0,11363528.0,11847725.0,12263476.0,12693266.0,13239111.0,13913531.0,14428676.0,14047565.0
1,"""00000""",United States,,SAPCE1,2,...,Goods,Millions of current dollars,2006506.0,2108439.0,2287062.0,2453172.0,2525593.0,2598805.0,2722597.0,2902021.0,3082923.0,3239655.0,3367031.0,3363221.0,3180022.0,3317825.0,3518121.0,3637739.0,3729973.0,3862956.0,3922993.0,3991849.0,4158554.0,4353716.0,4478918.0,4653822.0


In [19]:
# One pipeline for the PCE clean-up:
#   1. drop columns that are not needed at all
#   2. reshape wide -> long (year columns become 'variable' / 'value')
#   3. rename to the final column names
#   4. add the state abbreviation and scale values from millions to dollars
#   5. drop the helper columns that were only needed along the way
df = (
    df.drop(columns=['Region', 'TableName', 'LineCode', 'IndustryClassification'])
      .melt(id_vars=['GeoFIPS', 'GeoName', 'Description', 'Unit'])
      .rename(columns={'variable': 'year', 'Description': 'pce_description', 'value': 'pce_value'})
      .assign(
          state=lambda frame: frame['GeoName'].map(state_dict),
          pce_value=lambda frame: frame['pce_value'] * 1000000,
      )
      .drop(columns=['GeoName', 'Unit', 'GeoFIPS'])
)

In [20]:
# Long -> wide: one row per (year, state) and one column per PCE description.
# aggfunc='first' keeps the first value found for each cell.
df = df.pivot_table(
    values='pce_value',
    index=['year', 'state'],
    columns='pce_description',
    aggfunc='first',
).reset_index()

In [21]:
# Normalise column labels: trim whitespace, lower-case, spaces -> underscores
df = df.rename(columns=lambda label: label.strip().lower().replace(' ', '_'))

In [22]:
df.head(2)

pce_description,year,state,clothing_and_footwear,financial_services_and_insurance,food_and_beverages_purchased_for_off-premises_consumption,food_services_and_accommodations,furnishings_and_durable_household_equipment,gasoline_and_other_energy_goods,gross_output_of_nonprofit_institutions,health_care,housing_and_utilities,less:_receipts_from_sales_of_goods_and_services_by_nonprofit_institutions,motor_vehicles_and_parts,other_durable_goods,other_nondurable_goods,other_services,recreation_services,recreational_goods_and_vehicles,transportation_services,durable_goods,final_consumption_expenditures_of_nonprofit_institutions_serving_households_(npishs),household_consumption_expenditures_(for_services),nondurable_goods,goods,services,personal_consumption_expenditures
0,1997,AK,515600000.0,848300000.0,1450000000.0,1091000000.0,376100000.0,432700000.0,1396000000.0,1719200000.0,2378300000.0,962600000.0,795000000.0,235200000.0,941800000.0,1221800000.0,646700000.0,557300000.0,563100000.0,1963600000.0,433400000.0,8468500000.0,3340100000.0,5303700000.0,8901900000.0,14205600000.0
1,1997,AL,3249700000.0,5026600000.0,7747300000.0,4278700000.0,2164500000.0,2485300000.0,6320000000.0,12481000000.0,13167000000.0,5506100000.0,4890300000.0,1088900000.0,6523000000.0,6196300000.0,2839000000.0,1846500000.0,2272800000.0,9990100000.0,813900000.0,46261500000.0,20005500000.0,29995600000.0,47075400000.0,77070900000.0


In [23]:
# Prefix every value column with 'pce_' so the measures are identifiable as
# PCE figures. The key columns are matched by exact name: a substring/regex
# test such as str.contains('year|state') would also leave unprefixed any
# measure whose label merely contains "state" or "year" (e.g. "real_estate").
key_columns = ['year', 'state']
df = df.rename(
    columns=lambda label: label if label in key_columns else 'pce_' + label
)

In [24]:
df.head(2)

pce_description,year,state,pce_clothing_and_footwear,pce_financial_services_and_insurance,pce_food_and_beverages_purchased_for_off-premises_consumption,pce_food_services_and_accommodations,pce_furnishings_and_durable_household_equipment,pce_gasoline_and_other_energy_goods,pce_gross_output_of_nonprofit_institutions,pce_health_care,pce_housing_and_utilities,pce_less:_receipts_from_sales_of_goods_and_services_by_nonprofit_institutions,pce_motor_vehicles_and_parts,pce_other_durable_goods,pce_other_nondurable_goods,pce_other_services,pce_recreation_services,pce_recreational_goods_and_vehicles,pce_transportation_services,pce_durable_goods,pce_final_consumption_expenditures_of_nonprofit_institutions_serving_households_(npishs),pce_household_consumption_expenditures_(for_services),pce_nondurable_goods,pce_goods,pce_services,pce_personal_consumption_expenditures
0,1997,AK,515600000.0,848300000.0,1450000000.0,1091000000.0,376100000.0,432700000.0,1396000000.0,1719200000.0,2378300000.0,962600000.0,795000000.0,235200000.0,941800000.0,1221800000.0,646700000.0,557300000.0,563100000.0,1963600000.0,433400000.0,8468500000.0,3340100000.0,5303700000.0,8901900000.0,14205600000.0
1,1997,AL,3249700000.0,5026600000.0,7747300000.0,4278700000.0,2164500000.0,2485300000.0,6320000000.0,12481000000.0,13167000000.0,5506100000.0,4890300000.0,1088900000.0,6523000000.0,6196300000.0,2839000000.0,1846500000.0,2272800000.0,9990100000.0,813900000.0,46261500000.0,20005500000.0,29995600000.0,47075400000.0,77070900000.0


In [25]:
# Write the cleaned PCE table; index=False leaves the row index out of the CSV.
df.to_csv('./clean/pce_state_annual_clean.csv', index=False)

### Per Capita Disposable Personal Income

In [26]:
# Load the raw per-capita disposable personal income extract. No rows are
# skipped; per the original note, the layout was checked by hand in Excel.
disposable_income_raw_path = './dirty/per_capita_disposable_personal_income_state_annual.csv'
df = pd.read_csv(disposable_income_raw_path, engine='python')

In [27]:
df.sample(5)

Unnamed: 0,GeoFips,GeoName,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
32,35000,New Mexico,18381,19083,19207,20618,21888,22911,23872,25124,26232,27614,28749,30528,30306,31189,32174,32901,31984,33976,34755,35430,36122,37859,39445,42803
34,37000,North Carolina,21205,22077,22798,23834,24168,24685,25727,27583,28859,30551,32027,33776,32800,32623,33102,35208,34127,35779,37015,37647,39146,40778,42413,45060
1,1000,Alabama,19087,20120,20685,21574,22255,23170,24304,26011,27130,28296,29371,30202,30155,31203,31966,32914,32922,33974,35255,35681,37001,38571,39962,42392
19,22000,Louisiana,19017,19901,20362,21482,22967,23936,24853,25917,27776,30793,32840,34633,33994,35296,35902,37448,37538,39080,39215,38783,40217,42255,43480,46771
48,53000,Washington,24160,25728,26537,28257,29060,29852,31116,33274,33834,35984,38250,39846,38370,38911,40106,43064,43104,45719,47366,48815,51032,53920,56332,60468


In [28]:
df.GeoName.unique().shape

(60,)

In [29]:
# Wide -> long: keep the two geography columns and stack the year columns
# into 'variable' / 'value' pairs.
df = df.melt(id_vars=['GeoFips', 'GeoName'])

In [30]:
# Replace the generic melt column names with descriptive ones
disposable_income_names = {
    'variable': 'year',
    'value': 'per_capita_disposable_personal_income',
}
df = df.rename(columns=disposable_income_names)

In [31]:
# Add the state abbreviation looked up from the full name in GeoName
df = df.assign(state=df['GeoName'].map(state_dict))

In [32]:
df.state.unique().shape

(52,)

In [33]:
# Drop the rows whose GeoName did not map to a state (e.g. regions).
# The check is restricted to `state` so that genuine state rows are kept even
# if their income value is missing; a bare dropna() would silently discard
# those rows as well.
df = df.dropna(subset=['state'])

In [34]:
# The geography columns are no longer needed once `state` exists
geo_columns = ['GeoFips', 'GeoName']
df = df.drop(geo_columns, axis=1)

In [35]:
# Write the cleaned disposable-income table; index=False leaves the row index out of the CSV.
df.to_csv('./clean/per_capita_disposable_personal_income_state_annual_clean.csv', index=False)

### Per Capita Personal Income

In [36]:
# Load the raw per-capita personal income extract. No rows are skipped;
# per the original note, the layout was checked by hand in Excel.
personal_income_raw_path = './dirty/per_capita_personal_income_state_annual.csv'
df = pd.read_csv(personal_income_raw_path, engine='python')

In [37]:
df.sample(5)

Unnamed: 0,GeoFips,GeoName,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
52,91000,New England,30459,32428,34173,37349,38866,38787,39500,41530,43321,46440,48962,50954,50011,52000,53845,55445,55140,57505,60177,61822,63907,66684,69094,73179
13,16000,Idaho,21506,22783,23688,25183,25799,26233,26728,28317,29392,31476,32707,32841,31281,32106,33627,35201,36229,37863,39622,40385,41905,43766,45741,48759
50,55000,Wisconsin,25429,27031,27933,29556,30841,31425,32148,33424,34447,36338,37816,39055,38182,39185,41262,43043,43194,44905,46800,47509,49105,51250,52918,55593
23,26000,Michigan,25990,27430,28629,30344,30749,30602,31279,32168,32946,33812,34929,35969,34307,35769,37829,39355,39696,41383,43655,44809,45983,47708,49277,53259
27,30000,Montana,20159,21461,21794,23081,23996,24554,26125,28131,30074,32193,34183,35792,34689,36398,38584,40750,40746,42591,44066,44211,46282,48021,50099,53361


In [38]:
# Single pipeline: reshape wide -> long, name the melted columns, add the
# state abbreviation from GeoName, then drop the geography columns.
df = (
    df.melt(id_vars=['GeoFips', 'GeoName'])
      .rename(columns={'variable': 'year', 'value': 'per_capita_personal_income'})
      .assign(state=lambda frame: frame['GeoName'].map(state_dict))
      .drop(columns=['GeoName', 'GeoFips'])
)

In [39]:
df.sample(5)

Unnamed: 0,year,per_capita_personal_income,state
468,2004,36378,WA
777,2009,36249,
1363,2019,49009,TN
702,2008,41168,SD
141,1999,33036,MD


In [40]:
df.isnull().sum()

year                            0
per_capita_personal_income      0
state                         264
dtype: int64

In [41]:
# Remove the rows whose GeoName did not map to a state (e.g. regions).
# Only `state` is checked so that genuine state rows survive even if their
# income value is missing; a bare dropna() would silently discard those too.
df = df.dropna(subset=['state'])

In [42]:
# Write the cleaned personal-income table; index=False leaves the row index out of the CSV.
df.to_csv('./clean/per_capita_personal_income_state_annual_clean.csv', index=False)

### Business Applications State Annual

In [43]:
# Load the monthly business-application extract. No rows are skipped;
# per the original note, the layout was checked by hand in Excel.
# Per the original note, data exists from 2005-2021.
business_apps_raw_path = './dirty/business_apps_monthly_structured.csv'
df = pd.read_csv(business_apps_raw_path, engine='python')

In [44]:
# Lookup dictionaries built from the supplemental files: the first column of
# each CSV becomes the key and the single remaining column the value.
# `DataFrame.squeeze('columns')` replaces the `squeeze=True` keyword of
# `read_csv`, which is deprecated and no longer accepted by pandas 2.x.

# NAIC Sector Codes
naic_sector_codes = (
    pd.read_csv('./supplemental_data/naic_sectors_codes.csv', skiprows=0, index_col=0)
    .squeeze('columns')
    .to_dict()
)

# NAIC Series Codes
naic_series_codes = (
    pd.read_csv('./supplemental_data/naic_series_codes.csv', skiprows=0, index_col=0)
    .squeeze('columns')
    .to_dict()
)

In [45]:
# Build one row mask from the three filters and apply it in a single step.
incomplete_years = [2022, 2004]  # years treated as incomplete
unwanted_series = ['BF_PBF4Q', 'BF_PBF8Q', 'BF_SBF4Q', 'BF_SBF8Q']

keep_rows = (
    ~df['year'].isin(incomplete_years)
    & ~df['series'].isin(unwanted_series)
    & (df['sa'] == 'A')  # seasonally adjusted data only
)
df = df[keep_rows]

In [46]:
df.tail(5)

Unnamed: 0,sa,naics_sector,series,geo,year,jan,feb,mar,apr,may,jun,jul,aug,sep,oct,nov,dec
27275,A,TOTAL,BF_BF8Q,VA,2005,1070,1090,1043,1054,1036,1088,1009,1021,1044,1130,1103,1084
27277,A,TOTAL,BF_BF8Q,WA,2005,1161,1063,1078,1060,993,1109,1162,1156,840,1230,1197,1159
27279,A,TOTAL,BF_BF8Q,WV,2005,D,161,D,148,160,187,167,D,D,201,202,172
27281,A,TOTAL,BF_BF8Q,WI,2005,525,620,609,645,606,624,622,598,467,620,593,553
27283,A,TOTAL,BF_BF8Q,WY,2005,D,D,121,120,119,125,D,D,D,D,D,D


In [47]:
# Monthly count columns, addressed by name rather than by position (5:17) so
# the step does not silently pick the wrong columns if the layout changes.
month_columns = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']

# Convert the monthly counts to numbers; entries that are not numeric (such as
# the 'D' markers visible in the data) become NaN. The result is kept in its
# own frame instead of being written back through `iloc`: `df` is a filtered
# slice at this point, so slice assignment raises SettingWithCopyWarning and,
# on newer pandas, may leave the columns with object dtype.
monthly_counts = df[month_columns].apply(pd.to_numeric, errors='coerce')

# Replace the monthly columns with a single annual total.
# min_count=12 yields NaN unless all 12 months have a value.
df = df.drop(columns=month_columns).assign(
    annual_total=monthly_counts.sum(axis=1, min_count=12)
)

In [48]:
df.sample(5)

Unnamed: 0,sa,naics_sector,series,geo,year,annual_total
10197,A,TOTAL,BA_WBA,ME,2015,2030.0
23368,A,NAICS61,BA_HBA,US,2007,16950.0
26431,A,TOTAL,BA_BA,CA,2005,274214.0
1563,A,TOTAL,BA_HBA,NO,2021,282378.0
2773,A,NAICS62,BA_HBA,US,2020,182771.0


In [49]:
# Translate the sector and series codes through the supplemental NAIC lookups
df = df.assign(
    naics_sector=df['naics_sector'].map(naic_sector_codes),
    series=df['series'].map(naic_series_codes),
)

In [50]:
# Long -> wide: one row per (sa, geo, year, naics_sector) and one column per
# series. aggfunc='first' keeps the first value found for each cell.
df = df.pivot_table(
    values='annual_total',
    index=['sa', 'geo', 'year', 'naics_sector'],
    columns='series',
    aggfunc='first',
).reset_index()

df.head(5)

series,sa,geo,year,naics_sector,Business Applications,Business Applications from Corporations,Business Formations within Eight Quarters,Business Formations within Four Quarters,High‐Propensity Business Applications BA_WBA – Business Applications with Planned Wages
0,A,AK,2005,Total for All NAICS,4770.0,1028.0,,1152.0,2708.0
1,A,AK,2006,Total for All NAICS,5218.0,1142.0,,1028.0,2961.0
2,A,AK,2007,Total for All NAICS,5197.0,1024.0,,953.0,2855.0
3,A,AK,2008,Total for All NAICS,4902.0,749.0,,824.0,2428.0
4,A,AK,2009,Total for All NAICS,4452.0,607.0,,742.0,2122.0


In [51]:
df.head(5)

series,sa,geo,year,naics_sector,Business Applications,Business Applications from Corporations,Business Formations within Eight Quarters,Business Formations within Four Quarters,High‐Propensity Business Applications BA_WBA – Business Applications with Planned Wages
0,A,AK,2005,Total for All NAICS,4770.0,1028.0,,1152.0,2708.0
1,A,AK,2006,Total for All NAICS,5218.0,1142.0,,1028.0,2961.0
2,A,AK,2007,Total for All NAICS,5197.0,1024.0,,953.0,2855.0
3,A,AK,2008,Total for All NAICS,4902.0,749.0,,824.0,2428.0
4,A,AK,2009,Total for All NAICS,4452.0,607.0,,742.0,2122.0


In [52]:
# Write the cleaned business-applications table; index=False leaves the row index out of the CSV.
df.to_csv('./clean/business_apps_state_annual_clean.csv', index=False)

### Poverty Rate

In [53]:
# Load the raw poverty-rate extract. No rows are skipped; per the original
# note, the layout was checked by hand in Excel.
poverty_raw_path = './dirty/poverty_rate_state_annual.csv'
df = pd.read_csv(poverty_raw_path, engine='python')

In [54]:
df.sample(2)

Unnamed: 0,STATE,Percent,Year,Unnamed: 3
500,South Dakota,14.5,2011,
243,Rhode Island,11.4,2016,


In [55]:
# Single pipeline for the poverty-rate clean-up:
#   - drop the unused 'Unnamed: 3' column
#   - divide the percentage by 100 so it is stored as a fraction
#   - replace full state names with their abbreviations
#   - lower-case the column labels and give the rate its final name
df = (
    df.drop(columns='Unnamed: 3')
      .assign(
          Percent=lambda frame: frame['Percent'] / 100,
          STATE=lambda frame: frame['STATE'].map(state_dict),
      )
      .rename(columns=str.lower)
      .rename(columns={'percent': 'poverty_rate'})
)

In [56]:
df.sample(2)

Unnamed: 0,state,poverty_rate,year
627,IA,0.1,2008
798,NC,0.13,2005


In [57]:
# Write the cleaned poverty-rate table; index=False leaves the row index out of the CSV.
df.to_csv('./clean/poverty_rate_state_annual_clean.csv', index=False)

# Education Data

### Total Spending per Student

In [58]:
# Load the raw per-pupil spending extract, skipping the first 6 lines and the
# last 7 lines of the file (skipfooter is only supported by the python
# engine). Per the original note, the skip counts were checked by hand in Excel.
spending_raw_path = './dirty/tot_spend_student_state_annual.csv'
df = pd.read_csv(spending_raw_path, engine='python', skiprows=6, skipfooter=7)

In [59]:
# Each year column header carries this long measure label; remove it (literal
# match) so that only the remainder of the header is left. The label has no
# trailing space, so any space that followed it in a header is kept.
measure_label = 'Total Current Expenditures for Public El-Sec (TE5) per Pupil (MEMBR) [State Finance]'
df = df.rename(columns=lambda header: header.replace(measure_label, ''))

In [60]:
df.sample(1)

Unnamed: 0,State Name,2018-19,2017-18,2016-17,2015-16,2014-15,2013-14,2012-13,2011-12,2010-11,2009-10,2008-09,2007-08,2006-07,2005-06,2004-05,2003-04,2002-03,2001-02,2000-01,1999-00,1998-99,1997-98
18,LOUISIANA,11920,11636,11379,11169,11106,10853,10539,10726,10799,10701,10625,10006,8937,8486,7669,7271,6922,6567,6037,5804,5548,5187


In [61]:
df['State Name'].unique().shape

(51,)

In [62]:
# reshape dataframe: wide -> long, one row per state and school-year column
df = pd.melt(df, id_vars=['State Name'])

# rename columns
df = df.rename(columns={'State Name':'state', 'variable':'start_year','value':'per_pupil_expenditure'})

# Keep only the 4-digit start year of the school-year label ("2018-19" -> "2018").
# The label is stripped first because removing the measure label from the
# headers can leave surrounding whitespace; a fixed-width slice on the raw
# label would then capture a leading space (" 2018") rather than the year.
df['start_year'] = df['start_year'].str.strip().str[:4]

# Upper-cased copy of the state lookup: this file spells state names in
# capitals. Kept under the same name because later cells reuse it.
state_dict_upper = {name.upper(): abbr.upper() for name, abbr in state_dict.items()}

# map state name to state abbreviation
df['state'] = df['state'].map(state_dict_upper)

In [63]:
# Check output
df.head(5)

Unnamed: 0,state,start_year,per_pupil_expenditure
0,AL,2018,10107
1,AK,2018,18393
2,AZ,2018,8773
3,AR,2018,10412
4,CA,2018,13831


In [64]:
df.state.unique().shape

(51,)

In [65]:
# Write the cleaned per-pupil spending table; index=False leaves the row index out of the CSV.
df.to_csv('./clean/tot_spend_student_state_annual_clean.csv', index=False)

### Total Revenue Per Student

In [66]:
# Load the raw per-pupil revenue extract, skipping no leading lines and the
# last 7 lines of the file (skipfooter is only supported by the python
# engine). Per the original note, the skip counts were checked by hand in Excel.
revenue_raw_path = './dirty/tot_rev_per_student_state_annual.csv'
df = pd.read_csv(revenue_raw_path, engine='python', skiprows=0, skipfooter=7)

In [67]:
df.head(5)

Unnamed: 0,State Name,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2018-19,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2017-18,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2016-17,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2015-16,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2014-15,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2013-14,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2012-13,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2011-12,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2010-11,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2009-10,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2008-09,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2007-08,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2006-07,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2005-06,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2004-05,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2003-04,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2002-03,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2001-02,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 2000-01,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 1999-00,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 1998-99,Total Revenues (TR) per Pupil (MEMBR) [State Finance] 1997-98
0,ALABAMA,11415,10792,10590,10200,9992,9913,9653,9534,9776,9667,9708,10356,9548,8555,8028,7349,6971,6956,6503,6523,5975,5535
1,ALASKA,19553,19038,18897,18851,22379,20447,20312,19034,18699,17759,17319,17471,14304,12849,12632,11576,10928,10717,10275,10118,9532,9222
2,ARIZONA,10396,9697,9214,9293,8995,8703,8616,8613,9111,9343,9002,9457,9023,8071,7814,7550,7791,7214,7071,6455,5988,5812
3,ARKANSAS,11733,11592,11388,10939,10762,10478,10391,10939,10939,10738,10072,9758,9362,9031,8712,7542,7243,7112,6250,6054,5772,5697
4,CALIFORNIA,16014,14535,13965,13606,11786,10985,10481,10590,10790,10239,11180,11228,10857,9909,9234,8980,8975,8363,8306,7465,6750,6572


In [68]:
# Measure label (including its trailing space) that prefixes every year column
revenue_label = 'Total Revenues (TR) per Pupil (MEMBR) [State Finance] '

# Single pipeline for the revenue clean-up:
#   - remove the measure label from the headers (literal match)
#   - reshape wide -> long and name the resulting columns
#   - replace upper-case state names with abbreviations
#   - keep the 4-digit start year of each school-year label
df = (
    df.rename(columns=lambda header: header.replace(revenue_label, ''))
      .melt(id_vars=['State Name'])
      .rename(columns={'State Name': 'state', 'variable': 'start_year', 'value': 'per_pupil_revenue'})
      .assign(
          state=lambda frame: frame['state'].map(state_dict_upper),
          start_year=lambda frame: frame['start_year'].str[:4],
      )
)

In [69]:
# Sample output
df.sample(10)

Unnamed: 0,state,start_year,per_pupil_revenue
288,NC,2013,8573
461,AZ,2009,9343
802,OR,2003,9281
266,HI,2013,14434
902,OH,2001,9636
349,TX,2012,9858
958,SC,2000,8059
1012,TX,1999,7179
444,OK,2010,8901
819,AR,2002,7243


In [70]:
# Write the cleaned per-pupil revenue table; index=False leaves the row index out of the CSV.
df.to_csv('./clean/tot_revenue_student_state_annual_clean.csv', index=False)

### Map State Dictionary to Population Data

In [71]:
# Load the raw annual population-by-state file
population_raw_path = './dirty/population_annual_state.csv'
df = pd.read_csv(population_raw_path)

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1581 entries, 0 to 1580
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   state       1581 non-null   object
 1   year        1581 non-null   int64 
 2   population  1581 non-null   object
dtypes: int64(1), object(2)
memory usage: 37.2+ KB


In [73]:
# Population is read as text with thousands separators; remove the commas
# (literal match) and convert the result to integers.
population_text = df['population'].str.replace(',', '', regex=False)
df['population'] = population_text.astype(int)

In [74]:
df.head(5)

Unnamed: 0,state,year,population
0,Alabama,1990,4050055
1,Alaska,1990,553290
2,Arizona,1990,3684097
3,Arkansas,1990,2356586
4,California,1990,29959515


In [75]:
df.state.unique().shape

(51,)

In [76]:
# Collect any names missing from the lookup BEFORE mapping. `map` turns them
# into NaN, and NaN counts as one value in `unique()`, so comparing the number
# of unique values before and after cannot reliably reveal a failed mapping.
unmapped_states = sorted(set(df['state'].dropna()) - set(state_dict))

# Replace full state names with their abbreviations
df['state'] = df['state'].map(state_dict)

# Surface unmapped names instead of letting them pass silently as NaN
if unmapped_states:
    print(f'WARNING: no abbreviation found for: {unmapped_states}')

In [77]:
df.state.unique().shape

(51,)

Map good, 51/51

In [78]:
# Write the cleaned population table; index=False leaves the row index out of the CSV.
df.to_csv('./clean/population_state_annual_clean.csv', index=False)