In [1]:
import pandas as pd
import numpy as np

In [2]:
## Load the three raw tables (states, NAEP scores, districts) from CSV files in the working directory.
states, naep, districts = (
    pd.read_csv(csv_name)
    for csv_name in ('states.csv', 'naep.csv', 'districts.csv')
)

In [3]:
## List the column names of the states table.
states.columns

Index(['STATE', 'YEAR', 'ENROLL', 'TOTAL_REVENUE', 'FEDERAL_REVENUE',
       'STATE_REVENUE', 'LOCAL_REVENUE', 'TOTAL_EXPENDITURE',
       'INSTRUCTION_EXPENDITURE', 'SUPPORT_SERVICES_EXPENDITURE',
       'OTHER_EXPENDITURE', 'CAPITAL_OUTLAY_EXPENDITURE'],
      dtype='object')

In [4]:
## List the column names of the NAEP table.
naep.columns

Index(['YEAR', 'STATE', 'AVG_SCORE', 'TEST_SUBJECT', 'TEST_YEAR'], dtype='object')

In [5]:
## List the column names of the districts table.
districts.columns

Index(['STATE', 'ENROLL', 'NAME', 'YRDATA', 'TOTALREV', 'TFEDREV', 'TSTREV',
       'TLOCREV', 'TOTALEXP', 'TCURINST', 'TCURSSVC', 'TCURONON', 'TCAPOUT'],
      dtype='object')

In [13]:
## Dimensions of the districts table as (rows, columns).
districts.shape

(374161, 13)

In [6]:
## Count the missing values in each column of the districts table.
districts.isna().sum()

STATE           0
ENROLL      15868
NAME            0
YRDATA          0
TOTALREV        0
TFEDREV         0
TSTREV          0
TLOCREV         0
TOTALEXP        0
TCURINST        0
TCURSSVC        0
TCURONON    15868
TCAPOUT         0
dtype: int64

In [7]:
## The dataset author mentioned that there is no enrollment data for 1992,
## so isolate the 1992 district rows to compare against the null counts.
enroll92 = districts.loc[districts['YRDATA'].eq(1992)]

In [8]:
## Number of 1992 district rows; compare it with the count of missing (NaN) values reported above.
enroll92.shape[0]

15868

In [9]:
## Consider excluding the 1992 data. Check the earliest year present: if 1992 is the first year,
## dropping it will not create a "hole" in the data.
districts['YRDATA'].min()

1992

In [10]:
## Exclude 1992: keep only the district rows whose YRDATA is later than 1992.
df1 = districts.loc[districts['YRDATA'].gt(1992)]

In [14]:
## Re-count missing values per column after excluding 1992.
df1.isna().sum()

STATE       0
ENROLL      0
NAME        0
YRDATA      0
TOTALREV    0
TFEDREV     0
TSTREV      0
TLOCREV     0
TOTALEXP    0
TCURINST    0
TCURSSVC    0
TCURONON    0
TCAPOUT     0
dtype: int64

In [15]:
## Before joining districts with states, count the missing values per column in the states table.
states.isna().sum()

STATE                            0
YEAR                             0
ENROLL                          51
TOTAL_REVENUE                    0
FEDERAL_REVENUE                  0
STATE_REVENUE                    0
LOCAL_REVENUE                    0
TOTAL_EXPENDITURE                0
INSTRUCTION_EXPENDITURE          0
SUPPORT_SERVICES_EXPENDITURE     0
OTHER_EXPENDITURE               51
CAPITAL_OUTLAY_EXPENDITURE       0
dtype: int64

In [20]:
## Exclude 1992 from the states table too: keep only rows whose YEAR is later than 1992.
df2 = states.loc[states['YEAR'].gt(1992)]

In [22]:
## Attach the state table's columns to every district row, matching on state and year
## (YRDATA on the district side, YEAR on the state side). A left join keeps every row of df1.
## validate='many_to_one' makes pandas raise a MergeError if a (STATE, YEAR) pair appears
## more than once in df2, instead of silently duplicating district rows.
merge1 = df1.merge(
    df2,
    how='left',
    left_on=['STATE', 'YRDATA'],
    right_on=['STATE', 'YEAR'],
    validate='many_to_one',
)

In [23]:
## Preview the first five merged rows, transposed so that each column is shown as a row.
merge1.head().T

Unnamed: 0,0,1,2,3,4
STATE,Alabama,Alabama,Alabama,Alabama,Alabama
ENROLL_x,9609,30931,912,2842,3322
NAME,AUTAUGA COUNTY SCHOOL DISTRICT,BALDWIN COUNTY SCHOOL DISTRICT,BARBOUR COUNTY SCHOOL DISTRICT,EUFAULA CITY SCHOOL DISTRICT,BIBB COUNTY SCHOOL DISTRICT
YRDATA,2016,2016,2016,2016,2016
TOTALREV,80867,338236,10116,26182,32486
TFEDREV,7447,23710,2342,3558,3664
TSTREV,53842,145180,5434,15900,21846
TLOCREV,19578,169346,2340,6724,6976
TOTALEXP,76672,299880,10070,29843,31662
TCURINST,43843,164977,4907,15302,16407


In [24]:
## Count missing values per column after the join; nulls in the state-side columns
## would indicate district rows that found no matching state row.
merge1.isna().sum()

STATE                           0
ENROLL_x                        0
NAME                            0
YRDATA                          0
TOTALREV                        0
TFEDREV                         0
TSTREV                          0
TLOCREV                         0
TOTALEXP                        0
TCURINST                        0
TCURSSVC                        0
TCURONON                        0
TCAPOUT                         0
YEAR                            0
ENROLL_y                        0
TOTAL_REVENUE                   0
FEDERAL_REVENUE                 0
STATE_REVENUE                   0
LOCAL_REVENUE                   0
TOTAL_EXPENDITURE               0
INSTRUCTION_EXPENDITURE         0
SUPPORT_SERVICES_EXPENDITURE    0
OTHER_EXPENDITURE               0
CAPITAL_OUTLAY_EXPENDITURE      0
dtype: int64

In [25]:
## Joining NAEP data: start by checking the table's dimensions as (rows, columns).
naep.shape

(2305, 5)

In [26]:
## Count the missing values in each column of the NAEP table.
naep.isna().sum()

YEAR            0
STATE           0
AVG_SCORE       0
TEST_SUBJECT    0
TEST_YEAR       0
dtype: int64

In [44]:
## Distinct values of YEAR present in the NAEP table.
naep['YEAR'].unique()

array([2017, 2015, 2013, 2011, 2009, 2007, 2005, 2003, 2000, 1996, 1992,
       1990, 2002, 1998, 1994], dtype=int64)

In [68]:
## Number of non-null AVG_SCORE entries for each (YEAR, STATE) pair, as a one-column DataFrame.
## This shows whether (YEAR, STATE) uniquely identifies a NAEP row.
unique = naep.groupby(['YEAR', 'STATE'])['AVG_SCORE'].count().to_frame()

In [70]:
## Peek at the first few per-(YEAR, STATE) counts.
unique.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,AVG_SCORE
YEAR,STATE,Unnamed: 2_level_1
1990,Alabama,1
1990,Alaska,1
1990,Arizona,1
1990,Arkansas,1
1990,California,1


In [71]:
## Largest counts first: shows how many NAEP rows a single (YEAR, STATE) pair can have.
unique.sort_values('AVG_SCORE', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,AVG_SCORE
YEAR,STATE,Unnamed: 2_level_1
2003,Nebraska,4
2009,Arkansas,4
2007,Vermont,4
2007,Virginia,4
2007,Washington,4
2007,West Virginia,4
2007,Wisconsin,4
2007,Wyoming,4
2009,Alabama,4
2009,Alaska,4


In [31]:
## Keep only the NAEP rows whose YEAR is later than 1992.
df3 = naep.loc[naep['YEAR'].gt(1992)]

In [34]:
## Left-join the NAEP rows onto the district/state table by STATE and YEAR.
## NOTE(review): the per-(YEAR, STATE) counts computed earlier in this notebook go up to 4,
## so a district row is repeated once per matching NAEP row. Rows whose (STATE, YEAR)
## has no NAEP entry get NaN in the NAEP columns. Confirm this fan-out is intended.
merge2 = merge1.merge(df3, how='left', on=['STATE', 'YEAR'])

In [36]:
## Preview the first five rows of the second merge, transposed so that each column is shown as a row.
merge2.head().T

Unnamed: 0,0,1,2,3,4
STATE,Alabama,Alabama,Alabama,Alabama,Alabama
ENROLL_x,9609,30931,912,2842,3322
NAME,AUTAUGA COUNTY SCHOOL DISTRICT,BALDWIN COUNTY SCHOOL DISTRICT,BARBOUR COUNTY SCHOOL DISTRICT,EUFAULA CITY SCHOOL DISTRICT,BIBB COUNTY SCHOOL DISTRICT
YRDATA,2016,2016,2016,2016,2016
TOTALREV,80867,338236,10116,26182,32486
TFEDREV,7447,23710,2342,3558,3664
TSTREV,53842,145180,5434,15900,21846
TLOCREV,19578,169346,2340,6724,6976
TOTALEXP,76672,299880,10070,29843,31662
TCURINST,43843,164977,4907,15302,16407


In [61]:
## Build the final table: drop YRDATA (presumably redundant with YEAR after the join; TODO confirm)
## and give the two suffixed enrollment columns descriptive names.
df_fin = (
    merge2
    .drop(columns='YRDATA')
    .rename(columns={'ENROLL_x': 'District_Enrollment', 'ENROLL_y': 'State_Enrollment'})
)

In [62]:
## List the columns of the final combined table.
df_fin.columns

Index(['STATE', 'District_Enrollment', 'NAME', 'TOTALREV', 'TFEDREV', 'TSTREV',
       'TLOCREV', 'TOTALEXP', 'TCURINST', 'TCURSSVC', 'TCURONON', 'TCAPOUT',
       'YEAR', 'State_Enrollment', 'TOTAL_REVENUE', 'FEDERAL_REVENUE',
       'STATE_REVENUE', 'LOCAL_REVENUE', 'TOTAL_EXPENDITURE',
       'INSTRUCTION_EXPENDITURE', 'SUPPORT_SERVICES_EXPENDITURE',
       'OTHER_EXPENDITURE', 'CAPITAL_OUTLAY_EXPENDITURE', 'AVG_SCORE',
       'TEST_SUBJECT', 'TEST_YEAR'],
      dtype='object')