# Get Data Notebook
This notebook us used to create functions for reproducibility of getting and combining data from Census and NCES

# Module Imports

In [49]:
## Import modules necessary for getting data from API and http:

import pandas as pd #standard
import numpy as np #standard
import matplotlib.pyplot as plt #for visualization
import seaborn as sns #for visualization
import requests #for APIs
import json #for APIS


# Variable Definitions

Here, we define several variables to simplify the reproduction of calling this API. 

In [15]:
## Get list of state codes from census website

state_list = pd.read_csv("https://www2.census.gov/geo/docs/reference/state.txt", sep="|")
state_list = pd.DataFrame(state_list[["STATE","STUSAB","STATE_NAME"]])
state_list.columns = map(str.lower, state_list.columns)
state_list.state = state_list.state.apply(str)
state_list.state = state_list.state.str.zfill(2)
state_list = state_list[:51]

In [16]:
group_list = ['DP02', 'DP03', 'DP04', 'DP05']
year_list = ['2013', '2014', '2015', '2016', '2017', '2018']
geo_list = ['school%20district%20(unified)', 'school%20district%20(elementary)', 'school%20district%20(secondary)']
my_key = input('Input Personal Census API Key')

Input Personal Census API Key 2b07e25a69e507e080faa2c31f9da3b42d178b4e


# Download Data

## Census Data
The function below is designed to iterate over the variables documented above. This will return what the Census calls "Data Profiles" for each state.

In [19]:
## Function to call Census API

# Loop over variables defined
for year in year_list:
    year = str(year)

    
    for group in group_list:
        group = str(group)

        geo = 'school%20district%20(unified)'


        group_states_info = []
        for state in state_list['state']:  
            state = str(state)

            get_acs_data = requests.get('https://api.census.gov/data/{year}/acs/acs5/profile?get=group({group})&for={geo}:*&in=state:{stateID}&key={key}'
                                        .format(year=year, group=group, geo=geo, stateID=state, key=my_key))
            acs_content = json.loads(get_acs_data.content)
            state_census_info = pd.DataFrame(data=acs_content)
            state_census_info.columns = state_census_info.iloc[0]
            state_census_info = state_census_info[1:]
            state_census_info['GEO_ID'] = state_census_info['GEO_ID'].str.lstrip('9700000US').str.zfill(7)
            state_census_info.rename(columns={'GEO_ID':'LEAID'}, inplace=True)
            group_states_info.append(state_census_info) 

        group_dp = pd.concat(group_states_info)
        group_dp.to_pickle('ACS_5yr_{group}_{year}.pkl'.format(group=group, year=year))
        print(group_dp.head)

<bound method NDFrame.head of 0     LEAID DP02_0001E DP02_0001M DP02_0001PE DP02_0001PM DP02_0002E  \
1   0100001       1167        134        1167  -888888888        919   
2   0100003        282         41         282  -888888888        255   
3   0100005       7079        348        7079  -888888888       5020   
4   0100006      17650        428       17650  -888888888      12944   
5   0100007      32026        685       32026  -888888888      21939   
..      ...        ...        ...         ...         ...        ...   
45  5605820        362         75         362  -888888888        227   
46  5605830       7583        445        7583  -888888888       4532   
47  5606090        670        126         670  -888888888        431   
48  5606240       3106        155        3106  -888888888       1970   
49  5699997        225         87         225  -888888888         43   

0  DP02_0002M DP02_0002PE DP02_0002PM DP02_0003E  ... DP02_0151EA DP02_0151MA  \
1         132        78.

KeyError: 'GEO_ID'

# Import Educational Data

In [50]:
## Create new variables for each file created
## Merge datasets together

dp02 = pd.read_pickle('ACS_5yr_DP02_2018.pkl')
dp03 = pd.read_pickle('ACS_5yr_DP03_2018.pkl')
dp04 = pd.read_pickle('ACS_5yr_DP04_2018.pkl')
dp05 = pd.read_pickle('ACS_5yr_DP05_2018.pkl')
ACS_2018A = pd.merge(dp02, dp03, on='LEAID', how='inner')
ACS_2018B = pd.merge(dp04, dp05, on='LEAID', how='inner')
ACS_2018 = pd.merge(ACS_2018A, ACS_2018B, on='LEAID', how='inner')
ACS_2018.head()

KeyboardInterrupt: 

In [31]:
## Save this file to a pickle for later reference / calling
ACS_2018.to_pickle('../data/acs_2018.pkl')
ACS_2018.head()

Unnamed: 0,LEAID,DP02_0001E,DP02_0001M,DP02_0001PE,DP02_0001PM,DP02_0002E,DP02_0002M,DP02_0002PE,DP02_0002PM,DP02_0003E,...,DP05_0029PEA,DP05_0030MA,DP05_0030EA,DP05_0030PMA,DP05_0030PEA,DP05_0031MA,DP05_0031EA,DP05_0031PEA,state_y_y,school district (unified)_y_y
0,102650,11557,430,11557,-888888888,7879,368,68.2,3.4,2803,...,,,,,,,,,1,2650
1,102670,3079,238,3079,-888888888,1588,256,51.6,7.8,451,...,,*****,,*****,,*****,,,1,2670
2,102700,14462,421,14462,-888888888,8813,424,60.9,3.1,4413,...,,,,,,,,,1,2700
3,102730,7530,262,7530,-888888888,4935,279,65.5,3.1,1688,...,,,,,,,,,1,2730
4,102760,2671,205,2671,-888888888,1839,167,68.9,4.6,726,...,,,,,,,,,1,2760


In [44]:
## Read pickle for assessment
## Change column to LEAID for later

assessment_2018 = pd.read_pickle('../data/assessment_2018.pkl')
assessment_2018.rename(columns={'leaid':'LEAID'}, inplace=True)
assessment_2018.head()

Unnamed: 0,STNAM_x,FIPST_x,LEAID,ST_LEAID_x,LEANM_x,DATE_CUR_x,ALL_MTH00NUMVALID_1718,ALL_MTH00PCTPROF_1718,MAM_MTH00NUMVALID_1718,MAM_MTH00PCTPROF_1718,...,MIL_RLA05NUMVALID_1718,MIL_RLA05PCTPROF_1718,MIL_RLA06NUMVALID_1718,MIL_RLA06PCTPROF_1718,MIL_RLA07NUMVALID_1718,MIL_RLA07PCTPROF_1718,MIL_RLA08NUMVALID_1718,MIL_RLA08PCTPROF_1718,MIL_RLAHSNUMVALID_1718,MIL_RLAHSPCTPROF_1718
0,ALABAMA,1,100005,AL-101,Albertville City,27MAR19,2892,45,5,PS,...,14,LT50,5.0,PS,11,GE50,14,GE50,10,GE50
1,ALABAMA,1,100006,AL-048,Marshall County,27MAR19,2987,43,12,GE50,...,1,PS,,,3,PS,2,PS,3,PS
2,ALABAMA,1,100007,AL-158,Hoover City,27MAR19,7551,70,5,PS,...,11,GE50,8.0,GE50,6,GE50,12,GE50,20,40-59
3,ALABAMA,1,100008,AL-169,Madison City,27MAR19,5668,76,47,60-69,...,34,80-89,38.0,70-79,42,70-79,51,70-79,44,60-69
4,ALABAMA,1,100011,AL-167,Leeds City,27MAR19,1017,40,2,PS,...,6,LT50,,,2,PS,4,PS,3,PS


In [51]:
## Set the index for using LEAID as the LEAID, since it doesn't tell anything specifically
assessment_2018.set_index('LEAID')

Unnamed: 0_level_0,STNAM_x,FIPST_x,ST_LEAID_x,LEANM_x,DATE_CUR_x,ALL_MTH00NUMVALID_1718,ALL_MTH00PCTPROF_1718,MAM_MTH00NUMVALID_1718,MAM_MTH00PCTPROF_1718,MAS_MTH00NUMVALID_1718,...,MIL_RLA05NUMVALID_1718,MIL_RLA05PCTPROF_1718,MIL_RLA06NUMVALID_1718,MIL_RLA06PCTPROF_1718,MIL_RLA07NUMVALID_1718,MIL_RLA07PCTPROF_1718,MIL_RLA08NUMVALID_1718,MIL_RLA08PCTPROF_1718,MIL_RLAHSNUMVALID_1718,MIL_RLAHSPCTPROF_1718
LEAID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0100005,ALABAMA,1,AL-101,Albertville City,27MAR19,2892,45,5,PS,18,...,14,LT50,5,PS,11,GE50,14,GE50,10,GE50
0100006,ALABAMA,1,AL-048,Marshall County,27MAR19,2987,43,12,GE50,9,...,1,PS,,,3,PS,2,PS,3,PS
0100007,ALABAMA,1,AL-158,Hoover City,27MAR19,7551,70,5,PS,531,...,11,GE50,8,GE50,6,GE50,12,GE50,20,40-59
0100008,ALABAMA,1,AL-169,Madison City,27MAR19,5668,76,47,60-69,530,...,34,80-89,38,70-79,42,70-79,51,70-79,44,60-69
0100011,ALABAMA,1,AL-167,Leeds City,27MAR19,1017,40,2,PS,9,...,6,LT50,,,2,PS,4,PS,3,PS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5513770,WYOMING,56,WY-1902000,Sweetwater County School District #2,27MAR19,1629,56,10,GE50,5,...,,,,,,,,,,
5513800,WYOMING,56,WY-2202000,Washakie County School District #2,27MAR19,58,60-69,1,PS,,...,,,,,,,,,,
5513830,WYOMING,56,WY-2001000,Teton County School District #1,27MAR19,1773,55,6,LT50,34,...,,,,,,,,,,
5513860,WYOMING,56,WY-2307000,Weston County School District #7,27MAR19,145,40-44,,,,...,,,,,,,,,,


In [32]:
poverty_2018 = pd.read_pickle('../data/poverty_2018.pkl')
poverty_2018.head()

Unnamed: 0_level_0,SAEPOVRAT5_17RV_PT
LEAID,Unnamed: 1_level_1
100001,12.4
100003,15.2
100005,30.1
100006,26.6
100007,7.3


In [39]:
acs_5yr_2018 = pd.merge(ACS_2018, poverty_2018, on="LEAID", how="inner")
acs_5yr_2018.set_index('LEAID')

Unnamed: 0_level_0,DP02_0001E,DP02_0001M,DP02_0001PE,DP02_0001PM,DP02_0002E,DP02_0002M,DP02_0002PE,DP02_0002PM,DP02_0003E,DP02_0003M,...,DP05_0030MA,DP05_0030EA,DP05_0030PMA,DP05_0030PEA,DP05_0031MA,DP05_0031EA,DP05_0031PEA,state_y_y,school district (unified)_y_y,SAEPOVRAT5_17RV_PT
LEAID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0102650,11557,430,11557,-888888888,7879,368,68.2,3.4,2803,335,...,,,,,,,,01,02650,16.5
0102670,3079,238,3079,-888888888,1588,256,51.6,7.8,451,152,...,*****,,*****,,*****,,,01,02670,49
0102700,14462,421,14462,-888888888,8813,424,60.9,3.1,4413,399,...,,,,,,,,01,02700,32.3
0102730,7530,262,7530,-888888888,4935,279,65.5,3.1,1688,199,...,,,,,,,,01,02730,29.4
0102760,2671,205,2671,-888888888,1839,167,68.9,4.6,726,127,...,,,,,,,,01,02760,27.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5605762,4852,230,4852,-888888888,3489,227,71.9,3.3,1637,214,...,,,,,,,,56,05762,7.7
5605820,327,66,327,-888888888,221,48,67.6,9.6,77,27,...,,,,,,,,56,05820,8.9
5605830,9158,486,9158,-888888888,5305,452,57.9,4.3,2254,366,...,,,,,,,,56,05830,5.8
5606090,698,96,698,-888888888,394,66,56.4,8.4,153,66,...,,,,,,,,56,06090,5.9


In [48]:
complete_df = pd.merge(acs_5yr_2018, assessment_2018, on='LEAID', how='inner')
complete_df.to_pickle('../data/complete_df.pkl')
complete_df.head()

Unnamed: 0,LEAID,DP02_0001E,DP02_0001M,DP02_0001PE,DP02_0001PM,DP02_0002E,DP02_0002M,DP02_0002PE,DP02_0002PM,DP02_0003E,...,MIL_RLA05NUMVALID_1718,MIL_RLA05PCTPROF_1718,MIL_RLA06NUMVALID_1718,MIL_RLA06PCTPROF_1718,MIL_RLA07NUMVALID_1718,MIL_RLA07PCTPROF_1718,MIL_RLA08NUMVALID_1718,MIL_RLA08PCTPROF_1718,MIL_RLAHSNUMVALID_1718,MIL_RLAHSPCTPROF_1718
0,102650,11557,430,11557,-888888888,7879,368,68.2,3.4,2803,...,4.0,PS,2.0,PS,1.0,PS,4.0,PS,0,
1,102670,3079,238,3079,-888888888,1588,256,51.6,7.8,451,...,1.0,PS,1.0,PS,,,1.0,PS,1,PS
2,102700,14462,421,14462,-888888888,8813,424,60.9,3.1,4413,...,10.0,GE50,9.0,LT50,8.0,GE50,2.0,PS,10,LT50
3,102730,7530,262,7530,-888888888,4935,279,65.5,3.1,1688,...,,,2.0,PS,4.0,PS,4.0,PS,4,PS
4,102760,2671,205,2671,-888888888,1839,167,68.9,4.6,726,...,,,,,,,,,1,PS
