In [1]:
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


from urllib.request import urlopen
from json import loads
from functools import reduce

# Years to request from the API, kept as strings because they are
# interpolated directly into URL paths.
valid_years = [str(year) for year in range(2012, 2017)]

def get_data_frames(urls):
    '''
    Fetch every page of every URL and combine the results into one dataframe.

    Each response is parsed as a JSON object whose "results" entry holds the
    records of that page and whose "next" entry holds the URL of the
    following page. Paging for a URL stops when "next" is missing or falsy.

    Parameters:
        urls: iterable of starting URLs, one per paginated collection.

    Returns:
        A dataframe concatenating the "results" of all fetched pages
        (row index is not reset; columns are sorted by pd.concat).
        An empty dataframe when no page was fetched at all.
    '''
    dfs = []
    for url in urls:
        while url:
            print("Fetching data from", url)
            # Close the connection as soon as the page has been read instead
            # of leaving one open handle per page behind.
            with urlopen(url) as response:
                data = loads(response.read())
            dfs.append(pd.DataFrame.from_dict(data["results"]))
            # A response without a "next" entry is treated as the last page.
            url = data.get("next")
            print("Next url:", url)
    if not dfs:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()
    return pd.concat(dfs, sort=True)

In [None]:
'''
Unfortunately, the API does not have years past 2016.
The following loads district-level financial data, one request series per
year in valid_years. "leaid", the local education agency identification
number (NCES), is the common identifier.
'''
finance_urls = list(map(
    "https://educationdata.urban.org/api/v1/school-districts/ccd/finance/{}/".format,
    valid_years,
))
finance_df = get_data_frames(finance_urls)
finance_df.head()


Fetching data from https://educationdata.urban.org/api/v1/school-districts/ccd/finance/2012/
Next url: https://educationdata.urban.org/api/v1/school-districts/ccd/finance/2012/?page=2
Fetching data from https://educationdata.urban.org/api/v1/school-districts/ccd/finance/2012/?page=2
Next url: https://educationdata.urban.org/api/v1/school-districts/ccd/finance/2012/?page=3
Fetching data from https://educationdata.urban.org/api/v1/school-districts/ccd/finance/2012/?page=3
Next url: https://educationdata.urban.org/api/v1/school-districts/ccd/finance/2012/?page=4
Fetching data from https://educationdata.urban.org/api/v1/school-districts/ccd/finance/2012/?page=4
Next url: https://educationdata.urban.org/api/v1/school-districts/ccd/finance/2012/?page=5
Fetching data from https://educationdata.urban.org/api/v1/school-districts/ccd/finance/2012/?page=5
Next url: https://educationdata.urban.org/api/v1/school-districts/ccd/finance/2012/?page=6
Fetching data from https://educationdata.urban.org/a

In [None]:
'''
Unfortunately, the API does not have years past 2016.
The following loads district-level student poverty data, one request series
per year in valid_years. "leaid", the local education agency identification
number (NCES), is the common identifier.
'''

poverty_urls = ["https://educationdata.urban.org/api/v1/school-districts/saipe/{}/".format(yr) for yr in valid_years]
# Fetch from the poverty (SAIPE) URLs built above; the finance URLs were
# previously passed here by mistake, which made poverty_df a copy of the
# finance data.
poverty_df = get_data_frames(poverty_urls)
poverty_df.head()

In [None]:
'''
Unfortunately, the API does not have years past 2016.
And states: ['AL' 'AK' 'AZ' 'IL' 'CA' 'MI' 'MD' 'MN' 'TX' 'AR' 'NV']
(NOTE(review): presumably the states observed in the loaded data -- confirm.)
The following loads geographic (directory) data, one request series per year
in valid_years. "leaid", the local education agency identification number
(NCES), is the common identifier.
'''

geographic_urls = ["https://educationdata.urban.org/api/v1/school-districts/ccd/directory/{}/".format(yr) for yr in valid_years]
# Fetch from the directory URLs built above; the finance URLs were previously
# passed here by mistake, so geographic_df never contained directory columns
# such as 'state_mailing'.
geographic_df = get_data_frames(geographic_urls)
geographic_df.head()

In [None]:
# Join finance and directory rows for the same district AND the same year.
# Joining on "leaid" alone pairs every finance year of a district with every
# directory year of that district, so each revenue figure would be repeated
# once per directory year and per-year sums would be inflated.
# The year columns are pre-named "year_x"/"year_y" so the merged frame keeps
# the column names a plain merge would have produced (later cells rename
# 'year_y'). Assumes both frames carry a "year" column, as that 'year_y'
# usage implies.
df = pd.merge(
    finance_df.rename(columns={"year": "year_x"}),
    geographic_df.rename(columns={"year": "year_y"}),
    left_on=["leaid", "year_x"],
    right_on=["leaid", "year_y"],
)
df.head()

In [None]:
'''
Sum 'rev_total' (fed, state, local) per state and year, and pick one random
hex color per state for the plots that follow.
'''
# Rename both grouping columns in a single in-place call.
df.rename(columns={'state_mailing': 'State', 'year_y': 'Year'}, inplace=True)
grouped_series = (
    df.groupby(['State', 'Year'])['rev_total']
    .sum()
    .reset_index(name='Total Revenue')
)

# Distinct states / years, in order of first appearance in grouped_series.
states = grouped_series['State'].unique()
years = grouped_series['Year'].unique()

print(grouped_series.head())

# One "#RRGGBB" string per state, six random hex digits each.
colors = [
    "#" + ''.join(random.choice('0123456789ABCDEF') for _ in range(6))
    for _ in range(len(states))
]

'''
Draw two demo scatter plots that exercise the per-state colors.
All coordinates and marker sizes are random; no loaded data is plotted.
'''

# Sample Graph 1: one large dot per state at a random integer position.
for _, state_color in zip(states, colors):
    plt.scatter(random.randint(0, 10), random.randint(0, 10), c=state_color, s=200)

plt.title('Sample Graph 1')
plt.show()

# Sample Graph 2: one random point per year for every state, random sizes.
fig, ax = plt.subplots()
points_per_state = len(years)
for state_name, state_color in zip(states, colors):
    xs, ys = np.random.rand(2, points_per_state)
    sizes = 200.0 * np.random.rand(points_per_state)
    ax.scatter(
        xs,
        ys,
        c=state_color,
        s=sizes,
        label=state_name,
        alpha=1,
        edgecolors='none',
    )

ax.legend(title='States')
ax.grid(True)

plt.title('Sample Graph 2')
plt.show()

In [None]:
'''
Plot the total revenue 'rev_total' (fed, state, local) for all states by year.
Values are divided by 1,000,000 before plotting; states that do not have one
row per year are skipped.
'''
#TODO: Add label on side; connect the dots
# NOTE(review): the original TODO said "in billions" but the divisor below is
# 1,000,000 -- confirm the intended unit before labelling the axis.
fig, ax = plt.subplots()
for curr_state, curr_color in zip(states, colors):
    state_rows = grouped_series.loc[grouped_series['State'] == curr_state]

    if len(state_rows) != len(years):
        print("Missing some data for state: ", curr_state, " . Skipping...")
        continue

    # Take x from the state's own 'Year' column so every revenue value is
    # paired with the year it belongs to, whatever the order of `years`.
    ax.scatter(
        state_rows['Year'],
        state_rows['Total Revenue'] / 1000000,
        c=curr_color,
        label=curr_state,
        alpha=1,
        edgecolors='none',
    )

ax.legend(title='States', bbox_to_anchor=(1.0, 1.0))
ax.grid(True)

# Title spelling fixed ('Revue' -> 'Revenue').
plt.title('Revenue by State Over Time')
plt.show()

In [None]:
'''
Plot the total revenue 'rev_total' (fed, state, local) for all states by year,
shifted so that each state's smallest yearly total sits at zero.
Shifted values are divided by 1,000,000 before plotting; states that do not
have one row per year are skipped.
'''
#TODO: Add label on side (in millions -> but normalized, so need to set origin as (0, min_rev)
#TODO: connect the dots
fig, ax = plt.subplots()

for curr_state, curr_color in zip(states, colors):
    state_rows = grouped_series.loc[grouped_series['State'] == curr_state]

    if len(state_rows) != len(years):
        print("Missing some data for state: ", curr_state, " . Skipping...")
        continue

    curr_total_revenues = state_rows['Total Revenue']
    # Deliberately not named `min`: a notebook-level `min` variable would
    # shadow the builtin min() for every cell run afterwards.
    min_revenue = curr_total_revenues.min()
    normalized_revenues = np.array((curr_total_revenues - min_revenue) / 1000000)

    # Take x from the state's own 'Year' column so every value is paired
    # with the year it belongs to, whatever the order of `years`.
    ax.scatter(
        state_rows['Year'],
        normalized_revenues,
        c=curr_color,
        label=curr_state,
        alpha=1,
        edgecolors='none',
    )

ax.legend(title='States', bbox_to_anchor=(1.0, 1.0))
ax.grid(True)

plt.title('Revenue by State Over Time')
plt.show()