# Question 1: What is each state's instructional spend (y) from 2009-2016 (x)?
### Line graph of regional (S, MW, NE, W) spend for 50 states & DC 
- Phase 1: Normalize as a ratio starting from the first year (2009)
- Phase 2: Nominal or actual spend dollars – does it account for inflation?

**Goal:** a) Figure out states that are outliers in terms of funding over the years b) Figure out which states improved in funding over the years

In [None]:
'''
Load the finance data and import all necessary libraries  
'''
import random
import seaborn as sns

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from functools import reduce

# Region labels and years of interest; the years are kept as strings.
valid_regions = ['Northeast', 'South', 'West', 'Midwest']
valid_years = [str(year) for year in range(2009, 2017)]

# Columns that the later cells require to be non-null.
valid_cols = [
    'rev_total',
    'state_name',
    'region',
    'year',
    'exp_current_instruction_total',
]

# low_memory=False asks pandas not to parse the file in internal chunks.
finance_df = pd.read_csv(
    'finance_data/districts_ccd_finance.csv.zip',
    low_memory=False,
)
finance_df.head()

In [None]:
'''
Map geographic data (region, state) to finance data &  drop NaN rows. 
'''

# Give pandas the path so it opens and closes the workbook itself; the
# previous open(..., 'rb') handle was never closed.
value_mappings = pd.read_excel(
    'finance_data/codebook_districts_ccd_finance.xls', sheet_name='values'
)

# Codebook rows whose format is "fips", indexed by code; shared by both lookups.
fips_rows = value_mappings[value_mappings["format"] == "fips"].set_index('code')

# fips code -> state name
fips_states = fips_rows.code_label.to_dict()
finance_df["state_name"] = finance_df["fips"].map(fips_states)

# fips code -> region
fips_regions = fips_rows.region.to_dict()
finance_df["region"] = finance_df["fips"].map(fips_regions)

#TODO: drop all NaN or just those whose regions are NaN?
# Filter and drop in one non-inplace chain: dropna(inplace=True) on a frame
# obtained by boolean filtering can raise SettingWithCopyWarning.
finance_df = finance_df[finance_df.year >= 2009].dropna(subset=valid_cols)

finance_df.head()

In [None]:
'''
Clean Up Unused Cols and group by state_name, year and total revenue  
'''

# Sum rev_total over all rows sharing a (state_name, year, region) key and
# expose the result as a flat frame with a 'total' column.
revenue_df = (
    finance_df
    .filter(items=valid_cols, axis=1)
    .groupby(['state_name', 'year', 'region'])['rev_total']
    .sum()
    .reset_index(name='total')
)
revenue_df.head()

In [None]:
'''
Clean Up Unused Cols and group by state_name, year and total expidenture 
'''
# Sum exp_current_instruction_total over all rows sharing a
# (state_name, year, region) key and expose it as a 'total' column.
expidenture_df = (
    finance_df
    .filter(items=valid_cols, axis=1)
    .groupby(['state_name', 'year', 'region'])['exp_current_instruction_total']
    .sum()
    .reset_index(name='total')
)
expidenture_df.head()

# Method to Plot Financial Data for Given States (plt)

In [None]:
'''Create Method to plot finances for a group of given states over given year for states '''
#TODO: pass name of column that has the values to sum, pass transformation_lambda 

def plot_finances_as_ratio_of_year_one_plt(df, states, years, title="", region=""):
    '''
    Plot, with matplotlib, each state's totals as a ratio of its first total.

    Every state gets one scatter-plus-line series on a shared figure, drawn in
    a randomly chosen color. A state whose number of rows differs from
    len(years) is reported on stdout and skipped.

    Parameters:
        df: DataFrame with 'state_name' and 'total' columns. Each state's rows
            are assumed to be ordered like `years` -- TODO confirm for callers
            that do not pass a frame sorted by year.
        states: names of the states to draw.
        years: year values used as x coordinates.
        title: name of the plotted measure, used in the title and y label.
        region: optional region name appended to the title.
    '''
    data = df[df.state_name.isin(states)]

    # One random hex color per state, used for both its markers and its line
    # so the legend entry matches the whole series.
    colors = ["#" + ''.join([random.choice('0123456789ABCDEF') for _ in range(6)]) for _ in states]
    fig, ax = plt.subplots()

    for curr_state, color in zip(states, colors):
        curr_totals = data.loc[data['state_name'] == curr_state, 'total']

        # Validate before normalising: with no rows .iloc[0] would raise IndexError.
        if len(curr_totals) == 0 or len(curr_totals) != len(years):
            print("Missing some data for state: ", curr_state, " . Skipping...")
            continue

        normalized_totals = np.asarray(curr_totals) / curr_totals.iloc[0]
        ax.scatter(years, normalized_totals, color=color, label=curr_state, alpha=1, edgecolors='none')
        ax.plot(years, normalized_totals, color=color)

    # Only build a legend when at least one state was actually drawn.
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend(title='States', bbox_to_anchor=(1, 1), fancybox=True, framealpha=1, shadow=True, borderpad=1)
    ax.grid(True)

    plot_title = '{} By State Over Time As Ratio of Year 1'.format(title)
    if region:
        # Mention the region only when one was given.
        plot_title += ' for the {}'.format(region)
    ax.set_title(plot_title)
    ax.set_xlabel('Years', fontsize=12)
    # Label the axis with the measure being plotted instead of a fixed name.
    ax.set_ylabel('Normalized Total {}'.format(title).strip(), fontsize=12)
    if len(years):
        ax.set_xticks(np.arange(np.min(years), np.max(years) + 1, 1))

    plt.show()

# Method to Plot Financial Data for Given States (sns)

In [None]:
'''Define method for seaborn'''
#TODO: pass name of column that has the values to sum, pass transformation_lambda 

def plot_finances_as_ratio_of_year_one_sns(df, states, years, title_name, region=None):
    '''
    Plot, with seaborn, each state's totals as a ratio of its first total.

    Parameters:
        df: DataFrame with 'state_name', 'year' and 'total' columns; it is not
            modified. Each state's first row is used as the baseline.
        states: names of the states to draw.
        years: accepted for symmetry with the matplotlib variant; not used.
        title_name: name of the plotted measure, used in the figure title.
        region: optional region name appended to the figure title.
    '''
    # Explicit copy so the normalisation is a plain .loc assignment on our own
    # frame; chained assignment on the filtered frame warns and is not
    # guaranteed to write anything.
    data = df[df.state_name.isin(states)].copy()
    # The ratios are fractional, so make sure the column can hold them.
    data['total'] = data['total'].astype(float)

    fig, ax = plt.subplots()

    for state in states:
        in_state = data['state_name'] == state
        if not in_state.any():
            # No rows for this state, so there is no first value to divide by.
            continue
        state_totals = data.loc[in_state, 'total']
        data.loc[in_state, 'total'] = state_totals / state_totals.iloc[0]

    # NOTE(review): size=4 is kept from the original call; seaborn documents
    # `size` as a grouping variable, so a line width may have been intended -- confirm.
    sns.lineplot(data=data, x='year', y='total', hue='state_name', size=4, legend='brief', ax=ax)
    ax.legend(bbox_to_anchor=(1, 1), fancybox=True, framealpha=1, shadow=True, borderpad=1)

    # Mention the region only when one was given.
    if region:
        title = '{} By State Over Time As Ratio of Year 1 for the {}'.format(title_name, region)
    else:
        title = '{} By State Over Time As Ratio of Year 1'.format(title_name)
    fig.suptitle(title, fontsize=16)
    plt.show()

# Method to Plot Financial Data for Individual State (plt)

In [None]:
def plot_individual_state_as_ratio_of_year_one_plt(df, curr_state, years, title, region=None):
    '''
    Plot one state's totals as a ratio of its first total, using matplotlib.

    If the state has no rows, or its number of rows differs from len(years),
    a message is printed and nothing is drawn.

    Parameters:
        df: DataFrame with 'state_name' and 'total' columns. The state's rows
            are assumed to be ordered like `years` -- TODO confirm for callers
            that do not pass a frame sorted by year.
        curr_state: name of the state to draw.
        years: year values used as x coordinates.
        title: name of the plotted measure, used in the title and y label.
        region: accepted for call-site symmetry; not used.
    '''
    curr_totals = df.loc[df['state_name'] == curr_state, 'total']

    # Validate before touching .iloc[0] (IndexError on an unknown state) and
    # before creating a figure, so a skipped state leaves no empty plot.
    if len(curr_totals) == 0 or len(curr_totals) != len(years):
        print("Missing some data for state: ", curr_state, " . Skipping...")
        return

    normalized_totals = np.asarray(curr_totals) / curr_totals.iloc[0]

    fig, ax = plt.subplots()
    ax.scatter(years, normalized_totals, label=curr_state, alpha=1, edgecolors='none')
    ax.plot(years, normalized_totals)

    ax.legend(title='States', bbox_to_anchor=(1, 1), fancybox=True, framealpha=1, shadow=True, borderpad=1)
    ax.grid(True)

    ax.set_title('{} Over Time As Ratio of Year 1 for {}'.format(title, curr_state))
    ax.set_xlabel('Years', fontsize=12)
    ax.set_ylabel('Normalized Total {}'.format(title), fontsize=12)
    ax.set_xticks(np.arange(np.min(years), np.max(years) + 1, 1))

    plt.show()

# Method to Plot Financial Data Regression for Individual State (sns)

In [None]:
def plot_individual_state_financial_data_regression_sns(df, curr_state, years, title, region=None):
    '''
    Draw a seaborn regression plot of one state's totals against year.

    Parameters:
        df: DataFrame with 'state_name', 'year' and 'total' columns.
        curr_state: name of the state whose rows are plotted.
        years: accepted for call-site symmetry; not used.
        title: name of the plotted measure, used in the plot title.
        region: accepted for call-site symmetry; not used.
    '''
    data = df.loc[df['state_name'] == curr_state]
    # Build the title from the curr_state argument; the previous code read a
    # global named `state`, which only worked when the caller happened to set it.
    plot_title = "Linear Regresssion of {}'s {}".format(curr_state, title)

    sns.regplot(data=data, scatter=True, x="year", y="total").set_title(plot_title)
    plt.show()

# Expenditure Data (State by State - plt)

In [None]:
'''
Plot the summed instructional expenditure for all states by year, normalized so that each state's Year 1 (2009) equals 1.0.
'''
# Every distinct state and year present in the aggregated expenditure frame.
states = expidenture_df['state_name'].unique()
years = expidenture_df['year'].unique()

# All states are drawn on a single matplotlib figure; no region is passed.
plot_finances_as_ratio_of_year_one_plt(expidenture_df, states, years, "Expidenture")

# Expenditure Data (Region By Region - plt)

In [None]:
'''
Plot the summed instructional expenditure by year, one matplotlib figure per region, normalized so that each state's Year 1 (2009) equals 1.0.
'''

# One figure per distinct region value in the aggregated expenditure frame.
for region in expidenture_df['region'].unique(): 
    
    # States belonging to this region; years span the whole frame.
    states = expidenture_df.loc[expidenture_df['region'] == region]['state_name'].unique()
    years = expidenture_df['year'].unique()
    plot_finances_as_ratio_of_year_one_plt(expidenture_df, states, years, "Expidenture", region)

In [None]:
'''
Plot the summed instructional expenditure by year, one seaborn figure per region, normalized so that each state's Year 1 (2009) equals 1.0.
'''

# One figure per distinct region value in the aggregated expenditure frame.
for region in expidenture_df['region'].unique(): 

    # States belonging to this region; years span the whole frame.
    states = expidenture_df.loc[expidenture_df['region'] == region]['state_name'].unique()
    years = expidenture_df['year'].unique()   
    # This cell plots expidenture_df, so the figures must not be titled 'Revenue'.
    title_name= 'Expidenture'   
    
    plot_finances_as_ratio_of_year_one_sns(expidenture_df, states, years, title_name, region)

# Expenditure Outliers

**Massachusetts**: expenditure dips in 2012. 

**Alaska**: expenditure peaks in 2014, then dips afterward. This may be related to individual payments from the Alaska Permanent Fund (an inverse relationship): https://en.wikipedia.org/wiki/Alaska_Permanent_Fund 

**# TODO: look at district by district level changes & plot a regression of that data**

In [None]:
# Inputs for the single-state plots; years is every distinct year in expidenture_df.
years = expidenture_df['year'].unique()
title = "Expidenture"
state = "Alaska"
region= "West"

# First the ratio-of-Year-1 plot, then a regression plot of the same state's totals.
# Neither helper reads its region argument in its body.
plot_individual_state_as_ratio_of_year_one_plt(expidenture_df, state, years, title, region=region)
plot_individual_state_financial_data_regression_sns(expidenture_df, state, years, title, region=None)


In [None]:
# Inputs for the single-state plots; years is every distinct year in expidenture_df.
years = expidenture_df['year'].unique()
title = "Expidenture"
state = "Massachusetts"
region= "Northeast"

# Ratio-of-Year-1 plot for the selected state.
plot_individual_state_as_ratio_of_year_one_plt(expidenture_df, state, years, title, region=region)

# Regression plot of the same state's totals against year.
plot_individual_state_financial_data_regression_sns(expidenture_df, state, years, title, region=region)

# Revenue Data

# Revenue Data (State by State - plt)

In [None]:
'''
Plot the total revenue (fed, state, local) for all states by year, normalized so that each state's Year 1 (2009) equals 1.0.
'''
# Every distinct state and year present in the aggregated revenue frame.
states = revenue_df['state_name'].unique()
years = revenue_df['year'].unique()

# All states are drawn on a single matplotlib figure; no region is passed.
plot_finances_as_ratio_of_year_one_plt(revenue_df, states, years, "Revenue")

# Revenue Data (Region By Region - plt)

In [None]:
'''
Plot the total revenue (fed, state, local) by year, one matplotlib figure per region, normalized so that each state's Year 1 (2009) equals 1.0.
'''

# One figure per distinct region value in the aggregated revenue frame.
for region in revenue_df['region'].unique(): 

    # States belonging to this region; years span the whole frame.
    states = revenue_df.loc[revenue_df['region'] == region]['state_name'].unique()
    years = revenue_df['year'].unique()
    plot_finances_as_ratio_of_year_one_plt(revenue_df, states, years, "Revenue", region)

# Revenue Data (Region By Region - sns)

In [None]:
'''
Plot the total revenue (fed, state, local) by year, one seaborn figure per region, normalized so that each state's Year 1 (2009) equals 1.0.
'''

# One figure per distinct region value in the aggregated revenue frame.
for region in revenue_df['region'].unique(): 

    # States belonging to this region; years span the whole frame.
    states = revenue_df.loc[revenue_df['region'] == region]['state_name'].unique()
    years = revenue_df['year'].unique()   
    title_name= 'Revenue'   
    
    plot_finances_as_ratio_of_year_one_sns(revenue_df, states, years, title_name, region)