In [228]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import geopandas
import os
import bokeh
from bokeh.models import Range1d
from bokeh.plotting import figure, show
from bokeh.layouts import gridplot
from ipywidgets import interact, interact_manual
from bokeh.models.widgets import Panel, Tabs
from bokeh.io import output_notebook
import pandas_datareader.data as web
from pandas_datareader import wb
from geopandas.tools import overlay
from mpl_toolkits.axes_grid1 import make_axes_locatable
output_notebook()

In [229]:
# Define data import variables
# Base path shared by the per-state download folders: each state's files are read
# from "<a_path> <n>". Set the EDU_DATA_PATH environment variable to point at the
# data on another machine; the original local path is kept as the fallback.
a_path = os.environ.get(
    'EDU_DATA_PATH',
    '/Users/chandlerhall/Desktop/GitHub/D&P Q2/homework-2-cgwhall-1/Data/EducationDataPortal_10.21.2022_all_files',
)
# File names expected inside every per-state folder.
inst_file = 'EducationDataPortal_10.21.2022_College.csv'
year_year_file = 'EducationDataPortal_10.21.2022_years_after_entry.csv'

In [230]:
# Merge all Urban Labs data files into one working df
def create_merged_df(first_download=inst_file, second_download=year_year_file, midwest_states=13, user_path=a_path):
    """Load every per-state download and stack them into one DataFrame.

    For each state number ``n`` in ``range(1, midwest_states)`` the two CSV files
    are read from the folder ``"<user_path> <n>"``, left-merged on
    year / unitid / inst_name / state_abbr, and the per-state results are
    concatenated once at the end.

    Parameters
    ----------
    first_download : str
        File name of the institution-level CSV inside each state folder.
    second_download : str
        File name of the years-after-entry CSV inside each state folder.
    midwest_states : int
        Exclusive upper bound of the folder numbers to read (folders 1..n-1).
    user_path : str
        Base path of the state folders. This argument is now honoured; the
        previous body always read from the module-level ``a_path``.

    Returns
    -------
    pandas.DataFrame
        One row per institution/year (original per-file row indices are kept,
        so index labels repeat across states).
    """
    merge_keys = ['year', 'unitid', 'inst_name', 'state_abbr']

    def read_state(state):
        # One state's institution rows enriched with the years-after-entry columns.
        state_dir = f'{user_path} {state}'
        higher_ed_df = pd.read_csv(os.path.join(state_dir, first_download))
        year_after_year = pd.read_csv(os.path.join(state_dir, second_download))
        return higher_ed_df.merge(year_after_year, how='left', on=merge_keys)

    # Collect first, concatenate once: avoids re-copying the growing frame per state.
    state_frames = [read_state(state) for state in range(1, midwest_states)]

    if not state_frames:
        # Nothing requested: keep the earlier contract of an empty frame carrying
        # the expected columns, reading only the header rows of folder 1.
        col_path = f'{user_path} 1'
        first_cols = pd.read_csv(os.path.join(col_path, first_download), nrows=0).columns
        second_cols = pd.read_csv(os.path.join(col_path, second_download), nrows=0).columns
        # The first four columns of the second file are taken to be the merge keys.
        return pd.DataFrame(columns=first_cols.append(second_cols[4:]))

    return pd.concat(state_frames)

In [231]:
# Create df of all midwest states from 2010-2016 with median earnings indicator
# Uses the default file names, folder count and base path defined above.
# NOTE(review): the 2010-2016 range is not checked here — confirm against the downloaded files.
midwest_df = create_merged_df()

In [232]:
# Define variables of interest
# study_var: numeric column that is averaged and plotted on the y-axis.
study_var = 'earnings_med'
# panel_var: categorical column whose values are matched against the tab titles.
panel_var = 'inst_control'

In [233]:
# Remove missing median earnings data
# Rows are kept only when the study variable is present and not the literal
# 'Suppressed data' marker. The explicit copy makes the filtered frame independent,
# so the dtype assignments below do not write into a slice of the loaded data
# (which is what triggers pandas' SettingWithCopyWarning).
has_earnings = midwest_df[study_var].notna() & (midwest_df[study_var] != 'Suppressed data')
midwest_df = midwest_df[has_earnings].copy()

# Clean data types
midwest_df[study_var] = midwest_df[study_var].astype('float')
midwest_df['year'] = midwest_df['year'].astype('int64')

# Define State Names Variable
# Distinct state abbreviations in first-seen order; these feed the state dropdowns.
state_names = midwest_df['state_abbr'].unique()

In [234]:
def make_tab(df, state, study_var, panel_var, plot_title, y_title, tab_title=''):
    """Build one line plot for a state and the tab panel that wraps it.

    Parameters
    ----------
    df : DataFrame with 'year', 'state_abbr' and ``study_var`` columns
        (plus ``panel_var`` unless ``tab_title`` is 'All Institutions').
    state : value matched against the 'state_abbr' column.
    study_var : column plotted on the y-axis.
    panel_var : column matched against ``tab_title``.
    plot_title, y_title : figure title and y-axis label.
    tab_title : title of the tab; also the ``panel_var`` value to keep.

    Returns
    -------
    (plot, panel) : the figure and the Panel containing it.
    """
    # The aggregated tab keeps every row of the state; every other tab also
    # requires panel_var to equal the tab title.
    in_state = df['state_abbr'] == state
    if tab_title == 'All Institutions':
        rows = df[in_state]
    else:
        rows = df[in_state & (df[panel_var] == tab_title)]

    chart = figure(title=plot_title, x_axis_label='Year', y_axis_label=y_title, plot_height=300)
    chart.line(x=rows.year, y=rows[study_var])
    # Fixed ticks and y-range so every tab is drawn on the same scale.
    chart.xaxis.ticker = [2011, 2012, 2013, 2014]
    chart.y_range = Range1d(25000, 60000)

    tab = Panel(child=chart, title=tab_title)
    return chart, tab

# To look at one state disaggregated by institution control
def plot_one_state(df, state, study_var, panel_var, tab_title_list=None, plot_title='', y_title=''):
    """Build a tabbed view of one state's yearly mean of ``study_var``.

    One tab is created per entry of ``tab_title_list``, so any number of
    categories is supported. The title 'All Institutions' is drawn from the
    state-level yearly means; every other title is drawn from the yearly means
    grouped by ``panel_var`` and filtered to that value.

    Parameters
    ----------
    df : DataFrame with 'year', 'state_abbr', ``study_var`` and ``panel_var`` columns.
        All aggregation is done on this argument (the earlier body read the
        notebook-level ``midwest_df`` and a hard-coded 'inst_control' column).
    state : value matched against the 'state_abbr' column.
    study_var : numeric column to average and plot.
    panel_var : categorical column whose values name the per-category tabs.
    tab_title_list : iterable of tab titles, in display order. ``None`` or an
        empty list yields a Tabs object with no panels.
    plot_title, y_title : figure title and y-axis label passed to every tab.

    Returns
    -------
    Tabs containing one panel per requested title.
    """
    titles = list(tab_title_list or [])
    df_all = df.groupby(['year', 'state_abbr'])[study_var].mean().reset_index()
    df_inst = df.groupby(['year', 'state_abbr', panel_var])[study_var].mean().reset_index()

    panels = []
    for title in titles:
        # make_tab treats 'All Institutions' as the unfiltered tab, so it must be
        # given the frame that is aggregated without panel_var.
        source = df_all if title == 'All Institutions' else df_inst
        _, panel = make_tab(source, state, study_var, panel_var, plot_title, y_title, tab_title=title)
        panels.append(panel)
    return Tabs(tabs=panels)

# Compare avg median earnings of students from two states
def plot_two_states(df, state, state2, study_var, plot_title='', y_title=''):
    """Overlay the yearly mean of ``study_var`` for two states on one figure.

    ``state`` is drawn in blue and ``state2`` in red, each with its own legend
    entry. Returns the figure.
    """
    yearly_means = df.groupby(['year', 'state_abbr'])[study_var].mean().reset_index()

    chart = figure(title=plot_title, x_axis_label='Year', y_axis_label=y_title, plot_height=300)
    chart.xaxis.ticker = [2011, 2012, 2013, 2014]

    # Same drawing call for both states; only the filter value and colour differ.
    for abbr, colour in ((state, 'blue'), (state2, 'red')):
        series = yearly_means[yearly_means['state_abbr'] == abbr]
        chart.line(x=series.year, y=series[study_var], line_color=colour, legend_label=abbr)
    return chart

In [235]:
# Define Plot Variables
# Tab titles in display order. The first entry is the aggregated tab; the remaining
# titles are also used as filter values on the panel_var column, so they must match
# the values in the data exactly.
tab_title_list = ['All Institutions', 'Private for-profit', 'Private not-for-profit', 'Public']

In [236]:
# Plot with interaction to compare states or look at one state with institution type tabs
@interact(compare=False)
def checkbox(compare=False):
    """Render the one-state or two-state widget depending on the 'compare' checkbox.

    Each branch registers a nested @interact function, so the state dropdown(s)
    are created when this function runs.
    """
    # Toggle to see if comparing states or just looking at one
    if compare==False:
        @interact(state=state_names) 
        # Plot one state with institution control panel,tabs
        def make_one_plot(state=state_names[0]):
            """Show the tabbed earnings plot for the selected state."""
            plot = plot_one_state(midwest_df, state, study_var, panel_var, tab_title_list, plot_title='Avg Median Earnings', y_title='Earnings, 2015 dollars')
            show(plot)
    else:
        @interact(state=state_names, state2=state_names)
        # Compare avg median earnings of students from two states
        def make_two_plots(state=state_names[0], state2=state_names[1]):
            """Show one plot overlaying the two selected states."""
            plot = plot_two_states(midwest_df, state, state2, study_var, plot_title='Avg Median Earnings', y_title='Earnings, 2015 dollars')
            show(plot)

interactive(children=(Checkbox(value=False, description='compare'), Output()), _dom_classes=('widget-interact'…

My initial research question was how Illinois compares to other Midwest states in terms of the median earnings of students from higher education, and whether there is a significant difference in median earnings between students attending public and private schools. I was surprised to see that, for most states, the median earnings of public school students are comparable to, or even higher than, those of students attending private for-profit schools.

While I was able to generalize somewhat to allow for variation in the data files loaded and the variables of interest, and used a plot function and a tab function to make the final interactive code more legible, it was difficult to generalize further. Specifically, I wanted to allow a variable number of arguments to be passed into the tab function (so that a different categorical variable could be chosen), but ran into trouble because the plot object needs to be created on its own for each tab. Therefore, while this code allows for some variation in the chosen category, it could not handle five categories, for example. I tried for a while to figure out how to use a variable number of plots, but I'm not sure how, given that the plot object needs to be written into a unique variable each time.

In [237]:
## TASK 3

In [238]:
# Import world shape file
# Loads the 'naturalearth_lowres' dataset that ships with geopandas.
# NOTE(review): the bundled geopandas.datasets module is deprecated/removed in newer
# GeoPandas releases — confirm the installed version, or load the shapefile from a local path.
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))

In [239]:
# Define variables of interest
# Continent to map and the country codes to request from the World Bank.
continent = 'South America'
countries = ['AR', 'BR', 'CO', 'PE', 'CL', 'BO', 'EC', 'VE']
# First and last year of the download (also the bounds of the year slider).
start_t, end_t = 1990, 2019
# World Bank indicator codes and the short column labels used for them.
indicators = ['EN.ATM.CO2E.KT', 'EG.ELC.ACCS.ZS']
var_list = ['CO2', 'Electricity']
# Indicator code -> column label, paired by position in the two lists above.
indic_dic = dict(zip(indicators, var_list))

In [240]:
# Subset world geo to continent of study
def select_cont(world, continent):
    """Return the rows of ``world`` whose 'continent' column equals ``continent``."""
    on_continent = world['continent'] == continent
    return world[on_continent]

# Download data of interest from World Bank
def get_wb_data(indicators, countries, start_t, end_t, indic_dic):
    """Download the indicators and return them as a flat, relabelled DataFrame.

    Parameters
    ----------
    indicators : indicator codes passed to ``wb.download``.
    countries : country codes passed to ``wb.download``.
    start_t, end_t : first and last year requested.
    indic_dic : mapping used to rename the downloaded columns.

    Returns
    -------
    DataFrame with the download's index levels moved into ordinary columns.
    """
    downloaded = wb.download(indicator=indicators, country=countries, start=start_t, end=end_t)
    return downloaded.rename(columns=indic_dic).reset_index()

In [241]:
# Prepare data to merge
# cont_geo: shapes of the study continent; wb_df: downloaded indicators, one row per country/year.
cont_geo = select_cont(world, continent)
wb_df = get_wb_data(indicators, countries, start_t, end_t, indic_dic)
# Preview the downloaded data (swap in cont_geo.head() to preview the shapes instead).
wb_df.head()


Unnamed: 0,country,year,CO2,Electricity
0,Argentina,2019,168100.006104,100.0
1,Argentina,2018,176899.993896,99.989578
2,Argentina,2017,179320.007324,100.0
3,Argentina,2016,183160.003662,99.849579
4,Argentina,2015,185550.003052,99.625389


In [242]:
# Prepare geo df for boundaries
# The outer merge keeps continent shapes that have no World Bank row, as well as
# World Bank rows whose country name has no matching shape.
# NOTE(review): the join is on display names. The country list printed in the next
# cell contains a nan entry and 'Venezuela, RB', which suggests some names do not
# line up between the two sources — confirm, and map the names if so.
cont_df = cont_geo.merge(wb_df, left_on='name', right_on='country', how ='outer')


# Prepare data df for plotting
# Missing indicator values are replaced with 0 before the merge.
# NOTE(review): a 0 is then coloured like a real measurement on the map — confirm this is intended.
data_df = wb_df.fillna(0)
# Years are cast to integers so they can be compared with the integer year slider.
data_df['year'] = data_df['year'].astype('int64')
data_df = cont_geo.merge(data_df, left_on='name', right_on='country', how ='outer')

In [243]:
# Create variables from prepared data
# Distinct country labels in first-seen order; a missing label shows up as nan.
country_names = data_df['country'].unique()
print(country_names)

['Argentina' 'Chile' nan 'Brazil' 'Bolivia' 'Peru' 'Colombia' 'Ecuador'
 'Venezuela, RB']


In [244]:
@interact(Indicator=var_list, Year=(start_t,end_t))
def make_cont_plot(Indicator=var_list[0], Year=end_t):
    """Draw a choropleth of one indicator for one year, with country outlines on top.

    ``Indicator`` is the data column to colour by and ``Year`` selects the rows
    to draw; both are driven by the interact widgets.
    """
    fig, map_ax = plt.subplots(figsize=(12,12))
    # Reserve a narrow axis on the right of the map for the colour bar.
    legend_ax = make_axes_locatable(map_ax).append_axes('right', size='5%', pad=0.1)

    # Colour limits come from the whole table rather than the selected year, so the
    # scale stays fixed while the slider moves. Electricity is the exception: its
    # floor is pinned at 50 because a restricted minimum conveys more information.
    colour_floor = 50 if Indicator == 'Electricity' else data_df[Indicator].min()
    colour_ceiling = data_df[Indicator].max()

    year_rows = data_df[data_df['year'] == Year]
    map_ax = year_rows.plot(
        ax=map_ax,
        column=Indicator,
        legend=True,
        cax=legend_ax,
        edgecolor='black',
        cmap='BuPu',
        vmin=colour_floor,
        vmax=colour_ceiling,
    )
    # Outline every country shape in cont_df, whether or not it has data for the year.
    map_ax = cont_df.boundary.plot(ax=map_ax, edgecolor='black')

    map_ax.axis('off')
    map_ax.set_title('Electrification and CO2 emissions in South America, 1990-2019')

interactive(children=(Dropdown(description='Indicator', options=('CO2', 'Electricity'), value='CO2'), IntSlide…