In [1]:
%%html

<script>
  function code_toggle() {
    if (code_shown){
      $('div.input').hide('500');
      $('#toggleButton').val('Show Code')
    } else {
      $('div.input').show('500');
      $('#toggleButton').val('Hide Code')
    }
    code_shown = !code_shown
  }
  
  $( document ).ready(function(){
    code_shown=false;
    $('div.input').hide()
  });
</script>
<form action="javascript:code_toggle()"><input type="submit" id="toggleButton" value="Show Code"></form>

In [7]:
import pandas as pd
import numpy as np
from numpy import nan as Nan
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import re
import ipywidgets as widgets
from IPython.display import display
init_notebook_mode(connected=True)

In [8]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

## Analysing Diploma Exam Data - results by school

 <img src="images/Alberta_education.jpg" width="200px" align="right"/>
 Provincial diploma exam results are located here: https://education.alberta.ca/diploma-exam-administration/diploma-results/?searchMode=3 .

We will download provincial results by shool xlsx spreadsheet: diploma-multiyear-sch-list-annual.xlsx

In [9]:
school_results = pd.read_excel('https://education.alberta.ca/media/3680580/diploma-multiyear-sch-list-annual.xlsx')
school_results.head()

Unnamed: 0,Diploma Course,Authority Type,Authority Code,Authority Name,School Code,School Name,2013 Sch Students Writing,2013 Sch School Mark % Exc,2013 Sch School Mark % Acc,2013 Sch School Average %,...,2017 Sch School Mark % Exc,2017 Sch School Mark % Acc,2017 Sch School Average %,2017 Sch School Standard Deviation %,2017 Sch Exam Mark % Exc,2017 Sch Exam Mark Exc Sig,2017 Sch Exam Mark % Acc,2017 Sch Exam Mark Acc Sig,2017 Sch Exam Average %,2017 Sch Exam Standard Deviation %
0,Biology 30,Charter,9.0,Foundations for the Future Charter Academy Cha...,12.0,FFCA High School Campus,100.0,59.0,96.0,79.5,...,58.8,99.1,79.6,12.8,44.7,=,86.8,=,73.9,16.4
1,Chemistry 30,Charter,9.0,Foundations for the Future Charter Academy Cha...,12.0,FFCA High School Campus,56.0,26.8,83.9,67.7,...,53.5,98.0,78.0,13.0,55.6,=,93.9,=,77.6,16.7
2,English Lang Arts 30-1,Charter,9.0,Foundations for the Future Charter Academy Cha...,12.0,FFCA High School Campus,111.0,42.3,99.1,74.6,...,36.1,100.0,74.5,11.0,21.3,=,96.7,=,69.3,11.2
3,English Lang Arts 30-2,Charter,9.0,Foundations for the Future Charter Academy Cha...,12.0,FFCA High School Campus,21.0,9.5,95.2,69.5,...,18.2,100.0,71.9,7.4,36.4,=,100.0,=,75.8,8.7
4,Mathematics 30-1,Charter,9.0,Foundations for the Future Charter Academy Cha...,12.0,FFCA High School Campus,84.0,36.9,95.2,73.0,...,50.9,94.7,78.0,15.2,50.9,,87.7,,74.5,20.2


This data set needs to be reshaped - year and all the resluts will be split in separate columns. Let's reshape it and remove rows  having  "n/a" in following columns: 'Sch Students Writing','Sch School Mark % Acc','Sch School Mark % Exc','Sch Exam Mark % Exc','Sch Exam Mark % Acc'.
Examine the result:

In [10]:
school_results_reshaped  = school_results.copy()

def splitter(string):
    r = re.compile(r'\d{4}|\S.*$')
    return r.findall(string)

cols = list(school_results_reshaped)[0:6]
years = ['2013', '2014', '2015', '2016', '2017']

for year in years:
    for names in cols:
        school_results_reshaped[str(year +" "+names)] = school_results_reshaped[names]

school_results_reshaped.columns = pd.MultiIndex.from_tuples([tuple(splitter(c)) for c in school_results_reshaped.columns])
school_results_reshaped = school_results_reshaped.stack(0).reset_index(1)

school_results_reshaped.rename(columns={'level_1': "Year"}, inplace=True)
school_results_reshaped = school_results_reshaped.reset_index(drop=True)
school_results_reshaped[['Sch Students Writing','Sch School Mark % Acc','Sch School Mark % Exc','Sch Exam Mark % Exc','Sch Exam Mark % Acc']]=school_results_reshaped[['Sch Students Writing','Sch School Mark % Acc','Sch School Mark % Exc','Sch Exam Mark % Exc','Sch Exam Mark % Acc']].apply(pd.to_numeric, errors='coerce')
school_results_reshaped=school_results_reshaped.dropna(subset=['Sch Students Writing','Sch School Mark % Acc','Sch Exam Mark % Exc','Sch School Mark % Exc','Sch Exam Mark % Acc'])
school_results_reshaped=school_results_reshaped[:-5]
school_results_reshaped.head()

Unnamed: 0,Year,Authority Code,Authority Name,Authority Type,Diploma Course,Sch Exam Average %,Sch Exam Mark % Acc,Sch Exam Mark % Exc,Sch Exam Mark Acc Sig,Sch Exam Mark Exc Sig,Sch Exam Standard Deviation %,Sch School Average %,Sch School Mark % Acc,Sch School Mark % Exc,Sch School Standard Deviation %,Sch Students Writing,School Code,School Name
0,2013,9.0,Foundations for the Future Charter Academy Cha...,Charter,Biology 30,74.3,92.0,42.0,=,=,14.4,79.5,96.0,59.0,13.5,100.0,12.0,FFCA High School Campus
1,2014,9.0,Foundations for the Future Charter Academy Cha...,Charter,Biology 30,73.9,88.1,46.4,=,=,16.4,77.9,97.6,56.0,14.2,84.0,12.0,FFCA High School Campus
2,2015,9.0,Foundations for the Future Charter Academy Cha...,Charter,Biology 30,72.1,92.2,39.2,=,=,15.7,74.2,98.0,45.1,13.7,102.0,12.0,FFCA High School Campus
3,2016,9.0,Foundations for the Future Charter Academy Cha...,Charter,Biology 30,76.0,95.2,48.1,=,=,14.5,77.6,99.0,49.0,13.0,104.0,12.0,FFCA High School Campus
4,2017,9.0,Foundations for the Future Charter Academy Cha...,Charter,Biology 30,73.9,86.8,44.7,=,=,16.4,79.6,99.1,58.8,12.8,114.0,12.0,FFCA High School Campus


Let's create interactive plot - choose school and subject and get 'Sch Students Writing','Sch Exam Mark % Acc' and 'Sch Exam Mark % Exc' for all available years:

In [11]:
def view_schools(subject):

    result = school_results_reshaped[school_results_reshaped['School Name'] == schoolW.value]
    result = result[result['Diploma Course'] == subject]

    trace1 = go.Bar( x= result['Year'], y=result['Sch Students Writing'],
                name='Number of students taken the course', marker=dict(color='#59606D'))

    trace2 = go.Bar(x=result['Year'], y=result['Sch Students Writing']/100*result['Sch Exam Mark % Acc'],
                name='Number of students achieved acceptable standard', marker=dict(color='#ffcdd2'))

    trace3 = go.Bar(x=result['Year'], y=result['Sch Students Writing']/100*result['Sch Exam Mark % Exc'],
        name='Number of students achieved standard of excellence',marker=dict(color='#A2D5F2'))


    data = [trace1, trace2, trace3]
    layout = go.Layout(title=subject + "/" + schoolW.value,
                xaxis=dict(title='Year'),
                yaxis=dict(title='Number of students'))
    fig = go.Figure(data=data, layout=layout)

    iplot(fig)


def select_school(school):
    subjectW.options = school_results_reshaped[school_results_reshaped['School Name'] == school]['Diploma Course'].unique()
   

schoolW = widgets.Select(options=np.sort(school_results_reshaped['School Name'].unique()))
subjectW = widgets.Select(options=school_results_reshaped[school_results_reshaped['School Name'] == schoolW]['Diploma Course'].unique())

i = widgets.interactive(select_school, school=schoolW)
j = widgets.interactive(view_schools, subject=subjectW)

display(i)
display(j)

#### Comparing multiple schools.

1. Downloading provincial results and reshaping:

In [12]:
provincial_results = pd.read_excel('https://education.alberta.ca/media/3680581/diploma-multiyear-province-annual.xlsx')  
years = []
stats = []
for value in provincial_results.columns.values[1:-1]:
    year = value[0:4]
    stat = value[5:]
    int(year)
    if year not in years:
        years.append(year)
    if stat not in stats:
        stats.append(stat)
provincial_results_reshaped = pd.DataFrame(columns=(['Diploma Course','Year'] + stats))
for ind,row in provincial_results.drop(provincial_results.index[len(provincial_results)-1]).iterrows():
    new_row = pd.DataFrame(columns=(['Diploma Course','Year'] + stats))
    new_row.loc[0] = [Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan,Nan]
    new_row.loc[0]['Diploma Course'] = row['Diploma Course']                                       
    for year in years:
        new_row.loc[0]['Year'] = year
        for stat in stats:
            new_row.loc[0][stat] = row[year+" "+stat]
        provincial_results_reshaped=provincial_results_reshaped.append(new_row)                               
provincial_results_reshaped = provincial_results_reshaped.reset_index(drop=True)

provincial_results_reshaped=provincial_results_reshaped.rename(columns = {'Prov School Mark % Exc':'Sch School Mark % Exc'})
provincial_results_reshaped=provincial_results_reshaped.rename(columns = {'Prov School Mark % Acc':'Sch School Mark % Acc'})
provincial_results_reshaped=provincial_results_reshaped.rename(columns = {'Prov Exam Mark % Exc':'Sch Exam Mark % Exc'})
provincial_results_reshaped=provincial_results_reshaped.rename(columns = {'Prov Exam Mark % Acc':'Sch Exam Mark % Acc'})
provincial_results_reshaped.head()

Unnamed: 0,Diploma Course,Year,Prov Students Writing,Sch School Mark % Exc,Sch School Mark % Acc,Prov School Average %,Prov School Standard Deviation %,Sch Exam Mark % Exc,Prov Exam Mark Exc Sig,Sch Exam Mark % Acc,Prov Exam Mark Acc Sig,Prov Exam Average %,Prov Exam Standard Deviation %
0,Biology 30,2013,22429,42.9,96.0,74.4,13.9,32.2,+,84.4,+,68.8,16.5
1,Biology 30,2014,21733,43.6,96.2,74.7,13.8,31.8,+,85.2,+,68.9,16.6
2,Biology 30,2015,21257,45.3,96.4,75.2,13.8,33.0,+,85.8,+,69.4,16.5
3,Biology 30,2016,22550,47.0,97.1,75.9,13.7,32.4,=,85.1,=,69.1,16.8
4,Biology 30,2017,22993,48.4,97.0,76.3,13.7,32.3,=,84.2,-,68.7,16.9


2. Compaing 5 schools between each other and with provincial average:

In [13]:

def view_schools(school1,school2,school3,school4, school5, subject, metric):

    result1 = school_results_reshaped[school_results_reshaped['School Name'] == school1]
    result1 = result1[result1['Diploma Course'] == subject]
    
    result2 = school_results_reshaped[school_results_reshaped['School Name'] == school2]
    result2 = result2[result2['Diploma Course'] == subject]
    
    result3 = school_results_reshaped[school_results_reshaped['School Name'] == school3]
    result3 = result3[result3['Diploma Course'] == subject]
    
    result4 = school_results_reshaped[school_results_reshaped['School Name'] == school4]
    result4 = result4[result4['Diploma Course'] == subject]
    
    result5 = school_results_reshaped[school_results_reshaped['School Name'] == school5]
    result5 = result5[result5['Diploma Course'] == subject]
    
    provincial_results = provincial_results_reshaped[provincial_results_reshaped['Diploma Course'] == subject]
    
    trace1 = go.Scatter(
    x = result1['Year'],
    y = result1[metric],
    mode='markers',
    opacity=0.7,
    marker={
        'size': 15,
        'line': {'width': 0.5, 'color': 'white'}
            },
    name = school1)
        
    trace2 = go.Scatter(
    x = result2['Year'],
    y = result2[metric],
    mode = 'markers',
    opacity=0.7,
    marker={
        'size': 15,
        'line': {'width': 0.5, 'color': 'white'}
            },
    name = school2)    
        
    trace3 = go.Scatter(
    x =  result3['Year'],
    y = result3[metric],
    mode = 'markers',
    opacity=0.7,
    marker={
        'size': 15,
        'line': {'width': 0.5, 'color': 'white'}
            },
    name = school3)
        
    trace4 = go.Scatter(
    x = result4['Year'],
    y = result4[metric],
    mode = 'markers',
    opacity=0.7,
    marker={
        'size': 15,
        'line': {'width': 0.5, 'color': 'white'}
            },
    name = school4)

        
    trace5 = go.Scatter(
    x = result5['Year'],
    y = result5[metric],
    mode = 'markers',
    opacity=0.7,
    marker={
        'size': 15,
        'line': {'width': 0.5, 'color': 'white'}
            },
    name = school5)
    
    trace6 = go.Scatter(
        x=provincial_results['Year'],
        y=provincial_results[metric],
        mode='markers',
        opacity=0.7,
        marker={
            'size': 15,
            'symbol':'diamond',
            'color': 'red',
            'line': {'width': 0.5, 'color': 'white'}
            },
        name='Provincial Average'
        )
        
    data = [trace1, trace2, trace3, trace4, trace5, trace6]
    layout = go.Layout(
                title=subject + " - percentage of students achived " + str(metric),
                xaxis=dict(title='Year', type='category'),
                yaxis=dict(title='% of students'))
    fig = go.Figure(data=data, layout=layout)

    iplot(fig)


interact(view_schools, school1=np.sort(school_results_reshaped['School Name'].unique()),
         school2=np.sort(school_results_reshaped['School Name'].unique()),
         school3=np.sort(school_results_reshaped['School Name'].unique()),
         school4=np.sort(school_results_reshaped['School Name'].unique()),
         school5=np.sort(school_results_reshaped['School Name'].unique()),
         subject=np.sort(school_results_reshaped['Diploma Course'].unique()),
         metric=['Sch Exam Mark % Exc','Sch Exam Mark % Acc','Sch School Mark % Acc','Sch School Mark % Exc']);

NameError: name 'interact' is not defined