In [65]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from os import listdir
import os

In [66]:
def organize_df(df):
    df["Year"] = df["REF_DATE"].str.split("/").str[-1]
    df["Year"] = pd.DatetimeIndex(df["Year"]).year
    df = df[df["GEO"] != "Canada"]
    return df

In [67]:
male = pd.read_csv("data/male_teacher_count.csv")
female = pd.read_csv("data/female_teacher_count.csv")
all = pd.read_csv("data/teacher_count.csv")

In [68]:
all_df = pd.concat([organize_df(all), organize_df(female), organize_df(male)])
all_df.rename(columns={"VALUE": "Count"}, inplace=True)
all_df.head()

Unnamed: 0,REF_DATE,GEO,DGUID,Work status,Age group,Sex,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,Count,STATUS,SYMBOL,TERMINATED,DECIMALS,Year
15,2002/2003,Newfoundland and Labrador,2016A000210,All educators,All ages,Both sexes,Number,223,units,0,v65932896,2.1.1.1,8034.0,,,,0,2003
16,2003/2004,Newfoundland and Labrador,2016A000210,All educators,All ages,Both sexes,Number,223,units,0,v65932896,2.1.1.1,7920.0,,,,0,2004
17,2004/2005,Newfoundland and Labrador,2016A000210,All educators,All ages,Both sexes,Number,223,units,0,v65932896,2.1.1.1,7737.0,,,,0,2005
18,2005/2006,Newfoundland and Labrador,2016A000210,All educators,All ages,Both sexes,Number,223,units,0,v65932896,2.1.1.1,7320.0,,,,0,2006
19,2006/2007,Newfoundland and Labrador,2016A000210,All educators,All ages,Both sexes,Number,223,units,0,v65932896,2.1.1.1,7401.0,,,,0,2007


In [69]:
all_df["GEO"].unique()

array(['Newfoundland and Labrador', 'Prince Edward Island', 'Nova Scotia',
       'New Brunswick', 'Quebec', 'Ontario', 'Manitoba', 'Saskatchewan',
       'Alberta', 'British Columbia', 'Yukon', 'Northwest Territories',
       'Nunavut'], dtype=object)

In [70]:
colors=['#1f77b4',  # muted blue
        '#ff7f0e',  # safety orange
        '#2ca02c',  # cooked asparagus green
        '#d62728',  # brick red
        '#9467bd',  # muted purple
        '#8c564b',  # chestnut brown
        '#e377c2',  # raspberry yogurt pink
        '#17becf',  # blue-teal
        '#210240',  # dark purple
        '#21DC49',  # bright green
        '#3F5063',  # dark navy
        '#6C7075',  # dark grey
        '#F4BC1A']  # mustard

color_dict = dict(zip(all_df["GEO"].unique(), colors))
all_df["Color"] = all_df["GEO"].map(color_dict)

In [71]:
all_fig = go.Figure()
genders = ["Both sexes", "Males", "Females"]

for sex in genders:
      col_num = 0
      df_int = all_df[all_df["Sex"] == sex]
      all_prov = all_df["GEO"].unique()
      for ind, province in enumerate(all_prov):
            all_fig.add_trace(go.Scatter(
                  x = df_int[df_int["GEO"] == province]["Year"],
                  y = df_int[df_int["GEO"] == province]["Count"],
                  mode = "lines",
                  line = dict(color=colors[col_num]),
                  name = province
                  ))
            col_num += 1

all_buttons = []
prov_var_dict = [[] for _ in range(len(genders))]

for ind_sex, sex in enumerate(["Both sexes", "Males", "Females"]):
      df_sex = all_df[all_df["Sex"] == sex]
      all_provs = all_df["GEO"].unique()
      for province in all_provs:
            prov_var_dict[ind_sex].append(province)

currnum = 0

for ind, sex in enumerate(prov_var_dict):
      traces = [False] * len(all_fig.data)
      oldnum = currnum
      num_provs = len(sex)
      traces[0:oldnum] = [False for i in traces[0:oldnum]]
      currnum += num_provs
      traces[oldnum:currnum] = ["legendonly" for i in traces[oldnum:currnum]]
      traces[oldnum] = [True]
      all_buttons.append(dict(
            label=genders[ind],
            method="update",
            args=[{"visible":traces}]
      ))

all_fig.update_layout(
      showlegend=True,
      updatemenus=[dict(
            active=0,
            buttons=all_buttons,
            direction="down"
      )]
)

all_fig.show()b

In [80]:
all_fig.write_html("count_trend.html")

In [72]:
enrol_path = "/Users/jennylee/Library/Mobile Documents/com~apple~CloudDocs/Documents/Cybera/world-teachers-day/data/enroll_data"
all_files = [file for file in listdir(enrol_path)]

In [73]:
def advanced_organize_df(path):
      df = pd.read_csv(path)
      df = organize_df(df)
      df = df[(df["School type"] == "Total, school type") & (df["Program type"] == "Regular programs for youth")]
      return df

In [74]:
all_files[0]

'enroll_on.csv'

In [75]:
for file in all_files:
      file_path = f"data/enroll_data/{file}"
      file_df = advanced_organize_df(file_path)
      if file == all_files[0]:
            joined_df = file_df.copy()
      else:
            joined_df = pd.concat([joined_df, file_df], axis=0)

joined_df.head()

Unnamed: 0,REF_DATE,GEO,DGUID,School type,Program type,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS,Year
15,2003/2004,Ontario,2016A000235,"Total, school type",Regular programs for youth,Number,223,units,0,v1001753215,7.1.2,2129742.0,,,,0,2004
16,2004/2005,Ontario,2016A000235,"Total, school type",Regular programs for youth,Number,223,units,0,v1001753215,7.1.2,2123904.0,,,,0,2005
17,2005/2006,Ontario,2016A000235,"Total, school type",Regular programs for youth,Number,223,units,0,v1001753215,7.1.2,2118543.0,,,,0,2006
18,2006/2007,Ontario,2016A000235,"Total, school type",Regular programs for youth,Number,223,units,0,v1001753215,7.1.2,2221422.0,,,,0,2007
19,2007/2008,Ontario,2016A000235,"Total, school type",Regular programs for youth,Number,223,units,0,v1001753215,7.1.2,2208771.0,,,,0,2008


In [76]:
joined_df.to_csv("data/enroll_data/all_enrollment.csv", index=False)

In [77]:
count_df = all_df[["GEO", "Count", "Year", "Sex"]]
count_df = count_df[count_df["Sex"] == "Both sexes"]
enrol_df = joined_df[["GEO", "VALUE", "Year"]].rename(columns={"VALUE": "Enrollment"})
comb_df = count_df.merge(enrol_df, on=["GEO", "Year"])
comb_df

Unnamed: 0,GEO,Count,Year,Sex,Enrollment
0,Newfoundland and Labrador,8034.0,2003,Both sexes,84336.0
1,Newfoundland and Labrador,8034.0,2003,Both sexes,84336.0
2,Newfoundland and Labrador,7920.0,2004,Both sexes,81510.0
3,Newfoundland and Labrador,7920.0,2004,Both sexes,81510.0
4,Newfoundland and Labrador,7737.0,2005,Both sexes,79485.0
...,...,...,...,...,...
383,Nunavut,783.0,2015,Both sexes,9888.0
384,Nunavut,762.0,2016,Both sexes,10026.0
385,Nunavut,762.0,2016,Both sexes,10026.0
386,Nunavut,864.0,2017,Both sexes,10041.0


In [78]:
comb_df["Count"].corr(comb_df["Enrollment"])
# Include another column to divide enrollment divided by counts to predict class size
# Bar graph with province on the x-axis and enrollment per teacher on the y-axis
# PISA scores. 15 Years old. Compare provinces. FRASIER institude.
# See enrollment per educator is changing over time across provinces. 
# Higher enrollment vs. educator rate is related to higher literacy rates?

0.9847489932529957