Weighted Means
==============
Because our results are reported at the school level and not
student level, our observations should be weighted by
the number of students at the school.

This notebook explores ways to handle this.

In [5]:
# automatically reload changes we make to schools.py and ui.py
%load_ext autoreload
%autoreload 2

In [None]:
# load the demographic data
import pandas as pd
import numpy as np
import scipy as scipy
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols

from IPython.display import Markdown as md

import schools
import ui

In [7]:
# load the demographic data and merge it with the ELA data
df = schools.load_school_demographics()

# load the data from the csv file
ela = pd.read_csv("ela-combined.csv")


#drop the rows with NaN (where the pop is too small to report)
ela = ela[ela["mean_scale_score"].notnull()]
df = df.merge(ela, how="inner", on=["dbn", "year"])

# show the columns in our merged data set
df.columns

Index(['dbn', 'district', 'boro', 'school_name', 'year', 'total_enrollment',
       'grade_3k_pk_half_day_full', 'grade_k', 'grade_1', 'grade_2', 'grade_3',
       'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9',
       'grade_10', 'grade_11', 'grade_12', 'female_n', 'female_pct', 'male_n',
       'male_pct', 'asian_n', 'asian_pct', 'black_n', 'black_pct',
       'hispanic_n', 'hispanic_pct', 'multi_racial_n', 'multi_racial_pct',
       'native_american_n', 'native_american_pct', 'white_n', 'white_pct',
       'missing_race_ethnicity_data_n', 'missing_race_ethnicity_data_pct',
       'swd_n', 'swd_pct', 'ell_n', 'ell_pct', 'poverty_n', 'poverty_pct',
       'eni_pct', 'grade', 'category', 'number_tested', 'mean_scale_score',
       'level_1', 'level_1_pct', 'level_2', 'level_2_pct', 'level_3',
       'level_3_pct', 'level_4', 'level_4_pct', 'level_3_4', 'level_3_4_pct'],
      dtype='object')

In [12]:
avg = df["mean_scale_score"].mean()
# https://github.com/adelphi-ed-tech/school-data-portal/blob/main/nb/anova.ipynb
display(avg)
np.average(df.mean_scale_score, weights=df.total_enrollment)


452.34530947209913

451.1622110567875

In [8]:
# calculate the mean test score and standard deviation for each group
mean_std = df.groupby('category').agg(Mean=('mean_scale_score', np.mean), STD=('mean_scale_score', np.std))
display(md("**Mean average and standard deviation of test scores for each group.**"))
display(mean_std)



**Mean average and standard deviation of test scores for each group.**

Unnamed: 0_level_0,Mean,STD
category,Unnamed: 1_level_1,Unnamed: 2_level_1
All Students,453.824116,148.209826
Asian,468.987975,144.430294
Black,448.021425,148.946499
Current ELL,428.820341,155.200938
Econ Disadv,450.794722,148.539211
Ever ELL,459.765105,147.375612
Female,457.652562,147.049476
Hispanic,451.02017,148.981667
Male,449.781515,149.430891
Never ELL,456.203887,147.562668


In [21]:
# run OLS regression on the different category of test scores
model = ols('mean_scale_score ~ C(category, Treatment(reference="All Students"))', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
display(md("### OLS Model Summary: Demographic Group Test Scores Regression"))
display(model.summary())
display(md("**ANOVA Table**"))
anova_table

### OLS Model Summary: Demographic Group Test Scores Regression

0,1,2,3
Dep. Variable:,mean_scale_score,R-squared:,0.398
Model:,OLS,Adj. R-squared:,0.396
Method:,Least Squares,F-statistic:,249.3
Date:,"Thu, 24 Mar 2022",Prob (F-statistic):,0.0
Time:,17:18:39,Log-Likelihood:,-17569.0
No. Observations:,4919,AIC:,35170.0
Df Residuals:,4905,BIC:,35260.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,600.9199,0.398,1508.033,0.000,600.139,601.701
"C(category, Treatment(reference=""All Students""))[T.Asian]",9.6385,0.774,12.459,0.000,8.122,11.155
"C(category, Treatment(reference=""All Students""))[T.Black]",-2.7529,0.626,-4.394,0.000,-3.981,-1.525
"C(category, Treatment(reference=""All Students""))[T.Current ELL]",-22.9465,0.692,-33.152,0.000,-24.303,-21.590
"C(category, Treatment(reference=""All Students""))[T.Econ Disadv]",0.1917,0.594,0.323,0.747,-0.974,1.357
"C(category, Treatment(reference=""All Students""))[T.Ever ELL]",3.2859,0.671,4.897,0.000,1.971,4.601
"C(category, Treatment(reference=""All Students""))[T.Female]",3.0494,0.568,5.364,0.000,1.935,4.164
"C(category, Treatment(reference=""All Students""))[T.Hispanic]",-2.2691,0.598,-3.795,0.000,-3.441,-1.097
"C(category, Treatment(reference=""All Students""))[T.Male]",-3.0541,0.569,-5.366,0.000,-4.170,-1.938

0,1,2,3
Omnibus:,223.023,Durbin-Watson:,0.537
Prob(Omnibus):,0.0,Jarque-Bera (JB):,330.244
Skew:,0.417,Prob(JB):,1.94e-72
Kurtosis:,3.957,Cond. No.,13.2


**ANOVA Table**

Unnamed: 0,sum_sq,df,F,PR(>F)
"C(category, Treatment(reference=""All Students""))",240840.943761,13.0,249.304265,0.0
Residual,364498.941506,4905.0,,


In [19]:
# run different OLS regression using the school demographics to predict mean_scale_score for all students
# the above example uses the R-like formual method for OLS
# this uses the python/programmatic approach

# calculate coefficients for these factors
factors = ['total_enrollment', 'asian_pct',  'black_pct', 
       'hispanic_pct',  'white_pct','swd_pct',  'ell_pct',  'poverty_pct', 'number_tested']

# only use the All Students data
data = df[df["category"]=="All Students"].copy()

# add boro as a factor, but convert to category first
# "dummies" basically converts the categorical boro data to "wide" Bolean data with one col for reach boro
dummies = pd.get_dummies(data[['boro']])

display(md("### OLS Model Summary: School Demographics Regression"))

y = data['mean_scale_score']
X = pd.concat([data[factors], dummies], axis=1)
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

display(model.summary())



### OLS Model Summary: School Demographics Regression

0,1,2,3
Dep. Variable:,mean_scale_score,R-squared:,0.728
Model:,OLS,Adj. R-squared:,0.72
Method:,Least Squares,F-statistic:,93.41
Date:,"Thu, 24 Mar 2022",Prob (F-statistic):,2.95e-119
Time:,17:17:25,Log-Likelihood:,-1405.3
No. Observations:,468,AIC:,2839.0
Df Residuals:,454,BIC:,2897.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,515.4631,9.411,54.774,0.000,496.969,533.957
total_enrollment,-0.0004,0.001,-0.420,0.675,-0.003,0.002
asian_pct,15.8875,12.735,1.248,0.213,-9.139,40.914
black_pct,-1.1354,12.589,-0.090,0.928,-25.875,23.604
hispanic_pct,6.1447,12.319,0.499,0.618,-18.064,30.353
white_pct,8.7286,12.240,0.713,0.476,-15.326,32.784
swd_pct,-45.9423,4.452,-10.319,0.000,-54.692,-37.193
ell_pct,-36.1449,2.778,-13.009,0.000,-41.605,-30.685
poverty_pct,-9.4903,2.893,-3.280,0.001,-15.176,-3.805

0,1,2,3
Omnibus:,21.447,Durbin-Watson:,1.823
Prob(Omnibus):,0.0,Jarque-Bera (JB):,58.654
Skew:,0.027,Prob(JB):,1.83e-13
Kurtosis:,4.734,Cond. No.,2.02e+18
