In [1]:
# load the demographic data
import pandas as pd
import numpy as np
import scipy as scipy
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols

import schools
df = schools.load_school_demographics()



# load the data from the csv file
ela = pd.read_csv("ela-combined.csv")


# # drop the rows with NaN (where the pop is too small to report)
ela = ela[ela["mean_scale_score"].notnull()]
df = df.merge(ela, how="inner", on=["dbn", "year"])

df.columns

Index(['dbn', 'school_name', 'year', 'total_enrollment',
       'grade_3k_pk_half_day_full', 'grade_k', 'grade_1', 'grade_2', 'grade_3',
       'grade_4', 'grade_5', 'grade_6', 'grade_7', 'grade_8', 'grade_9',
       'grade_10', 'grade_11', 'grade_12', 'female', 'female_1', 'male',
       'male_1', 'asian', 'asian_1', 'black', 'black_1', 'hispanic',
       'hispanic_1', 'multi_racial', 'multi_racial_1', 'native_american',
       'native_american_1', 'white', 'white_1', 'missing_race_ethnicity_data',
       'missing_race_ethnicity_data_1', 'students_with_disabilities',
       'students_with_disabilities_1', 'english_language_learners',
       'english_language_learners_1', 'poverty', 'poverty_1',
       'economic_need_index', 'district', 'boro', 'grade', 'category',
       'number_tested', 'mean_scale_score', 'level_1', 'level_1_pct',
       'level_2', 'level_2_pct', 'level_3', 'level_3_pct', 'level_4',
       'level_4_pct', 'level_3_4', 'level_3_4_pct'],
      dtype='object')

In [2]:
# df["grade"] = df["grade"].astype(int)
df = df[df["grade"] =='8']
df = df[df["year"] == 2019]
df[["dbn", "grade","mean_scale_score"]]



Unnamed: 0,dbn,grade,mean_scale_score
602,01M034,8,596.843750
608,01M034,8,603.545471
614,01M034,8,582.099976
619,01M034,8,587.400024
626,01M034,8,599.117676
...,...,...,...
200396,32K562,8,595.762390
200399,32K562,8,595.625000
200403,32K562,8,577.666687
200407,32K562,8,600.583313


In [3]:
# create 4 groups

black = df[df["category"] == "Black"][["dbn", "mean_scale_score"]]
white = df[df["category"] == "White"][["dbn", "mean_scale_score"]]
hispanic = df[df["category"] == "Hispanic"][["dbn", "mean_scale_score"]]
asian = df[df["category"] == "Asian"][["dbn", "mean_scale_score"]]



In [4]:
t = scipy.stats.ttest_ind(white["mean_scale_score"],black["mean_scale_score"])

print(f"""
T-Test results comparing school averages of White (n={white["dbn"].count()}) and Black (n={black["dbn"].count()})students in 8th grade student ELA scores for 2019-20 academic year.

White students: M={white["mean_scale_score"].mean()}, SD={white["mean_scale_score"].std()}
Black students: M={black["mean_scale_score"].mean()}, SD={black["mean_scale_score"].std()}
T-score: {round(t.statistic, 4)}, p-val: {round(t.pvalue, 4)}
""")


T-Test results comparing school averages of White (n=176) and Black (n=318)students in 8th grade student ELA scores for 2019-20 academic year.

White students: M=608.2399111659091, SD=10.524117180693764
Black students: M=598.1670142591196, SD=7.7265981679485085
T-score: 12.1507, p-val: 0.0



In [5]:
# fvalue, pvalue = scipy.stats.f_oneway(
#     df[df["category"]=="Asian"]["mean_scale_score"], 
#     df[df["category"]=="Black"]["mean_scale_score"],
#     df[df["category"]=="Hispanic"]["mean_scale_score"],
#     df[df["category"]=="White"]["mean_scale_score"])

fvalue, pvalue = scipy.stats.f_oneway(
    asian["mean_scale_score"], 
    black["mean_scale_score"],
    hispanic["mean_scale_score"],
    white["mean_scale_score"])

# (47938.68162514735, 0.0)
fvalue, pvalue

(124.03897740929158, 1.2920393648289998e-68)

In [6]:
adf = df[df["category"].isin(["All Students", "Black", "White", "Asian", "Hispanic"])]
adf = adf[["category","mean_scale_score"]]



In [7]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

model = ols('mean_scale_score ~ C(category)', data=adf).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

Unnamed: 0,sum_sq,df,F,PR(>F)
C(category),28520.374094,4.0,89.997968,1.258879e-68
Residual,118837.575402,1500.0,,


In [8]:
squares = ols('mean_scale_score ~ C(category)', data=adf).fit()
squares.summary()

0,1,2,3
Dep. Variable:,mean_scale_score,R-squared:,0.194
Model:,OLS,Adj. R-squared:,0.191
Method:,Least Squares,F-statistic:,90.0
Date:,"Mon, 21 Mar 2022",Prob (F-statistic):,1.2599999999999999e-68
Time:,18:30:36,Log-Likelihood:,-5423.1
No. Observations:,1505,AIC:,10860.0
Df Residuals:,1500,BIC:,10880.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,600.9199,0.411,1460.522,0.000,600.113,601.727
C(category)[T.Asian],9.6385,0.799,12.066,0.000,8.072,11.205
C(category)[T.Black],-2.7529,0.647,-4.256,0.000,-4.022,-1.484
C(category)[T.Hispanic],-2.2691,0.617,-3.676,0.000,-3.480,-1.058
C(category)[T.White],7.3200,0.787,9.301,0.000,5.776,8.864

0,1,2,3
Omnibus:,59.979,Durbin-Watson:,0.9
Prob(Omnibus):,0.0,Jarque-Bera (JB):,105.76
Skew:,0.309,Prob(JB):,1.0799999999999999e-23
Kurtosis:,4.142,Cond. No.,5.01
