In [29]:
# load the demographic data
import pandas as pd
import numpy as np
import scipy as scipy
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
import pingouin as pg


import schools


In [46]:
demo_df = schools.load_school_demographics()
math_public_url = "https://data.cityofnewyork.us/resource/74ah-8ukf.csv?$limit=1000000"
math_charter_url = "https://data.cityofnewyork.us/resource/3xsw-bpuy.csv?$limit=1000000"

public_df = pd.read_csv(math_public_url)
public_df["mean_scale_score"] = pd.to_numeric(public_df["mean_scale_score"], downcast='integer', errors='coerce')
public_df.drop(["school_name"], axis=1, inplace=True)
public_df["charter"] = False

charter_df = pd.read_csv(math_charter_url)
charter_df["mean_scale_score"] = pd.to_numeric(charter_df["mean_scale_score"], downcast='integer', errors='coerce')

charter_df.drop(["unnamed_column", "school_name"], axis=1, inplace=True)
charter_df["charter"] = True


public_df = public_df[['number_tested', 'grade', 'dbn', 'year', 'mean_scale_score', 'charter']]
charter_df = charter_df[['number_tested', 'grade', 'dbn', 'year', 'mean_scale_score', 'charter']]

a = demo_df.merge(public_df, how="inner", on=["dbn", "year"])
b = demo_df.merge(charter_df, how="inner", on=["dbn", "year"])
df = a.append(b)

In [41]:
df["charter"]


0       False
1       False
2       False
3       False
4       False
        ...  
3479     True
3480     True
3481     True
3482     True
3483     True
Name: charter, Length: 22364, dtype: bool

In [47]:
# the columns we want to look at
cols = ['dbn', 
        'district',
        'boro',
        'school_name', 
        'charter',
        'year',
        'grade',
        'number_tested',
        'mean_scale_score',
        'total_enrollment',
        'female_1',
        'male_1',
        'asian_1', 
        'black_1', 
        'hispanic_1', 
        'multi_racial_1', 
        'native_american_1', 
        'white_1', 
        'students_with_disabilities_1', 
        'english_language_learners_1',  
        'poverty_1',
        'economic_need_index',
       ]
# make the combined df the default df
df = df[cols]

In [48]:
# Pearson R correlation
charter_R = pg.corr(df["charter"], df["mean_scale_score"])

charter_R


Unnamed: 0,n,r,CI95%,p-val,BF10,power
pearson,22299,0.05598,"[0.04, 0.07]",5.986905000000001e-17,13160000000000.0,1.0


In [51]:
# T test
t = scipy.stats.ttest_ind(public_df["mean_scale_score"], charter_df["mean_scale_score"])

print(f"""
T-Test results comparing average math test scores at the school level of charter schools (n={charter_df["dbn"].count()}) and community schools (n={public_df["dbn"].count()}).

  Charter schools: M={round(charter_df["mean_scale_score"].mean(), 2)}, SD={round(charter_df["mean_scale_score"].std(), 2)}
Community schools: M={round(public_df["mean_scale_score"].mean(), 2)}, SD={round(public_df["mean_scale_score"].std(), 2)}
T-score: {round(t.statistic, 2)}, p-val: {round(t.pvalue, 4)}
""")

Ttest_indResult(statistic=nan, pvalue=nan)


T-Test results comparing average math test scores at the school level of charter schools (n=5267) and community schools (n=32606).

  Charter schools: M=419.15, SD=142.22
Community schools: M=387.41, SD=136.96
T-score: nan, p-val: nan

