In [1]:
# Import pandas, numpy, and the Markdown display helper; the data sets are loaded in the next cell
import pandas as pd
import numpy as np
from IPython.display import Markdown as md
# Core functions for importing and manipulating school data

# Using some of the statistics tools from the new example notebook files
import scipy as scipy
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
from nycschools import schools, exams, ui, geo

In [2]:
# Pull the math exam results and the school demographics via the nycschools helpers.
# NOTE(review): the output saved with this cell records
# "ValueError: Length mismatch: Expected axis has 20 elements, new values have 19 elements"
# -- re-run on a fresh kernel to confirm the loaders work with the installed nycschools version.
math_df = exams.load_math()
demo_df = schools.load_school_demographics()

# Inner join: keep only rows whose ('dbn', 'ay') key appears in both frames.
data = pd.merge(demo_df, math_df, how='inner', on=['dbn', 'ay'])

# Show the merged column names as the cell output.
data.columns

  warn(msg)


ValueError: Length mismatch: Expected axis has 20 elements, new values have 19 elements

In [9]:
# Summary statistics (count, mean, std, quartiles) of the mean scale score column.
data["mean_scale_score"].describe()

count    165330.000000
mean        499.594753
std         141.557821
min         219.000000
25%         317.587540
50%         590.075012
75%         602.125000
max         644.222229
Name: mean_scale_score, dtype: float64

In [18]:
import numpy as np
import scipy as scipy
from scipy.stats import pearsonr
import statsmodels.api as sm
import pingouin as pg
from statsmodels.formula.api import ols

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import scale 
from sklearn.cross_decomposition import PLSRegression

# Restrict the analysis to 'All Students' rows that report a mean scale score.
# `data` is deliberately rebound: the cells that follow read this filtered frame.
has_score = data.mean_scale_score.notnull()
is_all_students = data.category == 'All Students'
data = data[has_score & is_all_students]
# Alternative filter (most recent academic year only):
# data = data.query(f"ay == {data.ay.max()} and category == 'All Students'")

model = LinearRegression()

# Predictor columns for the regression on `level_3_4_pct`.
factors = ['asian_pct', 'hispanic_pct', 'black_pct', 'white_pct', 'swd_pct', 'ell_pct', 'poverty_pct', 'eni_pct']

X = data[factors]
y = data['level_3_4_pct']

# Fixed seed so the shuffled split -- and therefore the metrics printed
# below -- are identical on every Restart & Run All.
SPLIT_SEED = 42
# NOTE(review): train_size=0.3 fits on 30% of the rows and scores on the
# remaining 70%; confirm this is intended (test_size=0.3 is the more common choice).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, train_size=0.3, random_state=SPLIT_SEED
)

model.fit(X_train, y_train)
predictions = model.predict(X_test)
r2 = r2_score(y_test, predictions)
# RMSE computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('The r2 is: ', r2)
print('The rmse is: ', rmse)

# Pair each predictor with its fitted coefficient as the cell output.
list(zip(factors, model.coef_))

The r2 is:  0.4765337087089281
The rmse is:  18.18922138974904


[('asian_pct', 40.24329724156775),
 ('hispanic_pct', 7.4467214152752135),
 ('black_pct', -1.0411387647653454),
 ('white_pct', 25.900129394751726),
 ('swd_pct', -128.7506962924882),
 ('ell_pct', -51.61717392314424),
 ('poverty_pct', -36.24487677415301),
 ('eni_pct', 18.964808905075067)]

In [10]:
# Fresh estimator; `show_predict` (defined in this cell) refits it on every call.
model = LinearRegression()

# Shuffle the rows once with a fixed seed so the positional (shuffle=False)
# train/test split in `show_predict` is randomized yet identical across runs.
# Without `random_state`, every run would draw a different permutation and
# the reported r2 / rmse values would not be reproducible.
SHUFFLE_SEED = 42
data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# make a small function so that we can report r2 and rmse for different factors

def show_predict(factors, title, target='mean_scale_score'):
    """Fit the shared linear model on `factors` and display its r2 and rmse.

    Reads the module-level `data` frame. The split is positional
    (``shuffle=False``, ``train_size=0.3``): the leading 30% of the rows are
    used for fitting and the rest for scoring, so the result depends on the
    current row order of `data`.

    Args:
        factors: list of column names in `data` used as predictors.
        title: heading rendered in bold at the top of the report.
        target: column of `data` to predict; defaults to 'mean_scale_score'.

    Returns:
        None. The report is rendered as Markdown via `display`.

    Side effects:
        Refits the module-level `model`, so after the call it holds the
        coefficients of this most recent fit.
    """
    X = data[factors]
    y = data[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, train_size=0.3)

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)
    # sqrt(MSE) instead of `squared=False`: that keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    report = f"""
**{title}**

- factors: {factors}
- r2: {r2}
- rmse: {rmse}
"""
    display(md(report))

# Predictors shared by all four experiments, in the order they are reported.
race_pcts = ['asian_pct', 'black_pct', 'hispanic_pct', 'white_pct']
need_pcts = ['swd_pct', 'ell_pct']

# (report title, predictor list) pairs, evaluated in this order.
experiments = (
    ("With total enrollment",
     ['total_enrollment'] + race_pcts + need_pcts + ['poverty_pct', 'charter']),
    ("Without total enrollment",
     race_pcts + need_pcts + ['poverty_pct', 'charter']),
    ("Adding ENI",
     race_pcts + need_pcts + ['poverty_pct', 'eni_pct', 'charter']),
    ("ENI without Poverty %",
     race_pcts + need_pcts + ['eni_pct', 'charter']),
)

# `factors` is the loop variable on purpose: once the loop finishes it is
# still bound to the last predictor list, as it was with the explicit assignments.
for title, factors in experiments:
    show_predict(factors, title)


**With total enrollment**

- factors: ['total_enrollment', 'asian_pct', 'black_pct', 'hispanic_pct', 'white_pct', 'swd_pct', 'ell_pct', 'poverty_pct', 'charter']
- r2: 0.03383721201375112
- rmse: 139.13207209363262



**Without total enrollment**

- factors: ['asian_pct', 'black_pct', 'hispanic_pct', 'white_pct', 'swd_pct', 'ell_pct', 'poverty_pct', 'charter']
- r2: 0.0335044316542944
- rmse: 139.15603101409255



**Adding ENI**

- factors: ['asian_pct', 'black_pct', 'hispanic_pct', 'white_pct', 'swd_pct', 'ell_pct', 'poverty_pct', 'eni_pct', 'charter']
- r2: 0.21668620272220873
- rmse: 125.27660861322495



**ENI without Poverty %**

- factors: ['asian_pct', 'black_pct', 'hispanic_pct', 'white_pct', 'swd_pct', 'ell_pct', 'eni_pct', 'charter']
- r2: 0.19080106965335386
- rmse: 127.3297099796732
