In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import cross_validate
from sklearn.impute import SimpleImputer

Let's open our xgboostmath model.

In [2]:
expected_model_version = '1.0'
model_path = './xgboostmath.pkl'

# Fail fast: every later cell needs `modelm`, so a missing file should stop the
# run here with a clear message instead of surfacing later as a NameError.
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Expected model not found: {model_path}")

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load model files that were produced by a trusted source.
with open(model_path, 'rb') as f:
    modelm = pickle.load(f)

# Version mismatches are reported but not fatal; the model may still be usable.
if modelm.version != expected_model_version:
    print("Expected model version doesn't match version loaded")
if modelm.sklearn_version != sklearn_version:
    print("Warning: model created under different sklearn version")

I will retrain the model on our entire dataset then use it to predict the performance of the state of Delaware as a whole on math proficiency.

In [3]:
# Load the math results table; the 'PctProficient' target and the model's
# feature columns are taken from this frame in the next cell.
# NOTE(review): the file carries an 'Unnamed: 0' column -- presumably an index
# written out by an earlier to_csv; confirm and consider read_csv(index_col=0).
math_data = pd.read_csv('SBSA_math.csv')
math_data

Unnamed: 0,PctProficient,ScaleScoreAvg,FallEnrollment,"Students Per Teacher, Regular","Students Per Teacher, Special","Students Per Specialist, Instructional",Students Per Assistant Superintendent,Students Per Administrative Assistant,"Students Per Supervisor, Pupil Support",Students Per Crafts & Trades,...,Students Per Nurse,Students Per Social Worker,Students Per Principal,Students Per Guidance Counselor,Students Per Assistant Principal,Students Per Librarian,Students Per Service Aide,Students Per Bus Driver & Laborer,Students Per Other General Support,Students Per Psychometrist
0,59.75,,699.0,19.971429,116.500000,699.00,,,,,...,699.0,,699.0,699.0,,,,,,
1,65.12,,436.0,16.769231,145.333333,436.00,,,,,...,,,436.0,436.0,436.0,436.0,,,,
2,57.60,,442.0,17.680000,55.250000,,,,,,...,442.0,,442.0,442.0,442.0,442.0,,,,
3,68.15,,495.0,19.800000,123.750000,495.00,,,,,...,495.0,,495.0,495.0,495.0,495.0,495.0,,,
4,56.66,,542.0,19.357143,180.666667,,,,,,...,542.0,,542.0,542.0,,542.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,14.17,2388.06,613.0,17.514286,153.250000,,,613.0,,,...,613.0,,,,,,,76.625,204.333333,
814,42.86,2493.83,600.0,18.181818,,,,150.0,,600.0,...,300.0,,600.0,,,600.0,,,,
815,42.86,2493.83,600.0,18.181818,,,,150.0,,600.0,...,300.0,,600.0,,,600.0,,,,
816,18.69,2366.36,341.0,24.357143,170.500000,85.25,,,,,...,341.0,170.5,,,,,,,341.000000,


In [4]:
# Features are the columns recorded on the loaded model object; the target is
# the percent-proficient column.
feature_columns = modelm.X_columns
X = math_data[feature_columns]
y = math_data['PctProficient']

In [5]:
# Sanity check: features and target must have the same number of rows.
X.shape[0], y.shape[0]

(818, 818)

Fit the model to the entire dataset.

In [6]:
# Refit the loaded model on every row of the math dataset (no hold-out split);
# this fitted model is the one used for the statewide prediction below.
modelm.fit(X, y)

In [7]:
# 5-fold cross-validation scored by negative mean absolute error. cross_validate
# fits clones of modelm on each training fold, so these are out-of-fold scores
# and are not affected by the full-data fit in the previous cell.
cv_results = cross_validate(modelm, X, y, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)

In [8]:
# Per-fold scores. They are negative because the scorer is
# 'neg_mean_absolute_error'; the sign is flipped in the next cell.
cv_results['test_score']

array([-12.19651864, -10.02295218, -12.85816164, -11.17770319,
        -9.5965747 ])

In [9]:
# Flip the sign once to get positive MAE per fold, then summarise across folds.
fold_mae = -cv_results['test_score']
mae_mean = fold_mae.mean()
mae_std = fold_mae.std()
mae_mean, mae_std

(11.170382068203788, 1.2405704118374818)

The cells above give the cross-validated mean absolute error of our model and the standard deviation of that error across the five folds. Next, get the stats for the state of Delaware as a whole.

In [10]:
# The state file holds one row per content area; keep only the math row(s).
state_stats = pd.read_csv('StateofDelawarestats.csv')
is_math = state_stats['ContentArea'] == 'MATH'
Delaware_math = state_stats[is_math]
Delaware_math

Unnamed: 0,School Year,School Code,Organization,Assessment Name,ContentArea,Grade,PctProficient,FallEnrollment,"Students Per Teacher, Regular","Students Per Teacher, Special",...,Students Per Assistant Superintendent,Students Per Speech and Hearing Therapist,Students Per Bus Driver & Laborer,Students Per Psychologist,Students Per Managerial,Students Per Principal,Students Per Superintendent,Students Per Teaching & Clerical Aide,"Students Per Specialist, Instructional",Students Per Psychometrist
1,2022,0,State of Delaware,Smarter Balanced Summative Assessment,MATH,All Students,30.21,140263.0,18.743719,53.953533,...,8766.4375,684.87793,304.919565,638.429677,882.157233,623.391111,6375.590909,48.721039,476.113374,


In [11]:
# Build the row selector once and reuse it for both the features and the target.
is_state_row = Delaware_math.Organization == "State of Delaware"
X_del = Delaware_math.loc[is_state_row, modelm.X_columns]
y_del = Delaware_math.loc[is_state_row, 'PctProficient']

In [12]:
# Predict statewide math proficiency. .item() collapses the one-element
# prediction to a plain Python scalar and raises if X_del ever holds more
# than one row.
del_pred = modelm.predict(X_del).item()

In [13]:
# Collapse the one-row target Series to a Python float. Going through
# np.asarray keeps this cell idempotent: on a second run y_del is already a
# float, and `y_del.values` would raise AttributeError.
y_del = np.asarray(y_del).item()

In [14]:
# Report the statewide prediction next to the actual value and the model's
# cross-validated error, one sentence per line.
summary_lines = (
    f'The predicted PctProficient for math is {del_pred:.2f}, actual PctProficient is {y_del:.2f}.',
    f'The expected mean absolute error of {mae_mean:.2f}.',
)
print('\n'.join(summary_lines))

The predicted PctProficient for math is 42.89, actual PctProficient is 30.21.
The expected mean absolute error of 11.17.


The model predicts that math proficiency in the state should be 42.89%, while the actual proficiency is 30.21%. This gap of 12.68 percentage points is larger than the mean absolute error. There are a number of possible reasons. The first is that proficiency scores are not normally distributed across schools. Another is that COVID learning loss may have affected math proficiency in a dramatic way. Or the result could simply be a fluke. Whatever the case, this suggests that math scores will improve in the state regardless of any action taken to change staffing levels. A somewhat unsatisfying conclusion, but let's try to improve on the model score regardless.

In [15]:
def predict_increase(features, deltas, model=None, baseline=None):
    """Return the change in predicted proficiency after shifting feature values.

    Parameters
    ----------
    features : iterable of str
        Column names to adjust.
    deltas : iterable of numbers
        Amount added to each corresponding column (negative values lower the
        students-per-staff ratio).
    model : object with ``predict``, optional
        Fitted model used for both predictions. Defaults to the notebook's
        math model ``modelm``.
    baseline : pandas.DataFrame, optional
        Single-row feature frame to perturb. Defaults to the notebook's
        ``X_del``. It is copied, never modified.

    Returns
    -------
    float
        Prediction on the adjusted row minus prediction on the baseline row.

    Raises
    ------
    ValueError
        If ``features`` and ``deltas`` differ in length (``zip`` would
        otherwise silently drop the extras).
    """
    # Resolve defaults at call time so existing two-argument calls keep working.
    model = modelm if model is None else model
    baseline = X_del if baseline is None else baseline

    features = list(features)
    deltas = list(deltas)
    if len(features) != len(deltas):
        raise ValueError(
            f'features and deltas must have the same length '
            f'({len(features)} != {len(deltas)})'
        )

    adjusted = baseline.copy()
    for feature, delta in zip(features, deltas):
        adjusted[feature] = adjusted[feature] + delta
    return model.predict(adjusted).item() - model.predict(baseline).item()

Students Per Assistant Principal and Students Per Director were the most important features in our model. Let's try changing the student-to-staff ratio for both of these positions to maximize the predicted proficiency.

In [16]:
# Exhaustive search over reductions of 1..99 students per assistant principal
# and per director, keeping the first combination with the largest predicted
# gain. A gain must be strictly positive to replace the initial zeros.
searched_features = ['Students Per Assistant Principal', 'Students Per Director']
best_gain, best_ap_delta, best_director_delta = 0, 0, 0

for ap_delta in range(-1, -100, -1):
    for director_delta in np.arange(-1, -100, -1):
        gain = predict_increase(searched_features, [ap_delta, director_delta])
        if gain > best_gain:
            best_gain, best_ap_delta, best_director_delta = gain, ap_delta, director_delta

# Publish the result under the names the rest of the notebook uses.
proficiency_increase, icounter, jcounter = best_gain, best_ap_delta, best_director_delta

print(f'A decrease in students per assistant principal by {icounter} and director by {jcounter} will improve the model proficiency by {proficiency_increase}%.')

A decrease in students per assistant principal by -17 and director by -1 will improve the model proficiency by 2.4587249755859375%.


Not a very big increase in performance. Disappointing; let's try English.

In [17]:
expected_model_version = '1.0'
model_path = './xgboostela.pkl'

# Fail fast: every later cell needs `modele`, so a missing file should stop the
# run here with a clear message instead of surfacing later as a NameError.
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Expected model not found: {model_path}")

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load model files that were produced by a trusted source.
with open(model_path, 'rb') as f:
    modele = pickle.load(f)

# Version mismatches are reported but not fatal; the model may still be usable.
if modele.version != expected_model_version:
    print("Expected model version doesn't match version loaded")
if modele.sklearn_version != sklearn_version:
    print("Warning: model created under different sklearn version")

I will go through the same steps with this model as I did with the math model.

In [18]:
# Load the ELA results table; the 'PctProficient' target and the model's
# feature columns are taken from this frame in the next cell.
# NOTE(review): the file carries an 'Unnamed: 0' column -- presumably an index
# written out by an earlier to_csv; confirm and consider read_csv(index_col=0).
ela_data = pd.read_csv('SBSA_ela.csv')
ela_data

Unnamed: 0,PctProficient,ScaleScoreAvg,FallEnrollment,"Students Per Teacher, Regular","Students Per Teacher, Special","Students Per Specialist, Instructional",Students Per Assistant Superintendent,Students Per Administrative Assistant,"Students Per Supervisor, Pupil Support",Students Per Crafts & Trades,...,Students Per Nurse,Students Per Social Worker,Students Per Principal,Students Per Guidance Counselor,Students Per Assistant Principal,Students Per Librarian,Students Per Service Aide,Students Per Bus Driver & Laborer,Students Per Other General Support,Students Per Psychometrist
0,69.06,,699.0,19.971429,116.500000,699.00,,,,,...,699.0,,699.0,699.0,,,,,,
1,80.84,,436.0,16.769231,145.333333,436.00,,,,,...,,,436.0,436.0,436.0,436.0,,,,
2,63.08,,442.0,17.680000,55.250000,,,,,,...,442.0,,442.0,442.0,442.0,442.0,,,,
3,75.00,,495.0,19.800000,123.750000,495.00,,,,,...,495.0,,495.0,495.0,495.0,495.0,495.0,,,
4,73.04,,542.0,19.357143,180.666667,,,,,,...,542.0,,542.0,542.0,,542.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1141,33.33,2491.88,285.0,15.000000,71.250000,142.50,,142.5,,,...,285.0,,95.0,285.0,95.0,,,,,
1142,11.36,2447.20,503.0,16.225806,503.000000,,,,,,...,503.0,,503.0,251.5,,,503.0,,,
1143,11.36,2447.20,503.0,16.225806,503.000000,,,,,,...,503.0,,503.0,251.5,,,503.0,,,
1144,28.44,2390.86,341.0,24.357143,170.500000,85.25,,,,,...,341.0,170.5,,,,,,,341.0,


In [19]:
# Features are the columns recorded on the loaded ELA model; the target is the
# percent-proficient column. This rebinds X and y from the math section.
feature_columns = modele.X_columns
X = ela_data[feature_columns]
y = ela_data['PctProficient']

In [20]:
# Sanity check: features and target must have the same number of rows.
X.shape[0], y.shape[0]

(1146, 1146)

In [21]:
# Refit the loaded ELA model on every row of the ELA dataset (no hold-out
# split); this fitted model is used for the statewide prediction below.
modele.fit(X, y)

In [22]:
# 5-fold cross-validation scored by negative mean absolute error. cross_validate
# fits clones of modele on each training fold, so these are out-of-fold scores
# and are not affected by the full-data fit in the previous cell.
# This overwrites the math section's cv_results.
cv_results = cross_validate(modele, X, y, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)

In [23]:
# Per-fold scores. They are negative because the scorer is
# 'neg_mean_absolute_error'; the sign is flipped in the next cell.
cv_results['test_score']

array([ -9.89241583,  -7.387582  ,  -7.87582363,  -8.78458863,
       -10.91375004])

In [24]:
# Flip the sign once to get positive MAE per fold, then summarise across folds.
fold_mae = -cv_results['test_score']
mae_mean = fold_mae.mean()
mae_std = fold_mae.std()
mae_mean, mae_std

(8.97083202513765, 1.2934185509606109)

In [25]:
# The state file holds one row per content area; keep only the ELA row(s).
state_stats = pd.read_csv('StateofDelawarestats.csv')
is_ela = state_stats['ContentArea'] == 'ELA'
Delaware_ela = state_stats[is_ela]
Delaware_ela

Unnamed: 0,School Year,School Code,Organization,Assessment Name,ContentArea,Grade,PctProficient,FallEnrollment,"Students Per Teacher, Regular","Students Per Teacher, Special",...,Students Per Assistant Superintendent,Students Per Speech and Hearing Therapist,Students Per Bus Driver & Laborer,Students Per Psychologist,Students Per Managerial,Students Per Principal,Students Per Superintendent,Students Per Teaching & Clerical Aide,"Students Per Specialist, Instructional",Students Per Psychometrist
0,2022,0,State of Delaware,Smarter Balanced Summative Assessment,ELA,All Students,41.69,140263.0,18.743719,53.953533,...,8766.4375,684.87793,304.919565,638.429677,882.157233,623.391111,6375.590909,48.721039,476.113374,


In [26]:
# Build the row selector once and reuse it for both the features and the target.
# This rebinds X_del and y_del from the math section.
is_state_row = Delaware_ela.Organization == "State of Delaware"
X_del = Delaware_ela.loc[is_state_row, modele.X_columns]
y_del = Delaware_ela.loc[is_state_row, 'PctProficient']

In [27]:
# Predict statewide ELA proficiency. .item() collapses the one-element
# prediction to a plain Python scalar and raises if X_del ever holds more
# than one row.
del_pred = modele.predict(X_del).item()

In [28]:
# Collapse the one-row target Series to a Python float. Going through
# np.asarray keeps this cell idempotent: on a second run y_del is already a
# float, and `y_del.values` would raise AttributeError.
y_del = np.asarray(y_del).item()

In [29]:
# Report the statewide prediction next to the actual value and the model's
# cross-validated error, one sentence per line.
summary_lines = (
    f'The predicted PctProficient for ela is {del_pred:.2f}, actual PctProficient is {y_del:.2f}.',
    f'The expected mean absolute error of {mae_mean:.2f}.',
)
print('\n'.join(summary_lines))

The predicted PctProficient for ela is 39.43, actual PctProficient is 41.69.
The expected mean absolute error of 8.97.


This seems to suggest that this model performs a lot better than the math model.

In [30]:
def predict_increase(features, deltas, model=None, baseline=None):
    """Return the change in predicted proficiency after shifting feature values.

    This definition replaces the earlier math version of the same name; with
    no extra arguments it uses the ELA model.

    Parameters
    ----------
    features : iterable of str
        Column names to adjust.
    deltas : iterable of numbers
        Amount added to each corresponding column (negative values lower the
        students-per-staff ratio).
    model : object with ``predict``, optional
        Fitted model used for both predictions. Defaults to the notebook's
        ELA model ``modele``.
    baseline : pandas.DataFrame, optional
        Single-row feature frame to perturb. Defaults to the notebook's
        ``X_del``. It is copied, never modified.

    Returns
    -------
    float
        Prediction on the adjusted row minus prediction on the baseline row.

    Raises
    ------
    ValueError
        If ``features`` and ``deltas`` differ in length (``zip`` would
        otherwise silently drop the extras).
    """
    # Resolve defaults at call time so existing two-argument calls keep working.
    model = modele if model is None else model
    baseline = X_del if baseline is None else baseline

    features = list(features)
    deltas = list(deltas)
    if len(features) != len(deltas):
        raise ValueError(
            f'features and deltas must have the same length '
            f'({len(features)} != {len(deltas)})'
        )

    adjusted = baseline.copy()
    for feature, delta in zip(features, deltas):
        adjusted[feature] = adjusted[feature] + delta
    return model.predict(adjusted).item() - model.predict(baseline).item()

In [33]:
# Exhaustive search over reductions of 1..99 students per assistant principal
# and per director, keeping the first combination with the largest predicted
# gain. A gain must be strictly positive to replace the initial zeros.
searched_features = ['Students Per Assistant Principal', 'Students Per Director']
best_gain, best_ap_delta, best_director_delta = 0, 0, 0

for ap_delta in range(-1, -100, -1):
    for director_delta in np.arange(-1, -100, -1):
        gain = predict_increase(searched_features, [ap_delta, director_delta])
        if gain > best_gain:
            best_gain, best_ap_delta, best_director_delta = gain, ap_delta, director_delta

# Publish the result under the names the rest of the notebook uses.
proficiency_increase, icounter, jcounter = best_gain, best_ap_delta, best_director_delta

print(f'A decrease in students per assistant principal by {icounter} and director by {jcounter} will improve the model proficiency by {proficiency_increase}%.')

A decrease in students per assistant principal by -89 and director by -1 will improve the model proficiency by 11.994335174560547%.


Here it appears that adding more assistant principals will improve the performance of the state on the English Language Arts version of the SBSA. But how many assistant principals is that?

In [36]:
# Current statewide ratio next to the ratio the grid search found best.
# icounter is the (negative) change chosen by the search above, so the target
# follows the model instead of a number copied by hand from its printed output.
current_students_per_ap = Delaware_ela['Students Per Assistant Principal']
current_students_per_ap, (current_students_per_ap + icounter)

(0    448.411125
 Name: Students Per Assistant Principal, dtype: float64,
 0    359.411125
 Name: Students Per Assistant Principal, dtype: float64)

There are currently 448 students per assistant principal; we want to get that down to 359.

In [38]:
# Load the enrollment table. The preview shows one row per school year,
# organization and demographic subgroup, with 'FallEnrollment' repeated on
# each row.
delaware_enrollment = pd.read_csv('Student_Enrollment.csv')
delaware_enrollment.head()

Unnamed: 0,School Year,District Code,District,School Code,Organization,Race,Gender,Grade,SpecialDemo,Geography,SubGroup,RowStatus,Students,EOYEnrollment,PctOfEOYEnrollment,FallEnrollment
0,2015,0,State of Delaware,0,State of Delaware,White,All Students,9th Grade,All Students,All Students,White/9th Grade,REPORTED,5631.0,141336.0,3.98,134932.0
1,2015,0,State of Delaware,0,State of Delaware,White,All Students,Twelfth,All Students,All Students,White/Twelfth,REPORTED,4828.0,141336.0,3.42,134932.0
2,2015,0,State of Delaware,0,State of Delaware,White,All Students,All Students,All Students,All Students,White,REPORTED,65185.0,141336.0,46.12,134932.0
3,2015,0,State of Delaware,0,State of Delaware,White,Female,4th Grade,Homeless,All Students,White/Female/4th Grade/Homeless,REDACTED,37.0,141336.0,0.03,
4,2015,0,State of Delaware,0,State of Delaware,White,Female,4th Grade,Low-Income,All Students,White/Female/4th Grade/Low-Income,REPORTED,600.0,141336.0,0.42,134932.0


In [41]:
# Keep only School Year 2022, then take the largest FallEnrollment among the
# remaining rows as the headline enrollment figure.
is_2022 = delaware_enrollment['School Year'] == 2022
delaware_enrollment = delaware_enrollment[is_2022]
delaware_enrollment['FallEnrollment'].max()

140263.0

There were 140263 students enrolled in the 2021-2022 school year.

In [42]:
# Current number of assistant principals = enrollment / students per AP.
# Both inputs come from the frames loaded above rather than hand-copied,
# rounded literals, so the figure tracks the data if the files change.
state_enrollment = float(delaware_enrollment['FallEnrollment'].max())
current_students_per_ap = Delaware_ela['Students Per Assistant Principal'].item()
state_enrollment / current_students_per_ap

312.80008741980015

Meaning there were 313 assistant principals in the state.

In [43]:
# Assistant principals needed at the target ratio. The target is the current
# ratio plus icounter, the (negative) change chosen by the grid search above,
# instead of a literal copied from an earlier output.
state_enrollment = float(delaware_enrollment['FallEnrollment'].max())
target_students_per_ap = Delaware_ela['Students Per Assistant Principal'].item() + icounter
state_enrollment / target_students_per_ap

390.25794981233184

We want to increase that to 390 assistant principals, an increase of 77 staff. How much would that cost?

In [52]:
# Load the educator salary table. Despite the variable name, at this point the
# frame holds every job classification; it is narrowed to assistant principals
# in the next cell.
delaware_ap_salary = pd.read_csv('Educator_Average_Salary.csv')
delaware_ap_salary.head()

Unnamed: 0,School Year,District Code,District,School Code,Organization,Race,Gender,Grade,SpecialDemo,Geography,...,Staff Category,Job Classification,Experience,Educators (FTE),Average Total Salary,Average State Salary,Average Local Salary,Average Federal Salary,Average Years of Experience,Average Years of Age
0,2020,24,Smyrna School District,697,Smyrna Administrative Office,White,All Educators,All Educators,Regular,All Educators,...,Official/Administrative,ALL,ALL,14.0,95625.22,54023.65,41601.56,,19,53
1,2020,24,Smyrna School District,697,Smyrna Administrative Office,White,All Educators,All Educators,Regular,All Educators,...,Pupil Support,ALL,ALL,1.0,97917.55,60056.88,37860.68,,1,33
2,2020,24,Smyrna School District,697,Smyrna Administrative Office,White,All Educators,All Educators,All Educators,All Educators,...,ALL,ALL,ALL,25.0,48982.63,35892.53,13090.1,,20,46
3,2020,24,Smyrna School District,697,Smyrna Administrative Office,White,All Educators,All Educators,Regular,All Educators,...,Instructional Support,"Supervisor, Instructional",ALL,3.0,115362.61,64883.95,50478.65,,12,53
4,2020,24,Smyrna School District,697,Smyrna Administrative Office,White,All Educators,All Educators,All Educators,All Educators,...,ALL,ALL,ALL,46.0,69886.27,44649.62,25236.64,,19,48


In [53]:
# Narrow the salary table to the statewide, all-demographics, all-experience
# assistant-principal row for 2022. Filters are applied one at a time, in the
# order listed.
ap_row_filters = {
    'Job Classification': 'Assistant Principal',
    'Organization': 'State of Delaware',
    'SpecialDemo': 'All Educators',
    'Race': 'All Educators',
    'Gender': 'All Educators',
    'Experience': 'ALL',
    'School Year': 2022,
}
for column, wanted in ap_row_filters.items():
    delaware_ap_salary = delaware_ap_salary[delaware_ap_salary[column] == wanted]
delaware_ap_salary

Unnamed: 0,School Year,District Code,District,School Code,Organization,Race,Gender,Grade,SpecialDemo,Geography,...,Staff Category,Job Classification,Experience,Educators (FTE),Average Total Salary,Average State Salary,Average Local Salary,Average Federal Salary,Average Years of Experience,Average Years of Age
1633927,2022,0,State of Delaware,0,State of Delaware,All Educators,All Educators,All Educators,All Educators,All Educators,...,Official/Administrative,Assistant Principal,ALL,312.8,112222.42,65941.47,52236.87,83283.72,15,45


The average salary of an assistant principal was $112,222.42 in 2022. An increase of 77 of them at the average salary would cost:

In [54]:
# Yearly cost of the extra assistant principals at the 2022 average salary.
# The salary is read from the filtered row above rather than retyped; float()
# guards against the column having been parsed as text.
additional_assistant_principals = 77  # 390 targeted minus 313 current, both rounded
average_ap_salary = float(delaware_ap_salary['Average Total Salary'].iloc[0])
average_ap_salary * additional_assistant_principals

8641126.34

About 8.6 million dollars, well within our budget.