In [71]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import cross_validate
from sklearn.impute import SimpleImputer

In [72]:
expected_model_version = '1.0'
model_path = './xgboostmath.pkl'
if os.path.exists(model_path):
    with open(model_path, 'rb') as f:
        modelm = pickle.load(f)
    if modelm.version != expected_model_version:
        print("Expected model version doesn't match version loaded")
    if modelm.sklearn_version != sklearn_version:
        print("Warning: model created under different sklearn version")
else:
    print("Expected model not found")

In [73]:
math_data = pd.read_csv('SBSA_math.csv')
math_data

Unnamed: 0,PctProficient,ScaleScoreAvg,FallEnrollment,"Students Per Teacher, Regular","Students Per Teacher, Special","Students Per Specialist, Instructional",Students Per Assistant Superintendent,Students Per Administrative Assistant,"Students Per Supervisor, Pupil Support",Students Per Crafts & Trades,...,Students Per Nurse,Students Per Social Worker,Students Per Principal,Students Per Guidance Counselor,Students Per Assistant Principal,Students Per Librarian,Students Per Service Aide,Students Per Bus Driver & Laborer,Students Per Other General Support,Students Per Psychometrist
0,59.75,,699.0,19.971429,116.500000,699.00,,,,,...,699.0,,699.0,699.0,,,,,,
1,65.12,,436.0,16.769231,145.333333,436.00,,,,,...,,,436.0,436.0,436.0,436.0,,,,
2,57.60,,442.0,17.680000,55.250000,,,,,,...,442.0,,442.0,442.0,442.0,442.0,,,,
3,68.15,,495.0,19.800000,123.750000,495.00,,,,,...,495.0,,495.0,495.0,495.0,495.0,495.0,,,
4,56.66,,542.0,19.357143,180.666667,,,,,,...,542.0,,542.0,542.0,,542.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,14.17,2388.06,613.0,17.514286,153.250000,,,613.0,,,...,613.0,,,,,,,76.625,204.333333,
814,42.86,2493.83,600.0,18.181818,,,,150.0,,600.0,...,300.0,,600.0,,,600.0,,,,
815,42.86,2493.83,600.0,18.181818,,,,150.0,,600.0,...,300.0,,600.0,,,600.0,,,,
816,18.69,2366.36,341.0,24.357143,170.500000,85.25,,,,,...,341.0,170.5,,,,,,,341.000000,


In [74]:
X = math_data[modelm.X_columns]
y = math_data['PctProficient']

In [75]:
len(X), len(y)

(818, 818)

In [76]:
modelm.fit(X, y)

In [77]:
cv_results = cross_validate(modelm, X, y, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)

In [78]:
cv_results['test_score']

array([-12.19651864, -10.02295218, -12.85816164, -11.17770319,
        -9.5965747 ])

In [79]:
mae_mean, mae_std = np.mean(-1 * cv_results['test_score']), np.std(-1 * cv_results['test_score'])
mae_mean, mae_std

(11.170382068203788, 1.2405704118374818)

In [80]:
Delaware_math = pd.read_csv('StateofDelawarestats.csv')
Delaware_math = Delaware_math[Delaware_math['ContentArea'] == 'MATH']
Delaware_math

Unnamed: 0,School Year,School Code,Organization,Assessment Name,ContentArea,Grade,PctProficient,FallEnrollment,"Students Per Teacher, Regular","Students Per Teacher, Special",...,Students Per Assistant Superintendent,Students Per Speech and Hearing Therapist,Students Per Bus Driver & Laborer,Students Per Psychologist,Students Per Managerial,Students Per Principal,Students Per Superintendent,Students Per Teaching & Clerical Aide,"Students Per Specialist, Instructional",Students Per Psychometrist
1,2022,0,State of Delaware,Smarter Balanced Summative Assessment,MATH,All Students,30.21,140263.0,18.743719,53.953533,...,8766.4375,684.87793,304.919565,638.429677,882.157233,623.391111,6375.590909,48.721039,476.113374,


In [81]:
X_del = Delaware_math.loc[Delaware_math.Organization == "State of Delaware", modelm.X_columns]
y_del = Delaware_math.loc[Delaware_math.Organization == "State of Delaware", 'PctProficient']

In [82]:
del_pred = modelm.predict(X_del).item()

In [83]:
y_del = y_del.values.item()

In [84]:
print(f'The predicted PctProficient for math is {del_pred:.2f}, actual PctProficient is {y_del:.2f}.')
print(f'With the expected mean absolute error of {mae_mean:.2f}, that suggests that the state is underperforming the model.')

The predicted PctProficient for math is 42.89, actual PctProficient is 30.21.
With the expected mean absolute error of 11.17, that suggests that the state is underperforming the model.


In [85]:
def predict_increase(features, deltas):
    
    dele2 = X_del.copy()
    for f, d in zip(features, deltas):
        dele2[f] += d
    return modelm.predict(dele2).item() - modelm.predict(X_del).item()

In [86]:
proficiency_increase = 0
icounter = 0
jcounter = 0

for i in range(-1, -100, -1):
    for j in np.arange(0, -2, -0.1):
        proficiency_increase_temp = predict_increase(['Students Per Assistant Principal', 'Students Per Teacher, Regular'], [i, j])
        if proficiency_increase_temp > proficiency_increase:
            proficiency_increase = proficiency_increase_temp
            icounter = i
            jcounter = j


proficiency_increase, icounter, jcounter

(3.328125, -17, -0.5)

In [87]:
expected_model_version = '1.0'
model_path = './xgboostela.pkl'
if os.path.exists(model_path):
    with open(model_path, 'rb') as f:
        modele = pickle.load(f)
    if modele.version != expected_model_version:
        print("Expected model version doesn't match version loaded")
    if modele.sklearn_version != sklearn_version:
        print("Warning: model created under different sklearn version")
else:
    print("Expected model not found")

In [88]:
ela_data = pd.read_csv('SBSA_ela.csv')
ela_data

Unnamed: 0,PctProficient,ScaleScoreAvg,FallEnrollment,"Students Per Teacher, Regular","Students Per Teacher, Special","Students Per Specialist, Instructional",Students Per Assistant Superintendent,Students Per Administrative Assistant,"Students Per Supervisor, Pupil Support",Students Per Crafts & Trades,...,Students Per Nurse,Students Per Social Worker,Students Per Principal,Students Per Guidance Counselor,Students Per Assistant Principal,Students Per Librarian,Students Per Service Aide,Students Per Bus Driver & Laborer,Students Per Other General Support,Students Per Psychometrist
0,69.06,,699.0,19.971429,116.500000,699.00,,,,,...,699.0,,699.0,699.0,,,,,,
1,80.84,,436.0,16.769231,145.333333,436.00,,,,,...,,,436.0,436.0,436.0,436.0,,,,
2,63.08,,442.0,17.680000,55.250000,,,,,,...,442.0,,442.0,442.0,442.0,442.0,,,,
3,75.00,,495.0,19.800000,123.750000,495.00,,,,,...,495.0,,495.0,495.0,495.0,495.0,495.0,,,
4,73.04,,542.0,19.357143,180.666667,,,,,,...,542.0,,542.0,542.0,,542.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1141,33.33,2491.88,285.0,15.000000,71.250000,142.50,,142.5,,,...,285.0,,95.0,285.0,95.0,,,,,
1142,11.36,2447.20,503.0,16.225806,503.000000,,,,,,...,503.0,,503.0,251.5,,,503.0,,,
1143,11.36,2447.20,503.0,16.225806,503.000000,,,,,,...,503.0,,503.0,251.5,,,503.0,,,
1144,28.44,2390.86,341.0,24.357143,170.500000,85.25,,,,,...,341.0,170.5,,,,,,,341.0,


In [89]:
X = ela_data[modele.X_columns]
y = ela_data['PctProficient']

In [90]:
len(X), len(y)

(1146, 1146)

In [91]:
modele.fit(X, y)

In [92]:
cv_results = cross_validate(modele, X, y, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)

In [93]:
cv_results['test_score']

array([ -9.89241583,  -7.387582  ,  -7.87582363,  -8.78458863,
       -10.91375004])

In [94]:
mae_mean, mae_std = np.mean(-1 * cv_results['test_score']), np.std(-1 * cv_results['test_score'])
mae_mean, mae_std

(8.97083202513765, 1.2934185509606109)

In [95]:
Delaware_ela = pd.read_csv('StateofDelawarestats.csv')
Delaware_ela = Delaware_ela[Delaware_ela['ContentArea'] == 'ELA']
Delaware_ela

Unnamed: 0,School Year,School Code,Organization,Assessment Name,ContentArea,Grade,PctProficient,FallEnrollment,"Students Per Teacher, Regular","Students Per Teacher, Special",...,Students Per Assistant Superintendent,Students Per Speech and Hearing Therapist,Students Per Bus Driver & Laborer,Students Per Psychologist,Students Per Managerial,Students Per Principal,Students Per Superintendent,Students Per Teaching & Clerical Aide,"Students Per Specialist, Instructional",Students Per Psychometrist
0,2022,0,State of Delaware,Smarter Balanced Summative Assessment,ELA,All Students,41.69,140263.0,18.743719,53.953533,...,8766.4375,684.87793,304.919565,638.429677,882.157233,623.391111,6375.590909,48.721039,476.113374,


In [96]:
X_del = Delaware_ela.loc[Delaware_ela.Organization == "State of Delaware", modele.X_columns]
y_del = Delaware_ela.loc[Delaware_ela.Organization == "State of Delaware", 'PctProficient']

In [97]:
del_pred = modele.predict(X_del).item()

In [98]:
y_del = y_del.values.item()

In [99]:
print(f'The predicted PctProficient for ela is {del_pred:.2f}, actual PctProficient is {y_del:.2f}.')
print(f'With the expected mean absolute error of {mae_mean:.2f}, that suggests that the state is performing near the model.')

The predicted PctProficient for ela is 39.43, actual PctProficient is 41.69.
With the expected mean absolute error of 8.97, that suggests that the state is performing near the model.


In [100]:
Delaware_ela_prof = Delaware_ela['PctProficient']
Delaware_ela = Delaware_ela[modele.X_columns]
Delaware_ela['PctProficient'] = Delaware_ela_prof.values.item()
Delaware_ela

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Delaware_ela['PctProficient'] = Delaware_ela_prof.values.item()


Unnamed: 0,"Students Per Teacher, Regular","Students Per Teacher, Special","Students Per Specialist, Instructional",Students Per Crafts & Trades,Students Per Custodial,Students Per Managerial,Students Per Director,Students Per Secretarial,Students Per Cafeteria Worker,Students Per Psychologist,...,Students Per Teaching & Clerical Aide,Students Per Nurse,Students Per Principal,Students Per Guidance Counselor,Students Per Assistant Principal,Students Per Librarian,Students Per Service Aide,Students Per Bus Driver & Laborer,Students Per Other General Support,PctProficient
0,18.743719,53.953533,476.113374,874.457606,101.507454,882.157233,1178.680672,165.268057,413.755162,638.429677,...,48.721039,461.239724,623.391111,338.717701,448.411125,1301.141002,1835.903141,304.919565,1544.746696,41.69


In [101]:
Delaware_ela['Students Per Assistant Principal'], Delaware_ela['Students Per Teacher, Regular'], Delaware_ela['Students Per Teacher, Special']

(0    448.411125
 Name: Students Per Assistant Principal, dtype: float64,
 0    18.743719
 Name: Students Per Teacher, Regular, dtype: float64,
 0    53.953533
 Name: Students Per Teacher, Special, dtype: float64)

In [102]:
def predict_increase(features, deltas):
    
    dele2 = X_del.copy()
    for f, d in zip(features, deltas):
        dele2[f] += d
    return modele.predict(dele2).item() - modele.predict(X_del).item()

In [103]:
proficiency_increase = 0
icounter = 0
jcounter = 0

for i in range(-1, -100, -1):
    for j in np.arange(0, -2, -0.1):
        proficiency_increase_temp = predict_increase(['Students Per Assistant Principal', 'Students Per Teacher, Regular'], [i, j])
        if proficiency_increase_temp > proficiency_increase:
            proficiency_increase = proficiency_increase_temp
            icounter = i
            jcounter = j


proficiency_increase, icounter, jcounter

(11.994335174560547, -89, 0.0)