In [185]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
import warnings
import statsmodels.api as sm

In [186]:
data = pd.read_csv("cleaned_with_salaries.csv")

# Remove certain variables
varsToKeep = list(data.columns.values)
varsToKeep.remove("leaname")
varsToKeep = varsToKeep[:6] + varsToKeep[16:]
varsToKeep.remove("leatype")
varsToKeep.remove("school")
varsToKeep.remove("schoolid_y")
varsToKeep.remove("cohortyear")

data = data[varsToKeep]
X = data.copy()

# CHANGE LATER. Maybe I'll start with just a few variables and then tack them on later
X = X[['meanschooladministratorsalaryind', 'administratortoteachermeansalary', 
       'students_11_12', 'gradrate', 'charter', 'attendance22', 'male_2022_p', 'log_students', 'economicallydisadvantaged_2022_p']]
# I took out district and attendance21 for collinearity purposes
# Include male, minority attendance, economically disadvantaged

# The attendance values have nulls. attendance21 has 5 nulls. attendance22 has 3 nulls
X = X.dropna()
y_rate = X['gradrate'].copy()

# It doesn't like that y is continuous. This is the average high school graduation rate across the country
# Alternatively, I could use the Utah average, which is .882
NATIONAL_GRAD_RATE = .87
UTAH_GRAD_RATE = .882
y = y_rate > UTAH_GRAD_RATE

del X['gradrate']

# Define the sample weights based on the number of 11th and 12th graders students_11_12
weights = X['students_11_12'] / X['students_11_12'].sum()

del X['students_11_12']


In [189]:
# Run the optimal regression and display results
# Alpha is regularization
log_reg = sm.Logit(y, sm.add_constant(X), weights=weights, alpha=0.5).fit()
log_reg.summary()

Optimization terminated successfully.
         Current function value: 0.537358
         Iterations 7


0,1,2,3
Dep. Variable:,gradrate,No. Observations:,129.0
Model:,Logit,Df Residuals:,121.0
Method:,MLE,Df Model:,7.0
Date:,"Mon, 04 Dec 2023",Pseudo R-squ.:,0.2149
Time:,17:41:25,Log-Likelihood:,-69.319
converged:,True,LL-Null:,-88.293
Covariance Type:,nonrobust,LLR p-value:,3.102e-06

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-20.8137,7.242,-2.874,0.004,-35.008,-6.620
meanschooladministratorsalaryind,-9.669e-06,1.21e-05,-0.798,0.425,-3.34e-05,1.41e-05
administratortoteachermeansalary,-0.5176,0.491,-1.055,0.291,-1.479,0.444
charter,-1.1878,0.597,-1.990,0.047,-2.358,-0.018
attendance22,27.0175,7.528,3.589,0.000,12.262,41.773
male_2022_p,-3.2647,3.671,-0.889,0.374,-10.460,3.931
log_students,0.1094,0.211,0.518,0.604,-0.304,0.523
economicallydisadvantaged_2022_p,-1.8084,0.975,-1.855,0.064,-3.719,0.102


In [183]:
# Do a GridSearch to find the optimal hyperparameters
# Define a hyperparameter grid
# Regularization is applied automatically
C_list = [0.1, 0.5, 1.0]
maxIter_list = [50, 100, 150]

model = LogisticRegression(fit_intercept=True)

param_grid = {
               "C": C_list,
               "max_iter": maxIter_list
              }
logist_gs = GridSearchCV(model, param_grid, scoring="f1")
logist_gs.fit(X, y)

logist_gs.best_estimator_.fit(X, y, sample_weight=weights)
#logist_best_prediction = rfc_gs.best_estimator_.predict(X_test) # 

# The display is kinda ugly. I could potentially run the logistic regression in statsmodels and get a pretty summary table.

# Prettify the display
print("Variables:", X.columns.values)
print("Coefficients:\n", logist_gs.best_estimator_.coef_[0])
print("Score:", logist_gs.best_score_)


Variables: ['meanschooladministratorsalaryind' 'administratortoteachermeansalary'
 'charter' 'attendance22' 'male_2022_p' 'log_students'
 'economicallydisadvantaged_2022_p']
Coefficients:
 [ 4.42349009e-06  4.84538938e-10 -4.73906473e-10  9.57112088e-10
  3.73548394e-10  6.13365412e-09 -1.64680474e-09]
Score: 0.7226141338336461
