In [16]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from statsmodels.api import add_constant

In [17]:
# Load the dataset; every missing value is replaced with 0.
full_dataset = pd.read_csv('cleaned_with_salaries.csv').fillna(0)
schoolnames = full_dataset['school']
columns = full_dataset.columns

# Columns removed from the design matrix. The list includes the target
# ('act22') and the column used as sample weight ('students_11_12').
to_drop = [
    'leaname', 'school', 'leatype',
    'act21', 'act22',
    'english21', 'english22',
    'math21', 'math22',
    'reading21', 'reading22',
    'science21', 'science22',
    'students_11_12',
    'grade_9_2022', 'grade_10_2022', 'grade_11_2022', 'grade_12_2022',
    'female_2022_p',
    'gradrate', 'cohortyear', 'schoolid_y', 'totalk12_2022', 'district',
]
X = full_dataset.drop(columns=to_drop)
y = full_dataset['act22']

# L1-regularised regression; alpha is picked by LassoCV's cross-validation
# over a log-spaced grid, and the fit is weighted by 'students_11_12'.
alpha_grid = np.logspace(-4, 3, 15)
reg_lasso = LassoCV(alphas=alpha_grid, max_iter=100000)
reg_lasso.fit(X, y, sample_weight=full_dataset['students_11_12'])

# In-sample predictions on the same rows used for fitting. The two metrics
# below are computed without sample weights.
lasso_pred = reg_lasso.predict(X)
MSE_lasso = mean_squared_error(y, lasso_pred)
R2_lasso = r2_score(y, lasso_pred)

# Split the features by whether Lasso shrank their coefficient to exactly 0.
params = reg_lasso.coef_
zero_index = params == 0
feature_names = np.array(X.columns)
not_selected_features_lasso = feature_names[zero_index]
selected_features_lasso = feature_names[~zero_index]

# Report: kept columns, dropped columns, then the selection and fit summary.
kept_columns = [name for name in columns if name not in to_drop]
print(kept_columns)
print(to_drop, '\n')
print(
    f'Selected features Lasso: {selected_features_lasso}',
    f'Unselected features Lasso: {not_selected_features_lasso}',
    f"Best MSE Lasso: {MSE_lasso}",
    f"Best R2 Lasso: {R2_lasso}",
    '\n\n',
    sep='\n',
)

['meanschooladministratorsalaryind', 'administratortoteachermeansalary', 'meanteachersalaryindollars', 'log_students', 'attendance21', 'attendance22', 'attendance_2022_minority_p', 'americanindian_2022_p', 'afamblack_2022_p', 'asian_2022_p', 'hispanic_2022_p', 'multiplerace_2022_p', 'pacificislander_2022_p', 'male_2022_p', 'white_2022_p', 'economicallydisadvantaged_2022_p', 'englishlearner_2022_p', 'studentwithadisability_2022_p', 'homeless_2022_p', 'charter']
['leaname', 'school', 'leatype', 'act21', 'act22', 'english21', 'english22', 'math21', 'math22', 'reading21', 'reading22', 'science21', 'science22', 'students_11_12', 'grade_9_2022', 'grade_10_2022', 'grade_11_2022', 'grade_12_2022', 'female_2022_p', 'gradrate', 'cohortyear', 'schoolid_y', 'totalk12_2022', 'district'] 

Selected features Lasso: ['meanschooladministratorsalaryind' 'meanteachersalaryindollars'
 'log_students' 'attendance21' 'attendance22' 'attendance_2022_minority_p'
 'afamblack_2022_p' 'asian_2022_p' 'hispanic_202

Repeat the same analysis, but with district in place of the salary features

In [18]:
# Load the dataset; rows containing any missing value are discarded.
# NOTE(review): the other model cells in this notebook use fillna(0) instead
# of dropping rows -- confirm the difference is intentional.
full_dataset = pd.read_csv('cleaned_with_salaries.csv').dropna(axis=0)
schoolnames = full_dataset['school']
columns = full_dataset.columns

# Columns removed from the design matrix. Here the three salary columns are
# dropped while 'district' is kept as a predictor.
to_drop = [
    'leaname', 'school', 'leatype',
    'act21', 'act22',
    'english21', 'english22',
    'math21', 'math22',
    'reading21', 'reading22',
    'science21', 'science22',
    'students_11_12',
    'gradrate', 'cohortyear', 'schoolid_y', 'totalk12_2022',
    'meanschooladministratorsalaryind',
    'administratortoteachermeansalary',
    'meanteachersalaryindollars',
]
X = full_dataset.drop(columns=to_drop)
y = full_dataset['act22']

# L1-regularised regression; alpha is picked by LassoCV's cross-validation
# over a log-spaced grid, and the fit is weighted by 'students_11_12'.
alpha_grid = np.logspace(-4, 3, 8)
reg_lasso = LassoCV(alphas=alpha_grid, max_iter=100000)
reg_lasso.fit(X, y, sample_weight=full_dataset['students_11_12'])

# In-sample predictions on the same rows used for fitting. The two metrics
# below are computed without sample weights.
lasso_pred = reg_lasso.predict(X)
MSE_lasso = mean_squared_error(y, lasso_pred)
R2_lasso = r2_score(y, lasso_pred)

# Split the features by whether Lasso shrank their coefficient to exactly 0.
params = reg_lasso.coef_
zero_index = params == 0
feature_names = np.array(X.columns)
not_selected_features_lasso = feature_names[zero_index]
selected_features_lasso = feature_names[~zero_index]

print(
    f'Selected features Lasso: {selected_features_lasso}',
    f'Unselected features Lasso: {not_selected_features_lasso}',
    f"Best MSE Lasso: {MSE_lasso}",
    f"Best R2 Lasso: {R2_lasso}",
    '\n\n',
    sep='\n',
)

Selected features Lasso: ['attendance21' 'attendance22' 'attendance_2022_minority_p'
 'afamblack_2022_p' 'asian_2022_p' 'hispanic_2022_p' 'multiplerace_2022_p'
 'grade_9_2022' 'grade_10_2022' 'grade_11_2022' 'grade_12_2022'
 'female_2022_p' 'white_2022_p' 'economicallydisadvantaged_2022_p'
 'studentwithadisability_2022_p' 'homeless_2022_p' 'charter' 'district']
Unselected features Lasso: ['log_students' 'americanindian_2022_p' 'pacificislander_2022_p'
 'male_2022_p' 'englishlearner_2022_p']
Best MSE Lasso: 1.8491786469507638
Best R2 Lasso: 0.6807222662349006





Include all - INCLUDE THIS ONE IN THE FINAL REPORT

In [24]:
#Read in Dataset, drop appropriate columns
full_dataset = pd.read_csv('cleaned_with_salaries.csv')
full_dataset = full_dataset.fillna(0)  # every missing value becomes 0
schoolnames = full_dataset['school']
columns = full_dataset.columns
# Columns removed from the design matrix. The list includes the target
# ('act22') and the column used as sample weight ('students_11_12').
to_drop = ['leaname','school', 'leatype','act21', 'act22', 'english21',
       'english22', 'math21', 'math22', 'reading21', 'reading22', 'science21','grade_9_2022','grade_10_2022','grade_11_2022', 'grade_12_2022',
       'science22', 'students_11_12','gradrate','cohortyear','schoolid_y','totalk12_2022','female_2022_p','white_2022_p','district','attendance_2022_minority_p']
X = full_dataset.drop(columns=to_drop)
y = full_dataset['act22']

#Do L1 regularization with sklearn
# Alpha is picked by LassoCV's cross-validation over a log-spaced grid; the
# fit is weighted by 'students_11_12'.
reg_lasso = LassoCV(alphas=np.logspace(-4, 20, 100),max_iter=100000)
reg_lasso.fit(X, y,sample_weight=full_dataset['students_11_12']) # Fit the model
# In-sample predictions on the same rows used for fitting. The two metrics
# below are computed without sample weights.
lasso_pred = reg_lasso.predict(X)
MSE_lasso = mean_squared_error(y,lasso_pred)
R2_lasso = r2_score(y,lasso_pred)

# Threshold a COPY of the coefficients. `reg_lasso.coef_` is the estimator's
# own array, so zeroing entries through an alias would silently alter the
# fitted model and make any later `reg_lasso.predict` disagree with the
# MSE / R2 reported here.
params = reg_lasso.coef_.copy()
# Coefficients with magnitude below 1e-2 are reported as "unselected".
# NOTE(review): coefficients are on the raw feature scale, so this fixed
# cutoff can zero out predictors measured in large units (presumably the
# salary columns, if they are in dollars) even when Lasso kept them --
# confirm, or standardise the features before fitting.
params[np.abs(params) <1e-2] = 0
zero_index = params==0
not_selected_features_lasso = np.array(X.columns)[zero_index]
selected_features_lasso = np.array(X.columns)[~zero_index]

selected_features_lasso_str = ', '.join(map(str, selected_features_lasso))
not_selected_features_lasso_str = ', '.join(map(str, not_selected_features_lasso))
print(f'''
Selected features Lasso:\n{selected_features_lasso_str}\n
Unselected features Lasso:\n{not_selected_features_lasso_str}\n
Params Lasso: {params}\n
Best MSE Lasso: {MSE_lasso}
Best R2 Lasso: {R2_lasso}
''')


Selected features Lasso:
log_students, attendance21, attendance22, afamblack_2022_p, asian_2022_p, hispanic_2022_p, economicallydisadvantaged_2022_p, englishlearner_2022_p, studentwithadisability_2022_p, charter

Unselected features Lasso:
meanschooladministratorsalaryind, administratortoteachermeansalary, meanteachersalaryindollars, americanindian_2022_p, multiplerace_2022_p, pacificislander_2022_p, male_2022_p, homeless_2022_p

Params Lasso: [ 0.          0.          0.          0.4061693   0.44135415  0.9696068
  0.         -5.40312234 13.76860745 -4.05829705  0.          0.
  0.         -4.25693717 -1.23854425 -7.13236527  0.          1.50829298]

Best MSE Lasso: 2.3745828138971388
Best R2 Lasso: 0.5985425264313309



In [None]:
# Salary-only model: fit the target 'act22' on the three salary columns alone.
# This cell does not load data itself; it relies on `full_dataset` already
# being defined by an earlier cell.
to_include = [
    'meanschooladministratorsalaryind',
    'administratortoteachermeansalary',
    'meanteachersalaryindollars',
]
X = full_dataset.loc[:, to_include]
y = full_dataset['act22']

# L1-regularised regression; alpha is picked by LassoCV's cross-validation
# over a log-spaced grid, and the fit is weighted by 'students_11_12'.
alpha_grid = np.logspace(-4, 3, 8)
reg_lasso = LassoCV(alphas=alpha_grid, max_iter=100000)
reg_lasso.fit(X, y, sample_weight=full_dataset['students_11_12'])

# In-sample predictions on the same rows used for fitting. The two metrics
# below are computed without sample weights.
lasso_pred = reg_lasso.predict(X)
MSE_lasso = mean_squared_error(y, lasso_pred)
R2_lasso = r2_score(y, lasso_pred)

# Split the features by whether Lasso shrank their coefficient to exactly 0.
params = reg_lasso.coef_
zero_index = params == 0
feature_names = np.array(X.columns)
not_selected_features_lasso = feature_names[zero_index]
selected_features_lasso = feature_names[~zero_index]

print(
    f'Selected features Lasso: {selected_features_lasso}',
    f'Unselected features Lasso: {not_selected_features_lasso}',
    f"Best MSE Lasso: {MSE_lasso}",
    f"Best R2 Lasso: {R2_lasso}",
    '\n\n',
    sep='\n',
)

Selected features Lasso: []
Unselected features Lasso: ['meanschooladministratorsalaryind' 'administratortoteachermeansalary'
 'meanteachersalaryindollars']
Best MSE Lasso: 6.349716192803665
Best R2 Lasso: -0.0735111050759294



