In [458]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from sklearn.preprocessing import scale

In [460]:
# Load the raw ELA (Reading/Language Arts) score file and preview it.
ela_csv = 'ELA_Scores.csv'
dfELA = pd.read_csv(ela_csv)
print(dfELA.shape)
dfELA.head()

(3652890, 18)


Unnamed: 0,School Year,State,NCES LEA ID,LEA,School,NCES SCH ID,Data Group,Data Description,Value,Numerator,Denominator,Population,Subgroup,Characteristics,Age/Grade,Academic Subject,Outcome,Program Type
0,2020-2021,ALABAMA,100005,Albertville City,Albertville Middle School,10000500870,584,Performance on Statewide Reading/Language Arts...,42%,,881,All Students,All Students in School,,All Grades,Reading/Language Arts,Percent Proficient,
1,2020-2021,ALABAMA,100005,Albertville City,Albertville Middle School,10000500870,584,Performance on Statewide Reading/Language Arts...,S,,2,All Students,American Indian/Alaska Native/Native American,,All Grades,Reading/Language Arts,Percent Proficient,
2,2020-2021,ALABAMA,100005,Albertville City,Albertville Middle School,10000500870,584,Performance on Statewide Reading/Language Arts...,S,,3,All Students,Asian/Pacific Islander,,All Grades,Reading/Language Arts,Percent Proficient,
3,2020-2021,ALABAMA,100005,Albertville City,Albertville Middle School,10000500870,584,Performance on Statewide Reading/Language Arts...,21-39%,,30,All Students,Black (not Hispanic) African American,,All Grades,Reading/Language Arts,Percent Proficient,
4,2020-2021,ALABAMA,100005,Albertville City,Albertville Middle School,10000500870,584,Performance on Statewide Reading/Language Arts...,31%,,462,All Students,Hispanic/Latino,,All Grades,Reading/Language Arts,Percent Proficient,


In [461]:
# Keep only the schoolwide values: the row covering all grades and all
# students in the school, restricted to the columns used downstream.
ela_all_grades = dfELA['Age/Grade'] == 'All Grades'
ela_whole_school = dfELA['Subgroup'] == 'All Students in School'
ela_keep_cols = ['NCES LEA ID', 'LEA', 'Denominator', 'School', 'NCES SCH ID', 'Value']
dfELA = dfELA.loc[ela_all_grades & ela_whole_school, ela_keep_cols]
print(dfELA.shape)
dfELA.head()

(77573, 6)


Unnamed: 0,NCES LEA ID,LEA,Denominator,School,NCES SCH ID,Value
0,100005,Albertville City,881,Albertville Middle School,10000500870,42%
47,100005,Albertville City,341,Albertville High School,10000500871,22%
73,100005,Albertville City,879,Albertville Intermediate School,10000500879,39%
120,100005,Albertville City,906,Albertville Elementary School,10000500889,42%
167,100006,Marshall County,356,Kate Duncan Smith DAR Middle,10000600193,54%


In [462]:
# Load the raw Mathematics score file and preview it.
math_csv = 'Math_scores.csv'
dfMath = pd.read_csv(math_csv)
print(dfMath.shape)
dfMath.head()

(3935036, 18)


Unnamed: 0,School Year,State,NCES LEA ID,LEA,School,NCES SCH ID,Data Group,Data Description,Value,Numerator,Denominator,Population,Subgroup,Characteristics,Age/Grade,Academic Subject,Outcome,Program Type
0,2020-2021,ALABAMA,100005,Albertville City,Albertville Middle School,10000500870,583,Performance on Statewide Mathematics Assessment,11%,,884,All Students,All Students in School,,All Grades,Mathematics,Percent Proficient,
1,2020-2021,ALABAMA,100005,Albertville City,Albertville Middle School,10000500870,583,Performance on Statewide Mathematics Assessment,S,,2,All Students,American Indian/Alaska Native/Native American,,All Grades,Mathematics,Percent Proficient,
2,2020-2021,ALABAMA,100005,Albertville City,Albertville Middle School,10000500870,583,Performance on Statewide Mathematics Assessment,S,,4,All Students,Asian/Pacific Islander,,All Grades,Mathematics,Percent Proficient,
3,2020-2021,ALABAMA,100005,Albertville City,Albertville Middle School,10000500870,583,Performance on Statewide Mathematics Assessment,<=10%,,32,All Students,Black (not Hispanic) African American,,All Grades,Mathematics,Percent Proficient,
4,2020-2021,ALABAMA,100005,Albertville City,Albertville Middle School,10000500870,583,Performance on Statewide Mathematics Assessment,9%,,465,All Students,Hispanic/Latino,,All Grades,Mathematics,Percent Proficient,


In [465]:
# Keep only the schoolwide values: the row covering all grades and all
# students in the school, restricted to the columns used downstream.
math_all_grades = dfMath['Age/Grade'] == 'All Grades'
math_whole_school = dfMath['Subgroup'] == 'All Students in School'
math_keep_cols = ['NCES LEA ID', 'LEA', 'Denominator', 'School', 'NCES SCH ID', 'Value']
dfMath = dfMath.loc[math_all_grades & math_whole_school, math_keep_cols]
print(dfMath.shape)
dfMath.head()

(82303, 6)


Unnamed: 0,NCES LEA ID,LEA,Denominator,School,NCES SCH ID,Value
0,100005,Albertville City,884,Albertville Middle School,10000500870,11%
47,100005,Albertville City,353,Albertville High School,10000500871,19%
73,100005,Albertville City,879,Albertville Intermediate School,10000500879,16%
120,100005,Albertville City,904,Albertville Elementary School,10000500889,25%
167,100006,Marshall County,357,Kate Duncan Smith DAR Middle,10000600193,13%


In [468]:
# Merge math and ELA scores together, keeping schools present in both frames.
merged_scores = dfMath.merge(dfELA, on='NCES SCH ID', how='inner')

# Overlapping column names get the suffix _x (math, the left frame) or
# _y (ELA, the right frame). Note that TotPop is the math-side Denominator.
score_column_names = {
    'Value_x': 'Math_scores',
    'Value_y': 'ELA_scores',
    'NCES LEA ID_x': 'LEA_ID',
    'Denominator_x': 'TotPop',
}
dfScores = merged_scores.rename(columns=score_column_names)
dfScores = dfScores[['NCES SCH ID', 'LEA_ID', 'Math_scores', 'ELA_scores', 'TotPop']]

# Drop schools where either subject is reported as 'S' instead of a percentage.
has_math_value = dfScores['Math_scores'] != 'S'
has_ela_value = dfScores['ELA_scores'] != 'S'
dfScores = dfScores.loc[has_math_value & has_ela_value]
dfScores.head()

Unnamed: 0,NCES SCH ID,LEA_ID,Math_scores,ELA_scores,TotPop
0,10000500870,100005,11%,42%,884
1,10000500871,100005,19%,22%,353
2,10000500879,100005,16%,39%,879
3,10000500889,100005,25%,42%,904
4,10000600193,100006,13%,54%,357


In [470]:
# clean up percent columns
def percent_to_float(value):
    """Convert a reported percentage string into a fraction (percent / 100).

    Supported forms, with or without a '%' sign and surrounding whitespace:

    * plain percentages, e.g. ``'42%'`` -> ``0.42``
    * inequalities (``>=``, ``<=``, ``<``, ``>``), which return the stated
      bound itself, e.g. ``'<=10%'`` -> ``0.10``
    * ranges, which return the midpoint, e.g. ``'21-39%'`` -> ``0.30``

    Parameters
    ----------
    value : object
        The raw cell value. Anything that is not a ``str`` yields NaN.

    Returns
    -------
    float
        The fraction, or NaN for ``'S'``, non-string input, and any text
        that cannot be parsed as a number.
    """
    nan = float('nan')

    if not isinstance(value, str):
        return nan

    text = value.strip()

    # 'S' appears in the data in place of a percentage; treat it as missing.
    if text == 'S':
        return nan

    text = text.replace('%', '')

    # Inequalities: the two-character operators must be tested before the
    # one-character ones so '>=95' is not parsed as '>' followed by '=95'.
    for operator in ('>=', '<=', '<', '>'):
        if text.startswith(operator):
            try:
                return float(text[len(operator):]) / 100
            except ValueError:
                # Malformed bound (e.g. '>=' alone): missing, not a crash.
                return nan

    # Ranges: midpoint of the first two dash-separated parts.
    if '-' in text:
        parts = text.split('-')
        try:
            low = float(parts[0])
            high = float(parts[1])
        except ValueError:
            return nan
        return ((low + high) / 2) / 100

    # Simple percentages.
    try:
        return float(text) / 100
    except ValueError:
        return nan

In [472]:
# clean up df: parse the percent strings into float fractions.
# Series.map converts each column in one pass instead of writing every cell
# individually through .loc, and assign() builds a new frame rather than
# mutating the filtered one in place.
# NOTE: not safe to re-run on its own -- percent_to_float returns NaN for
# non-string input, so a second pass over the already-converted floats
# would turn every score into NaN and dropna() would remove every row.
dfScores = dfScores.assign(
    Math_scores=dfScores['Math_scores'].map(percent_to_float).astype('float'),
    ELA_scores=dfScores['ELA_scores'].map(percent_to_float).astype('float'),
)
dfScores = dfScores.dropna()
dfScores.dtypes

NCES SCH ID      int64
LEA_ID           int64
Math_scores    float64
ELA_scores     float64
TotPop           int64
dtype: object

In [473]:
# get district student population: the sum of TotPop over all schools that
# share an LEA_ID, broadcast back onto every school row.
# transform('sum') assigns the column directly, so re-running this cell simply
# overwrites it (a merge would add suffixed duplicate columns on a second run).
dfScores['Stu_Pop_Dist'] = dfScores.groupby('LEA_ID')['TotPop'].transform('sum')

# calculate weighted average of proficient:
# each school contributes score * (school population / district population),
# and the per-district sum of those contributions is the weighted average.
dfScores['prof_math'] = dfScores['Math_scores'] * dfScores['TotPop']
dfScores['prof_ELA'] = dfScores['ELA_scores'] * dfScores['TotPop']
dfScores['weight_avg_math_scores'] = dfScores['prof_math'] / dfScores['Stu_Pop_Dist']
dfScores['weight_avg_ELA_scores'] = dfScores['prof_ELA'] / dfScores['Stu_Pop_Dist']

weighted_cols = ['weight_avg_math_scores', 'weight_avg_ELA_scores']
scores = dfScores.groupby('LEA_ID', as_index = False)[weighted_cols].sum()
print(scores.shape)
scores.head()

(14325, 3)


Unnamed: 0,LEA_ID,weight_avg_math_scores,weight_avg_ELA_scores
0,100005,0.175811,0.387891
1,100006,0.139665,0.382316
2,100007,0.449939,0.6627
3,100008,0.506402,0.714093
4,100011,0.201311,0.448574


In [474]:
# convert to z-score (currently disabled: every line of this step is commented out)
#scores['weight_avg_math_scores'] = scale(scores['weight_avg_math_scores'])
#scores['weight_avg_ELA_scores'] = scale(scores['weight_avg_ELA_scores'])
#print(scores.shape)
#scores.head()

In [478]:
# add together scores to create index for school performance (currently disabled: every line of this step is commented out)
#scores['Agg_Score_Index'] = scores['weight_avg_math_scores'] + scores['weight_avg_ELA_scores']
#scores = scores[['LEA_ID', 'Agg_Score_Index']]
#print(scores.shape)
#scores.head()

In [480]:
# Load the school-district-to-block-group relationship table and preview it.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
grf_xlsx = '/Users/austincoffelt/Downloads/GRF21/grf21_lea_blkgrp.xlsx'
districtBG = pd.read_excel(grf_xlsx)
print(districtBG.shape)
districtBG.head()

(307505, 6)


Unnamed: 0,LEAID,NAME_LEA21,BLKGRP,COUNT,LANDAREA,WATERAREA
0,100001,Fort Rucker School District,10310103001,4,23.428369,0.0
1,100001,Fort Rucker School District,10450200001,4,37.888108,1.038161
2,100001,Fort Rucker School District,10450200002,4,4.803419,0.0
3,100001,Fort Rucker School District,10450200003,4,23.821877,0.043584
4,100003,Maxwell AFB School District,11010009001,3,2.21403,0.037874


In [481]:
# A block group can appear under several districts. For each BLKGRP keep the
# single row whose LANDAREA is largest.
largest_area_rows = districtBG.groupby('BLKGRP')['LANDAREA'].idxmax()
result = districtBG.loc[largest_area_rows]
print(result.shape)
result.head()

(242644, 6)


Unnamed: 0,LEAID,NAME_LEA21,BLKGRP,COUNT,LANDAREA,WATERAREA
559,100240,Autauga County School District,10010201001,45,1.646455,0.010979
560,100240,Autauga County School District,10010201002,45,2.147116,0.0
561,100240,Autauga County School District,10010202001,45,0.794743,0.0
562,100240,Autauga County School District,10010202002,45,0.487432,0.002189
563,100240,Autauga County School District,10010203001,45,1.492869,0.003496


In [484]:
# merge school and score data: attach each district's weighted scores to the
# block groups assigned to it.
temp = result.merge(scores, how = 'left', left_on = 'LEAID', right_on = 'LEA_ID')

# Drop only the block groups whose district has no scores. Restricting dropna
# to the score columns keeps rows that merely have a missing value in an
# unrelated column (e.g. NAME_LEA21 or WATERAREA).
score_cols = ['weight_avg_math_scores', 'weight_avg_ELA_scores']
temp = temp.dropna(subset = score_cols)
print(temp.shape)
temp.head()

(196259, 9)


Unnamed: 0,LEAID,NAME_LEA21,BLKGRP,COUNT,LANDAREA,WATERAREA,LEA_ID,weight_avg_math_scores,weight_avg_ELA_scores
0,100240,Autauga County School District,10010201001,45,1.646455,0.010979,100240.0,0.236767,0.499152
1,100240,Autauga County School District,10010201002,45,2.147116,0.0,100240.0,0.236767,0.499152
2,100240,Autauga County School District,10010202001,45,0.794743,0.0,100240.0,0.236767,0.499152
3,100240,Autauga County School District,10010202002,45,0.487432,0.002189,100240.0,0.236767,0.499152
4,100240,Autauga County School District,10010203001,45,1.492869,0.003496,100240.0,0.236767,0.499152


In [488]:
# Build the AFFGEOID key for each block group.
# The BLKGRP values displayed above are numbers with fewer than 12 digits, so
# they are converted to text and left-padded with zeros to 12 characters.
# This is done on the whole column at once; writing strings into the numeric
# column one row at a time through .loc is slow and is an incompatible-dtype
# assignment.
blkgrp_code = temp['BLKGRP'].astype(str).str.zfill(12)

temp = temp.assign(AFFGEOID = '1500000US' + blkgrp_code)
temp = temp[['AFFGEOID', 'weight_avg_math_scores', 'weight_avg_ELA_scores']]
print(temp.shape)
temp.head()

(196259, 3)


Unnamed: 0,AFFGEOID,weight_avg_math_scores,weight_avg_ELA_scores
0,1500000US010010201001,0.236767,0.499152
1,1500000US010010201002,0.236767,0.499152
2,1500000US010010202001,0.236767,0.499152
3,1500000US010010202002,0.236767,0.499152
4,1500000US010010203001,0.236767,0.499152


In [490]:
# Write the block-group score table to disk without the DataFrame index.
output_csv = 'Block_Group_Exam_Score.csv'
temp.to_csv(output_csv, index=False)