In [268]:
# Core data-handling libraries used throughout the notebook.
import pandas as pd
import numpy as np
# Show up to 40 columns when a DataFrame is rendered.
pd.options.display.max_columns = 40
# NOTE(review): deepcopy (and np) are not referenced in the cells visible here — confirm they are still needed.
from copy import deepcopy

In [388]:
# List the input files in the "raw data" directory (IPython shell escape).
!ls "raw data"

ACC HS Chronic Absenteeism.xlsx
ACC HS Core and Weighted Performance.csv
ACC HS Core and Weighted Performance.xlsx
ACC HS Graduation Rate.xlsx
Annual Regents Exams.xlsx
GRAD_RATE_AND_OUTCOMES_2022.csv
nys_annualregents_score.csv


In [203]:
# Annual graduation outcomes by cohort and subgroup.
# Every column is read with the pandas "string" dtype, so values are kept
# exactly as written in the file (no numeric conversion).
data = pd.read_csv(
    "raw data/GRAD_RATE_AND_OUTCOMES_2022.csv",
    index_col=False,
    dtype="string",
)

In [204]:
# Inspect the column names of the graduation-outcomes frame.
data.columns

Index(['report_school_year', 'aggregation_index', 'aggregation_type',
       'aggregation_code', 'aggregation_name', 'lea_beds', 'lea_name',
       'nrc_code', 'nrc_desc', 'county_code', 'county_name', 'nyc_ind',
       'boces_code', 'boces_name', 'membership_code', 'membership_key',
       'membership_desc', 'subgroup_code', 'subgroup_name', 'enroll_cnt',
       'grad_cnt', 'grad_pct', 'local_cnt', 'local_pct', 'reg_cnt', 'reg_pct',
       'reg_adv_cnt', 'reg_adv_pct', 'non_diploma_credential_cnt',
       'non_diploma_credential_pct', 'still_enr_cnt', 'still_enr_pct',
       'ged_cnt', 'ged_pct', 'dropout_cnt', 'dropout_pct'],
      dtype='object')

In [210]:
# Preview the first row to see what a record looks like.
data.head(1)

Unnamed: 0,report_school_year,aggregation_index,aggregation_type,aggregation_code,aggregation_name,lea_beds,lea_name,nrc_code,nrc_desc,county_code,county_name,nyc_ind,boces_code,boces_name,membership_code,membership_key,membership_desc,subgroup_code,subgroup_name,enroll_cnt,grad_cnt,grad_pct,local_cnt,local_pct,reg_cnt,reg_pct,reg_adv_cnt,reg_adv_pct,non_diploma_credential_cnt,non_diploma_credential_pct,still_enr_cnt,still_enr_pct,ged_cnt,ged_pct,dropout_cnt,dropout_pct
0,2021-22,0,Statewide,0,All Districts and Charters,,,,,,,,,,6,180,2016 Total Cohort - 6 Year Outcome,1,All Students,207157,183454,89%,6212,3%,96334,47%,80908,39%,2058,1%,4664,2%,1746,1%,15088,7%


In [243]:
# Distinct cohort/outcome descriptions present in the data.
data["membership_desc"].unique().tolist()

['2016 Total Cohort - 6 Year Outcome',
 '2017 Total Cohort - 5 Year Outcome',
 '2018 Total Cohort - 4 Year Outcome',
 '2017 Total Cohort - 5 Year Outcome - August 2022',
 '2018 Total Cohort - 4 Year Outcome - August 2022',
 '2016 Total Cohort - 6 Year Outcome - August 2022']

In [242]:
# Distinct values of the nrc_desc column.
data["nrc_desc"].unique().tolist()

['Urban-Suburban High Needs',
 'Charters',
 'Average Needs',
 'Low Needs',
 'Rural High Needs',
 'Buffalo  Rochester  Yonkers  Syracuse',
 'NYC']

In [249]:
# Keep a narrow snapshot of the graduation-outcomes frame: the identifying
# columns plus the graduate count and percentage.
copy_3 = data.loc[
    :,
    [
        "aggregation_code",
        "aggregation_name",
        "county_name",
        "nrc_desc",
        "membership_desc",
        "subgroup_name",
        "grad_cnt",
        "grad_pct",
    ],
]

In [248]:
# Clean the graduation-outcomes frame: normalise column names to lower case
# and keep only school-level rows (other aggregation levels, e.g. "Statewide",
# are dropped).
data.columns = data.columns.str.lower()
data = data[data.aggregation_type == 'School']
# Preview the identifying columns alongside the graduate count.
data.loc[:,["aggregation_code","aggregation_name","county_name","nrc_desc","membership_desc","subgroup_name","grad_cnt"]]

Unnamed: 0,aggregation_code,aggregation_name,county_name,nrc_desc,membership_desc,subgroup_name,grad_cnt
81831,010100010034,ALBANY HIGH SCHOOL,ALBANY,Urban-Suburban High Needs,2016 Total Cohort - 6 Year Outcome,Male,254
81832,010100010034,ALBANY HIGH SCHOOL,ALBANY,Urban-Suburban High Needs,2016 Total Cohort - 6 Year Outcome,Female,295
81833,010100010034,ALBANY HIGH SCHOOL,ALBANY,Urban-Suburban High Needs,2016 Total Cohort - 6 Year Outcome,General Education Students,501
81834,010100010034,ALBANY HIGH SCHOOL,ALBANY,Urban-Suburban High Needs,2016 Total Cohort - 6 Year Outcome,Students with Disabilities,48
81835,010100010034,ALBANY HIGH SCHOOL,ALBANY,Urban-Suburban High Needs,2016 Total Cohort - 6 Year Outcome,American Indian or Alaska Native,-
...,...,...,...,...,...,...,...
221861,680801040001,DUNDEE JUNIOR-SENIOR HIGH SCHOOL,YATES,Rural High Needs,2016 Total Cohort - 6 Year Outcome - August 2022,Not in Foster Care,30
221862,680801040001,DUNDEE JUNIOR-SENIOR HIGH SCHOOL,YATES,Rural High Needs,2016 Total Cohort - 6 Year Outcome - August 2022,Homeless,-
221863,680801040001,DUNDEE JUNIOR-SENIOR HIGH SCHOOL,YATES,Rural High Needs,2016 Total Cohort - 6 Year Outcome - August 2022,Not Homeless,-
221864,680801040001,DUNDEE JUNIOR-SENIOR HIGH SCHOOL,YATES,Rural High Needs,2016 Total Cohort - 6 Year Outcome - August 2022,Not Migrant,30


### Reading in Datasets

In [324]:
# Chronic-absenteeism workbook; ENTITY_CD is read as a string so the code is
# not converted to a number.
nys_absenteeism_df = pd.read_excel(
    "raw data/ACC HS Chronic Absenteeism.xlsx",
    index_col=False,
    dtype={'ENTITY_CD': 'string'},
)

In [393]:
# Annual Regents exam scores; both identifier columns are read as strings so
# they are not converted to numbers.
nys_annual_regents = pd.read_csv(
    "raw data/nys_annualregents_score.csv",
    index_col=False,
    dtype={'ENTITY_CD': 'string', 'INSTITUTION_ID': 'string'},
)

In [441]:
# Normalise column names to lower case and entity names to title case.
nys_annual_regents.columns = nys_annual_regents.columns.str.lower()
nys_annual_regents['entity_name'] = nys_annual_regents['entity_name'].str.title()

# Entity-code suffixes that this analysis treats as aggregated measurements or
# district-level entities rather than individual schools.
aggregate_code_suffixes = (
    '0000',
    '111111111111',
    '000000000001',
    '000000000002',
    '000000000003',
    '000000000004',
    '000000000005',
    '000000000006',
    '000000000007',
)

# Build one combined mask instead of re-filtering the frame once per suffix.
entity_codes = nys_annual_regents['entity_cd'].astype('str')
is_aggregate = pd.Series(False, index=nys_annual_regents.index)
for suffix in aggregate_code_suffixes:
    is_aggregate |= entity_codes.str.endswith(suffix)

# Keep only the rows whose entity code matches none of the suffixes.
nys_annual_regents = nys_annual_regents[~is_aggregate]

In [485]:
# Columns retained from the annual Regents frame: the identifying columns
# followed by the two count columns.
ann_regs_cols = [
    'entity_cd',
    'entity_name',
    'year',
    'subject',
    'subgroup_name',
    'tested',
    'num_prof',
]

In [486]:
# Narrow the Regents frame to the selected columns, then keep only 2022 rows
# for individual subgroups (the "All Students" rows are excluded).
regents_selected = nys_annual_regents.loc[:, ann_regs_cols]
is_2022 = regents_selected['year'] == 2022
is_subgroup_row = regents_selected['subgroup_name'] != 'All Students'
nys_annual_regents = regents_selected[is_2022 & is_subgroup_row]

In [488]:
# Display the filtered Regents frame (pandas truncates the rendered rows).
nys_annual_regents

Unnamed: 0,entity_cd,entity_name,year,subject,subgroup_name,tested,num_prof
381971,010100010030,William S Hackett Middle School,2022,Regents Common Core Algebra I,Asian or Native Hawaiian/Other Pacific Islander,17,14
381972,010100010030,William S Hackett Middle School,2022,Regents Living Environment,Asian or Native Hawaiian/Other Pacific Islander,14,14
381973,010100010030,William S Hackett Middle School,2022,Regents Common Core Algebra I,Black or African American,10,s
381974,010100010030,William S Hackett Middle School,2022,Regents Living Environment,Black or African American,13,s
381975,010100010030,William S Hackett Middle School,2022,Regents Common Core Algebra I,Economically Disadvantaged,23,20
...,...,...,...,...,...,...,...
748975,680801040001,Dundee Junior-Senior High School,2022,Regents Living Environment,White,37,32
748976,680801040001,Dundee Junior-Senior High School,2022,Regents NF Global History,White,34,s
748977,680801040001,Dundee Junior-Senior High School,2022,Regents Phy Set/Chemistry,White,11,5
748978,680801040001,Dundee Junior-Senior High School,2022,Regents Phy Set/Earth Sci,White,27,s


In [353]:
#nys_assessment_df = pd.read_csv("raw data/ACC HS Core and Weighted Performance.csv",index_col=False,
#                                dtype=({"ENTITY_CD":"string"}))

In [284]:
# Graduation-rate workbook; ENTITY_CD is read as a string so the code is not
# converted to a number.
nys_grad_rate_df = pd.read_excel(
    "raw data/ACC HS Graduation Rate.xlsx",
    index_col=False,
    dtype={'ENTITY_CD': 'string'},
)

### Feature Selection

Based on the documentation, we keep only the identifying columns and the measures needed from each dataset.

In [395]:
# transforming our assessment data 
#nys_assessment_df.columns = nys_assessment_df.columns.str.lower()
#nys_assessment_df = nys_assessment_df.loc[:,['entity_cd', 'entity_name','year','subject','subgroup_name','weighted_index']]
#nys_assessment_df = nys_assessment_df[~nys_assessment_df['entity_cd'].astype('str').str.endswith('0000')]
#nys_assessment_df = nys_assessment_df[nys_assessment_df.subgroup_name != 'All Students']
#nys_assessment_df = nys_assessment_df.reset_index(drop=True)
#nys_assessment_df.loc[:,['entity_cd', 'entity_name','year','subject','subgroup_name','weighted_index','wgt_level']]
#nys_assessment_df

In [308]:
# Clean the graduation-rate frame: lower-case the column names, keep the
# columns of interest, drop entity codes ending in '0000', drop the
# "All Students" rows, and renumber the index from zero.
nys_grad_rate_df.columns = nys_grad_rate_df.columns.str.lower()
nys_grad_rate_df = (
    nys_grad_rate_df
    .loc[:, ['entity_cd', 'entity_name', 'year', 'subgroup_name', 'cohort', 'grad_rate']]
    .loc[lambda df: ~df['entity_cd'].astype('str').str.endswith('0000')]
    .loc[lambda df: df['subgroup_name'] != 'All Students']
    .reset_index(drop=True)
)

In [500]:
# Clean the chronic-absenteeism frame: lower-case the column names, keep the
# columns of interest, drop entity codes ending in '0000', drop the
# "All Students" rows, title-case the entity names, and renumber the index.
nys_absenteeism_df.columns = nys_absenteeism_df.columns.str.lower()
nys_absenteeism_df = (
    nys_absenteeism_df
    .loc[:, ['entity_cd', 'entity_name', 'year', 'subgroup_name', 'absent_rate', 'level']]
    .loc[lambda df: ~df['entity_cd'].astype('str').str.endswith('0000')]
    .loc[lambda df: df['subgroup_name'] != 'All Students']
    # assign() builds a new frame, so the title-casing is not written onto a
    # filtered slice (which is what raises SettingWithCopyWarning).
    .assign(entity_name=lambda df: df['entity_name'].str.title())
    .reset_index(drop=True)
)
nys_absenteeism_df

Unnamed: 0,entity_cd,entity_name,year,subgroup_name,absent_rate,level
0,010100010034,Albany High School,2021,Students with Disabilities,47.3,
1,010100010034,Albany High School,2021,American Indian or Alaska Native,s,
2,010100010034,Albany High School,2021,Asian or Native Hawaiian/Other Pacific Islander,18.3,
3,010100010034,Albany High School,2021,Black or African American,44.6,
4,010100010034,Albany High School,2021,Hispanic or Latino,38.4,
...,...,...,...,...,...,...
25086,680801040002,Dundee Elementary School,2022,Hispanic or Latino,s,s
25087,680801040002,Dundee Elementary School,2022,White,s,s
25088,680801040002,Dundee Elementary School,2022,Multiracial,s,s
25089,680801040002,Dundee Elementary School,2022,Economically Disadvantaged,s,s


In [349]:
# 2022 graduation rates for the 'Combined' cohort, excluding rows whose rate
# is the 's' placeholder (presumably a suppressed value — confirm against the
# data documentation). The cohort column is constant afterwards, so drop it.
is_combined_cohort = nys_grad_rate_df['cohort'] == 'Combined'
is_year_2022 = nys_grad_rate_df['year'] == 2022
has_reported_rate = nys_grad_rate_df['grad_rate'] != 's'
nys_grad_rate_22 = (
    nys_grad_rate_df[is_combined_cohort & is_year_2022 & has_reported_rate]
    .drop(columns=['cohort'])
)

In [460]:
# Title-case the school names so they match the other cleaned frames.
nys_grad_rate_22['entity_name'] = nys_grad_rate_22['entity_name'].str.title()

In [461]:
# Display the 2022 graduation-rate frame (pandas truncates the rendered rows).
nys_grad_rate_22

Unnamed: 0,entity_cd,entity_name,year,subgroup_name,grad_rate
3,010100010034,Albany High School,2022,Hispanic or Latino,73.7
7,010100010034,Albany High School,2022,White,86.9
10,010100010034,Albany High School,2022,Multiracial,77.4
13,010100010034,Albany High School,2022,English Language Learner,70.4
14,010100010034,Albany High School,2022,Economically Disadvantaged,75.5
...,...,...,...,...,...
94838,671501040002,Warsaw Senior High School,2022,Economically Disadvantaged,88.1
94862,680601060001,Penn Yan Academy,2022,White,91.1
94874,680601060001,Penn Yan Academy,2022,Economically Disadvantaged,88.6
94897,680801040001,Dundee Junior-Senior High School,2022,White,78.6


In [490]:
# Key columns used to join the cleaned frames. Defined here, before first
# use, so the merge in the next cell does not depend on a cell that appears
# later in the notebook (which breaks Restart & Run All).
shared_cols = ['entity_cd', 'entity_name', 'year', 'subgroup_name']
shared_cols

['entity_cd', 'entity_name', 'year', 'subgroup_name']

In [501]:
# Inner-join Regents results with 2022 graduation rates, then with
# absenteeism, all on the shared key columns; only keys present in all three
# frames survive.
(
    nys_annual_regents
    .merge(nys_grad_rate_22, how='inner', on=shared_cols)
    .merge(nys_absenteeism_df, how='inner', on=shared_cols)
)

Unnamed: 0,entity_cd,entity_name,year,subject,subgroup_name,tested,num_prof,grad_rate,absent_rate,level
0,010100010034,Albany High School,2022,Regents US History&Gov't (Framework),Asian or Native Hawaiian/Other Pacific Islander,0,0,82.4,41.8,2
1,010100010034,Albany High School,2022,Regents Phy Set/Physics,Asian or Native Hawaiian/Other Pacific Islander,22,8,82.4,41.8,2
2,010100010034,Albany High School,2022,Regents Phy Set/Chemistry,Asian or Native Hawaiian/Other Pacific Islander,37,18,82.4,41.8,2
3,010100010034,Albany High School,2022,Regents NF Global History,Asian or Native Hawaiian/Other Pacific Islander,61,46,82.4,41.8,2
4,010100010034,Albany High School,2022,Regents Living Environment,Asian or Native Hawaiian/Other Pacific Islander,40,22,82.4,41.8,2
...,...,...,...,...,...,...,...,...,...,...
42642,680801040001,Dundee Junior-Senior High School,2022,Regents Living Environment,White,37,32,78.6,35.1,3
42643,680801040001,Dundee Junior-Senior High School,2022,Regents NF Global History,White,34,s,78.6,35.1,3
42644,680801040001,Dundee Junior-Senior High School,2022,Regents Phy Set/Chemistry,White,11,5,78.6,35.1,3
42645,680801040001,Dundee Junior-Senior High School,2022,Regents Phy Set/Earth Sci,White,27,s,78.6,35.1,3


Now we merge the two datasets on their 4 shared columns; together with the 5 feature columns, this gives 9 columns in the final dataframe.

In [118]:
# Key columns on which the cleaned frames are joined.
shared_cols = [
    'entity_cd',
    'entity_name',
    'year',
    'subgroup_name',
]

In [121]:
# Join absenteeism, assessment and graduation-rate frames on the shared key
# columns (pandas' default inner join).
# NOTE(review): nys_assessment_df is only loaded in a commented-out cell in
# this notebook, so this cell fails on a fresh kernel — confirm whether it is
# still needed.
copy_final_df = (
    nys_absenteeism_df
    .merge(nys_assessment_df, on=shared_cols)
    .merge(nys_grad_rate_df, on=shared_cols)
)

In [127]:
# Spot-check the joined frame for a single school.
copy_final_df[copy_final_df['entity_name'].eq('ALBANY HIGH SCHOOL')]

Unnamed: 0,entity_cd,entity_name,year,subgroup_name,enrollment,absent_count,subject,weighted_index,cohort,cohort_count,grad_count
0,010100010034,ALBANY HIGH SCHOOL,2022,Students with Disabilities,422,272,ELA,5.8,4-Year,76,47
1,010100010034,ALBANY HIGH SCHOOL,2022,Students with Disabilities,422,272,ELA,5.8,5-Year,90,48
2,010100010034,ALBANY HIGH SCHOOL,2022,Students with Disabilities,422,272,ELA,5.8,6-Year,76,45
3,010100010034,ALBANY HIGH SCHOOL,2022,Students with Disabilities,422,272,MATH,22.5,4-Year,76,47
4,010100010034,ALBANY HIGH SCHOOL,2022,Students with Disabilities,422,272,MATH,22.5,5-Year,90,48
...,...,...,...,...,...,...,...,...,...,...,...
247,010100010034,ALBANY HIGH SCHOOL,2022,English Language Learner,433,236,SCIENCE,83.6,5-Year,105,78
248,010100010034,ALBANY HIGH SCHOOL,2022,English Language Learner,433,236,SCIENCE,83.6,6-Year,74,43
249,010100010034,ALBANY HIGH SCHOOL,2022,English Language Learner,433,236,SOCIAL,3.3,4-Year,94,74
250,010100010034,ALBANY HIGH SCHOOL,2022,English Language Learner,433,236,SOCIAL,3.3,5-Year,105,78


In [128]:
# Inner-join absenteeism with graduation data on the four key columns.
nys_final_df = nys_absenteeism_df.merge(
    nys_grad_rate_df,
    how='inner',
    on=['entity_cd', 'entity_name', 'year', 'subgroup_name'],
)

In [129]:
# Number of distinct school names in the merged frame.
# NOTE(review): this counts names, not entity codes — schools that share a
# name would be counted once; confirm that is intended.
nys_final_df['entity_name'].nunique(dropna=False)

1426

This dataset contains 1,426 unique school names (distinct values of `entity_name`).

In [130]:
# Display the merged frame (pandas truncates the rendered rows).
nys_final_df

Unnamed: 0,entity_cd,entity_name,year,subgroup_name,enrollment,absent_count,cohort,cohort_count,grad_count
0,010100010034,ALBANY HIGH SCHOOL,2021,Students with Disabilities,414,196,6-Year,93,48
1,010100010034,ALBANY HIGH SCHOOL,2021,Students with Disabilities,414,196,5-Year,77,42
2,010100010034,ALBANY HIGH SCHOOL,2021,Students with Disabilities,414,196,4-Year,89,45
3,010100010034,ALBANY HIGH SCHOOL,2021,American Indian or Alaska Native,9,s,6-Year,3,s
4,010100010034,ALBANY HIGH SCHOOL,2021,American Indian or Alaska Native,9,s,5-Year,2,s
...,...,...,...,...,...,...,...,...,...
66280,680801040002,DUNDEE ELEMENTARY SCHOOL,2022,Economically Disadvantaged,s,s,5-Year,0,s
66281,680801040002,DUNDEE ELEMENTARY SCHOOL,2022,Economically Disadvantaged,s,s,6-Year,0,s
66282,680801040002,DUNDEE ELEMENTARY SCHOOL,2022,English Language Learner,s,s,4-Year,0,s
66283,680801040002,DUNDEE ELEMENTARY SCHOOL,2022,English Language Learner,s,s,6-Year,0,s


In [None]:
# Summarise column dtypes and non-null counts of the merged frame.
nys_final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 66285 entries, 0 to 66284
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   entity_cd      66285 non-null  int64 
 1   entity_name    66285 non-null  object
 2   year           66285 non-null  int64 
 3   subgroup_name  66285 non-null  object
 4   enrollment     66285 non-null  object
 5   absent_count   66285 non-null  object
 6   cohort         66285 non-null  object
 7   cohort_count   66285 non-null  object
 8   grad_count     66285 non-null  object
dtypes: int64(2), object(7)
memory usage: 5.1+ MB


In [None]:
# Drop rows whose graduate count or absentee count is the 's' placeholder.
# An exact comparison is used instead of `.str.contains('s') == False`:
# that form matches any value merely containing the letter "s", and it also
# discards every non-string value, because `.str.contains` yields NaN for
# those and `NaN == False` is False.
is_placeholder = nys_final_df['grad_count'].eq('s') | nys_final_df['absent_count'].eq('s')
nys_final_df = nys_final_df[~is_placeholder]

In [None]:
# Display the merged frame after the 's' rows have been removed.
nys_final_df

Unnamed: 0,entity_cd,entity_name,year,subgroup_name,enrollment,absent_count,cohort,cohort_count,grad_count
0,10100010034,ALBANY HIGH SCHOOL,2021,Students with Disabilities,414,196,6-Year,93,48
1,10100010034,ALBANY HIGH SCHOOL,2021,Students with Disabilities,414,196,5-Year,77,42
2,10100010034,ALBANY HIGH SCHOOL,2021,Students with Disabilities,414,196,4-Year,89,45
6,10100010034,ALBANY HIGH SCHOOL,2021,Asian or Native Hawaiian/Other Pacific Islander,219,40,4-Year,57,48
7,10100010034,ALBANY HIGH SCHOOL,2021,Asian or Native Hawaiian/Other Pacific Islander,219,40,6-Year,53,44
...,...,...,...,...,...,...,...,...,...
66227,680601060001,PENN YAN ACADEMY,2022,Economically Disadvantaged,191,67,6-Year,65,58
66246,680801040001,DUNDEE JUNIOR-SENIOR HIGH SCHOOL,2022,White,174,61,6-Year,54,46
66247,680801040001,DUNDEE JUNIOR-SENIOR HIGH SCHOOL,2022,White,174,61,5-Year,40,30
66248,680801040001,DUNDEE JUNIOR-SENIOR HIGH SCHOOL,2022,White,174,61,4-Year,53,40


In [185]:
# Show the rows of copy_1 whose cohort is '6-Year'.
# NOTE(review): copy_1 is not defined in any cell visible in this notebook —
# confirm where it comes from, or remove this cell.
copy_1[copy_1.cohort == '6-Year']

Unnamed: 0,entity_cd,entity_name,year,subgroup_name,cohort,cohort_count,grad_count
5,10100010034,ALBANY HIGH SCHOOL,2022,White,6-Year,139,125
54,10100010034,ALBANY HIGH SCHOOL,2021,White,6-Year,129,101


In [186]:
# Show the six-year-outcome rows of copy_2. The literal phrase "6 Year" is
# matched (regex disabled) because a bare '6' would also match a digit in the
# cohort year, e.g. the "2016" in a description that is not a six-year outcome.
# NOTE(review): copy_2 is not defined in any cell visible in this notebook —
# confirm where it comes from.
copy_2[copy_2['membership_desc'].str.contains('6 Year', regex=False)]

Unnamed: 0,aggregation_code,aggregation_name,nrc_desc,membership_desc,subgroup_name,grad_cnt
81839,10100010034,ALBANY HIGH SCHOOL,Urban-Suburban High Needs,2016 Total Cohort - 6 Year Outcome,White,122
81948,10100010034,ALBANY HIGH SCHOOL,Urban-Suburban High Needs,2016 Total Cohort - 6 Year Outcome - August 2022,White,122


In [262]:
# Display the graduation-rate frame (pandas truncates the rendered rows).
nys_grad_rate_df

Unnamed: 0,entity_cd,entity_name,year,subgroup_name,cohort,grad_count
0,010100010034,ALBANY HIGH SCHOOL,2022,Hispanic or Latino,6-Year,82
1,010100010034,ALBANY HIGH SCHOOL,2022,English Language Learner,4-Year,74
2,010100010034,ALBANY HIGH SCHOOL,2022,Multiracial,6-Year,s
3,010100010034,ALBANY HIGH SCHOOL,2022,White,4-Year,123
4,010100010034,ALBANY HIGH SCHOOL,2022,White,5-Year,122
...,...,...,...,...,...,...
71221,680801040002,DUNDEE ELEMENTARY SCHOOL,2022,Asian or Native Hawaiian/Other Pacific Islander,6-Year,s
71222,680801040002,DUNDEE ELEMENTARY SCHOOL,2022,Black or African American,4-Year,s
71223,680801040002,DUNDEE ELEMENTARY SCHOOL,2022,Black or African American,5-Year,s
71224,680801040002,DUNDEE ELEMENTARY SCHOOL,2022,Black or African American,6-Year,s


## Exporting Cleaned Data 

In [518]:
#nys_grad_rate_22.to_csv('data/2022_NYS_grad-rate.csv',index=False)