
# Data Cleaning and Manipulation

This notebook gathers data from our primary sources (the 2017-18 CRDC school-level files and state-published ACT results) and transforms it into a single table that can be used for machine learning applications.

In [1]:
# import libraries
import pandas as pd
from functools import reduce
import glob

In [2]:
# read all data
# One DataFrame per CSV in the CRDC school-level directory. 'Yes'/'No' are parsed as
# booleans and the listed negative codes are treated as missing values.
# 'cp1252' replaces the original 'ANSI' encoding name: Python only accepts 'ansi' on
# Windows (as an alias for the system code page) and raises LookupError elsewhere.
# NOTE(review): assumes the files were originally read under a cp1252 code page - confirm.
# The paths are sorted because glob returns them in arbitrary order and the first frame
# becomes the left-hand base when the frames are merged.
crdc_paths = sorted(glob.glob("../data/2017-18-crdc-data/Data/SCH/CRDC/CSV/*.csv"))
crdc_dfs = [
    pd.read_csv(
        path,
        encoding='cp1252',
        true_values=['Yes'],
        false_values=['No'],
        na_values=['-3','-5','-6','-8','-9','-11'],
        dtype={'LEAID':str,'COMBOKEY':str,'SCHID':str},
    )
    for path in crdc_paths
]

  crdc_dfs = [pd.read_csv(filename, encoding='ANSI', true_values=['Yes'], false_values=['No'], na_values=['-3','-5','-6','-8','-9','-11'], dtype={'LEAID':str,'COMBOKEY':str,'SCHID':str}) for filename in glob.glob("../data/2017-18-crdc-data/Data/SCH/CRDC/CSV/*.csv")]
  crdc_dfs = [pd.read_csv(filename, encoding='ANSI', true_values=['Yes'], false_values=['No'], na_values=['-3','-5','-6','-8','-9','-11'], dtype={'LEAID':str,'COMBOKEY':str,'SCHID':str}) for filename in glob.glob("../data/2017-18-crdc-data/Data/SCH/CRDC/CSV/*.csv")]


In [3]:
# merge dfs together
# Each CRDC frame is left-joined onto the running result using the shared
# school-identifying columns, so the first frame in crdc_dfs decides which rows are kept.
merge_cols = ['LEA_STATE','LEA_STATE_NAME','LEAID','LEA_NAME','SCHID','SCH_NAME','COMBOKEY','JJ']


def _left_join_on_keys(accumulated, incoming):
    """Left-join `incoming` onto `accumulated` using the columns in `merge_cols`."""
    return accumulated.merge(incoming, on=merge_cols, how='left')


school_data = reduce(_left_join_on_keys, crdc_dfs)

In [4]:
# keep only schools that have Grade 12
# Compare against True explicitly (the JJ filter does the same with False) so rows where
# SCH_GRADE_G12 is missing are excluded instead of producing an invalid boolean mask.
has_grade_12 = school_data['SCH_GRADE_G12'].eq(True)
high_schools = school_data[has_grade_12]

In [5]:
# remove juvenile justice facilities
# Keep rows whose JJ flag equals False, then discard the flag column itself.
not_juvenile_justice = high_schools['JJ'].eq(False)
high_schools = high_schools.loc[not_juvenile_justice].drop(columns='JJ')

In [6]:
# get rid of non-high-school data and calculated tot columns
# Drops every column whose name contains "_" followed by a G01-G08 / GS01-GS08, KG, PS
# or UG token.
# NOTE(review): a "TOT_" prefix alone is not matched by this pattern, and TOT_ columns
# still show up in the describe() output - confirm whether they were meant to be removed.
import re

lower_grade_pattern = re.compile(r'.*_(GS?0[1-8]|KG|PS|UG).*')
lower_grade_cols = [name for name in high_schools.columns if lower_grade_pattern.match(name)]
high_schools = high_schools.drop(columns=lower_grade_cols)

In [7]:
# summary statistics of the filtered high-school frame (sanity check of the remaining columns)
high_schools.describe()

Unnamed: 0,SCH_MATHCLASSES_ADVM,SCH_MATHCERT_ADVM,SCH_MATHENR_ADVM_HI_M,SCH_MATHENR_ADVM_HI_F,SCH_MATHENR_ADVM_AM_M,SCH_MATHENR_ADVM_AM_F,SCH_MATHENR_ADVM_AS_M,SCH_MATHENR_ADVM_AS_F,SCH_MATHENR_ADVM_HP_M,SCH_MATHENR_ADVM_HP_F,...,SCH_DISCWDIS_TFRALT_IDEA_WH_M,SCH_DISCWDIS_TFRALT_IDEA_WH_F,SCH_DISCWDIS_TFRALT_IDEA_TR_M,SCH_DISCWDIS_TFRALT_IDEA_TR_F,TOT_DISCWDIS_TFRALT_IDEA_M,TOT_DISCWDIS_TFRALT_IDEA_F,SCH_DISCWDIS_TFRALT_LEP_M,SCH_DISCWDIS_TFRALT_LEP_F,SCH_DISCWDIS_TFRALT_504_M,SCH_DISCWDIS_TFRALT_504_F
count,24469.0,17534.0,17534.0,17534.0,17534.0,17534.0,17534.0,17534.0,17534.0,17534.0,...,24467.0,24467.0,24467.0,24467.0,24467.0,24467.0,24471.0,24471.0,24468.0,24467.0
mean,5.655932,6.675488,13.579389,14.926429,0.346641,0.418729,5.473252,5.656154,0.186096,0.220657,...,0.075816,0.017411,0.006458,0.002166,0.228103,0.059018,0.013853,0.002207,0.035475,0.010954
std,9.564453,9.356777,32.233838,34.572369,1.680068,2.013496,20.738184,20.642909,1.506309,1.97638,...,0.519872,0.186456,0.093301,0.049886,1.348344,0.501806,0.199745,0.056415,0.391557,0.174976
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,4.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8.0,9.0,11.0,13.0,0.0,0.0,3.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,608.0,608.0,571.0,552.0,70.0,69.0,587.0,564.0,103.0,138.0,...,15.0,6.0,5.0,2.0,76.0,33.0,12.0,4.0,17.0,9.0


In [8]:
# add NC ACT data
# Read the first five columns of the NC sheet (' ---- ' and '*' count as missing), drop
# incomplete rows, and keep only the school name and mean score.
nc_act = pd.read_excel(
    '../data/act-results/NCactresults1718.xlsx',
    header=2,
    usecols=[0,1,2,3,4],
    na_values=[' ---- ','*'],
).dropna()
nc_act = nc_act.loc[:, ['System or School Name','Mean']]
nc_act.columns = ['SCH_NAME','ACT']
# Join the NC schools to their ACT score by exact school name (default inner merge).
nc_rows = high_schools['LEA_STATE'].eq('NC')
nc_high_schools = high_schools[nc_rows].merge(nc_act, on='SCH_NAME')

In [9]:
# add SC ACT data
sc_act = pd.read_excel(
    '../data/act-results/SCACT-Schools2017final2.xlsx',
    header=2,
    usecols=[0,1,2,10],
).dropna()
# The chosen header row yields 'Unnamed: 2' and the numeric label 18.4 as column names.
# NOTE(review): 18.4 looks like a data value promoted to a header - confirm the header offset.
sc_act = sc_act.loc[:, ['Unnamed: 2',18.4]]
sc_act.columns = ['SCH_NAME','ACT']
# Upper-case the SC school names in high_schools itself before the exact-name join.
sc_rows = high_schools['LEA_STATE'].eq('SC')
high_schools.loc[sc_rows,'SCH_NAME'] = high_schools.loc[sc_rows,'SCH_NAME'].str.upper()
sc_high_schools = high_schools[sc_rows].merge(sc_act, on='SCH_NAME')

In [10]:
# add GA ACT data
ga_act = pd.read_csv(
    '../data/act-results/GA_ACT_HIGHEST_2018_FEB_24_2020.csv',
    usecols=[1,4,6,14],
).dropna()
# Only the rows reporting the composite score are kept, reduced to name and score.
is_composite = ga_act['TEST_CMPNT_TYP_CD'].eq('Composite')
ga_act = ga_act.loc[is_composite, ['INSTN_NAME','INSTN_AVG_SCORE_VAL']]
ga_act.columns = ['SCH_NAME','ACT']
ga_rows = high_schools['LEA_STATE'].eq('GA')
ga_high_schools = high_schools[ga_rows].merge(ga_act, on='SCH_NAME')

In [11]:
# add CO
co_act = pd.read_excel(
    '../data/act-results/CO_ACT District and School Summary 2015_2016.xlsx',
    header=4,
    usecols=[3,11],
).dropna()
co_act.columns = ['SCH_NAME','ACT']
# Upper-case the CO school names in high_schools itself before the exact-name join.
co_rows = high_schools['LEA_STATE'].eq('CO')
high_schools.loc[co_rows,'SCH_NAME'] = high_schools.loc[co_rows,'SCH_NAME'].str.upper()
co_high_schools = high_schools[co_rows].merge(co_act, on='SCH_NAME')
# shape of the state's CRDC rows vs. shape after the ACT join
high_schools[co_rows].shape, co_high_schools.shape

((494, 1396), (431, 1397))

In [12]:
# add FL
fl_raw = pd.read_excel(
    '../data/act-results/FL_2017ACTSchool.xlsx',
    header=3,
    usecols=[3,9],
    na_values=['*'],
).dropna()
# Expand the " HS" abbreviation in the ACT file's school names
# (presumably to match the CRDC spelling - verify against the data).
fl_act = fl_raw.assign(**{'Unnamed: 3': fl_raw['Unnamed: 3'].str.replace(' HS',' HIGH SCHOOL')})
fl_act.columns = ['SCH_NAME','ACT']
# Upper-case the FL school names in high_schools itself before the exact-name join.
fl_rows = high_schools['LEA_STATE'].eq('FL')
high_schools.loc[fl_rows,'SCH_NAME'] = high_schools.loc[fl_rows,'SCH_NAME'].str.upper()
fl_high_schools = high_schools[fl_rows].merge(fl_act, on='SCH_NAME')
# shape of the state's CRDC rows vs. shape after the ACT join
high_schools[fl_rows].shape, fl_high_schools.shape

((951, 1396), (286, 1397))

In [13]:
# add LA
la_act = pd.read_excel(
    '../data/act-results/LA_act-class-of-2018.xlsx',
    sheet_name='Site',
    header=5,
    usecols=[3,5],
).dropna()
la_act.columns = ['SCH_NAME','ACT']
la_rows = high_schools['LEA_STATE'].eq('LA')
la_high_schools = high_schools[la_rows].merge(la_act, on='SCH_NAME')
# shape of the state's CRDC rows vs. shape after the ACT join
high_schools[la_rows].shape, la_high_schools.shape

((353, 1396), (341, 1397))

In [14]:
# add MO
mo_act = pd.read_excel(
    '../data/act-results/MO_Building ACT Results.xlsx',
    header=0,
    usecols=[0,4,9],
).dropna()
# Keep the 2018 rows, then discard the first of the three selected columns.
mo_act = mo_act.loc[mo_act['YEAR'].eq(2018)].iloc[:,1:]
mo_act.columns = ['SCH_NAME','ACT']
# Upper-case the MO school names in high_schools itself before the exact-name join.
mo_rows = high_schools['LEA_STATE'].eq('MO')
high_schools.loc[mo_rows,'SCH_NAME'] = high_schools.loc[mo_rows,'SCH_NAME'].str.upper()
mo_high_schools = high_schools[mo_rows].merge(mo_act, on='SCH_NAME')
# shape of the state's CRDC rows vs. shape after the ACT join
high_schools[mo_rows].shape, mo_high_schools.shape

((639, 1396), (576, 1397))

In [15]:
# add MN
mn_act = pd.read_excel(
    '../data/act-results/Minnesota 2018 Public Schools Graduating Class 5 Year Trends.xlsx',
    header=0,
    usecols=[0,3,5,11],
).dropna()
# Only school-level rows for the 2018 graduating class, reduced to name and composite average.
school_level_2018 = mn_act['Analysis Level'].eq('School') & mn_act['Grad Year'].eq(2018)
mn_act = mn_act.loc[school_level_2018, ['HS Name','Avg Comp']]
mn_act.columns = ['SCH_NAME','ACT']
# Normalise the MN names in high_schools itself: upper-case, then rewrite 'SECONDARY'
# and 'SENIOR HIGH' as 'HIGH SCHOOL' (in that order) before the exact-name join.
mn_rows = high_schools['LEA_STATE'].eq('MN')
mn_names = (
    high_schools.loc[mn_rows,'SCH_NAME']
    .str.upper()
    .str.replace('SECONDARY','HIGH SCHOOL')
    .str.replace('SENIOR HIGH','HIGH SCHOOL')
)
high_schools.loc[mn_rows,'SCH_NAME'] = mn_names
mn_high_schools = high_schools[mn_rows].merge(mn_act, on='SCH_NAME')
# shape of the state's CRDC rows vs. shape after the ACT join
high_schools[mn_rows].shape, mn_high_schools.shape

((816, 1396), (215, 1397))

In [16]:
# add NE
ne_act = pd.read_csv('../data/act-results/NE_ACT_Composite_20202021.csv').dropna()
# Keep rows whose LEVEL is 'SC', reduced to name and average composite score.
ne_act = ne_act.loc[ne_act['LEVEL'].eq('SC'), ['NAME','AVERAGE_COMPOSITE_SCORE']]
ne_act.columns = ['SCH_NAME','ACT']
# Upper-case the NE school names in high_schools itself before the exact-name join.
ne_rows = high_schools['LEA_STATE'].eq('NE')
high_schools.loc[ne_rows,'SCH_NAME'] = high_schools.loc[ne_rows,'SCH_NAME'].str.upper()
ne_high_schools = high_schools[ne_rows].merge(ne_act, on='SCH_NAME')
# shape of the state's CRDC rows vs. shape after the ACT join
high_schools[ne_rows].shape, ne_high_schools.shape

((312, 1396), (263, 1397))

In [17]:
# add ND
nd_raw = pd.read_excel(
    '../data/act-results/North Dakota ACT Scores.xlsx',
    header=1,
    usecols=[1,10],
).dropna()
# Here the ACT file's names are upper-cased; the CRDC names are joined as they are.
nd_act = nd_raw.assign(School=nd_raw['School'].str.upper())
nd_act.columns = ['SCH_NAME','ACT']
nd_rows = high_schools['LEA_STATE'].eq('ND')
nd_high_schools = high_schools[nd_rows].merge(nd_act, on='SCH_NAME')
# shape of the state's CRDC rows vs. shape after the ACT join
high_schools[nd_rows].shape, nd_high_schools.shape

((167, 1396), (88, 1397))

In [18]:
# add OK
ok_act = pd.read_excel(
    '../data/act-results/OK_School Level_ACT_2016_Senior.xlsx',
    header=0,
    usecols=[5,7],
).dropna()
ok_act.columns = ['SCH_NAME','ACT']
ok_rows = high_schools['LEA_STATE'].eq('OK')
ok_high_schools = high_schools[ok_rows].merge(ok_act, on='SCH_NAME')
# shape of the state's CRDC rows vs. shape after the ACT join
high_schools[ok_rows].shape, ok_high_schools.shape

((490, 1396), (458, 1397))

In [19]:
# add PA
pa_act = pd.read_excel(
    '../data/act-results/Pennsylvania_2017 ACT Scores Public Schools.xlsx',
    header=7,
    usecols=[4,11],
).dropna()
pa_act.columns = ['SCH_NAME','ACT']
pa_rows = high_schools['LEA_STATE'].eq('PA')
pa_high_schools = high_schools[pa_rows].merge(pa_act, on='SCH_NAME')
# shape of the state's CRDC rows vs. shape after the ACT join
high_schools[pa_rows].shape, pa_high_schools.shape

((751, 1396), (565, 1397))

In [20]:
# add TN
tn_act = pd.read_excel(
    '../data/act-results/TN_2017-18 ACT_school_suppressed.xlsx',
    header=0,
    usecols=[3,4,11],
).dropna()
# Keep only the 'All Students' subgroup, reduced to name and average composite score.
all_students = tn_act['Subgroup'].eq('All Students')
tn_act = tn_act.loc[all_students, ['School Name','Average Composite Score']]
tn_act.columns = ['SCH_NAME','ACT']
tn_rows = high_schools['LEA_STATE'].eq('TN')
tn_high_schools = high_schools[tn_rows].merge(tn_act, on='SCH_NAME')
# shape of the state's CRDC rows vs. shape after the ACT join
high_schools[tn_rows].shape, tn_high_schools.shape

((402, 1396), (395, 1397))

In [21]:
# add TX
tx_raw = pd.read_excel(
    '../data/act-results/TX_ACT_Campus_Data_Class_2018.xlsx',
    sheet_name='ACT_Campus_Data_Class_2018',
    header=0,
    usecols=[0,3,14],
).dropna()
# Keep the 'All Students' rows, then discard the first of the three selected columns.
tx_all_students = tx_raw.loc[tx_raw['Group'].eq('All Students')].iloc[:,1:]
# Here the ACT file's campus names are upper-cased; the CRDC names are joined as they are.
tx_act = tx_all_students.assign(CampName=tx_all_students['CampName'].str.upper())
tx_act.columns = ['SCH_NAME','ACT']
tx_rows = high_schools['LEA_STATE'].eq('TX')
tx_high_schools = high_schools[tx_rows].merge(tx_act, on='SCH_NAME')
# shape of the state's CRDC rows vs. shape after the ACT join
high_schools[tx_rows].shape, tx_high_schools.shape

((2033, 1396), (1605, 1397))

In [22]:
# add UT
ut_act = pd.read_excel(
    '../data/act-results/Utah_ACTGrade112018.xlsx',
    sheet_name='SCHOOL',
    header=0,
    usecols=[3,5],
).dropna()
ut_act.columns = ['SCH_NAME','ACT']
ut_rows = high_schools['LEA_STATE'].eq('UT')
ut_high_schools = high_schools[ut_rows].merge(ut_act, on='SCH_NAME')
# shape of the state's CRDC rows vs. shape after the ACT join
high_schools[ut_rows].shape, ut_high_schools.shape

((215, 1396), (186, 1397))

In [23]:
# add WI
wi_act = pd.read_csv('../data/act-results/WI_act_graduates_certified_2017-18.csv')
# Keep the composite score reported for all students, reduced to name and average score.
overall_composite = wi_act['TEST_SUBJECT'].eq('Composite') & wi_act['GROUP_BY'].eq('All Students')
wi_act = wi_act.loc[overall_composite, ['SCHOOL_NAME','AVERAGE_SCORE']]
wi_act.columns = ['SCH_NAME','ACT']
wi_rows = high_schools['LEA_STATE'].eq('WI')
wi_high_schools = high_schools[wi_rows].merge(wi_act, on='SCH_NAME')
# NOTE(review): the recorded output shows more merged rows (682) than WI schools (576), so
# SCH_NAME matches multiple rows on at least one side - confirm and de-duplicate.
# shape of the state's CRDC rows vs. shape after the ACT join
high_schools[wi_rows].shape, wi_high_schools.shape

((576, 1396), (682, 1397))

In [24]:
# add WY
wy_act = pd.read_excel(
    '../data/act-results/WY_DistrictACTSuiteAverages_20220329_050233.xlsx',
    header=8,
    usecols=[4,20],
).dropna()
wy_act.columns = ['SCH_NAME','ACT']
wy_rows = high_schools['LEA_STATE'].eq('WY')
wy_high_schools = high_schools[wy_rows].merge(wy_act, on='SCH_NAME')
# shape of the state's CRDC rows vs. shape after the ACT join
high_schools[wy_rows].shape, wy_high_schools.shape

  warn("Workbook contains no default style, apply openpyxl's default")


((95, 1396), (70, 1397))

In [25]:
# stack the per-state frames (each now carrying an ACT column) into one table
state_frames = [
    nc_high_schools,
    sc_high_schools,
    ga_high_schools,
    co_high_schools,
    fl_high_schools,
    la_high_schools,
    mn_high_schools,
    mo_high_schools,
    ne_high_schools,
    nd_high_schools,
    ok_high_schools,
    pa_high_schools,
    tn_high_schools,
    tx_high_schools,
    ut_high_schools,
    wi_high_schools,
    wy_high_schools,
]
high_schools_act = pd.concat(state_frames)

In [26]:
import numpy as np

# Turn the non-score placeholder values that occur in the ACT column into missing values.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
act_placeholders = ['*','--','~',-1,'***','N<10']
high_schools_act['ACT'] = high_schools_act['ACT'].replace(act_placeholders, np.nan)

In [27]:
# write the combined table to disk without the index column
output_csv = '../data/full_highschool_data.csv'
high_schools_act.to_csv(output_csv, index=False)