
# Data Cleaning and Manipulation

This notebook gathers data from our primary sources and transforms it into a format suitable for machine learning applications.

In [1]:
# import libraries
import pandas as pd
from functools import reduce
import glob

In [2]:
# read all data
# Each CRDC school-level CSV becomes one DataFrame:
#   * 'Yes'/'No' are parsed as True/False,
#   * the negative codes listed in na_values are treated as missing,
#   * the identifier columns LEAID/COMBOKEY/SCHID are kept as strings.
# 'cp1252' replaces 'ANSI', which Python only recognises on Windows (alias of
# 'mbcs'); assumes the files are Windows-1252 encoded — TODO confirm.
# The file list is sorted because glob's order depends on the file system and
# the first frame drives the left-merge performed later.
crdc_dfs = [
    pd.read_csv(
        filename,
        encoding='cp1252',
        true_values=['Yes'],
        false_values=['No'],
        na_values=['-3','-5','-6','-8','-9','-11'],
        dtype={'LEAID':str,'COMBOKEY':str,'SCHID':str},
    )
    for filename in sorted(glob.glob("../data/2017-18-crdc-data/Data/SCH/CRDC/CSV/*.csv"))
]

  crdc_dfs = [pd.read_csv(filename, encoding='ANSI', true_values=['Yes'], false_values=['No'], na_values=['-3','-5','-6','-8','-9','-11'], dtype={'LEAID':str,'COMBOKEY':str,'SCHID':str}) for filename in glob.glob("../data/2017-18-crdc-data/Data/SCH/CRDC/CSV/*.csv")]
  crdc_dfs = [pd.read_csv(filename, encoding='ANSI', true_values=['Yes'], false_values=['No'], na_values=['-3','-5','-6','-8','-9','-11'], dtype={'LEAID':str,'COMBOKEY':str,'SCHID':str}) for filename in glob.glob("../data/2017-18-crdc-data/Data/SCH/CRDC/CSV/*.csv")]


In [3]:
# merge dfs together
# Columns used as the join key; each must be present in every frame being merged.
merge_cols = ['LEA_STATE','LEA_STATE_NAME','LEAID','LEA_NAME','SCHID','SCH_NAME','COMBOKEY','JJ']


def _left_join_on_keys(accumulated, next_df):
    """Left-join ``next_df`` onto ``accumulated`` using ``merge_cols`` as the key."""
    return accumulated.merge(next_df, on=merge_cols, how='left')


# Fold the list of frames into a single wide table, one left-join at a time.
school_data = reduce(_left_join_on_keys, crdc_dfs)

In [4]:
# keep only schools that have Grade 12
# The SCH_GRADE_G12 column itself serves as the boolean row mask.
offers_grade_12 = school_data['SCH_GRADE_G12']
high_schools = school_data.loc[offers_grade_12]

In [12]:
# remove juvenile justice facilities
# Keep rows whose JJ flag equals False, then discard the flag column.
# NOTE: re-running this cell raises KeyError because JJ is already dropped.
not_juvenile_justice = high_schools['JJ'].eq(False)
high_schools = high_schools.loc[not_juvenile_justice].drop(columns=['JJ'])

In [5]:
# get rid of non-high-school data and calculated tot columns
import re

# Names containing an underscore followed by G01-G08, KG, PS or UG.
non_hs_pattern = re.compile(r'.*_(G0[1-8]|KG|PS|UG).*')
# Names that start with TOT.
total_pattern = re.compile(r'^TOT.*')

# Collect every column matching either pattern and drop them in one pass.
cols_to_drop = [
    col
    for col in high_schools.columns
    if non_hs_pattern.match(col) or total_pattern.match(col)
]
high_schools = high_schools.drop(columns=cols_to_drop)

In [6]:
# sort columns by fraction of NA values, largest first
# The mean of the boolean NA mask is the share of missing rows per column.
na_fraction = high_schools.isna().mean()
na_fraction.sort_values(ascending=False)

SCH_HBREPORTED_DIS_BL_M     1.0
SCH_HBREPORTED_SEX_WH_F     1.0
SCH_HBREPORTED_SEX_TR_F     1.0
SCH_HBREPORTED_SEX_LEP_M    1.0
SCH_HBREPORTED_SEX_LEP_F    1.0
                           ... 
SCH_STATUS_SPED             0.0
SCH_STATUS_MAGNET           0.0
SCH_STATUS_CHARTER          0.0
SCH_STATUS_ALT              0.0
LEA_STATE                   0.0
Length: 1265, dtype: float64

In [7]:
# drop columns with no data
# A column is kept only when at least one of its values is present.
has_any_value = high_schools.notna().any(axis=0).to_numpy()
high_schools = high_schools.loc[:, has_any_value]
high_schools.shape

(24931, 836)

In [8]:
# drop columns with more than 40% na vals
# Largest share of missing values a column may have and still be kept.
MAX_NA_FRACTION = 0.4
# Compare each column's NA share with the limit directly. The previous
# thresh=int(n_rows * 0.6) truncated the required non-NA count, which let
# through columns whose NA share was slightly above 40%.
na_share = high_schools.isna().mean().to_numpy()
high_schools = high_schools.loc[:, na_share <= MAX_NA_FRACTION]
high_schools.shape

(24931, 530)

In [9]:
# summary statistics (count, mean, std, min, quartiles, max) for the remaining columns
high_schools.describe()

Unnamed: 0,SCH_MATHCLASSES_ADVM,SCH_MATHCERT_ADVM,SCH_MATHENR_ADVM_HI_M,SCH_MATHENR_ADVM_HI_F,SCH_MATHENR_ADVM_AM_M,SCH_MATHENR_ADVM_AM_F,SCH_MATHENR_ADVM_AS_M,SCH_MATHENR_ADVM_AS_F,SCH_MATHENR_ADVM_HP_M,SCH_MATHENR_ADVM_HP_F,...,SCH_DISCWDIS_TFRALT_IDEA_BL_M,SCH_DISCWDIS_TFRALT_IDEA_BL_F,SCH_DISCWDIS_TFRALT_IDEA_WH_M,SCH_DISCWDIS_TFRALT_IDEA_WH_F,SCH_DISCWDIS_TFRALT_IDEA_TR_M,SCH_DISCWDIS_TFRALT_IDEA_TR_F,SCH_DISCWDIS_TFRALT_LEP_M,SCH_DISCWDIS_TFRALT_LEP_F,SCH_DISCWDIS_TFRALT_504_M,SCH_DISCWDIS_TFRALT_504_F
count,24926.0,17600.0,17600.0,17600.0,17600.0,17600.0,17600.0,17600.0,17600.0,17600.0,...,24467.0,24467.0,24467.0,24467.0,24467.0,24467.0,24473.0,24473.0,24469.0,24467.0
mean,5.559416,6.659205,13.530795,14.870909,0.345398,0.417159,5.452727,5.634943,0.185398,0.21983,...,0.101892,0.027425,0.075816,0.017411,0.006458,0.002166,0.013852,0.002207,0.035473,0.010954
std,9.505211,9.345555,32.183223,34.519356,1.677053,2.00988,20.701964,20.607065,1.503524,1.972717,...,0.889207,0.350905,0.519872,0.186456,0.093301,0.049886,0.199737,0.056413,0.391549,0.174976
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,4.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7.0,9.0,11.0,13.0,0.0,0.0,3.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,608.0,608.0,571.0,552.0,70.0,69.0,587.0,564.0,103.0,138.0,...,64.0,30.0,15.0,6.0,5.0,2.0,12.0,4.0,17.0,9.0


In [60]:
# add NC ACT data
# Read the first five columns of the NC sheet, taking the third row as the
# header; ' ---- ' and '*' count as missing, and incomplete rows are discarded.
nc_act = pd.read_excel(
    '../data/act-results/NCactresults1718.xlsx',
    header=2,
    usecols=[0,1,2,3,4],
    na_values=[' ---- ','*'],
).dropna()
# NOTE(review): the join key is the school name alone — presumably names are
# unique within NC; verify, otherwise rows will be duplicated or mismatched.
nc_schools = high_schools[high_schools['LEA_STATE'].eq('NC')]
nc_high_schools = nc_schools.merge(nc_act, left_on='SCH_NAME', right_on='System or School Name')

In [62]:
# add SC ACT data
# NOTE(review): with header=2 the school-name column ends up labelled
# 'Unnamed: 2' (the merge key below); the row taken as header may really be a
# summary row — confirm against the spreadsheet.
sc_act = pd.read_excel(
    '../data/act-results/SCACT-Schools2017final2.xlsx',
    header=2,
    usecols=[0,1,2,10],
).dropna()
# Upper-case the SC school names before matching (presumably the ACT sheet
# uses upper-case names — verify). This modifies high_schools itself.
is_sc = high_schools['LEA_STATE'].eq('SC')
high_schools.loc[is_sc, 'SCH_NAME'] = high_schools.loc[is_sc, 'SCH_NAME'].str.upper()
sc_high_schools = high_schools[is_sc].merge(sc_act, left_on='SCH_NAME', right_on='Unnamed: 2')

In [63]:
# add GA ACT data
# Load four columns of the GA results, discard incomplete rows and keep only
# the rows whose test component is 'Composite'.
ga_act = (
    pd.read_csv('../data/act-results/GA_ACT_HIGHEST_2018_FEB_24_2020.csv', usecols=[1,4,6,14])
    .dropna()
    .loc[lambda scores: scores['TEST_CMPNT_TYP_CD'].eq('Composite')]
)
# Match GA schools to their ACT rows by school name.
ga_schools = high_schools[high_schools['LEA_STATE'].eq('GA')]
ga_high_schools = ga_schools.merge(ga_act, left_on='SCH_NAME', right_on='INSTN_NAME')

In [64]:
# Stack the three per-state frames for display (the result is not stored).
# NOTE(review): each state keeps its own ACT column names, so the stacked frame
# has state-specific score columns that are NaN for the other states.
state_frames = [nc_high_schools, sc_high_schools, ga_high_schools]
pd.concat(state_frames)

Unnamed: 0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,COMBOKEY,SCH_MATHCLASSES_ADVM,SCH_MATHCERT_ADVM,SCH_MATHENR_ADVM_HI_M,...,Tested,Mean,STATEWIDE - PUBLIC ONLY,Unnamed: 1,Unnamed: 2,18.4,SCHOOL_DISTRCT_CD,INSTN_NAME,TEST_CMPNT_TYP_CD,INSTN_AVG_SCORE_VAL
0,NC,NORTH CAROLINA,3700011,Cumberland County Schools,00400,Douglas Byrd High,370001100400,10.0,2.0,7.0,...,246.0,14.7,,,,,,,,
1,NC,NORTH CAROLINA,3700011,Cumberland County Schools,00402,Cape Fear High,370001100402,14.0,2.0,9.0,...,336.0,18.3,,,,,,,,
2,NC,NORTH CAROLINA,3700011,Cumberland County Schools,00420,Massey Hill Classical High,370001100420,4.0,1.0,7.0,...,76.0,20.2,,,,,,,,
3,NC,NORTH CAROLINA,3700011,Cumberland County Schools,00426,Pine Forest High,370001100426,10.0,4.0,21.0,...,378.0,17.8,,,,,,,,
4,NC,NORTH CAROLINA,3700011,Cumberland County Schools,00433,Seventy-First High,370001100433,18.0,3.0,12.0,...,300.0,15.5,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,GA,GEORGIA,1305700,Whitfield County,04053,Coahulla Creek High School,130570004053,19.0,12.0,30.0,...,,,,,,,755.0,Coahulla Creek High School,Composite,19.6
380,GA,GEORGIA,1305730,Wilcox County,02232,Wilcox County High School,130573002232,0.0,,,...,,,,,,,756.0,Wilcox County High School,Composite,17.9
381,GA,GEORGIA,1305760,Wilkes County,03852,Washington-Wilkes Comprehensive High School,130576003852,4.0,2.0,3.0,...,,,,,,,757.0,Washington-Wilkes Comprehensive High School,Composite,16.7
382,GA,GEORGIA,1305790,Wilkinson County,01841,Wilkinson County High School,130579001841,0.0,,,...,,,,,,,758.0,Wilkinson County High School,Composite,17.0
