In [1]:
import os
import glob
import datetime
import pandas as pd

valid_columns = ["State", "County","School District", "School Name", "Grade #",	"School Year"	
           "Ethnicity/Race","Economically Disadvantaged",	"ESL",	"Disability",	"Migrant",	
           "Student/Teacher Ratio",	"Enrollment Rate", "Dropout Rate", "Gender"]

#add quarter or month to the valid col?

valid_years = list(range(2012,datetime.datetime.today().year))

valid_summaries = {"ss": "Statewide Totals" , "sr": "Statewide Region Totals", "so": "Statewide County Totals",
"sd": "Statewide District Totals", "sc":"Statewide Campus Totals", "rr": "Selected Regionwide Totals", "ro":"Selected Regionwide County Totals",
"rd": "Selected Regionwide District Totals", "rc": "Selected Regionwide Campus Totals", "oo":"Selected Countywide Totals", "od":"Selected Countywide District Totals",
"oc": "Selected Countywide Campus Totals", "id": "Selected District Totals using district number",
 "dd": "Selected District Totals using district name", "ic":"Selected Campus Totals using district number",
 "ic": "Selected Campus Totals using district number", "dc": "Selected Campus Totals using district name",
 "nc": "Selected Campus Totals using campus number", "cc":"Selected Campus Totals using campus name"}

valid_groupings = ["Ethnicity",  "Gender", "Grade", "Gender and Ethnicity", "Grade and Ethnicity", "Grade and Gender", "All"]

disability_mappings = { "MID": "Mild Intellectual Disability", "MoID": "Moderate Intellectual Disability", "SID":"Severe Intellectual Disability", 
                     "PID": "Profound Intellectual Disability", "EBD" : "Emotional/Behavioral Disorder", "SLD" : "Specific Learning Disability", 
                     "OI" :"Orthopedic Impairment", "D" :"Deaf", "OHI" : "Other Health Impairment", "BL" : "Blind", 
                     "DB":"Deaf and Blind", "SI":"Speech/Language Impairment", "AUT":"Autism", "TBI": "Traumatic Brain Injury","SDD" :"Significant Developmental Delay" }            


In [2]:
merged_years = []

for year in valid_years: 
    for grouping in ["All"]:
        datapath = "{}/{}/{}".format(year, valid_summaries["so"], grouping)
        filenames = glob.glob(os.path.join(datapath, "*.csv"))
        dfs = list(pd.read_csv(files, low_memory=False, error_bad_lines=False) for files in filenames) 
        merged_years = pd.concat(dfs, ignore_index=True, sort=True)
        
merged_years["School Year"] = year
merged_years["State"] = "Virginia"
merged_years.rename(columns={'DIV_NAME':'County'}, inplace = True)

merged_years.rename(columns={'GRADE_CODE':'Grade #'}, inplace = True)
merged_years.rename(columns={'FEDERAL_RACE_CODE':'Ethnicity/Race'}, inplace = True)
merged_years.rename(columns={'DISADVANTAGED_FLAG':'Economically Disadvantaged'}, inplace = True)
merged_years.rename(columns={'DISABILITY_FLAG':'Disability'}, inplace = True)
merged_years.rename(columns={'GENDER':'Gender'}, inplace = True)

# Unused cols: 
# From schema
# valid_columns = [School District", "School Name",		"ESL",	"Disability",	"Migrant",	
#            "Student/Teacher Ratio",	"Enrollment Rate", "Dropout Rate"]
# From data: 
#Index(['DEC1_CNT', 'DIV_NUM','LEP_FLAG','LEVEL_CODE', 'PRIMARY_DISABILITY_TYPE', 'SCHOOL_YEAR', 'SCH_NAME','SCH_NUM'],
        
merged_years.drop(inplace=True, columns=['DIV_NUM','LEP_FLAG','LEVEL_CODE', 'PRIMARY_DISABILITY_TYPE', 'SCHOOL_YEAR', 'SCH_NAME','SCH_NUM'])

print(merged_years.columns)
merged_years.head()

Index(['DEC1_CNT', 'Disability', 'Economically Disadvantaged', 'County',
       'Ethnicity/Race', 'Gender', 'Grade #', 'School Year', 'State'],
      dtype='object')


Unnamed: 0,DEC1_CNT,Disability,Economically Disadvantaged,County,Ethnicity/Race,Gender,Grade #,School Year,State
0,10,Y,Y,Accomack County,3.0,M,1,2019,Virginia
1,13,Y,,Accomack County,3.0,M,1,2019,Virginia
2,11,Y,Y,Accomack County,3.0,M,1,2019,Virginia
3,16,Y,,Accomack County,3.0,M,1,2019,Virginia
4,14,Y,Y,Accomack County,3.0,,1,2019,Virginia


In [3]:
enrollment_sums =  merged_years.groupby('County')['DEC1_CNT'].sum().reset_index(name ='Enrollment Totals')
print(enrollment_sums.head())

              County  Enrollment Totals
0   Accomack County               55391
1  Albemarle County              159870
2   Alexandria City              155989
3  Alleghany County               20230
4     Amelia County               13818


In [None]:
#Test: 
print(enrollment_sums.loc[enrollment_sums['County'] == 'Accomack County ']['Enrollment Totals'].item())

In [None]:
# # TAKING TOO LONG TO LOAD? 
# Enrollment Rate = 'DEC1_CNT' / enrollment_sums['County'] 
# merged_years['Enrollment Rate'] = merged_years.apply(lambda x: x['DEC1_CNT'] / enrollment_sums.loc[enrollment_sums['County'] == x['County']]['Enrollment Totals'].item(), 
# axis=1) 
# merged_years.drop(inplace=True, columns=['DEC1_CNT']
# print(merged_years.head())

In [None]:
# Sample Plot by Race : replace with Enrollment Rate once I have that col

# df['Race'].groupby('Race').size().plot(title='Enrollment Rate by Race', kind='line')

merged_years.groupby('Ethnicity/Race')['DEC1_CNT'].plot(legend=True, title='Enrollment Rate by Race', kind='bar')
# merged_years.groupby('Ethnicity/Race')['Enrollment Rate'].plot(legend=True, title='Enrollment Rate by Race', kind='bar')
