# Education data prep **Part 1**

### Standardizing county names, and general cleaning/refactoring

- We have 3 education datasets:
  - graduation rates
  - student mobility
  - student mobility by demographics

Only ONE of them (grad rate) has a county column. Fortunately, all 3 datasets share a district column, so we will use it to merge the county column into the other two datasets.

In [1]:
import crime as cr
import pandas as pd
# Load the three education datasets through the project's `crime` helper.
# Each result is bound to `_` so the cell displays nothing.
for dataset in ('dist_grad_rate', 'dist_student_mobility', 'dist_mobility_demographics'):
    _ = cr.load(dataset, full=True)

## Get County Names

In [2]:
# District -> county lookup, built from the graduation-rate dataset.
# .copy() gives an independent frame, so the .loc assignments below write to
# the lookup itself and do not raise pandas' SettingWithCopyWarning (which a
# bare column-subset selection followed by .loc assignment can trigger).
county = cr.df('dist_grad_rate')[['organization_name', 'county_name']].copy()

# Organizations whose county_name is assigned by hand.
county_overrides = {
    'STATE TOTAL': 'STATE TOTAL',
    'CHARTER SCHOOL INSTITUTE': 'DENVER',
    'MOUNTAIN BOCES': 'CHAFFEE',
    'CENTENNIAL BOCES': 'WELD',
    'SAN JUAN BOCES': 'LA PLATA',
    'EXPEDITIONARY BOCES': 'DENVER',
}
for organization, assigned_county in county_overrides.items():
    county.loc[county['organization_name'] == organization, 'county_name'] = assigned_county

county

Unnamed: 0,organization_name,county_name
0,STATE TOTAL,STATE TOTAL
1,MAPLETON 1,ADAMS
2,ADAMS 12 FIVE STAR SCHOOLS,ADAMS
3,ADAMS COUNTY 14,ADAMS
4,BRIGHTON 27J,ADAMS
...,...,...
180,MOUNTAIN BOCES,CHAFFEE
181,CENTENNIAL BOCES,WELD
182,SAN JUAN BOCES,LA PLATA
183,EXPEDITIONARY BOCES,DENVER


### Merge the county column into each dataset
1. Merge the county column into the dataset, matching on district.
2. Rename the county and district columns, and move county to the front.
3. Drop unneeded columns.
4. Drop duplicate rows (for some unknown reason the 'STATE TOTAL' row is duplicated 2-4 times in some of the datasets).
5. Repeat for the other two datasets.

In [3]:
# Student mobility: attach the county to every district, tidy, and export.
# `county` is the left side of the left merge, so its rows are all kept.
df = (
    county
    .merge(cr.df('dist_student_mobility'), how='left')
    .rename(columns={'county_name': 'county', 'organization_name': 'school_dist'})
    # Put county first, then the district, then every remaining column in order.
    .pipe(lambda frame: frame[
        ['county', 'school_dist']
        + [col for col in frame.columns if col not in ('county', 'school_dist')]
    ])
    .drop(columns=['category', 'school_year', 'org_code'])
    # Collapse repeated rows (e.g. a repeated 'STATE TOTAL' row).
    .drop_duplicates()
)
df.to_csv('output/dist_mobility_rate.csv')
df

Unnamed: 0,county,school_dist,total_pupil_count_all_students,total_stable_pupil_count_all_students,total_stability_rate_all_students,total_mobile_student_count_all_students,total_student_mobility_rate_all_students,total_instances_of_mobility_all_students,total_mobility_incidence_rate_all_students,students_with_disabilities_pupil_count,...,homeless_student_mobility_rate,homeless_instances_of_mobility,homeless_mobility_incidence_rate,gifted_talented_pupil_count,gifted_talented_stable_student_count,gifted_talented_stability_rate,gifted_talented_mobile_student_count,gifted_talented_student_mobility_rate,gifted_talented_instances_of_mobility,gifted_talented_mobility_incidence_rate
0,STATE TOTAL,STATE TOTAL,939283,705064,75.1,231706,24.7,253577,27,84121,...,45.3,11558,54.2,73344,66620,90.8,6641,9.1,7366,10
1,ADAMS,MAPLETON 1,9037,5077,56.2,3919,43.4,4133,45.7,735,...,32.7,79,36.9,250,205,82,44,17.6,47,18.8
2,ADAMS,ADAMS 12 FIVE STAR SCHOOLS,49889,34283,68.7,15424,30.9,16854,33.8,4339,...,57.2,481,68.2,3590,3225,89.8,361,10.1,404,11.3
3,ADAMS,ADAMS COUNTY 14,8265,5510,66.7,3038,36.8,3397,41.1,876,...,49.7,529,59.7,377,317,84.1,75,19.9,89,23.6
4,ADAMS,BRIGHTON 27J,17152,13109,76.4,3982,23.2,4294,25,1405,...,67.9,287,74.9,703,630,89.6,70,10,76,10.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,DENVER,CHARTER SCHOOL INSTITUTE,12020,5323,44.3,6660,55.4,6800,56.6,708,...,72.3,69,73.4,343,172,50.1,169,49.3,174,50.7
180,CHAFFEE,MOUNTAIN BOCES,184,76,41.3,113,61.4,116,63,36,...,57.1,21,60,1,1,100,0,0,1,100
181,WELD,CENTENNIAL BOCES,146,0,0,146,100,146,100,2,...,0,0,0,0,0,0,0,0,0,0
182,LA PLATA,SAN JUAN BOCES,84,0,0,84,100,84,100,5,...,0,0,0,3,0,0,3,100,3,100


In [4]:
# Graduation rates: attach the county to every district, tidy, and export.
# The dataset's own county_name column is removed before the merge, so the only
# county_name in the result is the one supplied by the `county` lookup.
df = (
    county
    .merge(cr.df('dist_grad_rate').drop(columns='county_name'), how='left')
    .rename(columns={'county_name': 'county', 'organization_name': 'school_dist'})
    # Put county first, then the district, then every remaining column in order.
    .pipe(lambda frame: frame[
        ['county', 'school_dist']
        + [col for col in frame.columns if col not in ('county', 'school_dist')]
    ])
    .drop(columns=['organization_code'])
    # Collapse repeated rows (e.g. a repeated 'STATE TOTAL' row).
    .drop_duplicates()
)
df.to_csv('output/dist_grad_rate.csv')
df

Unnamed: 0,county,school_dist,students_with_disabilities_final_grad_base,students_with_disabilities_graduates_total,students_with_disabilities_graduation_rate,students_with_disabilities_completers_total,students_with_disabilities_completion_rate,limited_english_proficient_final_grad_base,limited_english_proficient_graduates_total,limited_english_proficient_graduation_rate,...,homeless_final_grad_base,homeless_graduates_total,homeless_graduation_rate,homeless_completers_total,homeless_completion_rate,gifted_talented_final_grad_base,gifted_talented_graduates_total,gifted_talented_graduation_rate,gifted_talented_completers_total,gifted_talented_completion_rate
0,STATE TOTAL,STATE TOTAL,5775,3099,53.7,3222,55.8,6171,3289,53.3,...,2394,1175,49.1,1262,52.7,6604,6048,91.6,6156,93.2
2,ADAMS,MAPLETON 1,49,18,36.7,19,38.8,219,73,33.3,...,41,12,29.3,16,39.0,44,27,61.4,27,61.4
3,ADAMS,ADAMS 12 FIVE STAR SCHOOLS,250,118,47.2,127,50.8,379,257,67.8,...,106,62,58.5,65,61.3,227,201,88.5,208,91.6
4,ADAMS,ADAMS COUNTY 14,59,32,54.2,32,54.2,170,86,50.6,...,99,52,52.5,57,57.6,30,27,90.0,27,90.0
5,ADAMS,BRIGHTON 27J,66,33,50.0,35,53.0,110,63,57.3,...,41,21,51.2,21,51.2,63,51,81.0,52,82.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,DENVER,CHARTER SCHOOL INSTITUTE,111,16,14.4,17,15.3,141,31,22.0,...,121,11,9.1,11,9.1,47,23,48.9,27,57.4
181,CHAFFEE,MOUNTAIN BOCES,19,14,73.7,16,84.2,17,10,58.8,...,14,9,64.3,10,71.4,2,1,50.0,1,50.0
182,WELD,CENTENNIAL BOCES,6,1,16.7,1,16.7,4,1,25.0,...,1,0,0.0,0,0.0,0,0,0.0,0,0.0
183,LA PLATA,SAN JUAN BOCES,1,0,0.0,0,0.0,1,1,100.0,...,0,0,0.0,0,0.0,1,0,0.0,1,100.0


In [5]:
# Mobility by demographics: attach the county to every district, tidy, and export.
# `county` is the left side of the left merge, so its rows are all kept.
df = (
    county
    .merge(cr.df('dist_mobility_demographics'), how='left')
    .rename(columns={'county_name': 'county', 'organization_name': 'school_dist'})
    # Put county first, then the district, then every remaining column in order.
    .pipe(lambda frame: frame[
        ['county', 'school_dist']
        + [col for col in frame.columns if col not in ('county', 'school_dist')]
    ])
    .drop(columns=['category', 'school_year', 'org_code'])
    # Collapse repeated rows (e.g. a repeated 'STATE TOTAL' row).
    .drop_duplicates()
)
df.to_csv('output/dist_mobility_rate_demographics.csv')
df

Unnamed: 0,county,school_dist,total_pupil_count,total_stable_student_count,total_stability_rate,total_mobile_student_count,total_student_mobility_rate,total_instances_of_mobility,total_mobility_incidence_rate,total_female_pupil_count,...,total_native_hawaiian_or_other_pacific_islander_student_mobility_rate,total_native_hawaiian_or_other_pacific_islander_instances_of_mobility,total_native_hawaiian_or_other_pacific_islander_mobility_incidence_rate,total_two_or_more_races_pupil_count,total_two_or_more_races_stable_student_count,total_two_or_more_races_stability_rate,total_two_or_more_races_mobile_student_count,total_two_or_more_races_student_mobility_rate,total_two_or_more_races_instances_of_mobility,total_two_or_more_races_mobility_incidence_rate
0,STATE TOTAL,STATE TOTAL,939283,705064,75.1,231706,24.7,253577,27,458512,...,34.8,840,38,29329,21501,73.3,7718,26.3,8433,28.8
2,ADAMS,MAPLETON 1,9037,5077,56.2,3919,43.4,4133,45.7,4450,...,70.8,17,70.8,219,129,58.9,90,41.1,91,41.6
3,ADAMS,ADAMS 12 FIVE STAR SCHOOLS,49889,34283,68.7,15424,30.9,16854,33.8,24340,...,45.3,42,48.8,662,455,68.7,203,30.7,222,33.5
4,ADAMS,ADAMS COUNTY 14,8265,5510,66.7,3038,36.8,3397,41.1,3966,...,0,0,0,55,28,50.9,26,47.3,28,50.9
5,ADAMS,BRIGHTON 27J,17152,13109,76.4,3982,23.2,4294,25,8452,...,30.6,12,33.3,423,304,71.9,117,27.7,127,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,DENVER,CHARTER SCHOOL INSTITUTE,12020,5323,44.3,6660,55.4,6800,56.6,6311,...,51.9,15,55.6,187,92,49.2,94,50.3,96,51.3
181,CHAFFEE,MOUNTAIN BOCES,184,76,41.3,113,61.4,116,63,103,...,0,0,0,0,0,0,0,0,0,0
182,WELD,CENTENNIAL BOCES,146,0,0,146,100,146,100,64,...,0,0,0,0,0,0,0,0,0,0
183,LA PLATA,SAN JUAN BOCES,84,0,0,84,100,84,100,36,...,0,0,0,3,0,0,3,100,3,100
