### Data Cleaning Exercise
Quick practice exercise demonstrating my ability to clean data in Python, using sample data from the Boston Public School system. 

In [1]:
# import relevant Python packages
import pandas as pd
import numpy as np
import openpyxl

In [2]:
# load the two worksheets used in this exercise from the workbook;
# sheet_name takes the zero-based sheet position (3 -> schedule, 4 -> roster)
sched, roster = (
    pd.read_excel('data/exercise.xlsx', sheet_name=position)
    for position in (3, 4)
)

In [21]:
# preview the schedule sheet (one row per employee/class assignment, with a
# partly blank Role column)
sched

Unnamed: 0,School,Employee ID,First Name,Last Name,Class,Role
0,Community Academy of Science and Health,970755,Steve,Sotelo,161-001,
1,Community Academy of Science and Health,834907,Maria,Allen,161-001,Primary
2,Another Course to College,779664,Antonio,Gibson,151-TLC,
3,Charlestown High School,931139,Raymond,Bierman,151-S1-SN-E,
4,Charlestown High School,760190,Michael,Castro,151-S1-SN-E,Primary
...,...,...,...,...,...,...
11974,Boston Arts Academy,809648,Carol,Abel,748V_2-301,
11975,Boston Arts Academy,323700,Marcia,Elliott,748V_2-101,
11976,Boston Arts Academy,333293,Morgan,Beauchamp,748V_1-301,Co-Teacher
11977,Boston Arts Academy,323700,Marcia,Elliott,748V_1-301,Primary


In [22]:
# preview the roster sheet; the frame is shown bare (not wrapped in a list) so
# Jupyter renders it as a table instead of the plain-text repr of a list
roster

Unnamed: 0,ID,Empl Record,First Name,Last Name,Ethnicity,Sex,Job Code,Job Title
0,789946,1,Ryan,Smith,White,M,S30243,Prin Clerk/School Sec 21
1,367486,1,Rod,Trevino,Black,M,S40020,Part-Time Custodian
2,431970,0,Alfredo,Hughes,White,M,S20105,Paraprofessional
3,350768,600,Carmelina,Lamb,Black,F,S20212,Coach (NonTPP)
4,254104,0,Kevin,Davis,Black,M,S30142,Sen Clerk School Sec 766-15B
...,...,...,...,...,...,...,...,...
14662,286908,0,Karen,Taylor,White,F,S85001,Substitute Teacher
14663,279743,0,Susan,Walters,Black,F,S85030,Per Diem Substitute Teacher
14664,399596,0,Penny,Russell,Decline,F,S85030,Per Diem Substitute Teacher
14665,386164,0,Jerry,Friend,Black,M,S85030,Per Diem Substitute Teacher


In [23]:
# join job title column to names in schedule sheet to enable conditional categorization
# identical (ID, Job Title) pairs in the roster add no information but would
# multiply the matching schedule rows in a left join, so collapse them first
# NOTE(review): an ID that carries several distinct job titles still produces
# one schedule row per title -- confirm which roster record should win
job_titles = roster[['ID', 'Job Title']].drop_duplicates()
full_sched = sched.merge(job_titles, how='left', left_on='Employee ID', right_on='ID')

# count how many times each class name shows up in the full_sched sheet
unique_classes, count = np.unique(full_sched['Class'], return_counts=True)


KeyError: ('ID', 'Job Title')

In [18]:
# summary statistics for the numeric columns of the merged frame; the count
# row doubles as a check on how many rows the merge produced
full_sched.describe()

Unnamed: 0,Employee ID,ID
count,13270.0,13270.0
mean,502790.403316,502790.403316
std,280151.857824,280151.857824
min,13920.0,13920.0
25%,250953.0,250953.0
50%,503717.0,503717.0
75%,740199.0,740199.0
max,999970.0,999970.0


In [4]:
# overwrite the Role column to match organizational logic: 
## if staff member is not a Teacher, their role is Other Support Staff; 
## if staff member has another Teacher assigned to their class, their role is Co-Teacher; 
## if staff member is the only Teacher assigned to their class, their role is Primary

# rows whose job title is exactly 'Teacher' (a missing title counts as non-Teacher)
is_teacher = full_sched['Job Title'].eq('Teacher')

# number of distinct Teachers assigned to each class; only Teacher rows are
# counted, and by distinct Employee ID, so support staff or repeated rows in a
# class cannot turn its sole Teacher into a Co-Teacher
teachers_per_class = full_sched.loc[is_teacher].groupby('Class')['Employee ID'].nunique()
is_only_teacher = full_sched['Class'].map(teachers_per_class).eq(1)

full_sched['Role'] = np.where(~is_teacher, 'Other Support Staff',
                              np.where(is_only_teacher, 'Primary', 'Co-Teacher'))


In [8]:
# create, and write to, a new Excel file for the cleaned data
# openpyxl.Workbook() does not take a file path (its first argument is the
# write_only flag), so pandas creates the openpyxl workbook here and
# cleaned_book / cleaned_sheet are bound to the workbook and sheet that are
# actually written to disk
# NOTE(review): [::-2] keeps every second row, walking backwards from the last
# row -- confirm that leaving out the other rows is intended
with pd.ExcelWriter('data/cleaned_exercise.xlsx', engine='openpyxl') as writer:
    full_sched[::-2].to_excel(writer)
    cleaned_book = writer.book
    cleaned_sheet = cleaned_book.active

In [None]:
# show the same slice that is written to the Excel file: every second row of
# full_sched, starting from the last row and moving backwards
full_sched[::-2]