In [None]:
import pandas as pd
import numpy as np
# Notebook display settings: show up to 1000 rows and render floats with two decimals.
pd.options.display.max_rows = 1000
pd.options.display.float_format = lambda value: '%.2f' % value

In [None]:
#purpose: create our dataframe

#default column headers
col_headers = ['Name', 'Department', 'Title', 'Regular', 'Retro', 'Other', 'Overtime', 'Injured', 'Detail', 'Quinn', 'Total', 'Postal']

#in the 2013 and 2014 datasets, title and department columns are in the wrong order,
#so those two files are read with the 'Department' and 'Title' header names swapped
swapped_headers = list(col_headers)
swapped_headers[1], swapped_headers[2] = swapped_headers[2], swapped_headers[1]
swapped_years = (2013, 2014)

#there are 9 separate CSV files, representing employee earnings reports for 9 years
yearly_frames = []
for year in range(2011, 2020):

    #pick the header order that matches the layout of this year's file
    names = swapped_headers if year in swapped_years else col_headers

    #read in the CSV for the given year, set it to variable next_df
    next_df = pd.read_csv('./data/salaries_' + str(year) + '.csv', skiprows=1, names=names, encoding='latin1')

    #add the column 'Year' to next_df and set to the given year
    next_df['Year'] = year

    yearly_frames.append(next_df)

#concatenate all years in one call (columns are aligned by name, so the swapped files line up);
#growing a frame with pd.concat inside the loop re-copies every earlier row on each iteration
#note: the per-file row labels are kept, so index values repeat across years
earnings_df = pd.concat(yearly_frames, sort=False)

In [None]:
# Eyeball a random sample of the text columns right after loading.
earnings_df.loc[:, ['Department', 'Name', 'Title', 'Postal']].sample(n=500)

In [None]:
#purpose: initial cleaning for our dataframe

#targeted (hard-coded) cleaning for specific rows: drop rows whose Department is the header text
#.copy() gives an independent frame so the column assignments below do not act on a slice
earnings_df = earnings_df.loc[earnings_df['Department']!='DEPARTMENT_NAME'].copy()

#dimensions are qualitative columns, facts are quantitative columns
facts = ['Regular', 'Retro', 'Other', 'Overtime', 'Injured', 'Detail', 'Quinn', 'Total']

#cast year to type 'int'
earnings_df['Year'] = earnings_df['Year'].astype(int)

#clean the facts columns and convert from type 'object' to 'float'
fact_text = earnings_df[facts].astype(str).apply(lambda col: col.str.strip())

#rules are applied in order and every replacement is a STRING:
#  1. placeholders for "no amount" ('-', 'None', 'nan') become '0'
#  2. thousands separators, dollar signs, spaces and closing parens are removed
#  3. a leading '(' (accounting notation for a negative amount) becomes '-'
#a non-string replacement value (e.g. the integer 0) would overwrite the whole matching cell,
#which turned every parenthesised negative amount into 0
fact_text = fact_text.replace({r'^-$|^None$|^nan$': '0', r'[,$ )]': '', r'^\(': '-'}, regex=True)
earnings_df[facts] = fact_text.astype(float)

In [None]:
#cast each dimension (qualitative) column from type 'object' to 'string'
#note: no whitespace is removed here, and missing values become the text 'nan'
dimensions = ['Name', 'Department', 'Title', 'Postal']
for dim in dimensions:
    earnings_df[dim] = earnings_df[dim].astype(str)

In [None]:
#purpose: clean Postal column

#strip surrounding whitespace first so the length check below only sees the code itself
postal = earnings_df['Postal'].str.strip()

#remove delivery route number from any codes that have it (number after hyphen)
#this runs BEFORE padding so that a code such as '2124-1234' is padded too
postal = postal.str.split('-', n=1).str[0]

#add a 0 to the front of any code with exactly 4 characters
postal = postal.str.replace(r'^(.{4})$', r'0\1', regex=True)

earnings_df['Postal'] = postal

#any postal codes containing a letter anywhere (not only in first position) will be set to UNKNOWN;
#this also covers the text 'nan' left behind by missing values
earnings_df.loc[earnings_df['Postal'].str.contains('[A-Za-z]', na=False), 'Postal'] = 'UNKNOWN'

#hard cleaning for specific rows: regex matched at the start of Name -> corrected postal code
postal_fixes = {
    'Ostiguy,David M': '02327',
    'Karales,George Alfred': '02170',
    'Smith,Kenneth J': '02124',
    'Thomas,Sarita J': '02125',
    'Morris,Judith A.': '02170',
    'Mendez,Jose R': '02135',
    'Morrison,June': '02481',
}
for name_pattern, fixed_postal in postal_fixes.items():
    earnings_df.loc[earnings_df['Name'].str.match(name_pattern), 'Postal'] = fixed_postal

In [None]:
#purpose: clean Name column - drop periods, then convert to all caps
earnings_df['Name'] = earnings_df['Name'].str.replace(r'\.', '', regex=True).str.upper()

In [None]:
#purpose: preliminary cleaning for Title column - create uniform spacing, convert to all caps, and remove unwanted characters

#rules are applied in order: drop periods, split camelCase words, pad parens, drop backslashes,
#spell out '&', turn commas into ' /', drop '#'
title_rules = {r'\.':'', r'(?<=[a-z])([A-Z])':r' \1', r'\(':' (', r'\)':') ', r'\\':'', '&':' AND ', ',':' /', r'\#':''}
earnings_df['Title'] = earnings_df['Title'].replace(title_rules, regex=True).str.upper()

#the padding above can leave runs of spaces; collapse them so multi-word patterns
#(e.g. 'DATA PROC') can match later, mirroring what is done for the Department column
earnings_df['Title'] = earnings_df['Title'].replace(r'\s+', ' ', regex=True)

#remove leading/trailing whitespace from every dimension column
#(column-wise .str.strip() replaces the deprecated DataFrame.applymap)
earnings_df[dimensions] = earnings_df[dimensions].apply(lambda col: col.str.strip())

In [None]:
# Random sample of the text columns after the preliminary Name/Title cleaning.
earnings_df.loc[:, ['Department', 'Name', 'Title', 'Postal']].sample(n=500)

In [None]:
#purpose: replace abbreviations and spelling errors with proper words in title column
#keys are regex fragments, values are the replacement text; rules are applied in the order listed,
#so later rules see the output of earlier ones (e.g. 'EQUIP' before 'EQUIPMENT OPER')
abbrevs = {
    r'\)\/\(':'/',  #plain '/': a replacement of '\/' would insert a literal backslash into the title
    'ADMIN':'ADMINISTRATIVE',
    'OFFC':'OFFICER',
    'OFFCR':'OFFICER',
    'SEN':'SENIOR',
    'DET':'DETECTIVE',
    'SUPV':'SUPERVISOR',
    'SPV':'SUPERVISOR',
    'EXEC':'EXECUTIVE',
    'ANL':'ANALYST',
    'ANAL':'ANALYST',
    'TECH':'TECHNICIAN',
    'EQUIP':'EQUIPMENT',
    'EQUIPMENT OPER':'EQUIPMENT OPERATOR',
    'PROJ':'PROJECT',
    'SP':'SPECIAL',
    'STFF':'STAFF',
    'ACAD':'ACADEMY',
    'INSTR':'INSTRUCTOR',
    'ASST':'ASSISTANT',
    'ASSIST':'ASSISTANT',
    'DEP':'DEPUTY',
    'SUPN':'SUPERINTENDENT',
    'SYS':'SYSTEMS',
    'COOR':'COORDINATOR',
    'COORD':'COORDINATOR',
    'SEC':'SECRETARY',
    'LIEUT':'LIEUTENANT',
    'LT':'LIEUTENANT',
    'MAINT':'MAINTENANCE',
    'DIR':'DIRECTOR',
    'MGMT':'MANAGEMENT',
    'MGR':'MANAGER',
    'MNGR':'MANAGER',
    'MANGR':'MANAGER',
    'MED':'MEDICAL',
    'OPER':'OPERATIONS',
    'DATA PROC':'DATA PROCESSING',
    'CORP COUNSEL':'CORPORATION COUNSEL',
    'ASSOC':'ASSOCIATE',
    'COMM SERV':'COMMUNITY SERVICE',
    'COMM':'COMMUNICATIONS',
    'COMMUNIC':'COMMUNICATIONS',
    'COMMUN':'COMMUNICATIONS',
    'BLDG':'BUILDING',
    'SERV':'SERVICE',
    'REG VOTERS':'REGISTRAR OF VOTERS',
    'SVC':'SERVICE',
    'SRV':'SERVICE',
    'PRIN':'PRINCIPAL',
    'PARA':'PARAPROFESSIONAL',
    'DIST':'DISTRICT',
    'FF':'FIRE FIGHTER',
    'INSTRUC':'INSTRUCTOR',
    'SR':'SENIOR',
    'JR':'JUNIOR',
    'MECH':'MECHANIC',
    'MECHA':'MECHANIC',
    'GEN':'GENERAL',
    'ADMN':'ADMINISTRATIVE',
    'ADM':'ADMINISTRATIVE',
    'ENG':'ENGINEER',
    'STRUCT':'STRUCTURAL',
    'FRPRS':'FOREPERSON',
    'FRPR':'FOREPERSON',
    'FOREPRS':'FOREPERSON',
    'CONST':'CONSTRUCTION',
    'LBR':'LABORER',
    'RPR':'REPAIR',
    'REP':'REPAIR',
    'SPEC':'SPECIAL',
    'INCT COMND SPECIAL':'INCIDENT COMMAND SPECIALIST',
    'MAS OF F BOAT':'MASTER OF FIRE BOAT',
    'RPPRS':'REPAIRPERSON',
    'REPPRS':'REPAIRPERSON',
    'REPRPRS':'REPAIRPERSON',
    'REPAIRPR':'REPAIRPERSON',
    'REPAIRPRS':'REPAIRPERSON',
    'RPPR':'REPAIR PERSON',
    'WKG':'WORKING',
    'PW':'PUBLIC WORKS',
    'P W':'PUBLIC WORKS',
    'HVY':'HEAVY',
    'MTR':'MOTOR',
    'INSP':'INSPECTOR',
    'INSPEC':'INSPECTOR',
    'TRA':'TRAFFIC',
    'OPR':'OPERATIONS',
    'MEO':'MOTOR EQUIPMENT OPERATOR',
    'CFM':'(CFM)',
    'ELEC EQUIPMENT':'ELECTRIC EQUIPMENT',
    'EQUI':'EQUIPMENT',
    'COLL TRS':'COLLECTOR TREASURER',
    'ACNTNG':'ACCOUNTING',
    'CRFTSPRS':'CRAFTSPERSON',
    'COMMSS':'COMMISSIONER',
    'COUNSLR':'COUNSELOR',
    'MEMBER BD OF ELECTION':'MEMBER OF BOARD OF ELECTIONS',
    'LIB':'LIBRARIAN',
    'LIBR':'LIBRARIAN',
    'LIBRARIN':'LIBRARIAN',
    'SVCS':'SERVICES',
    'CAMP JO':'(CAMP JOY)',
    'CAM JO':'(CAMP JOY)',
    'SER':'SERVICES',
    'PROT':'PROTECTIVE',
    'REL':'RELATIONS',
    'SUPVISING':'SUPERVISING',
    'PROP':'PROPERTY',
    'DISP':'DISPATCHER',
    'CHF':'CHIEF',
    'PMDGRAFF REMOVAL':'(GRAFFITI REMOVAL)',
    'PAINT':'PAINTER',
    'ENFORCE':'ENFORCEMENT',
    'DEVELOP':'DEVELOPMENT',
    'PROG':'PROGRAM',
    'PWD':' (PWD)',
    'SWIM':'SWIMMING',
    'REGNL':'REGIONAL',
    'ACCTNG':'ACCOUNTING',
    'ACCT':'ACCOUNTING',
    'ENGR':'ENGINEER',
    'EQU':'EQUIPMENT',
    'EQ':'EQUIPMENT',
    'ANIM CNTL':'ANIMAL CONTROL',
    'OFCR':'OFFICER',
    'CLRK':'CLERK',
    'PARKS AND REC':'PARKS AND RECREATION',
    'P AND R':'PARKS AND RECREATION',
    r'\(PARK\)':'(PARKS AND RECREATION)',
    'SPC':'SPECIAL',
    'HDQ':'HEADQUARTER',
    'DISPCH':'DISPATCHER',
    'SUB':'SUBSTITUTE',
    'CUST':'CUSTODIAN',
    'NEIGH':'NEIGHBORHOOD',
    'HE':'(HE)',
    'FGR PRT EV':'FINGERPRINT EVIDENCE',
    'CH':'CHIEF',
    'OP':'OPERATOR',
    'IBPDFLEET':'I (BPD FLEET',
    'IIBPDFLEET':'II (BPD FLEET',
    'EVIDENC':'EVIDENCE',
    'TECHNCN':'TECHNICIAN',
    'TCH':'TECHNICIAN',
    'DP SYSTEMS':'DATA PROCESSING SYSTEMS',
    'FIRE PREV':'FIRE PREVENTION',
    'CLASSIFICATN':'CLASSIFICATION',
    'SPECIAL$':'SPECIALIST',
}

abbrevs_regex = {}

#to prevent words within words from accidentally being changed, lets ensure that values from above dictionary must be
#between certain characters (start/end of string, whitespace, parens, hyphen, slash) to change
#note: the delimiter on each side is part of the match, so it is replaced by the space padding added here
for key in abbrevs:
    abbrevs[key] = ' '+abbrevs[key]+' '
    new_key = r'(^|[\s\(\)\-\/])'+key+r'([\s\(\)\-\/]|$)'
    abbrevs_regex[new_key] = abbrevs[key]

earnings_df['Title'] = earnings_df['Title'].replace(abbrevs_regex, regex=True).str.strip()

In [None]:
# Random sample of the text columns after the title abbreviations were expanded.
earnings_df.loc[:, ['Department', 'Name', 'Title', 'Postal']].sample(n=500)

In [None]:
#purpose: final cleaning for title column - eliminate excess parens

#split title column on first left paren
#reindex guarantees that both halves exist as columns even when no title contains a paren,
#and the object cast keeps the .str accessor usable on an all-missing second half
new_title = earnings_df['Title'].str.split("(", n=1, expand=True).reindex(columns=[0, 1]).astype(object)

#strip whitespace and all parens
main_part = new_title[0].str.strip()
paren_part = new_title[1].str.replace(r'[()]', '', regex=True).str.strip()

#wrap the second half in a single pair of parens; titles without a second half get no suffix
#(missing values are handled explicitly instead of relying on them turning into the text 'None')
suffix = (' (' + paren_part + ')').fillna('')

#concatenate both "halves"
earnings_df['Title'] = (main_part + suffix).str.strip()

In [None]:
#helper for eyeballing a column: its distinct values in sorted order
def show_unique(column):
    """Return the unique values of ``earnings_df[column]`` as a sorted array."""
    return np.sort(earnings_df[column].unique())

# Department values before any department cleaning.
show_unique(column='Department')

In [None]:
#purpose: preliminary cleaning for Department column - create uniform spacing, convert to all caps, and remove unwanted characters

#applied in order: drop periods, split camelCase words, spell out '/' and '&' as ' AND ',
#drop backslashes, turn commas into spaces, drop '#'
dept_rules = {r'\.':'', r'(?<=[a-z])([A-Z])':r' \1', r'\/':' AND ', r'\\':'', '&':' AND ', r'\,':' ', r'\#':''}
dept_upper = earnings_df['Department'].replace(regex=dept_rules).str.upper()

#collapse runs of whitespace into one space and trim both ends
earnings_df['Department'] = dept_upper.str.replace(r'\s+', ' ', regex=True).str.strip()

In [None]:
#purpose: most depts are BPS related, lets move them to new 'Program' column and set dept to 'Boston Public Schools'

#regex alternation of every department-name fragment treated as BPS related
BPS_str = '^BPS|ELEMENTARY|ACADEMY|K-8|MIDDLE|HIGH|SCHOOL|ACAD$|PILOT| EEC$| ELC$| EES$|9-12|ACHIEVEMENT GAP|STUDENT|SUPERINTENDENT|CHIEF ACADEMIC OFFICER|ENGLISH LANGUAGE LEARN|ACCOUNTABILITY|^ADVANCEMENT|ENROLLMENT|^EARLY LEARNING|^HPEC|^INFO AND INSTR|TEACHING|CAREER AND TECHNICAL ED|CHIEF OF STAFF|CHIEF FINANCIAL OFFICER|CHIEF OPERATING OFFICER|^COMMUNICATIONS$|FOOD AND NUTRITION SVC|INNOVATION DEPARTMENT|INSTITUTIONAL ADVANCEMT|LEGAL ADVISOR|PROFESSIONAL DEVELOPMNT|RESEARCH ASSESS AND EVAL|STRATEGY DEPARTMENT'

#rows whose department matches any of the fragments above
is_bps = earnings_df['Department'].str.contains(BPS_str, regex=True)

#keep the original department text as the program, then relabel the department
earnings_df.loc[is_bps, 'Program'] = earnings_df.loc[is_bps, 'Department']
earnings_df.loc[is_bps, 'Department'] = 'BOSTON PUBLIC SCHOOLS'

In [None]:
#rows that were not moved to a program are NaN in the new column; use an empty string for them instead
earnings_df['Program'] = earnings_df['Program'].fillna('')

In [None]:
# Department values after the BPS rows were relabelled.
show_unique(column='Department')

In [None]:
#purpose: the operating budget report in the Boston Open Data Portal has much better data on the organization of Boston's govt
#lets get all the cabinets, departments, and programs from the operating budget

budget_df = pd.read_csv('./data/operating_budget.csv', encoding='latin1')

#one row per distinct (Cabinet, Dept, Program) combination, all caps
budget_df = budget_df.drop_duplicates(['Cabinet','Dept','Program'])[['Cabinet','Dept','Program']].apply(lambda x: x.str.upper())

#periods and commas become spaces, '&' is spelled out, whitespace is collapsed and trimmed
budget_df = budget_df.replace({r'\.':' ', r'\&':' AND ', r'\,':' ', r'\s+':' '}, regex=True).apply(lambda x: x.str.strip())

#some minor corrections to the data
#whole-cell renames (exact match); the result is assigned back, otherwise replace() has no lasting effect
budget_df = budget_df.replace({'LIBRARY DEPARTMENT':'BOSTON PUBLIC LIBRARY',
                               'LICENSING_BOARD':'LICENSING BOARD',
                               'BOSTON VETS':"VETERANS' SERVICES"})

#' W/' occurs inside longer names, so it needs a regex (substring) replacement;
#whitespace is normalised again because ' WITH ' can introduce a double space
budget_df = budget_df.replace({' W/':' WITH ', r'\s+':' '}, regex=True).apply(lambda x: x.str.strip())

budget_df.head()

In [None]:
#purpose: pivot table of employee counts per year, grouped by an arbitrary column
def count_by_group(column, dept=r'(.*?)'):
    """Count earnings rows per ``column`` value and year.

    column: name of the earnings_df column to group on (used as the row label of the result).
    dept:   optional regex, matched at the start of each row label; matches everything by default.

    Returns a DataFrame with one row per group (plus the 'All' margin row when it passes the
    ``dept`` filter), the group label in the first column and one column per year.
    """
    #number of non-missing 'Total' entries for every (group, year) pair
    per_year = (
        earnings_df
        .groupby([column, 'Year'])['Total']
        .count()
        .reset_index(name="count")
    )

    #years become columns; margins=True appends an 'All' row and an 'All' column
    table = pd.pivot_table(per_year, values='count', index=column, columns='Year', aggfunc='sum', fill_value=0, margins=True)
    table = table.reset_index()

    #rows whose label matches the requested pattern
    wanted_rows = table[column].str.match(dept)

    #the last column is the per-row sum, which doesn't make sense for time series data
    without_row_sums = table.iloc[:, :-1]
    return without_row_sums.loc[wanted_rows]

In [None]:
#purpose: replace incorrect department names with correct ones when applicable
#every key is anchored with ^...$, so only a department whose whole name matches is renamed
#comments marked * record the older name and the years the inline note associates with it

dept_abbrevs = dict([
    (r'^OFFICE OF FINANCE AND BUDGET$', 'ADMINISTRATION AND FINANCE'),  #*OFFICE OF FINANCE AND BUDGET: 2014 - 2015
    (r'^OFFICE OF ADMIN AND FINANCE$', 'ADMINISTRATION AND FINANCE'),
    (r'^DND NEIGHBORHOOD DEVELOPMENT$', 'NEIGHBORHOOD DEVELOPMENT'),
    (r'^ARTS AND CULTURAL DEVELOPMENT$', 'OFFICE OF ARTS AND CULTURE'),
    (r'^ASD HUMAN RESOURCES$', 'HUMAN RESOURCES'),
    (r'^ASD INTERGVERNMTL RELATIONS$', 'INTERGOVERNMENTAL RELATIONS'),
    (r'^ASD OFFICE OF BUDGET MANGMNT$', 'BUDGET MANAGEMENT'),
    (r'^ASD OFFICE OF LABOR RELATION$', 'OFFICE OF LABOR RELATIONS'),
    (r'^ASD PURCHASING DIVISION$', 'PROCUREMENT'),  #*AKA PURCHASING
    (r'^BOSTON CITY COUNCIL$', 'CITY COUNCIL'),
    (r'^BOSTON CNTR \- YOUTH AND FAMILIES$', 'BOSTON CENTER FOR YOUTH AND FAMILIES'),
    (r'^BOSTON CNTR\-YOUTH AND FAMILIES$', 'BOSTON CENTER FOR YOUTH AND FAMILIES'),
    (r'^BOSTON FIRE DEPARTMENT$', 'FIRE DEPARTMENT'),
    (r'^BOSTON POLICE DEPARTMENT$', 'POLICE DEPARTMENT'),
    (r'^BOSTON RETIREMENT SYSTEM$', 'RETIREMENT DEPARTMENT'),
    (r'^STATE BOSTON RETIREMENT SYST$', 'RETIREMENT DEPARTMENT'),
    (r'^COMM FOR PERSONS WITH DISABIL$', 'COMMISSION FOR PERSONS WITH DISABILITIES'),
    (r'^DEPT OF VOTER MOBILIZATION$', 'ELECTION DEPARTMENT'),  #*DEPARTMENT OF VOTER MOBILIZATION: <2013
    (r'^DPT OF INNOVATION AND TECHNOLOGY$', 'DEPARTMENT OF INNOVATION AND TECHNOLOGY'),
    (r'^ELDERLY COMMISSION$', 'AGE STRONG'),  #*ELDERLY COMMISSION <2018
    (r'^ELECTION DIVISION$', 'ELECTION DEPARTMENT'),
    (r'^IMMIGRANT ADVANCEMENT$', 'OFFICE FOR IMMIGRANT ADVANCEMENT'),
    (r'^INSPECTIONAL SERVICES DEPT$', 'INSPECTIONAL SERVICES DEPARTMENT'),
    (r'^OFC CHF PUBLIC WORKS TRANSPORT$', 'OFFICE OF STREETS'),  #*OFFICE OF CHIEF OF PUBLIC WORKS AND TRANSPORT <2013
    (r'^OFC OF STRTS TRNSP AND SANI$', 'OFFICE OF STREETS'),  #*OFFICE OF STREETS, TRANSPORTATION, AND SANITATION 2014 - 2015
    (r'^OFFICE OF CIVIL RIGHTS$', 'FAIR HOUSING AND EQUITY'),  #*OFFICE OF CIVIL RIGHTS <2013
    (r'^OFFICE OF NEW BOSTONIANS$', 'OFFICE FOR IMMIGRANT ADVANCEMENT'),  #*OFFICE OF NEW BOSTONIANS <2017
    (r'^PARKS DEPARTMENT$', 'PARKS AND RECREATION DEPARTMENT'),
    (r'^PROPERTY MANAGEMENT$', 'PROPERTY MANAGEMENT DEPARTMENT'),
    (r'^YOUTH FUND$', 'YOUTH ENGAGEMENT AND EMPLOYMENT'),  #*YOUTH FUND <2013
    (r"^WOMEN'S COMMISSION$", "WOMEN'S ADVANCEMENT"),  #*WOMEN'S COMMISSION <2013
])

#apply every rename to the Department column
earnings_df['Department'] = earnings_df['Department'].replace(regex=dept_abbrevs)

In [None]:
# Department values after the department renames.
show_unique(column='Department')

In [None]:
#purpose: some depts should actually be programs, this function can be used to set departments to programs
def replace_dept(orig_dept, new_dept, new_prog):
    """Move a department under another department as one of its programs.

    orig_dept: regex searched for anywhere in the current Department value.
    new_dept:  Department value written to every matching row.
    new_prog:  Program value written to every matching row.

    Modifies the global earnings_df in place and returns nothing.
    """
    #the mask is taken once, before either column is overwritten
    hits = earnings_df['Department'].str.contains(orig_dept, regex=True)
    earnings_df.loc[hits, 'Program'] = new_prog
    earnings_df.loc[hits, 'Department'] = new_dept

In [None]:
#use the above function to fix some of the Department data
#arguments: regex for the current department, new department, new program

#the pattern accepts both spellings of the department; the program uses the correct spelling
replace_dept('CEMET[AE]RY DIVISION', 'PARKS AND RECREATION DEPARTMENT', 'CEMETERY')
replace_dept('LICENSING BOARD', 'CONSUMER AFFAIRS AND LICENSING', 'LICENSING BOARD')
replace_dept(r"MAYOR'S OFFICE\-PUBLIC INFO", "MAYOR'S OFFICE", "MAYOR'S COMMUNICATIONS")
replace_dept('OFC BOSTON RESIDENTS JOB POL', 'OFFICE OF ECONOMIC DEVELOPMENT', 'BOSTON RESIDENTS JOB POLICY OFFICE')
replace_dept('SMALL AND LOCAL BUSINESS', 'OFFICE OF ECONOMIC DEVELOPMENT', 'SMALL AND LOCAL BUSINESS')
replace_dept('TRAFFIC DIVISION', 'TRANSPORTATION DEPARTMENT', 'TRAFFIC DIVISION')
replace_dept('TRANSPORTATION-PARKING CLERK', 'TRANSPORTATION DEPARTMENT', 'PARKING CLERK')
#each treasury division keeps its own name as the program (the two labels used to be swapped)
replace_dept('TREASURY-COLLECTING DIVISION', 'TREASURY DEPARTMENT', 'COLLECTING DIVISION')
replace_dept('TREASURY-TREASURY DIVISION', 'TREASURY DEPARTMENT', 'TREASURY DIVISION')
replace_dept('WORKERS COMPENSATION SERVICE', 'HUMAN RESOURCES', 'WORKERS COMP')

In [None]:
# Department values after some departments were turned into programs.
show_unique(column='Department')

In [None]:
#preliminary cleaning for program column: drop a leading 'BPS', drop 'BOSTON PUBLIC SCHOOLS',
#drop double quotes, and put spaces around the hyphen in '-NETWORK'
program_rules = {'^BPS':'', 'BOSTON PUBLIC SCHOOLS':'', '"':'', r'\-NETWORK':' - NETWORK'}
earnings_df['Program'] = earnings_df['Program'].replace(regex=program_rules).str.strip()
show_unique(column='Program')

In [None]:
#purpose: replace abbreviations and spelling errors in the program column
#keys are regex fragments; rules are applied in the order listed, so later rules see the output of earlier ones
#every value is padded with spaces because the delimiter matched on each side of a key is replaced with it
prog_abbrevs = {
    'MGMT':' MANAGEMENT ',
    'EXT AFFAIRS':' EXTERNAL AFFAIRS ',
    'CL10':' ',
    'COM ACD':' COMMUNITY ACADEMY OF ',
    'P A SHAW':' PA SHAW ',
    'NURS':' NURSES ',
    'WEST ROXBURY HIGH':' WEST ROXBURY ACADEMY ',
    r'WREC\:':' ',
    r'HPEC\:':' ',
    'MC CORMACK MIDDLE':' MCCORMACK MIDDLE ',
    'KENNEDY EM':' EDWARD M KENNEDY ',
    'KENNEDY JF':' JOHN F KENNEDY ',
    'KENNEDY PJ':' PATRICK J KENNEDY ',
    'GREENWOOD S':' SARAH GREENWOOD ',  #the comma ending this entry was missing, which made the whole dict a SyntaxError
    'MC KINLEY MIDDLE':' MCKINLEY MIDDLE ',
    'HORACE MANN':' HORACE MANN SCHOOL ',
    'WITHTHROP':' WINTHROP ',
    'SOUTH BOSTON HS - EXCEL':' EXCEL HIGH SCHOOL ',
    'MPHCOMMERCE':' MADISON PARK HIGH SCHOOL - COMMERCE ',
    'MPHCRAFTS':' MADISON PARK HIGH SCHOOL - CRAFTS ',
    'MPHHEALTH':' MADISON PARK HIGH SCHOOL - HEALTH ',
    'MPHFRESHMAN':' MADISON PARK HIGH SCHOOL - FRESHMAN ',
    'FACILITY MANAGEMENT AND A AND R':' FACILITIES MANAGEMENT ',
    'FACILITY MANAGEMENT':' FACILITIES MANAGEMENT ',
    'FACILITITES MANAGEMENT':' FACILITIES MANAGEMENT ',
    'GREENWOOD E':' ELIHU GREENWOOD ',
    'ALTERNATIVE EDUCATION HIGH SCHOOL':' ALTERNATIVE EDUCATION ',
    'COLLEG':' COLLEGE ',
    'SCH':' SCHOOL ',
    'INTERVTN CT':' INTERVENTION CENTER ',
    'COUSELING':' COUNSELING ',
    'SERVICE':' SERVICES ',
    'SER':' SERVICES ',
    'HI':' HIGH ',
    'ED':' EDUCATION ',
    'HS':' HIGH SCHOOL ',
    'ACAD':' ACADEMY ',
    'SVC':' SERVICES ',
    'ASST':' ASSISTANT ',
    'COMM':' COMMUNITY ',
    'AC':' ACADEMY ',
    'ST':' STREET ',
    'HEARING AND APPEAL AND ATTENDANC':' HEARINGS AND APPEALS ',
    'KITCHEN FOOD':' KITCHEN AND FOOD ',
    'LEARN':' LEARNING ',
    'FAM AND STUDENT ENGAGEMT':' FAMILY AND STUDENT ENGAGEMENT ',
    'ADVANCEMT':' ADVANCEMENT ',
    'ACCOUNTABILITY':' TURNAROUND AND TRANSFORMATION ',
    'RESEARCH ASSESS AND EVAL':' DATA AND ACCOUNTABILITY ',
}

#a key only matches between delimiters (start/end of string, whitespace, parens, hyphen),
#so that words within words are not changed by accident
prog_abbrevs_regex = {}
for key in prog_abbrevs:
    new_key = r'(^|[\s\(\)\-])'+key+r'([\s\(\)\-]|$)'
    prog_abbrevs_regex[new_key] = prog_abbrevs[key]

earnings_df['Program'] = earnings_df['Program'].replace(prog_abbrevs_regex, regex=True).str.strip()

#complete school names that end on the school type
earnings_df['Program'] = earnings_df['Program'].replace({'ELEMENTARY$':'ELEMENTARY SCHOOL',
                                                         'MIDDLE$':'MIDDLE SCHOOL',
                                                         'HIGH$':'HIGH SCHOOL',
                                                         'PILOT$':'PILOT SCHOOL'}, regex=True)

In [None]:
# Program values after the program abbreviations were expanded.
show_unique(column='Program')

In [None]:
# Random sample of complete rows after all cleaning steps.
earnings_df.sample(n=500)

In [None]:
# First 1000 distinct job titles in sorted order; the column is labelled 'Title' because it holds titles, not names.
pd.DataFrame(show_unique('Title'), columns=['Title']).head(1000)

In [None]:
#purpose: get pivot table of aggregates by department
def agg_by_dept(column, aggfunc, dept=r'(.*?)'):
    """Aggregate one earnings column per department and year.

    column:  which column to aggregate (i.e. Injured, Regular, Total, etc.); only rows where it is > 0 are used.
    aggfunc: the aggregation function (mean, sum, max, min, ...), as accepted by pandas.pivot_table.
    dept:    optional regex, matched at the start of the department name; matches everything by default.

    Returns a DataFrame with one row per department (plus the 'All' margin row when it passes the
    ``dept`` filter), the department in the first column and one column per year.
    """
    #only rows with a positive amount take part in the aggregate
    paid = earnings_df.loc[earnings_df[column]>0]

    #remove BPS schools from result (na=True also drops rows without a department)
    paid = paid.loc[~paid['Department'].str.contains(BPS_str, regex=True, na=True)]

    #create the pivot table straight from the individual rows, so that every cell AND the 'All' margins
    #are the requested aggregate of the underlying amounts; aggregating first and then aggregating the
    #per-department results again is wrong for anything but sum/min/max (e.g. 'count' always gave 1)
    dept_costs_table = pd.pivot_table(paid, values=column, index='Department', columns='Year', aggfunc=aggfunc, fill_value=0, margins=True).reset_index()

    #get rid of the calculated aggregate by row (last column), doesn't make sense for time series data
    return dept_costs_table.iloc[:,0:-1].loc[dept_costs_table['Department'].str.match(dept)]

#agg_by_dept('Injured', 'median', 'POLICE DEPARTMENT|FIRE DEPARTMENT')

In [None]:
#purpose: get pivot table of employee counts by department and title
def count_by_title(column, dept=r'(.*?)', title=r'(.*?)'):
    """Count, per job title and year, the rows with a positive value in ``column``.

    column: which column to count on (i.e. Injured or Total); only rows where it is > 0 are counted.
    dept:   optional regex, matched at the start of the department name; matches everything by default.
    title:  optional regex, matched at the start of the title; matches everything by default,
            not recommended without setting dept first.

    Returns a DataFrame sorted by the overall count (descending) with the title in the first
    column and one column per year; the 'All' margin row is kept when it passes the ``title`` filter.
    """
    #create dataframe with title, year, and counts
    job_counts = earnings_df.loc[earnings_df['Department'].str.match(dept)]
    job_counts = job_counts.loc[job_counts[column]>0].groupby(['Title', 'Year'])[column].count().reset_index(name="count")

    #create the pivot table, with calculated sums for each row and column
    #('sum' by name, as in count_by_group; passing np.sum is deprecated in recent pandas)
    job_counts_table = pd.pivot_table(job_counts, values='count', index='Title', columns='Year', aggfunc='sum', fill_value=0, margins=True).reset_index()

    #sort on the per-row sum, then get rid of it (last column): doesn't make sense for time series data
    return job_counts_table.loc[job_counts_table['Title'].str.match(title)].sort_values(by='All', ascending=False).iloc[:,0:-1]

# Employee counts per title for all departments, largest titles first (first 1000 rows).
count_by_title(column='Total').head(n=1000)

In [None]:
#purpose: get pivot table of chosen aggregate by department and title
def agg_by_title(column, aggfunc, dept=r'(.*?)', title=r'(.*?)'):
    """Aggregate one earnings column per job title and year.

    column:  which column to aggregate on (i.e. Injured, Regular, Total, etc.).
    aggfunc: the aggregation function (mean, sum, max, min, ...), as accepted by pandas.pivot_table.
    dept:    optional regex, matched at the start of the department name; matches everything by default.
    title:   optional regex, matched at the start of the title; matches everything by default,
             not recommended without setting dept first.

    Returns a DataFrame sorted by the overall aggregate (descending) with the title in the first
    column and one column per year; the 'All' margin row is kept when it passes the ``title`` filter.
    """
    #rows of the requested department(s)
    in_dept = earnings_df.loc[earnings_df['Department'].str.match(dept)]

    #create the pivot table straight from the individual rows and forward aggfunc, so that the cells
    #and the 'All' margins all use the requested aggregate (without aggfunc the margins were always a mean)
    job_pay_table = pd.pivot_table(in_dept, values=column, index='Title', columns='Year', aggfunc=aggfunc, fill_value=0, margins=True).reset_index()

    #sort on the per-row aggregate, then get rid of it (last column): doesn't make sense for time series data
    return job_pay_table.loc[job_pay_table['Title'].str.match(title)].sort_values(by='All', ascending=False).iloc[:,0:-1]

# Departments and titles were converted to all caps during cleaning, so the filters must be upper case to match.
agg_by_title('Total', 'mean', "MAYOR'S OFFICE", 'CHIEF DIVERSITY OFFICER')

In [None]:
# Police employees with injury pay in 2014. The department cleaning renamed 'BOSTON POLICE DEPARTMENT'
# to 'POLICE DEPARTMENT', so that is the value to compare against; the frame is the cell's last
# expression so the notebook renders it as a table instead of printed text.
earnings_df.loc[(earnings_df['Injured']>0) & (earnings_df['Year']==2014) & (earnings_df['Department']=='POLICE DEPARTMENT'), ['Name', 'Title', 'Injured']]

In [None]:
# Titles were converted to all caps during cleaning, so the prefix must be upper case to match.
count_by_title('Total', title='PRIN')