In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 1000) 
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [2]:
#purpose: create our dataframe

#default column headers
col_headers = ['Name', 'Department', 'Title', 'Regular', 'Retro', 'Other', 'Overtime', 'Injured', 'Detail', 'Quinn', 'Total', 'Postal']

#create an empty df with default values
earnings_df = pd.DataFrame(columns = col_headers)

#add column 'Year' and set to NaN
earnings_df['Year'] = np.nan

#there are 9 sepearate CSV files, representing employee earnings reports for 9 years
for year in range(2011, 2020):
    
    #read in the CSV for the given year, set it to variable next_df
    next_df = pd.read_csv('./data/salaries_' + str(year) + '.csv', skiprows=1, names=col_headers, encoding='latin1')
    
    #add the column 'Year' to next_df and set to the given year
    next_df['Year'] = year
    
    #in the 2013 and 2014 datasets, title and department columns are in the wrong order
    if year == 2013 or year == 2014:
        
        #get the list of columns (including year)
        col_list = list(next_df)
        
        #swap the order of title and department in the list
        col_list[1], col_list[2] = col_list[2], col_list[1]
        
        #set the dataframe's columns to the new list
        next_df.columns = col_list
    
    #add next_df to earnings_df
    earnings_df = pd.concat([earnings_df, next_df], sort=False)

In [3]:
earnings_df[['Department', 'Name', 'Title', 'Postal']].sample(500)

Unnamed: 0,Department,Name,Title,Postal
7597,BPS Channing Elementary,"Flynn,Diane M.",Prin Clerk/School Sec 19,02132
5302,Boston Police Department,"Latson,Brian K",Police Sergeant,02067-2809
20833,Boston Cntr - Youth & Families,"Guinnane,Grace M",Teacher I,02132
6538,Boston Police Department,"Edghill-Yard,Susan R",Police Detective,02131-4704
14943,Boston Cntr - Youth & Families,"Nijjar,Jovante",Lifeguard I,02128
19758,Boston Fire Department,"Stokes,Kirk",Fire Fighter,02119-1245
16179,Inspectional Services Dept,"Piccarini,Dianna M.",Health Inspector##,02132
7376,BPS Special Education,"Houston,Maureen E.",Specialist (BTU),2136
18321,Boston Police Department,"Samson,Jennifer",Legal Asst (BPD),02124
12766,King K-8,"Nolasco,Isis A",Part-Time Cafeteria Attendant,02119


In [4]:
#purpose: initial cleaning for our dataframe

#targeted (hard-coded) cleaning for specific rows
earnings_df = earnings_df.loc[earnings_df['Department']!='DEPARTMENT_NAME']

#dimensions are qualitative columns, facts are quantitative columns
facts = ['Regular', 'Retro', 'Other', 'Overtime', 'Injured', 'Detail', 'Quinn', 'Total']
    
##cast year to type 'int'
earnings_df['Year'] = earnings_df['Year'].astype(int)

#clean the facts columns and convert from type 'object' to 'float'
earnings_df[facts] = earnings_df[facts].astype(str).applymap(lambda x: x.strip())
earnings_df[facts] = earnings_df[facts].replace({'^-$|^None$|^nan$|\)':0, ',':'', '\$':'', ' ':'', '^\(':'-'}, regex=True)
earnings_df[facts] = earnings_df[facts].astype(float)

In [5]:
#convert dimensions from type 'object' to 'string' and remove leading/trailing whitespace
dimensions = ['Name', 'Department', 'Title', 'Postal']
earnings_df[dimensions] = earnings_df[dimensions].astype(str)

In [6]:
#purpose: clean Postal column

#add a 0 to the front of any code with 4 digits
mask = earnings_df['Postal'].str.len() == 4
earnings_df.loc[mask, 'Postal'] = '0' + earnings_df.loc[mask, 'Postal']

#remove delivery route number from any codes that have it (number after hyphen)
earnings_df['Postal'] = earnings_df['Postal'].str.split('-', expand=True)[0]

#any postal codes with non-numeric characters will be set to UNKNOWN
earnings_df.loc[earnings_df['Postal'].str.match('[A-Z]', na=False), 'Postal'] = 'UNKNOWN'

#hard cleaning for specific rows
earnings_df.loc[earnings_df['Name'].str.match('Ostiguy,David M'), 'Postal'] = '02327'
earnings_df.loc[earnings_df['Name'].str.match('Karales,George Alfred'), 'Postal'] = '02170'
earnings_df.loc[earnings_df['Name'].str.match('Smith,Kenneth J'), 'Postal'] = '02124'
earnings_df.loc[earnings_df['Name'].str.match('Thomas,Sarita J'), 'Postal'] = '02125'
earnings_df.loc[earnings_df['Name'].str.match('Morris,Judith A.'), 'Postal'] = '02170'
earnings_df.loc[earnings_df['Name'].str.match('Mendez,Jose R'), 'Postal'] = '02135'
earnings_df.loc[earnings_df['Name'].str.match('Morrison,June'), 'Postal'] = '02481'

In [7]:
#purpose: clean Name column
earnings_df['Name'] = earnings_df['Name'].replace({'\.':''}, regex=True).str.upper()

In [8]:
#return list of all unique values for a given column sorted in alphabetical order
def show_unique(column):
    optns = earnings_df[column].unique()
    optns.sort()
    return optns

#purpose: get pivot table of employee counts by department
#rec is an optional column that specifies what record to filter on, set to everything by default
def count_by_group(column, rec=r'(.*?)'):
    
    #create dataframe with department, year, and counts
    group_counts = earnings_df.groupby([column, 'Year'])['Total'].count().reset_index(name="count")
    
    #remove BPS schools from result
    #dept_counts = dept_counts.loc[dept_counts['Department'].str.contains(BPS_str, regex=True) == False]
    
    #create the pivot table table, with calculated sums for each row and column
    group_counts_table = pd.pivot_table(group_counts, values='count', index=column, columns='Year', aggfunc='sum', fill_value=0, margins=True).reset_index()
    
    #get rid of the calculated sums by row, doesn't make sense for time series data
    return group_counts_table.iloc[:,0:-1].loc[group_counts_table[column].str.match(rec)]

In [9]:
#purpose: preliminary cleaning for Title column - create uniform spacing, convert to all caps, and remove unwanted characters
earnings_df['Title'] = earnings_df['Title'].replace({'\.':'', '(?<=[a-z])([A-Z])':r' \1', '\(':' (', '\)':') ', '\\\\':'', '&':' AND ', '\,':'/', '\#':'', '\s+':' '}, regex=True).str.upper()
earnings_df[dimensions] = earnings_df[dimensions].applymap(lambda x: x.strip())

In [10]:
#find all titles with given phrase
def find_matching_titles(s):
    titles = pd.DataFrame(show_unique('Title'), columns=["Title"])
    return titles.loc[titles["Title"].str.contains('(^|[\/\s\(])(' + s + ')([\/\s\)]|$)')]
    
find_matching_titles('JOURNEYPRS')

  return func(self, *args, **kwargs)


Unnamed: 0,Title
1175,MAINT MECH PLUMBER (JOURNEYPRS)


In [11]:
#purpose: preliminary cleaning for Title column to unstick words that were stuck together
earnings_df['Title']=earnings_df['Title'].replace({'([A-Z])(OF)([\/\s\)]|$)':r'\1 OF ', '([A-Z])(BPD)':r'\1 \2', '(BPD)([A-Z])':r'\1 \2'},regex=True)

In [12]:
#purpose: replace abbreviations and spelling errors with proper words in title column

#CHALLENGES --> 
#OP/OPER: OPERATOR VS OPERATIONS
#SP/SPEC: SPECIAL VS SPECIALIST
#SERV/SVC: SERVICE VS SERVICES
#COM/COMM: COMMUNITY VS COMMUNICATIONS VS COMMISSIONER VS COMMISSION VS COMMITTEE

abbrevs = {
    'ADMIN AND FINANCE':'ADMINISTRATION AND FINANCE',
    'ADMIN/FINANCE':'ADMINISTRATION AND FINANCE',
    'OFFC':'OFFICER',
    'OFFCR':'OFFICER',
    'SEN':'SENIOR',
    'DET':'DETECTIVE',
    'SUP':'SUPERVISOR',
    'SUPV':'SUPERVISOR',
    'SPV':'SUPERVISOR',
    'EXEC':'EXECUTIVE',
    'AN':'ANALYST',
    'ANL':'ANALYST',
    'ANAL':'ANALYST',
    'TECH':'TECHNICIAN',
    'EQUIP':'EQUIPMENT',
    'EQUIPMENT OPER':'EQUIPMENT OPERATOR',
    'ALARM OPER':'ALARM OPERATOR',
    'METER OPER':'METER OPERATOR',
    'COMPUTER OPER':'COMPUTER OPERATOR',
    'PROJ':'PROJECT',
    'SP':'SPECIAL',
    'STFF':'STAFF',
    'ACAD':'ACADEMY',
    'INSTR':'INSTRUCTOR',
    'ASST':'ASSISTANT',
    'ASSIST':'ASSISTANT',
    'ASS':'ASSISTANT',
    'DEP':'DEPUTY',
    'SUPN':'SUPERINTENDENT',
    'SYS':'SYSTEMS',
    'COOR':'COORDINATOR',
    'COORD':'COORDINATOR',
    'SEC':'SECRETARY',
    'LIEUT':'LIEUTENANT',
    'LT':'LIEUTENANT',
    'MAINT':'MAINTENANCE',
    'MAIN':'MAINTENANCE',
    'DIR':'DIRECTOR',
    'MGMT':'MANAGEMENT',
    'MGR':'MANAGER',
    'MNGR':'MANAGER',
    'MANGR':'MANAGER',
    'MED':'MEDICAL',
    'PROC':'PROCESSING',
    'CORP':'CORPORATION',
    'ASSOC':'ASSOCIATE',
    'ASSESS OPER MANAGEMENT':'ASSESSING OPERATIONS MANAGEMENT',
    'OPER':'OPERATIONS',
    'OP':'OPERATOR',
    'INC COMM':'INCIDENT COMMAND',
    'COMM SERV':'COMMUNITY SERVICE',
    'COMM OUTREACH':'COMMUNITY OUTREACH',
    'ASSISTANT COMM':'ASSISTANT COMMISSIONER',
    'DEPUTY COMM':'DEPUTY COMMISSIONER',
    'COMM OFFICE':'COMMISSIONERS OFFICE',
    'RADIO COMM':'RADIO COMMUNICATIONS',
    'COMM EQUIPMENT':'COMMUNICATIONS EQUIPMENT',
    'COMMUNIC':'COMMUNICATIONS',
    'COMMUN':'COMMUNICATIONS',
    'COMM LEADER':'COMMUNITY LEADER',
    'FIRE COMM':'FIRE COMMISSIONER',
    'HOUSING COMM':'HOUSING COMMISSION',
    'SCHOOL COMM':'SCHOOL COMMITTEE',
    'COMM AND INTERG':'COMMUNITY AND INTERGOVERNMENTAL',
    'BLDG':'BUILDINGS',
    'BLDGS':'BUILDINGS',
    'BDG':'BUILDINGS',
    'REG':'REGISTRAR',
    'SERV':'SERVICE',
    'SVC':'SERVICE',
    'SRV':'SERVICE',
    'PRIN':'PRINCIPAL',
    'PARA':'PARAPROFESSIONAL',
    'DIST':'DISTRICT',
    'FF':'FIRE FIGHTER',
    'A AND F':'ADMINISTRATION AND FINANCE',
    'F':'FIRE',
    'INSTRUC':'INSTRUCTOR',
    'SR':'SENIOR',
    'JR':'JUNIOR',
    'MECH':'MECHANIC',
    'MECHA':'MECHANIC',
    'MACH':'MACHINE',
    'GEN':'GENERAL',
    'ADMN':'ADMIN',
    'ADM':'ADMIN',
    'ENG':'ENGINEER',
    'STRUCT':'STRUCTURAL',
    'FRPRS':'FOREPERSON',
    'FRPR':'FOREPERSON',
    'FOREPRS':'FOREPERSON',
    'CONST':'CONSTRUCTION',
    'LBR':'LABORER',
    'RPR':'REPAIR',
    'REP':'REPAIR',
    'SPEC':'SPECIAL',
    'INCT':'INCIDENT',
    'COMND':'COMMAND',
    'FIN COM':'FINANCE COMMISSION',
    'MAS':'MASTER',
    'RPPRS':'REPAIRPERSON',
    'REPPRS':'REPAIRPERSON',
    'REPRPRS':'REPAIRPERSON',
    'REPAIRPR':'REPAIRPERSON',
    'REPAIRPRS':'REPAIRPERSON',
    'RPPR':'REPAIR PERSON',
    'WKG':'WORKING',
    'PW':'PUBLIC WORKS',
    'P W':'PUBLIC WORKS',
    'HVY':'HEAVY',
    'MTR':'MOTOR',
    'INSP':'INSPECTOR',
    'INSPEC':'INSPECTOR',
    'TRA':'TRAFFIC',
    'OPR':'OPERATIONS',
    'MEO':'MOTOR EQUIPMENT OPERATOR',
    'CFM':'(CFM)',
    'ELEC':'ELECTRIC',
    'EQUI':'EQUIPMENT',
    'COLL TRS':'COLLECTOR TREASURER',
    'ACNTNG':'ACCOUNTING',
    'CRFTSPRS':'CRAFTSPERSON',
    'COMMSS':'COMMISSIONER',
    'COMR':'COMMISSIONER',
    'COMMIS':'COMMISSIONER',
    'CFM':'',
    'COUNSLR':'COUNSELOR',
    'BD':'BOARD',
    'MEMBER BOARD':'MEMBER OF BOARD',
    'LIB':'LIBRARIAN',
    'LIBR':'LIBRARIAN',
    'LIBRARIN':'LIBRARIAN',
    'SVCS':'SERVICES',
    'CAMP JO':'(CAMP JOY)',
    'CAM JO':'(CAMP JOY)',
    'SER':'SERVICES',
    'PROT':'PROTECTIVE',
    'REL':'RELATIONS',
    'SUPVISING':'SUPERVISING',
    'PROP':'PROPERTY',
    'DISP':'DISPATCHER',
    'CHF':'CHIEF',
    'PMDGRAFF REMOVAL':'(GRAFFITI REMOVAL)',
    'PAINT':'PAINTER',
    'ENFORCE':'ENFORCEMENT',
    'DEVELOP':'DEVELOPMENT',
    'DEVEL':'DEVELOPMENT',
    'PROG':'PROGRAM',
    'PWD':'',
    'SWIM':'SWIMMING',
    'REGNL':'REGIONAL',
    'ACCTNG':'ACCOUNTING',
    'ACCT':'ACCOUNTING',
    'ENGR':'ENGINEER',
    'EQU':'EQUIPMENT',
    'EQ':'EQUIPMENT',
    'ANIM CNTL':'ANIMAL CONTROL',
    'OFCR':'OFFICER',
    'CLRK':'CLERK',
    'PARKS AND REC':'PARKS AND RECREATION',
    'P AND R':'PARKS AND RECREATION',
    '\(PARK\)':'(PARKS AND RECREATION)',
    'SPC':'SPECIAL',
    'HDQ':'HEADQUARTER',
    'TRANS':'TRANSPORTATION',
    'DISPCH':'DISPATCHER',
    'SUB':'SUBSTITUTE',
    'CUST':'CUSTODIAN',
    'NEIGH':'NEIGHBORHOOD',
    'YTH':'YOUTH',
    'HE':'',
    'BE':'',
    'FGR PRT EV':'FINGERPRINT EVIDENCE',
    'CH':'CHIEF',
    'SUM SCH':'SUMMER SCHOOL',
    'COM SCH':'COMMUNITY SCHOOL',
    'FAM':'FAMILY',
    'BPDFLEET':'BPD FLEET',
    'IBPDFLEET':'I (BPD FLEET',
    'IIBPDFLEET':'II (BPD FLEET',
    'EVIDENC':'EVIDENCE',
    'TECHNCN':'TECHNICIAN',
    'TCH':'TECHNICIAN',
    'SP ED':'SPECIAL ED',
    'SPED':'SPECIAL ED',
    'DP':'DATA PROCESSING',
    'PREV':'PREVENTION',
    'SCUBA DIV':'SCUBA DIVER',
    'DIV':'DIVISION',
    'POL':'POLICE',
    'OFFR':'OFFICER',
    'CLASSIFICATN':'CLASSIFICATION',
    'ATTN':'ATTENDANT',
    'CAFE':'CAFETERIA',
    'SCHL':'SCHOOL',
    'SCH':'SCHOOL',
    'HOSP':'HOSPITAL',
    'ASSTO':'ASSISTANT TO',
    'CHIEFOF':'CHIEF OF',
    'EXC':"EXECUTIVE",
    'OFF':'OFFICE',
    'SRGT':'SERGEANT',
    'RET':'RETIREMENT',
    'RETIRE':'RETIREMENT',
    'RETIREME':'RETIREMENT',
    'RECYCLE':'RECYCLING',
    'REAS':'RESEARCH',
    'INVESTNS':'INVESTIGATIONS',
    'INVEST':'INVESTIGATOR',
    'HACKNEY':'HACKNEY UNIT',
    'TOTHE':'TO THE',
    'PRGS':'PROGRAMS',
    'JOURNEYPRS':'JOURNEYPERSON',
    'BLD':'BUILDINGS',
    'PLMG AND GSFTG':'PLUMBING AND GASFITTING',
    'PLG AND GAS FTNG':'PLUMBING AND GASFITTING',
    'EL IN AND MNT':'ELECTRICAL INSPECTION AND MAINTENANCE',
    'AL':'ALTERATION',
    'ALT':'ALTERATION',
    'CLK':'CLERK',
    'ADV':'ADVANCED',
    'MAST\)':'MASTER',
    'ADR':'(RETIRED - ADR)',
    'LIBRRIN':'LIBRARIAN',
    'SPECL':'SPECIALIST',
    'LAB RELATIONS':'LABOR RELATIONS'
}

#to prevent words within words from accidentally being changed, lets ensure that values from above dictionary must be
#between certain characters to change
for key in abbrevs:
    abbrevs[key] = ' '+abbrevs[key]+' '
    new_key = '(^|[\s\(\)\-\/])'+key+'([\s\(\)\-\/]|$)'
    earnings_df['Title'] = earnings_df['Title'].replace(new_key, abbrevs[key], regex=True).str.strip()

earnings_df['Title'] = earnings_df['Title'].replace({'SPECIAL$':'SPECIALIST'}, regex=True)

In [13]:
count_by_group('Title').sort_values(2019, ascending=False).head(1000)

Year,Title,2011,2012,2013,2014,2015,2016,2017,2018,2019
2003,All,20509,21140,22469,22233,21902,22046,22245,23603,23312
1906,TEACHER,5121,5249,5512,5426,5375,5363,5437,5541,5255
1265,PARAPROFESSIONAL,1108,1161,1209,1113,1180,1187,1215,1416,1356
1311,POLICE OFFICER,1281,1231,1308,1349,1273,1279,1331,1334,1346
847,FIRE FIGHTER,804,816,861,873,815,837,795,792,820
1832,SUBSTITUTE TEACHER,770,743,1045,837,793,771,759,983,724
1278,PART-TIME CAFETERIA ATTENDANT,360,389,417,360,356,341,334,375,374
1132,LUNCH HOUR MONITORS,388,368,424,384,377,370,360,392,341
277,CAB MONITOR,252,319,379,343,354,365,306,232,337
1303,POLICE DETECTIVE,281,292,290,286,286,330,298,296,309


In [14]:
show_unique('Department')

array(['ASD Graphic Arts', 'ASD Human Resources',
       'ASD Intergvernmtl Relations', 'ASD Office Of Labor Relation',
       'ASD Office of Budget Mangmnt', 'ASD Purchasing Division',
       'Accountability', 'Achievement Gap', 'Administration and Finance',
       'Advancement & Ext. Affairs', 'Age Strong',
       'Alighieri Montessori School', 'Arts & Cultural Development',
       'Assessing Department', 'Asst Superintendent-Network A',
       'Asst Superintendent-Network B', 'Asst Superintendent-Network C',
       'Asst Superintendent-Network D', 'Asst Superintendent-Network E',
       'Asst Superintendent-Network F', 'Asst Superintendent-Network G',
       'Auditing Department', 'BPS Adams Elementary',
       'BPS Adult Education', 'BPS Alternative Education',
       'BPS Alternative Education HS', 'BPS Another Course To Colleg',
       'BPS Athletics', 'BPS Bates Elementary',
       'BPS Beethoven Elementary', 'BPS Blackstone Elementary',
       'BPS Boston Arts Academy', 'BPS Bo

In [15]:
find_matching_titles('LAB')

Unnamed: 0,Title
930,FIU DIGITAL LAB SUPERVISOR
1090,LAB INFORMATIOIN MANAGEMENT ADMIN BPD
1091,LAB TECHNICIAN


In [16]:
#purpose: preliminary cleaning for Department column - create uniform spacing, convert to all caps, and remove unwanted characters

earnings_df['Department'] = earnings_df['Department'].replace({'\.':'', '(?<=[a-z])([A-Z])':r' \1', '\/':' AND ', '\\\\':'', '&':' AND ', '\,':' ', '\#':''}, regex=True).str.upper()
earnings_df['Department'] = earnings_df['Department'].replace('\s+', ' ', regex=True).str.strip()

In [17]:
#purpose: most depts are BPS related, lets move them to new 'Program' column and set dept to 'Boston Public Schools'

BPS_str = '^BPS|ELEMENTARY|ACADEMY|K-8|MIDDLE|HIGH|SCHOOL|ACAD$|PILOT| EEC$| ELC$| EES$|9-12|ACHIEVEMENT GAP|STUDENT|SUPERINTENDENT|CHIEF ACADEMIC OFFICER|ENGLISH LANGUAGE LEARN|ACCOUNTABILITY|^ADVANCEMENT|ENROLLMENT|^EARLY LEARNING|^HPEC|^INFO AND INSTR|TEACHING|CAREER AND TECHNICAL ED|CHIEF OF STAFF|CHIEF FINANCIAL OFFICER|CHIEF OPERATING OFFICER|^COMMUNICATIONS$|FOOD AND NUTRITION SVC|INNOVATION DEPARTMENT|INSTITUTIONAL ADVANCEMT|LEGAL ADVISOR|PROFESSIONAL DEVELOPMNT|RESEARCH ASSESS AND EVAL|STRATEGY DEPARTMENT'
earnings_df.loc[earnings_df['Department'].str.contains(BPS_str, regex=True),'Program'] = earnings_df.loc[earnings_df['Department'].str.contains(BPS_str, regex=True),'Department']
earnings_df.loc[earnings_df['Department'].str.contains(BPS_str, regex=True),'Department'] = 'BOSTON PUBLIC SCHOOLS'

In [18]:
#after creating program column, most tuples will be NaN, lets set those to empty string
earnings_df['Program'] = earnings_df['Program'].replace(np.nan,'')

In [19]:
show_unique('Department')

array(['ADMINISTRATION AND FINANCE', 'AGE STRONG',
       'ARTS AND CULTURAL DEVELOPMENT', 'ASD GRAPHIC ARTS',
       'ASD HUMAN RESOURCES', 'ASD INTERGVERNMTL RELATIONS',
       'ASD OFFICE OF BUDGET MANGMNT', 'ASD OFFICE OF LABOR RELATION',
       'ASD PURCHASING DIVISION', 'ASSESSING DEPARTMENT',
       'AUDITING DEPARTMENT', 'BOSTON CITY COUNCIL',
       'BOSTON CNTR - YOUTH AND FAMILIES',
       'BOSTON CNTR-YOUTH AND FAMILIES', 'BOSTON FIRE DEPARTMENT',
       'BOSTON POLICE DEPARTMENT', 'BOSTON PUBLIC LIBRARY',
       'BOSTON PUBLIC SCHOOLS', 'BOSTON RETIREMENT SYSTEM',
       'CEMETERY DIVISION', 'CITY CLERK', 'COMM FOR PERSONS WITH DISABIL',
       'CONSUMER AFFAIRS AND LICENSING', 'DEPT OF VOTER MOBILIZATION',
       'DIPLOMAS PLUS', 'DND NEIGHBORHOOD DEVELOPMENT',
       'DPT OF INNOVATION AND TECHNOLOGY', 'ELDERLY COMMISSION',
       'ELECTION DIVISION', 'EMERGENCY MANAGEMENT',
       'ENVIRONMENT DEPARTMENT', 'FAIR HOUSING AND EQUITY',
       'FINANCE COMMISSION', 'HBI RET

In [20]:
#purpose: the operating budget report in the Boston Open Data Portal has much better data on the organization of Boston's govt
#lets get all the cabinets, departments, and programs from the operating budget

budget_df = pd.read_csv('./data/operating_budget.csv', encoding='latin1')
budget_df = budget_df.drop_duplicates(['Cabinet','Dept','Program'])[['Cabinet','Dept','Program']].apply(lambda x: x.str.upper())
budget_df = budget_df.replace({'\.':' ', '\&':' AND ', '\,':' ', '\s+':' '}, regex=True).apply(lambda x: x.str.strip())

#some minor corrections to the data
budget_df.replace({'LIBRARY DEPARTMENT':'BOSTON PUBLIC LIBRARY', 
                   ' W/':' WITH ', 
                   'LICENSING_BOARD':'LICENSING BOARD',
                   'BOSTON VETS':"VETERANS' SERVICES'"})

Unnamed: 0,Cabinet,Dept,Program
0,MAYORS CABINET,MAYOR'S OFFICE,MAYOR'S ADMINISTRATION
5,MAYORS CABINET,MAYOR'S OFFICE,MAYOR'S EXECUTIVE
7,MAYORS CABINET,MAYOR'S OFFICE,MAYOR'S POLICY AND PLANNING
11,MAYORS CABINET,MAYOR'S OFFICE,NEW URBAN MECHANICS
16,MAYORS CABINET,MAYOR'S OFFICE,MAYOR'S COMMUNICATIONS
20,MAYORS CABINET,ELECTION DEPARTMENT,ELECTION ADMINISTRATION
25,MAYORS CABINET,ELECTION DEPARTMENT,VOTER REGISTRATION
28,MAYORS CABINET,ELECTION DEPARTMENT,ELECTION ACTIVITIES
32,MAYORS CABINET,ELECTION DEPARTMENT,ANNUAL LISTING
35,MAYORS CABINET,INTERGOVERNMENTAL RELATIONS,IGR


In [21]:
#purpose: replace incorrect department names with correct ones when applicable

dept_abbrevs = {
    '^OFFICE OF FINANCE AND BUDGET$':'ADMINISTRATION AND FINANCE', #*OFFICE OF FINANCE AND BUDGET: 2014 - 2015
    '^OFFICE OF ADMIN AND FINANCE$':'ADMINISTRATION AND FINANCE', 
    '^DND NEIGHBORHOOD DEVELOPMENT$':'NEIGHBORHOOD DEVELOPMENT',
    '^ARTS AND CULTURAL DEVELOPMENT$':'OFFICE OF ARTS AND CULTURE', 
    '^ASD HUMAN RESOURCES$':'HUMAN RESOURCES',
    '^ASD INTERGVERNMTL RELATIONS$':'INTERGOVERNMENTAL RELATIONS',
    '^ASD OFFICE OF BUDGET MANGMNT$':'BUDGET MANAGEMENT',
    '^ASD OFFICE OF LABOR RELATION$':'OFFICE OF LABOR RELATIONS',
    '^ASD PURCHASING DIVISION$':'PROCUREMENT', #*AKA PURCHASING
    '^BOSTON CITY COUNCIL$':'CITY COUNCIL',
    '^BOSTON CNTR \- YOUTH AND FAMILIES$':'BOSTON CENTER FOR YOUTH AND FAMILIES',
    '^BOSTON CNTR\-YOUTH AND FAMILIES$':'BOSTON CENTER FOR YOUTH AND FAMILIES',
    '^BOSTON FIRE DEPARTMENT$':'FIRE DEPARTMENT',
    '^BOSTON POLICE DEPARTMENT$':'POLICE DEPARTMENT',
    '^BOSTON RETIREMENT SYSTEM$':'RETIREMENT DEPARTMENT',
    '^STATE BOSTON RETIREMENT SYST$':'RETIREMENT DEPARTMENT',
    '^COMM FOR PERSONS WITH DISABIL$':'COMMISSION FOR PERSONS WITH DISABILITIES',
    '^DEPT OF VOTER MOBILIZATION$':'ELECTION DEPARTMENT', #*DEPARTMENT OF VOTER MOBILIZATION: <2013
    '^DPT OF INNOVATION AND TECHNOLOGY$':'DEPARTMENT OF INNOVATION AND TECHNOLOGY',
    '^ELDERLY COMMISSION$':'AGE STRONG', #*ELDERLY COMMISSION <2018
    '^ELECTION DIVISION$':'ELECTION DEPARTMENT',
    '^IMMIGRANT ADVANCEMENT$':'OFFICE FOR IMMIGRANT ADVANCEMENT',
    '^INSPECTIONAL SERVICES DEPT$':'INSPECTIONAL SERVICES DEPARTMENT',
    '^OFC CHF PUBLIC WORKS TRANSPORT$':'OFFICE OF STREETS', #*OFFICE OF CHIEF OF PUBLIC WORKS AND TRANSPORT <2013
    '^OFC OF STRTS TRNSP AND SANI$':'OFFICE OF STREETS', #*OFFICE OF STREETS, TRANSPORTATION, AND SANITATION 2014 - 2015
    '^OFFICE OF CIVIL RIGHTS$':'FAIR HOUSING AND EQUITY', #*OFFICE OF CIVIL RIGHTS <2013
    '^OFFICE OF NEW BOSTONIANS$':'OFFICE FOR IMMIGRANT ADVANCEMENT', #*OFFICE OF NEW BOSTONIANS <2017
    '^PARKS DEPARTMENT$':'PARKS AND RECREATION DEPARTMENT',
    '^PROPERTY MANAGEMENT$':'PROPERTY MANAGEMENT DEPARTMENT',
    '^YOUTH FUND$':'YOUTH ENGAGEMENT AND EMPLOYMENT', #*YOUTH FUND <2013
    "^WOMEN'S COMMISSION$":"WOMEN'S ADVANCEMENT" #*WOMEN'S COMMISSION <2013
}

earnings_df['Department'] = earnings_df['Department'].replace(dept_abbrevs, regex=True)

In [22]:
show_unique('Department')

array(['ADMINISTRATION AND FINANCE', 'AGE STRONG', 'ASD GRAPHIC ARTS',
       'ASSESSING DEPARTMENT', 'AUDITING DEPARTMENT',
       'BOSTON CENTER FOR YOUTH AND FAMILIES', 'BOSTON PUBLIC LIBRARY',
       'BOSTON PUBLIC SCHOOLS', 'BUDGET MANAGEMENT', 'CEMETERY DIVISION',
       'CITY CLERK', 'CITY COUNCIL',
       'COMMISSION FOR PERSONS WITH DISABILITIES',
       'CONSUMER AFFAIRS AND LICENSING',
       'DEPARTMENT OF INNOVATION AND TECHNOLOGY', 'DIPLOMAS PLUS',
       'ELECTION DEPARTMENT', 'EMERGENCY MANAGEMENT',
       'ENVIRONMENT DEPARTMENT', 'FAIR HOUSING AND EQUITY',
       'FINANCE COMMISSION', 'FIRE DEPARTMENT', 'HBI RETIREES ET AL',
       'HUMAN RESOURCES', 'INSPECTIONAL SERVICES DEPARTMENT',
       'INTERGOVERNMENTAL RELATIONS', 'LAW DEPARTMENT', 'LICENSING BOARD',
       "MAYOR'S OFFICE", "MAYOR'S OFFICE-PUBLIC INFO",
       'NEIGHBORHOOD DEVELOPMENT', 'NEIGHBORHOOD SERVICES',
       'OFC BOSTON RESIDENTS JOB POL', 'OFFICE FOR IMMIGRANT ADVANCEMENT',
       'OFFICE OF ARTS

In [23]:
#purpose: some depts should actually be programs, this function can be used to set departments to programs

def replace_dept(orig_dept, new_dept, new_prog):
    earnings_df.loc[earnings_df['Department'].str.contains(orig_dept, regex=True),'Program'] = new_prog
    earnings_df.loc[earnings_df['Department'].str.contains(orig_dept, regex=True),'Department'] = new_dept

In [24]:
#use the above function to fix some of the Department data
replace_dept('CEMETARY DIVISION', 'PARKS AND RECREATION DEPARTMENT', 'CEMETARY')
replace_dept('LICENSING BOARD', 'CONSUMER AFFAIRS AND LICENSING', 'LICESNING BOARD')
replace_dept("MAYOR\'S OFFICE\-PUBLIC INFO", "MAYOR'S OFFICE", "MAYOR'S COMMUNICATIONS")
replace_dept('OFC BOSTON RESIDENTS JOB POL', 'OFFICE OF ECONOMIC DEVELOPMENT', 'BOSTON RESIDENTS JOB POLICY OFFICE')
replace_dept('SMALL AND LOCAL BUSINESS', 'OFFICE OF ECONOMIC DEVELOPMENT', 'SMALL AND LOCAL BUSINESS')
replace_dept('TRAFFIC DIVISION', 'TRANSPORTATION DEPARTMENT', 'TRAFFIC DIVISION')
replace_dept('TRANSPORTATION-PARKING CLERK', 'TRANSPORTATION DEPARTMENT', 'PARKING CLERK')
replace_dept('TREASURY-COLLECTING DIVISION', 'TREASURY DEPARTMENT', 'TREASURY DIVISION')
replace_dept('TREASURY-TREASURY DIVISION', 'TREASURY DEPARTMENT', 'COLLECTING DIVISION')
replace_dept('WORKERS COMPENSATION SERVICE', 'HUMAN RESOURCES', 'WORKERS COMP')

In [25]:
show_unique('Department')

array(['ADMINISTRATION AND FINANCE', 'AGE STRONG', 'ASD GRAPHIC ARTS',
       'ASSESSING DEPARTMENT', 'AUDITING DEPARTMENT',
       'BOSTON CENTER FOR YOUTH AND FAMILIES', 'BOSTON PUBLIC LIBRARY',
       'BOSTON PUBLIC SCHOOLS', 'BUDGET MANAGEMENT', 'CEMETERY DIVISION',
       'CITY CLERK', 'CITY COUNCIL',
       'COMMISSION FOR PERSONS WITH DISABILITIES',
       'CONSUMER AFFAIRS AND LICENSING',
       'DEPARTMENT OF INNOVATION AND TECHNOLOGY', 'DIPLOMAS PLUS',
       'ELECTION DEPARTMENT', 'EMERGENCY MANAGEMENT',
       'ENVIRONMENT DEPARTMENT', 'FAIR HOUSING AND EQUITY',
       'FINANCE COMMISSION', 'FIRE DEPARTMENT', 'HBI RETIREES ET AL',
       'HUMAN RESOURCES', 'INSPECTIONAL SERVICES DEPARTMENT',
       'INTERGOVERNMENTAL RELATIONS', 'LAW DEPARTMENT', "MAYOR'S OFFICE",
       'NEIGHBORHOOD DEVELOPMENT', 'NEIGHBORHOOD SERVICES',
       'OFFICE FOR IMMIGRANT ADVANCEMENT', 'OFFICE OF ARTS AND CULTURE',
       'OFFICE OF ECONOMIC DEVELOPMENT', 'OFFICE OF LABOR RELATIONS',
       'OF

In [26]:
#preliminary cleaning for program column
earnings_df['Program'] = earnings_df['Program'].replace({'^BPS':'', 'BOSTON PUBLIC SCHOOLS':'', '\"':'', '\-NETWORK':' - NETWORK'}, regex=True).str.strip()
show_unique('Program')

array(['', 'ACCOUNTABILITY', 'ACHIEVEMENT GAP', 'ADAMS ELEMENTARY',
       'ADULT EDUCATION', 'ADVANCEMENT AND EXT AFFAIRS',
       'ALIGHIERI MONTESSORI SCHOOL', 'ALTERNATIVE EDUCATION',
       'ALTERNATIVE EDUCATION HS', 'ANOTHER COURSE TO COLLEG',
       'ASST SUPERINTENDENT - NETWORK A',
       'ASST SUPERINTENDENT - NETWORK B',
       'ASST SUPERINTENDENT - NETWORK C',
       'ASST SUPERINTENDENT - NETWORK D',
       'ASST SUPERINTENDENT - NETWORK E',
       'ASST SUPERINTENDENT - NETWORK F',
       'ASST SUPERINTENDENT - NETWORK G', 'ATHLETICS', 'BALDWIN ELC',
       'BATES ELEMENTARY', 'BEETHOVEN ELEMENTARY',
       'BLACKSTONE ELEMENTARY', 'BOSTON ARTS ACADEMY',
       'BOSTON COLLABORATIVE HIGH SCH', 'BOSTON COMM LEADERSHIP AC',
       'BOSTON EVENING ACADEMY', 'BOSTON INTERNATIONAL HS',
       'BOSTON LATIN', 'BOSTON MIDDLE SCHOOL ACADEMY',
       'BOSTON RESIDENTS JOB POLICY OFFICE', 'BOSTON SCHOOL COMMITTEE',
       'BRADLEY ELEMENTARY', 'BRIGHTON HIGH', 'BTU PILOT', 'BURKE

In [27]:
prog_abbrevs = {
    'MGMT':' MANAGEMENT ',
    'EXT AFFAIRS':' EXTERNAL AFFAIRS ',
    'CL10':' ',
    'COM ACD':' COMMUNITY ACADEMY OF ',
    'P A SHAW':' PA SHAW ',
    'NURS':' NURSES ',
    'WEST ROXBURY HIGH':' WEST ROXBURY ACADEMY ', 
    'WREC\:':' ',
    'HPEC\:':' ',
    'MC CORMACK MIDDLE':' MCCORMACK MIDDLE ', 
    'KENNEDY EM':' EDWARD M KENNEDY ',
    'KENNEDY JF':' JOHN F KENNEDY ',
    'KENNEDY PJ':' PATRICK J KENNEDY ',
    'GREENWOOD S':' SARAH GREENWOOD ',
    'MC KINLEY MIDDLE':' MCKINLEY MIDDLE ',
    'HORACE MANN':'HORACE MANN SCHOOL',
    'WITHTHROP':' WINTHROP ',
    'SOUTH BOSTON HS - EXCEL':' EXCEL HIGH SCHOOL ',
    'MPHCOMMERCE':' MADISON PARK HIGH SCHOOL - COMMERCE ',
    'MPHCRAFTS':' MADISON PARK HIGH SCHOOL - CRAFTS ',
    'MPHHEALTH':' MADISON PARK HIGH SCHOOL - HEALTH ',
    'MPHFRESHMAN':' MADISON PARK HIGH SCHOOL - FRESHMAN ',
    'FACILITY MANAGEMENT AND A AND R':' FACILITIES MANAGEMENT ',
    'FACILITY MANAGEMENT':' FACILITIES MANAGEMENT ',
    'FACILITITES MANAGEMENT':' FACILITIES MANAGEMENT ',
    'GREENWOOD E':' ELIHU GREENWOOD ',
    'ALTERNATIVE EDUCATION HIGH SCHOOL':' ALTERNATIVE EDUCATION ',
    'COLLEG':' COLLEGE ',
    'SCH':' SCHOOL ',  
    'INTERVTN CT':' INTERVENTION CENTER ',
    'COUSELING':' COUNSELING ',
    'SERVICE':' SERVICES ',
    'SER':' SERVICES ',
    'HI':' HIGH ',
    'ED':' EDUCATION ',
    'HS':' HIGH SCHOOL ',
    'ACAD':' ACADEMY ',
    'SVC':' SERVICES ',
    'ASST':' ASSISTANT ',
    'COMM':' COMMUNITY ',
    'AC':' ACADEMY ',
    'ST':' STREET ',
    'HEARING AND APPEAL AND ATTENDANC':' HEARINGS AND APPEALS ',
    'KITCHEN FOOD':' KITCHEN AND FOOD ',
    'LEARN':' LEARNING ',
    'FAM AND STUDENT ENGAGEMT':' FAMILY AND STUDENT ENGAGEMENT ',
    'ADVANCEMT':' ADVANCEMENT ',
    'ACCOUNTABILITY':' TURNAROUND AND TRANSFORMATION ',
    'RESEARCH ASSESS AND EVAL':' DATA AND ACCOUNTABILITY '
}

for key in prog_abbrevs:
    new_key = '(^|[\s\(\)\-])'+key+'([\s\(\)\-]|$)'
    earnings_df['Program'] = earnings_df['Program'].replace(new_key, prog_abbrevs[key], regex=True).str.strip()
    
earnings_df['Program'] = earnings_df['Program'] .replace({'ELEMENTARY$':'ELEMENTARY SCHOOL',
                                                          'MIDDLE$':'MIDDLE SCHOOL',
                                                          'HIGH$':'HIGH SCHOOL',
                                                          'PILOT$':'PILOT SCHOOL'}, regex=True)

In [28]:
show_unique('Program')

array(['', 'ACHIEVEMENT GAP', 'ADAMS ELEMENTARY SCHOOL',
       'ADULT EDUCATION', 'ADVANCEMENT AND EXTERNAL AFFAIRS',
       'ALIGHIERI MONTESSORI SCHOOL', 'ALTERNATIVE EDUCATION',
       'ALTERNATIVE EDUCATION HIGH SCHOOL', 'ANOTHER COURSE TO COLLEGE',
       'ASSISTANT SUPERINTENDENT - NETWORK A',
       'ASSISTANT SUPERINTENDENT - NETWORK B',
       'ASSISTANT SUPERINTENDENT - NETWORK C',
       'ASSISTANT SUPERINTENDENT - NETWORK D',
       'ASSISTANT SUPERINTENDENT - NETWORK E',
       'ASSISTANT SUPERINTENDENT - NETWORK F',
       'ASSISTANT SUPERINTENDENT - NETWORK G', 'ATHLETICS', 'BALDWIN ELC',
       'BATES ELEMENTARY SCHOOL', 'BEETHOVEN ELEMENTARY SCHOOL',
       'BLACKSTONE ELEMENTARY SCHOOL', 'BOSTON ARTS ACADEMY',
       'BOSTON COLLABORATIVE HIGH SCHOOL',
       'BOSTON COMMUNITY LEADERSHIP ACADEMY', 'BOSTON EVENING ACADEMY',
       'BOSTON INTERNATIONAL HIGH SCHOOL', 'BOSTON LATIN',
       'BOSTON MIDDLE SCHOOL ACADEMY',
       'BOSTON RESIDENTS JOB POLICY OFFICE', 'BO

In [29]:
earnings_df.sample(500)

Unnamed: 0,Name,Department,Title,Regular,Retro,Other,Overtime,Injured,Detail,Quinn,Total,Postal,Year,Program
9260,"FERDINAND,THONY",BOSTON PUBLIC SCHOOLS,PARAPROFESSIONAL,34998.23,0.0,0.0,0.0,0.0,0.0,0.0,34998.23,2136,2014,MCKINLEY MIDDLE SCHOOL
15476,"ORTIZ,SANTOS J",FIRE DEPARTMENT,FIRE LIEUTENANT-TECHNICIAN,106121.09,0.0,550.0,11864.06,17971.57,12667.75,0.0,149174.47,2136,2015,
11860,"FELICIANO,LOURDES I",BOSTON PUBLIC SCHOOLS,PRINCIPAL CLERK TYPIST 21,45379.36,0.0,0.0,0.0,0.0,0.0,0.0,45379.36,2136,2011,
17313,"ROACH,TIMOTHY E",FIRE DEPARTMENT,FIRE FIGHTER,83073.12,0.0,5044.5,4432.88,21212.67,0.0,0.0,113763.17,2126,2016,
1863,"GIRARD,NANCY",ENVIRONMENT DEPARTMENT,COMMISSIONER,93269.25,0.0,2619.23,0.0,0.0,0.0,0.0,95888.48,2130,2014,
5957,"PEARSON,PAULA F",POLICE DEPARTMENT,POLICE OFFICER,71817.84,0.0,800.0,11490.0,0.0,37626.0,0.0,121733.84,2119,2011,
21542,"WEBBER,M LISA",BOSTON PUBLIC SCHOOLS,TEACHER,96948.19,902.59,0.0,0.0,0.0,0.0,0.0,97850.78,2062,2013,
12617,"SOTO,YENI",BOSTON PUBLIC SCHOOLS,LEAD SATELLITE ATTENDANT B,24044.71,0.0,100.0,0.0,0.0,0.0,0.0,24144.71,2131,2014,SUMNER ELEMENTARY SCHOOL
4287,"CANNATA,MARTIN J",FIRE DEPARTMENT,FIRE FIGHTER,90729.18,0.0,2157.24,11270.82,0.0,0.0,0.0,104157.24,2144,2012,
18607,"HOLDER,JAMES",POLICE DEPARTMENT,POLICE OFFICER,80677.91,10349.31,833.31,16340.71,0.0,12617.0,0.0,120818.24,2124,2014,


In [30]:
earnings_df.size

2792426

In [35]:
#budget_df = budget_df[['Cabinet','Dept']].drop_duplicates()
#earnings_df = earnings_df.join(budget_df.set_index('Dept'),how='left', on='Department')
columnTitles = ['Name', 'Year', 'Cabinet', 'Department', 'Program', 'Title', 'Regular','Retro', 'Other', 'Overtime', 'Injured', 'Detail', 'Quinn', 'Total', 'Postal']
earnings_df = earnings_df.reindex(columns=columnTitles)

In [36]:
earnings_df.sample(500)

Unnamed: 0,Name,Year,Cabinet,Department,Program,Title,Regular,Retro,Other,Overtime,Injured,Detail,Quinn,Total,Postal
11794,"SITOMER,MARION",2012,EDUCATION,BOSTON PUBLIC SCHOOLS,,TEACHER,94165.45,0.0,0.0,0.0,0.0,0.0,0.0,94165.45,2474
17405,"BOGGS,GEORGE E",2014,PUBLIC SAFETY,POLICE DEPARTMENT,,POLICE OFFICER CANINE OFFICER 2$6,82118.46,9755.04,8642.03,22789.93,0.0,592.0,13864.75,137762.21,2302
16252,"CABEY II,EMILIO OLIVER",2017,PUBLIC SAFETY,FIRE DEPARTMENT,,FIRE FIGHTER-TECHNICIAN,87263.96,0.0,550.0,6508.97,7483.66,12656.0,0.0,114462.59,2126
18504,"SHAMSHAK,RICHARD A",2016,STREETS,TRANSPORTATION DEPARTMENT,TRAFFIC DIVISION,SENIOR TRAFFIC INVESTIGATOR,48275.22,0.0,928.37,14190.9,0.0,0.0,0.0,63394.49,2124
574,"CLARKE,WINFIELD",2011,ENVIRONMENT ENERGY AND OPEN SPACE,PARKS AND RECREATION DEPARTMENT,,GARDENER,41233.98,0.0,0.0,1724.64,0.0,0.0,0.0,42958.62,2126
19579,"TURO,DAVID L",2011,EDUCATION,BOSTON PUBLIC SCHOOLS,,TEACHER,100043.14,0.0,8620.0,0.0,0.0,0.0,0.0,108663.14,2140
11176,"DEPAZ,CESAR G",2011,EDUCATION,BOSTON PUBLIC SCHOOLS,,DUPL/DISTRIBUTION SUPERVISOR 27,48919.63,0.0,0.0,49.03,0.0,0.0,0.0,48968.66,2120
22128,"TUOHY,MICHAEL PATRICK",2014,PUBLIC SAFETY,FIRE DEPARTMENT,,FIRE FIGHTER,92646.9,13193.07,550.0,7927.82,0.0,165.0,0.0,114482.79,2131
12825,"THEDINGA,SARA E",2017,EDUCATION,BOSTON PUBLIC SCHOOLS,SPECIAL EDUCATION,TEACHER,107572.49,0.0,0.0,0.0,0.0,0.0,0.0,107572.49,2061
4299,"COSTINE,ROBERT F",2013,PUBLIC SAFETY,POLICE DEPARTMENT,,POLICE DETECTIVE,73402.42,0.0,6953.51,20932.56,0.0,3641.0,7340.45,112269.94,1887


In [None]:
#purpose: get pivot table of aggregates by department
    #column indicates what column to use for aggregation (i.e. Injured, Regular, Total, etc.)
    #aggfunc is the aggregation function (mean, sum, max, min)
    #dept is an optional column that specifies what department to filter on, set to everything by default
def agg_by_dept(column, aggfunc, dept=r'(.*?)'):
    
    #create dataframe with department, year, and aggregate column
    dept_costs = earnings_df.loc[earnings_df[column]>0].groupby(['Department', 'Year'])[column].agg(aggfunc).reset_index(name="Costs")
    
    #remove BPS schools from result
    dept_costs = dept_costs.loc[dept_costs['Department'].str.contains(BPS_str, regex=True) == False]
    
    #create the pivot table table, with calculated aggregate for each row and column
    dept_costs_table = pd.pivot_table(dept_costs, values='Costs', index='Department', columns='Year', aggfunc = aggfunc, fill_value=0, margins=True).reset_index()
    
    #get rid of the calculated aggregate by row, doesn't make sense for time series data
    return dept_costs_table.iloc[:,0:-1].loc[dept_costs_table['Department'].str.match(dept)]

#agg_by_dept('Injured', 'median', 'Boston Police Department|Boston Fire Department')

In [None]:
#purpose: get pivot table of employee counts by department and title
    #column indicates what column to count on (i.e. Injured or Total)
    #dept is an optional column that specifies what department to filter on, set to everything by default
    #title is an optional column that specifies what title to filter on, set to everything by default, not reccommended without setting dept first
def count_by_title(column, dept=r'(.*?)', title=r'(.*?)'):
    
    #create dataframe with title, year, and counts
    job_counts = earnings_df.loc[earnings_df['Department'].str.match(dept)]
    job_counts = job_counts.loc[job_counts[column]>0].groupby(['Title', 'Year'])[column].count().reset_index(name="count")
    
    #create the pivot table table, with calculated sums for each row and column
    job_counts_table = pd.pivot_table(job_counts, values='count', index='Title', columns='Year', aggfunc=np.sum, fill_value=0, margins=True).reset_index()
    
    #get rid of the calculated sums by row, doesn't make sense for time series data
    return job_counts_table.loc[job_counts_table['Title'].str.match(title)].sort_values(by='All', ascending=False).iloc[:,0:-1]

count_by_title('Total').head(1000)

In [None]:
#purpose: get pivot table of chosen aggregate by department and title
    #column indicates what column to aggregate on (i.e. Injured, Regular, Total, etc.)
    #dept is an optional column that specifies what department to filter on, set to everything by default
    #title is an optional column that specifies what title to filter on, set to everything by default, not reccommended without setting dept first
def agg_by_title(column, aggfunc, dept=r'(.*?)', title=r'(.*?)'):
    
    #create dataframe with title, year, and aggregate column
    job_pay = earnings_df.loc[earnings_df['Department'].str.match(dept)].groupby(['Title', 'Year'])[column].agg(aggfunc).reset_index(name="Costs")
    
    #create the pivot table table, with calculated aggregate for each row and column
    job_pay_table = pd.pivot_table(job_pay, values='Costs', index='Title', columns='Year', fill_value=0, margins=True).reset_index()
    
    #get rid of the calculated sums by row, doesn't make sense for time series data
    return job_pay_table.loc[job_pay_table['Title'].str.match(title)].sort_values(by='All', ascending=False).iloc[:,0:-1]

agg_by_title('Total', 'mean', 'Mayor\'s Office', 'Chief Diversity Officer')

In [None]:
print(earnings_df.loc[(earnings_df['Injured']>0) & (earnings_df['Year']==2014) & (earnings_df['Department']=='Boston Police Department'), ['Name', 'Title', 'Injured']])

In [None]:
count_by_title('Total', title='Prin')