In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 1000) 
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [2]:
#purpose: create our dataframe

#default column headers
col_headers = ['Name', 'Department', 'Title', 'Regular', 'Retro', 'Other', 'Overtime', 'Injured', 'Detail', 'Quinn', 'Total', 'Postal']

#create an empty df with default values
earnings_df = pd.DataFrame(columns = col_headers)

#add column 'Year' and set to NaN
earnings_df['Year'] = np.nan

for year in range(2011, 2020):
    
    #read in the CSV for the given year, set it to variable next_df
    next_df = pd.read_csv('./data/salaries_' + str(year) + '.csv', skiprows=1, names=col_headers, encoding='latin1')
    
    #add the column 'Year' to next_df and set to the given year
    next_df['Year'] = year
    
    #in the 2013 and 2014 datasets, title and department columns are in the wrong order
    if year == 2013 or year == 2014:
        
        #get the list of columns (including year)
        col_list = list(next_df)
        
        #swap the order of title and department in the list
        col_list[1], col_list[2] = col_list[2], col_list[1]
        
        #set the dataframe's columns to the new list
        next_df.columns = col_list
    
    #add next_df to earnings_df
    earnings_df = pd.concat([earnings_df, next_df], sort=False)

In [3]:
earnings_df[['Department', 'Name', 'Title', 'Postal']].head(1000)

Unnamed: 0,Department,Name,Title,Postal
0,Assessing Department,"Abadi,Kidani A",Property Officer (Asn),02118-3126
1,ASD Office Of Labor Relation,"Ablon,Jordan N",Asst Corp Counsel III,02135-5943
2,Transportation-Parking Clerk,"Accardi,Patricia",Chief Claims Investigator,02081-3751
3,Boston Public Library,"Ackerly,Lyn E.",Spec Library Asst I,02118-0000
4,Law Department,"Adams,Carey L.",Prin Clerk,02131-4834
5,Public Works Department,"Adams,Dean",Highway Maint Frprs (Pwd)##,02124-1112
6,Boston Cntr - Youth & Families,"Adams,Natasha",Youth Worker,02124-4418
7,ASD Human Resources,"Adario,Anthony J",Supvising Claims Agent (Asd),02132-3000
8,Property Management,"Addessa,Rocco",Jr Building Custodian,02128-1617
9,Boston Cntr - Youth & Families,"Afonseca,Jose",Certified Seasonal Lifeguard,02124-1214


In [4]:
#purpose: clean our dataframe

#targeted (hard-coded) cleaning for specific rows
earnings_df = earnings_df.loc[earnings_df['Department']!='DEPARTMENT_NAME']
earnings_df.loc[earnings_df['Department'] == 'Boston Cntr-Youth & Families', 'Department'] = 'Boston Cntr - Youth & Families'
earnings_df.loc[earnings_df['Department'] == 'DND Neighborhood Development', 'Department'] = 'Neighborhood Development'
earnings_df.loc[earnings_df['Name'].str.match('Ostiguy,David M'), 'Postal'] = '02327'
earnings_df.loc[earnings_df['Name'].str.match('Karales,George Alfred'), 'Postal'] = '02170'
earnings_df.loc[earnings_df['Name'].str.match('Smith,Kenneth J'), 'Postal'] = '02124'
earnings_df.loc[earnings_df['Name'].str.match('Thomas,Sarita J'), 'Postal'] = '02125'
earnings_df.loc[earnings_df['Name'].str.match('Morris,Judith A.'), 'Postal'] = '02170'
earnings_df.loc[earnings_df['Name'].str.match('Mendez,Jose R'), 'Postal'] = '02135'
earnings_df.loc[earnings_df['Name'].str.match('Morrison,June'), 'Postal'] = '02481'

#dimensions are qualitative columns, facts are quantitative columns
facts = ['Regular', 'Retro', 'Other', 'Overtime', 'Injured', 'Detail', 'Quinn', 'Total']
    
##cast year to type 'int'
earnings_df['Year'] = earnings_df['Year'].astype(int)

#clean the facts columns and convert from type 'object' to 'float'
earnings_df[facts] = earnings_df[facts].astype(str).applymap(lambda x: x.strip())
earnings_df[facts] = earnings_df[facts].replace({'^-$|^None$|^nan$|\)':0, ',':'', '\$':'', ' ':'', '^\(':'-'}, regex=True)
earnings_df[facts] = earnings_df[facts].astype(float)

In [5]:
#convert dimensions from type 'object' to 'string' and remove leading/trailing whitespace
dimensions = ['Name', 'Department', 'Title', 'Postal']
earnings_df[dimensions] = earnings_df[dimensions].astype(str)

#purpose: clean Postal column

#add a 0 to the front of any code with 4 digits
mask = earnings_df['Postal'].str.len() == 4
earnings_df.loc[mask, 'Postal'] = '0' + earnings_df.loc[mask, 'Postal']

#remove delivery route number from any codes that have it (number after hyphen)
earnings_df['Postal'] = earnings_df['Postal'].str.split('-', expand=True)[0]

#any postal codes with non-numeric characters will be set to UNKNOWN
earnings_df.loc[earnings_df['Postal'].str.match('[A-Z]', na=False), 'Postal'] = 'UNKNOWN'

In [6]:
#purpose: clean Name column
earnings_df['Name'] = earnings_df['Name'].replace({'\.':''}, regex=True).str.upper()

In [7]:
#purpose: preliminary cleaning for Title column - create uniform spacing, convert to all caps, and remove unwanted characters
earnings_df['Title'] = earnings_df['Title'].replace({'\.':'', '(?<=[a-z])([A-Z])':r' \1', '\(':' (', '\)':') ', '\/':' AND ', '\\\\':'', '&':' AND ', ',':'', '\#':''}, regex=True).str.upper()
earnings_df[dimensions] = earnings_df[dimensions].applymap(lambda x: x.strip())

In [8]:
earnings_df[['Department', 'Name', 'Title', 'Postal']].head(1000)

Unnamed: 0,Department,Name,Title,Postal
0,Assessing Department,"ABADI,KIDANI A",PROPERTY OFFICER (ASN),2118
1,ASD Office Of Labor Relation,"ABLON,JORDAN N",ASST CORP COUNSEL III,2135
2,Transportation-Parking Clerk,"ACCARDI,PATRICIA",CHIEF CLAIMS INVESTIGATOR,2081
3,Boston Public Library,"ACKERLY,LYN E",SPEC LIBRARY ASST I,2118
4,Law Department,"ADAMS,CAREY L",PRIN CLERK,2131
5,Public Works Department,"ADAMS,DEAN",HIGHWAY MAINT FRPRS (PWD),2124
6,Boston Cntr - Youth & Families,"ADAMS,NATASHA",YOUTH WORKER,2124
7,ASD Human Resources,"ADARIO,ANTHONY J",SUPVISING CLAIMS AGENT (ASD),2132
8,Property Management,"ADDESSA,ROCCO",JR BUILDING CUSTODIAN,2128
9,Boston Cntr - Youth & Families,"AFONSECA,JOSE",CERTIFIED SEASONAL LIFEGUARD,2124


In [9]:
#purpose: replace abbreviations and spelling errors with proper words in title column
abbrevs = {
    'ADMIN':'ADMINISTRATIVE',
    'OFFC':'OFFICER',
    'OFFCR':'OFFICER',
    '\(DET\)':'DETECTIVE',
    'DET':'DETECTIVE',
    'SUPV':'SUPERVISOR',
    'SPV':'SUPERVISOR',
    'EXEC':'EXECUTIVE',
    'ANL':'ANALYST',
    'ANAL':'ANALYST',
    'TECH':'TECHNICIAN',
    'EQUIP OPER':'EQUIPMENT OPERATOR',
    'PROJ':'PROJECT',
    'SP PROJ STFF':'SPECIAL PROJECT STAFF',
    'SP PROJECT STFF':'SPECIAL PROJECT STAFF',
    'STFF':'STAFF',
    'ACAD':'ACADEMY',
    'INSTR':'INSTRUCTOR',
    'ASST':'ASSISTANT',
    'ASSIST':'ASSISTANT',
    'DEP':'DEPUTY',
    'SUPN':'SUPERINTENDENT',
    'SYS':'SYSTEMS',
    'COOR':'COORDINATOR',
    'COORD':'COORDINATOR',
    'SEC':'SECRETARY',
    'LIEUT':'LIEUTENANT',
    'MAINT':'MAINTENANCE',
    'DIR':'DIRECTOR',
    'MGMT':'MANAGEMENT',
    'MGR':'MANAGER',
    'MNGR':'MANAGER',
    'MANGR':'MANAGER',
    'MED':'MEDICAL',
    'OPER':'OPERATIONS',
    'DATA PROC':'DATA PROCESSING',
    'CORP COUNSEL':'CORPORATION COUNSEL',
    'ASSOC':'ASSOCIATE',
    'COMM SERV':'COMMUNITY SERVICE',
    'COMM':'COMMUNICATIONS',
    'COMMUNIC':'COMMUNICATIONS',
    'COMMUN':'COMMUNICATIONS',
    'BLDG':'BUILDING',
    'SERV':'SERVICE',
    'REG VOTERS':'REGISTRAR OF VOTERS',
    'SVC':'SERVICE',
    'SRV':'SERVICE',
    'EQUIP':'EQUIPMENT',
    'PRIN':'PRINCIPAL',
    'DIST':'DISTRICT',
    'FF':'FIRE FIGHTER',
    'INSTRUC':'INSTRUCTOR',
    'SR':'SENIOR',
    'JR':'JUNIOR',
    'MECH':'MECHANIC',
    'MECHA':'MECHANIC',
    'GEN':'GENERAL',
    'ADMN':'ADMINISTRATIVE',
    'ENG':'ENGINEER',
    'STRUCT':'STRUCTURAL',
    'FRPRS':'FOREPERSON',
    'FRPR':'FOREPERSON',
    'FOREPRS':'FOREPERSON',
    'CONST':'CONSTRUCTION',
    'LBR':'LABORER',
    'RPR':'REPAIR',
    'REP':'REPAIR',
    'SPEC':'SPECIAL',
    'INCT COMND SP':'INCIDENT COMMAND SPECIALIST',
    'MAS OF F BOAT':'MASTER OF FIRE BOAT',
    'RPPRS':'REPAIRPERSON',
    'REPPRS':'REPAIRPERSON',
    'REPRPRS':'REPAIRPERSON',
    'REPAIRPR':'REPAIRPERSON',
    'REPAIRPRS':'REPAIRPERSON',
    'RPPR':'REPAIR PERSON',
    'WKG':'WORKING',
    'PW':'PUBLIC WORKS',
    'P W':'PUBLIC WORKS',
    'HVY':'HEAVY',
    'MTR':'MOTOR',
    'INSP':'INSPECTOR',
    'INSPEC':'INSPECTOR',
    'TRA':'TRAFFIC',
    'OPR':'OPERATIONS',
    'MEO':'MOTOR EQUIPMENT OPERATOR',
    'CFM':'(CFM)',
    'ELEC EQUIPMENT':'ELECTRIC EQUIPMENT',
    'ACC MANAGEMENT':'ACCOUNT MANAGEMENT',
    'EQUI':'EQUIPMENT',
    'COLL TRS':'COLLECTOR TREASURER',
    'ACNTNG':'ACCOUNTING',
    'CRFTSPRS':'CRAFTSPERSON',
    'COUNSLR':'COUNSELOR',
    'MEMBER BD OF ELECTION':'MEMBER OF BOARD OF ELECTIONS',
    'LIB':'LIBRARIAN',
    'LIBR':'LIBRARIAN',
    'LIBRARIN':'LIBRARIAN',
    'SVCS':'SERVICES',
    'CAMP JO':'(CAMP JOY)',
    'CAM JO':'(CAMP JOY)',
    'SER':'SERVICES',
    'PROT':'PROTECTIVE',
    'REL':'RELATIONS',
    'SUPVISING':'SUPERVISING',
    'PROP':'PROPERTY',
    'DISP':'DISPATCHER',
    'CHF':'CHIEF',
    'PMDGRAFF REMOVAL':'(PMD GRAFFITI REMOVAL)',
    '(PAINT)':'AND PAINTER',
    '(PAINTER)':'AND PAINTER',
    'ENFORCE':'ENFORCEMENT',
    'DEVELOP':'DEVELOPMENT',
    'PROG':'PROGRAM',
    'CONTRUCTION':'CONSTRUCTION',
    'PWD':' (PWD)',
    'SWIM':'SWIMMING',
    'REGNL':'REGIONAL',
    'ACCTNG':'ACCOUNTING',
    'ENGR':'ENGINEER',
    'EQU':'EQUIPMENT',
    'EQ':'EQUIPMENT',
    'ANIM CNTL OFCR':'ANIMAL CONTROL OFFICER',
    'CLRK':'CLERK',
    'DEVELOP':'DEVELOPMENT',
    'PARKS AND REC':'PARKS AND RECREATION',
    'P AND R':'PARKS AND RECREATION',
    '\(PARK\)':'(PARKS AND RECREATION)',
    'SPC':'SPECIAL',
    'HDQ':'HEADQUARTER',
    'DISPCH':'DISPATCHER',
    'SUB':'SUBSTITUTE',
    'HE':'(HE)',
    'FGR PRT EV':'FINGERPRINT EVIDENCE',
    'CH':'CHIEF',
    'OP':'OPERATOR',
    'IBPDFLEET':'I (BPD FLEET',
    'IIBPDFLEET':'II (BPD FLEET',
    'EVIDENC TECHNCN':'EVIDENCE TECHNICIAN',
    'TCH':'TECHNICIAN',
    'SPECIAL$':'SPECIALIST'
}

abbrevs_regex = {}

for key in abbrevs:
    abbrevs[key] = ' '+abbrevs[key]+' '
    new_key = '(^|[\s\(\)\-])'+key+'([\s\(\)\-]|$)'
    abbrevs_regex[new_key] = abbrevs[key]

earnings_df['Title'] = earnings_df['Title'].replace(abbrevs_regex, regex=True).str.strip()
print('done')

done


In [10]:
earnings_df[['Department', 'Name', 'Title', 'Postal']].head(1000)

Unnamed: 0,Department,Name,Title,Postal
0,Assessing Department,"ABADI,KIDANI A",PROPERTY OFFICER (ASN),2118
1,ASD Office Of Labor Relation,"ABLON,JORDAN N",ASSISTANT CORPORATION COUNSEL III,2135
2,Transportation-Parking Clerk,"ACCARDI,PATRICIA",CHIEF CLAIMS INVESTIGATOR,2081
3,Boston Public Library,"ACKERLY,LYN E",SPECIAL LIBRARY ASSISTANT I,2118
4,Law Department,"ADAMS,CAREY L",PRINCIPAL CLERK,2131
5,Public Works Department,"ADAMS,DEAN",HIGHWAY MAINTENANCE FOREPERSON (PWD),2124
6,Boston Cntr - Youth & Families,"ADAMS,NATASHA",YOUTH WORKER,2124
7,ASD Human Resources,"ADARIO,ANTHONY J",SUPERVISING CLAIMS AGENT (ASD),2132
8,Property Management,"ADDESSA,ROCCO",JUNIOR BUILDING CUSTODIAN,2128
9,Boston Cntr - Youth & Families,"AFONSECA,JOSE",CERTIFIED SEASONAL LIFEGUARD,2124


In [11]:
#purpose: final cleaning for title column - put additional info between parens
new_title = earnings_df['Title'].str.split("(", n=1, expand=True)
new_title[0] = new_title[0].str.strip()
new_title[1] = new_title[1].replace({'\(':'','\)':''}, regex=True).str.strip()
#print(new_title)
earnings_df['Title'] = (new_title[0].astype(str)+ ' (' + new_title[1].astype(str) + ')').replace('\(None\)', '', regex=True).str.strip()

In [12]:
earnings_df.loc[earnings_df['Department'] == 'Boston Police Department', ['Department', 'Name', 'Title', 'Postal']].tail(1000)

Unnamed: 0,Department,Name,Title,Postal
9081,Boston Police Department,"DALERE,RAYMOND M",SENIOR RADIO COMMUNICATIONS TECHNICIAN,2136
9100,Boston Police Department,"DURDEN,JENNIFER D",POLICE DISPATCHER,2121
9108,Boston Police Department,"YUNG,MANG Y",MOTOR EQUIPMENT REPAIRPERSON CLASS I (BPD FLEE...,2130
9111,Boston Police Department,"WILLIS,MIRTA L",POLICE DISPATCHER,2126
9126,Boston Police Department,"ELIOPOULOS,ANTONIOS S",VIDEO FORENSIC ANALYST,2462
9146,Boston Police Department,"ERB,NICHOLAS",POLICE OFFICER,2132
9150,Boston Police Department,"KOSEK,JASON FRANK",SENIOR RADIO COMMUNICATIONS TECHNICIAN,2132
9160,Boston Police Department,"CARROLL,JENNIFER M",COMMUNICATIONS EQUIPMENT OPERATOR III R-13 (CT),2125
9163,Boston Police Department,"GUERINI III,ANDREW J",POLICE OFFICER,2122
9204,Boston Police Department,"BAMFORD,DANIEL JAMES",POLICE OFFICER,2122


In [13]:
print(earnings_df['Department'].unique().sort())

None


In [34]:
earnings_df['Department'] = earnings_df['Department'].replace({'\.':' ', '(?<=[a-z])([A-Z])':r' \1', '\/':' AND ', '\\\\':'', '&':' AND ', '\,':' ', '\#':''}, regex=True).str.upper()
earnings_df['Department'] = earnings_df['Department'].replace('\s+', ' ', regex=True).str.strip()

In [35]:
BPS_str = '^BPS|ELEMENTARY|ACADEMY|K-8|MIDDLE|HIGH|SCHOOL|ACAD$|PILOT| EEC$| ELC$| EES$|9-12|ACHIEVEMENT GAP|STUDENT|SUPERINTENDENT|CHIEF ACADEMIC OFFICER|ENGLISH LANGUAGE LEARN|ACCOUNTABILITY|^ADVANCEMENT|ENROLLMENT|^EARLY LEARNING|^HPEC|^INFO AND INSTR|TEACHING|CAREER AND TECHNICAL ED|CHIEF OF STAFF|CHIEF FINANCIAL OFFICER|CHIEF OPERATING OFFICER|^COMMUNICATIONS$|FOOD AND NUTRITION SVC|INNOVATION DEPARTMENT|INSTITUTIONAL ADVANCEMT|LEGAL ADVISOR|PROFESSIONAL DEVELOPMNT|RESEARCH ASSESS AND EVAL|STRATEGY DEPARTMENT'
earnings_df.loc[earnings_df['Department'].str.contains(BPS_str, regex=True),'Program'] = earnings_df.loc[earnings_df['Department'].str.contains(BPS_str, regex=True),'Department']
earnings_df.loc[earnings_df['Department'].str.contains(BPS_str, regex=True),'Department'] = 'BOSTON PUBLIC SCHOOLS'
earnings_df['Program'] = earnings_df['Program'].replace(np.nan,'')

In [36]:
depts = earnings_df['Department'].unique()
depts.sort()
print(depts)

['ADMINISTRATION AND FINANCE' 'AGE STRONG' 'ARTS AND CULTURAL DEVELOPMENT'
 'ASD GRAPHIC ARTS' 'ASD HUMAN RESOURCES' 'ASD INTERGVERNMTL RELATIONS'
 'ASD OFFICE OF BUDGET MANGMNT' 'ASD OFFICE OF LABOR RELATION'
 'ASD PURCHASING DIVISION' 'ASSESSING DEPARTMENT' 'AUDITING DEPARTMENT'
 'BOSTON CITY COUNCIL' 'BOSTON CNTR - YOUTH AND FAMILIES'
 'BOSTON FIRE DEPARTMENT' 'BOSTON POLICE DEPARTMENT'
 'BOSTON PUBLIC LIBRARY' 'BOSTON PUBLIC SCHOOLS'
 'BOSTON RETIREMENT SYSTEM' 'CEMETERY DIVISION' 'CITY CLERK'
 'COMM FOR PERSONS WITH DISABIL' 'CONSUMER AFFAIRS AND LICENSING'
 'DEPT OF VOTER MOBILIZATION' 'DIPLOMAS PLUS'
 'DPT OF INNOVATION AND TECHNOLOGY' 'ELDERLY COMMISSION'
 'ELECTION DIVISION' 'EMERGENCY MANAGEMENT' 'ENVIRONMENT DEPARTMENT'
 'FAIR HOUSING AND EQUITY' 'FINANCE COMMISSION' 'HBI RETIREES ET AL'
 'IMMIGRANT ADVANCEMENT' 'INSPECTIONAL SERVICES DEPT' 'LAW DEPARTMENT'
 'LICENSING BOARD' "MAYOR'S OFFICE" "MAYOR'S OFFICE-PUBLIC INFO"
 'NEIGHBORHOOD DEVELOPMENT' 'NEIGHBORHOOD SERVICES'
 '

In [37]:
budget_df = pd.read_csv('./data/operating_budget.csv', encoding='latin1')
budget_df = budget_df.drop_duplicates(['Cabinet','Dept','Program'])[['Cabinet','Dept','Program']].apply(lambda x: x.str.upper())
budget_df = budget_df.replace({'\.':' ', '\&':' AND ', '\,':' ', '\s+':' '}, regex=True).apply(lambda x: x.str.strip())
budget_df.replace({'LIBRARY DEPARTMENT':'BOSTON PUBLIC LIBRARY', 
                   ' W/':' WITH ', 
                   'LICENSING_BOARD':'LICENSING BOARD',
                   'BOSTON VETS':"VETERANS' SERVICES'"})

Unnamed: 0,Cabinet,Dept,Program
0,MAYORS CABINET,MAYOR'S OFFICE,MAYOR'S ADMINISTRATION
5,MAYORS CABINET,MAYOR'S OFFICE,MAYOR'S EXECUTIVE
7,MAYORS CABINET,MAYOR'S OFFICE,MAYOR'S POLICY AND PLANNING
11,MAYORS CABINET,MAYOR'S OFFICE,NEW URBAN MECHANICS
16,MAYORS CABINET,MAYOR'S OFFICE,MAYOR'S COMMUNICATIONS
20,MAYORS CABINET,ELECTION DEPARTMENT,ELECTION ADMINISTRATION
25,MAYORS CABINET,ELECTION DEPARTMENT,VOTER REGISTRATION
28,MAYORS CABINET,ELECTION DEPARTMENT,ELECTION ACTIVITIES
32,MAYORS CABINET,ELECTION DEPARTMENT,ANNUAL LISTING
35,MAYORS CABINET,INTERGOVERNMENTAL RELATIONS,IGR


In [38]:
budget_df

Unnamed: 0,Cabinet,Dept,Program
0,MAYORS CABINET,MAYOR'S OFFICE,MAYOR'S ADMINISTRATION
5,MAYORS CABINET,MAYOR'S OFFICE,MAYOR'S EXECUTIVE
7,MAYORS CABINET,MAYOR'S OFFICE,MAYOR'S POLICY AND PLANNING
11,MAYORS CABINET,MAYOR'S OFFICE,NEW URBAN MECHANICS
16,MAYORS CABINET,MAYOR'S OFFICE,MAYOR'S COMMUNICATIONS
20,MAYORS CABINET,ELECTION DEPARTMENT,ELECTION ADMINISTRATION
25,MAYORS CABINET,ELECTION DEPARTMENT,VOTER REGISTRATION
28,MAYORS CABINET,ELECTION DEPARTMENT,ELECTION ACTIVITIES
32,MAYORS CABINET,ELECTION DEPARTMENT,ANNUAL LISTING
35,MAYORS CABINET,INTERGOVERNMENTAL RELATIONS,IGR


In [50]:
#create pivot table displaying the number of people in each dept by year


#purpose: get pivot table of employee counts by department
    #dept is an optional column that specifies what department to filter on, set to everything by default
def count_by_dept(dept=r'(.*?)'):
    
    #create dataframe with department, year, and counts
    dept_counts = earnings_df.groupby(['Department', 'Year'])['Total'].count().reset_index(name="count")
    
    #remove BPS schools from result
    #dept_counts = dept_counts.loc[dept_counts['Department'].str.contains(BPS_str, regex=True) == False]
    
    #create the pivot table table, with calculated sums for each row and column
    dept_counts_table = pd.pivot_table(dept_counts, values='count', index='Department', columns='Year', aggfunc='sum', fill_value=0, margins=True).reset_index()
    
    #get rid of the calculated sums by row, doesn't make sense for time series data
    return dept_counts_table.iloc[:,0:-1].loc[dept_counts_table['Department'].str.match(dept)]

count_by_dept('OFC OF STRTS TRNSP AND SANI')

Year,Department,2011,2012,2013,2014,2015,2016,2017,2018,2019
42,OFC OF STRTS TRNSP AND SANI,0,0,0,19,18,0,0,0,0


In [None]:
dept_abbrevs = {
    'OFFICE OF FINANCE AND BUDGET':'ADMINISTRATION AND FINANCE', #*OFFICE OF FINANCE AND BUDGET: 2014 - 2015
    'OFFICE OF ADMIN AND FINANCE':'ADMINISTRATION AND FINANCE', 
    'ARTS AND CULTURAL DEVELOPMENT':'OFFICE OF ARTS AND CULTURE', 
    'ASD HUMAN RESOURCES':'HUMAN RESOURCES',
    'ASD INTERGVERNMTL RELATIONS':'INTERGOVERNMENTAL RELATIONS',
    'ASD OFFICE OF BUDGET MANGMNT':'BUDGET MANAGEMENT',
    'ASD OFFICE OF LABOR RELATION':'OFFICE OF LABOR RELATIONS',
    'ASD PURCHASING DIVISION':'PROCUREMENT', #**PURCHASING
    'BOSTON CITY COUNCIL':'CITY COUNCIL',
    'BOSTON CNTR - YOUTH AND FAMILIES':'BOSTON CENTER FOR YOUTH AND FAMILIES',
    'BOSTON FIRE DEPARTMENT':'FIRE DEPARTMENT',
    'BOSTON POLICE DEPARTMENT':'POLICE DEPARTMENT',
    #CEMETARY DIVISION -> PARKS AND RECREATION DEPARTMENT, CEMETARY
    'BOSTON RETIREMENT SYSTEM':'RETIREMENT DEPARTMENT',
    'STATE BOSTON RETIREMENT SYST':'RETIREMENT DEPARTMENT',
    'COMM FOR PERSONS WITH DISABIL':'COMMISSION FOR PERSONS WITH DISABILITIES',
    'DEPT OF VOTER MOBILIZATION':'ELECTION DEPARTMENT', #*DEPARTMENT OF VOTER MOBILIZATION: <2013
    'DPT OF INNOVATION AND TECHNOLOGY':'DEPARTMENT OF INNOVATION AND TECHNOLOGY',
    'ELDERLY COMMISSION':'AGE STRONG', #*ELDERLY COMMISSION <2018
    'ELECTION DIVISION':'ELECTION DEPARTMENT',
    'IMMIGRANT ADVANCEMENT':'OFFICE FOR IMMIGRANT ADVANCEMENT',
    'INSPECTIONAL SERVICES DEPT':'INSPECTIONAL SERVICES DEPARTMENT',
    #LICENSING BOARD -> CONSUMER AFFAIRS AND LICENSING, LICESNING BOARD
    #MAYOR'S OFFICE-PUBLIC INFO -> MAYOR'S OFFICE, MAYOR'S COMMUNICATIONS
    #OFC BOSTON RESIDENTS JOB POL -> OFFICE OF ECONOMIC DEVELOPMENT, BOSTON RESIDENTS JOB POLICY OFFICE
    'OFC CHF PUBLIC WORKS TRANSPORT':'OFFICE OF STREETS', #*OFFICE OF CHIEF OF PUBLIC WORKS AND TRANSPORT <2013
    'OFC OF STRTS TRNSP AND SANI':'OFFICE OF STREETS', #*OFFICE OF STREETS, TRANSPORTATION, AND SANITATION 2014 - 2015
    'OFFICE OF CIVIL RIGHTS':'FAIR HOUSING AND EQUITY', #*OFFICE OF CIVIL RIGHTS <2013
    'OFFICE OF NEW BOSTONIANS':'OFFICE FOR IMMIGRANT ADVANCEMENT', #*OFFICE OF NEW BOSTONIANS <2017
    'PARKS DEPARTMENT':'PARKS AND RECREATION DEPARTMENT',
    'PROPERTY MANAGEMENT':'PROPERTY MANAGEMENT DEPARTMENT',
    #SMALL AND LOCAL BUSINESS -> OFFICE OF ECONOMIC DEVELOPMENT, SMALL AND LOCAL BUSINESS
    #TRAFFIC DIVISION -> TRANSPORTATION DEPARTMENT, TRAFFIC DIVISION
    #TRANSPORTATION-PARKING CLERK -> TRANSPORTATION DEPARTMENT, PARKING CLERK
    #TREASURY-COLLECTING DIVISION -> TREASURY DEPARTMENT, TREASURY DIVISION
    #TREASURY-TREASURY DIVISION -> TREASURY DEPARTMENT, COLLECTING DIVISION
    #WOMEN'S COMMISSION -> WOMEN'S ADVANCEMENT, WOMEN'S COMMISSION
    #WORKERS COMPENSATION SERVICE -> HUMAN RESOURCES, WORKERS COMP
    'YOUTH FUND':'YOUTH ENGAGEMENT AND EMPLOYMENT' #*YOUTH FUND <2013
}

In [None]:
#Arts & Cultural Development -> Office of Arts & Culture
#Transportation Department -> Traffic Division
#Women's Commission -> Women's Advancement
#Dept of Voter Mobilization -> Election Division
#Youth Fund -> Youth Engagement & Employment

In [None]:
#purpose: get pivot table of aggregates by department
    #column indicates what column to use for aggregation (i.e. Injured, Regular, Total, etc.)
    #aggfunc is the aggregation function (mean, sum, max, min)
    #dept is an optional column that specifies what department to filter on, set to everything by default
def agg_by_dept(column, aggfunc, dept=r'(.*?)'):
    
    #create dataframe with department, year, and aggregate column
    dept_costs = earnings_df.loc[earnings_df[column]>0].groupby(['Department', 'Year'])[column].agg(aggfunc).reset_index(name="Costs")
    
    #remove BPS schools from result
    dept_costs = dept_costs.loc[dept_costs['Department'].str.contains(BPS_str, regex=True) == False]
    
    #create the pivot table table, with calculated aggregate for each row and column
    dept_costs_table = pd.pivot_table(dept_costs, values='Costs', index='Department', columns='Year', aggfunc = aggfunc, fill_value=0, margins=True).reset_index()
    
    #get rid of the calculated aggregate by row, doesn't make sense for time series data
    return dept_costs_table.iloc[:,0:-1].loc[dept_costs_table['Department'].str.match(dept)]

agg_by_dept('Injured', 'median', 'Boston Police Department|Boston Fire Department')

In [None]:
#purpose: get pivot table of employee counts by department and title
    #column indicates what column to count on (i.e. Injured or Total)
    #dept is an optional column that specifies what department to filter on, set to everything by default
    #title is an optional column that specifies what title to filter on, set to everything by default, not reccommended without setting dept first
def count_by_title(column, dept=r'(.*?)', title=r'(.*?)'):
    
    #create dataframe with title, year, and counts
    job_counts = earnings_df.loc[earnings_df['Department'].str.match(dept)]
    job_counts = job_counts.loc[job_counts[column]>0].groupby(['Title', 'Year'])[column].count().reset_index(name="count")
    
    #create the pivot table table, with calculated sums for each row and column
    job_counts_table = pd.pivot_table(job_counts, values='count', index='Title', columns='Year', aggfunc=np.sum, fill_value=0, margins=True).reset_index()
    
    #get rid of the calculated sums by row, doesn't make sense for time series data
    return job_counts_table.loc[job_counts_table['Title'].str.match(title)].sort_values(by='All', ascending=False).iloc[:,0:-1]

count_by_title('Injured')

In [None]:
#purpose: get pivot table of chosen aggregate by department and title
    #column indicates what column to aggregate on (i.e. Injured, Regular, Total, etc.)
    #dept is an optional column that specifies what department to filter on, set to everything by default
    #title is an optional column that specifies what title to filter on, set to everything by default, not reccommended without setting dept first
def agg_by_title(column, aggfunc, dept=r'(.*?)', title=r'(.*?)'):
    
    #create dataframe with title, year, and aggregate column
    job_pay = earnings_df.loc[earnings_df['Department'].str.match(dept)].groupby(['Title', 'Year'])[column].agg(aggfunc).reset_index(name="Costs")
    
    #create the pivot table table, with calculated aggregate for each row and column
    job_pay_table = pd.pivot_table(job_pay, values='Costs', index='Title', columns='Year', fill_value=0, margins=True).reset_index()
    
    #get rid of the calculated sums by row, doesn't make sense for time series data
    return job_pay_table.loc[job_pay_table['Title'].str.match(title)].sort_values(by='All', ascending=False).iloc[:,0:-1]

agg_by_title('Total', 'mean', 'Mayor\'s Office', 'Chief Diversity Officer')

In [None]:
print(earnings_df.loc[(earnings_df['Injured']>0) & (earnings_df['Year']==2014) & (earnings_df['Department']=='Boston Police Department'), ['Name', 'Title', 'Injured']])

In [None]:
count_by_title('Total', title='Prin')