In [79]:
import pandas as pd
import numpy as np
#notebook-wide display settings: show up to 300 rows and print floats with two decimals
pd.set_option('display.max_rows', 300)
pd.set_option('display.float_format', '{:.2f}'.format)

In [91]:
#purpose: create our dataframe

#default column headers
col_headers = ['Name', 'Department', 'Title', 'Regular', 'Retro', 'Other', 'Overtime', 'Injured', 'Detail', 'Quinn', 'Total', 'Postal']

#in the 2013 and 2014 datasets, title and department are in the wrong order,
#so those files are read with the two labels swapped
swapped_headers = col_headers.copy()
swapped_headers[1], swapped_headers[2] = swapped_headers[2], swapped_headers[1]
years_with_swapped_columns = {2013, 2014}

#read each year's CSV into its own frame and combine them once after the loop;
#concatenating inside the loop re-copies all earlier rows on every pass and
#depends on concatenating an empty seed frame
yearly_frames = []
for year in range(2011, 2020):

    #pick the label order that matches this year's file layout
    names = swapped_headers if year in years_with_swapped_columns else col_headers

    #skiprows=1 skips the first line of the file so the names above are used instead
    next_df = pd.read_csv('./data/salaries_' + str(year) + '.csv', skiprows=1, names=names, encoding='latin1')

    #add the column 'Year' to next_df and set to the given year
    next_df['Year'] = year

    yearly_frames.append(next_df)

#stack all years; columns are aligned by name and follow the order of the first (2011) frame.
#each file keeps its own row index, so index labels repeat across years
earnings_df = pd.concat(yearly_frames, sort=False)

In [92]:
#purpose: clean our dataframe

#dimensions are qualitative columns, facts are quantitative columns
dimensions = ['Name', 'Department', 'Title', 'Postal']
facts = ['Regular', 'Retro', 'Other', 'Overtime', 'Injured', 'Detail', 'Quinn', 'Total']

#drop stray header rows; .copy() so the assignments below write to an independent frame
earnings_df = earnings_df.loc[earnings_df['Department'].astype(str).str.strip() != 'DEPARTMENT_NAME'].copy()

#convert dimensions from type 'object' to 'string' and trim whitespace
#(done before the exact-match fixes below so padded values are caught too)
earnings_df[dimensions] = earnings_df[dimensions].astype(str).apply(lambda col: col.str.strip())

#targeted (hard-coded) cleaning for specific department names
department_fixes = {
    'Boston Cntr-Youth & Families': 'Boston Cntr - Youth & Families',
    'DND Neighborhood Development': 'Neighborhood Development',
}
for old_name, new_name in department_fixes.items():
    earnings_df.loc[earnings_df['Department'] == old_name, 'Department'] = new_name

#convert all postal codes to same format
#if a year's Postal column was parsed as float, astype(str) yields text like '2118.0';
#remove that suffix first, otherwise the length checks below would cut it to '2118.'
earnings_df['Postal'] = earnings_df['Postal'].str.replace(r'\.0$', '', regex=True)
mask = earnings_df['Postal'].str.len() == 4
earnings_df.loc[mask, 'Postal'] = '0' + earnings_df.loc[mask, 'Postal']
mask = earnings_df['Postal'].str.len() > 5
earnings_df.loc[mask, 'Postal'] = earnings_df.loc[mask, 'Postal'].str.slice(0,5)

def _normalize_amount_text(col):
    """Turn one column of raw money strings into text that float() accepts.

    Parenthesised amounts such as '(1,356.08)' become '-1356.08'. The previous
    regex map replaced ')' with the integer 0, and pandas swaps the whole cell
    when the replacement is not a string, so every such amount was turned into 0.
    """
    return (
        col.str.strip()
           #drop currency symbols, thousands separators and inner spaces
           .str.replace(r'[$, ]', '', regex=True)
           #accounting-style negatives: remove ')' and turn a leading '(' into '-'
           .str.replace(')', '', regex=False)
           .str.replace(r'^\(', '-', regex=True)
           #placeholders for "no amount" count as zero
           .str.replace(r'^(?:-|None|nan)$', '0', regex=True)
    )

#clean the facts columns and convert from type 'object' to 'float'
earnings_df[facts] = earnings_df[facts].astype(str).apply(_normalize_amount_text).astype(float)

#cast year to type int
earnings_df['Year'] = earnings_df['Year'].astype(int)

In [94]:
#preview the first 50 cleaned rows
earnings_df.head(n=50)

Unnamed: 0,Name,Department,Title,Regular,Retro,Other,Overtime,Injured,Detail,Quinn,Total,Postal,Year
0,"Abadi,Kidani A",Assessing Department,Property Officer (Asn),33065.38,0.0,0.0,379.49,0.0,0.0,0.0,33444.87,2118,2011
1,"Ablon,Jordan N",ASD Office Of Labor Relation,Asst Corp Counsel III,76051.24,0.0,1321.03,0.0,0.0,0.0,0.0,77372.27,2135,2011
2,"Accardi,Patricia",Transportation-Parking Clerk,Chief Claims Investigator,56430.79,0.0,0.0,418.58,0.0,0.0,0.0,56849.37,2081,2011
3,"Ackerly,Lyn E.",Boston Public Library,Spec Library Asst I,35058.78,0.0,0.0,439.19,0.0,0.0,0.0,35497.97,2118,2011
4,"Adams,Carey L.",Law Department,Prin Clerk,41588.83,0.0,0.0,0.0,0.0,0.0,0.0,41588.83,2131,2011
5,"Adams,Dean",Public Works Department,Highway Maint Frprs (Pwd)##,37693.81,0.0,6761.54,11485.32,0.0,0.0,0.0,55940.67,2124,2011
6,"Adams,Natasha",Boston Cntr - Youth & Families,Youth Worker,38330.84,0.0,0.0,47.14,0.0,0.0,0.0,38377.98,2124,2011
7,"Adario,Anthony J",ASD Human Resources,Supvising Claims Agent (Asd),85214.42,0.0,1629.36,0.0,0.0,0.0,0.0,86843.78,2132,2011
8,"Addessa,Rocco",Property Management,Jr Building Custodian,39322.79,0.0,0.0,3720.81,0.0,0.0,0.0,43043.6,2128,2011
9,"Afonseca,Jose",Boston Cntr - Youth & Families,Certified Seasonal Lifeguard,4400.45,0.0,0.0,137.99,0.0,0.0,0.0,4538.44,2124,2011


In [6]:
#purpose: find out how the number of employees in each department changed throughout the years

#create pivot table displaying the number of people in each dept by year

#regex alternation of department-name fragments; departments matching it are
#treated as BPS schools and removed from the department tables
BPS_str = '|'.join((
    '^BPS ',
    'Elementary',
    'Academy',
    'K-8',
    'Middle',
    'High',
    'School',
    'Acad$',
    'Pilot',
    ' EEC$',
    ' ELC$',
    ' EES$',
    '9-12',
    'Achievement Gap',
    'Student',
    'Superintendent',
))

def count_by_dept(column, dept=r'(.*?)'):
    """Pivot table of how many rows have a positive `column`, per department and year.

    column: name of a numeric column of earnings_df; only rows with a value > 0 are counted.
    dept:   regex applied with str.match (anchored at the start of the name) to pick
            which departments are returned; the default matches every row.

    Departments matching BPS_str are left out. The result has one row per department
    (plus the 'All' totals row when `dept` matches it) and one column per year; the
    trailing 'All' column of per-row totals is dropped.
    """
    #keep only rows where the given column is greater than 0
    positive_rows = earnings_df[earnings_df[column] > 0]

    #number of such rows for every (department, year) pair
    per_dept_year = (
        positive_rows
        .groupby(['Department', 'Year'])[column]
        .count()
        .reset_index(name="count")
    )

    #remove BPS schools from result
    is_school = per_dept_year['Department'].str.contains(BPS_str, regex=True)
    non_school = per_dept_year[~is_school]

    #departments down the side, years across the top, margins add 'All' totals
    table = non_school.pivot_table(
        values='count',
        index='Department',
        columns='Year',
        aggfunc='sum',
        fill_value=0,
        margins=True,
    ).reset_index()

    #the last column holds per-row totals, which make no sense for time series data
    wanted = table['Department'].str.match(dept)
    return table.iloc[:, :-1].loc[wanted]

count_by_dept('Injured')

Year,Department,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,ASD Purchasing Division,0,0,0,0,1,1,0,0,1
1,Age Strong,0,0,0,0,0,0,0,0,3
2,Assessing Department,0,0,0,1,0,1,1,0,0
3,Auditing Department,0,0,0,1,1,0,1,1,0
4,Boston City Council,0,0,0,1,0,0,0,0,0
5,Boston Cntr - Youth & Families,2,6,7,2,2,1,2,1,4
6,Boston Fire Department,514,466,509,1013,495,495,441,442,754
7,Boston Police Department,360,327,368,826,342,449,475,459,457
8,Boston Public Library,5,1,2,2,1,1,2,2,3
9,Cemetery Division,3,3,2,4,4,2,1,3,2


In [7]:
#Arts & Cultural Development -> Office of Arts & Culture
#Transportation Department -> Traffic Division
#Women's Commission -> Women's Advancement
#Dept of Voter Mobilization -> Election Division
#Youth Fund -> Youth Engagement & Employment

In [8]:
#purpose: easily get pivot table of aggregates by department

def agg_by_dept(column, aggfunc, dept=r'(.*?)'):
    """Pivot table of `aggfunc` applied to the positive values of `column`, per department and year.

    column:  name of a numeric column of earnings_df; rows with a value <= 0 are ignored.
    aggfunc: aggregation passed to both the groupby and the pivot table (e.g. 'median').
    dept:    regex applied with str.match to choose the departments returned.

    Departments matching BPS_str are excluded and the trailing 'All' column is dropped.
    """
    #aggregate the positive values for every (department, year) pair
    positive_rows = earnings_df[earnings_df[column] > 0]
    per_dept_year = (
        positive_rows
        .groupby(['Department', 'Year'])[column]
        .agg(aggfunc)
        .reset_index(name="Costs")
    )

    #leave out BPS schools
    is_school = per_dept_year['Department'].str.contains(BPS_str, regex=True)
    non_school = per_dept_year[~is_school]

    #departments as rows, years as columns
    table = non_school.pivot_table(
        values='Costs',
        index='Department',
        columns='Year',
        aggfunc=aggfunc,
        fill_value=0,
        margins=True,
    ).reset_index()

    #drop the final 'All' column, then keep the requested departments
    wanted = table['Department'].str.match(dept)
    return table.iloc[:, :-1].loc[wanted]
agg_by_dept('Injured', 'median', 'Boston Police Department|Boston Fire Department')

Year,Department,2011,2012,2013,2014,2015,2016,2017,2018,2019
6,Boston Fire Department,10077.1,11000.65,10033.58,2111.12,9761.53,9122.28,8778.47,10325.38,6758.73
7,Boston Police Department,7455.53,6787.27,9174.69,1041.95,12528.32,10279.39,8690.22,16429.05,26432.22


In [56]:
def count_by_title(column, dept=r'(.*?)', title=r'(.*?)'):
    """Pivot table of how many rows have a positive `column`, per job title and year.

    column: name of a numeric column of earnings_df; only rows with a value > 0 are counted.
    dept:   regex applied with str.match to restrict the rows to certain departments.
    title:  regex applied with str.match to choose which titles are returned.

    Rows are ordered by their total across all years, largest first; the 'All'
    column used for that ordering is dropped from the returned table.
    """
    #restrict to the requested departments
    job_counts = earnings_df.loc[earnings_df['Department'].str.match(dept)]

    #count rows with a positive value for every (title, year) pair
    job_counts = job_counts.loc[job_counts[column]>0].groupby(['Title', 'Year'])[column].count().reset_index(name="count")

    #the string 'sum' matches count_by_dept; newer pandas releases warn when given the NumPy callable
    job_counts_table = pd.pivot_table(job_counts, values='count', index='Title', columns='Year', aggfunc='sum', fill_value=0, margins=True).reset_index()

    #keep the requested titles, rank by the 'All' total, then drop that last column
    return job_counts_table.loc[job_counts_table['Title'].str.match(title)].sort_values(by='All', ascending=False).iloc[:,0:-1]
count_by_title('Injured')

ValueError: Cannot index with multidimensional key

In [59]:
def agg_by_title(column, aggfunc, dept=r'(.*?)', title=r'(.*?)'):
    """Pivot table of `aggfunc` applied to `column`, per job title and year.

    column:  name of a numeric column of earnings_df.
    aggfunc: aggregation used when grouping by title and year (e.g. 'mean').
    dept:    regex applied with str.match to restrict the rows to certain departments.
    title:   regex applied with str.match to choose which titles are returned.

    Rows are ordered by the pivot table's 'All' margin, largest first, and that
    margin column is then dropped from the result.
    """
    #aggregate the column for every (title, year) pair within the chosen departments
    in_dept = earnings_df['Department'].str.match(dept)
    per_title_year = (
        earnings_df[in_dept]
        .groupby(['Title', 'Year'])[column]
        .agg(aggfunc)
        .reset_index(name="Costs")
    )

    #titles as rows, years as columns; the pivot uses its default aggregation
    table = per_title_year.pivot_table(
        values='Costs',
        index='Title',
        columns='Year',
        fill_value=0,
        margins=True,
    ).reset_index()

    #filter to the requested titles, rank by 'All', then remove that last column
    title_hits = table['Title'].str.match(title)
    ranked = table[title_hits].sort_values(by='All', ascending=False)
    return ranked.iloc[:, :-1]
agg_by_title('Total', 'mean', 'Mayor\'s Office', 'Chief Diversity Officer')

Year,Title,2011,2012,2013,2014,2015,2016,2017,2018,2019
4,Chief Diversity Officer,0.0,0.0,0.0,0.0,97292.38,96753.68,103961.62,114576.95,112576.95


In [12]:
#list the Boston Police Department rows with a positive 'Injured' amount in 2014
injured_bpd_2014 = (
    (earnings_df['Injured'] > 0)
    & (earnings_df['Year'] == 2014)
    & (earnings_df['Department'] == 'Boston Police Department')
)
print(earnings_df.loc[injured_bpd_2014, ['Name', 'Title', 'Injured']])

                     Name                           Title  Injured
17221    Abasciano,Joseph                  Police Officer 60753.78
17231       Acosta,Carina                  Police Officer  2326.64
17245    Ajemian,Gerald F  Police Offc Comm Serv Offc 3$8   999.10
17251      Alfonso,Jose M      Police Offc Acad Instr 2$6    26.25
17258       Almeida,Ana c  Police Offc/Juvenile Offc 4$10  1268.79
...                   ...                             ...      ...
20382       Younger,Atiya                  Police Officer   898.48
20383   Younger,Vatchel S                  Police Officer 11445.98
20390    Zanoli,Joseph M.                  Police Officer  2814.97
20396  Zographos,Peter A.     Police Offc Mobile Offc 2$6  1222.84
20397   Zubrin,William W.    Police Offc/Auto Invest 4$10    33.76

[826 rows x 3 columns]


In [50]:
#mean regular pay for every title, across all departments
agg_by_title(column='Regular', aggfunc='mean')

Year,Title,2011,2012,2013,2014,2015,2016,2017,2018,2019
849,FCommissioner/Chief of theDept,0.00,0.00,0.00,0.00,212438.54,201307.77,205207.77,233461.57,249615.33
392,Chief of Support Services,0.00,0.00,0.00,170007.27,204892.59,207578.28,217822.94,214984.43,236325.88
448,Commissioner (Bpd),174200.00,174200.00,155194.45,186203.87,207017.37,221323.02,229999.90,185336.14,249999.88
1900,Superintendent,266750.10,266750.12,197128.97,254357.58,126523.14,132025.96,264660.69,119230.71,132211.51
732,Dist Fire Chief-Adm Asst Dvmtr,0.00,0.00,0.00,0.00,0.00,174984.32,179271.73,165650.95,197961.99
...,...,...,...,...,...,...,...,...,...,...
423,Cluster Administrator,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
927,Fire Captain-ADR,-1.20,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
883,FF LEP Title 3 Inspector - ADR,0.00,0.00,-1356.08,1195.32,0.00,0.00,0.00,0.00,0.00
869,FF (Inct Comnd Sp) DEP-ADR,-569.42,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [55]:
#show every row whose title is exactly 'Advisor'
is_advisor = earnings_df['Title'].eq('Advisor')
print(earnings_df[is_advisor])

             Name      Department    Title   Regular  Retro     Other  \
5381  Chang,Tommy  Superintendent  Advisor 149117.42   0.00 301465.78   

      Overtime  Injured  Detail  Quinn     Total Postal  Year  
5381      0.00     0.00    0.00   0.00 450583.20  92861  2018  
