# Analyzing New York City employees' payroll database 

## Data Source: [NYC open data](https://data.cityofnewyork.us/City-Government/Citywide-Payroll-Data-Fiscal-Year-/k397-673e/data)

In [1]:
import pandas as pd
# Load the citywide payroll export into a single frame.
df = pd.read_csv('Citywide_Payroll_Data__Fiscal_Year_.csv')

# Normalise the headers so columns can be used as attributes:
# spaces and hyphens become underscores, then everything is lower-cased.
df.columns = (
    df.columns
    .str.replace(" ", "_")
    .str.replace("-", "_")
    .str.lower()
)

# Display options: show every column and format floats with thousands
# separators and two decimals. (Set 'display.max_rows' to None as well if
# every row should be printed.)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:,.2f}'.format)



In [2]:
# Sanity check: (rows, columns) of the frame right after loading.
df.shape

(2864545, 17)

Previous versions of the dataset didn't import all the years correctly, so I'll repeat these sanity checks a couple of times throughout the notebook.

In [3]:
# Rows per fiscal year, to confirm that every year was imported.
df['fiscal_year'].value_counts()

2019    592431
2020    590210
2021    573477
2017    562266
2018    546161
Name: fiscal_year, dtype: int64

#### Cleaning the data

The database includes people whose work locations are outside of NYC. For this analysis, I'm only including employees with work locations in NYC boroughs. Since Staten Island wasn't listed (unless it is included in the "other" location), this is filtered down to Queens, Manhattan, Bronx and Brooklyn.

In [4]:
# Work-location values kept for the analysis (upper-case, as passed to the
# borough filter).
# NOTE(review): Staten Island may be recorded under its county name
# ("RICHMOND") rather than being absent; check
# df.work_location_borough.value_counts() before relying on this list.
boroughs = [
    'QUEENS',
    'MANHATTAN',
    'BROOKLYN',
    'BRONX',
]

In [5]:
# Keep only the rows whose work location is one of the selected boroughs.
df = df.loc[df['work_location_borough'].isin(boroughs)]

In [6]:
# Shape after the borough filter, to see how many rows were dropped.
df.shape

(2760682, 17)

Note: This reduced the dataset by 103,863 rows.

In [7]:
# The ten payroll rows with the highest overtime pay, largest first.
(
    df
    .sort_values('total_ot_paid', ascending=False)
    .head(10)
)

Unnamed: 0,fiscal_year,payroll_number,agency_name,last_name,first_name,mid_init,agency_start_date,work_location_borough,title_description,leave_status_as_of_june_30,base_salary,pay_basis,regular_hours,regular_gross_paid,ot_hours,total_ot_paid,total_other_pay
2291076,2021,996.0,NYC HOUSING AUTHORITY,PROCIDA,ROBERT,,04/13/1987,BRONX,SUPERVISOR PLUMBER,ACTIVE,387.03,per Day,1820.0,100627.8,2249.5,248749.72,7215.34
2291070,2021,816.0,DEPT OF HEALTH/MENTAL HYGIENE,MCGROARTY,MICHAEL,,10/06/2014,QUEENS,STATIONARY ENGINEER,ACTIVE,508.8,per Day,2080.0,132288.0,2374.75,238829.13,40105.0
2291085,2021,996.0,NYC HOUSING AUTHORITY,MARKOWSKI,JAKUB,,05/31/2016,BRONX,PLUMBER,ACTIVE,369.53,per Day,1820.0,96077.8,2119.5,223776.86,5899.29
2291072,2021,816.0,DEPT OF HEALTH/MENTAL HYGIENE,PETTIT,PATRICK,J,08/02/2010,MANHATTAN,STATIONARY ENGINEER,ACTIVE,508.8,per Day,2080.0,132288.0,2152.75,218694.96,38611.82
2291071,2021,816.0,DEPT OF HEALTH/MENTAL HYGIENE,HALLAHAN,PATRICK,M,02/26/2018,BROOKLYN,STATIONARY ENGINEER,ACTIVE,508.8,per Day,2080.0,132288.0,2115.25,218628.18,56616.07
2291081,2021,3.0,BOARD OF ELECTION,"ORTIZ, JR",ANTONIO,,08/27/1995,MANHATTAN,SENIOR SYSTEMS ANALYSTS,ACTIVE,117003.0,per Annum,1820.0,116673.77,2461.25,217915.94,2974.95
2234227,2020,996.0,NYC HOUSING AUTHORITY,PROCIDA,ROBERT,,04/13/1987,BRONX,SUPERVISOR PLUMBER,ACTIVE,387.03,per Day,1820.0,100627.8,1944.5,215022.81,6468.93
2234228,2020,996.0,NYC HOUSING AUTHORITY,ORTIZ,JOSE,,11/27/1989,QUEENS,SUPERVISOR PLUMBER,ACTIVE,387.03,per Day,1820.0,100627.8,1937.5,214248.85,5860.74
1069369,2018,996.0,NYC HOUSING AUTHORITY,GIURBINO,VINCENZO,,04/28/2003,BROOKLYN,PLUMBER,ACTIVE,361.48,per Day,1825.0,93984.8,2043.0,213634.68,7539.44
2291078,2021,996.0,NYC HOUSING AUTHORITY,DALEY,GARFIELD,D,05/24/1994,BRONX,SUPERVISOR ELECTRICIAN,ACTIVE,460.25,per Day,1820.0,119469.25,2032.5,200038.56,28316.97


In [8]:
import re
# Number of matching payroll rows per agency for plumber titles.
#
# str.contains interprets the pattern as a regular expression by default, so
# the leading "." requires at least one character before "PLUMBER":
# "SUPERVISOR PLUMBER" matches, a title that is exactly "PLUMBER" does not.
# NOTE(review): confirm that leaving out bare "PLUMBER" titles is intended.
#
# na=False turns missing titles into False directly, replacing the
# `== True` comparison on the raw result.
is_plumber = df['title_description'].str.contains('.PLUMBER', na=False)
df[is_plumber].agency_name.value_counts()

NYC HOUSING AUTHORITY             83
DEPARTMENT OF EDUCATION ADMIN     34
DEPT OF ENVIRONMENT PROTECTION    27
DEPT OF PARKS & RECREATION        23
FIRE DEPARTMENT                   22
DEPT. OF HOMELESS SERVICES        14
DEPARTMENT OF SANITATION          12
POLICE DEPARTMENT                 12
HRA/DEPT OF SOCIAL SERVICES       10
DEPARTMENT OF CORRECTION           5
DEPARTMENT OF TRANSPORTATION       5
DEPT OF CITYWIDE ADMIN SVCS        5
COMMUNITY COLLEGE (HOSTOS)         1
Name: agency_name, dtype: int64

In [9]:
# All payroll rows whose title matches the plumber pattern.
# na=False treats missing titles as non-matches (no `== True` needed).
is_plumber = df['title_description'].str.contains('.PLUMBER', na=False)
df[is_plumber]

Unnamed: 0,fiscal_year,payroll_number,agency_name,last_name,first_name,mid_init,agency_start_date,work_location_borough,title_description,leave_status_as_of_june_30,base_salary,pay_basis,regular_hours,regular_gross_paid,ot_hours,total_ot_paid,total_other_pay
100265,2017,,DEPARTMENT OF CORRECTION,NELSON,PATRICK,M,05/01/1989,QUEENS,SUPERVISOR PLUMBER,ACTIVE,378.98,per Day,1825.00,102324.60,574.75,64794.76,0.00
106750,2017,,DEPARTMENT OF EDUCATION ADMIN,BLACKBURN,JULIAN,U,03/29/1993,BROOKLYN,SUPERVISOR PLUMBER,ACTIVE,378.98,per Day,1825.00,102324.60,171.75,21304.09,0.00
107020,2017,,DEPARTMENT OF EDUCATION ADMIN,BRIDGWOOD,DAVID,E,03/16/1982,BROOKLYN,SUPERVISOR PLUMBER,ACTIVE,378.98,per Day,1825.00,102324.60,155.50,23117.78,0.00
107719,2017,,DEPARTMENT OF EDUCATION ADMIN,CATANIA,CARMEL,,12/18/1987,BROOKLYN,SUPERVISOR PLUMBER,ACTIVE,378.98,per Day,1825.00,102324.60,500.00,64155.90,0.00
114309,2017,,DEPARTMENT OF EDUCATION ADMIN,MASON,HENRY,,11/02/1987,BROOKLYN,SUPERVISOR PLUMBER,ACTIVE,378.98,per Day,1825.00,98299.60,162.00,16731.36,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2408001,2021,71.00,DEPT. OF HOMELESS SERVICES,AVITTO,RICHARD,,04/17/1994,BROOKLYN,SUPERVISOR PLUMBER,ACTIVE,387.03,per Day,1820.00,100627.80,10.00,1105.80,0.00
2410599,2021,740.00,DEPARTMENT OF EDUCATION ADMIN,BRIDGWOOD,DAVID,E,03/16/1982,BROOKLYN,SUPERVISOR PLUMBER,ACTIVE,387.03,per Day,1820.00,100627.80,0.00,0.00,0.00
2410600,2021,740.00,DEPARTMENT OF EDUCATION ADMIN,PLEMPER,PETER,,07/09/2017,MANHATTAN,SUPERVISOR PLUMBER,ACTIVE,387.03,per Day,1820.00,100627.80,0.00,0.00,0.00
2410601,2021,996.00,NYC HOUSING AUTHORITY,RUGGIERO,VINCENT,W,07/05/2016,MANHATTAN,SUPERVISOR PLUMBER,ACTIVE,387.03,per Day,1820.00,100627.80,0.00,0.00,0.00


In [10]:
# Median overtime pay across the matching plumber rows.
# na=False treats missing titles as non-matches (no `== True` needed).
is_plumber = df['title_description'].str.contains('.PLUMBER', na=False)
df.loc[is_plumber, 'total_ot_paid'].median()

48368.93

In [11]:
# Mean overtime pay across the matching plumber rows.
# na=False treats missing titles as non-matches (no `== True` needed).
is_plumber = df['title_description'].str.contains('.PLUMBER', na=False)
df.loc[is_plumber, 'total_ot_paid'].mean()

58882.56102766799

In [12]:
# Total overtime paid to matching plumber rows, summed per agency.
# na=False treats missing titles as non-matches (no `== True` needed).
is_plumber = df['title_description'].str.contains('.PLUMBER', na=False)
df[is_plumber].groupby('agency_name')['total_ot_paid'].sum()

agency_name
COMMUNITY COLLEGE (HOSTOS)           5,569.37
DEPARTMENT OF CORRECTION           457,030.12
DEPARTMENT OF EDUCATION ADMIN      756,051.84
DEPARTMENT OF SANITATION           220,854.98
DEPARTMENT OF TRANSPORTATION       176,641.66
DEPT OF CITYWIDE ADMIN SVCS        679,538.41
DEPT OF ENVIRONMENT PROTECTION     619,263.96
DEPT OF PARKS & RECREATION       1,563,329.13
DEPT. OF HOMELESS SERVICES         556,608.68
FIRE DEPARTMENT                  1,200,328.52
HRA/DEPT OF SOCIAL SERVICES        381,300.86
NYC HOUSING AUTHORITY            7,684,816.42
POLICE DEPARTMENT                  595,953.99
Name: total_ot_paid, dtype: float64

In [18]:
# Sum of the base_salary column per agency for the matching plumber rows.
# NOTE(review): the plumber rows displayed in this notebook have pay_basis
# "per Day", so this likely adds up daily rates across rows rather than
# annual salaries; confirm that this is the intended measure.
# na=False treats missing titles as non-matches (no `== True` needed).
is_plumber = df['title_description'].str.contains('.PLUMBER', na=False)
df[is_plumber].groupby('agency_name')['base_salary'].sum()

agency_name
COMMUNITY COLLEGE (HOSTOS)          387.03
DEPARTMENT OF CORRECTION          1,919.05
DEPARTMENT OF EDUCATION ADMIN    13,046.32
DEPARTMENT OF SANITATION          4,596.06
DEPARTMENT OF TRANSPORTATION      1,919.05
DEPT OF CITYWIDE ADMIN SVCS       1,919.05
DEPT OF ENVIRONMENT PROTECTION   10,360.28
DEPT OF PARKS & RECREATION        8,837.29
DEPT. OF HOMELESS SERVICES        5,378.17
FIRE DEPARTMENT                   8,447.46
HRA/DEPT OF SOCIAL SERVICES       3,838.10
NYC HOUSING AUTHORITY            31,865.89
POLICE DEPARTMENT                 4,612.16
Name: base_salary, dtype: float64

In [13]:
# Average overtime pay per matching plumber row, broken down by agency.
# na=False treats missing titles as non-matches (no `== True` needed).
is_plumber = df['title_description'].str.contains('.PLUMBER', na=False)
df[is_plumber].groupby('agency_name')['total_ot_paid'].mean()

agency_name
COMMUNITY COLLEGE (HOSTOS)         5,569.37
DEPARTMENT OF CORRECTION          91,406.02
DEPARTMENT OF EDUCATION ADMIN     22,236.82
DEPARTMENT OF SANITATION          18,404.58
DEPARTMENT OF TRANSPORTATION      35,328.33
DEPT OF CITYWIDE ADMIN SVCS      135,907.68
DEPT OF ENVIRONMENT PROTECTION    22,935.70
DEPT OF PARKS & RECREATION        67,970.83
DEPT. OF HOMELESS SERVICES        39,757.76
FIRE DEPARTMENT                   54,560.39
HRA/DEPT OF SOCIAL SERVICES       38,130.09
NYC HOUSING AUTHORITY             92,588.15
POLICE DEPARTMENT                 49,662.83
Name: total_ot_paid, dtype: float64

In [17]:
# Non-null value count of every column per agency for the matching plumber
# rows (count() skips missing values, so columns can differ within a row).
# na=False treats missing titles as non-matches (no `== True` needed).
is_plumber = df['title_description'].str.contains('.PLUMBER', na=False)
df[is_plumber].groupby('agency_name').count()

Unnamed: 0_level_0,fiscal_year,payroll_number,last_name,first_name,mid_init,agency_start_date,work_location_borough,title_description,leave_status_as_of_june_30,base_salary,pay_basis,regular_hours,regular_gross_paid,ot_hours,total_ot_paid,total_other_pay
agency_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
COMMUNITY COLLEGE (HOSTOS),1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1
DEPARTMENT OF CORRECTION,5,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5
DEPARTMENT OF EDUCATION ADMIN,34,27,34,34,10,34,34,34,34,34,34,34,34,34,34,34
DEPARTMENT OF SANITATION,12,10,12,12,12,12,12,12,12,12,12,12,12,12,12,12
DEPARTMENT OF TRANSPORTATION,5,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5
DEPT OF CITYWIDE ADMIN SVCS,5,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5
DEPT OF ENVIRONMENT PROTECTION,27,22,27,27,14,27,27,27,27,27,27,27,27,27,27,27
DEPT OF PARKS & RECREATION,23,19,23,23,10,23,23,23,23,23,23,23,23,23,23,23
DEPT. OF HOMELESS SERVICES,14,11,14,14,0,14,14,14,14,14,14,14,14,14,14,14
FIRE DEPARTMENT,22,19,22,22,19,22,22,22,22,22,22,22,22,22,22,22
