In [1]:
import pandas as pd
import numpy as np
import pymongo
from pymongo import MongoClient
import simplejson as json
import re 
pd.set_option('display.max_rows', 1000) 
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [2]:
columns = ['Name', 'Department', 'Title', 'Regular', 'Retro', 'Other', 'Overtime', 'Injury', 'Detail', 'Quinn', 'Total', 'Postal']
earnings_df = pd.DataFrame(columns = columns)
earnings_df['Year'] = np.nan

for year in range(2011, 2022):
    
    df_for_year = pd.read_csv('./data/salaries_' + str(year) + '.csv', encoding='latin1')
    
    #retrieve only the first part of each column name 
    columns = {col: (re.split('[^a-zA-Z]', col.strip())[0]).title() for col in df_for_year.columns}
    
    #standardize column names
    df_for_year.rename(columns=columns, inplace=True)
    df_for_year.rename(columns={"Injured": "Injury"}, inplace=True)
    df_for_year['Year'] = year
  
    earnings_df = pd.concat([earnings_df, df_for_year], sort=False, axis=0)

In [3]:
earnings_df.sample(100)

Unnamed: 0,Name,Department,Title,Regular,Retro,Other,Overtime,Injury,Detail,Quinn,Total,Postal,Year,Zip,Details
7224,"Nguyen,Loni",BPS Mather Elementary,Teacher,103308.17,2005.30,768.31,-,-,-,-,106081.78,2169.0,2019.0,,
12311,"Roddy,Elise",BPS Substitute Teachers/Nurs,Substitute Teacher,$2012.86,$0.00,$0.00,$0.00,$0.00,$0.00,$0.00,$2012.86,,2014.0,02132,
6524,"White,Matthew O.",Tech Boston Academy,Teacher,105437.26,2044.92,2150.00,-,-,-,-,109632.18,2184.0,2019.0,,
16625,"Preskenis,Kathryn Irene",Neighborhood Services,Staff Asst II,"$63,303.32",,,$441.78,,,,"$63,745.10",2132.0,2016.0,,
6270,"Leith,Edna Monteiro",BPS Dearborn Middle,Instructional Coach,105578.94,2082.17,3258.32,-,-,-,-,110919.43,2139.0,2019.0,,
2427,"Montiero,Domingo R",Boston Police Department,Police Officer,98998.27,-,12792.15,27810.05,-,-,9899.90,149500.37,2121.0,2019.0,,
686,"Lewis,Tracey",Boston Cntr - Youth & Families,Computer Instructor,$58493.40,$0.00,$0.00,$193.05,$0.00,$0.00,$0.00,$58686.45,,2014.0,02176,
10712,"Sager,Jessica",BPS Snowden International Hi,Teacher,$82663.88,$0.00,$1500.00,$0.00,$0.00,$0.00,$0.00,$84163.88,,2014.0,02184,
15592,"O'Connor,Mark",Boston Public Schools,Teacher,$72160.54,$580.90,$2673.10,$0.00,$0.00,$0.00,$0.00,$75414.54,,2013.0,02492-3700,
8755,"Walsh,Caitlin P",BPS Special Education,Physical Therapist,"$97,588.56",,,,,,,"$97,588.56",2026.0,2020.0,,


In [4]:
#targeted (hard-coded) cleaning for specific row
earnings_df = earnings_df.loc[earnings_df['Department']!='DEPARTMENT_NAME']

facts = ['Regular', 'Retro', 'Other', 'Overtime', 'Injury', 'Detail', 'Quinn', 'Total']

facts_transforms = {
    '^-$|^None$|^nan$|\)':0, 
    '':0,
    ',':'', 
    '\$':'', 
    ' ':'', 
    '^\(':'-'
}

#convert facts to useable types
earnings_df['Year'] = earnings_df['Year'].astype(int)
earnings_df[facts] = earnings_df[facts].astype(str).applymap(lambda x: x.strip())
earnings_df[facts] = earnings_df[facts].replace(facts_transforms, regex=True)
earnings_df[facts] = earnings_df[facts].astype(float)
earnings_df[facts].fillna(0, inplace=True)

#combine 'Detail' and 'Quinn' with 'Other'
earnings_df['Other'] = earnings_df[['Other', 'Detail', 'Quinn']].sum(axis=1)

earnings_df.drop(columns=['Detail', 'Quinn', 'Zip', 'Details'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  earnings_df[facts].fillna(0, inplace=True)


In [5]:
earnings_df.sample(100)

Unnamed: 0,Name,Department,Title,Regular,Retro,Other,Overtime,Injury,Total,Postal,Year
14803,"Qualters-Turner,Mary E",Haley Pilot,Prin Clerk/School Sec 19,0.0,1783.32,13381.31,0.0,0.0,15164.63,,2014
14069,"Santana,Heidi A",BPS Transportation,Cab Monitor,537.22,0.0,0.0,0.0,0.0,537.22,2119.0,2017
17875,"Curtis,Ronald P",Boston Police Department,Police Officer,85827.82,14351.49,10526.0,23882.48,0.0,134587.79,,2014
11974,"Gales,Nickisha M.",Boston Police Department,Executive Coordinator,61647.38,0.0,1344.79,5424.72,0.0,68416.89,2126.0,2019
6549,"Clark,Joel L.",BPS Bates Elementary,Teacher,113328.58,0.0,0.0,0.0,0.0,113328.58,2132.0,2021
22732,"Afonseca,Jose",Public Works Department,Laborer Seasonal (PW),595.66,0.0,0.0,0.0,0.0,595.66,2124.0,2019
10912,"Kelly-Chalas,Ramon M.",Boston Police Department,Police Officer,70607.35,0.0,3046.0,20617.14,0.0,94270.49,,2013
20103,"Persad,Joellen",BPS Madison Park High,Teacher,40276.4,0.0,0.0,0.0,0.0,40276.4,2130.0,2017
9088,"Beharry,Agnes M",Boston Public Schools,Hot Line Worker,1635.52,0.0,0.0,0.0,0.0,1635.52,,2011
12935,"Matthews,Jacqueline Melissa",Boston Police Department,Police Dispatcher,67778.73,0.0,0.0,1545.97,0.0,69324.7,2136.0,2017


In [6]:
dimensions = ['Name', 'Department', 'Title', 'Postal']
earnings_df[dimensions] = earnings_df[dimensions].astype(str)

In [7]:
#purpose: clean Postal column

#add a 0 to the front of any code with 4 digits
mask = earnings_df['Postal'].str.len() == 4
earnings_df.loc[mask, 'Postal'] = '0' + earnings_df.loc[mask, 'Postal']

#remove delivery route number
earnings_df['Postal'] = earnings_df['Postal'].str.split('-', expand=True)[0]

#set postal codes with non-numeric characters to UNKNOWN
earnings_df.loc[earnings_df['Postal'].str.match('[^0-9]', na=False), 'Postal'] = 'UNKNOWN'

#hard cleaning for specific rows
postal_codes = {
    'Ostiguy,David M': '02327',
    'Karales,George Alfred': '02170',
    'Smith,Kenneth J': '02124',
    'Thomas,Sarita J': '02125',
    'Morris,Judith A.': '02170',
    'Mendez,Jose R': '02135',
    'Morrison,June': '02481'
}

for name, code in postal_codes.items():
    earnings_df.loc[earnings_df['Name'].str.match(name), 'Postal'] = code


In [8]:
#purpose: clean Name column
earnings_df['Name'] = earnings_df['Name'].replace({'\.':''}, regex=True).str.upper()

name = earnings_df['Name'].str.split(',', n=1, expand=True)
earnings_df['Last']=name[0]
earnings_df['First']=name[1]
earnings_df.drop(columns=["Name"], inplace=True) 

dimensions = ['First', 'Last', 'Department', 'Title', 'Postal']

In [9]:
#return list of all unique values for a given column sorted in alphabetical order
def show_unique(column):
    optns = earnings_df[column].unique()
    optns.sort()
    return optns

#purpose: get pivot table of employee counts by department
def count_by_group(column, rec=r'(.*?)'):
    
    group_counts = earnings_df.groupby([column, 'Year'])['Total'].count().reset_index(name="count")
    group_counts_table = pd.pivot_table(group_counts, values='count', index=column, columns='Year', aggfunc='sum', fill_value=0, margins=True).reset_index()
    
    #get rid of the calculated sums by row, doesn't make sense for time series data
    return group_counts_table.iloc[:,0:-1].loc[group_counts_table[column].str.match(rec)]

In [10]:
#create uniform spacing, convert to all caps, and remove unwanted characters

title_cleaning = {
    '\.':'', 
    '(?<=[a-z])([A-Z])':r' \1', 
    '\(':' (', 
    '\)':') ', 
    '\\\\':'', 
    '&':' AND ', 
    '\,':'/', 
    '\#':'', 
    '\s+':' '
}

earnings_df['Title'] = earnings_df['Title'].replace(title_cleaning, regex=True).str.upper()
earnings_df[dimensions] = earnings_df[dimensions].applymap(lambda x: x.strip() if x else x)

In [11]:
#preliminary cleaning for Title column to unstick words that were stuck together
title_cleaning = {
    '([A-Z])(OF)([\/\s\)]|$)':r'\1 OF ', 
    '([A-Z])(BPD)':r'\1 \2', 
    '(BPD)([A-Z])':r'\1 \2'
}

earnings_df['Title'].replace(title_cleaning, regex=True, inplace=True)

In [12]:
#replace abbreviations and spelling errors with proper words in title column

#CHALLENGES --> 
#OP/OPER: OPERATOR VS OPERATIONS
#SP/SPEC: SPECIAL VS SPECIALIST
#SERV/SVC: SERVICE VS SERVICES
#COM/COMM: COMMUNITY VS COMMUNICATIONS VS COMMISSIONER VS COMMISSION VS COMMITTEE

title_transforms = {
    'ADMIN AND FINANCE':'ADMINISTRATION AND FINANCE',
    'ADMIN/FINANCE':'ADMINISTRATION AND FINANCE',
    'OFFC':'OFFICER',
    'OFFCR':'OFFICER',
    'SEN':'SENIOR',
    'DET':'DETECTIVE',
    'SUP':'SUPERVISOR',
    'SUPV':'SUPERVISOR',
    'SPV':'SUPERVISOR',
    'EXEC':'EXECUTIVE',
    'AN':'ANALYST',
    'ANL':'ANALYST',
    'ANAL':'ANALYST',
    'TECH':'TECHNICIAN',
    'DEPT':'DEPARTMENT',
    'EQUIP':'EQUIPMENT',
    'EQUIPMENT OPER':'EQUIPMENT OPERATOR',
    'ALARM OPER':'ALARM OPERATOR',
    'METER OPER':'METER OPERATOR',
    'COMPUTER OPER':'COMPUTER OPERATOR',
    'PROJ':'PROJECT',
    'SP':'SPECIAL',
    'STFF':'STAFF',
    'ACAD':'ACADEMY',
    'INSTR':'INSTRUCTOR',
    'ASST':'ASSISTANT',
    'ASSIST':'ASSISTANT',
    'ASS':'ASSISTANT',
    'DEP':'DEPUTY',
    'SUPN':'SUPERINTENDENT',
    'SYS':'SYSTEMS',
    'COOR':'COORDINATOR',
    'COORD':'COORDINATOR',
    'SEC':'SECRETARY',
    'LIEUT':'LIEUTENANT',
    'LT':'LIEUTENANT',
    'MAINT':'MAINTENANCE',
    'MAIN':'MAINTENANCE',
    'DIR':'DIRECTOR',
    'MGMT':'MANAGEMENT',
    'MGR':'MANAGER',
    'MNGR':'MANAGER',
    'MANGR':'MANAGER',
    'MED':'MEDICAL',
    'PROC':'PROCESSING',
    'CORP':'CORPORATION',
    'ASSOC':'ASSOCIATE',
    'ASSESS OPER MANAGEMENT':'ASSESSING OPERATIONS MANAGEMENT',
    'OPER':'OPERATIONS',
    'OP':'OPERATOR',
    'INC COMM':'INCIDENT COMMAND',
    'COMM SERV':'COMMUNITY SERVICE',
    'COMM OUTREACH':'COMMUNITY OUTREACH',
    'ASSISTANT COMM':'ASSISTANT COMMISSIONER',
    'DEPUTY COMM':'DEPUTY COMMISSIONER',
    'COMM OFFICE':'COMMISSIONERS OFFICE',
    'RADIO COMM':'RADIO COMMUNICATIONS',
    'COMM EQUIPMENT':'COMMUNICATIONS EQUIPMENT',
    'COMMUNIC':'COMMUNICATIONS',
    'COMMUN':'COMMUNICATIONS',
    'COMM LEADER':'COMMUNITY LEADER',
    'FIRE COMM':'FIRE COMMISSIONER',
    'HOUSING COMM':'HOUSING COMMISSION',
    'SCHOOL COMM':'SCHOOL COMMITTEE',
    'COMM AND INTERG':'COMMUNITY AND INTERGOVERNMENTAL',
    'BLDG':'BUILDINGS',
    'BLDGS':'BUILDINGS',
    'BDG':'BUILDINGS',
    'REG':'REGISTRAR',
    'SERV':'SERVICE',
    'SVC':'SERVICE',
    'SRV':'SERVICE',
    'PRIN':'PRINCIPAL',
    'PARA':'PARAPROFESSIONAL',
    'DIST':'DISTRICT',
    'FF':'FIRE FIGHTER',
    'A AND F':'ADMINISTRATION AND FINANCE',
    'F':'FIRE',
    'INSTRUC':'INSTRUCTOR',
    'SR':'SENIOR',
    'JR':'JUNIOR',
    'MECH':'MECHANIC',
    'MECHA':'MECHANIC',
    'MACH':'MACHINE',
    'GEN':'GENERAL',
    'ADMN':'ADMIN',
    'ADM':'ADMIN',
    'ENG':'ENGINEER',
    'STRUCT':'STRUCTURAL',
    'FRPRS':'FOREPERSON',
    'FRPR':'FOREPERSON',
    'FOREPRS':'FOREPERSON',
    'CONST':'CONSTRUCTION',
    'LBR':'LABORER',
    'RPR':'REPAIR',
    'REP':'REPAIR',
    'SPEC':'SPECIAL',
    'INCT':'INCIDENT',
    'COMND':'COMMAND',
    'FIN COM':'FINANCE COMMISSION',
    'MAS':'MASTER',
    'RPPRS':'REPAIRPERSON',
    'REPPRS':'REPAIRPERSON',
    'REPRPRS':'REPAIRPERSON',
    'REPAIRPR':'REPAIRPERSON',
    'REPAIRPRS':'REPAIRPERSON',
    'RPPR':'REPAIR PERSON',
    'WKG':'WORKING',
    'PW':'PUBLIC WORKS',
    'P W':'PUBLIC WORKS',
    'HVY':'HEAVY',
    'MTR':'MOTOR',
    'INSP':'INSPECTOR',
    'INSPEC':'INSPECTOR',
    'TRA':'TRAFFIC',
    'OPR':'OPERATIONS',
    'MEO':'MOTOR EQUIPMENT OPERATOR',
    'CFM':'(CFM)',
    'ELEC':'ELECTRIC',
    'EQUI':'EQUIPMENT',
    'COLL TRS':'COLLECTOR TREASURER',
    'COLL-TRS':'COLLECTOR TREASURER',
    'ACNTNG':'ACCOUNTING',
    'CRFTSPRS':'CRAFTSPERSON',
    'COMMSS':'COMMISSIONER',
    'COMR':'COMMISSIONER',
    'COMMIS':'COMMISSIONER',
    'CFM':'',
    'COUNSLR':'COUNSELOR',
    'BD':'BOARD',
    'MEMBER BOARD':'MEMBER OF BOARD',
    'LIB':'LIBRARIAN',
    'LIBR':'LIBRARIAN',
    'LIBRARIN':'LIBRARIAN',
    'SVCS':'SERVICES',
    'CAMP JO':'(CAMP JOY)',
    'CAM JO':'(CAMP JOY)',
    'SER':'SERVICES',
    'PROT':'PROTECTIVE',
    'REL':'RELATIONS',
    'SUPVISING':'SUPERVISING',
    'PROP':'PROPERTY',
    'DISP':'DISPATCHER',
    'CHF':'CHIEF',
    'PMDGRAFF REMOVAL':'(GRAFFITI REMOVAL)',
    'PAINT':'PAINTER',
    'ENFORCE':'ENFORCEMENT',
    'DEVELOP':'DEVELOPMENT',
    'DEVEL':'DEVELOPMENT',
    'PROG':'PROGRAM',
    'PWD':'',
    'SWIM':'SWIMMING',
    'REGNL':'REGIONAL',
    'ACCTNG':'ACCOUNTING',
    'ACCT':'ACCOUNTING',
    'ENGR':'ENGINEER',
    'EQU':'EQUIPMENT',
    'EQ':'EQUIPMENT',
    'ANIM CNTL':'ANIMAL CONTROL',
    'OFCR':'OFFICER',
    'CLRK':'CLERK',
    'PARKS AND REC':'PARKS AND RECREATION',
    'P AND R':'PARKS AND RECREATION',
    '\(PARK\)':'(PARKS AND RECREATION)',
    'SPC':'SPECIAL',
    'HDQ':'HEADQUARTER',
    'TRANS':'TRANSPORTATION',
    'DISPCH':'DISPATCHER',
    'SUB':'SUBSTITUTE',
    'CUST':'CUSTODIAN',
    'NEIGH':'NEIGHBORHOOD',
    'YTH':'YOUTH',
    'HE':'',
    'BE':'',
    'FGR PRT EV':'FINGERPRINT EVIDENCE',
    'CH':'CHIEF',
    'SUM SCH':'SUMMER SCHOOL',
    'COM SCH':'COMMUNITY SCHOOL',
    'FAM':'FAMILY',
    'BPDFLEET':'BPD FLEET',
    'IBPDFLEET':'I (BPD FLEET',
    'IIBPDFLEET':'II (BPD FLEET',
    'EVIDENC':'EVIDENCE',
    'TECHNCN':'TECHNICIAN',
    'TCH':'TECHNICIAN',
    'SP ED':'SPECIAL ED',
    'SPED':'SPECIAL ED',
    'DP':'DATA PROCESSING',
    'PREV':'PREVENTION',
    'SCUBA DIV':'SCUBA DIVER',
    'DIV':'DIVISION',
    'POL':'POLICE',
    'OFFR':'OFFICER',
    'CLASSIFICATN':'CLASSIFICATION',
    'ATTN':'ATTENDANT',
    'CAFE':'CAFETERIA',
    'SCHL':'SCHOOL',
    'SCH':'SCHOOL',
    'HOSP':'HOSPITAL',
    'ASSTO':'ASSISTANT TO',
    'CHIEFOF':'CHIEF OF',
    'EXC':"EXECUTIVE",
    'OFF':'OFFICE',
    'SRGT':'SERGEANT',
    'RET':'RETIREMENT',
    'RETIRE':'RETIREMENT',
    'RETIREME':'RETIREMENT',
    'RECYCLE':'RECYCLING',
    'REAS':'RESEARCH',
    'INVESTNS':'INVESTIGATIONS',
    'INVEST':'INVESTIGATOR',
    'HACKNEY':'HACKNEY UNIT',
    'TOTHE':'TO THE',
    'PRGS':'PROGRAMS',
    'JOURNEYPRS':'JOURNEYPERSON',
    'BLD':'BUILDINGS',
    'PLMG AND GSFTG':'PLUMBING AND GASFITTING',
    'PLG AND GAS FTNG':'PLUMBING AND GASFITTING',
    'EL IN AND MNT':'ELECTRICAL INSPECTION AND MAINTENANCE',
    'AL':'ALTERATION',
    'ALT':'ALTERATION',
    'CLK':'CLERK',
    'ADV':'ADVANCED',
    'MAST\)':'MASTER',
    'ADR':'(RETIRED - ADR)',
    'LIBRRIN':'LIBRARIAN',
    'SPECL':'SPECIALIST',
    'LAB RELATIONS':'LABOR RELATIONS',
    'C AND REPAIR':'CONSTRUCTION AND REPAIR',
    'CONSTR':'CONSTRUCTION',
    'PMDCONST':'PMD CONSTRUCTION',
    'AUD':'AUDITOR',
    'ENVRNMNTL':'ENVIRONMENTAL',
    'PR OF':'PARAPROFESSIONAL',
    'HOT LINE':'HOTLINE',
    'FCOMMISSIONER':'FIRE COMMISSIONER',
    'PROFESS':'PROFESSIONAL',
    'ARCHIV': 'ARCHIVE',
    'TECHG':'TECHNOLOGY',
    'FOREPR': 'FOREPERSON',
    'LIBRAY': 'LIBRARY',
    'SRVC': 'SERVICES',
    'MGNT': 'MANAGEMENT',
    'DEV': 'DEVELOPMENT'
    
}

#to prevent words within words from accidentally being changed, lets ensure that values from above dictionary must be
#between certain characters to change
for key, val in title_transforms.items():
    val = ' ' + val + ' '
    new_key = '(^|[\s\(\)\-\/])' + key + '([\s\(\)\-\/]|$)'
    earnings_df['Title'] = earnings_df['Title'].replace(new_key, val, regex=True).str.strip()

earnings_df['Title'].replace({'SPECIAL$':'SPECIALIST'}, regex=True, inplace=True)

In [13]:
count_by_group('Title').sort_values(2021, ascending=False).head(1000)

Year,Title,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
2168,All,20509,21140,22469,22233,21902,22046,22245,23603,23312,21858,22552
2060,TEACHER,5121,5249,5512,5426,5375,5363,5437,5541,5255,5106,5171
1378,PARAPROFESSIONAL,1108,1161,1209,1113,1180,1187,1215,1416,1356,1394,1509
1427,POLICE OFFICER,1281,1231,1308,1349,1273,1279,1331,1334,1346,1355,1321
926,FIRE FIGHTER,804,816,861,873,815,837,795,792,820,790,788
1978,SUBSTITUTE TEACHER,770,743,1045,837,793,771,759,983,724,534,505
1391,PART-TIME CAFETERIA ATTENDANT,360,389,417,360,356,341,334,375,374,372,405
1228,LUNCH HOUR MONITORS,388,368,424,384,377,370,360,392,341,319,330
1419,POLICE DETECTIVE,281,292,290,286,286,330,298,296,309,298,297
294,CAB MONITOR,252,319,379,343,354,365,306,232,337,264,288


In [14]:
show_unique('Department')

array(['ASD Graphic Arts', 'ASD Human Resources',
       'ASD Intergvernmtl Relations', 'ASD Office Of Labor Relation',
       'ASD Office of Budget Mangmnt', 'ASD Purchasing Division',
       'Accountability', 'Achievement Gap', 'Administration and Finance',
       'Advancement & Ext. Affairs', 'Age Strong',
       'Alighieri Montessori School', 'Arts & Cultural Development',
       'Assessing Department', 'Asst Superintendent-Network A',
       'Asst Superintendent-Network B', 'Asst Superintendent-Network C',
       'Asst Superintendent-Network D', 'Asst Superintendent-Network E',
       'Asst Superintendent-Network F', 'Asst Superintendent-Network G',
       'Auditing Department', 'BPS Adams Elementary',
       'BPS Adult Education', 'BPS Alternative Education',
       'BPS Alternative Education HS', 'BPS Another Course To Colleg',
       'BPS Athletics', 'BPS Bates Elementary',
       'BPS Beethoven Elementary', 'BPS Bi-Weekly 26 Leave',
       'BPS Blackstone Elementary', 'BPS Bos

In [15]:
# create uniform spacing, convert to all caps, and remove unwanted characters
cleaning_dict = {
    '\.':'', 
    '(?<=[a-z])([A-Z])':r' \1', 
    '\/':' AND ', 
    '\\\\':'', 
    '&':' AND ', 
    '\,':' ', 
    '\#':''
}
earnings_df['Department'] = earnings_df['Department'].replace(cleaning_dict, regex=True).str.upper()
earnings_df['Department'] = earnings_df['Department'].replace('\s+', ' ', regex=True).str.strip()

In [16]:
show_unique('Department')

array(['ACCOUNTABILITY', 'ACHIEVEMENT GAP', 'ADMINISTRATION AND FINANCE',
       'ADVANCEMENT AND EXT AFFAIRS', 'AGE STRONG',
       'ALIGHIERI MONTESSORI SCHOOL', 'ARTS AND CULTURAL DEVELOPMENT',
       'ASD GRAPHIC ARTS', 'ASD HUMAN RESOURCES',
       'ASD INTERGVERNMTL RELATIONS', 'ASD OFFICE OF BUDGET MANGMNT',
       'ASD OFFICE OF LABOR RELATION', 'ASD PURCHASING DIVISION',
       'ASSESSING DEPARTMENT', 'ASST SUPERINTENDENT-NETWORK A',
       'ASST SUPERINTENDENT-NETWORK B', 'ASST SUPERINTENDENT-NETWORK C',
       'ASST SUPERINTENDENT-NETWORK D', 'ASST SUPERINTENDENT-NETWORK E',
       'ASST SUPERINTENDENT-NETWORK F', 'ASST SUPERINTENDENT-NETWORK G',
       'AUDITING DEPARTMENT', 'BALDWIN ELC', 'BOSTON CITY COUNCIL',
       'BOSTON CNTR - YOUTH AND FAMILIES',
       'BOSTON CNTR-YOUTH AND FAMILIES', 'BOSTON COLLABORATIVE HIGH SCH',
       'BOSTON FIRE DEPARTMENT', 'BOSTON MIDDLE SCHOOL ACADEMY',
       'BOSTON POLICE DEPARTMENT', 'BOSTON PUBLIC LIBRARY',
       'BOSTON PUBLIC SC

In [17]:
#purpose: replace incorrect department names with correct ones when applicable

dept_transforms = {
    'OFFICE OF FINANCE AND BUDGET':'ADMINISTRATION AND FINANCE', #*OFFICE OF FINANCE AND BUDGET: 2014 - 2015
    'OFFICE OF ADMIN AND FINANCE':'ADMINISTRATION AND FINANCE', 
    'DND NEIGHBORHOOD DEVELOPMENT':'NEIGHBORHOOD DEVELOPMENT',
    'ARTS AND CULTURAL DEVELOPMENT':'OFFICE OF ARTS AND CULTURE',
    'ASD GRAPHIC ARTS':'OFFICE OF ARTS AND CULTURE',
    'ASD HUMAN RESOURCES':'HUMAN RESOURCES',
    'ASD INTERGVERNMTL RELATIONS':'INTERGOVERNMENTAL RELATIONS',
    'ASD OFFICE OF BUDGET MANGMNT':'BUDGET MANAGEMENT',
    'ASD OFFICE OF LABOR RELATION':'OFFICE OF LABOR RELATIONS',
    'ASD PURCHASING DIVISION':'PROCUREMENT', #*AKA PURCHASING
    'BOSTON CITY COUNCIL':'CITY COUNCIL',
    'BOSTON CNTR \- YOUTH AND FAMILIES':'BOSTON CENTER FOR YOUTH AND FAMILIES',
    'BOSTON CNTR\-YOUTH AND FAMILIES':'BOSTON CENTER FOR YOUTH AND FAMILIES',
    'BOSTON FIRE DEPARTMENT':'FIRE DEPARTMENT',
    'BOSTON POLICE DEPARTMENT':'POLICE DEPARTMENT',
    'BOSTON RETIREMENT SYSTEM':'RETIREMENT DEPARTMENT',
    'STATE BOSTON RETIREMENT SYST':'RETIREMENT DEPARTMENT',
    'COMM FOR PERSONS WITH DISABIL':'COMMISSION FOR PERSONS WITH DISABILITIES',
    'DEPT OF VOTER MOBILIZATION':'ELECTION DEPARTMENT', #*DEPARTMENT OF VOTER MOBILIZATION: <2013
    'DPT OF INNOVATION AND TECHNOLOGY':'DEPARTMENT OF INNOVATION AND TECHNOLOGY',
    'ELDERLY COMMISSION':'AGE STRONG', #*ELDERLY COMMISSION <2018
    'ELECTION DIVISION':'ELECTION DEPARTMENT',
    'IMMIGRANT ADVANCEMENT':'OFFICE FOR IMMIGRANT ADVANCEMENT',
    'INSPECTIONAL SERVICES DEPT':'INSPECTIONAL SERVICES DEPARTMENT',
    'OFC CHF PUBLIC WORKS TRANSPORT':'OFFICE OF STREETS', #*OFFICE OF CHIEF OF PUBLIC WORKS AND TRANSPORT <2013
    'OFC OF STRTS TRNSP AND SANI':'OFFICE OF STREETS', #*OFFICE OF STREETS, TRANSPORTATION, AND SANITATION 2014 - 2015
    'OFFICE OF CIVIL RIGHTS':'FAIR HOUSING AND EQUITY', #*OFFICE OF CIVIL RIGHTS <2013
    'OFFICE OF NEW BOSTONIANS':'OFFICE FOR IMMIGRANT ADVANCEMENT', #*OFFICE OF NEW BOSTONIANS <2017
    'PARKS DEPARTMENT':'PARKS AND RECREATION DEPARTMENT',
    'PROPERTY MANAGEMENT$':'PROPERTY MANAGEMENT DEPARTMENT',
    'YOUTH FUND':'YOUTH ENGAGEMENT AND EMPLOYMENT', #*YOUTH FUND <2013
    "WOMEN'S COMMISSION":"WOMEN'S ADVANCEMENT", #*WOMEN'S COMMISSION <2013
    'OPAT': 'OFFICE OF POLICE ACCOUNTABILITY AND TRANSPARENCY',
    'SERVS': 'SERVICES',
    'MGMT':'MANAGEMENT',
    'EXT AFFAIRS':'EXTERNAL AFFAIRS',
    'CL10':' ',
    'COM ACD':'COMMUNITY ACADEMY OF',
    'P A SHAW':'PA SHAW',
    'NURS':'NURSES',
    'WEST ROXBURY HIGH':'WEST ROXBURY ACADEMY', 
    'WREC\:':' ',
    'HPEC\:':' ',
    'MC CORMACK MIDDLE':'MCCORMACK MIDDLE', 
    'KENNEDY EM':'EDWARD M KENNEDY',
    'KENNEDY JF':'JOHN F KENNEDY',
    'KENNEDY PJ':'PATRICK J KENNEDY',
    'GREENWOOD S':'SARAH GREENWOOD',
    'MC KINLEY MIDDLE':'MCKINLEY MIDDLE',
    'HORACE MANN$':'HORACE MANN SCHOOL',
    'WITHTHROP':'WINTHROP',
    'SOUTH BOSTON HS - EXCEL':'EXCEL HIGH SCHOOL',
    'MPHCOMMERCE':'MADISON PARK HIGH SCHOOL - COMMERCE',
    'MPHCRAFTS':'MADISON PARK HIGH SCHOOL - CRAFTS',
    'MPHHEALTH':'MADISON PARK HIGH SCHOOL - HEALTH',
    'MPHFRESHMAN':'MADISON PARK HIGH SCHOOL - FRESHMAN',
    'FACILITY MANAGEMENT AND A AND R':'FACILITIES MANAGEMENT',
    'FACILITY MANAGEMENT':'FACILITIES MANAGEMENT',
    'FACILITITES MANAGEMENT':'FACILITIES MANAGEMENT',
    'GREENWOOD E':'ELIHU GREENWOOD',
    'ALTERNATIVE EDUCATION HIGH SCHOOL':'ALTERNATIVE EDUCATION',
    'COLLEG':' COLLEGE ',
    'SCH':'SCHOOL',  
    'INTERVTN CT':'INTERVENTION CENTER',
    'COUSELING':'COUNSELING',
    'SERVICE':'SERVICES',
    'SER':'SERVICES',
    'HI$': 'HIGH SCHOOL',
    'HI':'HIGH',
    'ED':'EDUCATION',
    'HS':' HIGH SCHOOL ',
    'ACAD':'ACADEMY',
    'SVC':'SERVICES',
    'ASST':'ASSISTANT',
    'COMM':'COMMUNITY',
    'AC':'ACADEMY',
    'ST':'STREET',
    'DEVELOPMNT': 'DEVELOPMENT',
    'HEARING AND APPEAL AND ATTENDANC':'HEARINGS AND APPEALS',
    'KITCHEN FOOD':'KITCHEN AND FOOD',
    'LEARN':'LEARNING',
    'FAM AND STUDENT ENGAGEMT':'FAMILY AND STUDENT ENGAGEMENT',
    'ADVANCEMT':'ADVANCEMENT',
    'ACCOUNTABILITY':'TURNAROUND AND TRANSFORMATION',
    'RESEARCH ASSESS AND EVAL':'DATA AND ACCOUNTABILITY',
    'TRANFORMATION':'TRANSFORMATION',
    'ELEMENTARY$':'ELEMENTARY SCHOOL',
    'MIDDLE$':'MIDDLE SCHOOL',
    'HIGH$':'HIGH SCHOOL',
    'HIGH SCH$': 'HIGH SCHOOL',
    'PILOT$':' PILOT SCHOOL',
    'ED$':'EDUCATION',
    'BPS': 'BOSTON PUBLIC SCHOOLS',
    '\-NETWORK':' - NETWORK',
    'INFO AND INSTR TECHNOLOGY': 'INFORMATION AND INSTRUCTIONAL TECHNOLOGY'
}

new_dept_transforms = {}
for k in dept_transforms.keys():
    v = dept_transforms[k]
    if k[-1] != '$':
        k = '(^| )'+k+'( |$)'
    new_dept_transforms[k] = ' ' + v + ' '
    
earnings_df['Department'] = earnings_df['Department'].replace(new_dept_transforms, regex=True).str.strip()

In [18]:
count_by_group('Department').sort_values(2021, ascending=False).head(1000)

Year,Department,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
268,All,20509,21140,22469,22233,21902,22046,22245,23603,23312,21858,22552
228,POLICE DEPARTMENT,3013,3034,3085,3178,3034,3114,3149,3173,3271,3142,3094
168,FIRE DEPARTMENT,1646,1657,1690,1834,1703,1722,1680,1688,1766,1692,1692
119,BOSTON PUBLIC SCHOOLS SPECIAL EDUCATION,0,0,0,721,724,762,747,831,857,785,860
120,BOSTON PUBLIC SCHOOLS SUBSTITUTE TEACHERS AND ...,0,0,0,1007,974,936,931,1196,928,705,689
65,BOSTON PUBLIC SCHOOLS FACILITIES MANAGEMENT,0,0,0,581,571,552,585,668,587,566,589
125,BOSTON PUBLIC SCHOOLS TRANSPORTATION,0,0,0,416,428,557,540,506,653,511,514
18,BOSTON PUBLIC LIBRARY,479,516,567,558,512,500,496,494,562,492,483
233,PUBLIC WORKS DEPARTMENT,416,503,466,485,475,451,454,430,436,405,418
15,BOSTON CENTER FOR YOUTH AND FAMILIES,533,609,618,562,558,544,546,584,498,410,411


In [19]:
#purpose: some depts should actually be programs, this function can be used to set departments to programs

def replace_dept(original_dept, new_dept, new_program):
    mask = earnings_df['Department'].str.contains(original_dept, regex=True)
    earnings_df.loc[mask,'Program'] = new_program
    earnings_df.loc[mask,'Department'] = new_dept

In [20]:
# move BPS programs to new 'Program' column and set dept to Boston Public Schools
bps_phrases = [
    'ELEMENTARY',
    'ACADEMY',
    'K-8',
    'MIDDLE',
    'HIGH',
    'SCHOOL',
    'ACADEMY',
    'PILOT',
    '9-12',
    'ACHIEVEMENT GAP',
    'STUDENT',
    'SUPERINTENDENT',
    'ACADEMIC',
    'LEARNING',
    'EDUCATION',
    'TEACHING',
    'ENROLLMENT',
    'INSTRUCTION',
    'INSTITUTIONAL',
    'MONTESSORI',
    'DIPLOMA',
    ' EEC$',
    ' ELC$',
    ' EES$',
    '^HPEC',
    '^ADVANCEMENT',
    'CHIEF OF STAFF',
    'CHIEF FINANCIAL OFFICER',
    'CHIEF OPERATING OFFICER',
    '^COMMUNICATIONS$',
    'FOOD AND NUTRITION SVC',
    'INNOVATION DEPARTMENT',
    'LEGAL ADVISOR',
    'PROFESSIONAL DEVELOPMENT',
    'DATA AND ACCOUNTABILITY',
    'STRATEGY DEPARTMENT'
]
bps_str = '|'.join(bps_phrases)
bps_mask = earnings_df['Department'].str.contains(bps_str, regex=True)
earnings_df[bps_mask]


Unnamed: 0,Department,Title,Regular,Retro,Other,Overtime,Injury,Total,Postal,Year,Last,First
8362,BOSTON PUBLIC SCHOOLS,PRINCIPAL ELEMENTARY,119578.40,0.00,0.00,0.00,0.00,119578.40,UNKNOWN,2011,ABABIO-FERNANDEZ,RUBY A
8363,BOSTON PUBLIC SCHOOLS,TEACHER,85562.10,0.00,123.09,0.00,0.00,85685.19,UNKNOWN,2011,ABBOTT,JOHN R
8364,BOSTON PUBLIC SCHOOLS,TECHNICIAN (B),27030.56,0.00,0.00,0.00,0.00,27030.56,UNKNOWN,2011,ABBOTT,LEONIA N
8365,BOSTON PUBLIC SCHOOLS,SUBSTITUTE LUNCH MONITOR,216.00,0.00,0.00,0.00,0.00,216.00,UNKNOWN,2011,ABBRUZZESE,DONNA
8366,BOSTON PUBLIC SCHOOLS,TEACHER,85935.70,0.00,0.00,0.00,0.00,85935.70,UNKNOWN,2011,ABDALKHALLAQ,AMIRA N
...,...,...,...,...,...,...,...,...,...,...,...,...
22539,UP ACADEMY HOLLAND,DIRECTOR (BASAS 10B) (NON-AC),0.00,81.08,0.00,0.00,0.00,81.08,02136,2021,BAILEY,LITA R
22540,PA SHAW ELEMENTARY SCHOOL,SUBSTITUTE LUNCH MONITOR,81.00,0.00,0.00,0.00,0.00,81.00,02122,2021,BLACK,VANESSA DEBRA
22541,BOSTON PUBLIC SCHOOLS SPECIAL EDUCATION,ASSISTANT DIRECTOR,0.00,69.86,0.00,0.00,0.00,69.86,02131,2021,BARTHOLOMEW,JOSEPH WILLIAM
22542,BOSTON PUBLIC SCHOOLS SUBSTITUTE TEACHERS AND ...,SUBSTITUTE TEACHER,0.00,0.00,0.00,59.78,0.00,59.78,02180,2021,RABOUIN,SHANTE EVELIN


In [21]:
earnings_df['Program'] = ''
earnings_df.loc[bps_mask,'Program'] = earnings_df.loc[bps_mask,'Department']
earnings_df.loc[bps_mask,'Department'] = 'BOSTON PUBLIC SCHOOLS'

earnings_df['Program'] = earnings_df['Program'].replace({'BOSTON PUBLIC SCHOOLS':'', '\"':''}, regex=True).str.strip()

In [22]:
#use the above function to fix some of the Department data
replace_dept('CEMETERY DIVISION', 'PARKS AND RECREATION DEPARTMENT', 'CEMETERY')
replace_dept('LICENSING BOARD', 'CONSUMER AFFAIRS AND LICENSING', 'LICESNING BOARD')
replace_dept("MAYOR\'S OFFICE\-PUBLIC INFO", "MAYOR'S OFFICE", "MAYOR'S COMMUNICATIONS")
replace_dept('OFC BOSTON RESIDENTS JOB POL', 'OFFICE OF ECONOMIC DEVELOPMENT', 'BOSTON RESIDENTS JOB POLICY OFFICE')
replace_dept('SMALL AND LOCAL BUSINESS', 'OFFICE OF ECONOMIC DEVELOPMENT', 'SMALL AND LOCAL BUSINESS')
replace_dept('TRAFFIC DIVISION', 'TRANSPORTATION DEPARTMENT', 'TRAFFIC DIVISION')
replace_dept('TRANSPORTATION-PARKING CLERK', 'TRANSPORTATION DEPARTMENT', 'PARKING CLERK')
replace_dept('TREASURY-COLLECTING DIVISION', 'TREASURY DEPARTMENT', 'TREASURY DIVISION')
replace_dept('TREASURY-TREASURY DIVISION', 'TREASURY DEPARTMENT', 'COLLECTING DIVISION')
replace_dept('WORKERS COMPENSATION SERVICE', 'HUMAN RESOURCES', 'WORKERS COMP')
replace_dept('HBI RETIREES ET AL', 'HUMAN RESOURCES', 'HEALTH BENEFITS AND INSURANCE')

In [23]:
show_unique('Department')

array(['ADMINISTRATION AND FINANCE', 'AGE STRONG', 'ASSESSING DEPARTMENT',
       'AUDITING DEPARTMENT', 'BOSTON CENTER FOR YOUTH AND FAMILIES',
       'BOSTON PUBLIC LIBRARY', 'BOSTON PUBLIC SCHOOLS',
       'BUDGET MANAGEMENT', 'CITY CLERK', 'CITY COUNCIL',
       'COMMISSION FOR PERSONS WITH DISABILITIES',
       'CONSUMER AFFAIRS AND LICENSING',
       'DEPARTMENT OF INNOVATION AND TECHNOLOGY', 'ELECTION DEPARTMENT',
       'EMERGENCY MANAGEMENT', 'ENVIRONMENT DEPARTMENT',
       'FAIR HOUSING AND EQUITY', 'FINANCE COMMISSION', 'FIRE DEPARTMENT',
       'FOOD AND NUTRITION SERVICES', 'HEALTH AND HUMAN SERVICES',
       'HUMAN RESOURCES', 'HUMAN RIGHTS COMMISSION',
       'INSPECTIONAL SERVICES DEPARTMENT', 'INTERGOVERNMENTAL RELATIONS',
       'LAW DEPARTMENT', "MAYOR'S OFFICE", 'NAN',
       'NEIGHBORHOOD DEVELOPMENT', 'NEIGHBORHOOD SERVICES',
       'OFFC OF LANGUAGE AND COMMUNICA',
       'OFFICE FOR IMMIGRANT ADVANCEMENT', 'OFFICE OF ARTS AND CULTURE',
       'OFFICE OF DIVERSI

In [24]:
show_unique('Program')

array(['', 'ACHIEVEMENT GAP', 'ADAMS  ELEMENTARY SCHOOL',
       'ADULT EDUCATION', 'ADVANCEMENT AND EXTERNAL AFFAIRS',
       'ALIGHIERI MONTESSORI SCHOOL', 'ALTERNATIVE EDUCATION',
       'ALTERNATIVE EDUCATION  HIGH SCHOOL', 'ANOTHER COURSE TO  COLLEGE',
       'ASSISTANT SUPERINTENDENT-NETWORK A',
       'ASSISTANT SUPERINTENDENT-NETWORK B',
       'ASSISTANT SUPERINTENDENT-NETWORK C',
       'ASSISTANT SUPERINTENDENT-NETWORK D',
       'ASSISTANT SUPERINTENDENT-NETWORK E',
       'ASSISTANT SUPERINTENDENT-NETWORK F',
       'ASSISTANT SUPERINTENDENT-NETWORK G', 'ATHLETICS', 'BALDWIN ELC',
       'BATES  ELEMENTARY SCHOOL', 'BEETHOVEN  ELEMENTARY SCHOOL',
       'BI-WEEKLY 26 LEAVE', 'BLACKSTONE  ELEMENTARY SCHOOL',
       'BOSTON ARTS ACADEMY', 'BOSTON COLLABORATIVE HIGH SCHOOL',
       'BOSTON COMMUNITY LEADERSHIP ACADEMY', 'BOSTON EVENING ACADEMY',
       'BOSTON INTERNATIONAL  HIGH SCHOOL', 'BOSTON LATIN',
       'BOSTON MIDDLE SCHOOL ACADEMY',
       'BOSTON RESIDENTS JOB POLI

In [25]:
#purpose: verify/retrieve cabinet and department names using operating budget data
#TODO: Replace with 2021 budget CSV?
    
budget_df = pd.read_csv('./data/operating_budget.csv', encoding='latin1')
budget_df

Unnamed: 0,Cabinet,Dept,Program,Expense Category,FY18 Actual,FY19 Actual,20 Budget,21 Budget
0,Mayors Cabinet,Mayor's Office,Mayor's Administration,Personnel Services,1185285,1400773,1529111,1845647
1,Mayors Cabinet,Mayor's Office,Mayor's Administration,Contractual Services,179041,85769,83334,74334
2,Mayors Cabinet,Mayor's Office,Mayor's Administration,Supplies & Materials,53822,37539,37183,37183
3,Mayors Cabinet,Mayor's Office,Mayor's Administration,Current Charges & Obligations,7543,8801,7213,7213
4,Mayors Cabinet,Mayor's Office,Mayor's Administration,Equipment,17709,17273,25000,25000
...,...,...,...,...,...,...,...,...
1321,Other,Risk Retention Reserve,Risk Retention Reserve,Other Expenses,3000000,3000000,3000000,3000000
1322,Other,Tax Title,Tax Title,Other Expenses,-,-,600000,600000
1323,,,,,,,,
1324,,,,,,,,


In [26]:
transforms = {
    'LIBRARY DEPARTMENT':'BOSTON PUBLIC LIBRARY', 
    'COMMISSION FOR PERSONS W/DISABILITIES':'COMMISSION FOR PERSONS WITH DISABILITIES', 
    'LICENSING_BOARD':'LICENSING BOARD',
    'BOSTON VETS':"VETERANS' SERVICES"
}
budget_df = budget_df[['Cabinet','Dept']].drop_duplicates().apply(lambda x: x.str.upper())
budget_df = budget_df.replace({'\.':' ', '\&':' AND ', '\,':' ', '\s+':' '}, regex=True).apply(lambda x: x.str.strip())
budget_df.replace(transforms, inplace=True)

In [27]:
earnings_df = earnings_df.join(budget_df.set_index('Dept'), how='left', on='Department')
earnings_df = earnings_df[['First','Last','Year','Cabinet','Department','Program','Title','Regular','Retro','Overtime','Injury','Other','Total','Postal']]
earnings_df.size

3414166

In [28]:
earnings_df.dropna(subset=['Department'], inplace=True)
earnings_df[dimensions].fillna('', inplace=True)
earnings_df = earnings_df.loc[earnings_df['Department'] != 'NAN']
earnings_df.replace('\s+', ' ', regex=True, inplace=True)
earnings_df.size

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  earnings_df[dimensions].fillna('', inplace=True)


3414082

In [29]:
show_unique('Department')

array(['ADMINISTRATION AND FINANCE', 'AGE STRONG', 'ASSESSING DEPARTMENT',
       'AUDITING DEPARTMENT', 'BOSTON CENTER FOR YOUTH AND FAMILIES',
       'BOSTON PUBLIC LIBRARY', 'BOSTON PUBLIC SCHOOLS',
       'BUDGET MANAGEMENT', 'CITY CLERK', 'CITY COUNCIL',
       'COMMISSION FOR PERSONS WITH DISABILITIES',
       'CONSUMER AFFAIRS AND LICENSING',
       'DEPARTMENT OF INNOVATION AND TECHNOLOGY', 'ELECTION DEPARTMENT',
       'EMERGENCY MANAGEMENT', 'ENVIRONMENT DEPARTMENT',
       'FAIR HOUSING AND EQUITY', 'FINANCE COMMISSION', 'FIRE DEPARTMENT',
       'FOOD AND NUTRITION SERVICES', 'HEALTH AND HUMAN SERVICES',
       'HUMAN RESOURCES', 'HUMAN RIGHTS COMMISSION',
       'INSPECTIONAL SERVICES DEPARTMENT', 'INTERGOVERNMENTAL RELATIONS',
       'LAW DEPARTMENT', "MAYOR'S OFFICE", 'NEIGHBORHOOD DEVELOPMENT',
       'NEIGHBORHOOD SERVICES', 'OFFC OF LANGUAGE AND COMMUNICA',
       'OFFICE FOR IMMIGRANT ADVANCEMENT', 'OFFICE OF ARTS AND CULTURE',
       'OFFICE OF DIVERSITY', 'OFFICE O

In [30]:
show_unique('Program')

array(['', 'ACHIEVEMENT GAP', 'ADAMS ELEMENTARY SCHOOL',
       'ADULT EDUCATION', 'ADVANCEMENT AND EXTERNAL AFFAIRS',
       'ALIGHIERI MONTESSORI SCHOOL', 'ALTERNATIVE EDUCATION',
       'ALTERNATIVE EDUCATION HIGH SCHOOL', 'ANOTHER COURSE TO COLLEGE',
       'ASSISTANT SUPERINTENDENT-NETWORK A',
       'ASSISTANT SUPERINTENDENT-NETWORK B',
       'ASSISTANT SUPERINTENDENT-NETWORK C',
       'ASSISTANT SUPERINTENDENT-NETWORK D',
       'ASSISTANT SUPERINTENDENT-NETWORK E',
       'ASSISTANT SUPERINTENDENT-NETWORK F',
       'ASSISTANT SUPERINTENDENT-NETWORK G', 'ATHLETICS', 'BALDWIN ELC',
       'BATES ELEMENTARY SCHOOL', 'BEETHOVEN ELEMENTARY SCHOOL',
       'BI-WEEKLY 26 LEAVE', 'BLACKSTONE ELEMENTARY SCHOOL',
       'BOSTON ARTS ACADEMY', 'BOSTON COLLABORATIVE HIGH SCHOOL',
       'BOSTON COMMUNITY LEADERSHIP ACADEMY', 'BOSTON EVENING ACADEMY',
       'BOSTON INTERNATIONAL HIGH SCHOOL', 'BOSTON LATIN',
       'BOSTON MIDDLE SCHOOL ACADEMY',
       'BOSTON RESIDENTS JOB POLICY OFFI

In [31]:
def remove_nulls(d):
    return {k: v for k, v in d.items() if v is not None and v != ''}

#convert the dataframe into JSON, with one document for each employee and year
j = json.loads(earnings_df.sort_values(['Last','First','Year']).to_json(orient='records'), object_hook=remove_nulls)

In [2]:
from dotenv import load_dotenv
import os
load_dotenv()
password = os.environ.get("MONGODB_PASSWORD")
DATABASE_URL = f'mongodb+srv://cpschneider98:{password}@sharednamr.nxmopyw.mongodb.net/?retryWrites=true&w=majority'

In [3]:
client = MongoClient(DATABASE_URL)
db = client.db
collection = db['salaries']

In [37]:
collection.delete_many({})

collection.insert_many(j)

<pymongo.results.InsertManyResult at 0x20aa9834fc0>

In [10]:
collection.create_index('Department')
collection.create_index([('Department', pymongo.ASCENDING), ('Program', pymongo.ASCENDING)])
collection.create_index('Cabinet')
collection.create_index('Year')
collection.create_index('Program')

'Cabinet_1'

In [38]:
for val in collection.find({'$and':[{'First':'RICHARD','Last':'BECKERS'}]}):
    print(val)

{'_id': ObjectId('6369d3e084cb8117b0a928f3'), 'First': 'RICHARD', 'Last': 'BECKERS', 'Year': 2021, 'Cabinet': 'PUBLIC SAFETY', 'Department': 'POLICE DEPARTMENT', 'Title': 'POLICE OFFICER', 'Regular': 0.0, 'Retro': 0.0, 'Overtime': 0.0, 'Injury': 0.0, 'Other': 1264843.63, 'Total': 1264843.63, 'Postal': '02119'}


In [13]:
client.close()