In [1]:
import pandas as pd
import sys
import numpy as np
sys.path.insert(0, '../src')
from utils.extract_utils import extract_csv

# Display configuration: allow pandas to print up to 1500 rows before truncating
pd.set_option('display.max_rows', 1500)

# Read the salaries CSV through the project's extract helper and wrap the
# result in a pandas DataFrame
data = extract_csv('../data/input/Salaries.csv')
df = pd.DataFrame(data)

# Preview the first five rows
df.head()

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
0,1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411.18,0.0,400184.25,,567595.43,567595.43,2011,,San Francisco,
1,2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966.02,245131.88,137811.38,,538909.28,538909.28,2011,,San Francisco,
2,3,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739.13,106088.18,16452.6,,335279.91,335279.91,2011,,San Francisco,
3,4,CHRISTOPHER CHONG,WIRE ROPE CABLE MAINTENANCE MECHANIC,77916.0,56120.71,198306.9,,332343.61,332343.61,2011,,San Francisco,
4,5,PATRICK GARDNER,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",134401.6,9737.0,182234.59,,326373.19,326373.19,2011,,San Francisco,


## Cleaning

In [None]:
# Summarise the dataset: column names, non-null counts and dtypes
df.info()

The summary shows that every column has the object dtype, even though many of them hold numeric data. Missing values in this dataset are represented by empty strings. Both problems can be fixed in one step by converting the columns to numeric types:

In [None]:
# Id and Year hold whole numbers, so cast each of them to int
for int_col in ('Id', 'Year'):
    df[int_col] = df[int_col].astype(int)

# The pay columns become floats. pd.to_numeric with errors='coerce' turns any
# value that cannot be parsed as a number into NaN
float_columns = ['BasePay', 'OvertimePay', 'OtherPay', 'Benefits', 'TotalPay', 'TotalPayBenefits']
df[float_columns] = df[float_columns].apply(pd.to_numeric, errors='coerce').astype(float)

In [None]:
# Re-run info() to check the dtypes and non-null counts after the conversion
df.info()

In [None]:
# Count the missing (NaN) values in each column
df.isnull().sum()

In [None]:
# Remove three columns that carry no usable information:
#   'Agency' - zero variance (the only value is San Francisco)
#   'Notes'  - 100% missing values
#   'Status' - 74% missing values (checked against the dataset on Kaggle)
df.drop(columns=['Agency', 'Notes', 'Status'], inplace=True)

In [None]:
# drop the rows with a missing BasePay (only 609, 0.4%)
# NOTE: dropna() does not renumber the index, so the surviving rows keep their
# original labels and the index has gaps from here on
df.dropna(subset=['BasePay'], inplace=True)

'Benefits' has 36159 missing values. I will filter the rows where 'Benefits' is missing and 'TotalPay' equals 'TotalPayBenefits', and then check whether there are also 36159 such rows. If there are, I can set every missing 'Benefits' value to 0, because equal totals mean that no benefits were paid for these entries.

In [None]:
# Select the rows where 'Benefits' is missing and the two totals are equal,
# then show how many rows (and columns) that selection has
benefits_missing = df['Benefits'].isna()
totals_equal = df['TotalPay'] == df['TotalPayBenefits']
missing_benefits_same_totalpay = df.loc[benefits_missing & totals_equal]
missing_benefits_same_totalpay.shape

In [None]:
# change all null benefits to 0
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on df['Benefits']: that form operates on an intermediate
# Series, raises a chained-assignment warning on recent pandas versions and
# leaves df unchanged when Copy-on-Write is enabled.
df['Benefits'] = df['Benefits'].fillna(0)

In [None]:
# check for duplicate entries
# 'Id' appears to be a per-row identifier (1, 2, 3, ... in the preview), so it
# is left out of the comparison: with it included, two otherwise identical
# records could never be reported as duplicates.
record_columns = [column for column in df.columns if column != 'Id']
duplicates = df.duplicated(subset=record_columns)
num_duplicated = duplicates.sum()
print(f"There are {num_duplicated} duplicate rows in the df")

In [None]:
# Preview the first rows after the cleaning steps
df.head()

## Data Aggregation

In [None]:
# Per-job-title summary statistics (mean, median, min, max, standard deviation, count)
# of BasePay and TotalPay; reset_index() turns JobTitle back into an ordinary column
pay_stats = ['mean', 'median', 'min', 'max', 'std', 'count']
agg_df = df.groupby('JobTitle')[['BasePay', 'TotalPay']].agg(pay_stats).reset_index()

In [None]:
# Show the first 15 rows of the aggregated table (one row per job title)
agg_df.head(15)

## Data Engineering
I am going to create two new features. The first feature (JobCategory) will group job titles into categories. For example, 'Law Enforcement' will contain police and similar roles. The second feature (SalaryRangeCategory) will label each salary as low, medium or high, depending on how it compares with the mean and standard deviation of the salaries for that job title.

In [None]:
import re

# Regex patterns for each category. A title gets the FIRST category (in the
# order written here) that has a matching pattern, so earlier categories win.
# Patterns match anywhere in the title, case-insensitively, unless anchored
# with \b (word boundary). Short tokens are anchored because they would
# otherwise match inside unrelated words:
#   - 'IT' matched every title containing "it" (e.g. TRANSIT)
#   - 'eng(r)?' matched "engineer" and "passenger"; as an abbreviation of
#     engineer it belongs to Engineering, not Education
# 'physician' is a medical role and therefore lives under Medical.
job_categories = {
    'Law Enforcement' : ['police', 'sheriff', 'crime', 'forensic(s)?', 'patrol', 'detective', 'mayor', 'sergeant', 'captain', 'officer', 'lieutenant'],
    'Safety & Security' : ['fire', 'safety', 'public', 'security', 'guard', 'protect(ive)?'],
    'Medical' : ['doctor', 'nurse', 'paramedic', 'medic(al)?', 'health(care)?', 'medicine', 'anesthetist', 'physician'],
    'Education' : ['teacher', 'prof(essor)?', 'teacher(s)? assistant', 'education'],
    'Administrative' : ['clerk', 'admin', 'secretary', 'assistant', 'library', 'librarian'],
    'Engineering' : ['engineer', r'\beng(r)?\b', 'architect', 'technician', r'\belectr\w+'],
    'Construction' : ['construction', 'mechanic', 'laborer'],
    'Information Technology' : ['programmer', 'developer', 'software', r'\bIT\b', 'computer', 'analyst'],
    'Management' : ['manager', 'director', 'CEO', 'owner', 'supervisor', 'head', 'leader'],
    'Finance' : ['accountant', 'economist', 'tax', 'finance', 'money'],
    'Legal' : ['law(yer)?', 'legal', 'attorney', 'judge'],
    'Maintenance' : ['custodian', 'porter', 'gardener'],
    'Other' : []
}

# One compiled, case-insensitive regex per category (its patterns joined with
# '|'). Categories without patterns are skipped: an empty regex would match
# every title.
_category_matchers = {
    category: re.compile('|'.join(f'(?:{pattern})' for pattern in patterns), re.IGNORECASE)
    for category, patterns in job_categories.items()
    if patterns
}

# function to assign job categories based on regex patterns
def categorise_job_title(job_title):
    """Return the job category for a job title.

    Parameters
    ----------
    job_title : str
        The raw job title. Any non-string value (e.g. NaN) is treated as
        unclassifiable.

    Returns
    -------
    str
        The first key of ``job_categories`` with a pattern found in
        ``job_title``, or ``'Other'`` when nothing matches.
    """
    if not isinstance(job_title, str):
        return 'Other'
    for category, matcher in _category_matchers.items():
        if matcher.search(job_title):
            return category
    return 'Other'

# Build the new JobCategory column by running every JobTitle through categorise_job_title
df['JobCategory'] = df['JobTitle'].map(categorise_job_title)

In [None]:
# Preview the frame with the new JobCategory column
df.head()

In [None]:
# the second feature will compare TotalPay to the mean and std for that job title and categorise it (low, medium, high)
# create empty column
df['SalaryRangeCategory'] = ''

# flatten the two-level column index produced by groupby().agg() into single names
agg_df.columns = ['JobTitle', 'BasePay_mean', 'BasePay_median', 'BasePay_min', 'BasePay_max', 'BasePay_std', 'BasePay_count', 'TotalPay_mean', 'TotalPay_median', 'TotalPay_min', 'TotalPay_max', 'TotalPay_std', 'TotalPay_count']

# attach the job-title statistics to every row of df.
# join(..., on='JobTitle') is a left join that keeps df's own index, so a label
# in merged_df identifies the same row as that label in df. A plain df.merge()
# returns a fresh 0..n-1 index, which no longer lines up with df because rows
# were dropped from df earlier without renumbering its index.
merged_df = df.join(agg_df.set_index('JobTitle'), on='JobTitle')

In [None]:
# Categorise every row's TotalPay against the statistics of its job title:
#   'low'     - more than 1 std below the job title's mean
#   'high'    - more than 1 std above the job title's mean
#   'medium'  - within 1 std of the mean
#   'unknown' - the comparison is impossible because the std (or the pay) is
#               NaN, e.g. for a job title that has a single row
#
# The statistics are looked up by JobTitle and compared as whole columns, so
# every result lands on the row it was computed for. Writing df.at[i, ...]
# with the row labels of a merged frame puts values on the wrong rows (and
# appends new ones) whenever df's index is not exactly 0..n-1.
title_stats = agg_df.set_index('JobTitle')
title_mean = df['JobTitle'].map(title_stats['TotalPay_mean'])
title_std = df['JobTitle'].map(title_stats['TotalPay_std'])
total_pay = df['TotalPay']

# np.select takes the first condition that is True for each row
df['SalaryRangeCategory'] = np.select(
    [
        title_std.isna() | total_pay.isna(),
        total_pay < title_mean - title_std,
        total_pay > title_mean + title_std,
    ],
    ['unknown', 'low', 'high'],
    default='medium',
)

In [None]:
# Transforming the dataset is finished. Preview the final df here; agg_df is shown in the next cell
df.head()

In [None]:
# Preview the aggregated per-job-title statistics
agg_df.head()