In [2]:
# 1. Import Libraries & Load Data

import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("../data/processed/hr_dataset.csv")
df.head()

Unnamed: 0,EmployeeID,Name,Department,Gender,JobRole,Education,Age,Salary,Bonus,Attrition,TotalCompensation
0,1001,Emp_1,Sales,Male,Manager,High School,39,39239.0,6909,No,46148.0
1,1002,Emp_2,Marketing,Male,Consultant,Masters,53,30416.0,17150,No,47566.0
2,1003,Emp_3,IT,Female,Executive,High School,24,23412.0,28602,No,52014.0
3,1004,Emp_4,Marketing,Female,Engineer,Masters,23,101976.0,5080,No,107056.0
4,1005,Emp_5,Marketing,Male,Executive,PhD,54,44406.0,7083,No,51489.0


In [3]:
# Inspect unique values
print(df["Department"].unique())

# Standardize department names (fix misspellings, capitalization)
df["Department"] = df["Department"].str.strip().str.title()
df["Department"] = df["Department"].replace({
    "It": "IT",
    "I.T": "IT",
    "Hr": "HR",
    "Hrr": "HR",
    "Finanace": "Finance",
    "Sales ": "Sales",
    "Markting": "Marketing"
})

print(df["Department"].unique())

['Sales' 'Marketing' 'IT' 'Finance' 'Operations' 'HR' nan 'I.T' 'HRR'
 'Finanace' 'hr' 'Slaes' 'Markting']
['Sales' 'Marketing' 'IT' 'Finance' 'Operations' 'HR' nan 'Slaes']


In [4]:
# Check for negative or zero salaries
print(df[df["Salary"] <= 0])

# Remove or correct unrealistic salaries
df = df[df["Salary"] > 0]

# Optional: cap very high salaries (e.g., above 200k)
df.loc[df["Salary"] > 200000, "Salary"] = 200000

      EmployeeID      Name  Department  Gender     JobRole    Education  Age  \
242         1243   Emp_243          HR    Male  Consultant  High School   33   
376         1377   Emp_377     Finance  Female    Engineer    Bachelors   36   
992         1993   Emp_993          IT    Male     Manager    Bachelors   33   
1037        2038  Emp_1038     Finance  Female  Consultant      Masters   53   
1368        2369  Emp_1369  Operations  Female     Analyst      Masters   51   
1382        2383  Emp_1383          IT  Female  Consultant    Bachelors   48   
1665        2666  Emp_1666     Finance  Female  Consultant    Bachelors   31   
1752        2753  Emp_1753          IT    Male   Executive  High School   29   

       Salary  Bonus Attrition  TotalCompensation  
242  -20000.0  16528        No            -3472.0  
376   -5000.0  12953        No             7953.0  
992       0.0  20976        No            20976.0  
1037 -15000.0  15649        No              649.0  
1368      0.0   736

In [5]:
# EmployeeID is just an identifier
df = df.drop(columns=["EmployeeID"], errors="ignore")

In [6]:
# Check missing values
print(df.isnull().sum())

# Fill or drop based on context
df = df.dropna()   # simple option (later you can try imputing)

Name                  0
Department           38
Gender               39
JobRole               0
Education             0
Age                   0
Salary                0
Bonus                 0
Attrition            40
TotalCompensation     0
dtype: int64


In [7]:
# Create TotalCompensation if missing
if "TotalCompensation" not in df.columns:
    df["TotalCompensation"] = df["Salary"] + df["Bonus"].fillna(0)

In [8]:
df.to_csv("../data/processed/hr_dataset_cleaned.csv", index=False)
print("✅ Cleaned dataset saved as hr_dataset_cleaned.csv")

✅ Cleaned dataset saved as hr_dataset_cleaned.csv
