In [1]:
"""
01 Data Cleaning
Load and clean the employee attrition dataset.
"""

'\n01 Data Cleaning\nLoad and clean the employee attrition dataset.\n'

In [2]:
from src.data_processing import load_data, clean_data, split_data
import pandas as pd
from IPython.display import display, Markdown

In [3]:
# Notebook title and one-sentence purpose, rendered as Markdown in the cell output.
title_md = """
# Data Cleaning
This notebook loads and cleans the raw employee attrition dataset, preparing it for analysis and modeling.
"""
display(Markdown(title_md))


# Data Cleaning
This notebook loads and cleans the raw employee attrition dataset, preparing it for analysis and modeling.


In [8]:
import os

# Print current working directory
print("Current working directory:", os.getcwd())

# Locate the project root instead of hard-coding one machine's absolute path:
# walk up from the current directory until a folder containing the raw dataset
# (data/employee_data.csv, the file the load cell reads) is found. This works
# whether the kernel starts in the project root or in a subfolder such as
# notebooks/, and avoids case-sensitive string comparison of Windows paths.
root_marker = os.path.join('data', 'employee_data.csv')
start_dir = os.getcwd()
project_root = start_dir
while not os.path.isfile(os.path.join(project_root, root_marker)):
    parent_dir = os.path.dirname(project_root)
    if parent_dir == project_root:
        # Reached the filesystem root without finding the dataset: stay where we
        # are and say so, rather than changing into a directory that may not exist.
        project_root = start_dir
        print("Warning: could not find", root_marker, "above", start_dir,
              "- working directory left unchanged")
        break
    project_root = parent_dir

# Only change directory when the root is actually somewhere else.
if project_root != start_dir:
    os.chdir(project_root)
    print("Changed working directory to:", os.getcwd())

Current working directory: c:\Users\USER\Documents\Projects\JJM-attrition-rate\notebooks
Changed working directory to: C:\Users\USER\Documents\Projects\JJM-attrition-rate


In [9]:
# Build the raw CSV location relative to the working directory.
data_dir, data_file = 'data', 'employee_data.csv'
data_path = os.path.join(data_dir, data_file)

# Read the raw dataset via the project loader, then preview the first five rows
# (5 is the default for head()).
raw_df = load_data(data_path)
raw_df.head(5)

Unnamed: 0,EmployeeId,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,1,38,,Travel_Frequently,1444,Human Resources,1,4,Other,1,...,2,80,1,7,2,3,6,2,1,2
1,2,37,1.0,Travel_Rarely,1141,Research & Development,11,2,Medical,1,...,1,80,0,15,2,1,1,0,0,0
2,3,51,1.0,Travel_Rarely,1323,Research & Development,4,4,Life Sciences,1,...,3,80,3,18,2,4,10,0,2,7
3,4,42,0.0,Travel_Frequently,555,Sales,26,3,Marketing,1,...,4,80,1,23,2,4,20,4,4,8
4,5,40,,Travel_Rarely,1194,Research & Development,2,4,Medical,1,...,2,80,3,20,2,3,5,3,0,2


In [10]:
# Section header and short description for the data-loading step.
load_section_md = """
## Load Raw Data
We begin by loading the raw dataset to inspect its structure and contents.
"""
display(Markdown(load_section_md))


## Load Raw Data
We begin by loading the raw dataset to inspect its structure and contents.


In [11]:
# Pass the raw frame through the project's cleaning routine and bind the result
# to a new name.
clean_df = raw_df.pipe(clean_data)

# Preview the first five cleaned rows (5 is the default for head()).
clean_df.head(5)

Unnamed: 0,EmployeeId,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1,2,37,1.0,Travel_Rarely,1141,Research & Development,11,2,Medical,1,...,1,80,0,15,2,1,1,0,0,0
2,3,51,1.0,Travel_Rarely,1323,Research & Development,4,4,Life Sciences,1,...,3,80,3,18,2,4,10,0,2,7
3,4,42,0.0,Travel_Frequently,555,Sales,26,3,Marketing,1,...,4,80,1,23,2,4,20,4,4,8
6,7,40,0.0,Travel_Rarely,1124,Sales,1,2,Medical,1,...,3,80,3,6,2,2,4,3,0,2
7,8,55,1.0,Travel_Rarely,725,Research & Development,2,3,Medical,1,...,4,80,1,24,2,3,5,2,1,4


In [12]:
# Section header and short description for the cleaning step.
clean_section_md = """
## Clean Data
We apply cleaning steps to handle missing values, correct data types, and fix inconsistencies.
"""
display(Markdown(clean_section_md))


## Clean Data
We apply cleaning steps to handle missing values, correct data types, and fix inconsistencies.


In [13]:
# Optional step: split_data returns two objects, unpacked here as the modeling
# set and the inference set.
model_df, infer_df = split_data(clean_df)

# Report both shapes so the split sizes can be checked at a glance.
split_report = f"Modeling set: {model_df.shape}, Inference set: {infer_df.shape}"
print(split_report)

Modeling set: (846, 35), Inference set: (212, 35)


In [14]:
# Section header and short description for the optional split step.
split_section_md = """
## Split Data (Optional)
We optionally split the cleaned data into modeling and inference sets for downstream tasks.
"""
display(Markdown(split_section_md))


## Split Data (Optional)
We optionally split the cleaned data into modeling and inference sets for downstream tasks.


In [15]:
# Write the cleaned frame to disk for the following notebooks; index=False keeps
# the DataFrame index out of the CSV.
cleaned_csv_path = 'data/employee_data_cleaned.csv'
clean_df.to_csv(cleaned_csv_path, index=False)

In [16]:
# Section header and short description for the save step.
save_section_md = """
## Save Cleaned Data
The cleaned dataset is saved for use in EDA and modeling notebooks.
"""
display(Markdown(save_section_md))


## Save Cleaned Data
The cleaned dataset is saved for use in EDA and modeling notebooks.
