In [None]:
"""
01 Data Cleaning
Load and clean the employee attrition dataset.
"""

In [None]:
from src.data_processing import load_data, clean_data, split_data
import pandas as pd
from IPython.display import display, Markdown

In [None]:
# Notebook title plus a one-sentence statement of what this notebook does.
# The text is kept in a variable so the display call itself stays short.
title_md = """
# Data Cleaning
This notebook loads and cleans the raw employee attrition dataset, preparing it for analysis and modeling.
"""
display(Markdown(title_md))

In [None]:
# Render the section heading first so it introduces the data preview below
# rather than appearing after it.
display(Markdown("""
## Load Raw Data
We begin by loading the raw dataset to inspect its structure and contents.
"""))

# Read the raw dataset through the project helper `load_data`
# (imported from src.data_processing).
# NOTE(review): the path is relative, so it is presumably resolved against the
# kernel's working directory -- confirm the notebook is launched from the
# directory that contains `data/`.
raw_df = load_data('data/employee_data.csv')

# Last expression of the cell: the first rows are shown as the cell output.
raw_df.head()

In [None]:
# Render the section heading first so it introduces the cleaned preview below
# rather than appearing after it.
display(Markdown("""
## Clean Data
We apply cleaning steps to handle missing values, correct data types, and fix inconsistencies.
"""))

# Run the project helper `clean_data` on the raw frame. The result is bound to
# a new name, so `raw_df` keeps pointing at the loaded data and this cell can
# be re-run without reloading.
# NOTE(review): whether `clean_data` mutates its argument in place is not
# visible from this notebook -- confirm in src.data_processing.
clean_df = clean_data(raw_df)

# Last expression of the cell: the first rows are shown as the cell output.
clean_df.head()

In [None]:
# Render the section heading first so it introduces the split summary below
# rather than appearing after it.
display(Markdown("""
## Split Data (Optional)
We optionally split the cleaned data into modeling and inference sets for downstream tasks.
"""))

# Split the cleaned frame into a modeling set and an inference set using the
# project helper `split_data`. "Optional" refers to downstream use: this cell
# itself always runs, and the two resulting frames are only summarised here --
# they are not written to disk in the cells shown in this notebook.
# NOTE(review): split proportions and any random seed are decided inside
# `split_data`; confirm the split is reproducible across runs.
model_df, infer_df = split_data(clean_df)

# Report the (rows, columns) shape of each part as a quick sanity check.
print(f"Modeling set: {model_df.shape}, Inference set: {infer_df.shape}")

In [None]:
# Render the section heading first so it introduces the write below rather
# than appearing after it.
display(Markdown("""
## Save Cleaned Data
The cleaned dataset is saved for use in EDA and modeling notebooks.
"""))

# Persist the cleaned frame as CSV for the next notebooks.
# index=False leaves the row index out of the file (assuming `clean_df` is a
# pandas DataFrame, as its `.head()` / `.to_csv()` usage suggests).
# An existing file at this path is overwritten.
clean_df.to_csv('data/employee_data_cleaned.csv', index=False)