# Data Cleaning Essentials

# 0) Setup & Load Data

We first load the messy CSV and take a quick look. If the CSV isn't present (e.g., when running standalone), we recreate it on the fly.

In [None]:

import pandas as pd
import numpy as np
import unicodedata
from pathlib import Path

dirty_path = Path("./data/employees_dirty.csv")

df = pd.read_csv(dirty_path, dtype={"emp_id":"Int64","manager_id":"Int64","age":"Int64"})
print("Raw head:"); display(df.head())
print("\nInfo:"); display(df.info())


# 1) Normalize Column Names

Make column names **lower snake_case** for consistency.

In [None]:

df.columns = (
    df.columns
      .str.strip()
      .str.lower()
      .str.replace(r"[^\w]+", "_", regex=True)
      .str.replace(r"_+", "_", regex=True)
      .str.strip("_")
)
print(df.columns.tolist())


# 2) Trim Whitespace, Normalize Unicode & Fix Casing

- Strip whitespace in **all object** columns
- Normalize Unicode (e.g., NBSP to normal spaces)
- Title‑case names, standardize department casing
- Lower‑case emails

In [None]:

# Helper: normalize Unicode and strip
def clean_text(x):
    if not isinstance(x, str):
        return x
    # normalize unusual spaces / accents
    x = unicodedata.normalize("NFKC", x)
    # strip leading/trailing whitespace
    x = x.strip()
    # collapse inner multiple spaces
    x = " ".join(x.split())
    return x

# Apply to all object columns
obj_cols = df.select_dtypes(include="object").columns
for c in obj_cols:
    df[c] = df[c].map(clean_text)

# Emails lowercase
df["email"] = df["email"].str.lower()

# Standardize names
df["first_name"] = df["first_name"].str.title()
df["last_name"]  = df["last_name"].str.title()
df["name"]       = df["name"].str.title()

# Standardize department
df["department"] = df["department"].str.replace(r"\s+", " ", regex=True).str.strip().str.title()

print(df[obj_cols].head(10))


# 3) Split/Extract Names (if only `name` is present)

For rows where `first_name`/`last_name` are missing, try to extract from `name` which may be
in formats like `'Last, First'` or `'First Last'`.

In [None]:

def split_name(full):
    if not isinstance(full, str) or not full:
        return (None, None)
    if "," in full:
        last, first = [p.strip() for p in full.split(",", 1)]
        return (first.title(), last.title())
    parts = full.split()
    if len(parts) >= 2:
        return (parts[0].title(), " ".join(parts[1:]).title())
    return (parts[0].title(), None) if parts else (None, None)

missing_fn = df["first_name"].isna() | (df["first_name"]=="")
missing_ln = df["last_name"].isna()  | (df["last_name"]=="")
need_split = missing_fn | missing_ln

extracted = df.loc[need_split, "name"].map(split_name)
df.loc[need_split, "first_name"] = [t[0] for t in extracted]
df.loc[need_split, "last_name"]  = [t[1] for t in extracted]

df[["name","first_name","last_name"]].head(10)


# 4) Parse Dates & Standardize Types

- Convert `hire_date` to datetime, coerce invalids to `NaT`
- Standardize `salary` to **numeric EUR** by stripping symbols and locale variants
- Ensure categorical types for `department`

In [None]:

# Dates
df["hire_date"] = pd.to_datetime(df["hire_date"], errors="coerce", dayfirst=True)

# Salary: remove currency symbols/spaces/dots used as thousands, commas as decimal in EU, then to float
salary_clean = (
    df["salary"]
      .astype(str)
      .str.replace(r"[^\d,.\-]", "", regex=True) # keep digits . , -
      .str.replace(r"\.(?=\d{3}\b)", "", regex=True) # remove thousand dots like 55.500 -> 55500
      .str.replace(",", ".", regex=False)           # convert comma decimals to dot
)

df["salary_eur"] = pd.to_numeric(salary_clean, errors="coerce")
df["department"] = df["department"].astype("category")

df[["hire_date","salary","salary_eur","department"]].head(10)


# 5) Standardize Country Values

Map messy country values to **ISO‑like** codes. Anything unknown becomes `NaN`.

In [None]:

country_map = {
    "germany":"DE", "de":"DE", "deu":"DE", "ger":"DE", "de ":"DE", " germany":"DE", "de":"DE"
}
df["country"] = (
    df["country"]
      .str.lower()
      .map(country_map)
)

df["country"].head(10)


# 6) Validate & Clean Emails / Phones

- Email: lowercased earlier; mark invalid patterns
- Phone: keep only digits, add `+49` if obviously missing and number starts with `0`

In [None]:

# Email validity (very light check)
valid_email = df["email"].str.contains(r"^[A-Za-z0-9._%+\-']+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$", regex=True, na=False)
df["email_valid"] = valid_email

# Phone normalization (Germany): keep digits, if starts with 0 then assume +49 local
digits = df["phone"].astype(str).str.replace(r"\D", "", regex=True)
df["phone_clean"] = np.where(digits.str.startswith("0"), "+49" + digits.str.lstrip("0"),
                             np.where(digits.str.startswith("49"), "+" + digits, "+" + digits))

df[["email","email_valid","phone","phone_clean"]].head(10)


# 7) Handle Duplicates

Drop duplicate employees by `emp_id`, **keep the last** occurrence (assuming it is the most recent/correct).

In [None]:

before = len(df)
df = df.sort_values(["emp_id","salary_eur"]).drop_duplicates(subset=["emp_id"], keep="last")
after = len(df)
print(f"Dropped {before - after} duplicate rows")


# 8) Missing Values Strategies

- Impute `age` with **median per department**
- If `manager_id` missing, fill with most frequent manager per department
- Leave `notes` missing as-is

In [None]:

# Age: median per department
## If someone didn’t specify their age, assume they’re around the typical age of people in that same department.

df["age"] = df.groupby("department",observed=True)["age"].transform(lambda s: s.fillna(s.median()))

# Manager: mode per department
def fill_mode(s):
    m = s.mode(dropna=True)
    return s.fillna(m.iloc[0]) if not m.empty else s
df["manager_id"] = df.groupby("department",observed=True)["manager_id"].transform(fill_mode)

df[["department","age","manager_id"]].head(10)


# 9) Outliers: Detect & Cap via IQR

Use the **IQR rule** to cap extreme `age` and `salary_eur` values (winsorization).

In [None]:
df.info()

In [18]:
# 1️⃣ Force conversion to pure float64 (not nullable Int64)
df["age"] = pd.to_numeric(df["age"], errors="coerce").astype("float64")
df["salary_eur"] = pd.to_numeric(df["salary_eur"], errors="coerce").astype("float64")

# 2️⃣ Define the IQR-capping function safely
def cap_outliers(series, k=1.5):
    # drop NaN before computing quartiles
    clean = series.dropna()
    if clean.empty:
        return series  # nothing to cap

    q1 = clean.quantile(0.25)
    q3 = clean.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - k * iqr
    upper = q3 + k * iqr

    # clip only on numeric arrays
    capped = series.clip(lower=lower, upper=upper)
    return capped

# 3️⃣ Apply it
df["age"] = cap_outliers(df["age"])
df["salary_eur"] = cap_outliers(df["salary_eur"])

# 4️⃣ Verify result
print(df[["age", "salary_eur"]].describe())




             age   salary_eur
count   8.000000      8.00000
mean   31.812500  43953.12500
std    11.153755  28221.67416
min    12.750000     60.00000
25%    27.000000  34516.25000
50%    30.500000  56750.00000
75%    36.500000  58250.00000
max    50.750000  75000.00000


# 10) Feature Engineering

- `tenure_years` from `hire_date`
- Full `display_name`
- Department one‑hot example

In [19]:

today = pd.Timestamp.today().normalize()
df["tenure_years"] = ((today - df["hire_date"]).dt.days / 365.25).round(2)
df["display_name"] = df["first_name"].fillna("") + " " + df["last_name"].fillna("")
df = df.join(pd.get_dummies(df["department"], prefix="dept", dtype="Int64"))

df[["emp_id","display_name","tenure_years"] + [c for c in df.columns if c.startswith("dept_")]].head()


Unnamed: 0,emp_id,display_name,tenure_years,dept_Engineering,dept_Hr,dept_Human Resources,dept_Sales
8,101,Anna Schmidt,3.03,0,0,0,1
1,102,Peter Müller,,0,0,0,1
2,103,Maria Garcia,,1,0,0,0
3,104,John Doe,,1,0,0,0
4,105,Alice Wong,,0,1,0,0


# 11) Final QA Checks

Common validations before saving.

In [20]:

expected_cols = {"emp_id","first_name","last_name","email","email_valid","phone_clean","hire_date",
                 "salary_eur","department","country","age","manager_id","tenure_years","display_name"}
missing = expected_cols - set(df.columns)
print("Missing expected columns:", missing)

# Basic sanity
print("Any negative salary?", (df["salary_eur"] < 0).any())
print("Any hire_date NaT?", df["hire_date"].isna().any())
print("Invalid emails count:", (~df["email_valid"]).sum())

display(df.head(10))


Missing expected columns: set()
Any negative salary? False
Any hire_date NaT? True
Invalid emails count: 2


Unnamed: 0,emp_id,name,first_name,last_name,email,phone,hire_date,salary,department,country,...,notes,salary_eur,email_valid,phone_clean,tenure_years,display_name,dept_Engineering,dept_Hr,dept_Human Resources,dept_Sales
8,101,,Anna,Schmidt,anna.schmidt@example.com,0301234567,2022-10-01,€46.000,Sales,DE,...,duplicate row with newer salary,46000.0,True,49301234567,3.03,Anna Schmidt,0,0,0,1
1,102,"Müller, Peter",Peter,Müller,peter.mueller@example.com,030-555-9876,NaT,"€55.500,00",Sales,DE,...,überdurchschnittlich,55500.0,True,49305559876,,Peter Müller,0,0,0,1
2,103,"Garcia , Maria",Maria,Garcia,maria.garcia(at)example.com,+49-40-222-3333,NaT,"60,000 EUR",Engineering,DE,...,needs coaching,60.0,False,49402223333,,Maria Garcia,1,0,0,0
3,104,,John,Doe,john.doe@example.com,49 89 444 555,NaT,$75000,Engineering,DE,...,late on Mondays,75000.0,True,4989444555,,John Doe,1,0,0,0
4,105,Alice Wong,Alice,Wong,alice.wong@example,040 333 222,NaT,"£65,000",Hr,DE,...,PROMOTED,65.0,False,4940333222,,Alice Wong,0,1,0,0
5,106,"O'Connor, Sean",Sean,O'Connor,sean.o'connor@example.com,+49 (30) 111-2222,NaT,EUR 58.000,Human Resources,,...,,58000.0,True,49301112222,,Sean O'Connor,0,0,1,0
6,107,,Léa,Dubois,lea.dubois@example.com,030 111 222 333,2024-05-04,58000,Engineering,DE,...,,58000.0,True,4930111222333,1.44,Léa Dubois,1,0,0,0
7,108,"Rossi,Marco",Marco,Rossi,marco.rossi@example.com,089/555-111,2022-01-12,59000€,Engineering,DE,...,trainee,59000.0,True,4989555111,3.75,Marco Rossi,1,0,0,0


# 12) Save Clean Output

Write the cleaned dataset to disk.

In [23]:

clean_path = "./data/employees_clean.csv"
df.to_csv(clean_path, index=False, encoding="utf-8")
print("Saved cleaned CSV to:", clean_path)


Saved cleaned CSV to: ./data/employees_clean.csv
