## 02 - Data Cleaning Pipeline

This notebook focuses on cleaning and standardizing the storage company datasets.  
The goal is to prepare clean, consistent data ready for analysis or dashboarding.


In [1]:
# Imports
import pandas as pd
import numpy as np
import os

# Folder that holds the raw CSV exports, relative to this notebook.
data_path = "../Data"


def _read_raw(csv_name):
    """Load one CSV file from the raw-data folder into a DataFrame."""
    return pd.read_csv(os.path.join(data_path, csv_name))


# One DataFrame per source table, loaded in the same order as before.
customers = _read_raw("customers_dirty.csv")
payments = _read_raw("payments_dirty.csv")
visits = _read_raw("visits_log_dirty.csv")
units = _read_raw("units.csv")


## Check general structure and duplicates


In [2]:
# Registry of every table under a short label. The dict stores references to
# the DataFrame objects themselves, not copies.
datasets = dict(
    customers=customers,
    payments=payments,
    visits=visits,
    units=units,
)

# Structural overview: dimensions and number of fully duplicated rows per table.
separator = "-" * 40
for name, df in datasets.items():
    n_rows, n_cols = df.shape
    print(f"{name} → {n_rows} rows, {n_cols} columns")
    print("Duplicates:", df.duplicated().sum())
    print(separator)


customers → 2200 rows, 18 columns
Duplicates: 0
----------------------------------------
payments → 10000 rows, 7 columns
Duplicates: 0
----------------------------------------
visits → 5000 rows, 5 columns
Duplicates: 0
----------------------------------------
units → 500 rows, 4 columns
Duplicates: 0
----------------------------------------


In [3]:
# Report, per table, only the columns that contain at least one null value.
# Tables without any nulls are skipped silently.
for name, df in datasets.items():
    null_counts = df.isna().sum()
    columns_with_nulls = null_counts[null_counts > 0]
    if columns_with_nulls.empty:
        continue
    print(f"\n{name} missing values:")
    print(columns_with_nulls)



customers missing values:
email                 59
phone                 44
lease_end             65
monthly_fee           54
last_payment_date    113
notes                274
dtype: int64

payments missing values:
note    1668
dtype: int64

units missing values:
size_sqm    6
dtype: int64


In [None]:
# A missing lease_end is replaced with the marker string 'active' (still renting).
# Columns are assigned on the existing DataFrame objects (the names are never
# rebound), so the frames referenced from `datasets` receive the same changes.
if "lease_end" in customers.columns:
    lease_end = customers["lease_end"]
    customers["lease_end"] = lease_end.fillna("active")

# Numeric gaps are imputed with the median of the column they belong to.
# A column that is absent from its table is skipped.
for frame, column in ((customers, "monthly_fee"), (units, "size_sqm")):
    if column not in frame.columns:
        continue
    column_median = frame[column].median()
    frame[column] = frame[column].fillna(column_median)


In [None]:
# Re-count nulls in the two tables touched by the fill step.
# Every column of each table is inspected, so columns that were never filled
# are reported here as well.
for name, df in (("customers", customers), ("units", units)):
    remaining = df.isna().sum()
    remaining = remaining[remaining > 0]
    if remaining.empty:
        print(f"{name} → no missing values in the filled columns.\n")
    else:
        print(f"{name} still has missing values:\n{remaining}\n")


customers still has missing values:
email                 59
phone                 44
last_payment_date    113
notes                274
dtype: int64

units → no missing values in the filled columns.



In [None]:
# Convert text (object) columns that actually hold numbers to a numeric dtype.
# A column is left untouched when any of its first 20 non-null values contains
# a letter, or when pandas cannot parse the whole column as numbers.
# NOTE(review): digit-only identifier columns (e.g. postal codes) would also be
# converted, which drops leading zeros — confirm this is intended.
for name, df in datasets.items():
    for col in df.columns:
        if df[col].dtype != "object":
            continue  # already non-text, nothing to convert

        preview = df[col].dropna().astype(str).head(20).tolist()
        has_letters = any(ch.isalpha() for value in preview for ch in value)
        if has_letters:
            continue  # looks like genuine text

        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # Best effort: a column that fails to parse stays as text.
            pass
    print(f"{name} types checked.")


customers types checked.
payments types checked.
visits types checked.
units types checked.


In [None]:
# Normalize the headers of every table: trim surrounding whitespace,
# lower-case, then turn spaces and hyphens into underscores.
for df in datasets.values():
    normalized = df.columns.str.strip().str.lower()
    normalized = normalized.str.replace(" ", "_")
    normalized = normalized.str.replace("-", "_")
    df.columns = normalized


In [15]:
# Show the header list of each table.
for name, df in datasets.items():
    column_names = df.columns.tolist()
    print(f"{name} columns: {column_names}\n")


customers columns: ['customer_id', 'first_name', 'last_name', 'email', 'phone', 'street_address', 'city', 'postal_code', 'country', 'sign_up_date', 'lease_start', 'lease_end', 'unit_id', 'unit_size', 'monthly_fee', 'payment_status', 'last_payment_date', 'notes']

payments columns: ['transaction_id', 'customer_id', 'date', 'amount', 'method', 'status', 'note']

visits columns: ['visit_id', 'customer_id', 'unit_id', 'visited_at', 'purpose']

units columns: ['unit_id', 'canonical_unit_id', 'size_sqm', 'monthly_fee']



In [None]:
# Export every table in `datasets` to "<data_path>/cleaned/<name>_clean.csv".
# `os` is already imported in the notebook's first cell, so it is not
# re-imported here.

# The output folder is derived from data_path so the raw and cleaned files
# always share the same root; it is created if it doesn't exist yet.
clean_path = os.path.join(data_path, "cleaned")
os.makedirs(clean_path, exist_ok=True)

# index=False keeps the pandas row index out of the written CSV files.
for name, df in datasets.items():
    file_path = os.path.join(clean_path, f"{name}_clean.csv")
    df.to_csv(file_path, index=False)
    print(f"{name} saved to {file_path}")


customers saved to ../Data/cleaned/customers_clean.csv
payments saved to ../Data/cleaned/payments_clean.csv
visits saved to ../Data/cleaned/visits_clean.csv
units saved to ../Data/cleaned/units_clean.csv
