In [1]:
import pandas as pd
import os

# Paths (relative to the current working directory)
RAW_PATH = "Raw_Dataset/"
CLEAN_PATH = "Cleaned_Dataset/"

# Ensure Cleaned_Dataset folder exists
os.makedirs(CLEAN_PATH, exist_ok=True)

# Load CSVs
# Contact is forced to text on load. With dtype inference an all-digit phone
# column is parsed as int64 (leading zeros are lost) or as float64 when any
# value is missing (values then stringify like "1234567.0", and the stray
# ".0" contributes a bogus trailing digit once non-digits are stripped).
donors = pd.read_csv(
    os.path.join(RAW_PATH, "donors.csv"),
    dtype={"Contact": str},
)
collectors = pd.read_csv(
    os.path.join(RAW_PATH, "collectors.csv"),
    dtype={"Contact": str},
)
ewaste = pd.read_csv(os.path.join(RAW_PATH, "ewaste_listings.csv"))
pickup = pd.read_csv(os.path.join(RAW_PATH, "pickup_requests.csv"))

# ------------------ Donors ------------------
# One row per Donor_ID (first occurrence kept), title-cased Name/City, and
# Contact reduced to its digits, truncated to the first 10.
donors = donors.drop_duplicates(subset=["Donor_ID"]).assign(
    Name=lambda df: df["Name"].str.title(),
    City=lambda df: df["City"].str.title(),
    Contact=lambda df: (
        df["Contact"]
        .astype(str)
        .str.replace(r"\D", "", regex=True)
        .str.slice(0, 10)
    ),
)

# ------------------ Collectors ------------------
# One row per Collector_ID (first occurrence kept), title-cased Name/City, and
# Contact reduced to its digits, truncated to the first 10.
collectors = collectors.drop_duplicates(subset=["Collector_ID"]).assign(
    Name=lambda df: df["Name"].str.title(),
    City=lambda df: df["City"].str.title(),
    Contact=lambda df: (
        df["Contact"]
        .astype(str)
        .str.replace(r"\D", "", regex=True)
        .str.slice(0, 10)
    ),
)

# ------------------ Ewaste Listings ------------------
# Deduplicate on Waste_ID, parse both date columns (unparseable values become
# NaT), then discard any listing left without a valid date.
ewaste = (
    ewaste.drop_duplicates(subset=["Waste_ID"])
    .assign(
        Created_At=lambda df: pd.to_datetime(df["Created_At"], errors="coerce"),
        Expiry_Date=lambda df: pd.to_datetime(df["Expiry_Date"], errors="coerce"),
    )
    .dropna(subset=["Created_At", "Expiry_Date"])
)
# FK validation: keep only listings whose donor survived donor cleaning.
ewaste = ewaste[ewaste["Donor_ID"].isin(donors["Donor_ID"])]

# ------------------ Pickup Requests ------------------
# Deduplicate on Request_ID, parse Timestamp (unparseable values become NaT),
# then discard requests without a valid timestamp.
pickup = (
    pickup.drop_duplicates(subset=["Request_ID"])
    .assign(Timestamp=lambda df: pd.to_datetime(df["Timestamp"], errors="coerce"))
    .dropna(subset=["Timestamp"])
)
# FK validation: a request must reference both a retained listing and a
# known collector.
pickup = pickup[
    pickup["Waste_ID"].isin(ewaste["Waste_ID"])
    & pickup["Collector_ID"].isin(collectors["Collector_ID"])
]

# Save cleaned files inside Cleaned_Dataset
# Each table is written without its index, in the same order as before:
# donors, collectors, listings, pickup requests.
for _frame, _filename in (
    (donors, "clean_donors.csv"),
    (collectors, "clean_collectors.csv"),
    (ewaste, "clean_ewaste_listings.csv"),
    (pickup, "clean_pickup_requests.csv"),
):
    _frame.to_csv(os.path.join(CLEAN_PATH, _filename), index=False)

print("✅ Cleaning complete! Cleaned files saved in 'E_Waste/Cleaned_Dataset/'")


✅ Cleaning complete! Cleaned files saved in 'E_Waste/Cleaned_Dataset/'
