# In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

# --- Load Dataset ---
# low_memory=False makes pandas parse the file in one pass, so each column's
# dtype is inferred from the whole file instead of chunk by chunk.
df = pd.read_csv(
    r"D:\Project_Data\Crimes_-_2025_20250918.csv", low_memory=False, encoding="utf-8"
)

# --- Cleaning Pipeline ---
# Cleans the crime dataset:
# - Keeps only relevant fields for housing/safety analysis
# - Fixes datetime columns
# - Converts Arrest to boolean
# - Converts numeric columns
# - Handles missing values
# - Removes duplicates
# (Written as comments: a bare string literal in the middle of a script is an
# expression statement that is evaluated and discarded, not a docstring.)

# --- 1. Keep Only Relevant Columns ---
useful_cols = [
    "ID", "Date", "Primary Type", "Description",
    "Arrest", "Community Area", "Latitude", "Longitude", "Crime Rate",
]
# Columns that are absent from the input are skipped instead of raising
# KeyError. The explicit .copy() makes the subset an independent frame, so the
# in-place column assignments in the later steps are unambiguous writes to
# this frame and are not reported as writes to a slice of the raw data
# (SettingWithCopyWarning).
df = df[[col for col in useful_cols if col in df.columns]].copy()

# --- 2. Fix datetime column ---
# Values that cannot be parsed become NaT (errors="coerce"); utc=True produces
# a timezone-aware UTC column.
# NOTE(review): if the source timestamps are naive local times, utc=True
# labels them as UTC rather than converting them — confirm this is intended.
if "Date" in df.columns:
    raw_dates = df["Date"]
    df["Date"] = pd.to_datetime(raw_dates, utc=True, errors="coerce")

# --- 3. Convert Arrest to Boolean ---
if "Arrest" in df.columns:
    # Real booleans pass through unchanged and the listed spellings are
    # translated. Any value without a key in the lookup comes out of map() as
    # missing and is stored as <NA> in the nullable "boolean" column.
    arrest_lookup = {
        **dict.fromkeys((True, "TRUE", "true", "True"), True),
        **dict.fromkeys((False, "FALSE", "false", "False"), False),
    }
    df["Arrest"] = df["Arrest"].map(arrest_lookup).astype("boolean")

# --- 4. Numeric Conversion ---
# Coerce each numeric field that exists in the frame; entries that cannot be
# parsed as numbers become NaN (errors="coerce").
num_cols = ["Community Area", "Latitude", "Longitude", "Crime Rate"]
present_numeric = [name for name in num_cols if name in df.columns]
for name in present_numeric:
    df[name] = pd.to_numeric(df[name], errors="coerce")

# --- 5. Handle Missing Values ---
# Rows lacking any critical field (among those present in the frame) are dropped.
critical_cols = ["ID", "Primary Type", "Community Area", "Latitude", "Longitude"]
required_present = [name for name in critical_cols if name in df.columns]
df = df.dropna(subset=required_present)

# A missing Description falls back to the same row's Primary Type.
if {"Description", "Primary Type"}.issubset(df.columns):
    fallback_text = df["Primary Type"]
    df["Description"] = df["Description"].fillna(fallback_text)

# A missing Crime Rate takes the median of its Community Area group. A group
# with no known rate has a NaN median, so its rows stay missing.
if {"Crime Rate", "Community Area"}.issubset(df.columns):
    area_median = df.groupby("Community Area")["Crime Rate"].transform("median")
    df["Crime Rate"] = df["Crime Rate"].fillna(area_median)

# --- 6. Remove Duplicates by ID ---
# Keeps the first row seen for each ID (drop_duplicates default). Guarded like
# every other step so an input without an "ID" column passes through unchanged
# instead of raising KeyError.
if "ID" in df.columns:
    df = df.drop_duplicates(subset=["ID"])

# --- 7. Save Cleaned Dataset ---
output_path = r"D:\Project_Data\Processed Data\Crimes\cleaned_crimes.csv"
# to_csv does not create missing folders; make sure the target directory
# exists first so the run cannot fail at the very last step after all the
# cleaning work has been done.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the positional index out of the output file.
df.to_csv(output_path, index=False, encoding="utf-8")

print(f"Cleaning complete. Saved cleaned dataset to: {output_path}")
