In [2]:
import ast
from pathlib import Path

import numpy as np
import pandas as pd

# --- Load Dataset ---
# low_memory=False makes pandas parse the file in one pass, so every column
# gets a single consistently inferred dtype.
raw_dataset_path = r"D:\DEPI_Project\Datasets\Raw\Real Estate\Dataset.csv"
df = pd.read_csv(raw_dataset_path, encoding="utf-8", low_memory=False)

# --- Cleaning Pipeline ---
"""
Cleans real estate dataset:
- Keeps only relevant columns for property analysis
- Converts numeric columns to float
- Parses amenities safely (from string to list)
- Handles missing values
- Creates derived features (e.g., price per m²)
- Removes duplicates
"""

# --- 1. Keep Only Relevant Columns ---
# Columns wanted for the analysis, in output order. Names that the loaded
# frame does not contain are skipped instead of raising.
useful_cols = [
    "No",
    "Property_Type",
    "Floor_Area",
    "Condition",
    "Amenities",
    "Furnishing_Status",
    "Air_Conditioning (Yes/No)",
    "Heating (Yes/No)",
    "Num_rooms",
    "Num_bathrooms",
    "Price",
    "Latitude",
    "Longitude",
    "Location",
    "distance to nearest Hospital",
    "distance to nearest School",
    "distance to nearest Bus Stand",
    "Crimerate in area",
]
available_cols = [name for name in useful_cols if name in df.columns]
df = df[available_cols]

# --- 2. Fix Data Types ---
# Columns that should hold numbers. Values that cannot be parsed are turned
# into NaN (errors="coerce") rather than aborting the run.
num_cols = [
    "Floor_Area",
    "Num_rooms",
    "Num_bathrooms",
    "Price",
    "distance to nearest Hospital",
    "distance to nearest School",
    "distance to nearest Bus Stand",
    "Crimerate in area",
    "Latitude",
    "Longitude",
]
for col in num_cols:
    if col not in df.columns:
        continue  # column was absent from the raw file; nothing to convert
    df[col] = pd.to_numeric(df[col], errors="coerce")

# --- 3. Standardize Boolean Columns ---
# Accepted spellings of the yes/no flags. The lookup below is done on
# stripped, upper-cased text so variants such as "yes", "no " or " Yes"
# are recognised too; anything else (including missing values) maps to NaN.
bool_map = {"YES": True, "NO": False, "Yes": True, "No": False}
normalized_bool_map = {key.strip().upper(): value for key, value in bool_map.items()}

# (raw column, cleaned column) pairs; the raw columns are left in place.
yes_no_columns = (
    ("Air_Conditioning (Yes/No)", "Air_Conditioning"),
    ("Heating (Yes/No)", "Heating"),
)
for source_col, target_col in yes_no_columns:
    if source_col in df.columns:
        normalized_text = df[source_col].astype(str).str.strip().str.upper()
        df[target_col] = normalized_text.map(normalized_bool_map)

# --- 4. Parse 'Amenities' Safely ---
def safe_parse_amenities(x):
    """Parse a stringified amenities collection into a list.

    Args:
        x: Raw cell value. Only strings are parsed; anything else
            (NaN, None, numbers) is treated as "no amenities".

    Returns:
        A list in every case, so callers can safely run membership tests:
        - a list/tuple/set literal becomes a list of its items,
        - a dict literal becomes a list of its keys,
        - a single quoted string becomes a one-element list,
        - any other literal, or text that is not a valid Python literal,
          becomes an empty list.
    """
    if not isinstance(x, str):
        return []
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval is documented to raise on
        # malformed input; treat such cells as having no amenities.
        return []
    if isinstance(parsed, (list, tuple, set, dict)):
        return list(parsed)
    if isinstance(parsed, str):
        # Wrap a lone amenity so `"Gym" in result` is an exact match and
        # not a substring test against the string itself.
        return [parsed]
    # Numbers, None, booleans, bytes: not a collection of amenities.
    return []
if "Amenities" in df.columns:
    # Replace the raw text with the parsed value returned by the helper.
    df["Amenities"] = df["Amenities"].apply(safe_parse_amenities)
    # Create binary flags: one boolean column per amenity of interest.
    amenity_flags = {
        "has_gym": "Gym",
        "has_parking": "Parking",
        "has_pool": "Swimming Pool",
    }
    for flag_col, amenity in amenity_flags.items():
        df[flag_col] = df["Amenities"].apply(
            lambda amenities, wanted=amenity: wanted in amenities
        )

# --- 5. Handle Missing Values ---
# Drop rows missing critical info. Every column listed here must exist in
# the frame: dropna raises KeyError for a subset label it cannot find.
critical_cols = ["Location", "Price", "Floor_Area", "Latitude", "Longitude"]
df = df.dropna(subset=critical_cols)

# Fill numeric missing values with city-level medians.
# The built-in "median" aggregation is computed vectorised instead of calling
# a Python lambda once per city. A city with no observed value for a column
# has a NaN median, so its gaps stay NaN.
# NOTE(review): "distance to nearest Bus Stand" is not imputed here — confirm
# that this is intentional.
city_groups = df.groupby("Location")
for col in ["Crimerate in area", "distance to nearest Hospital", "distance to nearest School"]:
    if col in df.columns:
        city_median = city_groups[col].transform("median")
        df[col] = df[col].fillna(city_median)

# --- 6. Derived Features ---
# Price per square meter
if "Price" in df.columns and "Floor_Area" in df.columns:
    # Dividing by a zero area yields inf and a negative area yields a negative
    # ratio; mask non-positive areas to NaN so the feature is either a valid
    # number or explicitly missing.
    valid_floor_area = df["Floor_Area"].where(df["Floor_Area"] > 0)
    df["price_per_m2"] = df["Price"] / valid_floor_area

# --- 7. Drop Duplicates ---
# Rows sharing all of these key columns count as the same listing; the first
# occurrence is kept.
duplicate_keys = ["No", "Location", "Latitude", "Longitude"]
df = df.drop_duplicates(subset=duplicate_keys, keep="first")

# --- 8. Save Cleaned Dataset ---
output_path = r"D:\DEPI_Project\Datasets\Cleaned\Real Estate\cleaned_real_estate.csv"
# to_csv does not create missing folders. Make sure the target directory
# exists so the run cannot fail at the very last step.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False, encoding="utf-8")

print(f"Cleaning complete. Saved cleaned dataset to: {output_path}")


Cleaning complete. Saved cleaned dataset to: D:\DEPI_Project\Datasets\Cleaned\Real Estate\cleaned_real_estate.csv
