In [1]:
import pandas as pd
import numpy as np

# --- Configuration ---
INPUT_CSV = r"D:\DEPI_Project\Datasets\Final Dataset\final_real_estate_master.csv"
OUTPUT_CSV = r"D:\DEPI_Project\Datasets\Cleaned Final Dataset\cleaned_final_dataset.csv"

# Columns that must hold finite numbers in every row kept for modeling.
NUMERIC_COLS = [
    'Price', 'Floor_Area', 'price_per_m2', 'income', 'population',
    'avg_delay', 'avg_severity', 'avg_duration', 'price_to_income_ratio', 'transport_score'
]

# Lower-cased Property_Type values that are excluded from the dataset.
NON_BUILDING_TYPES = ['land', 'plot', 'empty lot']


def clean_dataset(raw):
    """Return a cleaned copy of *raw* ready for modeling.

    Steps, in order:
      1. Coerce every column in NUMERIC_COLS to a numeric dtype; values that
         cannot be parsed become NaN.
      2. Replace +/-inf with NaN across the whole frame.
      3. Drop rows with NaN in any NUMERIC_COLS column.
      4. Drop rows whose lower-cased Property_Type is in NON_BUILDING_TYPES.
      5. Keep only rows with Floor_Area > 0, 10000 < income < 300000,
         500 < population < 50000 and Price < 1e7.
      6. Drop duplicate 'No' values (first occurrence wins) and reset the index.

    Coercion runs *before* the inf replacement and the numeric comparisons so
    that numbers stored as text are compared as numbers, and textual
    infinities (e.g. "inf") are turned into NaN and dropped instead of
    slipping through.

    Args:
        raw: DataFrame containing at least 'No', 'Property_Type' and every
            column listed in NUMERIC_COLS. It is not modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.

    Raises:
        KeyError: if a required column is missing.
    """
    # assign() builds a new frame, so the caller's data is left untouched and
    # no chained-assignment warning is triggered.
    coerced = raw.assign(
        **{col: pd.to_numeric(raw[col], errors='coerce') for col in NUMERIC_COLS}
    )

    # Infinite values are treated as missing, then all missing numerics go.
    finite = coerced.replace([np.inf, -np.inf], np.nan).dropna(subset=NUMERIC_COLS)

    # Rows with a missing Property_Type are kept: isin() is False for NaN.
    is_non_building = finite['Property_Type'].str.lower().isin(NON_BUILDING_TYPES)

    keep = (
        ~is_non_building
        & (finite['Floor_Area'] > 0)
        & (finite['income'] > 10000) & (finite['income'] < 300000)
        & (finite['population'] > 500) & (finite['population'] < 50000)
        & (finite['Price'] < 1e7)  # remove extremely high prices
    )

    deduped = finite[keep].drop_duplicates(subset=['No'])
    return deduped.reset_index(drop=True)


# Runs when executed as a notebook cell or script (where __name__ is
# "__main__"); `df` stays a module-level name for any code that follows.
if __name__ == "__main__":
    # --- Load Final Dataset ---
    df = pd.read_csv(INPUT_CSV, low_memory=False)

    # --- Cleaning Pipeline ---
    df = clean_dataset(df)

    # --- Save clean data ---
    df.to_csv(OUTPUT_CSV, index=False)

    print(f"Cleaning done. Final dataset shape: {df.shape}")


Cleaning done. Final dataset shape: (288461, 35)
