In [9]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
import re
import joblib

# --- 1. Load Data ---
# Every later step needs `df`, so a missing input file is fatal: report it and
# re-raise instead of continuing into an unrelated NameError further down.
print("Loading data...")
try:
    df = pd.read_csv('real_estate_data.csv')
except FileNotFoundError:
    print("Error: 'real_estate_data.csv' not found.")
    raise
else:
    print(f"Data loaded: {df.shape}")

# --- 2. Standardize & Rename Columns ---
# Normalise headers to lower_snake_case before matching them against aliases.
df.columns = df.columns.str.lower().str.strip().str.replace(' ', '_')

# alias -> canonical column name. Identity entries list canonical names that
# need no renaming.
rename_map = {
    'price': 'price',
    'total_area': 'area_sqft',
    'area': 'area_sqft',
    'sqft': 'area_sqft',
    'location': 'neighborhood',
    'bhk': 'beds',
    'bedroom': 'beds',
    'bed': 'beds',
    'bath': 'baths',
    'bathroom': 'baths',
    'bathrooms': 'baths',
    'balcony': 'balcony',
    'parking': 'parking',
    'car_parking': 'parking',
    'floor': 'floor',
    'total_floors': 'total_floors',
    'status': 'status',
    'availability': 'status'
}

# Several aliases share one canonical name. Renaming all of them blindly would
# create duplicate column labels when the file contains more than one alias
# (e.g. both 'area' and 'sqft'), and df['area_sqft'] would then return a
# DataFrame instead of a Series. Only the first alias per target is renamed;
# later ones keep their original name.
taken_names = set(df.columns)
effective_renames = {}
for alias, canonical in rename_map.items():
    if alias == canonical or alias not in df.columns:
        continue
    if canonical in taken_names:
        continue
    effective_renames[alias] = canonical
    taken_names.add(canonical)
df.rename(columns=effective_renames, inplace=True)

# Handle potential duplicate price columns
if 'raw_price' not in df.columns and 'price' in df.columns:
    df['raw_price'] = df['price']  # Keep original for reference if needed

# --- 3. ROBUST PRICE CLEANING (Moved to Top) ---
# We clean price FIRST to avoid errors when calculating mean later
def clean_price(x):
    """Convert a raw price value to a plain number.

    The value is stringified and upper-cased, then the rupee sign, commas and
    'INR' are removed. A unit is detected by substring, checked in this order:
    'CR' (x 10,000,000), 'L' (x 100,000), 'K' (x 1,000). The first number in
    the remaining text is multiplied by that unit.

    Args:
        x: Raw cell value (string or number); may be missing.

    Returns:
        The price as a float, or ``np.nan`` when ``x`` is missing or contains
        no number.
    """
    if pd.isna(x):
        return np.nan

    text = str(x).upper().strip()
    # '\u20b9' is the rupee sign, written as an escape so the literal cannot be
    # corrupted by a wrong file encoding.
    text = text.replace('\u20b9', '').replace(',', '').replace('INR', '').strip()

    multiplier = 1
    if 'CR' in text:
        multiplier = 10000000
        text = text.replace('CR', '').strip()
    elif 'L' in text:
        multiplier = 100000
        text = text.replace('LACS', '').replace('LAC', '').replace('L', '').strip()
    elif 'K' in text:
        multiplier = 1000
        text = text.replace('K', '').strip()

    # Require at least one digit. A bare '[\d.]+' would match the lone '.' in
    # "RS. 50" first and turn a perfectly parseable price into NaN.
    match = re.search(r"\d+(?:\.\d+)?|\.\d+", text)
    if match is None:
        return np.nan
    return float(match.group()) * multiplier

print("Cleaning Price Column...")
# Parse from 'raw_price' (the untouched copy of the original values) when it
# exists; otherwise parse the 'price' column itself.
if 'raw_price' in df.columns:
    col_to_clean = 'raw_price'
else:
    col_to_clean = 'price'
df['price'] = df[col_to_clean].apply(clean_price)

# Rows whose price could not be parsed come back as NaN and are discarded here.
initial_count = len(df)
df.dropna(subset=['price'], inplace=True)
print(f"Price cleaning complete. Rows retained: {len(df)}/{initial_count}")


# --- 4. City Column Handling ---
target_cities = [
    'Mumbai', 'Bangalore', 'Chennai', 'Delhi',
    'Pune', 'Hyderabad', 'Ahmedabad', 'Kolkata',
]


def extract_city(address):
    """Return the first entry of ``target_cities`` found in ``address``.

    Matching is a case-insensitive substring test, tried in the order the
    cities are listed. Returns ``None`` for a missing address or when no
    listed city occurs in it.
    """
    if pd.isna(address):
        return None
    haystack = str(address).lower()
    return next(
        (name for name in target_cities if name.lower() in haystack),
        None,
    )

# Resolve a 'city' column: reuse the first column whose name mentions 'city'
# or 'address'; failing that, derive it from the neighborhood text.
if 'city' not in df.columns:
    possible_city = [name for name in df.columns if 'city' in name or 'address' in name]
    if possible_city:
        df.rename(columns={possible_city[0]: 'city'}, inplace=True)
    elif 'neighborhood' in df.columns:
        print("Attempting to extract city from neighborhood...")
        df['city'] = df['neighborhood'].apply(extract_city)

# Fall back to random city labels when there is no column at all, or when more
# than 80% of its values are missing. The draw is unseeded, so it differs
# between runs.
city_unusable = 'city' not in df.columns or df['city'].isnull().sum() > len(df) * 0.8
if city_unusable:
    print("WARNING: No valid 'city' column found. Creating synthetic city distribution for testing.")
    df['city'] = np.random.choice(target_cities, size=len(df))

if 'city' in df.columns:
    # Title-case and trim the labels.
    # NOTE(review): astype(str) also converts missing values to text, so rows
    # without a recognised city become a category of their own (e.g. 'None').
    df['city'] = df['city'].astype(str).str.title().str.strip()

    # Every target city gets at least one row so that its one-hot column exists.
    # Placeholder rows use the mean of the already-numeric price column.
    mean_price = df['price'].mean()

    missing_cities = set(target_cities) - set(df['city'].unique())
    if missing_cities:
        print(f"Injecting dummy rows for missing cities: {missing_cities}")
        dummy_rows = [
            {
                'city': absent_city,
                'price': mean_price,
                'area_sqft': 1000,
                'beds': 2,
                'baths': 2,
            }
            for absent_city in missing_cities
        ]
        df = pd.concat([df, pd.DataFrame(dummy_rows)], ignore_index=True)

# --- 5. Impute Missing Values ---
numeric_cols = ['area_sqft', 'baths', 'beds', 'balcony', 'parking', 'floor', 'total_floors']
for col in numeric_cols:
    # A column the dataset lacks is created as all zeros.
    if col not in df.columns:
        df[col] = 0
    # Unparseable entries become NaN via errors='coerce' and are then zero-filled.
    as_number = pd.to_numeric(df[col], errors='coerce')
    df[col] = as_number.fillna(0)

# Status Handling: 1 when the status text contains 'ready' (case-insensitive),
# else 0. Without a status column every row is flagged as ready.
if 'status' not in df.columns:
    df['status_ready'] = 1
else:
    df['status_ready'] = df['status'].apply(lambda value: int('ready' in str(value).lower()))

# NOTE(review): 'area_sqft' was zero-filled above, so only a missing 'price'
# can still cause a row to be dropped here.
df.dropna(subset=['price', 'area_sqft'], inplace=True)

# --- 6. Outlier Removal ---
def remove_outliers(df, col):
    """Return the rows of ``df`` whose ``col`` value passes the 1.5 * IQR rule.

    A row is kept when its value lies between Q1 - 1.5 * IQR and
    Q3 + 1.5 * IQR, both bounds inclusive. The input frame is not modified.
    """
    values = df[col]
    q_low = values.quantile(0.25)
    q_high = values.quantile(0.75)
    margin = 1.5 * (q_high - q_low)
    within_fence = values.between(q_low - margin, q_high + margin)
    return df[within_fence]

# Filter on price first, then on area; the area quartiles are therefore
# computed on the rows that survived the price filter.
for col in ('price', 'area_sqft'):
    df = remove_outliers(df, col)

# --- 7. Feature Engineering (One-Hot Encoding) ---
print("Encoding Cities...")
df = pd.get_dummies(df, columns=['city'], prefix='city', drop_first=False)

# Verify city columns exist. get_dummies names them '<prefix>_<value>', so
# match on the prefix; a substring test would also report unrelated columns
# that merely contain 'city_' (e.g. 'electricity_bill').
city_cols_check = [c for c in df.columns if c.startswith('city_')]
print(f"City columns created: {city_cols_check}")

# --- 8. Define Features & Scale ---
# Select one-hot columns by prefix so unrelated columns that merely contain
# 'city_' (e.g. 'electricity_bill') are never pulled into the feature matrix.
city_cols = [c for c in df.columns if c.startswith('city_')]
features = ['area_sqft', 'baths', 'beds', 'balcony', 'parking', 'floor', 'total_floors', 'status_ready'] + city_cols

print(f"Total Features: {len(features)}")

X = df[features].copy()
y = df['price']

scale_cols = ['area_sqft', 'baths', 'beds', 'balcony', 'parking', 'floor', 'total_floors']

# --- 9. Train & Save ---
print("Training Model...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits.
# Fitting on the full data would leak the test set's min/max into the
# preprocessing that is saved and reused at prediction time.
X_train = X_train.copy()
X_test = X_test.copy()
scaler = MinMaxScaler()
X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])
X_test[scale_cols] = scaler.transform(X_test[scale_cols])

model = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(f"R2 Score: {r2_score(y_test, y_pred):.2f}")

# Persist the model, the fitted scaler and the exact feature order together.
joblib.dump(model, 'house_price_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
with open('feature_columns.txt', 'w', encoding='utf-8') as f:
    f.write(','.join(features))

print("Training Complete. Artifacts saved.")

Loading data...
Data loaded: (14528, 9)
Cleaning Price Column...
Price cleaning complete. Rows retained: 14528/14528
Attempting to extract city from neighborhood...
Injecting dummy rows for missing cities: {'Ahmedabad'}
Encoding Cities...
City columns created: ['city_Ahmedabad', 'city_Bangalore', 'city_Chennai', 'city_Delhi', 'city_Hyderabad', 'city_Kolkata', 'city_Mumbai', 'city_None', 'city_Pune']
Total Features: 17
Training Model...
R2 Score: 0.37
Training Complete. Artifacts saved.
