# Real Estate Machine Learning Pipeline

End-to-end pipeline: Data Loading -> Cleaning -> Feature Engineering -> Clustering -> Classification -> Regression -> Deployment.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, r2_score
import xgboost as xgb
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

## 1. Data Loading & Cleaning

In [None]:
# Load Data (Local or Kaggle)
def remove_price_outliers(frame, whisker=1.5):
    """Drop rows whose ``price`` lies outside the IQR fences.

    Args:
        frame: DataFrame with a numeric ``price`` column.
        whisker: Multiple of the inter-quartile range added beyond Q1/Q3
            to form the lower and upper fences (1.5 is the usual Tukey rule).

    Returns:
        An independent copy of the rows satisfying
        ``Q1 - whisker*IQR <= price <= Q3 + whisker*IQR``.
    """
    q1 = frame['price'].quantile(0.25)
    q3 = frame['price'].quantile(0.75)
    iqr = q3 - q1
    # `between` is inclusive on both ends, matching the >= / <= comparison.
    keep = frame['price'].between(q1 - whisker * iqr, q3 + whisker * iqr)
    # Return a copy so later cells add columns to a real frame instead of a
    # slice of the raw data (avoids SettingWithCopy problems).
    return frame[keep].copy()


# Prefer a local file, then fall back to the Kaggle input directory.
try:
    if os.path.exists('data.csv'):
        df = pd.read_csv('data.csv')
        print("Loaded local data.")
    elif os.path.exists('/kaggle/input/housedata/data.csv'):
        df = pd.read_csv('/kaggle/input/housedata/data.csv')
        print("Loaded Kaggle data.")
    else:
        raise FileNotFoundError("Dataset not found.")
except FileNotFoundError as e:
    # Keep `df` defined so the guard below can skip cleaning.
    print(f"Error: {e}")
    df = pd.DataFrame()

# Clean dates & Remove Price Outliers
if not df.empty:
    df['date'] = pd.to_datetime(df['date'])
    df = remove_price_outliers(df.dropna())
    print(f"Cleaned Data: {df.shape}")

# Must be the last top-level expression of the cell to be rendered;
# inside the `if` above it would be evaluated and silently discarded.
df.head(3)

## 2. Feature Engineering

In [None]:
# New Features
# price_log: log1p-transformed target used by the regressor (inverted with expm1).
df['price_log'] = np.log1p(df['price'])
# The generated app.py hard-codes the same reference year (2025) when it
# rebuilds house_age, so change both places together.
df['house_age'] = 2025 - df['yr_built']
# yr_renovated == 0 is treated as "never renovated".
df['has_renovated'] = (df['yr_renovated'] > 0).astype(int)
df['grade_sqft'] = df['grade'] * df['sqft_living']

# Market Stats for App
# A row with sqft_living == 0 would give an infinite price/sqft and turn the
# saved average into inf; mask non-finite ratios so mean() skips them.
price_per_sqft = (df['price'] / df['sqft_living']).replace([np.inf, -np.inf], np.nan)

stats = {
    'median_price': df['price'].median(),
    'avg_price_sqft': price_per_sqft.mean(),
    'sqft_mean': df['sqft_living'].mean(),
    'sqft_std': df['sqft_living'].std(),
    'bedroom_mean': df['bedrooms'].mean(),
    'bedroom_std': df['bedrooms'].std()
}
# Persisted for the Streamlit app, which loads 'market_stats.pkl' at start-up.
joblib.dump(stats, 'market_stats.pkl')

## 3. Unsupervised Learning (Clustering)

In [None]:
# K-Means Clustering
# Standardise first: price and sqft_living are orders of magnitude larger than
# grade and house_age and would otherwise dominate the Euclidean distance.
cluster_features = ['price', 'sqft_living', 'grade', 'house_age']
scaler = StandardScaler()
X_cluster = scaler.fit_transform(df[cluster_features])

# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the clustering reproducible across environments.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(X_cluster)

# Map clusters to meaningful names
# K-Means ids are arbitrary, so rank clusters by mean price (ascending) and
# name them in that order.
category_names = ['Budget', 'Standard', 'Luxury']
means = df.groupby('cluster')['price'].mean().sort_values()
mapping = dict(zip(means.index, category_names))
df['category'] = df['cluster'].map(mapping)

# hue_order fixes legend order and colours regardless of row order in df.
sns.scatterplot(data=df, x='sqft_living', y='price', hue='category',
                hue_order=category_names, palette='viridis')
plt.title('Property Clusters')
plt.show()

## 4. Modeling

In [None]:
# Feature matrix: all remaining numeric columns. Targets and cluster labels are
# removed to avoid leakage; yr_built / yr_renovated are removed because
# house_age / has_renovated were derived from them.
drop_cols = ['price', 'price_log', 'cluster', 'category', 'date', 'id', 'yr_renovated', 'yr_built']
X = df.drop(drop_cols, axis=1, errors='ignore').select_dtypes(include=[np.number])
# NOTE(review): the generated app reindexes its input to this column list with
# fill_value=0, so any column here that the app's sidebar does not collect is
# fed to the models as 0 at inference time.
feature_names = X.columns.tolist()

# 1. Classification (Predict Category)
y_cls = df['category']
# Stratify so train and test keep the same category proportions; fall back to a
# plain random split if some category has fewer than two rows (stratification
# would raise in that case).
stratify_labels = y_cls if y_cls.value_counts().min() >= 2 else None
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X, y_cls, test_size=0.2, random_state=42, stratify=stratify_labels
)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_c, y_train_c)
print(f"Classifier Accuracy: {clf.score(X_test_c, y_test_c):.2f}")

joblib.dump(clf, 'category_classifier.pkl')
joblib.dump(feature_names, 'cls_features.pkl')

# 2. Regression (Predict Price)
# The regressor is trained on log1p(price); predictions are mapped back with expm1.
y_reg = df['price_log']
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, y_reg, test_size=0.2, random_state=42)

reg = xgb.XGBRegressor(n_estimators=150, learning_rate=0.05, max_depth=5, random_state=42)
reg.fit(X_train_r, y_train_r)

# Report metrics in dollars rather than log units.
preds = np.expm1(reg.predict(X_test_r))
actuals = np.expm1(y_test_r)
print(f"Regression MAE: ${mean_absolute_error(actuals, preds):,.0f}")
print(f"R2 Score: {r2_score(actuals, preds):.3f}")

joblib.dump(reg, 'price_regressor.pkl')
joblib.dump(feature_names, 'reg_features.pkl')

## 5. Deployment Info

In [None]:
# Code for Streamlit App (app.py)
# `app_source` holds the complete source of the Streamlit app as a single string;
# it is written verbatim to `app.py` further down in this cell.
#
# What the embedded app does:
# - load_models() loads category_classifier.pkl, price_regressor.pkl,
#   cls_features.pkl, reg_features.pkl and market_stats.pkl, i.e. the artifacts
#   dumped by the earlier cells of this notebook.
# - It rebuilds house_age, has_renovated and grade_sqft with the same formulas
#   (including the 2025 reference year) used in the Feature Engineering section.
# - The single-row input is reindexed to the saved feature lists with
#   fill_value=0, so training columns not collected in the sidebar become 0.
# - get_explanation() reads GROQ_API_KEY from st.secrets or the environment and
#   requests a short explanation from the Groq chat completions API.
# - cap_price() limits the prediction to a per-grade maximum price per sqft.
#
# NOTE(review): the bare `except:` around load_models() hides the real reason a
# model failed to load.
# NOTE(review): `st.secrets.get(...)` sits inside the try block, so if it raises
# the os.getenv fallback is never reached — confirm behaviour without a secrets file.
app_source = 'import streamlit as st\nimport pandas as pd\nimport joblib\nimport numpy as np\nimport xgboost as xgb\nimport os\nfrom groq import Groq\n\n# Load assets\n@st.cache_resource\ndef load_models():\n    clf = joblib.load(\'category_classifier.pkl\')\n    reg = joblib.load(\'price_regressor.pkl\')\n    cls_feats = joblib.load(\'cls_features.pkl\')\n    reg_feats = joblib.load(\'reg_features.pkl\')\n    market_stats = joblib.load(\'market_stats.pkl\')\n    return clf, reg, cls_feats, reg_feats, market_stats\n\ndef get_explanation(features, price, category, stats):\n    # Prepare context for LLM\n    median = stats[\'median_price\']\n    avg_ppsqft = stats[\'avg_price_sqft\']\n    \n    prompt = f"""\n    Act as a real estate analyst. Write a short, professional explanation for this valuation.\n    Data:\n    - Predicted Price: ${price:,.0f}\n    - Category: {category}\n    - Property: {features[\'sqft_living\']} sqft, {features[\'bedrooms\']} bed, {features[\'bathrooms\']} bath, Grade {features[\'grade\']}\n    - Market Median: ${median:,.0f}\n    - Market Avg $/sqft: ${avg_ppsqft:.0f}\n    \n    Output strictly:\n    1. Reason for price (1 sentence).\n    2. Market comparison (1 sentence).\n    3. 
Final verdict (Realistic/High/Low).\n    """\n\n    try:\n        api_key = st.secrets.get("GROQ_API_KEY") or os.getenv("GROQ_API_KEY")\n        if not api_key: return "⚠️ Groq API Key missing."\n        \n        client = Groq(api_key=api_key)\n        completion = client.chat.completions.create(\n            model="llama-3.3-70b-versatile",\n            messages=[{"role": "user", "content": prompt}],\n            max_tokens=150, temperature=0.3\n        )\n        return completion.choices[0].message.content\n    except Exception as e:\n        return f"Insight unavailable: {str(e)}"\n\ndef validate_data(data):\n    warnings = []\n    # Basic sanity checks\n    if data[\'grade\'] > 10 and data[\'sqft_living\'] < 1000:\n        warnings.append("High grade with small area is unusual.")\n        data[\'grade\'] = 8\n    if data[\'bedrooms\'] > 0 and (data[\'sqft_living\'] / data[\'bedrooms\'] < 150):\n        warnings.append(" Bedroom count seems high for this size.")\n        data[\'bedrooms\'] = int(data[\'sqft_living\'] / 200)\n    return warnings, data\n\ndef cap_price(price, data):\n    # Prevent unrealistic outliers\n    max_ppsf_map = {5: 150, 7: 300, 9: 500, 11: 800, 13: 1500}\n    limit = next((v for k, v in max_ppsf_map.items() if data[\'grade\'] <= k), 1500)\n    \n    if (price / data[\'sqft_living\']) > limit:\n        price = data[\'sqft_living\'] * limit\n    return price\n\n# Main App\nst.set_page_config(page_title="Real Estate AI", layout="wide")\n\ntry:\n    clf, reg, cls_cols, reg_cols, stats = load_models()\nexcept:\n    st.error("Models failed to load. 
Please run the training notebook.")\n    st.stop()\n\nst.title("Real Estate Valuation AI 🏡")\n\n# Sidebar\nst.sidebar.header("Property Details")\ns_sqft = st.sidebar.number_input("SqFt Living", 300, 10000, 2000)\ns_grade = st.sidebar.slider("Grade (1-13)", 1, 13, 7)\ns_year = st.sidebar.number_input("Year Built", 1900, 2025, 2000)\ns_beds = st.sidebar.slider("Bedrooms", 0, 10, 3) \ns_baths = st.sidebar.slider("Bathrooms", 0.0, 8.0, 2.0, 0.5) \ns_renov = st.sidebar.number_input("Renovated (Year)", 0, 2025, 0)\ns_floors = st.sidebar.slider("Floors", 1.0, 3.5, 1.0, 0.5)\ns_water = st.sidebar.selectbox("Waterfront", [0, 1])\ns_cond = st.sidebar.slider("Condition (1-5)", 1, 5, 3)\n\nif st.sidebar.button("Valuate"):\n    raw = {\n        \'sqft_living\': s_sqft, \'grade\': s_grade, \'yr_built\': s_year,\n        \'bedrooms\': s_beds, \'bathrooms\': s_baths, \'yr_renovated\': s_renov,\n        \'floors\': s_floors, \'waterfront\': s_water, \'condition\': s_cond,\n        \'zipcode\': 98000\n    }\n    \n    warns, clean_data = validate_data(raw.copy())\n    for w in warns: st.warning(w)\n        \n    # Feature Engineering\n    features = clean_data.copy()\n    features[\'house_age\'] = 2025 - features[\'yr_built\']\n    features[\'has_renovated\'] = 1 if features[\'yr_renovated\'] > 0 else 0\n    features[\'grade_sqft\'] = features[\'grade\'] * features[\'sqft_living\']\n    \n    # Align features\n    input_df = pd.DataFrame([features]).reindex(columns=reg_cols, fill_value=0)\n    cls_input = pd.DataFrame([features]).reindex(columns=cls_cols, fill_value=0)\n\n    c1, c2 = st.columns(2)\n    \n    # Predict\n    try:\n        # Price\n        log_price = reg.predict(input_df)[0]\n        price = cap_price(np.expm1(log_price), clean_data)\n        \n        # Category\n        cat = clf.predict(cls_input)[0]\n        \n        c1.subheader(f"Category: {cat}")\n        c2.metric("Valuation", f"${price:,.0f}")\n        \n        st.divider()\n        with 
st.spinner("Analyzing..."):\n            insight = get_explanation(clean_data, price, cat, stats)\n            st.info(insight)\n            \n    except Exception as e:\n        st.error(f"Prediction failed: {e}")\n'

# Write the app source to disk. The encoding is given explicitly because
# app_source contains emoji: without it open() uses the platform's locale
# encoding, and writing fails with UnicodeEncodeError on non-UTF-8 locales
# (e.g. cp1252 on Windows).
with open("app.py", "w", encoding="utf-8") as f:
    f.write(app_source)

print("App code written to `app.py`.")

# Requirements
# Unpinned package list for running app.py and this notebook.
reqs = "streamlit\npandas\nnumpy\nscikit-learn\nxgboost\njoblib\ngroq\nmatplotlib\nseaborn"
with open("requirements.txt", "w", encoding="utf-8") as f:
    f.write(reqs)