In [None]:
# House Price Prediction - End to End Project (Colab Ready)
# =========================================================
# This notebook is designed for Google Colab, with automatic installation of dependencies,
# clean error handling, and fallback to synthetic data if Kaggle dataset is unavailable.

# --- Setup ---
!pip install -q pandas numpy scikit-learn matplotlib seaborn xgboost

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# --- Load Dataset ---
# Read train.csv from the working directory; when the file is absent, build a
# synthetic 1000-row, 10-feature regression frame with the same target column
# name so the remaining steps can still run.
print("Loading dataset...")

try:
    df = pd.read_csv('train.csv')
except FileNotFoundError:
    print("⚠️ train.csv not found. Using synthetic dataset for demonstration.")
    from sklearn.datasets import make_regression

    synthetic_X, synthetic_y = make_regression(
        n_samples=1000,
        n_features=10,
        noise=0.1,
        random_state=42,
    )
    feature_names = [f"feature_{i}" for i in range(10)]
    # The target is appended as the last column, named like the real dataset's target.
    df = pd.DataFrame(synthetic_X, columns=feature_names).assign(SalePrice=synthetic_y)
else:
    # Only reached when read_csv succeeded.
    print("✅ Loaded Kaggle House Prices dataset successfully.")

# --- Quick Look ---
# First five rows, printed as plain text.
print(df.head())

# --- Preprocessing ---
# Builds the numeric feature matrix X, the target y, and an 80/20 train/test split.
print("Preprocessing...")

# Fill missing numeric values with each column's median. Non-numeric columns are
# not touched by this step. NOTE: the medians are computed on the full frame,
# i.e. before the train/test split.
df = df.fillna(df.median(numeric_only=True))

# Separate the target, and drop "Id" when the column exists.
# NOTE(review): "Id" is presumably a row identifier in the Kaggle file and carries
# no predictive signal — confirm. The synthetic fallback frame has no such column.
feature_frame = df.drop(columns="SalePrice").drop(columns="Id", errors="ignore")

# One-hot encode string/categorical columns. Without this, fitting on the real
# CSV fails because the estimators cannot convert string values to float.
# Missing categorical values end up as all-zero indicator columns, so no NaN
# remains in the encoded columns. An all-numeric frame passes through unchanged.
X = pd.get_dummies(feature_frame, dtype=float)
y = df["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Models ---
# Hyperparameters are kept in plain dicts so they are easy to spot and tweak.
forest_params = {"n_estimators": 100, "random_state": 42}
boost_params = {
    "n_estimators": 500,
    "learning_rate": 0.05,
    "random_state": 42,
    "verbosity": 0,
}

# Candidate regressors, keyed by the label used in the log lines and results table.
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(**forest_params),
    "XGBoost": xgb.XGBRegressor(**boost_params),
}

# Per-model metrics on the held-out split: {label: {"RMSE": ..., "R2": ...}}.
results = {}

for name, model in models.items():
    print(f"Training {name}...")
    # fit() returns the fitted estimator, so prediction can be chained onto it.
    preds = model.fit(X_train, y_train).predict(X_test)
    # RMSE is the square root of the mean squared error on the test split.
    rmse, r2 = np.sqrt(mean_squared_error(y_test, preds)), r2_score(y_test, preds)
    results[name] = {"RMSE": rmse, "R2": r2}
    print(f"{name}: RMSE={rmse:.2f}, R2={r2:.2f}")

# --- Results ---
# One row per model, one column per metric (RMSE, R2).
results_df = pd.DataFrame(results).T
print("\nModel Performance:")
print(results_df)

# --- Visualization ---
# Draw each metric in its own panel with its own y-axis. RMSE is in the units of
# the target while R2 is at most 1, so on a shared axis the R2 bars would be
# squashed to invisibility whenever the target is large.
fig, axes = plt.subplots(1, len(results_df.columns), figsize=(10, 5), squeeze=False)
for ax, metric in zip(axes.ravel(), results_df.columns):
    results_df[metric].plot(kind="bar", ax=ax, rot=0)
    ax.set_title(metric)
    ax.set_ylabel(metric)
fig.suptitle("Model Comparison (House Price Prediction)")
fig.tight_layout()
plt.show()
