# 🏡 California Housing Price Prediction

This notebook demonstrates a regression workflow using the California Housing dataset. We will perform:
- Data loading and preprocessing
- Exploratory Data Analysis (EDA)
- Feature preparation
- Model training using **Random Forest Regressor**
- Model evaluation and visualization

In [None]:
# 📦 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
# NOTE(review): with no category or module filter this silences every warning
# for the rest of the session, including deprecation notices raised by the
# libraries imported above. Consider narrowing it to specific categories.
warnings.filterwarnings("ignore")


## 📂 Load California Housing Dataset

In [None]:
# Fetch the dataset as pandas objects; the returned Bunch exposes a combined
# DataFrame (which includes the "MedHouseVal" target column) under "frame".
data = fetch_california_housing(as_frame=True)
df = data["frame"]

# Preview the first five rows.
df.head(5)

## 📊 Dataset Overview

In [None]:
# Print a concise summary of the frame: column names, non-null counts, dtypes
# and memory usage.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

## 🧪 Feature Correlation

In [None]:
# Pairwise correlations between every column of the frame.
corr_matrix = df.corr()

# Draw the annotated heatmap on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Matrix")
plt.show()

## 🎯 Feature and Target Separation

In [None]:
# Target: the "MedHouseVal" column. Features: every remaining column.
y = df["MedHouseVal"]
X = df.drop("MedHouseVal", axis="columns")

## 🧠 Train/Test Split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## 🌲 Model Training: Random Forest

In [None]:
# Two-stage estimator: standardize the features, then fit a 100-tree random
# forest with a fixed seed.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", RandomForestRegressor(random_state=42, n_estimators=100)),
    ]
)

# fit() returns the fitted pipeline itself, so prediction on the held-out
# features can be chained directly.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)


## 📈 Evaluation Metrics

In [None]:
# Score the held-out predictions with three regression metrics.
mae = mean_absolute_error(y_test, y_pred)

# RMSE is computed as sqrt(MSE) rather than via `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where passing it
# raises a TypeError. The square-root form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")


## 📉 Predicted vs Actual Visualization

In [None]:
# Scatter the predictions against the true values on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.3)

# Dashed red y = x reference line spanning the observed target range; points
# on this line are perfect predictions.
lo, hi = y_test.min(), y_test.max()
ax.plot([lo, hi], [lo, hi], "r--")

ax.set_xlabel("Actual Median House Value")
ax.set_ylabel("Predicted Median House Value")
ax.set_title("Actual vs Predicted")
ax.grid(True)
plt.show()