In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
# Load the California housing data; as_frame=True exposes it as a pandas DataFrame
housing = fetch_california_housing(as_frame=True)
df = housing.frame

# Preview the first rows
print("Dataset head:", df.head(), sep="\n")

# Count missing entries in each column (isna is an alias of isnull)
print("\nMissing values per column:", df.isna().sum(), sep="\n")

# Target is the 'MedHouseVal' column; every other column is a feature
y = df['MedHouseVal']
X = df.drop(columns='MedHouseVal')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply it to both splits,
# so no statistics from the test rows leak into the transformation
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nFeature scaling applied: StandardScaler (mean=0, std=1)")

# Explanation
# Loaded the sklearn dataset as a DataFrame for easy handling.

# Checked for missing values — none found, so no imputation is needed.

# Split the dataset into train/test sets for model evaluation.

# Scaled the features with StandardScaler because many algorithms (especially SVR) need features on a similar scale for better convergence and performance.


Dataset head:
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  

Missing values per column:
MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

Feature scaling applied: StandardScaler (mean=0, std=1)


In [5]:
# Baseline model: ordinary least-squares linear regression on the scaled features.
# fit() returns the estimator itself, so construction and training can be chained.
lr = LinearRegression().fit(X_train_scaled, y_train)

# Predictions on the held-out test split
y_pred_lr = lr.predict(X_test_scaled)

print("Linear Regression trained.")
# Explanation:
# Simple and interpretable; assumes a linear relationship between features
# and target, which makes it a good baseline.

Linear Regression trained.


In [7]:
# Single decision tree; random_state is fixed so repeated runs build the same tree.
# fit() returns the estimator itself, so construction and training can be chained.
dt = DecisionTreeRegressor(random_state=42).fit(X_train_scaled, y_train)

# Predictions on the held-out test split
y_pred_dt = dt.predict(X_test_scaled)

print("Decision Tree Regressor trained.")
# Explanation:
# Non-linear model that captures feature interactions, but is prone to
# overfitting if it is not tuned.


Decision Tree Regressor trained.


In [9]:
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)

print("Random Forest Regressor trained.")

# Explanation:
# Ensemble of decision trees, reduces overfitting, generally strong performer.


Random Forest Regressor trained.
