In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score



In [6]:
# Load the California Housing dataset. The returned object exposes the feature
# matrix (`data.data`), the feature names (`data.feature_names`) and the
# regression target (`data.target`), all of which are used in the next cell.
data = fetch_california_housing()

In [7]:
# Build a DataFrame with one labelled column per feature and attach the
# regression target as an extra 'Target' column in the same expression.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(Target=data.target)



In [8]:
# Count the missing entries in every column (features and target) and report them
missing_per_column = df.isna().sum()
print("Missing Values:\n", missing_per_column)



Missing Values:
 MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
Target        0
dtype: int64


We convert the data to a DataFrame so that it can be inspected and manipulated with pandas (labelled columns, summary statistics, easy column selection).
Checking for missing values is essential for data quality: missing entries can cause errors or bias during modelling and must be handled (e.g. imputed or dropped) beforehand. Here every column reports 0 missing values, so no imputation is required.


In [10]:
# Separate the regression target (y) from the predictor columns (X)
target_column = 'Target'
y = df[target_column]
X = df.drop(target_column, axis=1)



In [11]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [12]:
# Standardize the features. The scaler is fitted on the training split only,
# and the same fitted statistics are then applied to both splits so that no
# information from the test set is used when computing them.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)





In [13]:
# Candidate regressors, keyed by the display name used in the results table.
# Every estimator that accepts a random_state shares the same seed so the
# comparison is reproducible.
model_seed = 42
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=model_seed),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=model_seed),
    "Gradient Boosting": GradientBoostingRegressor(random_state=model_seed),
    "Support Vector Regressor": SVR(),
}

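Random Forest is a strong general-purpose choice: by averaging many decision trees it reduces the overfitting of a single decision tree and usually improves accuracy, and it works for both classification and regression tasks. It is often credited with handling missing values well, but in scikit-learn native missing-value support depends on the library version; this dataset has no missing values, so that is not a factor here.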

Random Forest suits the California housing data because it handles non-linear relationships, captures complex feature interactions, reduces overfitting relative to a single tree, and provides robust predictions even with noisy data (missing data is not a concern here, since the dataset contains none).

In [14]:
# Fit every candidate model on the training split and score it on the test
# split, collecting R2, MSE and MAE per model.
results = {}
for name, model in models.items():
    # Only the SVR is given the standardized features; every other model is
    # trained and evaluated on the raw features (presumably because SVR is
    # sensitive to feature scale -- confirm if more models are added).
    needs_scaling = name == "Support Vector Regressor"
    train_features = X_train_scaled if needs_scaling else X_train
    test_features = X_test_scaled if needs_scaling else X_test

    y_pred = model.fit(train_features, y_train).predict(test_features)

    results[name] = {
        "R2 Score": r2_score(y_test, y_pred),
        "MSE": mean_squared_error(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
    }



In [15]:
# Tabulate the metrics: the nested dict yields one column per model, so it is
# transposed to get one row per model and one column per metric.
metrics_by_model = pd.DataFrame(results)
results_df = metrics_by_model.transpose()
print("\nModel Performance Comparison:\n", results_df)




Model Performance Comparison:
                           R2 Score       MSE       MAE
Linear Regression         0.575788  0.555892  0.533200
Decision Tree             0.622076  0.495235  0.454679
Random Forest             0.805123  0.255368  0.327543
Gradient Boosting         0.775645  0.293997  0.371643
Support Vector Regressor  0.727563  0.357004  0.398599


In [16]:
# Rank the models by R2 score: the highest is the best, the lowest the worst
r2_scores = results_df['R2 Score']
best_model = r2_scores.idxmax()
worst_model = r2_scores.idxmin()

print(f"Best Performing Model: {best_model} with R2 Score: {r2_scores.loc[best_model]}")
print(f"Worst Performing Model: {worst_model} with R2 Score: {r2_scores.loc[worst_model]}")


Best Performing Model: Random Forest with R2 Score: 0.8051230593157366
Worst Performing Model: Linear Regression with R2 Score: 0.575787706032451
