# California Housing Price Prediction Model
## Training with Preprocessing Pipeline

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import os

## 1. Load California Housing Dataset

In [2]:
# Fetch the dataset and assemble the feature columns plus the target into one frame.
california = fetch_california_housing()
df = pd.DataFrame(
    data=california.data,
    columns=california.feature_names,
).assign(target=california.target)

print(f"Dataset shape: {df.shape}")
print(f"\nFeatures: {california.feature_names}")
print(f"\nFirst few rows:")
# Bare last expression so the notebook renders the preview as a rich table.
df.head()

Dataset shape: (20640, 9)

Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']

First few rows:


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


## 2. Data Exploration

In [3]:
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()
print("\nStatistical Summary:")
print(df.describe())
print("\nMissing Values:")
# Per-column count of null entries.
print(df.isnull().sum())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   target      20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB
None

Statistical Summary:
             MedInc      HouseAge      AveRooms     AveBedrms    Population  \
count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
mean       3.870671     28.639486      5.429000      1.096675   1425.476744   
std        1.899822     12.585558      2.474173      0.473911   1132.462122   
min        0.499900      1.000000      0.846

## 3. Save Raw Dataset

In [4]:
# Persist the unprocessed frame (features + target) as CSV without the index column.
raw_csv_path = '../data/california_housing.csv'
os.makedirs('../data', exist_ok=True)  # no error if the directory already exists
df.to_csv(raw_csv_path, index=False)
print("Dataset saved to ../data/california_housing.csv")

Dataset saved to ../data/california_housing.csv


## 4. Preprocessing Pipeline

In [5]:
def preprocess_data(df, scaler=None, fit_scaler=True):
    """
    Impute and scale the housing features, separating out the target if present.

    Rows whose 'target' is missing are dropped rather than imputed, so no
    labels are fabricated. Missing feature values are filled with the
    per-column median of the rows that remain. Note that these medians are
    always computed from the frame passed in, also when ``fit_scaler`` is
    False.

    Args:
        df: DataFrame with the feature columns and, optionally, a 'target'
            column. The caller's frame is not modified.
        scaler: Object exposing ``fit_transform`` / ``transform``. When None,
            a new StandardScaler is created.
        fit_scaler: If True the scaler is fitted on the features and then
            applied; if False only ``transform`` is called, so an already
            fitted scaler should be supplied.

    Returns:
        X_scaled, y, scaler: the scaler's output for the features, the target
        column (None when ``df`` has no 'target' column), and the scaler
        that was used.
    """
    df = df.copy()

    # Separate features and target
    if 'target' in df.columns:
        # A row without a label cannot be used for fitting or scoring; filling
        # it with the median label would silently invent training data.
        df = df.dropna(subset=['target'])
        X = df.drop('target', axis=1)
        y = df['target']
    else:
        X = df
        y = None

    # Handle missing values in the feature columns only
    X = X.fillna(X.median())

    # Scale features
    if scaler is None:
        scaler = StandardScaler()

    if fit_scaler:
        X_scaled = scaler.fit_transform(X)
    else:
        X_scaled = scaler.transform(X)

    return X_scaled, y, scaler

In [6]:
# Run the whole frame through the preprocessing helper, fitting a new scaler.
X_scaled, y, scaler = preprocess_data(df, scaler=None, fit_scaler=True)
print(f"Preprocessed data shape: {X_scaled.shape}")
print(f"Target shape: {y.shape}")

Preprocessed data shape: (20640, 8)
Target shape: (20640,)


## 5. Train-Test Split

In [7]:
# Split the raw rows first and fit the scaler on the training rows only.
# Fitting it on the full dataset before splitting lets the held-out rows
# influence the scaling statistics (data leakage) and would also bake them
# into the scaler that is persisted later.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

X_train, y_train, scaler = preprocess_data(train_df)
# Reuse the training-fitted scaler for the test rows; do not refit.
X_test, y_test, _ = preprocess_data(test_df, scaler=scaler, fit_scaler=False)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

Training set size: 16512
Test set size: 4128


## 6. Train Models

In [8]:
# Linear baseline
lr_model = LinearRegression().fit(X_train, y_train)

# Random forest: 100 trees, fixed seed, n_jobs=-1 to parallelise the fit
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

print("Models trained successfully!")

Models trained successfully!


## 7. Evaluate Models

In [9]:
def evaluate_model(model, X_test, y_test, model_name):
    """
    Score a fitted model on held-out data, print a short report, and return the metrics.

    Args:
        model: Object exposing ``predict``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True target values compared against the predictions.
        model_name: Label used in the printed report header.

    Returns:
        dict with keys 'rmse', 'mae' and 'r2', all in the target's own units
        (the printed RMSE/MAE are multiplied by 100000 and shown as dollars).
    """
    predicted = model.predict(X_test)

    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, predicted))
    mae = mean_absolute_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)

    # NOTE(review): the 100000 factor presumably converts a target expressed
    # in units of $100,000 into dollars — confirm against the dataset docs.
    print(f"\n{model_name} Performance:")
    print(f"  RMSE: ${rmse*100000:.2f}")
    print(f"  MAE: ${mae*100000:.2f}")
    print(f"  R² Score: {r2:.4f}")

    return dict(rmse=rmse, mae=mae, r2=r2)

# Score both fitted models on the same held-out split; each call prints its own report.
lr_metrics = evaluate_model(lr_model, X_test, y_test, model_name="Linear Regression")
rf_metrics = evaluate_model(rf_model, X_test, y_test, model_name="Random Forest")


Linear Regression Performance:
  RMSE: $74558.14
  MAE: $53320.01
  R² Score: 0.5758

Random Forest Performance:
  RMSE: $50546.79
  MAE: $32761.31
  R² Score: 0.8050


## 8. Save Models and Scaler

In [10]:
os.makedirs('../models', exist_ok=True)

# Descriptive record stored next to the model
metadata = {
    'model_type': 'RandomForestRegressor',
    'features': california.feature_names,
    'metrics': rf_metrics,
    'training_samples': len(X_train)
}

# The Random Forest is the model that gets persisted, together with the
# scaler, the feature-name list and the metadata record.
for artifact, destination in (
    (rf_model, '../models/housing_model.pkl'),
    (scaler, '../models/scaler.pkl'),
    (california.feature_names, '../models/feature_names.pkl'),
    (metadata, '../models/metadata.pkl'),
):
    joblib.dump(artifact, destination)

print(
    "\nModels saved successfully!",
    "  - housing_model.pkl",
    "  - scaler.pkl",
    "  - feature_names.pkl",
    "  - metadata.pkl",
    sep="\n",
)


Models saved successfully!
  - housing_model.pkl
  - scaler.pkl
  - feature_names.pkl
  - metadata.pkl


## 9. Test Prediction

In [11]:
# Sanity check: compare the forest's prediction with the true value for the first test row.
actual = y_test.iloc[0]
sample = X_test[:1]  # a slice rather than an index, so the row dimension is kept
prediction = rf_model.predict(sample)

print(f"\nSample Prediction:")
print(f"  Predicted: ${prediction[0]*100000:.2f}")
print(f"  Actual: ${actual*100000:.2f}")
print(f"  Difference: ${abs(prediction[0] - actual)*100000:.2f}")


Sample Prediction:
  Predicted: $50950.00
  Actual: $47700.00
  Difference: $3250.00
