# California Housing – Preprocessing


In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [3]:
# Fetch the California housing dataset as pandas objects and work on a copy
# of its combined frame so the fetched bunch itself is never modified.
housing = fetch_california_housing(as_frame=True)
df = housing.frame.copy()

# Separate the regression target column from the remaining feature columns.
y = df["MedHouseVal"]
X = df.drop(columns=["MedHouseVal"])


In [4]:
# Quick sanity check: dimensions of the feature matrix and target, plus the
# names of the feature columns.
for label, value in (
    ("X shape:", X.shape),
    ("y shape:", y.shape),
    ("Features:", X.columns.tolist()),
):
    print(label, value)

X shape: (20640, 8)
y shape: (20640,)
Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [7]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the size of each partition.
for feature_label, target_label, suffix, features, targets in (
    ("X_train:", "X_test:", None, None, None),
)[:0]:
    pass  # (no-op placeholder loop intentionally empty)
print("X_train:", X_train.shape, "X_test:", X_test.shape)
print("y_train:", y_train.shape, "y_test:", y_test.shape)


X_train: (16512, 8) X_test: (4128, 8)
y_train: (16512,) y_test: (4128,)


In [8]:
# Learn the per-feature mean and standard deviation from the training split
# only, so no statistics from the test split influence the scaling.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Scaled shapes:", X_train_scaled.shape, X_test_scaled.shape)


Scaled shapes: (16512, 8) (4128, 8)


In [9]:
# Per-feature mean and standard deviation of the scaled splits, shown for the
# first five features. The scaler was fitted on the training split, so the
# training values should be ~0 and ~1; the test values only approximately so.
scaled_stats = (
    ("Train mean (first 5):", X_train_scaled.mean(axis=0)),
    ("Train std  (first 5):", X_train_scaled.std(axis=0)),
    ("Test mean  (first 5):", X_test_scaled.mean(axis=0)),
    ("Test std   (first 5):", X_test_scaled.std(axis=0)),
)

for label, per_feature in scaled_stats:
    print(label, per_feature[:5])


Train mean (first 5): [-6.51933288e-17 -9.25185854e-18 -1.98108110e-16 -1.70729064e-16
 -2.15159501e-19]
Train std  (first 5): [1. 1. 1. 1. 1.]
Test mean  (first 5): [-0.02647585  0.01237949 -0.01305925 -0.00011079 -0.00429306]
Test std   (first 5): [0.9879485  0.99322892 1.17051691 1.4084032  0.97965408]


In [12]:
# Write the scaled feature matrices and the target vectors to disk as .npy files.
#
# np.save does not create missing parent directories, so the output folder is
# created first (idempotently); without this the cell raises FileNotFoundError
# on a fresh checkout where ../data does not exist yet.
DATA_DIR = Path("../data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

np.save(DATA_DIR / "X_train_scaled.npy", X_train_scaled)
np.save(DATA_DIR / "X_test_scaled.npy", X_test_scaled)
# The targets are pandas Series; convert them to plain NumPy arrays before saving.
np.save(DATA_DIR / "y_train.npy", y_train.to_numpy())
np.save(DATA_DIR / "y_test.npy", y_test.to_numpy())

print("Saved arrays to ../data/")


Saved arrays to ../data/
