# Solve the California Housing Prices

In [1]:
from sklearn.datasets import fetch_california_housing

california_housing = fetch_california_housing(as_frame=True)

x = california_housing.data
y = california_housing.target


## Use 5-fold cross-validation

In [2]:
from sklearn.model_selection import KFold

kfold = KFold(n_splits=5)

## Compare linear regression and decision tree

## Evaluate the MSE and MAE

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor()
}

for name, model in models.items():

    MAE = []
    MSE = []

    for train_index, test_index in kfold.split(x,y):
        X_train, X_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        MAE.append(mean_absolute_error(y_test, y_pred))
        MSE.append(mean_squared_error(y_test, y_pred))

    print(f'{name}')
    print(f"MAE: {np.mean(MAE):.4f}")
    print(f"MSE: {np.mean(MSE):.4f}")
    print()

Linear Regression
MAE: 0.5475
MSE: 0.5583

Decision Tree
MAE: 0.6238
MSE: 0.8239



## Compare each model when using and not using power transform

In [8]:
from sklearn.preprocessing import PowerTransformer

power_transform = PowerTransformer()
x_transform = power_transform.fit_transform(x)



In [11]:
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor()
}

for name, model in models.items():

    MAE = []
    MSE = []

    for train_index, test_index in kfold.split(x_transform, y):
        X_train, X_test = x_transform[train_index], x_transform[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        MAE.append(mean_absolute_error(y_test, y_pred))
        MSE.append(mean_squared_error(y_test, y_pred))

    print(f'{name}')
    print(f"MAE: {np.mean(MAE):.4f}")
    print(f"MSE: {np.mean(MSE):.4f}")
    print()

Linear Regression
MAE: 0.5957
MSE: 0.6105

Decision Tree
MAE: 0.6258
MSE: 0.8082



## Compare each model with z-scale and min-max scale

In [14]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor()
}

for name, model in models.items():

    MAE = []
    MSE = []

    for train_index, test_index in kfold.split(x_scaled, y):
        X_train, X_test = x_scaled[train_index], x_scaled[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        MAE.append(mean_absolute_error(y_test, y_pred))
        MSE.append(mean_squared_error(y_test, y_pred))

    print(f'{name}')
    print(f"MAE: {np.mean(MAE):.4f}")
    print(f"MSE: {np.mean(MSE):.4f}")
    print()


Linear Regression
MAE: 0.5475
MSE: 0.5583

Decision Tree
MAE: 0.6153
MSE: 0.7997



In [16]:
from sklearn.preprocessing import MinMaxScaler

scaler_minmax = MinMaxScaler()
x_scaled_minmax = scaler_minmax.fit_transform(x)

models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor()
}

for name, model in models.items():

    MAE = []
    MSE = []

    for train_index, test_index in kfold.split(x_scaled_minmax, y):
        X_train, X_test = x_scaled_minmax[train_index], x_scaled_minmax[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        MAE.append(mean_absolute_error(y_test, y_pred))
        MSE.append(mean_squared_error(y_test, y_pred))

    print(f'{name}')
    print(f"MAE: {np.mean(MAE):.4f}")
    print(f"MSE: {np.mean(MSE):.4f}")
    print()

Linear Regression
MAE: 0.5475
MSE: 0.5583

Decision Tree
MAE: 0.6254
MSE: 0.8296



Analysis:

 - Z-scale preprocessing showed the most stable results
 - Power transformation slightly degraded model performance
 - Min-Max scaling did not significantly improve model outcomes