In [6]:
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder

from house_prices.cluster_similarity import ClusterSimilarity
from house_prices.data import load_housing_data

In [7]:
# Prepare the data for the exercises: load the housing dataset through the
# project's load_housing_data helper. Later cells index the result by column
# name, so it is presumably a pandas DataFrame — TODO confirm.
housing = load_housing_data()

In [8]:
import numpy as np

# Create income categories in order to get representative samples from the dataset.
# The bin edges cut median_income into five labelled brackets, the last one
# open-ended. `np.inf` is used because the `np.Inf` alias was removed in NumPy 2.0.
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0, 1.5, 3, 4.5, 6, np.inf],
    labels=[1, 2, 3, 4, 5],
)

# Stratify on the income bracket so both splits keep the same bracket proportions.
stratified_train_set, stratified_test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=42,
    stratify=housing["income_cat"],
)

# income_cat was only needed for stratification. Rebind the names to new frames
# instead of dropping in place: the splits are derived from `housing`, and
# in-place edits on derived frames can raise pandas' SettingWithCopyWarning.
stratified_train_set = stratified_train_set.drop(columns="income_cat")
stratified_test_set = stratified_test_set.drop(columns="income_cat")

In [9]:
# Keep an independent copy of the stratified test split.
housing_test = stratified_test_set.copy()

# Separate the training split into the target column and the predictor columns.
housing_labels = stratified_train_set["median_house_value"].copy()
housing = stratified_train_set.drop(columns=["median_house_value"])
housing.shape

(16512, 9)

In [10]:
from house_prices.data import column_ratio


def ratio_pipeline():
    """Build a fresh pipeline: mean imputation -> column_ratio -> standard scaling.

    A new pipeline object is returned on every call. Whatever the input
    column names are, the single output feature is reported as ``"ratio"``.
    """

    def _ratio_name(transformer, input_features):
        # Called by FunctionTransformer to name its output; inputs are ignored.
        return ["ratio"]

    steps = (
        SimpleImputer(strategy="mean"),
        FunctionTransformer(column_ratio, feature_names_out=_ratio_name),
        StandardScaler(),
    )
    return make_pipeline(*steps)


# Fallback numeric pipeline: mean-impute, then standardize.
default_num_pipeline = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())

# Same as the fallback pipeline, with a log transform between imputation and
# scaling; output feature names mirror the input names ("one-to-one").
log_pipeline = make_pipeline(
    SimpleImputer(strategy="mean"),
    FunctionTransformer(func=np.log, feature_names_out="one-to-one"),
    StandardScaler(),
)

# Categorical pipeline: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Project-local transformer configured with 10 clusters and a fixed seed.
cluster_simil = ClusterSimilarity(random_state=42, n_clusters=10, gamma=1)

# Route each group of columns through its own transformer. Columns not named
# by any entry go through default_num_pipeline (the `remainder`).
preprocessing = ColumnTransformer(
    transformers=[
        # Three ratio features, each built from a (numerator, denominator) pair.
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_home", ratio_pipeline(), ["population", "households"]),
        # Cluster-similarity features computed from the coordinates.
        ("geo", cluster_simil, ["latitude", "longitude"]),
        # Log-transformed numeric columns.
        (
            "log",
            log_pipeline,
            ["total_bedrooms", "total_rooms", "population", "households", "median_income"],
        ),
        # Every object-dtype column is treated as categorical.
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline,
)

# Fit the whole preprocessing step on the training predictors and transform them.
housing_prepared = preprocessing.fit_transform(housing)
housing_prepared.shape

(16512, 24)

In [13]:
# List the output column names of the fitted ColumnTransformer; each name is
# prefixed with the label of the transformer that produced it (e.g. "geo__").
preprocessing.get_feature_names_out()

array(['bedrooms__ratio', 'rooms_per_house__ratio',
       'people_per_home__ratio', 'geo__Cluster 0 similarity',
       'geo__Cluster 1 similarity', 'geo__Cluster 2 similarity',
       'geo__Cluster 3 similarity', 'geo__Cluster 4 similarity',
       'geo__Cluster 5 similarity', 'geo__Cluster 6 similarity',
       'geo__Cluster 7 similarity', 'geo__Cluster 8 similarity',
       'geo__Cluster 9 similarity', 'log__total_bedrooms',
       'log__total_rooms', 'log__population', 'log__households',
       'log__median_income', 'cat__ocean_proximity_<1H OCEAN',
       'cat__ocean_proximity_INLAND', 'cat__ocean_proximity_ISLAND',
       'cat__ocean_proximity_NEAR BAY', 'cat__ocean_proximity_NEAR OCEAN',
       'remainder__housing_median_age'], dtype=object)

In [15]:
# Peek at the first few target values (median_house_value) of the training split.
housing_labels.head()

13096    458300.0
14973    483800.0
3785     101700.0
14689     96100.0
20507    361800.0
Name: median_house_value, dtype: float64

# 1. Try a support vector machine (SVM) regressor

with various hyperparameters, such as `kernel="linear"` (with various values for the `C` hyperparameter) or `kernel="rbf"` (with various values for the `C` and `gamma` hyperparameters). Note that SVMs don't scale well to large datasets, so you should probably train your model on just the first 5,000 instances of the training set and use only 3-fold cross-validation, or else it will take hours. Don't worry about what the hyperparameters mean for now: we'll discuss them in Chapter 5. How does the best SVR predictor perform?

In [29]:
from sklearn.svm import SVR

# Support vector regressor with a linear kernel and a fixed C value.
svm_linear = SVR(kernel="linear", C=15)

# NOTE(review): this fits on every row of housing_prepared (16,512 rows per the
# shape shown earlier), not on the first 5,000 instances the exercise text
# suggests — the exercise warns that SVMs scale poorly to large datasets.
svm_linear.fit(housing_prepared, housing_labels)

In [30]:
# Measure performance
housing_predictions = svm_linear.predict(housing_prepared)
housing_predictions[:5]

array([177298.07138257, 268222.72657448, 122965.72150066, 111114.78887808,
       216431.61016052])

In [31]:
from sklearn.metrics import mean_squared_error

# Training-set RMSE of the linear SVR. The square root is taken explicitly with
# NumPy because the `squared=False` flag of mean_squared_error was deprecated
# in scikit-learn 1.4 and removed in 1.6.
svm_linear_rmse = np.sqrt(mean_squared_error(housing_labels, housing_predictions))
svm_linear_rmse

83690.28497512378

In [43]:
# Same experiment with an RBF kernel; C and gamma are fixed literals, not tuned.
svm_rbf = SVR(kernel="rbf", C=15, gamma=2)
# Fitted on the full prepared training set, like the linear model.
svm_rbf.fit(housing_prepared, housing_labels)

In [44]:
# Predict on the same data the RBF model was fitted on (training-set predictions).
svm_rbf_predictions = svm_rbf.predict(housing_prepared)

In [45]:
# Training-set RMSE of the RBF SVR. The square root is taken explicitly with
# NumPy because the `squared=False` flag of mean_squared_error was deprecated
# in scikit-learn 1.4 and removed in 1.6.
svm_rbf_rmse = np.sqrt(mean_squared_error(housing_labels, svm_rbf_predictions))
svm_rbf_rmse

118304.92555161349

TODO:

- Run cross-validation on the training set, then evaluate the best model once on the test set
- Tune hyperparameters

# 2. Try replacing the `GridSearchCV` with a `RandomizedSearchCV`

# 3. Try adding a `SelectFromModel` transformer in the preparation pipeline to select only the most important attributes

# 4. Try creating a custom transformer that trains a k-nearest neighbors regressor

(`sklearn.neighbors.KNeighborsRegressor`) in its `fit()` method, and outputs the model's predictions in its `transform()` method. Then add this feature to the preprocessing pipeline, using latitude and longitude as the inputs to this transformer. This will add a feature in the model that corresponds to the housing median price of the nearest districts.