In [49]:
import pandas as pd
import numpy as np

from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FunctionTransformer, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor


Bare-bone notebook without any of the data exploration cells. Its purpose is to have a working pipeline so that 
I can do the exercises.

In [5]:
def load_housing_data():
  tarball_path = Path("datasets/housing.tgz")
  if not tarball_path.is_file():
    Path("datasets").mkdir(parents=True, exist_ok=True)
    url = "https://github.com/ageron/data/raw/main/housing.tgz"
    urllib.request.urlretrieve(url, tarball_path)
    
    with tarfile.open(tarball_path) as housing_tarball:
      housing_tarball.extractall(path="datasets")
  
  return pd.read_csv(Path("datasets/housing/housing.csv"))

housing = load_housing_data()

In [7]:
# Separate the label column from the data

X = housing.drop("median_house_value", axis=1)
y = housing["median_house_value"].copy()

In [15]:
# First, split the data into a temporary training set and a temporary validation + testing set
X_train_val, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now, split the temporary validation + testing set into a validation set and a testing set
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_temp, test_size=0.4, random_state=42)

print ("X_train shape: ", X_train.shape)
print ("y_train shape: ", y_train.shape)
print ("-" * 30)
print ("X_val shape: ", X_val.shape)
print ("y_val shape: ", y_val.shape)
print ("-" * 30)
print ("X_test shape: ", X_test.shape)
print ("y_test shape: ", y_test.shape)

X_train shape:  (9907, 9)
y_train shape:  (9907,)
------------------------------
X_val shape:  (6605, 9)
y_val shape:  (6605,)
------------------------------
X_test shape:  (4128, 9)
y_test shape:  (4128,)


In [42]:
# Preprocessing pipeline definitions


class ClusterSimilarity(BaseEstimator, TransformerMixin):
  def __init__(self, n_cluster=10, gamma=1.0, random_state=None) -> None:
    super().__init__()

    self.n_cluster = n_cluster
    self.gamma = gamma
    self.random_state = random_state

  def fit(self, X, y=None, sample_weight=None):
    self.kmeans_ = KMeans(self.n_cluster, random_state=self.random_state, n_init=10)
    self.kmeans_.fit(X, sample_weight=sample_weight)

    return self
  
  def transform(self, X):
    return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)
  
  def get_feature_names_out(self, names=None):
    return [f"Cluster {i} similarity" for i in range(self.n_cluster)]
  

# Takes a numpy array and does a ratio of the first and second columns
def column_ratio(X):
  return X[:, [0]] / X[:, [1]]

# Returns a name. Used in the FunctionTransformer. The name will get appended to the ColumnTransformer
# column name
def ratio_name(function_transformer, feature_names_in):
  return ["ratio"]

# Pipeline to compute the ratio of any given two columns
def ratio_pipeline():
  return make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(column_ratio, feature_names_out=ratio_name),
    StandardScaler()
  )

# Runs any given column in a log function for smoothing
log_pipeline = make_pipeline(
  SimpleImputer(strategy="median"),
  FunctionTransformer(np.log, feature_names_out="one-to-one"),
  StandardScaler()
)

# Computes centers and cluster similarity to these centers
cluster_simil = ClusterSimilarity(n_cluster=10, gamma=1, random_state=42)

# Impute and scale for any numerical features
default_num_pipeline = make_pipeline(
  SimpleImputer(strategy="median"),
  StandardScaler()
)

# Impute and encode for any categorical features
default_cat_pipeline = make_pipeline(
  SimpleImputer(strategy="most_frequent"),
  OneHotEncoder(handle_unknown="ignore")
)

# Building the pipeline from all parts above. Each 3-tuple is composed of (name, pipeline/transformer, columns to use for it).
preprocessing = ColumnTransformer([
  ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
  ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
  ("people_per_house", ratio_pipeline(), ["population", "households"]),
  ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population", "households", "median_income"]),
  ("geo", cluster_simil, ["latitude", "longitude"]),
  ("cat", default_cat_pipeline, make_column_selector(dtype_include=object)),
  ("num", default_num_pipeline, ["housing_median_age"]) 
])


### Exercise 1: Using SVR

In [47]:
from sklearn.svm import SVR

svr = make_pipeline(
  preprocessing, 
  SVR(kernel="linear", C=15000)  
)

svr.fit(X_train, y_train)

In [48]:
housing_predictions = svr.predict(X_train)

svr_rmse = mean_squared_error(y_train, housing_predictions, squared=False)
print ("SVR RMSE: ", svr_rmse)

SVR RMSE:  72156.00873081066


In [35]:
svr_rmses = -1 * cross_val_score(svr, X_train, y_train, 
                             scoring="neg_root_mean_squared_error", cv=10)

svr_rmses

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


array([114492.20197595, 112489.08752852, 110483.27873357, 109276.4169371 ,
       109065.93495031, 114122.56268446, 111892.29005758, 118012.93113222,
       115228.27485572, 114542.97682645])

In [43]:

full_pipeline = Pipeline([
  ("preprocessing", preprocessing),
  ("svr", SVR(kernel="linear"))
])

# Since we're using composite pipelines to process the training data, our hyperparameters are buried deep in the 
# pipeline's transformers and predictors. To access these parameters we use the pipeline's name followed by underscore
# to get to the pipeline/transformer/predictor name followed by the name of the hyperparameter we're changing.

# For example, preprocessing__geo__n_cluster -> 
# preprocessing pipeline, geo transformer (ClusterSimilarity), n_cluster hyperparameter 
param_grid = [
        {'svr__kernel': ['linear'], 'svr__C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},
        {'svr__kernel': ['rbf'], 'svr__C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
         'svr__gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},
]

# First set of hyperparameters options will yield 3 * 3 = 9 combinations, the second set will yield 2 * 3 = 6 combinations.
# 9 + 6 = 15 different parameter combinations to try over 3 CV rounds (k = 3) yields 15 * 3 = 45 training/evaluation rounds.
grid_search = GridSearchCV(full_pipeline, param_grid, cv=3, scoring='neg_root_mean_squared_error')
grid_search.fit(X_train, y_train)



KeyboardInterrupt: 

In [44]:
# Getting the best parameters from the grid search
print (grid_search.best_params_)
# Getting the evaluation data from the grid search
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res.sort_values(by="rank_test_score", ascending=True, inplace=True)
cv_res.head()


AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

### Exercise 3: using SelectFromModel transformer to automatically select the most important features 

In [71]:
from sklearn.feature_selection import SelectFromModel

# This pipeline will do preprocessing, then train the model using a RandomForestRegressor that's wrapped in SelectFromModel.
# This step will return a selector which when given the dataset, it'll transform it by removing the non-important columns.
# Then we send the truncated dataset to a RandomForestRegressor model to do the training on.
tree_reg = Pipeline([
    ("preprocessing", preprocessing),
    ("feature_selection", SelectFromModel(RandomForestRegressor(random_state=42))),
    ("forest_regressor", RandomForestRegressor(random_state=42, max_features=6))
  ]
)
tree_reg.fit(X_train, y_train)



In [72]:
selector = tree_reg["feature_selection"]
selector.threshold_
selector.get_support()

predictions = tree_reg.predict(X_train)

print ("random forest RMSE (train): ", mean_squared_error(y_train, predictions, squared=False))


random forest RMSE (train):  26390.706967850176


### Exercise 4: custom KNN regressor transformer

Custom transformer that does regression using KNN. We'll use it in the pipeline using the lat/long features and the house value
as labels to create a another feature which will the housing value of the nearest districts

In [84]:
from sklearn.neighbors import KNeighborsRegressor

class KNNRegTransformer(BaseEstimator, TransformerMixin):
  def __init__(self, n_neighbors=5) -> None:
    super().__init__()
    self.n_neighbors = n_neighbors

  def fit(self, X, y=None, sample_weight=None):
    y = X["median_house_value"]
    model = KNeighborsRegressor(n_neighbors=self.n_neighbors)
    model.fit(X, y)
    self.model = model
    return self
  
  def transform(self, X):
    output = self.model.predict(X)
    output = output.reshape((output.shape[0],1))
    print ("output: ", output.shape)

    return output
    
  
  # def get_feature_names_out(self, names=None):
  #   return [f"Cluster {i} similarity" for i in range(self.n_cluster)]


knn_reg = KNNRegTransformer()

preprocessing = ColumnTransformer([
  ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
  ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
  ("people_per_house", ratio_pipeline(), ["population", "households"]),
  ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population", "households", "median_income"]),
  ("geo", knn_reg, ["latitude", "longitude", "median_house_value"]),
  ("cat", default_cat_pipeline, make_column_selector(dtype_include=object)),
  ("num", default_num_pipeline, ["housing_median_age"]) 
])

tree_reg = Pipeline([
    ("preprocessing", preprocessing),
    ("forest_regressor", RandomForestRegressor(random_state=42, max_features=6))
  ]
)
tree_reg.fit(housing, y)

output:  (20640, 1)


In [86]:
predictions = tree_reg.predict(housing)

print ("random forest RMSE (train): ", mean_squared_error(y, predictions, squared=False))

output:  (20640, 1)
random forest RMSE (train):  1178.6543318206916


#### Exercise 6: implement StandardScaler from scratch

In [103]:

class StandardScalerClone(BaseEstimator, TransformerMixin):
  def __init__(self) -> None:
    super().__init__()

  def fit(self, X, y=None, sample_weight=None):
    self.mean = np.mean(X, axis=0)
    self.std = np.std(X, axis=0)
    return self

  def transform(self, X):
    return (X - self.mean) / self.std
  
  def inverse_transform(self, X):
    return (X * self.std) + self.mean
  

std_scaler_clone = StandardScalerClone()
x_temp = np.array([
  [10,200,1],
  [14,400,3]
]).T
print (x_temp.shape)

std_scaler_clone_result = std_scaler_clone.fit_transform(x_temp)
print (std_scaler_clone_result)

std_scaler = StandardScaler()
std_scaler_result = std_scaler.fit_transform(x_temp)
print (std_scaler_result)

print (std_scaler_clone.inverse_transform(std_scaler_clone_result))


(3, 2)
[[-0.65749968 -0.67710492]
 [ 1.41307942  1.41379508]
 [-0.75557974 -0.73669016]]
[[-0.65749968 -0.67710492]
 [ 1.41307942  1.41379508]
 [-0.75557974 -0.73669016]]
[[ 10.  14.]
 [200. 400.]
 [  1.   3.]]
