In [1]:
import numpy as np
import pandas as pd
import sklearn 
from pathlib import Path

In [2]:
!pip install -upgrade scikit-learn
sklearn.__version__


Usage:   
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

no such option: -u


'1.1.3'

In [3]:
sklearn.set_config(display='diagram')

In [4]:
# 1 - read data 

def load_housing_data():
    tarball_path = Path("datasets/housing.tgz")
    if not tarball_path.is_file():
        Path("datasets").mkdir(parents=True, exist_ok=True)
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            housing_tarball.extractall(path="datasets")
    return pd.read_csv(Path("datasets/housing/housing.csv"))

housing = load_housing_data()

In [5]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [6]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [7]:
# 2 - train test split

In [8]:
from sklearn.model_selection import train_test_split 

X = housing.drop("median_house_value", axis=1)
y = np.array(housing["median_house_value"])

(X_train, X_test, y_train, y_test) = train_test_split(X, y, stratify=housing["ocean_proximity"] )

In [9]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(15480, 9) (5160, 9) (15480,) (5160,)


In [10]:
# 3 - preprocessing

In [11]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

from sklearn.cluster import KMeans

In [12]:
# numerical columns -> log pipeline, long tailed distribution
log_pipeline = make_pipeline(SimpleImputer(strategy='median'),
                             FunctionTransformer(np.log, feature_names_out='one-to-one'),
                             StandardScaler())

# categorical variables
cat_pipeline = Pipeline([('impute_freq', SimpleImputer(strategy='most_frequent')),
                        ('encode', OneHotEncoder(handle_unknown='ignore'))])

# new variables -> ratio
def column_ratio(X):
  return X[:,[0]] / X[:,[1]]

def ratio_name(function_transformer, feature_names_in):
  return["ratio"]

def ratio_pipeline():
  return make_pipeline(
      SimpleImputer(strategy="median"),
      FunctionTransformer(column_ratio, feature_names_out=ratio_name),
      StandardScaler()
  )

# latitude and longitude -> clustering
loc_pipeline = Pipeline([('impute_mean', SimpleImputer(strategy='mean')),
                     ('similarity', KMeans(n_clusters=10, random_state=42))
                     ])

#remainder -> housing_median_age
num_pipeline = make_pipeline(SimpleImputer(strategy='median'),
                             StandardScaler())

In [13]:
# 4 - pipeline

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector

In [15]:
preprocessing_pipeline = ColumnTransformer([
                                            ('log_pipeline', log_pipeline, ["total_bedrooms", 
                                                                            "total_rooms", "population", 
                                                                            "households", "median_income"]),
                                            ('cat_pipeline', cat_pipeline, make_column_selector(dtype_include=object)),
                                            ('ratio_pipeline_bedrooms', ratio_pipeline(), ["total_rooms", "households"]),
                                            ('ratio_pipeline_people', ratio_pipeline(), ["population", "households"]),
                                            ('location_pipeline', loc_pipeline, ["latitude", "longitude"])
                                         ], remainder=num_pipeline)

In [16]:
X_pre = preprocessing_pipeline.fit_transform(X_train, y_train)

In [17]:
X_pre

array([[ 1.52603743,  0.75543414,  0.39849372, ...,  0.71905335,
         1.45584776,  1.85876497],
       [-0.8597525 , -1.26373483, -0.97596102, ...,  4.92475768,
         4.63770735, -0.85804003],
       [-1.74816236, -3.0755268 , -2.16515267, ...,  0.93385269,
         1.58821068,  1.85876497],
       ...,
       [-2.29441549, -2.01954498, -2.05089078, ...,  5.02474374,
         4.69979961,  0.5802685 ],
       [ 1.59080889,  1.74368298,  1.4359237 , ...,  1.08381509,
         0.11387818, -1.49728826],
       [ 0.41157743,  0.79803262,  0.58401149, ...,  4.49171049,
         4.24689846, -0.21879179]])

In [18]:
# 4 - train

1. Try a support vector machine regressor (sklearn.svm.SVR) with various hyperparameters, such as kernel="linear" (with various values for the C hyperparameter) or kernel="rbf" (with various values for the C and gamma hyperparameters). Note that support vector machines don’t scale well to large datasets, so you should probably train your model on just the first 5,000 instances of the training set and use only 3-fold cross-validation, or else it will take hours. Don’t worry about what the hyperparameters mean for now; we’ll discuss them in Chapter 5. How does the best SVR predictor perform?

In [19]:
from sklearn.svm import SVR

In [20]:
#regression_pipeline = make_pipeline()
final_pipeline = Pipeline([('preprocessing', preprocessing_pipeline),
                                    ('regression', SVR(kernel='linear', C=1.0, epsilon=0.2))
                                   ])
final_pipeline

In [21]:
final_pipeline.fit(X_train, y_train)

In [22]:
from sklearn.model_selection import cross_val_score
train_rmses = -cross_val_score(final_pipeline, X_train, y_train, scoring="neg_root_mean_squared_error", cv=3)

In [23]:
pd.Series(train_rmses).describe()

count         3.000000
mean     113310.271266
std        1025.337366
min      112299.352665
25%      112790.685194
50%      113282.017722
75%      113815.730567
max      114349.443412
dtype: float64

In [24]:
test_rmses = -cross_val_score(final_pipeline, X_test, y_test, scoring="neg_root_mean_squared_error", cv=3)

In [25]:
pd.Series(test_rmses).describe()

count         3.000000
mean     116579.207383
std        1217.831311
min      115177.756115
25%      116178.685174
50%      117179.614233
75%      117279.933017
max      117380.251801
dtype: float64

In [66]:
# Exercise 1 - GridSearch

In [53]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'preprocessing__location_pipeline__similarity__n_clusters': [5,8,10],
     'regression__kernel':['linear','rbf_kernel'],
     'regression__C': [1,3,6]}
]

grid_search = GridSearchCV(final_pipeline, param_grid, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
#n_jobs=-1 runs it in parallel
grid_search.fit(X_train.iloc[0:5000, :], y_train[0:5000])


27 fits failed out of a total of 54.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
27 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/aina/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/aina/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/aina/opt/anaconda3/lib/python3.9/site-packages/sklearn/svm/_base.py", line 251, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "/Users/aina/opt/anaconda3/lib/python3.9/site-packages/sklearn/svm

In [54]:
grid_search.best_params_

{'preprocessing__location_pipeline__similarity__n_clusters': 10,
 'regression__C': 6,
 'regression__kernel': 'linear'}

In [45]:
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res.sort_values(by="rank_test_score", ascending=True).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessing__location_pipeline__similarity__n_clusters,param_regression__C,param_regression__kernel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
16,0.943114,0.013121,0.206981,0.013741,10,6,linear,{'preprocessing__location_pipeline__similarity...,-108357.584907,-108330.396497,-108722.17674,-108470.052715,178.623805,1
10,0.793071,0.183104,0.244593,0.074541,8,6,linear,{'preprocessing__location_pipeline__similarity...,-108800.320724,-108728.666104,-109378.836419,-108969.274416,291.077735,2
4,0.821649,0.006587,0.26786,0.002096,5,6,linear,{'preprocessing__location_pipeline__similarity...,-109681.239125,-109587.10527,-109962.659929,-109743.668108,159.548026,3
14,0.789411,0.230005,0.206023,0.05601,10,3,linear,{'preprocessing__location_pipeline__similarity...,-112881.675211,-112771.271371,-113101.835383,-112918.260655,137.409395,4
8,0.809089,0.168127,0.245893,0.075562,8,3,linear,{'preprocessing__location_pipeline__similarity...,-113247.847379,-112922.734468,-113533.419963,-113234.66727,249.485444,5


In [46]:
# Exercise 2 - randomized Searcg

In [51]:
from sklearn.model_selection import RandomizedSearchCV

param_distributions = [
    {'preprocessing__location_pipeline__similarity__n_clusters': np.arange(0,100),
     'regression__C': np.arange(0,10)}
]

random_search = RandomizedSearchCV(final_pipeline, param_distributions, cv=3, scoring='neg_root_mean_squared_error', n_jobs=-1)
random_search.fit(X_train.iloc[0:5000, :], y_train[0:5000])

In [56]:
random_search.best_params_

In [64]:
from sklearn.metrics import mean_squared_error

final_model = random_search.best_estimator_
final_prediction = final_model.predict(X_test)
rmse_error = mean_squared_error(y_test, final_prediction, squared=False)

In [65]:
rmse_error

93196.22000034657

In [67]:
#Exercise 3 - Try adding a SelectFromModel transformer in the preparation pipeline 
              # to select only the most important attributes.

In [None]:
from sklearn.feature_selection import SelectFromModel

final_model_SFM = SelectFromModel(final_pipeline, )