## Code from the chapter needed to complete excerises

In [1]:
from sklearn.pipeline import Pipeline, make_pipeline #type: ignore
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer   #type: ignore
from sklearn.preprocessing import OneHotEncoder, StandardScaler,FunctionTransformer #type: ignore
from sklearn.impute import SimpleImputer    #type: ignore
import pandas as pd #type: ignore
import numpy as np  #type: ignore
import matplotlib.pyplot as plt #type: ignore
import seaborn as sns   #type: ignore
import sklearn #type: ignore
sklearn.set_config(display="diagram")

In [2]:
import pandas as pd #type: ignore
housing = pd.read_csv("datasets/housing/housing.csv")

In [4]:
housing["income_cat"] = pd.cut(housing["median_income"],
                                bins = [0, 1.5, 3, 4.5, 6, np.inf],
                                labels =[1, 2, 3, 4, 5])

In [5]:
from sklearn.model_selection import train_test_split    #type: ignore

train, test = train_test_split(housing, test_size=0.2,
                               stratify=housing["income_cat"],
                                random_state=42)

In [6]:
X_train = train.drop(["median_house_value", "income_cat"], axis=1)
y_train = train["median_house_value"]

X_test = test.drop(["median_house_value", "income_cat"], axis=1)
y_test = test["median_house_value"]

In [7]:
from sklearn.preprocessing import OneHotEncoder #type: ignore
from sklearn.preprocessing import StandardScaler    #type: ignore
from sklearn.impute import SimpleImputer    #type: ignore
from sklearn.pipeline import Pipeline   #type: ignore
from sklearn.compose import ColumnTransformer   #type: ignore

numeric = X_train.select_dtypes(include=["int64", "float64"]).columns
categorical = X_train.select_dtypes(include="object").columns



num_pipeline = Pipeline([("num_impute", SimpleImputer(strategy="median")),
                         ("scaler", StandardScaler())])

cat_pipeline = Pipeline([("cat_impute", SimpleImputer(strategy="most_frequent")),
                         ("encoder_cat", OneHotEncoder(handle_unknown="ignore"))
                        ])

# preprocessing = ColumnTransformer([("num", num_pipeline, numeric),
#                                    ("cat", cat_pipeline, categorical)])
print(categorical)

Index(['ocean_proximity'], dtype='object')


In [8]:
from sklearn.base import BaseEstimator, TransformerMixin       #type: ignore
from sklearn.cluster import KMeans  #type: ignore
from sklearn.metrics.pairwise import rbf_kernel #type: ignore

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        self.kmeans_ = KMeans(self.n_clusters, random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self # always return self!
    
    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)
    
    def get_feature_names_out(self, names=None):
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]

In [14]:
def column_ratio(X):
    return X[:, [0]] / X[:, [1]]

def ratio_name(function_transformer, feature_names_in):
    return ["ratio"]  # feature names out

def ratio_pipeline():
    return make_pipeline(
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler())

log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler())
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)
default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"),
                                     StandardScaler())
preprocessing = ColumnTransformer([
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                               "households", "median_income"]),
        ("geo", cluster_simil, ["latitude", "longitude"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline)  # one column remaining: housing_median_age

In [19]:
new_data = preprocessing.fit_transform(X_train)

df = pd.DataFrame(new_data,
                columns=preprocessing.get_feature_names_out(),
                index=X_train.index)
df.shape

(16512, 24)

<hr>

## Models

<hr>

### Decision Tree from Chapter

In [15]:
from sklearn.tree import DecisionTreeRegressor  #type: ignore
from sklearn.metrics import root_mean_squared_error #type: ignore

tree_model = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=42))
tree_model.fit(X_train, y_train)
tree_model_pred = tree_model.predict(X_train)
tree_model_result = root_mean_squared_error(y_train, tree_model_pred)
print(tree_model_result)
tree_model

found 0 physical cores < 1
  File "c:\Users\Маша\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


0.0


We can see that the model clearly overfits the data, so we should use CV to get fair results

In [139]:
from sklearn.model_selection import cross_val_score #type: ignore

tree_cv_result = -cross_val_score(tree_model, X_train, y_train,
                             scoring="neg_root_mean_squared_error", cv=10)

tree_cv_result.mean()

67013.36094934531

**It is more common to perform cross-validation on the training set and not on the test set**

Alternatively we could use test set to estimate the score of our model, but author recommends to do it as a last resort

## Exercise 1

### Support Vector Machines

In [13]:
from sklearn.svm import SVR #type: ignore

svr_model = make_pipeline(preprocessing, SVR())
svr_model.fit(X_train.iloc[:4000], y_train.iloc[:4000])
svr_predict = svr_model.predict(X_train.iloc[:4000])
svr_result = root_mean_squared_error(y_train.iloc[:4000], svr_predict)
print(svr_result)
svr_model

120897.25775879824


In [19]:
from sklearn.model_selection import GridSearchCV    #type: ignore
from sklearn.svm import SVR #type: ignore

grid_svr_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("svr", SVR())
])

params = [
    {"svr__kernel": ["linear"],
     "svr__C": [8000, 10000, 12000]},
     {"svr__kernel":["rbf"],
      "svr__C": [8000, 10000, 12000],
      "svr__gamma": ["scale", "auto"]}
]

grid_svr = GridSearchCV(grid_svr_pipeline, params,
                            scoring="neg_root_mean_squared_error",
                               cv=3)

grid_svr.fit(X_train.iloc[:4000], y_train.iloc[:4000])

In [15]:
grid_svr.best_params_

{'svr__C': 12000, 'svr__gamma': 'scale', 'svr__kernel': 'rbf'}

In [16]:
pd.DataFrame(grid_svr.cv_results_).sort_values(by="mean_test_score", ascending=False).head(3)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svr__C,param_svr__kernel,param_svr__gamma,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
7,1.416432,0.345137,1.268658,0.465179,12000,rbf,scale,"{'svr__C': 12000, 'svr__gamma': 'scale', 'svr_...",-62996.531226,-62351.623098,-60977.5712,-62108.575175,841.963624,1
5,1.305521,0.20207,1.435622,0.935831,10000,rbf,scale,"{'svr__C': 10000, 'svr__gamma': 'scale', 'svr_...",-64189.487789,-63105.662631,-62002.362629,-63099.171016,892.901907,2
8,2.231807,1.298388,3.019169,2.272996,12000,rbf,auto,"{'svr__C': 12000, 'svr__gamma': 'auto', 'svr__...",-64561.951539,-63835.310105,-62210.274457,-63535.845367,983.14313,3


In [146]:
-grid_svr.best_score_

62108.575174813246

In [144]:
best_svr_model = grid_svr.best_estimator_
best_svr_predict = best_svr_model.predict(X_train.iloc[:4000])
best_svr_result = root_mean_squared_error(y_train.iloc[:4000], best_svr_predict)
print(best_svr_result)
best_svr_model

58932.82144695249


<hr>

# Exercise 2

In [28]:
from sklearn.model_selection import RandomizedSearchCV  #type: ignore
from scipy.stats import randint     #type: ignore
from scipy.stats import expon   #type: ignore
from scipy.stats import loguniform #type: ignore

params = [{"svr__kernel":["linear"],
          "svr__C": loguniform(5000, 20000)},
         {"svr__kernel":["rbf"],
          "svr__C": loguniform(5000, 250000),
          "svr__gamma": expon(scale=1)}
          ]


random_svr = RandomizedSearchCV(grid_svr_pipeline, params, 
                                scoring="neg_root_mean_squared_error",
                                n_iter=50, cv=3)

random_svr.fit(X_train.iloc[:5000], y_train.iloc[:5000])

In [29]:
random_svr.best_params_

{'svr__C': 87113.50263570255,
 'svr__gamma': 0.22657562079639362,
 'svr__kernel': 'rbf'}

In [30]:
-random_svr.best_score_

56026.30233712735

In [31]:
from sklearn.metrics import root_mean_squared_error #type: ignore

random_svr_model = random_svr.best_estimator_
random_pred = random_svr_model.predict(X_train)
random_svr_result = root_mean_squared_error(y_train, random_pred)
print(random_svr_result)
random_svr_model

53546.94104770023


# Exercise 3

In [34]:
from sklearn.feature_selection import SelectFromModel   #type: ignore
from sklearn.tree import DecisionTreeRegressor #type: ignore

svr_3 = make_pipeline(preprocessing,  SelectFromModel(DecisionTreeRegressor(), threshold=0.005),
                      SVR(C=random_svr.best_params_["svr__C"],
                        gamma=random_svr.best_params_["svr__gamma"],
                        kernel=random_svr.best_params_["svr__kernel"]))

svr_3.fit(X_train.iloc[:6000], y_train.iloc[:6000])
svr_3_pred = svr_3.predict(X_train.iloc[:6000])
svr_3_results = root_mean_squared_error(y_train.iloc[:6000], svr_3_pred)
print(svr_3_results)
svr_3

48281.93652044723


Cross-validation

In [38]:
from sklearn.model_selection import cross_val_score #type: ignore

svr_rmses_3 = -cross_val_score(svr_3,
                              X_train.iloc[:4000],
                              y_train.iloc[:4000],
                              scoring="neg_root_mean_squared_error",
                              cv=3)

svr_rmses_3.mean()

57159.68248858529

As we can see the results have not improved that much

<hr>

# Exercise 4

In [None]:
from sklearn.neighbors import KNeighborsRegressor   #type: ignore
from sklearn.base import MetaEstimatorMixin, clone  #type: ignore
from sklearn.utils.validation import check_is_fitted    #type: ignore

class FeatureFromRegressor(MetaEstimatorMixin, BaseEstimator, TransformerMixin):
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None):
        estimator_ = clone (self.estimator)
        estimator_.fit(X_train, y_train)
        self.estimator = estimator_
        self.n_features_in = self.estimator_n.features_in_
        if hasattr(self.estimator, "feature_names_in_"):
            self.feature_names_in_ = self.estimator.feature_names_in_
        return self
    
    def transform(self, X):
        check_is_fitted(self)
        predictions = self.estimator_.predict(X_test)
        if predictions.ndim == 1:
            predictions = predictions.reshape(-1, 1)
        return predictions
    
    def get_feature_names_out(self, names=None):
        check_is_fitted(self)
        n_outputs = getattr(self.estimator_, "n_outputs_", 1)
        estimator_class_name = self.estimator_.__class__.__name__
        estimator_short_name = estimator_class_name.lower().replace("_", "")
        return [f"{estimator_short_name}_prediction_{i}"
                for i in range(n_outputs)]
    




In [None]:
from sklearn.utils.estimator_checks import check_estimator  #type: ignore

check_estimator(FeatureFromRegressor(KNeighborsRegressor()))

In [None]:
knn_reg = KNeighborsRegressor(n_neighbors=3, weights="distance")
knn_transformer = FeatureFromRegressor(knn_reg)
geo_features = housing[["latitude", "longitude"]]
knn_transformer.fit_transform(geo_features, y_train)

<hr>

# Other staff

### Heavy tails

<hr>

## Pipeline

### Long version

In [97]:
from sklearn.preprocessing import OneHotEncoder #type: ignore
from sklearn.preprocessing import StandardScaler    #type: ignore
from sklearn.impute import SimpleImputer    #type: ignore
from sklearn.pipeline import Pipeline   #type: ignore
from sklearn.compose import ColumnTransformer   #type: ignore

numeric = X_train.select_dtypes(include=["int64", "float64"]).columns
categorical = X_train.select_dtypes(include="object").columns


num_pipeline = Pipeline([("num_impute", SimpleImputer(strategy="median")),
                         ("scaler", StandardScaler())])

cat_pipeline = Pipeline([("cat_impute", SimpleImputer(strategy="most_frequent")),
                         ("encoder_cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
                        ])

preprocessing = ColumnTransformer([("num", num_pipeline, numeric),
                                   ("cat", cat_pipeline, categorical)])

### Short version

In [98]:
from sklearn.preprocessing import OneHotEncoder #type: ignore
from sklearn.impute import SimpleImputer    #type: ignore
from sklearn.preprocessing import StandardScaler    #type: ignore
from sklearn.pipeline import make_pipeline  #type: ignore
from sklearn.compose import make_column_transformer, make_column_selector   #type: ignore
import sklearn  #type: ignore

sklearn.set_config(display="diagram")

numeric = make_pipeline(SimpleImputer(strategy="median"),
                        StandardScaler())

categorical = make_pipeline(SimpleImputer(strategy="most_frequent"),
                            OneHotEncoder(sparse_output=False, handle_unknown="ignore"))

preprocessing = make_column_transformer((numeric, make_column_selector(dtype_include=np.number)),
                                         (categorical, make_column_selector(dtype_include=object)))



### From the Book

<hr>

## Linear Regression

In [99]:
from sklearn.linear_model import LinearRegression   #type: ignore
from sklearn.metrics import mean_squared_error  #type: ignore

lin_model = make_pipeline(preprocessing, LinearRegression())
lin_model.fit(X_train, y_train)
lin_model_pred = lin_model.predict(X_test)
lin_model_result = np.sqrt(mean_squared_error(y_test, lin_model_pred))
print(lin_model_result)
lin_model

71002.83776920449


<hr>

### Decision Tree

In [100]:
from sklearn.svm import SVR #type: ignore

svr_model = make_pipeline(preprocessing, SVR(degree=1, kernel="linear"))
svr_model.fit(X_train, y_train)
svr_model_pred = svr_model.predict(X_test)
svr_model_result = mean_squared_error(y_test, svr_model_pred, squared=False)
print(tree_model_result)
svr_model

66178.0376260829




In [101]:
new_train = housing.copy()
new_train = new_train.sample(3000)


target_svr = new_train["median_house_value"]
features_svr = new_train.drop("median_house_value", axis=1)
features_svr.shape

(3000, 10)

In [102]:
from sklearn.model_selection import GridSearchCV    #type: ignore


params = [
    {"svr__kernel": ["linear", "poly"],
     "svr__degree": [1, 3, 5]}
]

grid = GridSearchCV(svr_model, params, cv=3, scoring="neg_root_mean_squared_error")
grid.fit(features_svr, target_svr)
grid.best_params_

{'svr__degree': 1, 'svr__kernel': 'linear'}