In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [2]:
# Load the prepared dataset from its pandas pickle.
# NOTE: unpickling executes arbitrary code; only load files from a trusted source.
dataset1: pd.DataFrame = pd.read_pickle(
    "data/dataset_0.pandas_pickle"
)

In [3]:
# Hold out a validation set; SALE_PRICE is the regression target.
# random_state is fixed (same seed as the models below) so the split -- and
# therefore every metric reported further down -- is reproducible across
# kernel restarts. test_size is left at the scikit-learn default (0.25).
X_train, X_val, y_train, y_val = train_test_split(
    dataset1.drop(columns=["SALE_PRICE"]),
    dataset1["SALE_PRICE"],
    random_state=42,
)


In [4]:
# Preprocessing: the first group of columns is standardised, the second group
# is one-hot encoded (categories unseen during fit are ignored at predict time).
scaled_columns = [
    "ZIP_CODE",
    "LAND_SQUARE_FEET",
    "GROSS_SQUARE_FEET",
    "YEAR_BUILT",
    "BATHROOM_COUNT",
]
one_hot_columns = [
    "BOROUGH",
    "NEIGHBORHOOD",
    "BUILDING_CLASS_CATEGORY",
    "SALE_DATE_ORD",
    "BUILDING_CLASS_CATEGORY_ORD",
    "BUYER_SEX",
]

# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4 -- switch the keyword when the environment is upgraded.
# NOTE(review): SALE_DATE_ORD and BUILDING_CLASS_CATEGORY_ORD look like ordinal
# encodings; confirm that one-hot encoding them is intended.
ct = make_column_transformer(
    (make_pipeline(StandardScaler()), scaled_columns),
    (OneHotEncoder(sparse=False, handle_unknown="ignore"), one_hot_columns),
    verbose_feature_names_out=False,
)

# Baseline model: linear support-vector regression on the preprocessed features.
# memory=".cache" makes the pipeline cache fitted transformers on disk.
pipeline: Pipeline = make_pipeline(
    ct,
    LinearSVR(random_state=42),
    memory=".cache",
    verbose=True,
)

# The target is standardised before fitting; predictions are transformed back
# to the original SALE_PRICE scale by TransformedTargetRegressor.
estimator = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=StandardScaler(),
)

In [5]:
# Fit preprocessing + LinearSVR on the training split (displays the fitted estimator).
estimator.fit(X=X_train, y=y_train)

[Pipeline] . (step 1 of 2) Processing columntransformer, total=   8.9s


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  X, fitted_transformer = fit_transform_one_cached(


[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  32.8s




TransformedTargetRegressor(regressor=Pipeline(memory='.cache',
                                              steps=[('columntransformer',
                                                      ColumnTransformer(transformers=[('pipeline',
                                                                                       Pipeline(steps=[('standardscaler',
                                                                                                        StandardScaler())]),
                                                                                       ['ZIP_CODE',
                                                                                        'LAND_SQUARE_FEET',
                                                                                        'GROSS_SQUARE_FEET',
                                                                                        'YEAR_BUILT',
                                                                                        'BATHROOM

In [6]:
# Predict sale prices for the held-out validation rows.
y_pred = estimator.predict(X=X_val)

In [7]:
# Mean absolute error of the LinearSVR predictions on the validation split.
mean_absolute_error(y_true=y_val, y_pred=y_pred)

786982.0614911406

In [8]:
# Coefficient of determination (R^2) of the LinearSVR predictions on the validation split.
r2_score(y_true=y_val, y_pred=y_pred)

0.14154496090974222

In [9]:
from sklearn.ensemble import HistGradientBoostingRegressor

In [10]:
# Same preprocessing as the LinearSVR experiment, rebuilt here so this cell
# does not rely on the earlier `ct` object.
standardised_branch = (
    make_pipeline(StandardScaler()),
    ["ZIP_CODE", "LAND_SQUARE_FEET", "GROSS_SQUARE_FEET", "YEAR_BUILT", "BATHROOM_COUNT"],
)
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4 -- switch the keyword when the environment is upgraded.
one_hot_branch = (
    OneHotEncoder(sparse=False, handle_unknown="ignore"),
    [
        "BOROUGH",
        "NEIGHBORHOOD",
        "BUILDING_CLASS_CATEGORY",
        "SALE_DATE_ORD",
        "BUILDING_CLASS_CATEGORY_ORD",
        "BUYER_SEX",
    ],
)
ct = make_column_transformer(
    standardised_branch,
    one_hot_branch,
    verbose_feature_names_out=False,
)

# Second model: histogram-based gradient boosting on the same features.
# memory=".cache" makes the pipeline cache fitted transformers on disk.
pipeline: Pipeline = make_pipeline(
    ct,
    HistGradientBoostingRegressor(random_state=42),
    memory=".cache",
    verbose=True,
)

# As before, the target is standardised for fitting and predictions are
# transformed back to the original SALE_PRICE scale.
estimator = TransformedTargetRegressor(
    regressor=pipeline,
    transformer=StandardScaler(),
)


In [11]:
# Fit preprocessing + HistGradientBoostingRegressor on the training split
# (displays the fitted estimator).
estimator.fit(X=X_train, y=y_train)

[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total= 2.9min


TransformedTargetRegressor(regressor=Pipeline(memory='.cache',
                                              steps=[('columntransformer',
                                                      ColumnTransformer(transformers=[('pipeline',
                                                                                       Pipeline(steps=[('standardscaler',
                                                                                                        StandardScaler())]),
                                                                                       ['ZIP_CODE',
                                                                                        'LAND_SQUARE_FEET',
                                                                                        'GROSS_SQUARE_FEET',
                                                                                        'YEAR_BUILT',
                                                                                        'BATHROOM

In [12]:
# Predict sale prices for the validation rows with the gradient-boosting model.
y_pred = estimator.predict(X=X_val)

In [13]:
# Mean absolute error of the gradient-boosting predictions on the validation split.
mean_absolute_error(y_true=y_val, y_pred=y_pred)

505467.675028784

In [14]:
# Coefficient of determination (R^2) of the gradient-boosting predictions on the validation split.
r2_score(y_true=y_val, y_pred=y_pred)

0.6301633903650985