In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import LinearSVR
from typing import Dict
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.ensemble import HistGradientBoostingRegressor

In [2]:
def load_data():
    data_dict: Dict[int, pd.DataFrame] = dict()
    for i in range(4):
        data_dict[i] = pd.read_pickle(f"data/dataset_{i}.pandas_pickle")
    return data_dict

In [3]:
def get_estimator(regressor_name='linearSVR'):

    if regressor_name == 'linearSVR':
        regressor = LinearSVR(random_state=42)
    elif regressor_name == 'HistGradientBoostingRegressor':
        regressor = HistGradientBoostingRegressor(random_state=42)
    ct = make_column_transformer((make_pipeline(StandardScaler()), ["ZIP_CODE", "LAND_SQUARE_FEET", "GROSS_SQUARE_FEET", "YEAR_BUILT", "BATHROOM_COUNT", "SALE_DATE_ORD"]),
                             (OneHotEncoder(sparse=False, handle_unknown = "ignore"), ["BOROUGH", "NEIGHBORHOOD", "BUILDING_CLASS_CATEGORY", "BUILDING_CLASS_CATEGORY_ORD", "BUYER_SEX"]), 
                             verbose_feature_names_out=False)

    pipeline: Pipeline = make_pipeline(ct, regressor, memory='.cache', verbose=True)
    estimator = TransformedTargetRegressor(regressor=pipeline, transformer=StandardScaler())
    return estimator

In [4]:
def classify(data_dict):
    results = []
    estimators = dict()
    for idx, dataset in data_dict.items():
        X_train, X_val, y_train, y_val = train_test_split(dataset.drop(columns=["SALE_PRICE"]), dataset["SALE_PRICE"], random_state=42)
        estimators[idx] = dict()
        for regressor_name in ['linearSVR', 'HistGradientBoostingRegressor']:
            estimator = get_estimator(regressor_name)
            estimator.fit(X_train, y_train)

            y_pred = estimator.predict(X_val)

            estimators[idx][regressor_name] = estimator
            
            results.append([idx, regressor_name, mean_absolute_error(y_val, y_pred), r2_score(y_val, y_pred)])
    return results, estimators

In [5]:
data_dict = load_data()
results, estimators = classify(data_dict=data_dict)



[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  19.3s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   6.5s




[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  18.8s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   9.2s




[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  19.3s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   8.3s




[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  19.2s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   5.0s


In [6]:
df = pd.DataFrame(results, columns=['Dataset Idx', 'Regressor', 'MAE', 'R2'])
df

Unnamed: 0,Dataset Idx,Regressor,MAE,R2
0,0,linearSVR,826707.600905,0.204332
1,0,HistGradientBoostingRegressor,602822.336186,0.644838
2,1,linearSVR,821178.536598,0.264087
3,1,HistGradientBoostingRegressor,632824.458058,0.589781
4,2,linearSVR,809373.64882,0.211983
5,2,HistGradientBoostingRegressor,574875.248383,0.500433
6,3,linearSVR,872769.508629,0.340836
7,3,HistGradientBoostingRegressor,655705.125719,0.364833


In [7]:
import unittest


class TestDataRegression(unittest.TestCase):
    def setUp(self):
        data_dict = load_data()
        self.dataset = data_dict[0]

        results, estimators = classify(data_dict={0: self.dataset})
        
        self.results = results
        self.estimators = estimators
    
    def test_sex_invariance(self,):
        dataset = self.dataset.copy()
        dataset["BUYER_SEX"] = 'M'
        data_dict = {0: dataset}
        results, _ = classify(data_dict=data_dict)
        setup_results_df = pd.DataFrame(self.results, columns=['Dataset Idx', 'Regressor', 'MAE', 'R2'])
        test_results_df = pd.DataFrame(results, columns=['Dataset Idx', 'Regressor', 'MAE', 'R2'])
        self.assertTrue(np.allclose(setup_results_df['R2'].values, test_results_df['R2'].values, atol=1e-2))
    
    def test_direction_expectation(self):

        X_test1 = self.dataset.drop(columns=["SALE_PRICE"]).sample(frac=0.2)
        X_test2 = X_test1.copy()

        X_test2["GROSS_SQUARE_FEET"] = X_test2["GROSS_SQUARE_FEET"] * 2
        X_test2["BATHROOM_COUNT"] = X_test2["BATHROOM_COUNT"] + 5

        estimator_names =self.estimators[0].keys()
        is_increasing = []
        for est in estimator_names:
            prices1 = self.estimators[0][est].predict(X_test1)
            prices2 = self.estimators[0][est].predict(X_test2)
            is_increasing.append(((prices2 - prices1) > 0).all())

        self.assertTrue(np.array(is_increasing).all())

    def test_results_are_determined(self):
        results, _ = classify(data_dict={0: self.dataset})
        setup_results_df = pd.DataFrame(self.results, columns=['Dataset Idx', 'Regressor', 'MAE', 'R2'])
        test_results_df = pd.DataFrame(results, columns=['Dataset Idx', 'Regressor', 'MAE', 'R2'])
        self.assertTrue(np.array_equiv(setup_results_df.values, test_results_df.values))
    

unittest.main(argv=[''], verbosity=2, exit=False)



[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  19.8s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   5.6s


FAIL


[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  19.5s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   6.1s




[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  19.3s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   5.4s


ok


[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  21.0s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   6.1s




[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  20.3s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   5.2s


ok

FAIL: test_direction_expectation (__main__.TestDataRegression)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_24230/1821046248.py", line 38, in test_direction_expectation
    self.assertTrue(np.array(is_increasing).all())
AssertionError: False is not true

----------------------------------------------------------------------
Ran 3 tests in 140.017s

FAILED (failures=1)


<unittest.main.TestProgram at 0x7fb9b6b61bb0>