In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import LinearSVR
from sklearn.base import BaseEstimator
from typing import Dict
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.ensemble import HistGradientBoostingRegressor
from copy import deepcopy

In [2]:
class SimpleDataset():

    def __init__(self, path="") -> None:
        self.dataset = pd.read_pickle(path)
    
    def prepare(self):
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(self.dataset.drop(columns=["SALE_PRICE"]), self.dataset["SALE_PRICE"], random_state=42)
    
    def getColumnTransformer(self):
        ct = make_column_transformer((make_pipeline(StandardScaler()), ["ZIP_CODE", "LAND_SQUARE_FEET", "GROSS_SQUARE_FEET", "YEAR_BUILT", "BATHROOM_COUNT", "SALE_DATE_ORD"]),
                             (OneHotEncoder(sparse=False, handle_unknown = "ignore"), ["BOROUGH", "NEIGHBORHOOD", "BUILDING_CLASS_CATEGORY", "BUILDING_CLASS_CATEGORY_ORD", "BUYER_SEX"]), 
                             verbose_feature_names_out=False)
        return ct

    def getTrainSet(self):
        return self.X_train, self.y_train
    
    def getValSet(self):
        return self.X_val, self.y_val

In [3]:
def load_data():
    data_dict: Dict[int, SimpleDataset] = dict()
    for i in range(4):
        data_dict[i] = SimpleDataset(path=f"data/dataset_{i}.pandas_pickle")
    return data_dict

In [4]:
def get_regressors() -> Dict[str, BaseEstimator]:
    return dict(linearSVR=LinearSVR(random_state=42), HistGradientBoostingRegressor=HistGradientBoostingRegressor(random_state=42))

In [5]:
def classify(data_dict: Dict[int, SimpleDataset], regressors: Dict[str, BaseEstimator]):
    results = []
    estimators = dict()
    for idx, dataset in data_dict.items():
        dataset.prepare()
        estimators[idx] = dict()
        for regressor_name, regressor in regressors.items():
            
            pipeline: Pipeline = make_pipeline(dataset.getColumnTransformer(), regressor, memory='.cache', verbose=True)
            estimator = TransformedTargetRegressor(regressor=pipeline, transformer=StandardScaler())
            
            X_train, y_train = dataset.getTrainSet()
            estimator.fit(X_train, y_train)

            X_val, y_val = dataset.getValSet()

            y_pred = estimator.predict(X_val)

            estimators[idx][regressor_name] = estimator
            
            results.append([idx, regressor_name, mean_absolute_error(y_val, y_pred), r2_score(y_val, y_pred)])
    return results, estimators

In [6]:
data_dict = load_data()
regressors = get_regressors()
results, estimators = classify(data_dict=data_dict, regressors=regressors)



[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  20.4s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   5.6s




[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  19.5s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   9.5s




[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  19.2s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   8.7s




[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  19.3s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   5.1s


In [7]:
df = pd.DataFrame(results, columns=['Dataset Idx', 'Regressor', 'MAE', 'R2'])
df

Unnamed: 0,Dataset Idx,Regressor,MAE,R2
0,0,linearSVR,826707.600905,0.204332
1,0,HistGradientBoostingRegressor,602822.336186,0.644838
2,1,linearSVR,821178.536598,0.264087
3,1,HistGradientBoostingRegressor,632824.458058,0.589781
4,2,linearSVR,809373.64882,0.211983
5,2,HistGradientBoostingRegressor,574875.248383,0.500433
6,3,linearSVR,872769.508629,0.340836
7,3,HistGradientBoostingRegressor,655705.125719,0.364833


In [11]:
import unittest


class TestDataRegression(unittest.TestCase):
    def setUp(self):
        data_dict = load_data()
        self.dataset = data_dict[0]

        regressors = get_regressors()
        results, estimators = classify(data_dict={0: self.dataset}, regressors=regressors)
        
        self.results = results
        self.estimators = estimators
    
    def test_sex_invariance_linearSVR(self,):
        dataset = deepcopy(self.dataset)
        dataset.dataset["BUYER_SEX"] = 'M'
        data_dict = {0: dataset}
        regressors = dict(linearSVR=LinearSVR(random_state=42))
        results, _ = classify(data_dict=data_dict, regressors=regressors)
        setup_results_df = pd.DataFrame(self.results, columns=['Dataset Idx', 'Regressor', 'MAE', 'R2']).query(f"Regressor == 'linearSVR'")
        test_results_df = pd.DataFrame(results, columns=['Dataset Idx', 'Regressor', 'MAE', 'R2'])
        self.assertTrue(np.allclose(setup_results_df['R2'].values, test_results_df['R2'].values, atol=1e-2))
    
    def test_sex_invariance_histGradBoost(self,):

        dataset = deepcopy(self.dataset)
        dataset.dataset["BUYER_SEX"] = 'M'
        data_dict = {0: dataset}
        regressors = dict(HistGradientBoostingRegressor=HistGradientBoostingRegressor(random_state=42))
        results, _ = classify(data_dict=data_dict, regressors=regressors)
        setup_results_df = pd.DataFrame(self.results, columns=['Dataset Idx', 'Regressor', 'MAE', 'R2']).query(f"Regressor == 'HistGradientBoostingRegressor'")
        test_results_df = pd.DataFrame(results, columns=['Dataset Idx', 'Regressor', 'MAE', 'R2'])
        self.assertTrue(np.allclose(setup_results_df['R2'].values, test_results_df['R2'].values, atol=1e-2))
    
    def test_direction_expectation_linearSVR(self):

        X_test1 = self.dataset.dataset.drop(columns=["SALE_PRICE"]).sample(frac=0.2, random_state=42)
        X_test2 = X_test1.copy()

        X_test2["GROSS_SQUARE_FEET"] = X_test2["GROSS_SQUARE_FEET"] * 2
        X_test2["BATHROOM_COUNT"] = X_test2["BATHROOM_COUNT"] + 5

        estimator_name = "linearSVR"

        prices1 = self.estimators[0][estimator_name].predict(X_test1)
        prices2 = self.estimators[0][estimator_name].predict(X_test2)
        is_increasing = ((prices2 - prices1) > 0).all()

        self.assertTrue(is_increasing)
    
    def test_direction_expectation_histGradBoost(self):

        X_test1 = self.dataset.dataset.drop(columns=["SALE_PRICE"]).sample(frac=0.2, random_state=42)
        X_test2 = X_test1.copy()

        X_test2["GROSS_SQUARE_FEET"] = X_test2["GROSS_SQUARE_FEET"] * 2
        X_test2["BATHROOM_COUNT"] = X_test2["BATHROOM_COUNT"] + 5

        estimator_name = "HistGradientBoostingRegressor"

        prices1 = self.estimators[0][estimator_name].predict(X_test1)
        prices2 = self.estimators[0][estimator_name].predict(X_test2)
        is_increasing = ((prices2 - prices1) > 0).all()

        self.assertTrue(is_increasing)

    def test_results_are_determined(self):

        regressors = get_regressors()
        results, _ = classify(data_dict={0: self.dataset}, regressors=regressors)
        
        setup_results_df = pd.DataFrame(self.results, columns=['Dataset Idx', 'Regressor', 'MAE', 'R2'])
        test_results_df = pd.DataFrame(results, columns=['Dataset Idx', 'Regressor', 'MAE', 'R2'])
        self.assertTrue(np.array_equiv(setup_results_df.values, test_results_df.values))

unittest.main(defaultTest='TestDataRegression', argv=[''], verbosity=2, exit=False)



[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  22.2s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   7.6s


FAIL


[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  25.5s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   7.7s


ok


[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  25.6s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   7.5s




[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  23.1s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   7.9s


ok


[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  26.2s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   7.6s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   7.9s


ok


[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  24.4s
[Pipeline]  (step 2 of 2) Processing histgradientboostingregressor, total=   7.6s




[Pipeline] ......... (step 2 of 2) Processing linearsvr, total=  25.8s


ok

FAIL: test_direction_expectation_histGradBoost (__main__.TestDataRegression)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_24272/1384773004.py", line 66, in test_direction_expectation_histGradBoost
    self.assertTrue(is_increasing)
AssertionError: False is not true

----------------------------------------------------------------------
Ran 5 tests in 246.242s

FAILED (failures=1)


<unittest.main.TestProgram at 0x7fdb9f36d100>

In [10]:
import unittest
from pandas.api.types import  is_numeric_dtype

class TestSimpleDataset(unittest.TestCase):
    def setUp(self):
        self.dataset = SimpleDataset(path=f"data/dataset_0.pandas_pickle")
    
    def test_column_transformer_scaler(self):
        ct = self.dataset.getColumnTransformer()
        std_scaling_columns = ct.transformers[0][-1]
        are_numeric_types = [is_numeric_dtype(self.dataset.dataset[column_name]) for column_name in std_scaling_columns]
        self.assertTrue(np.array(are_numeric_types).all())



unittest.main(defaultTest='TestSimpleDataset', argv=[''], verbosity=2, exit=False)

test_column_transformer_scaler (__main__.TestSimpleDataset) ... ok

----------------------------------------------------------------------
Ran 1 test in 0.015s

OK


<unittest.main.TestProgram at 0x7fdbaacf7eb0>