In [1]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import itertools
import pandas as pd
import tqdm

In [7]:
cali_housing_path = '../data/California_Houses.csv'
RANDOM_SEED = 492

In [4]:
cali_df = pd.read_csv(cali_housing_path)

In [5]:
print(cali_df.head())

   Median_House_Value  Median_Income  Median_Age  Tot_Rooms  Tot_Bedrooms  \
0            452600.0         8.3252          41        880           129   
1            358500.0         8.3014          21       7099          1106   
2            352100.0         7.2574          52       1467           190   
3            341300.0         5.6431          52       1274           235   
4            342200.0         3.8462          52       1627           280   

   Population  Households  Latitude  Longitude  Distance_to_coast  \
0         322         126     37.88    -122.23        9263.040773   
1        2401        1138     37.86    -122.22       10225.733072   
2         496         177     37.85    -122.24        8259.085109   
3         558         219     37.85    -122.25        7768.086571   
4         565         259     37.85    -122.25        7768.086571   

   Distance_to_LA  Distance_to_SanDiego  Distance_to_SanJose  \
0   556529.158342         735501.806984         67432.5170

In [44]:
y_series = cali_df['Median_House_Value']
y = pd.DataFrame(y_series, columns=['Median_House_Value'])
features = [col for col in cali_df.columns if col != 'Median_House_Value']
X = cali_df[features]

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

In [47]:
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

In [48]:
print(X_train.index)

RangeIndex(start=0, stop=16512, step=1)


In [64]:
class GradientBoostedTreesEnsembleRegressor(tf.keras.Model):
    def __init__(self, n_trees=100, max_depth=3, n_estimators=100, sample_method='RANDOM'): 
        super(GradientBoostedTreesEnsembleRegressor, self).__init__()
        self.n_trees = n_trees
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.sample_method = sample_method
        self.estimators = []

    def build_estimator(self):
        model = tfdf.keras.GradientBoostedTreesModel(
            task=tfdf.keras.Task.REGRESSION,
            num_trees=self.n_trees,
            max_depth=np.random.randint(self.max_depth//2, self.max_depth+1),
            validation_ratio=0.1,
            sampling_method=self.sample_method,
            random_seed=self.seed # try to set this first. Maybe this is enough
        )
        return model
    
    # if not, the number of trees/depth different for each estimator

    def fit(self, X, y):
        num_data = X.shape[0]

        dataset = tf.data.Dataset.from_tensor_slices((X.values, y.values))
        
        for seed in np.random.randint(0, 1000000, size=self.n_estimators):
            self.seed = int(seed)
            train_estimator_indices = np.random.choice(num_data, size=num_data, replace=False)
            
            # Create a subset of the dataset using sampled indices
            subset_dataset = dataset.enumerate().filter(lambda i, data: tf.reduce_any(i == train_estimator_indices))

            # building each estimator
            estimator = self.build_estimator()
            estimator.fit(subset_dataset)
            self.estimators.append(estimator)

    def predict(self, X): # use all estimators
        predictions = []
        # sampled_estimators = np.random.choice(self.estimators, size=n_samples, replace=False)
        for estimator in self.estimators:
            estimator_predictions = []
            estimator_predictions.append(estimator.predict(X))
            estimator_predictions = tf.stack(estimator_predictions, axis=-1)
            predictions.append(tf.reduce_mean(estimator_predictions, axis=-1))
        predictions = tf.stack(predictions, axis=0)
        
        mean_prediction = tf.reduce_mean(predictions, axis=0)
        variance_prediction = tf.math.reduce_std(predictions, axis=0)
        return mean_prediction, variance_prediction


In [65]:
gbt_model = GradientBoostedTreesEnsembleRegressor(
    n_trees=50, 
    max_depth=5, 
    n_estimators=80, 
    # subportion=0.3, 
    sample_method='RANDOM'
)

In [66]:
gbt_model.fit(X_train, y_train)

Use /var/folders/3r/pzt2_p_x2xdfl2057nxkp3nm0000gn/T/tmpbpz9rt4r as temporary training directory




ValueError: The dataset does not contain a 'batch' operation. TF-DF models should be trained with batch operations. Add a batch operations to solve this issue. Alternatively, you can disabled this check with the constructor argument `check_dataset=False`. If this message is a false positive, please let us know so we can improve this dataset check logic.