In [None]:
import tensorflow_decision_forests as tfdf
import pandas as pd
import tensorflow as tf
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.metrics import RootMeanSquaredError
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [None]:
# Read the training CSV and remove its 'Id' column in a single expression.
data = pd.read_csv("house_data/train.csv").drop(columns='Id')

# Show the first rows of the loaded frame.
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
# Keep only the float64/int64 columns of `data`.
# A bare `list(set(data.dtypes.tolist()))` expression used to precede this line;
# its value was neither stored nor displayed (only a cell's last expression is
# rendered), so it was dropped as dead code.
df_num = data.select_dtypes(include=['float64', 'int64'])
df_num.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,61,0,0,0,0,0,2,2008,208500
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,298,0,0,0,0,0,0,5,2007,181500
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,42,0,0,0,0,0,9,2008,223500
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,35,272,0,0,0,0,2,2006,140000
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,192,84,0,0,0,0,0,12,2008,250000


In [None]:
# Rebuild the numeric table from `data` instead of mutating the previous
# `df_num`, so re-running this cell always yields the same result (the old
# version re-applied log1p/scaling/expansion to already-transformed data).
target_col = 'SalePrice'
numeric = data.select_dtypes(include=['float64', 'int64'])

# Log transform target variable. It is kept out of scaling and of the
# polynomial expansion so that no function of SalePrice becomes a predictor.
log_target = np.log1p(numeric[target_col])

# Separate numerical features (target excluded) and fill gaps with column medians.
num_cols = [col for col in numeric.columns if col != target_col]
features = numeric[num_cols].fillna(numeric[num_cols].median())

# Feature scaling
scaler = StandardScaler()
scaled = pd.DataFrame(
    scaler.fit_transform(features), columns=num_cols, index=features.index
)

# Feature engineering: Add polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(scaled)
poly_feature_names = poly.get_feature_names_out(num_cols)
data_poly = pd.DataFrame(poly_features, columns=poly_feature_names, index=scaled.index)

# The degree-2 expansion already contains the degree-1 terms under their
# original names, so concatenating it with the scaled originals would create
# duplicate column names. Attach only the (log-transformed) target.
df_num = pd.concat([data_poly, log_target], axis=1)

In [None]:
# Split dataset
def split_dataset(dataset, test_ratio=0.20, seed=None):
    """Randomly split ``dataset`` into a ``(train, test)`` pair.

    Each row is sent to the test part independently with probability
    ``test_ratio``, so the resulting sizes are only approximately
    ``(1 - test_ratio)`` and ``test_ratio`` of the input.

    Args:
        dataset: Object supporting ``len()`` and boolean-mask indexing,
            e.g. a pandas DataFrame.
        test_ratio: Probability that a row lands in the test part.
        seed: Optional integer seed. When given, the split is reproducible
            and independent of the global NumPy RNG. When ``None`` (default),
            the global NumPy RNG is used, exactly as before.

    Returns:
        Tuple ``(train, test)`` of the rows not selected / selected.
    """
    n_rows = len(dataset)
    if seed is None:
        # Legacy path: draw from the global RNG so np.random.seed() still applies.
        draws = np.random.rand(n_rows)
    else:
        # Private generator: same stream as np.random.seed(seed) would give,
        # but without touching global state.
        draws = np.random.RandomState(seed).rand(n_rows)
    test_indices = draws < test_ratio
    return dataset[~test_indices], dataset[test_indices]

# Seed the global NumPy RNG right before the random split so the
# train/validation partition (and every metric computed on it) is identical
# on each Restart & Run All.
np.random.seed(42)
train_ds_pd, valid_ds_pd = split_dataset(data)

# Convert to TensorFlow dataset
label = 'SalePrice'
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label=label, task=tfdf.keras.Task.REGRESSION)
valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(valid_ds_pd, label=label, task=tfdf.keras.Task.REGRESSION)

2025-01-01 10:35:19.983965: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-01-01 10:35:20.094476: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-01-01 10:35:20.094533: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-01-01 10:35:20.099191: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-01-01 10:35:20.099242: I tensorflow/compile

In [None]:
# Random forest regressor with fixed, hand-picked hyperparameters.
rf_settings = dict(num_trees=1000, max_depth=100, min_examples=5)
rf = tfdf.keras.RandomForestModel(task=tfdf.keras.Task.REGRESSION, **rf_settings)

# Report MSE and RMSE whenever the model is evaluated.
rmse_metric = RootMeanSquaredError(name="rmse")
rf.compile(metrics=["mse", rmse_metric])


Use /tmp/tmpdcezwmnr as temporary training directory


In [None]:
# Collects one validation RMSE per cross-validation fold.
rmse_scores = list()
# 5-fold cross-validation splitter.
kf = KFold(n_splits=5)

In [None]:
# Start from an empty list so re-running this cell does not append to scores
# left over from an earlier run.
rmse_scores = []

for train_index, val_index in kf.split(data):
    train_fold, val_fold = data.iloc[train_index], data.iloc[val_index]
    # Fold-specific names on purpose: assigning to `train_ds` here would
    # replace the hold-out split's training set, and the final fit would then
    # train on the last fold and be scored on a validation set that overlaps it.
    fold_train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_fold, label='SalePrice', task=tfdf.keras.Task.REGRESSION)
    fold_val_ds = tfdf.keras.pd_dataframe_to_tf_dataset(val_fold, label='SalePrice', task=tfdf.keras.Task.REGRESSION)

    # Refit on this fold's training rows, then score on its held-out rows.
    rf.fit(x=fold_train_ds)
    evaluation = rf.evaluate(x=fold_val_ds, return_dict=True)
    rmse_scores.append(evaluation['rmse'])

Reading training dataset...
Training dataset read in 0:00:00.449130. Found 1168 examples.
Training model...


[INFO 25-01-01 10:44:17.3640 UTC kernel.cc:1233] Loading model from path /tmp/tmpdcezwmnr/model/ with prefix efd7fceaa5744bde


Model trained in 0:00:01.786948
Compiling model...


[INFO 25-01-01 10:44:17.9680 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 371748 node(s), and 77 input feature(s).
[INFO 25-01-01 10:44:17.9680 UTC abstract_model.cc:1343] Engine "RandomForestOptPred" built
[INFO 25-01-01 10:44:17.9681 UTC kernel.cc:1061] Use fast generic engine


Model compiled.
Reading training dataset...
Training dataset read in 0:00:00.052319. Found 1168 examples.
Training model...


[INFO 25-01-01 10:44:20.0486 UTC kernel.cc:1233] Loading model from path /tmp/tmpdcezwmnr/model/ with prefix efd7fceaa5744bde


Model trained in 0:00:01.679749
Compiling model...


[INFO 25-01-01 10:44:20.6264 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 371528 node(s), and 77 input feature(s).
[INFO 25-01-01 10:44:20.6265 UTC kernel.cc:1061] Use fast generic engine


Model compiled.
Reading training dataset...
Training dataset read in 0:00:00.051790. Found 1168 examples.
Training model...


[INFO 25-01-01 10:44:22.4957 UTC kernel.cc:1233] Loading model from path /tmp/tmpdcezwmnr/model/ with prefix efd7fceaa5744bde


Model trained in 0:00:01.637034
Compiling model...


[INFO 25-01-01 10:44:23.0095 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 371450 node(s), and 76 input feature(s).
[INFO 25-01-01 10:44:23.0095 UTC kernel.cc:1061] Use fast generic engine


Model compiled.
Reading training dataset...
Training dataset read in 0:00:00.050472. Found 1168 examples.
Training model...


[INFO 25-01-01 10:44:24.8988 UTC kernel.cc:1233] Loading model from path /tmp/tmpdcezwmnr/model/ with prefix efd7fceaa5744bde


Model trained in 0:00:01.671804
Compiling model...


[INFO 25-01-01 10:44:25.4757 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 371760 node(s), and 76 input feature(s).
[INFO 25-01-01 10:44:25.4758 UTC kernel.cc:1061] Use fast generic engine


Model compiled.
Reading training dataset...
Training dataset read in 0:00:00.048306. Found 1168 examples.
Training model...


[INFO 25-01-01 10:44:27.3818 UTC kernel.cc:1233] Loading model from path /tmp/tmpdcezwmnr/model/ with prefix efd7fceaa5744bde


Model trained in 0:00:01.696377
Compiling model...


[INFO 25-01-01 10:44:27.9496 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 371362 node(s), and 75 input feature(s).
[INFO 25-01-01 10:44:27.9497 UTC kernel.cc:1061] Use fast generic engine


Model compiled.


In [None]:
# Mean of the per-fold validation RMSE values.
mean_rmse = np.mean(rmse_scores)
print(f"Average RMSE across folds: {mean_rmse:.4f}")


Average RMSE across folds: 28199.5145


In [None]:

# Final model training
# Rebuild the training dataset from the hold-out split (`train_ds_pd`) so this
# fit does not depend on which earlier cell last assigned `train_ds`, and the
# model is never trained on rows that are part of `valid_ds`.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label=label, task=tfdf.keras.Task.REGRESSION)
rf.fit(x=train_ds)

Reading training dataset...
Training dataset read in 0:00:00.042839. Found 1168 examples.
Training model...


[INFO 25-01-01 10:44:38.0032 UTC kernel.cc:1233] Loading model from path /tmp/tmpdcezwmnr/model/ with prefix efd7fceaa5744bde


Model trained in 0:00:01.583721
Compiling model...


[INFO 25-01-01 10:44:38.5049 UTC decision_forest.cc:660] Model loaded with 1000 root(s), 371362 node(s), and 75 input feature(s).
[INFO 25-01-01 10:44:38.5050 UTC abstract_model.cc:1343] Engine "RandomForestOptPred" built
[INFO 25-01-01 10:44:38.5050 UTC kernel.cc:1061] Use fast generic engine


Model compiled.


<keras.src.callbacks.History at 0x7f4eb9fc3ad0>

In [None]:
# Score the fitted model on the validation dataset and print each reported metric.
evaluation = rf.evaluate(x=valid_ds, return_dict=True)
for metric_name in evaluation:
    print(f"{metric_name}: {evaluation[metric_name]:.4f}")

loss: 0.0000
mse: 474686432.0000
rmse: 21787.2988


In [None]:

# Relative path of the test CSV that the prediction cell reads.
test_file_path = "house_data/test.csv"


In [None]:
# Load the test rows; pull the 'Id' column out of the frame and keep it aside.
test_data = pd.read_csv(test_file_path)
ids = test_data.pop('Id')

# No label is passed here: the test frame is converted for inference only.
regression_task = tfdf.keras.Task.REGRESSION
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_data, task=regression_task)

# Predict, then pair each Id with its prediction (singleton axes removed).
preds = rf.predict(test_ds)
output = pd.DataFrame({'Id': ids}).assign(SalePrice=np.squeeze(preds))

output.head()



Unnamed: 0,Id,SalePrice
0,1461,124992.703125
1,1462,154435.421875
2,1463,178392.859375
3,1464,184099.515625
4,1465,195659.546875


In [None]:
# Fill the sample-submission template with the test-set predictions and save it.
sample_submission_df = pd.read_csv('house_data/sample_submission.csv')

# Predictions are written by position, so the template rows must be in the
# same order as the test file. Fail loudly rather than write misaligned prices.
if not np.array_equal(sample_submission_df['Id'].to_numpy(), ids.to_numpy()):
    raise ValueError("sample_submission.csv Ids do not match the Ids in test.csv")

# Reuse the predictions already computed for the test set (no second inference
# pass) and flatten them to 1-D before assigning the column.
sample_submission_df['SalePrice'] = np.ravel(preds)
sample_submission_df.to_csv('submission2.csv', index=False)
sample_submission_df.head()



Unnamed: 0,Id,SalePrice
0,1461,124992.703125
1,1462,154435.421875
2,1463,178392.859375
3,1464,184099.515625
4,1465,195659.546875
