In [None]:
import tensorflow_decision_forests as tfdf
import pandas as pd
import numpy as np

In [None]:
# Input CSV locations. The commented-out pair are the equivalent paths when the
# notebook runs on Kaggle; swap them in there.
# train_file_path = "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
# test_file_path = "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"

# Local copies, relative to the notebook's working directory.
train_file_path = "data/train.csv"
test_file_path = "data/test.csv"

In [None]:
# Read the training table and discard the 'Id' column in a single chained
# expression, so the raw frame is never left half-processed under the same name.
dataset_df = pd.read_csv(train_file_path).drop(columns='Id')

In [None]:
# First and third quartiles of SalePrice, and the inter-quartile range between them.
Q1, Q3 = (dataset_df['SalePrice'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

In [None]:
# Keep only rows whose SalePrice lies inside [Q1 - 0.3*IQR, Q3 + 0.3*IQR] (inclusive).
# NOTE(review): 0.3 is much tighter than the conventional 1.5*IQR fence and will
# discard a large share of rows -- confirm this multiplier is intentional.
lower_bound = Q1 - 0.3 * IQR
upper_bound = Q3 + 0.3 * IQR

dataset_df = dataset_df.loc[dataset_df['SalePrice'].between(lower_bound, upper_bound)]

In [None]:
def remove_outliers(data, column, m=3):
    """Return the ``(lower, upper)`` bounds ``mean -/+ m * std`` of ``data[column]``.

    Despite its name, this function does not remove anything: it only computes
    the bounds, and the caller is responsible for filtering with them.

    Args:
        data: Any container indexable by ``column`` (e.g. a DataFrame or a dict).
        column: Key of the values to summarise.
        m: Number of standard deviations on each side of the mean.

    Returns:
        A ``(lower_bound, upper_bound)`` tuple.
    """
    values = data[column]
    center = np.mean(values)
    # np.std uses its default ddof here (population standard deviation).
    # NOTE(review): for a pandas Series NumPy presumably delegates to the Series
    # methods, which would skip NaN -- confirm if missing values matter.
    half_width = m * np.std(values)
    return center - half_width, center + half_width

In [None]:
# Columns screened for outliers, one after another, with a mean +/- m*std rule.
columns = ["YearBuilt", "GarageYrBlt", "GrLivArea", "GarageArea", "OverallQual", "SalePrice"]

# Number of standard deviations tolerated on each side of the mean.
m = 3

for column in columns:
    # Bounds are recomputed on the already-filtered frame, so the result
    # depends on the order of `columns`.
    lower_bound, upper_bound = remove_outliers(dataset_df, column, m)
    # A missing value is not an outlier: comparisons against NaN evaluate to
    # False, so a plain range test would silently drop every row with a missing
    # entry in this column. Keep those rows explicitly.
    within_bounds = dataset_df[column].between(lower_bound, upper_bound)
    dataset_df = dataset_df[within_bounds | dataset_df[column].isna()]

# Re-number rows 0..n-1 after filtering (reassign instead of mutating a filtered
# frame in place).
dataset_df = dataset_df.reset_index(drop=True)

In [None]:
# Name of the target column the model is trained to predict.
label = 'SalePrice'

In [None]:
# Split the cleaned frame into the target series and the remaining predictor columns.
labels = dataset_df[label]
features = dataset_df.drop(label, axis=1)

In [None]:
# Column names partitioned by dtype: numeric vs. everything else.
numeric_features = list(features.select_dtypes(include=[np.number]).columns)
categorical_features = list(features.select_dtypes(exclude=[np.number]).columns)

In [None]:
def split_dataset(dataset, test_ratio=0.30, seed=None):
    """Randomly split ``dataset`` row-wise into two parts.

    Each row is independently assigned to the second (test) part with
    probability ``test_ratio``, so the realised sizes are only approximately
    ``1 - test_ratio`` / ``test_ratio`` of the input.

    Args:
        dataset: Object supporting ``len()`` and boolean-mask indexing
            (e.g. a pandas DataFrame).
        test_ratio: Probability that a row lands in the test part.
        seed: Optional seed. When given, a dedicated generator is used and the
            split is reproducible; when ``None`` the global NumPy random state
            is used, exactly as before.

    Returns:
        A ``(train_part, test_part)`` tuple.
    """
    n_rows = len(dataset)
    if seed is None:
        # Legacy path: honours any np.random.seed(...) set by the caller.
        draws = np.random.rand(n_rows)
    else:
        draws = np.random.default_rng(seed).random(n_rows)
    test_indices = draws < test_ratio
    return dataset[~test_indices], dataset[test_indices]

# Seed NumPy's global RNG so the random train/validation split -- and therefore
# the validation metrics -- is the same on every Restart & Run All.
np.random.seed(42)
train_ds_pd, valid_ds_pd = split_dataset(dataset_df)

In [None]:
# Convert both pandas splits to TensorFlow datasets with the same settings
# (same label column, regression task).
train_ds, valid_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(frame, label=label, task=tfdf.keras.Task.REGRESSION)
    for frame in (train_ds_pd, valid_ds_pd)
)

In [None]:
# Extra model options passed to the model constructor below.
# NOTE(review): presumably this lets training proceed even when some column names
# are not valid Keras feature names -- confirm against the TF-DF documentation.
argument = tfdf.keras.AdvancedArguments(fail_on_non_keras_compatible_feature_name = False)

In [None]:
# NOTE: despite the variable name `rf`, this is a gradient-boosted-trees model,
# not a random forest. The name is kept because later cells refer to it.
rf = tfdf.keras.GradientBoostedTreesModel(
    num_trees=1000,
    task=tfdf.keras.Task.REGRESSION,
    advanced_arguments=argument,
)
# Track mean squared error when the model is evaluated.
rf.compile(metrics=["mse"])

rf.fit(x=train_ds)

In [None]:
# Score the trained model on the held-out validation split; the model was
# compiled with the "mse" metric.
evaluation = rf.evaluate(valid_ds)

In [None]:
# Load the test table and set the 'Id' column aside (pop removes it from the
# frame) so it is not fed to the model as a feature.
test_data = pd.read_csv(test_file_path)
ids = test_data.pop('Id')

# No label argument here: the test table is converted for prediction only.
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_data, task=tfdf.keras.Task.REGRESSION)

preds = rf.predict(test_ds)

# Pair every prediction with its original Id; squeeze() drops length-1 axes
# from the prediction array.
output = pd.DataFrame({'Id': ids}).assign(SalePrice=preds.squeeze())

In [None]:
# Fill the sample-submission template with the model's predictions and save it.
# On Kaggle, read the template from
#   /kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
# and write the result to /kaggle/working/submission.csv instead.
sample_submission_df = pd.read_csv('data/sample_submission.csv')

# Reuse `preds` from the prediction cell instead of running inference a second
# time, and flatten it so a 1-D array is assigned to the column. Rows are matched
# by position, so the template must list the test rows in the same order.
sample_submission_df['SalePrice'] = np.ravel(preds)

sample_submission_df.to_csv('submission.csv', index=False)
sample_submission_df.head()