# Tabular Data Only

In [134]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, make_scorer
import xgboost as xgb
import torch
import torchvision.models as models
from pathlib import Path

In [None]:
ROOT_DIR=Path(__file__).resolve().parents[1]
INP_DIR = ROOT_DIR/"data"/"raw"
OUT_DIR = ROOT_DIR/"data"/"processed"

In [141]:
df_train=pd.read_csv(INP_DIR/'train(1).csv')
df_test=pd.read_csv(INP_DIR/'test2.csv')

In [142]:
X_train=pd.read_csv(OUT_DIR/'X_train.csv')
X_test=pd.read_csv(OUT_DIR/'X_test.csv')

In [143]:
gbm = HistGradientBoostingRegressor(
    loss="squared_error",
    learning_rate=0.03,
    max_iter=800,
    max_depth=8,
    max_leaf_nodes=64,
    min_samples_leaf=30,
    l2_regularization=0.0,
    random_state=42
)

In [144]:
y_train = np.log1p(df_train["price"].values)

In [145]:
gbm.fit(X_train, y_train)

0,1,2
,"loss  loss: {'squared_error', 'absolute_error', 'gamma', 'poisson', 'quantile'}, default='squared_error' The loss function to use in the boosting process. Note that the ""squared error"", ""gamma"" and ""poisson"" losses actually implement ""half least squares loss"", ""half gamma deviance"" and ""half poisson deviance"" to simplify the computation of the gradient. Furthermore, ""gamma"" and ""poisson"" losses internally use a log-link, ""gamma"" requires ``y > 0`` and ""poisson"" requires ``y >= 0``. ""quantile"" uses the pinball loss. .. versionchanged:: 0.23  Added option 'poisson'. .. versionchanged:: 1.1  Added option 'quantile'. .. versionchanged:: 1.3  Added option 'gamma'.",'squared_error'
,"quantile  quantile: float, default=None If loss is ""quantile"", this parameter specifies which quantile to be estimated and must be between 0 and 1.",
,"learning_rate  learning_rate: float, default=0.1 The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no shrinkage.",0.03
,"max_iter  max_iter: int, default=100 The maximum number of iterations of the boosting process, i.e. the maximum number of trees.",800
,"max_leaf_nodes  max_leaf_nodes: int or None, default=31 The maximum number of leaves for each tree. Must be strictly greater than 1. If None, there is no maximum limit.",64
,"max_depth  max_depth: int or None, default=None The maximum depth of each tree. The depth of a tree is the number of edges to go from the root to the deepest leaf. Depth isn't constrained by default.",8
,"min_samples_leaf  min_samples_leaf: int, default=20 The minimum number of samples per leaf. For small datasets with less than a few hundred samples, it is recommended to lower this value since only very shallow trees would be built.",30
,"l2_regularization  l2_regularization: float, default=0 The L2 regularization parameter penalizing leaves with small hessians. Use ``0`` for no regularization (default).",0.0
,"max_features  max_features: float, default=1.0 Proportion of randomly chosen features in each and every node split. This is a form of regularization, smaller values make the trees weaker learners and might prevent overfitting. If interaction constraints from `interaction_cst` are present, only allowed features are taken into account for the subsampling. .. versionadded:: 1.4",1.0
,"max_bins  max_bins: int, default=255 The maximum number of bins to use for non-missing values. Before training, each feature of the input array `X` is binned into integer-valued bins, which allows for a much faster training stage. Features with a small number of unique values may use less than ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin is always reserved for missing values. Must be no larger than 255.",255


In [146]:
y_test_log = gbm.predict(X_test)
X_test["price"] = np.expm1(y_test_log)

In [147]:
X_test.to_csv(OUT_DIR/"predicted prices-tabular-data.csv", index=False)


# Image data

In [94]:
df_train = pd.read_csv(OUT_DIR/'df_train.csv')

In [95]:
y = np.log1p(df_train["price"].values)

In [96]:
img_cols = [c for c in df_train.columns if c.isdigit()]

In [97]:
X_img = df_train[img_cols].astype(np.float32).values

In [98]:
tab_cols = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'sqft_living15', 'sqft_lot15', 'is_renovated',
       'years_since_renovation', 'house_age', 'zipcode_price_mean',
       'zipcode_price_per_sqft'
]

X_tab = df_train[tab_cols].values
X_tab = np.nan_to_num(X_tab)

In [99]:
scaler = StandardScaler()
X_img_scaled = scaler.fit_transform(X_img)

In [100]:
pca = PCA(n_components=1, random_state=42)
X_img_pca = pca.fit_transform(X_img_scaled)
X_img_pca *= 0.3

In [101]:
X_final = np.hstack([X_tab, X_img_pca])

In [122]:
model = xgb.XGBRegressor(
    n_estimators=600,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1
)

In [123]:
model.fit(X_final, y)

0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'reg:squarederror'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,0.8
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [124]:
df_img = pd.read_csv(OUT_DIR/"image_embeddings_resnet50.csv")

df_img = df_img.sort_values("id").reset_index(drop=True)

train_embeddings_array = df_img.drop(columns=["id"]).values


In [125]:
df_testimg = pd.read_csv(OUT_DIR/"test_image_embeddings_resnet50.csv")

df_test_img = df_testimg.sort_values("id").reset_index(drop=True)

test_embeddings_array = df_test_img.drop(columns=["id"]).values

In [126]:
mean_emb = np.mean(train_embeddings_array, axis=0)

In [127]:
test_img_cols = [c for c in df_testimg.columns if c != "id"]

df_test_img_emb = pd.DataFrame(
    test_embeddings_array,
    columns=test_img_cols
)

df_test_img_emb["id"] = df_test_img["id"].values

In [128]:
df_test_merged = df_test.merge(
    df_test_img_emb,
    on="id",
    how="left"
)

In [129]:
for i, col in enumerate(test_img_cols):
    df_test_merged[col].fillna(mean_emb[i], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test_merged[col].fillna(mean_emb[i], inplace=True)


In [130]:
X_test_final = np.hstack([
    X_test[tab_cols].astype(float).values,
    pca.transform(df_test_merged[test_img_cols].values)
])

In [None]:
pred_log = model.predict(X_test_final)
pred_price = np.expm1(pred_log)

submission = pd.DataFrame({
    "id": df_test_merged["id"],
    "predicted_price": pred_price
})

submission.to_csv(ROOT_DIR/"24113027_final.csv", index=False)