In [1]:
import pandas as pd
import numpy as np
import tensorflow.keras.layers
from keras.src.layers import Lambda
from matplotlib import pyplot as plt
import seaborn as sns
import tensorflow as tf
import math
import tensorflow_decision_forests as tfdf
from sklearn.model_selection import train_test_split
from wurlitzer import sys_pipes
from tensorflow.keras.layers.experimental import preprocessing
from sklearn.model_selection import KFold
import keras_tuner as kt
import tqdm as tqdm

%matplotlib inline
plt.rcParams['figure.figsize'] = [16, 10]

2025-03-28 08:22:02.394826: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-28 08:22:02.420497: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-28 08:22:03.112807: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-28 08:22:03.113147: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-28 08:22:03.163172: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

In [2]:
def train_and_eval(model, train_ds, test_ds = None):
    # Optionally, add evaluation metrics.
    model.compile(metrics=["mse"])
    rmse = 0

    with sys_pipes():
        model.fit(x=train_ds)

    if test_ds is not None:
        evaluation = model.evaluate(x=test_ds, return_dict=True)
        rmse = math.sqrt(evaluation["mse"])

    return rmse

def latlon_to_xyz(lat, lon):
    lat, lon = np.radians(lat), np.radians(lon)
    x = np.cos(lat) * np.cos(lon)
    y = np.cos(lat) * np.sin(lon)
    z = np.sin(lat)
    return x, y, z

# Example: Normalize Cartesian coordinates between 0 and 1
def normalize_xyz(x, y, z):
    # Normalizing each coordinate to the [0, 1] range
    return (x + 1) / 2, (y + 1) / 2, (z + 1) / 2

class LatLonToXYZ(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(LatLonToXYZ, self).__init__(**kwargs)

    def call(self, inputs):
        lat, lon = inputs[..., 0], inputs[..., 1]  # Assuming inputs shape is (..., 2)
        rad_factor = tf.constant(np.pi / 180, dtype=tf.float32)
        lat, lon = lat * rad_factor, lon * rad_factor

        x = tf.cos(lat) * tf.cos(lon)
        y = tf.cos(lat) * tf.sin(lon)
        z = tf.sin(lat)

        return tf.stack([x, y, z], axis=-1)

class NormalizeXYZ(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(NormalizeXYZ, self).__init__(**kwargs)

    def call(self, inputs):
        return (inputs + 1) / 2  # Normalize to [0,1] range

def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    # Create a StringLookup layer which will turn strings into integer indices
    if dtype == 'string':
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_tokens=max_tokens)

    # TODO
    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: tf.compat.as_str_any(x[name]))

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Create a Discretization for our integer indices.
    encoder = preprocessing.CategoryEncoding(num_tokens=index.vocabulary_size())

    # Apply one-hot encoding to our indices. The lambda function captures the
    # layer so we can use them, or include them in the functional model later.
    # return lambda feature: encoder(index(feature))

    return Lambda(lambda feature: encoder(index(feature)))

def dataset_preprocessing(unused_columns, categorical_ft_columns, X_train):
    used_columns = []

    raw_inputs = {}
    processed_inputs = {}

    for col in X_train.columns:
        dtype = X_train.dtypes[col]
        if col in categorical_ft_columns:
            dtype = "string"
        raw_inputs[col] = tf.keras.layers.Input(shape=(1,), name=col, dtype=dtype)
        processed_inputs[col] = raw_inputs[col]

    processed_inputs["phi"] = raw_inputs["total_individuals"] / raw_inputs["total_households"]
    processed_inputs["id_area"] = raw_inputs["total_individuals"] / raw_inputs["AREA_SQKM"]
    processed_inputs["hs_area"] = raw_inputs["total_households"] / raw_inputs["AREA_SQKM"]
    lon_lat = tf.keras.layers.Concatenate(name="lon_lat")([raw_inputs["lon"], raw_inputs["lat"]])
    processed_inputs["xyz"] = NormalizeXYZ()(LatLonToXYZ()(lon_lat))
    zone_encoder = get_category_encoding_layer("ADM2_ID", train_ds, "string", 100)
    processed_inputs["zone"] = zone_encoder(raw_inputs["ADM2_ID"])
    processed_cols = ["total_individuals", "total_households", "AREA_SQKM", "lon", "lat", "ADM2_ID"]
    new_cols = ["phi", "id_area", "hs_area", "xyz", "zone"]

    all_features = []
    for col in X_train.columns:
        if col not in unused_columns + processed_cols:
            if col in categorical_ft_columns:
                all_features.append(tfdf.keras.FeatureUsage(name=col, semantic=tfdf.keras.FeatureSemantic.CATEGORICAL))
            else:
                all_features.append(tfdf.keras.FeatureUsage(name=col))
            used_columns.append(col)

    for col in new_cols:
        all_features.append(tfdf.keras.FeatureUsage(name=col))
        used_columns.append(col)

    preprocessor = tf.keras.Model(inputs=raw_inputs, outputs=processed_inputs, name="preprocessor")

    return all_features, preprocessor

In [3]:
df = pd.read_csv('outputs/Train.csv')
test_df = pd.read_csv('outputs/Test.csv')
vocab_df = pd.read_csv('variable_descriptions.csv')
admin_df = pd.read_csv('zaf_adminboundaries_tabulardata.csv', sep=";")

admin_df = admin_df[["ADM4_PCODE", "AREA_SQKM", "ADM2_ID"]] # ADM3_ID
admin_df["AREA_SQKM"] = admin_df["AREA_SQKM"].str.replace(",", ".").astype(float)
df = pd.merge(df, admin_df, on="ADM4_PCODE", how="left")
test_df = pd.merge(test_df, admin_df, on="ADM4_PCODE", how="left")

X = df.drop("target", axis=1)
y = df["target"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

train_ds_pd = pd.concat([X_train, y_train], axis=1)
test_ds_pd = pd.concat([X_val, y_val], axis=1)
label_column = "target"
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label=label_column, task=tfdf.keras.Task.REGRESSION)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_ds_pd, label=label_column, task=tfdf.keras.Task.REGRESSION)

default_columns = ["ward", "ADM4_PCODE"]
nn_cols = ["dw_00", "psa_00", "lan_00", "pg_00", "pw_00", "stv_00", "car_00", "lln_00"]
categorical_ft_columns = ["ADM2_ID"]

ft_columns = [
    default_columns + ["dw_12", "dw_13", "lan_13", "pw_08", "pw_07"],
    default_columns + ["dw_12", "dw_13", "lan_13", "pw_08", "pw_07"] + nn_cols,
    default_columns,
]

In [None]:
all_models = {
    "RF": tfdf.keras.RandomForestModel,
    "GBM": tfdf.keras.GradientBoostedTreesModel
}
unused_columns = ft_columns[0]
preprocess_function = dataset_preprocessing
tuning_logs = {mn: {} for mn in all_models}

n_fold = 10
kf = KFold(n_splits=n_fold, shuffle=False)
hps = []
rmse = []
# Find best HP
for train_index, test_index in tqdm.tqdm(kf.split(X_train), desc="Tuning"):
    X_fold_train, X_fold_test = X_train.iloc[train_index], X_train.iloc[test_index]
    y_fold_train, y_fold_test = y_train.iloc[train_index], y_train.iloc[test_index]

    X_fold_train["target"] = y_fold_train
    X_fold_test["target"] = y_fold_test
    train_fold = tfdf.keras.pd_dataframe_to_tf_dataset(X_fold_train, label=label_column, task=tfdf.keras.Task.REGRESSION)
    test_fold = tfdf.keras.pd_dataframe_to_tf_dataset(X_fold_test, label=label_column, task=tfdf.keras.Task.REGRESSION)

    for mn, md in all_models.items():
        tuner = tfdf.tuner.RandomSearch(num_trials=100, use_predefined_hps=True)
        all_features, preprocessor = preprocess_function(unused_columns, categorical_ft_columns, X_train)
        tuned_model = md(features=all_features, exclude_non_specified_features=True, preprocessing=preprocessor, task=tfdf.keras.Task.REGRESSION, tuner=tuner)
        tuned_model.compile(metrics=["mse"])
        tuned_model.fit(train_ds)

        rmse = math.sqrt(tuned_model.evaluate(test_ds, return_dict=True, verbose=0)["mse"])
        tuning_logs = tuned_model.make_inspector().tuning_logs()
        hp = tuning_logs[tuning_logs.best].iloc[0]

        model_log = tuning_logs.get(mn)
        tuning_logs[mn] = {
            "rmse": model_log.get("rmse", []) + [rmse],
            "hp": model_log.get("hp", []) + [hp],
        }

# Eval with best HP
for mn, logs in tuning_logs.items():
    mean_rmse = np.mean(logs["rmse"])
    std_rmse = np.std(logs["rmse"])
    best_hp_idx = np.argmin(logs["hp"])
    md = all_models[mn]
    best_hp = logs["hp"][best_hp_idx]
    all_features, preprocessor = preprocess_function(unused_columns, categorical_ft_columns, X_train)
    best_model = md(features=all_features, exclude_non_specified_features=True, preprocessing=preprocessor, task=tfdf.keras.Task.REGRESSION, **best_hp)
    best_model.compile(metrics=["mse"])
    best_model.fit(train_ds)

    rmse = math.sqrt(best_model.evaluate(test_ds, return_dict=True, verbose=0)["mse"])
    print(f"{mn}: RMSE: {rmse}, HP MRMSE: {mean_rmse}, HP SMRMSE: {std_rmse}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_fold_train["target"] = y_fold_train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_fold_test["target"] = y_fold_test


Use /tmp/tmp0qqeype5 as temporary training directory
Reading training dataset...
Training dataset read in 0:00:11.252964. Found 2257 examples.
Training model...
