In [1]:
import pandas as pd
import numpy as np
import tensorflow.keras.layers
from keras.src.layers import Lambda
from matplotlib import pyplot as plt
import tensorflow as tf
import math
import ydf  # Yggdrasil Decision Forests
from sklearn.model_selection import train_test_split
from wurlitzer import sys_pipes
import keras.layers as preprocessing
from sklearn.model_selection import KFold
import tqdm as tqdm
from sklearn.preprocessing import MinMaxScaler


%matplotlib inline
plt.rcParams['figure.figsize'] = [16, 10]

2025-03-30 09:12:32.901386: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743340352.912744 1567528 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743340352.916277 1567528 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743340352.925991 1567528 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743340352.926000 1567528 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743340352.926001 1567528 computation_placer.cc:177] computation placer alr

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Set memory growth to avoid TensorFlow using all GPU memory
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

        # Set TensorFlow to use only the first GPU (if multiple GPUs available)
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        print("Using GPU:", gpus[0])
    except RuntimeError as e:
        print(e)

Using GPU: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


In [3]:
def train_and_eval(model, train_ds, test_ds = None):
    # Optionally, add evaluation metrics.
    model.compile(metrics=["mse"])
    rmse = 0

    with sys_pipes():
        model.fit(x=train_ds)

    if test_ds is not None:
        evaluation = model.evaluate(x=test_ds, return_dict=True)
        rmse = math.sqrt(evaluation["mse"])

    return rmse

def latlon_to_xyz(lat, lon):
    lat, lon = np.radians(lat), np.radians(lon)
    x = np.cos(lat) * np.cos(lon)
    y = np.cos(lat) * np.sin(lon)
    z = np.sin(lat)
    return x, y, z

# Example: Normalize Cartesian coordinates between 0 and 1
def normalize_xyz(x, y, z):
    # Normalizing each coordinate to the [0, 1] range
    return (x + 1) / 2, (y + 1) / 2, (z + 1) / 2

In [4]:
ROOT_DIR = "temporary"
train_df = pd.read_csv(f'{ROOT_DIR}/Train.csv')
test_df = pd.read_csv(f'{ROOT_DIR}/Test.csv')
vocab_df = pd.read_csv(f'{ROOT_DIR}/variable_descriptions.csv')
admin_df = pd.read_csv(f'{ROOT_DIR}/zaf_adminboundaries_tabulardata.csv', sep=";")

admin_df = admin_df[["ADM4_PCODE", "AREA_SQKM", "ADM2_ID"]] # ADM3_ID
admin_df["AREA_SQKM"] = admin_df["AREA_SQKM"].str.replace(",", ".").astype(float)
train_df = pd.merge(train_df, admin_df, on="ADM4_PCODE", how="left")
test_df = pd.merge(test_df, admin_df, on="ADM4_PCODE", how="left")
label_column = "target"

default_columns = ["ward", "ADM4_PCODE"]
nul_cols = ["dw_12", "dw_13", "lan_13", "pw_08", "pw_07"] # Columns with null values
cat_columns = ["ADM2_ID"] # Categorical columns
ft_columns = default_columns + cat_columns

In [28]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans

scaler = MinMaxScaler()
kmeans = KMeans(n_clusters=5, random_state=42)
encoder = OneHotEncoder()
def preprocess_df(input_df, is_train=False):
    drop_cols = []
    df = input_df.copy()
    df = df.drop(nul_cols, axis=1)

    ## Create a new feature
    df["phi"] = df["total_individuals"] / df["total_households"]
    df["id_area"] = df["total_individuals"] / df["AREA_SQKM"]
    df["hs_area"] = df["total_households"] / df["AREA_SQKM"]
    df["cluster"] = kmeans.fit_predict(df[["lon", "lat"]]) if is_train else kmeans.predict(df[["lon", "lat"]])
    encoded_cluster = encoder.fit_transform(df[["cluster"]]) if is_train else encoder.transform(df[["cluster"]])
    encoded_cluster_df = pd.DataFrame(encoded_cluster.toarray(), columns=encoder.get_feature_names_out(["cluster"]))
    for col in encoded_cluster_df.columns:
        df[col] = encoded_cluster_df[col]

    ## Transform lat and lon to xyz
    df["x"], df["y"], df["z"] = zip(*df.apply(lambda x: latlon_to_xyz(x["lat"], x["lon"]), axis=1))
    df["x"], df["y"], df["z"] = zip(*df.apply(lambda x: normalize_xyz(x["x"], x["y"], x["z"]), axis=1))

    ## Normalize some columns
    norm_cols = ["phi", "NL", "id_area", "hs_area"]
    df[norm_cols] = scaler.fit_transform(df[norm_cols]) if is_train else scaler.transform(df[norm_cols])

    drop_cols = drop_cols + default_columns + cat_columns + ["cluster", "cluster_0"] # Drop one of the encoded columns, to avoid multicollinearity and the cluster column
    if len(drop_cols) > 0:
        df.drop(drop_cols, axis=1, inplace=True)
    
    return df

## Test preprocess_df
# preprocess_df(train_ds_pd, is_train=True).head()

In [36]:
all_models = {
    "GBT": ydf.GradientBoostedTreesLearner,
    "RF": ydf.RandomForestLearner,
}

## Split data on train and test based on the zone
X = train_df.drop(label_column, axis=1)
y = train_df[label_column]
g = train_df["ADM2_ID"]

## Sample a subset of the zone
test_g = g.sample(n=6, random_state=42)
train_g = g[~g.isin(test_g)]
X_train, X_val = X[g.isin(train_g)], X[g.isin(test_g)]
y_train, y_val = y[g.isin(train_g)], y[g.isin(test_g)]

X_train = preprocess_df(X_train, is_train=True)
X_val = preprocess_df(X_val, is_train=False)
X_train["target"] = y_train
X_val["target"] = y_val
outputs = {}

for mn, md in all_models.items():
    tuner = ydf.RandomSearchTuner(num_trials=20, automatic_search_space=True)
    model = md(label=label_column, task=ydf.Task.REGRESSION, num_threads=32, tuner=tuner)
    trainer = model.train(ds=X_train, valid=X_val if mn == "GBT" else None)
    evaluation = trainer.evaluate(X_val)
    outputs[mn] = {
        "evaluation": evaluation,
        "model": model,
        "tuner": tuner,
        "trainer": trainer,
    }

    print(f"{mn}: {evaluation.rmse:.2f}")
# tuning_logs

Train model on 2509 training examples and 313 validation examples
Model trained in 0:24:27.538198
GBT: 3.01
Train model on 2509 examples
Model trained in 1:21:39.314272
RF: 3.24


In [37]:
best_model_name = "GBT"
# best_hp = outputs[best_model_name]["hp"].hyperparameters
# best_model = all_models[best_model_name](label=label_column, task=ydf.Task.REGRESSION, num_threads=32, **best_hp)
# trainer = best_model.train(ds=X_train)
evaluation = outputs[best_model_name]["trainer"].evaluate(X_val)
print(f"Best model: {evaluation.rmse:.2f}")

Best model: 3.01


We can have a better model with tuning the hyperparameters.

In [None]:
test_feature_df = preprocess_df(test_df, is_train=False)
test_df['target'] = outputs[best_model_name]["trainer"].predict(test_feature_df)
timestamp = pd.Timestamp.now().strftime("%Y%m%d%H%M%S")
test_df[['ward', 'target']].to_csv(f'{timestamp}_submission.csv', index=False)