In [None]:
import pandas as pd
import numpy as np
import tensorflow.keras.layers
from keras.src.layers import Lambda
from matplotlib import pyplot as plt
import seaborn as sns
import tensorflow as tf
import math
import tensorflow_decision_forests as tfdf
from sklearn.model_selection import train_test_split
from wurlitzer import sys_pipes
from tensorflow.keras.layers.experimental import preprocessing
from sklearn.model_selection import KFold
import keras_tuner as kt
import tqdm as tqdm

%matplotlib inline
plt.rcParams['figure.figsize'] = [16, 10]

In [None]:
def train_and_eval(model, train_ds, test_ds = None):
    """Compile and fit ``model``, then report its RMSE on ``test_ds``.

    Args:
        model: object exposing ``compile``, ``fit`` and ``evaluate``.
        train_ds: dataset passed to ``model.fit`` as ``x``.
        test_ds: optional dataset passed to ``model.evaluate`` as ``x``.

    Returns:
        The square root of the ``"mse"`` entry reported by ``model.evaluate``,
        or ``0`` when no ``test_ds`` is given.
    """
    # Register MSE so that evaluate() reports it under the "mse" key.
    model.compile(metrics=["mse"])

    # Training runs inside sys_pipes(); presumably so that native-level
    # training logs are forwarded to the notebook output.
    with sys_pipes():
        model.fit(x=train_ds)

    # Nothing to score against: keep the 0 placeholder.
    if test_ds is None:
        return 0

    scores = model.evaluate(x=test_ds, return_dict=True)
    return math.sqrt(scores["mse"])

def latlon_to_xyz(lat, lon):
    """Map latitude/longitude given in degrees to a point on the unit sphere.

    Args:
        lat: latitude in degrees (scalar or array-like).
        lon: longitude in degrees (scalar or array-like).

    Returns:
        Tuple ``(x, y, z)`` with ``x = cos(lat)cos(lon)``,
        ``y = cos(lat)sin(lon)`` and ``z = sin(lat)``.
    """
    lat_rad = np.radians(lat)
    lon_rad = np.radians(lon)
    cos_lat = np.cos(lat_rad)
    return cos_lat * np.cos(lon_rad), cos_lat * np.sin(lon_rad), np.sin(lat_rad)

# Example: Normalize Cartesian coordinates between 0 and 1
def normalize_xyz(x, y, z):
    """Shift and scale each coordinate by ``(value + 1) / 2``.

    For coordinates in ``[-1, 1]`` this yields values in ``[0, 1]``.

    Returns:
        Tuple of the three rescaled coordinates, in ``(x, y, z)`` order.
    """
    return tuple((coord + 1) / 2 for coord in (x, y, z))

class LatLonToXYZ(tf.keras.layers.Layer):
    """Keras layer converting (lat, lon) pairs in degrees to unit-sphere x, y, z.

    The last input axis is read positionally: index 0 is treated as latitude
    and index 1 as longitude. The output stacks x, y and z along a new
    trailing axis.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs):
        # Degrees -> radians conversion factor, as a float32 constant.
        deg_to_rad = tf.constant(np.pi / 180, dtype=tf.float32)
        lat_rad = inputs[..., 0] * deg_to_rad
        lon_rad = inputs[..., 1] * deg_to_rad

        cos_lat = tf.cos(lat_rad)
        components = [
            cos_lat * tf.cos(lon_rad),  # x
            cos_lat * tf.sin(lon_rad),  # y
            tf.sin(lat_rad),            # z
        ]
        return tf.stack(components, axis=-1)

class NormalizeXYZ(tf.keras.layers.Layer):
    """Keras layer applying ``(inputs + 1) / 2`` element-wise.

    Values in ``[-1, 1]`` are mapped to ``[0, 1]``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs):
        shifted = inputs + 1
        return shifted / 2

def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Build a layer that turns one categorical feature into an encoded vector.

    A lookup layer learns the feature's vocabulary from ``dataset`` and maps
    raw values to integer indices; a ``CategoryEncoding`` layer then encodes
    those indices.

    Args:
        name: key of the feature in the feature dict yielded by ``dataset``.
        dataset: dataset whose elements are ``(features, label)`` pairs, where
            ``features`` can be indexed by ``name``.
        dtype: ``'string'`` selects a ``StringLookup``; any other value
            selects an ``IntegerLookup``.
        max_tokens: optional vocabulary size cap forwarded to the lookup layer.

    Returns:
        A ``Lambda`` layer that applies the adapted lookup followed by the
        category encoder.
    """
    wants_string = dtype == 'string'

    # Lookup layer that will turn raw values into integer indices.
    if wants_string:
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_tokens=max_tokens)

    def select_feature(features, _label):
        """Yield only the requested feature, as real tensor values."""
        values = features[name]
        # The conversion must happen inside the graph. Calling Python str()
        # on the symbolic tensor (what tf.compat.as_str_any does) would bake
        # the tensor's repr into a constant, so the vocabulary would never
        # see the actual category values.
        if wants_string and values.dtype != tf.string:
            values = tf.strings.as_string(values)
        return values

    # Dataset that only yields our feature.
    feature_ds = dataset.map(select_feature)

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Encoder sized to the learned vocabulary (CategoryEncoding's default
    # output mode is used).
    encoder = preprocessing.CategoryEncoding(num_tokens=index.vocabulary_size())

    # The lambda captures both layers so they can be used inside a
    # functional model.
    return Lambda(lambda feature: encoder(index(feature)))

def dataset_preprocessing(unused_columns, categorical_ft_columns, X_train, dataset=None):
    """Build the Keras preprocessing model and the matching TF-DF feature list.

    One ``Input`` is created per column of ``X_train``. On top of the raw
    inputs the following engineered features are added:

    * ``phi``     - total_individuals / total_households
    * ``id_area`` - total_individuals / AREA_SQKM
    * ``hs_area`` - total_households / AREA_SQKM
    * ``xyz``     - (lat, lon) mapped to the unit sphere and rescaled by
      ``NormalizeXYZ``
    * ``zone``    - encoded ``ADM2_ID``

    Args:
        unused_columns: column names to leave out of the feature list.
        categorical_ft_columns: column names whose ``Input`` is declared with
            dtype ``"string"`` and whose feature usage is CATEGORICAL.
        X_train: DataFrame providing the column names and dtypes.
        dataset: dataset of ``(features, label)`` pairs used to learn the
            ``ADM2_ID`` vocabulary. Defaults to the notebook-level
            ``train_ds`` (the original behaviour).

    Returns:
        ``(all_features, preprocessor)``: the list of
        ``tfdf.keras.FeatureUsage`` objects and the ``tf.keras.Model`` mapping
        raw inputs to processed inputs.
    """
    if dataset is None:
        # Backward-compatible fallback to the notebook global; that cell must
        # have been run before this function is called without ``dataset``.
        dataset = train_ds

    raw_inputs = {}
    processed_inputs = {}

    for col in X_train.columns:
        dtype = "string" if col in categorical_ft_columns else X_train.dtypes[col]
        raw_inputs[col] = tf.keras.layers.Input(shape=(1,), name=col, dtype=dtype)
        processed_inputs[col] = raw_inputs[col]

    processed_inputs["phi"] = raw_inputs["total_individuals"] / raw_inputs["total_households"]
    processed_inputs["id_area"] = raw_inputs["total_individuals"] / raw_inputs["AREA_SQKM"]
    processed_inputs["hs_area"] = raw_inputs["total_households"] / raw_inputs["AREA_SQKM"]

    # LatLonToXYZ reads index 0 as latitude and index 1 as longitude, so the
    # inputs must be concatenated in (lat, lon) order. The layer name is kept
    # unchanged so anything looking the layer up by name keeps working.
    lat_lon = tf.keras.layers.Concatenate(name="lon_lat")([raw_inputs["lat"], raw_inputs["lon"]])
    processed_inputs["xyz"] = NormalizeXYZ()(LatLonToXYZ()(lat_lon))

    zone_encoder = get_category_encoding_layer("ADM2_ID", dataset, "string", 100)
    processed_inputs["zone"] = zone_encoder(raw_inputs["ADM2_ID"])

    # Raw columns that are replaced by the engineered features below.
    processed_cols = ["total_individuals", "total_households", "AREA_SQKM", "lon", "lat", "ADM2_ID"]
    new_cols = ["phi", "id_area", "hs_area", "xyz", "zone"]

    # Hoisted out of the loop: columns that must not appear as model features.
    excluded = set(unused_columns) | set(processed_cols)

    all_features = []
    for col in X_train.columns:
        if col in excluded:
            continue
        if col in categorical_ft_columns:
            all_features.append(tfdf.keras.FeatureUsage(name=col, semantic=tfdf.keras.FeatureSemantic.CATEGORICAL))
        else:
            all_features.append(tfdf.keras.FeatureUsage(name=col))

    all_features.extend(tfdf.keras.FeatureUsage(name=col) for col in new_cols)

    preprocessor = tf.keras.Model(inputs=raw_inputs, outputs=processed_inputs, name="preprocessor")

    return all_features, preprocessor

def model_builder(hp):
    """Keras Tuner model-building function.

    Tunes two hyperparameters: ``units`` (width of the hidden Dense layer,
    32 to 512 in steps of 32) and ``learning_rate`` (one of 1e-2, 1e-3, 1e-4).

    NOTE(review): the 28x28 input, the 10 output logits and the sparse
    categorical cross-entropy loss describe an image-style classifier, not
    the tabular regression set up elsewhere in this notebook - confirm this
    builder is meant to be used as-is.

    Args:
        hp: hyperparameter object providing ``Int`` and ``Choice``.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    # Sample the hyperparameters first, in the same order as before.
    hidden_units = hp.Int('units', min_value=32, max_value=512, step=32)
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(units=hidden_units, activation='relu'),
        tf.keras.layers.Dense(10),
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )

    return model

In [None]:
# --- Load raw tables -------------------------------------------------------
df = pd.read_csv('outputs/Train.csv')
test_df = pd.read_csv('outputs/Test.csv')
vocab_df = pd.read_csv('variable_descriptions.csv')
admin_df = pd.read_csv('zaf_adminboundaries_tabulardata.csv', sep=";")

# --- Attach admin-boundary attributes --------------------------------------
# Take an explicit copy of the column subset: assigning a column on the bare
# selection is the chained-assignment pattern pandas warns about
# (SettingWithCopyWarning).
admin_df = admin_df[["ADM4_PCODE", "AREA_SQKM", "ADM2_ID"]].copy() # ADM3_ID
# AREA_SQKM uses a decimal comma; replace it literally before casting.
admin_df["AREA_SQKM"] = admin_df["AREA_SQKM"].str.replace(",", ".", regex=False).astype(float)
# NOTE(review): a left merge duplicates rows if ADM4_PCODE is not unique in
# admin_df - verify against the admin table.
df = pd.merge(df, admin_df, on="ADM4_PCODE", how="left")
test_df = pd.merge(test_df, admin_df, on="ADM4_PCODE", how="left")

# --- Train / validation split ----------------------------------------------
X = df.drop("target", axis=1)
y = df["target"]

# shuffle=False: the last 20% of rows (in file order) form the hold-out set.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

# TF-DF datasets are built from frames that contain the label column.
# Note that `test_ds` wraps the validation split, not `test_df`.
train_ds_pd = pd.concat([X_train, y_train], axis=1)
test_ds_pd = pd.concat([X_val, y_val], axis=1)
label_column = "target"
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label=label_column, task=tfdf.keras.Task.REGRESSION)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_ds_pd, label=label_column, task=tfdf.keras.Task.REGRESSION)

# --- Column groups ----------------------------------------------------------
default_columns = ["ward", "ADM4_PCODE"]
nn_cols = ["dw_00", "psa_00", "lan_00", "pg_00", "pw_00", "stv_00", "car_00", "lln_00"]
categorical_ft_columns = ["ADM2_ID"]

# Alternative column lists, each extending `default_columns`.
ft_columns = [
    default_columns + ["dw_12", "dw_13", "lan_13", "pw_08", "pw_07"],
    default_columns + ["dw_12", "dw_13", "lan_13", "pw_08", "pw_07"] + nn_cols,
    default_columns,
]