In [9]:
import pandas as pd
import numpy as np
import tensorflow.keras.layers
from keras.src.layers import Lambda
from matplotlib import pyplot as plt
import seaborn as sns
import tensorflow as tf
import math
import tensorflow_decision_forests as tfdf
from sklearn.model_selection import train_test_split
from wurlitzer import sys_pipes
from tensorflow.keras.layers.experimental import preprocessing
from sklearn.model_selection import KFold
import keras_tuner as kt
import tqdm as tqdm

%matplotlib inline
plt.rcParams['figure.figsize'] = [16, 10]

## Helpers

In [2]:
def train_and_eval(model, train_ds, test_ds = None):
    # Optionally, add evaluation metrics.
    model.compile(metrics=["mse"])
    rmse = 0

    with sys_pipes():
        model.fit(x=train_ds)

    if test_ds is not None:
        evaluation = model.evaluate(x=test_ds, return_dict=True)
        rmse = math.sqrt(evaluation["mse"])

    return rmse

def latlon_to_xyz(lat, lon):
    lat, lon = np.radians(lat), np.radians(lon)
    x = np.cos(lat) * np.cos(lon)
    y = np.cos(lat) * np.sin(lon)
    z = np.sin(lat)
    return x, y, z

# Example: Normalize Cartesian coordinates between 0 and 1
def normalize_xyz(x, y, z):
    # Normalizing each coordinate to the [0, 1] range
    return (x + 1) / 2, (y + 1) / 2, (z + 1) / 2

class LatLonToXYZ(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(LatLonToXYZ, self).__init__(**kwargs)

    def call(self, inputs):
        lat, lon = inputs[..., 0], inputs[..., 1]  # Assuming inputs shape is (..., 2)
        rad_factor = tf.constant(np.pi / 180, dtype=tf.float32)
        lat, lon = lat * rad_factor, lon * rad_factor

        x = tf.cos(lat) * tf.cos(lon)
        y = tf.cos(lat) * tf.sin(lon)
        z = tf.sin(lat)

        return tf.stack([x, y, z], axis=-1)

class NormalizeXYZ(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(NormalizeXYZ, self).__init__(**kwargs)

    def call(self, inputs):
        return (inputs + 1) / 2  # Normalize to [0,1] range

def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    # Create a StringLookup layer which will turn strings into integer indices
    if dtype == 'string':
        index = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        index = preprocessing.IntegerLookup(max_tokens=max_tokens)

    # TODO
    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: tf.compat.as_str_any(x[name]))

    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)

    # Create a Discretization for our integer indices.
    encoder = preprocessing.CategoryEncoding(num_tokens=index.vocabulary_size())

    # Apply one-hot encoding to our indices. The lambda function captures the
    # layer so we can use them, or include them in the functional model later.
    # return lambda feature: encoder(index(feature))

    return Lambda(lambda feature: encoder(index(feature)))


In [3]:
def find_best_hp(model, is_dnn_model, train_ds, test_ds):
    if not is_dnn_model:
        tuner = tfdf.tuner.RandomSearch(num_trials=100, use_predefined_hps=True)
        tuned_model = model(tuner=tuner)
        model.compile(metrics=["mse"])
        tuned_model.fit(train_ds)

        k_rmse = math.sqrt(tuned_model.evaluate(test_ds, return_dict=True, verbose=0)["mse"])
        tuning_logs = tuned_model.make_inspector().tuning_logs()
        best_hps = tuning_logs[tuning_logs.best].iloc[0]

    else:
        tuner = kt.Hyperband(build_dnn_model, objective='mse', max_epochs=100,
                             overwrite=True,
                             factor=3,
                             directory='/tmp/keras_tuning',
                             project_name='women_poverty')

        stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
        tuner.search(train_ds, validation_data=test_ds, callbacks=[stop_early])
        best_hps = tuner.get_best_hyperparameters()[0].values
        hypermodel = tuner.hypermodel.build(best_hps)
        hypermodel.fit(train_ds, callbacks=[stop_early])
        k_rmse = math.sqrt(hypermodel.evaluate(test_ds, return_dict=True, verbose=0)["mse"])

    return k_rmse, best_hps

def dataset_preprocessing(unused_columns, categorical_ft_columns, X_train):
    used_columns = []

    raw_inputs = {}
    processed_inputs = {}

    for col in X_train.columns:
        dtype = X_train.dtypes[col]
        if col in categorical_ft_columns:
            dtype = "string"
        raw_inputs[col] = tf.keras.layers.Input(shape=(1,), name=col, dtype=dtype)
        processed_inputs[col] = raw_inputs[col]

    processed_inputs["phi"] = raw_inputs["total_individuals"] / raw_inputs["total_households"]
    processed_inputs["id_area"] = raw_inputs["total_individuals"] / raw_inputs["AREA_SQKM"]
    processed_inputs["hs_area"] = raw_inputs["total_households"] / raw_inputs["AREA_SQKM"]
    lon_lat = tf.keras.layers.Concatenate(name="lon_lat")([raw_inputs["lon"], raw_inputs["lat"]])
    processed_inputs["xyz"] = NormalizeXYZ()(LatLonToXYZ()(lon_lat))
    zone_encoder = get_category_encoding_layer("ADM2_ID", train_ds, "string", 100)
    processed_inputs["zone"] = zone_encoder(raw_inputs["ADM2_ID"])
    processed_cols = ["total_individuals", "total_households", "AREA_SQKM", "lon", "lat", "ADM2_ID"]
    new_cols = ["phi", "id_area", "hs_area", "xyz", "zone"]

    all_features = []
    for col in X_train.columns:
        if col not in unused_columns + processed_cols:
            if col in categorical_ft_columns:
                all_features.append(tfdf.keras.FeatureUsage(name=col, semantic=tfdf.keras.FeatureSemantic.CATEGORICAL))
            else:
                all_features.append(tfdf.keras.FeatureUsage(name=col))
            used_columns.append(col)

    for col in new_cols:
        all_features.append(tfdf.keras.FeatureUsage(name=col))
        used_columns.append(col)

    preprocessor = tf.keras.Model(inputs=raw_inputs, outputs=processed_inputs, name="preprocessor")

    return all_features, preprocessor

## Load Data

In [7]:
df = pd.read_csv('outputs/Train.csv')
test_df = pd.read_csv('outputs/Test.csv')
vocab_df = pd.read_csv('variable_descriptions.csv')
admin_df = pd.read_csv('zaf_adminboundaries_tabulardata.csv', sep=";")

admin_df = admin_df[["ADM4_PCODE", "AREA_SQKM"]]
admin_df["AREA_SQKM"] = admin_df["AREA_SQKM"].str.replace(",", ".").astype(float)
df = pd.merge(df, admin_df, on="ADM4_PCODE", how="left")
test_df = pd.merge(test_df, admin_df, on="ADM4_PCODE", how="left")

X = df.drop("target", axis=1)
y = df["target"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

In [8]:
train_ds_pd = pd.concat([X_train, y_train], axis=1)
test_ds_pd = pd.concat([X_val, y_val], axis=1)
label = "target"
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label=label, task=tfdf.keras.Task.REGRESSION)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_ds_pd, label=label, task=tfdf.keras.Task.REGRESSION)

## TFDF baseline

In [12]:
tfdf.keras.get_all_models()


[tensorflow_decision_forests.keras.RandomForestModel,
 tensorflow_decision_forests.keras.GradientBoostedTreesModel,
 tensorflow_decision_forests.keras.CartModel,
 tensorflow_decision_forests.keras.DistributedGradientBoostedTreesModel]

In [21]:
## show some row of train_ds
for row in train_ds.take(1):
    print(len(row),  row[1][0], row[0]["ward"])

2 tf.Tensor(16.7737569179, shape=(), dtype=float64) tf.Tensor(
[b'41601001: Ward 1' b'41601002: Ward 2' b'41601003: Ward 3'
 b'41601004: Ward 4' b'41601005: Ward 5' b'41601006: Ward 6'
 b'41602001: Ward 1' b'41602002: Ward 2' b'41602003: Ward 3'
 b'41602004: Ward 4' b'41602005: Ward 5' b'41602006: Ward 6'
 b'41602007: Ward 7' b'41602008: Ward 8' b'41603001: Ward 1'
 b'41603002: Ward 2' b'41603003: Ward 3' b'41603004: Ward 4'
 b'41603005: Ward 5' b'41603006: Ward 6' b'41801001: Ward 1'
 b'41801002: Ward 2' b'41801003: Ward 3' b'41801004: Ward 4'
 b'41801005: Ward 5' b'41801006: Ward 6' b'41801007: Ward 7'
 b'41801008: Ward 8' b'41801009: Ward 9' b'41801010: Ward 10'
 b'41802001: Ward 1' b'41802002: Ward 2' b'41802003: Ward 3'
 b'41802004: Ward 4' b'41803001: Ward 1' b'41803002: Ward 2'
 b'41803003: Ward 3' b'41803004: Ward 4' b'41803005: Ward 5'
 b'41803006: Ward 6' b'41803007: Ward 7' b'41803008: Ward 8'
 b'41804001: Ward 1' b'41804002: Ward 2' b'41804003: Ward 3'
 b'41804004: Ward 4' 

In [24]:
# Random Forest
base_rf_model = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION)
rf_rmse = train_and_eval(base_rf_model, train_ds, test_ds)

# GradientBoostedTreesModel
base_gbt_model = tfdf.keras.GradientBoostedTreesModel(task = tfdf.keras.Task.REGRESSION)
gbm_rmse = train_and_eval(base_gbt_model, train_ds, test_ds)

print("\n\n")
print(f"Random Forest RMSE: {rf_rmse}")
print(f"GBM RMSE: {gbm_rmse}")

Use /tmp/tmpg4_8p3hf as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.512333. Found 2257 examples.
Training model...
Model trained in 0:00:01.521313
Compiling model...
Model compiled.
Use /tmp/tmpfi4p8c8a as temporary training directory




Reading training dataset...
Training dataset read in 0:00:00.710514. Found 2257 examples.
Training model...
Model trained in 0:00:02.749378
Compiling model...
Model compiled.



Random Forest RMSE: 3.948803133653853
GBM RMSE: 3.935451035007041


In [27]:
# Random Forest
base_rf_model = tfdf.keras.RandomForestModel(task = tfdf.keras.Task.REGRESSION, hyperparameter_template="benchmark_rank1")
rf_rmse = train_and_eval(base_rf_model, train_ds, test_ds)

# GradientBoostedTreesModel
base_gbt_model = tfdf.keras.GradientBoostedTreesModel(task = tfdf.keras.Task.REGRESSION, hyperparameter_template="benchmark_rank1")
gbm_rmse = train_and_eval(base_gbt_model, train_ds, test_ds)

print("\n\n")
print(f"Random Forest RMSE: {rf_rmse}")
print(f"GBM RMSE: {gbm_rmse}")

Resolve hyper-parameter template "benchmark_rank1" to "benchmark_rank1@v1" -> {'winner_take_all': True, 'categorical_algorithm': 'RANDOM', 'split_axis': 'SPARSE_OBLIQUE', 'sparse_oblique_normalization': 'MIN_MAX', 'sparse_oblique_num_projections_exponent': 1.0}.
Use /tmp/tmpna0yrmuz as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.543317. Found 2257 examples.
Training model...
Model trained in 0:00:12.252680
Compiling model...
Model compiled.
Resolve hyper-parameter template "benchmark_rank1" to "benchmark_rank1@v1" -> {'growing_strategy': 'BEST_FIRST_GLOBAL', 'categorical_algorithm': 'RANDOM', 'split_axis': 'SPARSE_OBLIQUE', 'sparse_oblique_normalization': 'MIN_MAX', 'sparse_oblique_num_projections_exponent': 1.0}.
Use /tmp/tmp1akv3qn7 as temporary training directory




Reading training dataset...


 25-03-26 07:32:53.4515 EDT gradient_boosted_trees.cc:1886] "goss_alpha" set but "sampling_method" not equal to "GOSS".


Training dataset read in 0:00:00.467344. Found 2257 examples.
Training model...
Model trained in 0:00:04.162935
Compiling model...
Model compiled.



Random Forest RMSE: 3.8236503763180782
GBM RMSE: 3.7203017130907248


In [38]:
rs = {}
for md, nm in zip([tfdf.keras.RandomForestModel, tfdf.keras.GradientBoostedTreesModel, tfdf.keras.CartModel], ["RF", "GBM", "CM"]):
    rs[nm] = {}
    for bhp in md.predefined_hyperparameters():
        tmd = md(task = tfdf.keras.Task.REGRESSION, hyperparameter_template=bhp.name)
        gbm_rmse = train_and_eval(tmd, train_ds, test_ds)
        rs[nm][bhp.name] = gbm_rmse


print("\n\n")
rs

Resolve hyper-parameter template "better_default" to "better_default@v1" -> {'winner_take_all': True}.
Use /tmp/tmpwxxgpi7w as temporary training directory
Reading training dataset...
Training dataset read in 0:00:01.338744. Found 2257 examples.
Training model...
Model trained in 0:00:03.156727
Compiling model...
Model compiled.
Resolve hyper-parameter template "benchmark_rank1" to "benchmark_rank1@v1" -> {'winner_take_all': True, 'categorical_algorithm': 'RANDOM', 'split_axis': 'SPARSE_OBLIQUE', 'sparse_oblique_normalization': 'MIN_MAX', 'sparse_oblique_num_projections_exponent': 1.0}.
Use /tmp/tmph7snb5vv as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.639573. Found 2257 examples.
Training model...
Model trained in 0:00:14.265966
Compiling model...
Model compiled.
Resolve hyper-parameter template "better_default" to "better_default@v1" -> {'growing_strategy': 'BEST_FIRST_GLOBAL'}.
Use /tmp/tmpnoxvprcr as temporary training directory




Reading training dataset...
Training dataset read in 0:00:00.733964. Found 2257 examples.
Training model...
Model trained in 0:00:03.850088
Compiling model...
Model compiled.
Resolve hyper-parameter template "benchmark_rank1" to "benchmark_rank1@v1" -> {'growing_strategy': 'BEST_FIRST_GLOBAL', 'categorical_algorithm': 'RANDOM', 'split_axis': 'SPARSE_OBLIQUE', 'sparse_oblique_normalization': 'MIN_MAX', 'sparse_oblique_num_projections_exponent': 1.0}.
Use /tmp/tmpae5pvmhq as temporary training directory




Reading training dataset...
Training dataset read in 0:00:00.773336. Found 2257 examples.
Training model...
Model trained in 0:00:04.289561
Compiling model...
Model compiled.





{'RF': {'better_default': 3.948803133653853,
  'benchmark_rank1': 3.8236503763180782},
 'GBM': {'better_default': 3.87372051157626,
  'benchmark_rank1': 3.7203017130907248},
 'CM': {}}

In [40]:
cart_md = tfdf.keras.CartModel(task = tfdf.keras.Task.REGRESSION)
cart_rmse = train_and_eval(cart_md, train_ds, test_ds)
cart_rmse

Use /tmp/tmpg2ep1kcn as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.600077. Found 2257 examples.
Training model...
Model trained in 0:00:00.064971
Compiling model...
Model compiled.


5.15198058033476

In [28]:
unk_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df, task=tfdf.keras.Task.REGRESSION)
main_ds = tfdf.keras.pd_dataframe_to_tf_dataset(df, label=label, task=tfdf.keras.Task.REGRESSION)

final_gbt_model = tfdf.keras.GradientBoostedTreesModel(task = tfdf.keras.Task.REGRESSION, hyperparameter_template="benchmark_rank1")
train_and_eval(final_gbt_model, main_ds)
predictions = final_gbt_model.predict(unk_ds)

Resolve hyper-parameter template "benchmark_rank1" to "benchmark_rank1@v1" -> {'growing_strategy': 'BEST_FIRST_GLOBAL', 'categorical_algorithm': 'RANDOM', 'split_axis': 'SPARSE_OBLIQUE', 'sparse_oblique_normalization': 'MIN_MAX', 'sparse_oblique_num_projections_exponent': 1.0}.
Use /tmp/tmp706heg6c as temporary training directory




Reading training dataset...
Training dataset read in 0:00:00.597815. Found 2822 examples.
Training model...
Model trained in 0:00:07.076419
Compiling model...
Model compiled.


In [29]:
timestamp = pd.Timestamp.now().strftime("%Y%m%d%H%M%S")
test_df['target'] = predictions
test_df[['ward', 'target']].to_csv(f'outputs/{timestamp}_submission.csv', index=False)
# I score 3.74

In [30]:
print(tfdf.keras.GradientBoostedTreesModel.predefined_hyperparameters()[0].name)
print(tfdf.keras.RandomForestModel.predefined_hyperparameters())

better_default
[HyperParameterTemplate(name='better_default', version=1, parameters={'winner_take_all': True}, description='A configuration that is generally better than the default parameters without being more expensive.'), HyperParameterTemplate(name='benchmark_rank1', version=1, parameters={'winner_take_all': True, 'categorical_algorithm': 'RANDOM', 'split_axis': 'SPARSE_OBLIQUE', 'sparse_oblique_normalization': 'MIN_MAX', 'sparse_oblique_num_projections_exponent': 1.0}, description='Top ranking hyper-parameters on our benchmark slightly modified to run in reasonable time.')]


In [33]:
final_gbt_model.summary()

Model: "gradient_boosted_trees_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1 (1.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 1 (1.00 Byte)
_________________________________________________________________
Type: "GRADIENT_BOOSTED_TREES"
Task: REGRESSION
Label: "__LABEL"

Input Features (63):
	ADM4_PCODE
	AREA_SQKM
	NL
	car_00
	car_01
	dw_00
	dw_01
	dw_02
	dw_03
	dw_04
	dw_05
	dw_06
	dw_07
	dw_08
	dw_09
	dw_10
	dw_11
	dw_12
	dw_13
	lan_00
	lan_01
	lan_02
	lan_03
	lan_04
	lan_05
	lan_06
	lan_07
	lan_08
	lan_09
	lan_10
	lan_11
	lan_12
	lan_13
	lan_14
	lat
	lgt_00
	lln_00
	lln_01
	lon
	pg_00
	pg_01
	pg_02
	pg_03
	pg_04
	psa_00
	psa_01
	psa_02
	psa_03
	psa_04
	pw_00
	pw_01
	pw_02
	pw_03
	pw_04
	pw_05
	pw_06
	pw_07
	pw_08
	stv_00
	stv_01
	total_households
	total_individuals
	ward

No weights

Variable Importance: INV_MEAN_MIN_DEPTH:
    1.            "car_01"  0.190

In [34]:
final_gbt_model.make_inspector().features()

["ADM4_PCODE" (4; #0),
 "AREA_SQKM" (1; #1),
 "NL" (1; #2),
 "car_00" (1; #4),
 "car_01" (1; #5),
 "dw_00" (1; #6),
 "dw_01" (1; #7),
 "dw_02" (1; #8),
 "dw_03" (1; #9),
 "dw_04" (1; #10),
 "dw_05" (1; #11),
 "dw_06" (1; #12),
 "dw_07" (1; #13),
 "dw_08" (1; #14),
 "dw_09" (1; #15),
 "dw_10" (1; #16),
 "dw_11" (1; #17),
 "dw_12" (1; #18),
 "dw_13" (1; #19),
 "lan_00" (1; #20),
 "lan_01" (1; #21),
 "lan_02" (1; #22),
 "lan_03" (1; #23),
 "lan_04" (1; #24),
 "lan_05" (1; #25),
 "lan_06" (1; #26),
 "lan_07" (1; #27),
 "lan_08" (1; #28),
 "lan_09" (1; #29),
 "lan_10" (1; #30),
 "lan_11" (1; #31),
 "lan_12" (1; #32),
 "lan_13" (1; #33),
 "lan_14" (1; #34),
 "lat" (1; #35),
 "lgt_00" (1; #36),
 "lln_00" (1; #37),
 "lln_01" (1; #38),
 "lon" (1; #39),
 "pg_00" (1; #40),
 "pg_01" (1; #41),
 "pg_02" (1; #42),
 "pg_03" (1; #43),
 "pg_04" (1; #44),
 "psa_00" (1; #45),
 "psa_01" (1; #46),
 "psa_02" (1; #47),
 "psa_03" (1; #48),
 "psa_04" (1; #49),
 "pw_00" (1; #50),
 "pw_01" (1; #51),
 "pw_02" (1; 

In [35]:
tfdf.model_plotter.plot_model_in_colab(final_gbt_model, tree_idx=0, max_depth=3)

In [36]:
# Explain here: https://github.com/tensorflow/decision-forests/issues/52#issuecomment-925755243
final_gbt_model.make_inspector().variable_importances()

{'INV_MEAN_MIN_DEPTH': [("car_01" (1; #5), 0.19088141764709893),
  ("car_00" (1; #4), 0.18806990617584238),
  ("NL" (1; #2), 0.18095244829611615),
  ("dw_02" (1; #8), 0.17698648498938924),
  ("dw_01" (1; #7), 0.17451612373401998),
  ("AREA_SQKM" (1; #1), 0.17369229147518211),
  ("dw_10" (1; #16), 0.17364537349343007),
  ("dw_07" (1; #13), 0.17342033757502368),
  ("psa_00" (1; #45), 0.1724796905295596),
  ("dw_06" (1; #12), 0.1718197306783491),
  ("dw_11" (1; #17), 0.1715041243202503),
  ("dw_04" (1; #10), 0.17012027575605337),
  ("dw_05" (1; #11), 0.1697224675194455),
  ("dw_00" (1; #6), 0.16900865199538798),
  ("dw_08" (1; #14), 0.1684114561182447),
  ("lan_04" (1; #24), 0.16839273851328226),
  ("lan_01" (1; #21), 0.16794873289792064),
  ("dw_03" (1; #9), 0.1678277632258081),
  ("dw_12" (1; #18), 0.16712009177654324),
  ("lan_00" (1; #20), 0.16711329814098572),
  ("dw_09" (1; #15), 0.1668645794731588),
  ("lan_03" (1; #23), 0.16676436321119031),
  ("lan_09" (1; #29), 0.166554259693737

In [37]:
final_gbt_model.make_inspector().evaluation()

Evaluation(num_examples=None, accuracy=None, loss=2.7011845111846924, rmse=2.7011845111846924, ndcg=None, aucs=None, auuc=None, qini=None)

Our baseline model using TFDF only use numerical values, and score here 3.72 on our test set and on the lb 3.74

## TFDF with Feature engineering

In [41]:
df = pd.read_csv('outputs/Train.csv')
test_df = pd.read_csv('outputs/Test.csv')
vocab_df = pd.read_csv('variable_descriptions.csv')
admin_df = pd.read_csv('zaf_adminboundaries_tabulardata.csv', sep=";")

admin_df = admin_df[["ADM4_PCODE", "AREA_SQKM", "ADM2_ID"]] # ADM3_ID
admin_df["AREA_SQKM"] = admin_df["AREA_SQKM"].str.replace(",", ".").astype(float)
df = pd.merge(df, admin_df, on="ADM4_PCODE", how="left")
test_df = pd.merge(test_df, admin_df, on="ADM4_PCODE", how="left")

X = df.drop("target", axis=1)
y = df["target"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

train_ds_pd = pd.concat([X_train, y_train], axis=1)
test_ds_pd = pd.concat([X_val, y_val], axis=1)
label = "target"
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, label=label, task=tfdf.keras.Task.REGRESSION)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_ds_pd, label=label, task=tfdf.keras.Task.REGRESSION)

In [82]:
default_columns = ["ward", "ADM4_PCODE"]
nn_cols = ["dw_00", "psa_00", "lan_00", "pg_00", "pw_00", "stv_00", "car_00", "lln_00"]
categorical_ft_columns = ["ADM2_ID"]

ft_columns = [
    default_columns + ["dw_12", "dw_13", "lan_13", "pw_08", "pw_07"],
    default_columns + ["dw_12", "dw_13", "lan_13", "pw_08", "pw_07"] + nn_cols,
    default_columns,
]

In [108]:
X_train["ADM2_ID"][0]

'DC16'

In [176]:
unused_columns = ft_columns[0]
used_columns = []

raw_inputs = {}
processed_inputs = {}
processed_cols = []

for col in X_train.columns:
    dtype = X_train.dtypes[col]
    if col in categorical_ft_columns:
        dtype = "string"
    raw_inputs[col] = tf.keras.layers.Input(shape=(1,), name=col, dtype=dtype)
    processed_inputs[col] = raw_inputs[col]

processed_inputs["phi"] = raw_inputs["total_individuals"] / raw_inputs["total_households"]
processed_inputs["id_area"] = raw_inputs["total_individuals"] / raw_inputs["AREA_SQKM"]
processed_inputs["hs_area"] = raw_inputs["total_households"] / raw_inputs["AREA_SQKM"]
lon_lat = tf.keras.layers.Concatenate(name="lon_lat")([raw_inputs["lon"], raw_inputs["lat"]])
processed_inputs["xyz"] = NormalizeXYZ()(LatLonToXYZ()(lon_lat))
zone_encoder = get_category_encoding_layer("ADM2_ID", train_ds, "string", 100)
processed_inputs["zone"] = zone_encoder(raw_inputs["ADM2_ID"])
processed_cols = ["total_individuals", "total_households", "AREA_SQKM", "lon", "lat", "ADM2_ID"]
new_cols = ["phi", "id_area", "hs_area", "xyz", "zone"]
categorical_new_cols = ["zone"]

all_features = []
for col in X_train.columns:
    if col not in unused_columns + processed_cols:
        if col in categorical_ft_columns:
            all_features.append(tfdf.keras.FeatureUsage(name=col, semantic=tfdf.keras.FeatureSemantic.CATEGORICAL))
        else:
            all_features.append(tfdf.keras.FeatureUsage(name=col))
        used_columns.append(col)

for col in new_cols:
    all_features.append(tfdf.keras.FeatureUsage(name=col))
    used_columns.append(col)

preprocessor = tf.keras.Model(inputs=raw_inputs, outputs=processed_inputs, name="preprocessor")

base_model = tfdf.keras.GradientBoostedTreesModel(features=all_features, exclude_non_specified_features=True, preprocessing=preprocessor, task=tfdf.keras.Task.REGRESSION, hyperparameter_template="benchmark_rank1")

Resolve hyper-parameter template "benchmark_rank1" to "benchmark_rank1@v1" -> {'growing_strategy': 'BEST_FIRST_GLOBAL', 'categorical_algorithm': 'RANDOM', 'split_axis': 'SPARSE_OBLIQUE', 'sparse_oblique_normalization': 'MIN_MAX', 'sparse_oblique_num_projections_exponent': 1.0}.
Use /tmp/tmpxzl3d07v as temporary training directory




In [181]:
gbm_rmse = train_and_eval(base_model, train_ds, test_ds)
gbm_rmse

Reading training dataset...
Training dataset read in 0:00:00.947761. Found 2257 examples.
Training model...
Model trained in 0:00:06.876870
Compiling model...
Model compiled.


3.988094852287529

The rmse of the model with new feature is not better than previously, it need better FE.

In [185]:
tf.keras.utils.plot_model(base_model, show_shapes=True, rankdir="LR")

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [114]:
base_model.make_inspector().variable_importances()

{'INV_MEAN_MIN_DEPTH': [("car_00" (1; #2), 0.20574400280849667),
  ("car_01" (1; #3), 0.19369827712912496),
  ("NL" (1; #0), 0.19333293887370645),
  ("psa_00" (1; #41), 0.1750128830898806),
  ("dw_00" (1; #4), 0.17491623126624706),
  ("dw_01" (1; #5), 0.1735350193939619),
  ("dw_03" (1; #7), 0.1714179999961673),
  ("dw_10" (1; #14), 0.17107255573738647),
  ("dw_02" (1; #6), 0.17105417369549325),
  ("dw_07" (1; #11), 0.17043365231453766),
  ("dw_06" (1; #10), 0.17037786596669036),
  ("lan_05" (1; #23), 0.1694133517830304),
  ("dw_08" (1; #12), 0.169086774550926),
  ("lan_12" (1; #30), 0.16881749301505897),
  ("dw_09" (1; #13), 0.16813116764746475),
  ("dw_04" (1; #8), 0.16812885043671397),
  ("phi" (1; #40), 0.16802357436350301),
  ("dw_05" (1; #9), 0.16799287661940232),
  ("pw_00" (1; #46), 0.16750311250014616),
  ("hs_area" (1; #16), 0.16746402406072686),
  ("dw_11" (1; #15), 0.16690361044894161),
  ("lan_00" (1; #18), 0.16684662477453702),
  ("lan_07" (1; #25), 0.1665556182055481),
 