## Installing TensorFlow Decision Forests

In [None]:
!pip install tensorflow_decision_forests



## Importing libraries

In [None]:
import tensorflow_decision_forests as tfdf
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import math
# tf.random.set_seed only seeds TensorFlow; NumPy's global generator is seeded
# separately so that np.random-based steps are reproducible on a fresh run too.
SEED = 8395337
np.random.seed(SEED)
tf.random.set_seed(SEED)

### Load the dataset and convert it into a `tf.data.Dataset`

In [None]:
# The four input files share one long name stem and differ only in the
# trailing sheet name (they look like per-sheet CSV exports of one workbook).
_WORKBOOK_STEM = './Metadata_and_relative_abundance_of_seminal_microbiota_from_idiopathic_infertile_patients_and_donors.xlsx'


def _read_sheet(sheet_name):
  """Reads `<stem> - <sheet_name>.csv` into a DataFrame.

  Cells containing '#DIV/0!' are parsed as missing values (NaN).
  """
  return pd.read_csv(f'{_WORKBOOK_STEM} - {sheet_name}.csv', na_values=['#DIV/0!'])


family_data = _read_sheet('Family-level microbiota')
genus_data = _read_sheet('Genus-level microbiota')
# NOTE(review): 'Pylum' (sic) is kept verbatim; presumably it matches the file name on disk — confirm.
phylum_data = _read_sheet('Pylum-level microbiota')
metadata = _read_sheet('Sample info + Sperm quality')

# Do not truncate columns or rows when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.options.display.max_rows = None

In [None]:
# Attach each sample's clinical status to the three abundance tables.
# The inner join keeps only the Sample IDs present in both frames.
_status_by_sample = metadata[['Sample ID', 'Clinical status']]
_join_options = dict(on='Sample ID', how='inner')

metadata_phylum_relation = _status_by_sample.merge(phylum_data, **_join_options)
metadata_family_relation = _status_by_sample.merge(family_data, **_join_options)
metadata_genus_relation = _status_by_sample.merge(genus_data, **_join_options)

In [None]:
def _encode_clinical_status(frame):
  """Encodes the 'Clinical status' column of `frame` in place as integers.

  'Infertile' becomes 0 and every other value (including missing values)
  becomes 1. If the column is already numeric the frame is left untouched, so
  re-running the cell does not turn the previously encoded 0/1 labels into
  all ones.
  """
  status = frame['Clinical status']
  if pd.api.types.is_numeric_dtype(status):
    return  # Already encoded by an earlier run of this cell.
  # Vectorized equivalent of `0 if x == 'Infertile' else 1`.
  frame['Clinical status'] = status.ne('Infertile').astype('int64')


for _frame in (metadata_phylum_relation, metadata_family_relation, metadata_genus_relation):
  _encode_clinical_status(_frame)


In [None]:
# Remove the 'Sample ID' join key so it is not handed to the model as a feature.
# errors='ignore' keeps the cell re-runnable: a second run finds the column
# already gone instead of raising KeyError.
metadata_phylum_relation = metadata_phylum_relation.drop(columns=['Sample ID'], errors='ignore')
metadata_family_relation = metadata_family_relation.drop(columns=['Sample ID'], errors='ignore')
metadata_genus_relation = metadata_genus_relation.drop(columns=['Sample ID'], errors='ignore')

In [None]:
# Train on the phylum-level table.
# NOTE(review): the family- and genus-level tables are prepared in earlier
# cells but are not used in the cells visible here.
dataset = metadata_phylum_relation

In [None]:
# Split the dataset into a training and a testing dataset.
# DO_EVAL = True holds out a test split; DO_EVAL = False trains on every row
# and creates no test split.
DO_EVAL = True

def split_dataset(dataset, test_ratio=0.15, seed=8395337):
  """Randomly splits a pandas DataFrame into a training and a testing part.

  Each row is assigned to the test part independently with probability
  `test_ratio`, so the test size is only approximately
  `test_ratio * len(dataset)` and may be zero for small inputs.

  Args:
    dataset: DataFrame to split.
    test_ratio: Probability, in [0, 1], that a row lands in the test part.
    seed: Seed of the private random generator used for the draw. The same
      seed and number of rows always produce the same split; pass None to get
      a different split on every call.

  Returns:
    A `(train, test)` tuple of DataFrames that together contain every row of
    `dataset` exactly once.
  """
  # A dedicated generator makes the split reproducible regardless of the state
  # of NumPy's global RNG, which tf.random.set_seed does not control.
  rng = np.random.default_rng(seed)
  test_indices = rng.random(len(dataset)) < test_ratio
  return dataset[~test_indices], dataset[test_indices]

if not DO_EVAL:
  # No hold-out set requested: every row is used for training.
  train_ds_pd = dataset
else:
  # Hold out a random subset for testing and report both sizes.
  train_ds_pd, test_ds_pd = split_dataset(dataset)
  n_train = len(train_ds_pd)
  n_test = len(test_ds_pd)
  print("{} examples in training, {} examples for testing.".format(n_train, n_test))

48 examples in training, 8 examples for testing.


And finally, convert the pandas dataframes (`pd.DataFrame`) into TensorFlow datasets (`tf.data.Dataset`):

In [None]:
label = "Clinical status"

# Both splits are converted with the same options: `label` names the target
# column and fix_feature_names=False is passed through to TF-DF.
_conversion_options = {"label": label, "fix_feature_names": False}

train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_ds_pd, **_conversion_options)
if DO_EVAL:
  # The test frame only exists when a hold-out split was made.
  test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_ds_pd, **_conversion_options)

### Train the model

In [None]:
# Specify the model: a TF-DF random forest with no hyper-parameters set
# explicitly; verbose=2 is passed so training prints detailed logs.
model_1 = tfdf.keras.RandomForestModel(verbose=2)

# Train the model on the training split only.
model_1.fit(train_ds)

Use 2 thread(s) for training
Use /tmp/tmptcmxye7a as temporary training directory
Reading training dataset...
Training tensor examples:
Features: {'Firmicutes': <tf.Tensor 'data:0' shape=(None,) dtype=float64>, 'Proteobacteria': <tf.Tensor 'data_1:0' shape=(None,) dtype=float64>, 'Actinobacteria': <tf.Tensor 'data_2:0' shape=(None,) dtype=float64>, 'Tenericutes': <tf.Tensor 'data_3:0' shape=(None,) dtype=float64>, 'Bacteroidetes': <tf.Tensor 'data_4:0' shape=(None,) dtype=float64>, 'Armatimonadetes': <tf.Tensor 'data_5:0' shape=(None,) dtype=float64>, 'Spirochaetes': <tf.Tensor 'data_6:0' shape=(None,) dtype=float64>, 'Planctomycetes': <tf.Tensor 'data_7:0' shape=(None,) dtype=float64>, 'Verrucomicrobia': <tf.Tensor 'data_8:0' shape=(None,) dtype=float64>, 'Chloroflexi': <tf.Tensor 'data_9:0' shape=(None,) dtype=float64>, 'Chrysiogenetes': <tf.Tensor 'data_10:0' shape=(None,) dtype=float64>, 'Aquificae': <tf.Tensor 'data_11:0' shape=(None,) dtype=float64>, 'Fusobacteria': <tf.Tensor 'd

[INFO 23-12-16 19:50:00.1681 UTC kernel.cc:771] Start Yggdrasil model training
[INFO 23-12-16 19:50:00.1682 UTC kernel.cc:772] Collect training examples
[INFO 23-12-16 19:50:00.1682 UTC kernel.cc:785] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 23-12-16 19:50:00.1688 UTC kernel.cc:391] Number of batches: 1
[INFO 23-12-16 19:50:00.1688 UTC kernel.cc:392] Number of examples: 48
[INFO 23-12-16 19:50:00.1689 UTC kernel.cc:792] Training dataset:
Number of records: 48
Number of columns: 32

Number of columns by type:
	NUMERICAL: 31 (96.875%)
	CATEGORICAL: 1 (3.125%)

Columns:

NUMERICAL: 31 (96.875%)
	0: "Acidobacteria" NUMERICAL mean:0.0107396 min:0 max:0.3866 sd

Model trained in 0:00:00.487677
Compiling model...
Model compiled.


<keras.src.callbacks.History at 0x7cc2e961e1d0>

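Let's evaluate our model on the test dataset. Because the held-out split contains only a handful of examples, treat the resulting score as a rough sanity check rather than a precise estimate.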

In [None]:
# test_ds is only created when DO_EVAL is True, so the evaluation is guarded
# to keep this cell runnable when the model was trained on every row.
if DO_EVAL:
  # Attach the accuracy metric so that evaluate() reports it.
  model_1.compile(metrics=["accuracy"])
  evaluation = model_1.evaluate(test_ds, return_dict=True)

  # Print every reported metric with four decimals.
  for name, value in evaluation.items():
    print(f"{name}: {value:.4f}")

loss: 0.0000
accuracy: 1.0000


## Prepare this model for TensorFlow Serving.

Export the model to the SavedModel format for later re-use, e.g. with
[TensorFlow Serving](https://www.tensorflow.org/tfx/guide/serving).


In [None]:
# Write the trained model to /tmp/my_saved_model.
# NOTE(review): /tmp is typically ephemeral — copy the export elsewhere if it
# must outlive the session.
model_1.save("/tmp/my_saved_model")



In [None]:
# Plot the tree at index 0 of the model, with max_depth=3 limiting how much of
# it is shown.
tfdf.model_plotter.plot_model_in_colab(model_1, tree_idx=0, max_depth=3)

In [None]:
# Display the evaluation stored inside the trained model.
# NOTE(review): presumably the out-of-bag self-evaluation computed during
# training (its num_examples equals the training-set size in the recorded
# output) — confirm against the TF-DF inspector documentation.
model_1.make_inspector().evaluation()

Evaluation(num_examples=48, accuracy=0.6666666666666666, loss=0.6341070346534252, rmse=None, ndcg=None, aucs=None, auuc=None, qini=None)