In [1]:
# Install Yggdrasil Decision Forests if not installed
!pip install tensorflow_decision_forests pandas numpy

Collecting tensorflow_decision_forests
  Downloading tensorflow_decision_forests-1.11.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (6.0 kB)
Collecting tensorflow==2.18.0 (from tensorflow_decision_forests)
  Downloading tensorflow-2.18.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (4.0 kB)
Collecting wheel (from tensorflow_decision_forests)
  Using cached wheel-0.45.1-py3-none-any.whl.metadata (2.3 kB)
Collecting wurlitzer (from tensorflow_decision_forests)
  Using cached wurlitzer-3.1.1-py3-none-any.whl.metadata (2.5 kB)
Collecting tf-keras~=2.17 (from tensorflow_decision_forests)
  Downloading tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting astunparse>=1.6.0 (from tensorflow==2.18.0->tensorflow_decision_forests)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow==2.18.0->tensorflow_decision_forests)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
# Install Yggdrasil Decision Forests if not installed
!pip install tensorflow_decision_forests pandas numpy

import tensorflow_decision_forests as tfdf
import pandas as pd
import numpy as np
import time
import tensorflow as tf

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

"""
### Function to Load and Preprocess Cancer Data
This function loads the dataset, removes irrelevant columns, handles missing values,
and filters samples based on a given cohort.
"""
def get_X_y(f, root="./data/", cohort=None, verbose=False):
    """Load a CSV feature table and split it into features ``X`` and labels ``y``.

    Parameters
    ----------
    f : str
        File name of the CSV (compression is inferred by pandas). It is
        appended directly to ``root``.
    root : str
        Directory prefix. It is concatenated with ``f`` as-is, so it must end
        with a path separator.
    cohort : iterable of sample IDs, optional
        When non-empty, only rows whose "Sample" value is in ``cohort`` are
        kept. Any iterable works (list, tuple, set, numpy array, pandas
        Series/Index). ``None`` or an empty iterable keeps every row.
    verbose : bool
        When true, print the shapes of the returned objects.

    Returns
    -------
    X : pandas.DataFrame
        Feature columns only: metadata columns and columns that are entirely
        NaN are removed, and remaining NaNs in numeric columns are replaced
        by that column's mean (computed after cohort filtering).
    y : pandas.Series
        The "Cancer Status" column with "Healthy" -> 0 and "Cancer" -> 1.
        Any other value is passed through unchanged. Row-aligned with ``X``.

    Raises
    ------
    KeyError
        If "Cancer Status" is missing, or if a cohort is requested but the
        file has no "Sample" column.
    """
    df = pd.read_csv(root + f)

    non_features = [
        "Run", "Sample", "Library", "Cancer Status", "Tumor type", "Stage",
        "Library volume (uL)", "Library Volume", "UIDs Used", "Experiment",
        "P7", "P7 Primer", "MAF"
    ]

    target = "Cancer Status"

    # Materialise the cohort once. Testing a pandas Series or numpy array
    # with `if cohort:` raises "The truth value of a Series is ambiguous",
    # whereas a list has well-defined truthiness.
    cohort_ids = [] if cohort is None else list(cohort)

    # Build the row mask while "Sample" is still present: it is listed in
    # `non_features` and is removed from the frame a few lines below.
    keep = None
    if cohort_ids:
        if "Sample" not in df.columns:
            raise KeyError(
                f"Cannot filter by cohort: {f} has no 'Sample' column"
            )
        keep = df["Sample"].isin(cohort_ids)

    # Convert labels to 0 (Healthy) and 1 (Cancer). Unknown labels are kept
    # as-is. `map` is used instead of `replace` so the result does not depend
    # on replace's deprecated implicit downcasting.
    label_codes = {"Healthy": 0, "Cancer": 1}
    y = df[target].map(lambda label: label_codes.get(label, label))

    # Remove non-feature columns (only those actually present in this file).
    df = df.drop(columns=[col for col in non_features if col in df.columns])

    # Drop columns with all NaN values.
    df = df.dropna(axis=1, how="all")

    # Apply the same mask to features and labels so they stay row-aligned.
    if keep is not None:
        df = df[keep]
        y = y[keep]

    # Replace NaNs with the column mean. `numeric_only=True` keeps this from
    # raising when a non-numeric column survives the metadata filter.
    X = df.fillna(df.mean(numeric_only=True))

    if verbose:
        print(f"Processed {f}: X shape = {X.shape}, y shape = {y.shape}")

    return X, y

"""
### Load Sample Metadata
Reads the sample list file and extracts different cohorts (Cohort1, Cohort2, and PON).
"""
# Location of the input files; used as a plain string prefix, hence the trailing slash.
DIRECTORY = "./"
sample_list_file = DIRECTORY + "AllSamples.MIGHT.Passed.samples.txt"

# The sample list is a headerless, space-separated table with three columns.
sample_list = pd.read_csv(
    sample_list_file,
    sep=" ",
    header=None,
    names=["library", "sample_id", "cohort"],
)

# Sample IDs per cohort label, each as a pandas Series.
cohort1 = sample_list.loc[sample_list["cohort"].eq("Cohort1"), "sample_id"]
cohort2 = sample_list.loc[sample_list["cohort"].eq("Cohort2"), "sample_id"]
PON = sample_list.loc[sample_list["cohort"].eq("PanelOfNormals"), "sample_id"]

"""
### Load and Preprocess Wise-1 Dataset for Cohort 1
Calls the `get_X_y` function to preprocess the data, returning `X` (features) and `y` (labels).
"""
# Pass the sample IDs as a plain list so the `cohort` argument has
# unambiguous truthiness/length semantics (a pandas Series does not).
X, y = get_X_y(
    "WiseCondorX.Wise-1.csv.gz",
    root=DIRECTORY,
    cohort=cohort1.tolist(),
)

"""
### Convert Data to TensorFlow Dataset
Transforms the Pandas DataFrame into a TensorFlow-compatible dataset for YDF training.
"""
# Attach the label column to a copy of the features (X itself is left untouched),
# then hand the combined frame to TF-DF with "Cancer Status" as the label.
df = X.assign(**{"Cancer Status": y})
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
    df,
    label="Cancer Status",
)

"""
### Train a Random Forest Model using Yggdrasil Decision Forests
Creates and trains a Random Forest classifier with default parameters.
"""
# Random Forest classifier with default hyperparameters.
rf_model = tfdf.keras.RandomForestModel(task=tfdf.keras.Task.CLASSIFICATION)
# Register accuracy as a metric: with a bare compile(), evaluate() has no
# metric to report besides the loss.
rf_model.compile(metrics=["accuracy"])

# Wall-clock time of the fit call only (model construction is excluded).
start_time = time.perf_counter()
rf_model.fit(train_ds)
end_time = time.perf_counter()

print(f"Random Forest Training Time: {end_time - start_time:.2f} seconds")

"""
### Train a Gradient Boosted Trees Model
Uses YDF's Gradient Boosted Trees learner with its default axis-aligned splits. This is not an
oblique model; for oblique splits, pass `split_axis="SPARSE_OBLIQUE"` to the constructor.
"""
# Gradient Boosted Trees classifier with default hyperparameters.
gbt_model = tfdf.keras.GradientBoostedTreesModel(task=tfdf.keras.Task.CLASSIFICATION)
# Register accuracy as a metric: with a bare compile(), evaluate() has no
# metric to report besides the loss.
gbt_model.compile(metrics=["accuracy"])

# Wall-clock time of the fit call only (model construction is excluded).
start_time = time.perf_counter()
gbt_model.fit(train_ds)
end_time = time.perf_counter()

print(f"Gradient Boosted Trees Training Time: {end_time - start_time:.2f} seconds")

"""
### Evaluate Models on the Training Data
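Computes and prints model evaluation metrics (e.g., accuracy, loss). Because the models are
scored on the same data they were trained on, these numbers are optimistic and do not
measure generalization.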
"""
# Score both fitted models on the training dataset, Random Forest first;
# each result is a dict of metric name -> value.
rf_eval, gbt_eval = (
    fitted.evaluate(train_ds, return_dict=True)
    for fitted in (rf_model, gbt_model)
)

# Report both results only after both evaluations have finished.
print("\nRandom Forest Model Evaluation:", rf_eval)
print("Gradient Boosted Trees Model Evaluation:", gbt_eval)

"""
### Generate and Print Predictions
Predicts class probabilities using both trained models and prints sample predictions.
"""
# Run inference with both fitted models on the training dataset, Random Forest first.
rf_predictions, gbt_predictions = [
    fitted.predict(train_ds) for fitted in (rf_model, gbt_model)
]

# Show only the first five predictions of each model to keep the output short.
print("\nSample RF Predictions:", rf_predictions[:5])
print("Sample GBT Predictions:", gbt_predictions[:5])