In [None]:
import torch
from mambular.base_models.mambular import Mambular
from mambular.base_models.tabtransformer import TabTransformer
from mambular.base_models.ft_transformer import FTTransformer
from mambular.base_models.mlp import MLP
from mambular.base_models.mambatab import MambaTab
from mambular.base_models.resnet import ResNet
from mambular.base_models.mambattn import MambAttention
from mambular.base_models.tabularnn import TabulaRNN
import pandas as pd
import numpy as np
from accelerate import Accelerator
from accelerate.utils import ProfileKwargs
import re
from torch.profiler import profile, ProfilerActivity


# GPU efficiency vs. number of features (10 to 90 in steps of 10, batch size of 32)

In [None]:

def _profiler_totals(prof):
    """Summarise one profiler run as the four metric columns of the results table.

    The numeric totals are read straight from
    ``prof.key_averages().total_average()`` rather than regex-parsed from its
    ``repr``. The repr prints times with a magnitude-dependent unit
    (``us``/``ms``/``s``) and memory as a signed integer, so patterns that
    require an ``ms`` suffix or digits only silently fall back to 0.0 for
    every other case.

    Args:
        prof: A finished ``torch.profiler.profile`` context.

    Returns:
        dict mapping column name to float: self CPU / CUDA time converted from
        microseconds to milliseconds, and CPU / CUDA memory converted from
        bytes to MB (1024 ** 2 bytes).
    """
    totals = prof.key_averages().total_average()

    def _read(*names):
        # Newer PyTorch releases use device-generic attribute names while
        # older ones use cuda_* names; take the first one that exists.
        for name in names:
            value = getattr(totals, name, None)
            if value is not None:
                return value
        return 0

    bytes_per_mb = 1024 ** 2
    return {
        "Total CPU Time (ms)": _read("self_cpu_time_total") / 1000.0,
        "Total CUDA Time (ms)": _read("self_device_time_total", "self_cuda_time_total") / 1000.0,
        "Total CPU Memory (MB)": _read("cpu_memory_usage") / bytes_per_mb,
        "Total CUDA Memory (MB)": _read("device_memory_usage", "cuda_memory_usage") / bytes_per_mb,
    }


# Column order of the results table.
result_columns = [
    "Model",
    "Num Features",
    "Total CUDA Memory (MB)",
    "Total CUDA Time (ms)",
    "Total CPU Time (ms)",
    "Total CPU Memory (MB)",
]

# Start from an empty table so a stale df_results from an earlier run is never shown.
df_results = pd.DataFrame(columns=result_columns)

# Profiling options for the accelerator.
# NOTE(review): the accelerator is created but not used by the loop below
# (the model is profiled directly); kept in case other cells rely on it.
profile_kwargs = ProfileKwargs(
    activities=["cpu", "cuda"], profile_memory=True, record_shapes=True
)
accelerator = Accelerator(cpu=False, kwargs_handlers=[profile_kwargs])

batch_size = 32
feature_dim = 64  # width of each numerical feature tensor
n_categories = 10  # categorical values are drawn from 0..9

# One dict per (n_features, model) run; turned into a DataFrame once at the end
# instead of concatenating onto the frame on every iteration.
profiling_rows = []

# Sweep the total number of features: 10, 20, ..., 90 (half categorical, half numerical)
for n_features in range(10, 100, 10):
    n_per_type = n_features // 2

    cat_feature_info = {f"cat_feature_{i}": n_categories for i in range(n_per_type)}
    num_feature_info = {f"num_feature_{i}": feature_dim for i in range(n_per_type)}

    # Random inputs, created on the CPU and moved to CUDA
    num_features = [torch.randn(batch_size, feature_dim).cuda() for _ in range(n_per_type)]
    cat_features = [
        torch.randint(low=0, high=n_categories, size=(batch_size, 1)).cuda()
        for _ in range(n_per_type)
    ]

    # Arguments shared by every model under test
    shared_kwargs = dict(
        num_feature_info=num_feature_info,
        cat_feature_info=cat_feature_info,
        numerical_preprocessing="ple",
        n_bins=64,
    )
    models = [
        Mambular(d_model=64, **shared_kwargs).cuda(),
        FTTransformer(d_model=64, n_layers=5, **shared_kwargs).cuda(),
        TabulaRNN(d_model=128, dim_feedforward=256, n_layers=4, **shared_kwargs).cuda(),
        MLP(layer_sizes=[512, 256, 128, 32], **shared_kwargs).cuda(),
        ResNet(layer_sizes=[512, 256, 16], **shared_kwargs).cuda(),
        MambAttention(d_state=172, **shared_kwargs).cuda(),
    ]

    for model in models:
        # NOTE(review): each model is profiled on its very first forward pass
        # (no warm-up) and is never switched to eval() -- confirm both are intended.
        with profile(profile_memory=True, record_shapes=True) as prof:
            with torch.no_grad():
                outputs = model(num_features, cat_features)

        profiling_rows.append(
            {
                "Model": model.__class__.__name__,
                "Num Features": n_features,
                **_profiler_totals(prof),
            }
        )

df_results = pd.DataFrame(profiling_rows, columns=result_columns)

# Display the first profiling results
df_results.head()


# GPU efficiency vs. number of features (10 to 910 in steps of 100). Batch size is reduced to 8 to avoid crashes

In [None]:
import torch
from mambular.base_models.mambular import Mambular
from mambular.base_models.tabtransformer import TabTransformer
from mambular.base_models.ft_transformer import FTTransformer
from mambular.base_models.mlp import MLP
from mambular.base_models.resnet import ResNet
from mambular.base_models.mambattn import MambAttention
from mambular.base_models.tabularnn import TabulaRNN
from accelerate import Accelerator
from accelerate.utils import ProfileKwargs
import pandas as pd
import numpy as np
import re
import warnings
# Parse the string to extract values using regex
import re
warnings.filterwarnings("ignore")


import torch

def _profiler_totals(prof):
    """Summarise one profiler run as the four metric columns of the results table.

    The numeric totals are read straight from
    ``prof.key_averages().total_average()`` rather than regex-parsed from its
    ``repr``. The repr prints times with a magnitude-dependent unit
    (``us``/``ms``/``s``) and memory as a signed integer, so patterns that
    require an ``ms`` suffix or digits only silently fall back to 0.0 for
    every other case.

    Args:
        prof: A finished ``torch.profiler.profile`` context.

    Returns:
        dict mapping column name to float: self CPU / CUDA time converted from
        microseconds to milliseconds, and CPU / CUDA memory converted from
        bytes to MB (1024 ** 2 bytes).
    """
    totals = prof.key_averages().total_average()

    def _read(*names):
        # Newer PyTorch releases use device-generic attribute names while
        # older ones use cuda_* names; take the first one that exists.
        for name in names:
            value = getattr(totals, name, None)
            if value is not None:
                return value
        return 0

    bytes_per_mb = 1024 ** 2
    return {
        "Total CPU Time (ms)": _read("self_cpu_time_total") / 1000.0,
        "Total CUDA Time (ms)": _read("self_device_time_total", "self_cuda_time_total") / 1000.0,
        "Total CPU Memory (MB)": _read("cpu_memory_usage") / bytes_per_mb,
        "Total CUDA Memory (MB)": _read("device_memory_usage", "cuda_memory_usage") / bytes_per_mb,
    }


# Column order of the results table.
result_columns = [
    "Model",
    "Num Features",
    "Total CUDA Memory (MB)",
    "Total CUDA Time (ms)",
    "Total CPU Time (ms)",
    "Total CPU Memory (MB)",
]

# Start from an empty table so a stale df_results from an earlier cell is never shown.
df_results = pd.DataFrame(columns=result_columns)

# Profiling options for the accelerator.
# NOTE(review): the accelerator is created but not used by the loop below
# (the model is profiled directly); kept in case other cells rely on it.
profile_kwargs = ProfileKwargs(
    activities=["cpu", "cuda"], profile_memory=True, record_shapes=True
)
accelerator = Accelerator(cpu=False, kwargs_handlers=[profile_kwargs])

batch_size = 8
feature_dim = 64  # width of each numerical feature tensor
n_categories = 10  # categorical values are drawn from 0..9

# One dict per (n_features, model) run; turned into a DataFrame once at the end
# instead of concatenating onto the frame on every iteration.
profiling_rows = []

# Sweep the total number of features: 10, 110, ..., 910 (half categorical, half numerical)
for n_features in range(10, 1000, 100):
    n_per_type = n_features // 2

    cat_feature_info = {f"cat_feature_{i}": n_categories for i in range(n_per_type)}
    num_feature_info = {f"num_feature_{i}": feature_dim for i in range(n_per_type)}

    # Random inputs, created on the CPU and moved to CUDA
    num_features = [torch.randn(batch_size, feature_dim).cuda() for _ in range(n_per_type)]
    cat_features = [
        torch.randint(low=0, high=n_categories, size=(batch_size, 1)).cuda()
        for _ in range(n_per_type)
    ]

    # Arguments shared by every model under test
    shared_kwargs = dict(
        num_feature_info=num_feature_info,
        cat_feature_info=cat_feature_info,
        numerical_preprocessing="ple",
        n_bins=64,
    )
    models = [
        Mambular(d_model=64, **shared_kwargs).cuda(),
        FTTransformer(d_model=64, n_layers=5, **shared_kwargs).cuda(),
        TabulaRNN(d_model=128, dim_feedforward=256, n_layers=4, **shared_kwargs).cuda(),
        MLP(layer_sizes=[512, 256, 128, 32], **shared_kwargs).cuda(),
        ResNet(layer_sizes=[512, 256, 16], **shared_kwargs).cuda(),
        MambAttention(d_state=172, **shared_kwargs).cuda(),
    ]

    for model in models:
        # NOTE(review): each model is profiled on its very first forward pass
        # (no warm-up) and is never switched to eval() -- confirm both are intended.
        with profile(profile_memory=True, record_shapes=True) as prof:
            with torch.no_grad():
                outputs = model(num_features, cat_features)

        profiling_rows.append(
            {
                "Model": model.__class__.__name__,
                "Num Features": n_features,
                **_profiler_totals(prof),
            }
        )

df_results = pd.DataFrame(profiling_rows, columns=result_columns)

# Display the first profiling results
df_results.head()


# GPU efficiency vs. number of layers (4 to 23) -> Batch size of 32, fixed feature number of 12 to simulate an average tabular dataset

In [None]:
import torch
from mambular.base_models.mambular import Mambular
from mambular.base_models.tabtransformer import TabTransformer
from mambular.base_models.ft_transformer import FTTransformer
from mambular.base_models.mlp import MLP
from mambular.base_models.resnet import ResNet
from mambular.base_models.mambattn import MambAttention
from mambular.base_models.tabularnn import TabulaRNN
from accelerate import Accelerator
from accelerate.utils import ProfileKwargs
import pandas as pd
import numpy as np
import re
import warnings
# Parse the string to extract values using regex
import re
warnings.filterwarnings("ignore")


import torch

def _profiler_totals(prof):
    """Summarise one profiler run as the four metric columns of the results table.

    The numeric totals are read straight from
    ``prof.key_averages().total_average()`` rather than regex-parsed from its
    ``repr``. The repr prints times with a magnitude-dependent unit
    (``us``/``ms``/``s``) and memory as a signed integer, so patterns that
    require an ``ms`` suffix or digits only silently fall back to 0.0 for
    every other case.

    Args:
        prof: A finished ``torch.profiler.profile`` context.

    Returns:
        dict mapping column name to float: self CPU / CUDA time converted from
        microseconds to milliseconds, and CPU / CUDA memory converted from
        bytes to MB (1024 ** 2 bytes).
    """
    totals = prof.key_averages().total_average()

    def _read(*names):
        # Newer PyTorch releases use device-generic attribute names while
        # older ones use cuda_* names; take the first one that exists.
        for name in names:
            value = getattr(totals, name, None)
            if value is not None:
                return value
        return 0

    bytes_per_mb = 1024 ** 2
    return {
        "Total CPU Time (ms)": _read("self_cpu_time_total") / 1000.0,
        "Total CUDA Time (ms)": _read("self_device_time_total", "self_cuda_time_total") / 1000.0,
        "Total CPU Memory (MB)": _read("cpu_memory_usage") / bytes_per_mb,
        "Total CUDA Memory (MB)": _read("device_memory_usage", "cuda_memory_usage") / bytes_per_mb,
    }


# Column order of the results table.
result_columns = [
    "Model",
    "Num Layers",
    "Total CUDA Memory (MB)",
    "Total CUDA Time (ms)",
    "Total CPU Time (ms)",
    "Total CPU Memory (MB)",
]

# Start from an empty table so a stale df_results from an earlier cell is never shown.
df_results = pd.DataFrame(columns=result_columns)

# Profiling options for the accelerator.
# NOTE(review): the accelerator is created but not used by the loop below
# (the model is profiled directly); kept in case other cells rely on it.
profile_kwargs = ProfileKwargs(
    activities=["cpu", "cuda"], profile_memory=True, record_shapes=True
)
accelerator = Accelerator(cpu=False, kwargs_handlers=[profile_kwargs])
n_features = 12

batch_size = 32
feature_dim = 64  # width of each numerical feature tensor
n_categories = 10  # categorical values are drawn from 0..9

# One dict per (n_layers, model) run; turned into a DataFrame once at the end
# instead of concatenating onto the frame on every iteration.
profiling_rows = []

# Sweep the number of layers: 4, 5, ..., 23 (the feature count stays fixed)
for n_layers in range(4, 24):
    n_per_type = n_features // 2

    cat_feature_info = {f"cat_feature_{i}": n_categories for i in range(n_per_type)}
    num_feature_info = {f"num_feature_{i}": feature_dim for i in range(n_per_type)}

    # Random inputs, created on the CPU and moved to CUDA
    num_features = [torch.randn(batch_size, feature_dim).cuda() for _ in range(n_per_type)]
    cat_features = [
        torch.randint(low=0, high=n_categories, size=(batch_size, 1)).cuda()
        for _ in range(n_per_type)
    ]

    # Arguments shared by every model under test
    shared_kwargs = dict(
        num_feature_info=num_feature_info,
        cat_feature_info=cat_feature_info,
        numerical_preprocessing="ple",
        n_bins=64,
        n_layers=n_layers,
    )
    models = [
        Mambular(d_model=64, **shared_kwargs).cuda(),
        FTTransformer(d_model=64, **shared_kwargs).cuda(),
        TabulaRNN(d_model=128, dim_feedforward=256, **shared_kwargs).cuda(),
    ]

    for model in models:
        # NOTE(review): each model is profiled on its very first forward pass
        # (no warm-up) and is never switched to eval() -- confirm both are intended.
        with profile(profile_memory=True, record_shapes=True) as prof:
            with torch.no_grad():
                outputs = model(num_features, cat_features)

        profiling_rows.append(
            {
                "Model": model.__class__.__name__,
                "Num Layers": int(n_layers),
                **_profiler_totals(prof),
            }
        )

df_results = pd.DataFrame(profiling_rows, columns=result_columns)

# Display the first profiling results
df_results.head()
