## Get Pretrained model

In [None]:
import argparse
import torch
import torch.backends.cudnn as cudnn
from main.config import Config
import os.path as osp
import datetime
from pathlib import Path
from main.base import Tester
from human_models.human_models import SMPL, SMPLX
from tqdm import tqdm

In [2]:
# Turn on cuDNN benchmark mode before any model is built.
cudnn.benchmark = True

# Locations used by this run: the output root is an absolute path, while the
# config and checkpoint paths are relative (resolved against the working dir).
root_dir = "D:/Msc_project/SMPLest-X_ToMe/"
config_path = "pretrained_models/smplest_x_h/config_base.py"
checkpoint_path = "pretrained_models/smplest_x_h/smplest_x_h.pth.tar"

# Timestamped experiment name so that every run gets a distinct name.
run_started_at = datetime.datetime.now()
time_str = run_started_at.strftime('%Y%m%d_%H%M%S')
exp_name = f'test_EHF_{time_str}'

# Load the base configuration stored in the same folder as the checkpoint.
cfg = Config.load_config(config_path)

In [3]:
# Evaluation settings for this run.
testset = "EHF"
use_cache = False
test_batch_size = 16

# Everything the run writes goes under <root_dir>/outputs/<exp_name>.
output_dir = osp.join(root_dir, 'outputs', exp_name)
log_section = {'exp_name': exp_name, 'output_dir': output_dir}
for cfg_key, subdir in (('model_dir', 'model_dump'),
                        ('log_dir', 'log'),
                        ('result_dir', 'result')):
    log_section[cfg_key] = osp.join(output_dir, subdir)

# Overrides applied on top of the loaded base config.
new_config = dict(
    data=dict(testset=str(testset), use_cache=use_cache),
    test=dict(test_batch_size=int(test_batch_size)),
    model=dict(pretrained_model_path=checkpoint_path),
    log=log_section,
)

# Apply the overrides, then let the config object set up logging and save
# itself (the saved location is printed by dump_config, see the cell output).
cfg.update_config(new_config)
cfg.prepare_log()
cfg.dump_config()

Config has been saved to D:/Msc_project/SMPLest-X_ToMe/outputs\test_EHF_20250811_134535/config.py


In [None]:
# SMPL-X human model, built from the path stored in the config; it is passed
# to the tester's evaluation routine later in the notebook.
smpl_x = SMPLX(cfg.model.human_model_path)

# Create the tester and log what is about to be evaluated.
tester = Tester(cfg)
log_info = tester.logger.info
log_info(f"Using 1 GPU with bs={cfg.test.test_batch_size} per GPU.")
log_info(f'Testing [{checkpoint_path}] on datasets [{cfg.data.testset}]')

# Build the data pipeline first, then the model (same order as before).
# NOTE(review): judging by their names these populate `tester.batch_generator`
# and `tester.model`, which later cells read - confirm in main.base.Tester.
for build_step in (tester._make_batch_generator, tester._make_model):
    build_step()

In [None]:
import tmu

# Patch the encoder with tmu. The return value is not used, so the patch is
# expected to modify the encoder object itself - TODO confirm in tmu.patch.
tmu.patch.timm(tester.model.module.encoder, trace_source=True)

# Look the encoder up again after patching and set its `r` attribute.
patched_encoder = tester.model.module.encoder
patched_encoder.r = 4
print(f"Applied TMU Unmerge. r is: {patched_encoder.r}")

# Last expression of the cell: display the model structure.
tester.model

## Check the bottleneck

In [None]:
# Single forward pass of the encoder alone on random data, to check that it
# runs on the GPU before any timing is attempted.
device = torch.device("cuda")

# presumably (batch, channels, height, width) - TODO confirm against the config
dummy_shape = (16, 3, 256, 192)
dummy_input = torch.randn(*dummy_shape).to(device)

encoder_under_test = tester.model.module.encoder
with torch.no_grad():
    _ = encoder_under_test(dummy_input)


In [None]:
# Time the encoder and the decoder separately with CUDA events.
# The same pair of events is reused for both measurements; this is safe
# because each measurement is read only after a full device synchronize.
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)

device = torch.device("cuda")
dummy_input = torch.randn(16, 3, 256, 192).to(device)

# Warm up BOTH stages, not just the encoder. `cudnn.benchmark = True` is set
# at the top of the notebook, so the first call of a module on a new input
# configuration can include one-time setup cost; without a decoder warm-up the
# decoder figure below would be measured on its very first call.
print("Warming up the GPU...")
with torch.no_grad():
    for _ in range(3):
        warm_feat, warm_tokens = tester.model.module.encoder(dummy_input)
        _ = tester.model.module.decoder(warm_feat, warm_tokens)
# Drop the warm-up activations so they do not occupy GPU memory while timing.
del warm_feat, warm_tokens

# Perform the timing
print("\n--- Profiling Model Components ---")
torch.cuda.synchronize(device) # Ensure all previous work is done

# --- Time the Encoder ---
with torch.no_grad():
    start_event.record()
    img_feat, task_tokens = tester.model.module.encoder(dummy_input)
    end_event.record()

torch.cuda.synchronize(device) # IMPORTANT: Wait for the operation to finish
encoder_time_ms = start_event.elapsed_time(end_event)
print(f"Encoder Time: {encoder_time_ms:.3f} ms")

# --- Time the Decoder ---
# The decoder consumes the encoder outputs produced by the timed call above.
with torch.no_grad():
    start_event.record()
    output = tester.model.module.decoder(img_feat, task_tokens)
    end_event.record()

torch.cuda.synchronize(device) # IMPORTANT: Wait for the operation to finish
decoder_time_ms = start_event.elapsed_time(end_event)
print(f"Decoder Time: {decoder_time_ms:.3f} ms")

# `overhead` is the sum of the two measured stages (name kept for any later
# cell that may read it).
overhead = encoder_time_ms + decoder_time_ms
print(f"Total Measured Time: {overhead:.3f} ms")


In [None]:
import torch
import torch.profiler
from tqdm import tqdm


def _batch_to_device(batch, device):
    """Return a new dict with every value of `batch` moved to `device`.

    Every value must provide a `.to(device)` method (as the tensors in the
    `inputs` / `targets` dicts yielded by the batch generator do).
    """
    return {key: value.to(device) for key, value in batch.items()}


# Check for CUDA and set the device
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("CUDA not found. Running on CPU.")

# Keep the profiler configuration consistent with the device picked above, so
# the CPU fallback does not request CUDA tracing or sort by a CUDA column.
use_cuda = device.type == "cuda"
profiler_activities = [torch.profiler.ProfilerActivity.CPU]
if use_cuda:
    profiler_activities.append(torch.profiler.ProfilerActivity.CUDA)
sort_key = "cuda_time_total" if use_cuda else "cpu_time_total"
time_label = "CUDA" if use_cuda else "CPU"

# Move the model to the selected device before profiling
tester.model.to(device)

# Run a few batches without profiling to let caches and GPU resources warm up.
warmup_batches = 3
batch_generator = iter(tester.batch_generator)
print("Warming up GPU...")
for _ in range(warmup_batches):
    # Only the fetch is guarded: a StopIteration here means the loader ran dry.
    try:
        inputs, targets, meta_info = next(batch_generator)
    except StopIteration:
        print("Warning: Not enough batches for a full warmup.")
        # Re-create the generator if it's exhausted
        batch_generator = iter(tester.batch_generator)
        break
    # Move warmup data to the device as well to make it a realistic warmup
    inputs = _batch_to_device(inputs, device)
    targets = _batch_to_device(targets, device)
    with torch.no_grad():
        _ = tester.model(inputs, targets, meta_info, 'test')
print("Warmup complete.")

# Get the first (or next available) batch of data for profiling
try:
    inputs, targets, meta_info = next(batch_generator)
except StopIteration:
    print("Profiler did not run because the batch generator is empty.")
else:
    # Move the actual batch to the GPU
    print("Moving data to GPU for profiling...")
    inputs = _batch_to_device(inputs, device)
    targets = _batch_to_device(targets, device)

    # Make sure the warm-up work has finished before the profiled region starts.
    if use_cuda:
        torch.cuda.synchronize(device)

    # Profile the model's execution on this single batch
    print("Starting profiler...")
    with torch.profiler.profile(
        activities=profiler_activities,
        with_stack=True,
        record_shapes=True
    ) as prof:
        with torch.no_grad():
            with torch.profiler.record_function("model_inference"):
                model_out = tester.model(inputs, targets, meta_info, 'test')
    print("Profiling complete.")

    # Sort by total time on the device that actually ran the model
    print(f"\n--- Profiler Results (sorted by {time_label} time) ---")
    print(prof.key_averages().table(sort_by=sort_key, row_limit=20))

## Evaluate

In [11]:
# Run the whole test set through the model and accumulate the per-batch
# evaluation results returned by the tester.
eval_result = {}
cur_sample_idx = 0
for itr, (inputs, targets, meta_info) in enumerate(tqdm(tester.batch_generator)):
    with torch.no_grad():
        model_out = tester.model(inputs, targets, meta_info, 'test')

    batch_size = model_out['img'].shape[0]

    # Tensors are copied to host memory as numpy arrays; lists pass through
    # unchanged; any other value type is rejected.
    host_out = {}
    for name, value in model_out.items():
        if isinstance(value, list):
            host_out[name] = value
            continue
        if not isinstance(value, torch.Tensor):
            raise ValueError('Undefined type in out. Key: {}; Type: {}.'.format(name, type(value)))
        host_out[name] = value.cpu().numpy()

    # Regroup from "one dict of batched values" to "one dict per sample".
    out = []
    for sample_id in range(batch_size):
        out.append({name: value[sample_id] for name, value in host_out.items()})

    # Evaluate this batch; `cur_sample_idx` is the running offset of its first
    # sample within the test set.
    cur_eval_result = tester._evaluate(out, cur_sample_idx, smpl_x)
    for metric, values in cur_eval_result.items():
        if metric not in eval_result:
            eval_result[metric] = values
        else:
            eval_result[metric] += values
    cur_sample_idx += len(out)

tester._print_eval_result(eval_result)

  0%|          | 0/7 [00:00<?, ?it/s]

Number of tokens left: 64
Number of image tokens: 192


 14%|█▍        | 1/7 [00:03<00:21,  3.66s/it]

Number of tokens left: 64
Number of image tokens: 192


 29%|██▊       | 2/7 [00:04<00:10,  2.16s/it]

Number of tokens left: 64
Number of image tokens: 192


 43%|████▎     | 3/7 [00:06<00:07,  1.78s/it]

Number of tokens left: 64
Number of image tokens: 192


 57%|█████▋    | 4/7 [00:07<00:04,  1.60s/it]

Number of tokens left: 64
Number of image tokens: 192


 71%|███████▏  | 5/7 [00:08<00:03,  1.51s/it]

Number of tokens left: 64
Number of image tokens: 192


100%|██████████| 7/7 [00:10<00:00,  1.03s/it]

Number of tokens left: 64
Number of image tokens: 192


100%|██████████| 7/7 [00:10<00:00,  1.50s/it]

PA MPVPE (All): 39.00 mm
PA MPVPE (L-Hands): 12.80 mm
PA MPVPE (R-Hands): 12.40 mm
PA MPVPE (Hands): 12.60 mm
PA MPVPE (Face): 4.95 mm

MPVPE (All): 74.57 mm
MPVPE (L-Hands): 43.61 mm
MPVPE (R-Hands): 43.96 mm
MPVPE (Hands): 43.78 mm
MPVPE (Face): 22.30 mm

PA MPJPE (Body): 43.05 mm
PA MPJPE (L-Hands): 12.90 mm
PA MPJPE (R-Hands): 12.45 mm
PA MPJPE (Hands): 12.67 mm

38.998660774249316,12.80182325512417,12.39556192831006,12.598692591717116,4.949148052468873,74.56845053224033,43.61167939317517,43.95607033350364,43.783874863339385,22.304173716348984




