---
title: "Forays into Vision Models"
author: "Ali Zaidi"
date: "2025-12-14"
categories: [Deep Learning]
description: "Now that we've explored the traditional machine learning route, lets see how a more robust vision based approach might work"
format:
  html:
    code-fold: true
jupyter: python3
---

### Going to try to use PoseC3D
<https://arxiv.org/pdf/2104.13586>

Which relies on a 3D heatmap volume instead of a graph sequence as the base representation of human skeletons. Compared to GCN-based methods, PoseConv3D is more effective in learning spatiotemporal features, more robust against pose estimation noises, and generalizes better in cross-dataset settings.

In [11]:
#| include: false
from fastai.vision.all import *
from eagle_swing.data_class import *
from eagle_swing.video_utils import *
from eagle_swing.find_landmarks import *
from eagle_swing.animate import *
import matplotlib.animation as animation

In [3]:
base_path = '../../../data/full_videos/ymirza'
#day_path = f"{base_path}/sep14"
#cleaned_df_paths = [file for file in get_files(day_path, extensions='.csv') if file.name == 'clean_lbls.csv']
cleaned_df_paths = [file for file in get_files(base_path, extensions='.csv') if file.name == 'clean_lbls.csv']
df_holder = []
for df_path in cleaned_df_paths:
    df_holder.append(pd.read_csv(df_path))
cleaned_df = pd.concat(df_holder).reset_index(drop=True)
#cleaned_df['swing_day'] = cleaned_df.pkl_path.map(lambda x: x.split('/')[0])
cleaned_df['pkl_path'] = cleaned_df.pkl_path.map(lambda x: f'{base_path}/{x}')
before_increment = 60
after_increment = 60
cleaned_df['start_idx'] = cleaned_df['first_higher_wrists_backswing_frame'] - before_increment
cleaned_df['end_idx'] = cleaned_df['first_higher_wrists_backswing_frame'] + after_increment
df5 = cleaned_df[cleaned_df.score.map(lambda x: x == 5)]
df1 = cleaned_df[cleaned_df.score.map(lambda x: x == 1)]
test_df = pd.concat([df5.iloc[:3], df1.iloc[:3]]).reset_index(drop=True)
rand_idxs = np.random.randint(0, len(cleaned_df), 6)
test_df = cleaned_df.iloc[rand_idxs]

SwExt_list = []
for idx, row in test_df.iterrows():
    SwExt_list.append(SwingExtractor(row))
clip_names = [SwExt_list[x].clip_name for x in range(len(SwExt_list))]
print(clip_names)

['IMG_1272_swing_5_score_2', 'IMG_1019_swing_6_score_5', 'IMG_1183_swing_3_score_1', 'IMG_1274_swing_1_score_2', 'IMG_1273_swing_10_score_5', 'IMG_1015_swing_4_score_2']


In [18]:
frames, _ = get_frames(f"{test_df.iloc[0].pkl_path[:-3]}mp4", resize_dim=None)
frames.shape

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 10/10 [00:00<00:00, 155.38it/s]


(10, 1920, 1080, 3)

In [7]:
kps_holder = []
for idx, row in test_df.iloc[:4].iterrows():
    kps_holder.append(KpExtractor(row.pkl_path).kps[row.start_idx:row.end_idx])
len(kps_holder), [x.shape for x in kps_holder]

(4, [(120, 17, 3), (120, 17, 3), (120, 17, 3), (120, 17, 3)])

In [19]:
debug_heatmap_values(kps_holder[0], image_shape=(1920, 1080))

--- KEYPOINT DIAGNOSTIC REPORT ---
Input Shape: (120, 17, 3)
Original X Range: [375.88, 653.82]
Original Y Range: [751.39, 1423.40]
Confidence Score Range: [0.36, 1.10]

Scaled X Range: [19.49, 33.90] (Target: [0, 56])
Scaled Y Range: [21.92, 41.52] (Target: [0, 56])
Out of Bounds Points: 0 / 2040
Detected PIXEL keypoints. Scaled down to heatmap size.

Max Value in a Sample Heatmap: 0.921026

--- ANALYSIS ---
ðŸŸ¢ INFO: Data looks fine. The issue might be in the animation `vmax` parameter.


In [20]:
heatmap = generate_heatmap_robust(kps_holder[0], (1920, 1080))
heatmap.shape

Detected PIXEL keypoints. Scaled down to heatmap size.


(120, 17, 56, 56)

In [21]:
show_heatmap_animation(heatmap)

In [5]:
def show_heatmap_animation(heatmap_volume):
    T, K, H, W = heatmap_volume.shape
    
    # Collapse Keypoints: (T, K, H, W) -> (T, H, W)
    collapsed_video = np.max(heatmap_volume, axis=1)
    
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.axis('off')
    
    # Create the initial image object
    # vmin/vmax ensure the colors don't flicker as intensities change
    img = ax.imshow(collapsed_video[0], cmap='inferno', vmin=0, vmax=1)
    title = ax.set_title(f"Frame 0/{T}")
    
    def update(frame):
        img.set_array(collapsed_video[frame])
        title.set_text(f"Frame {frame}/{T}")
        return img, title
    
    ani = animation.FuncAnimation(
        fig, 
        update, 
        frames=T, 
        interval=50, 
        blit=True
    )
    
    # CRITICAL STEP: Convert to HTML5 video to display in notebook
    plt.close() # Prevents the static "Frame 0" ghost image
    return HTML(ani.to_jshtml())


def generate_heatmap_robust(keypoints, image_shape=(1080, 1920), output_shape=(56, 56), sigma=1.0):
    """
    Robustly generates PoseC3D heatmaps by auto-detecting keypoint scale.
    
    Args:
        keypoints (np.ndarray): Shape (T, K, C). 
                                If max value <= 1.0, assumes normalized (0-1).
                                If max value > 1.0, assumes pixel coordinates.
        image_shape (tuple): (H, W) original resolution (only used if keypoints are pixels).
        output_shape (tuple): (h, w) target heatmap size (default 56x56).
    """
    T, K, C = keypoints.shape
    h, w = output_shape
    H, W = image_shape
    
    xs = keypoints[:, :, 0]
    ys = keypoints[:, :, 1]
    
    # --- AUTO-DETECTION LOGIC ---
    max_val = np.max(keypoints[:, :, :2])
    
    if max_val <= 1.0:
        # CASE A: Normalized Coordinates (0.0 - 1.0)
        # Simply multiply by heatmap dimensions
        xs = xs * w
        ys = ys * h
        print("Detected NORMALIZED keypoints (0-1). Scaled to heatmap size.")
        
    elif max_val <= max(h, w) * 1.5:
         # CASE B: Already Heatmap-Scaled (e.g., 0-56)
        # Do nothing
        print("Detected HEATMAP-SCALED keypoints. No scaling applied.")
        
    else:
        # CASE C: Original Pixel Coordinates (e.g., 0-1920)
        # Scale down: (x / W) * w
        xs = xs * (w / W)
        ys = ys * (h / H)
        print("Detected PIXEL keypoints. Scaled down to heatmap size.")
    
    # Use confidence if available
    scores = keypoints[:, :, 2] if C > 2 else np.ones((T, K))

    # --- GENERATE HEATMAPS ---
    # Create grid (h, w)
    xx, yy = np.meshgrid(np.arange(w), np.arange(h)) # (h, w)
    
    # Reshape for broadcasting
    # Grid: (1, 1, h, w)
    xx = xx[None, None, :, :]
    yy = yy[None, None, :, :]
    
    # Keypoints: (T, K, 1, 1)
    xs = xs[:, :, None, None]
    ys = ys[:, :, None, None]
    scores = scores[:, :, None, None]
    
    # Gaussian distance
    d2 = (xx - xs)**2 + (yy - ys)**2
    heatmap = np.exp(-d2 / (2.0 * sigma**2))
    
    # Apply scores
    heatmap = heatmap * scores
    
    return heatmap.astype(np.float32)


In [6]:
def debug_heatmap_values(keypoints, image_shape=(1080, 1920)):
    """
    Prints a diagnostic report of the keypoint data to find issues.
    """
    if keypoints is None or keypoints.size == 0:
        print("Error: keypoints array is empty!")
        return

    # --- 1. Check Input Data ---
    T, K, C = keypoints.shape
    h, w = 56, 56  # Standard PoseC3D heatmap size
    H, W = image_shape
    
    xs = keypoints[:, :, 0]
    ys = keypoints[:, :, 1]
    scores = keypoints[:, :, 2] if C > 2 else np.ones((T, K))

    print("--- KEYPOINT DIAGNOSTIC REPORT ---")
    print(f"Input Shape: {keypoints.shape}")
    print(f"Original X Range: [{xs.min():.2f}, {xs.max():.2f}]")
    print(f"Original Y Range: [{ys.min():.2f}, {ys.max():.2f}]")
    print(f"Confidence Score Range: [{scores.min():.2f}, {scores.max():.2f}]")
    
    # --- 2. Check Scaling ---
    if xs.max() > 1.0: # Pixel mode
        xs_scaled = xs * (w / W)
        ys_scaled = ys * (h / H)
    else: # Normalized mode
        xs_scaled = xs * w
        ys_scaled = ys * h
    
    print(f"\nScaled X Range: [{xs_scaled.min():.2f}, {xs_scaled.max():.2f}] (Target: [0, {w}])")
    print(f"Scaled Y Range: [{ys_scaled.min():.2f}, {ys_scaled.max():.2f}] (Target: [0, {h}])")
    
    # --- 3. Check for Out-of-Bounds Points ---
    oob_x = np.sum((xs_scaled < 0) | (xs_scaled >= w))
    oob_y = np.sum((ys_scaled < 0) | (ys_scaled >= h))
    total_pts = xs.size
    print(f"Out of Bounds Points: {max(oob_x, oob_y)} / {total_pts}")

    # --- 4. Generate & Check a SINGLE Heatmap ---
    # Use the robust generator
    single_heatmap = generate_heatmap_robust(keypoints[T//2:T//2+1]) # Middle frame
    print(f"\nMax Value in a Sample Heatmap: {single_heatmap.max():.6f}")
    
    print("\n--- ANALYSIS ---")
    if scores.max() < 0.01:
        print("ðŸ”´ CRITICAL: Confidence scores are all near zero. This is the most likely cause.")
    if max(oob_x, oob_y) > 0:
        print("ðŸ”´ CRITICAL: Keypoints are out of bounds. Check your `image_shape` parameter.")
    if single_heatmap.max() < 1e-5:
        print("ðŸŸ¡ WARNING: Max heatmap value is tiny. The plot will look black. Check sigma or scores.")
    if scores.max() > 0.01 and max(oob_x, oob_y) == 0:
        print("ðŸŸ¢ INFO: Data looks fine. The issue might be in the animation `vmax` parameter.")
