In [4]:
import numpy as np
import cv2
from pathlib import Path

# Landmark sequence (.npz) to visualise, and the video file the rendering is written to.
FILE_PATH = "../training_dataset/sequences/NSL_Consonant_Multi/S3_NSL_Consonant_Prepared/S3_all_consonant_Phone_Camera/BA_2507_2577.npz"
OUTPUT_VIDEO = "skeleton_check.mp4"

# Canvas size in pixels and the frame rate handed to the video writer.
WIDTH = 800
HEIGHT = 800
FPS = 60

# MediaPipe connection maps: each tuple is a pair of landmark indices that the
# drawing code joins with a line segment.
HAND_CONNECTIONS = [
    # Thumb
    (0, 1), (1, 2), (2, 3), (3, 4),
    # Index finger
    (0, 5), (5, 6), (6, 7), (7, 8),
    # Link 5-9, then the middle finger
    (5, 9), (9, 10), (10, 11), (11, 12),
    # Link 9-13, then the ring finger
    (9, 13), (13, 14), (14, 15), (15, 16),
    # Links 13-17 and 0-17, then the pinky
    (13, 17), (0, 17), (17, 18), (18, 19), (19, 20),
]

# Only the upper-body subset of the pose landmarks is drawn.
POSE_CONNECTIONS = [
    # Link between the two arm chains, then the left arm (11 -> 13 -> 15)
    (11, 12), (11, 13), (13, 15),
    # Right arm (12 -> 14 -> 16)
    (12, 14), (14, 16),
    # Torso
    (11, 23), (12, 24), (23, 24),
]

def _hand_pixels(hand_pts, center, scale):
    """Map one hand's landmarks to integer pixel positions around `center`.

    Each landmark's x/y is multiplied by `scale` and offset by `center`, i.e.
    the landmarks are treated as offsets from the wrist.
    """
    return [
        (int(pt[0] * scale + center[0]), int(pt[1] * scale + center[1]))
        for pt in hand_pts
    ]


def _wrist_anchor(wrist, fallback):
    """Return the pose wrist in canvas pixels, or `fallback` if its x and y are both exactly 0."""
    if np.all(wrist[:2] == 0):
        return fallback
    return (int(wrist[0] * WIDTH), int(wrist[1] * HEIGHT))


def draw_skeleton(data_path, output_path, hand_visual_scale=200):
    """Render the pose and hand landmarks of one .npz sequence to a video file.

    Args:
        data_path: Path to an .npz archive holding the arrays 'pose', 'lh' and
            'rh'. Expected shapes (per the notes in this notebook) are
            (Frames, 33, 4), (Frames, 21, 3) and (Frames, 21, 3).
        output_path: Destination video file; encoded with the 'mp4v' fourcc at
            the module-level FPS and WIDTH x HEIGHT.
        hand_visual_scale: Pixel multiplier applied to the hand landmarks before
            they are attached to the wrist. Pose and hands are on different
            scales, so this may need tuning to match the body size.

    Raises:
        RuntimeError: If the video writer cannot be opened for `output_path`.
    """
    # Load everything up front and close the archive immediately; the arrays
    # returned by indexing stay valid after the file handle is released.
    with np.load(data_path) as data:
        pose = data['pose']  # (Frames, 33, 4)
        lh = data['lh']      # (Frames, 21, 3)
        rh = data['rh']      # (Frames, 21, 3)

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    # str() so that pathlib.Path destinations are accepted too.
    out = cv2.VideoWriter(str(output_path), fourcc, FPS, (WIDTH, HEIGHT))
    if not out.isOpened():
        # Without this check a failed open is silent: write() becomes a no-op
        # and the success message below would be printed for a missing file.
        out.release()
        raise RuntimeError(f"Could not open video writer for: {output_path}")

    print(f"Generating video for {len(pose)} frames...")

    # Fallback hand anchors for frames without a pose wrist (e.g. cropped
    # input): left and right quarter of the canvas, vertically centred.
    left_fallback = (WIDTH // 4, HEIGHT // 2)
    right_fallback = (3 * WIDTH // 4, HEIGHT // 2)

    try:
        for i in range(len(pose)):
            # Black canvas for this frame
            frame = np.zeros((HEIGHT, WIDTH, 3), dtype=np.uint8)

            # 1. DRAW HANDS (wrist-centric)
            # Pose landmarks 15 and 16 are used as the left/right wrist anchors.
            centers = {
                'left': _wrist_anchor(pose[i][15], left_fallback),
                'right': _wrist_anchor(pose[i][16], right_fallback),
            }

            for side, hand_pts, color in (('left', lh[i], (255, 0, 0)),
                                          ('right', rh[i], (0, 0, 255))):
                # An all-zero hand array means no hand was stored for this frame.
                if np.all(hand_pts == 0):
                    continue

                # Compute each landmark's pixel position once and reuse it for
                # both the connecting lines and the joint markers.
                pixels = _hand_pixels(hand_pts, centers[side], hand_visual_scale)

                for start, end in HAND_CONNECTIONS:
                    cv2.line(frame, pixels[start], pixels[end], color, 2)

                for pixel in pixels:
                    cv2.circle(frame, pixel, 3, (255, 255, 255), -1)

            # 2. DRAW POSE (full view)
            # Pose x/y are scaled by the canvas size to fill the frame.
            for start, end in POSE_CONNECTIONS:
                p1_raw = pose[i][start]
                p2_raw = pose[i][end]

                # Only draw a segment when both endpoints have visibility > 0.5
                if p1_raw[3] > 0.5 and p2_raw[3] > 0.5:
                    p1 = (int(p1_raw[0] * WIDTH), int(p1_raw[1] * HEIGHT))
                    p2 = (int(p2_raw[0] * WIDTH), int(p2_raw[1] * HEIGHT))
                    cv2.line(frame, p1, p2, (0, 255, 0), 2)

            # Frame counter overlay
            cv2.putText(frame, f"Frame: {i}", (30, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
            out.write(frame)
    finally:
        # Always finalise the container, even if drawing a frame raised.
        out.release()

    print(f"✅ Skeleton video saved to: {output_path}")

# Render the configured sequence into the configured output video.
draw_skeleton(data_path=FILE_PATH, output_path=OUTPUT_VIDEO)

Generating video for 71 frames...
✅ Skeleton video saved to: skeleton_check.mp4


In [3]:
import numpy as np
import json
from pathlib import Path

def _reconstruct_hand(hand, meta):
    """Undo the wrist-centred normalisation for one hand in one frame.

    Returns None when every value in `hand` is exactly 0 (treated as "no hand
    in this frame"). Otherwise returns one {'x', 'y', 'z'} dict per landmark,
    computed as (normalized * scale) + wrist, with `meta` unpacked as
    [wx, wy, wz, scale].
    """
    if np.all(hand == 0):
        return None
    wx, wy, wz, scale = meta
    return [
        {
            'x': float((lm[0] * scale) + wx),
            'y': float((lm[1] * scale) + wy),
            'z': float((lm[2] * scale) + wz)
        }
        for lm in hand
    ]


def convert_enhanced_npz_to_json(npz_path, output_json_path, video_path_label=None):
    """
    Converts enhanced NPZ files back to JSON.
    Reconstructs original hand coordinates using: (normalized * scale) + wrist

    Args:
        npz_path: Path (str or pathlib.Path) of the .npz archive. It must hold
            'pose', 'lh', 'rh', 'lh_meta' and 'rh_meta'; 'video_info'
            ([fps, width, height]) is optional.
        output_json_path: Destination of the JSON file.
        video_path_label: Value stored under metadata['video_path'];
            "unknown" is written when it is falsy.

    Raises:
        KeyError: If one of the required arrays is missing from the archive.
    """
    # 1. Load the NPZ data. The context manager closes the archive; the arrays
    #    read from it stay valid afterwards.
    with np.load(npz_path) as data:
        pose_array = data['pose']      # (Frames, 33, 4)
        lh_array = data['lh']          # (Frames, 21, 3)
        rh_array = data['rh']          # (Frames, 21, 3)
        lh_meta = data['lh_meta']      # (Frames, 4) -> [wx, wy, wz, scale]
        rh_meta = data['rh_meta']      # (Frames, 4) -> [wx, wy, wz, scale]

        # Frame size comes from video_info when present, else defaults.
        # The stored fps is deliberately ignored (see fps_to_use below).
        if 'video_info' in data:
            _, width, height = data['video_info']
        else:
            width, height = 1920, 1080

    # The user specifically requested 'fps': 60 in the JSON
    fps_to_use = 60
    total_frames = int(pose_array.shape[0])

    # 2. Construct JSON Structure
    output_data = {
        'metadata': {
            'video_path': str(video_path_label) if video_path_label else "unknown",
            'fps': float(fps_to_use),
            'frame_width': int(width),
            'frame_height': int(height),
            'total_frames': total_frames,
            'frame_skip': 1,
            'hands_swapped': True
        },
        'frames': []
    }

    # 3. Process Frames
    for i in range(total_frames):
        frame_data = {
            'frame_number': i + 1,          # 1-based in the JSON
            'timestamp': i / fps_to_use,
            'pose': None,
            'hands': {
                'left': _reconstruct_hand(lh_array[i], lh_meta[i]),
                'right': _reconstruct_hand(rh_array[i], rh_meta[i]),
            },
            'face': None
        }

        # Pose is stored as-is; an all-zero frame means "no pose" and stays None.
        if not np.all(pose_array[i] == 0):
            frame_data['pose'] = [
                {'x': float(lm[0]), 'y': float(lm[1]), 'z': float(lm[2]), 'visibility': float(lm[3])}
                for lm in pose_array[i]
            ]

        output_data['frames'].append(frame_data)

    # 4. Save JSON
    with open(output_json_path, 'w') as f:
        json.dump(output_data, f, indent=2)

    # Plain string paths have no `.name`; fall back to Path() so the success
    # message no longer raises AttributeError after the file was written.
    source_name = npz_path.name if hasattr(npz_path, 'name') else Path(npz_path).name
    print(f"Successfully converted {source_name} to {output_json_path}")

# --- Example usage: convert one sequence and write keypoints.json to the working directory ---
# NOTE(review): the label says "S1/A.MOV" while the input file is BA.npz — confirm the intended source video.
npz_file = Path("../training_dataset/sequences/NSL_CONSONANT_PART_1/S1_NSL_Consonant_Bright/BA.npz")
convert_enhanced_npz_to_json(
    npz_path=npz_file,
    output_json_path="keypoints.json",
    video_path_label="S1/A.MOV",
)

Successfully converted BA.npz to keypoints.json
