# Compare the keypoints on the original and the upsampled images

For the comparison, we're using torchvision's built in, pre-trained keypoint R-CNN network, as the original datasets (FDF256) keypoints were generated with this model.


**From PyTorch documentation**:
During inference, the model requires only the input tensors, and returns the post-processed predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as follows, where N is the number of detected instances:
- boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with 0 <= x1 < x2 <= W and 0 <= y1 < y2 <= H.
- labels (Int64Tensor[N]): the predicted labels for each instance
- scores (Tensor[N]): the scores or each instance
- keypoints (FloatTensor[N, K, 3]): the locations of the predicted keypoints, in [x, y, v] format.


In [1]:
import torch
import torchvision
from torchvision.models.detection import keypointrcnn_resnet50_fpn

In [2]:
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
import numpy as np
np.random.seed(42)

In [3]:
model = torchvision.models.detection.keypointrcnn_resnet50_fpn(pretrained=True)
model.eval()

KeypointRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(640, 672, 704, 736, 768, 800), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.

In [4]:
print(torch.cuda.is_available())
x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
predictions = model(x)
print(predictions[0]['keypoints'].shape)

True
torch.Size([2, 17, 3])


In [5]:
from torchvision.io import read_image
from torch.utils.data import DataLoader
from fdf256dataset import FDF256Dataset

dataset_path = '/datadrive/FDF/dataset/val'
dataset = FDF256Dataset(dirpath=dataset_path, load_keypoints=True, transform=None, load_impath=True)
dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=22,
                            prefetch_factor=2, persistent_workers=True, pin_memory=True)

Could not load pyspng. Defaulting to pillow image backend.
Dataset loaded from: /datadrive/FDF/dataset/val. Number of samples:6531


In [6]:
predictions_path = '/datadrive/AACS2023/ESPCN/pred_ims'

# Annotations

The annotation tensor is a 7x2 size tensor where the keypoints are:
- nose
- left eye
- right eye
- left ear
- right ear
- left shoulder
- right shoulder

The model output tensor is a 17x3 tensor, where the coordinates are the x, y, and visibility (0 is invisible, 1 is visible). The keypoints are:
- nose
- left eye
- right eye
- left ear
- right ear
- left shoulder
- right shoulder
- left elbow
- right elbow
- left wrist
- right wrist
- left hip
- right hip
- left knee
- right knee
- left ankle
- right ankle


In [7]:
# from https://learnopencv.com/human-pose-estimation-using-keypoint-rcnn-in-pytorch/, rewritten a bit

from torch import Tensor


def filter_keypoints_per_person(all_keypoints, all_scores, confs, keypoint_threshold=2, conf_threshold=0.9):
    # iterate for every person detected
    for person_id in range(len(all_keypoints)):
      # check the confidence score of the detected person
      if confs[person_id]>conf_threshold:
        # grab the keypoint-locations for the detected person
        keypoints:Tensor = all_keypoints[person_id, ...]
        # grab the keypoint-scores for the keypoints
        scores: Tensor = all_scores[person_id, ...]
        # iterate for every keypoint-score
        for kp in range(len(scores)):
            # check the confidence score of detected keypoint
            if scores[kp] < keypoint_threshold:
                # convert the keypoint float-array to a python-list of integers
                keypoints[kp, 2] = 0
        return keypoints # return the first person with enough confidence
    
    return None

In [8]:
import latexify
import math

@latexify.with_latex
def DE(x, y, xhat, yhat, w) -> float:
    return math.sqrt((x - xhat)**2 + (y - yhat)**2) / w

DE

<latexify.frontend.LatexifiedFunction at 0x7f83e4927580>

In [9]:
def RMSE(truth, pred):
    sum = 0.0
    N = 0
    for i in range(0, 7):
        x, y, v = truth[i]
        x_hat, y_hat, v_hat = pred[i]
        if v == 0 or v_hat == 0:
            continue
        sum += ((x - x_hat)**2 + (y - y_hat)**2)
        N += 1
    
    if N == 0:
        return None
    
    return math.sqrt(sum / float(N))

In [10]:
import numpy as np
kpt_oks_sigmas = np.array([.26, .25, .25, .35, .35, .79, .79])/10.0 * 256# from https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py#L523


def OKS(y_true, y_pred, visibility, bbox_area: float):
    SCALE = bbox_area / float(256**2)
    # Compute the L2/Euclidean Distance
    distances = np.linalg.norm(y_pred - y_true, axis=-1)
    # Compute the exponential part of the equation
    exp_vector = np.exp(-(distances**2) / (2 * (SCALE**2) * (kpt_oks_sigmas**2)))
    # The numerator expression
    numerator = np.dot(exp_vector, visibility.astype(bool).astype(int))
    # The denominator expression
    denominator = np.sum(visibility.astype(bool).astype(int))
    return numerator / denominator

In [11]:
experiment_name = "ESPCN_kpts_compare"

In [12]:
from tqdm import tqdm
import os
from PIL import Image
from einops import rearrange
import matplotlib.pyplot as plt
import logging

logging.basicConfig(filename=experiment_name,
                    filemode='a',
                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
                    datefmt='%H:%M:%S',
                    level=logging.DEBUG)

logging.info("Running experiment for ESPCN and grounf truth generated image comaprison")

logger = logging.getLogger('ESPCN')

sum_rmse = 0.0
sum_de = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
sum_oks = 0.0

num_rmse = 0
num_de = 0
num_oks = 0

model = model.to('cuda')

for i in tqdm(range(len(dataset))):
        
    original_image = dataset[i]['img'] # tensor of shape (256, 256, 3)
    original_kpts = dataset[i]['keypoints'] # tensor of shape (7, 2)
    original_kpts = original_kpts * 256 # rescale with image size
    # Append the tensor with a visibility flag
    visibility = np.where(np.all((original_kpts >= 0) & (original_kpts <= 256), axis=1), 1.0, 0.0)
    original_kpts = np.concatenate((original_kpts, visibility.reshape(-1, 1)), axis=1)
    
    face_bbox = dataset[i]["face_bbox"]
    x0, y0, x1, y1 = face_bbox
    face_width: float = (x1 - x0).item()
    face_height: float = (y1 - y0).item()
    
    imname = str(dataset[i]['impath']).split('/')[-1]
    logger.info("Evaluating image " + imname)
    #logger.info("Original kpts = " + str(original_kpts))
    
    scaled_image_path = os.path.join(predictions_path, imname)
    with Image.open(scaled_image_path) as fp:
        image = np.array(fp, dtype=np.float32)
        image = image / 256.0
        image = rearrange(image, "h w c -> c h w")
        image = image[0:3, :, :] # remove the alpha channel, if exists
        image = torch.from_numpy(image).to('cuda')
    if image is None:
        continue
    
    predictions = model([image])
    pred_kpts: Tensor = predictions[0]['keypoints'] # tensor of shape (N, 17, 3)
    pred_kpts_scores: Tensor = predictions[0]['keypoints_scores']
    pred_scores: Tensor = predictions[0]['scores']
    filtered_pred_kpts = filter_keypoints_per_person(pred_kpts, pred_kpts_scores, pred_scores)
    if filtered_pred_kpts is not None:
        filtered_pred_kpts = filtered_pred_kpts[0:7] # shape (7, 3)
    else:
        continue
    #logger.info("Predicted kpts = " + str(filtered_pred_kpts))
    
    
    # Compare `filtered_pred_kpts` and `original_kpts`
    pred = filtered_pred_kpts.cpu().detach().numpy()
    
    # Detection error
    for i in range(0, 7):
        x, y, v = original_kpts[i]
        x_hat, y_hat, v_hat = pred[i]
        if v == 1 and v_hat == 1:
            de_i: float = DE(x, y, x_hat, y_hat, face_width)
            logger.info(f"DE for kpt {i} = {de_i}")
            sum_de[i] += de_i
            num_de += 1
        else:
            logger.warning("No DE for kpt " + str(i))    
    
    # RMSE
    rmse = RMSE(original_kpts, pred)
    logger.info("RMSE = " + str(rmse))
    if rmse is not None:
        sum_rmse += rmse
        num_rmse += 1
    
    # OKS
    oks: float = OKS(original_kpts[:, 0:1], pred[:, 0:1], original_kpts[:, 2].astype(int), (face_width * face_height))
    logger.info(f'OKS = {oks}')
    sum_oks += oks
    num_oks += 1
    
    
# Compute averages
avg_rmse = sum_rmse / num_rmse
avg_de = [de_kpt / num_de for de_kpt in sum_de]
avg_oks = sum_oks / num_oks

logger.info("--------EVALUATION COMPLETE--------")
logger.info(f'Average RMSE: {avg_rmse}')
logger.info(f'Average DEs: {avg_de}')
logger.info(f'Average OKS: {avg_oks}')

print("--------EVALUATION COMPLETE--------")
print(f'Average RMSE: {avg_rmse}')
print(f'Average DEs: {avg_de}')
print(f'Average OKS: {avg_oks}')


100%|██████████| 6531/6531 [08:27<00:00, 12.88it/s]

--------EVALUATION COMPLETE--------
Average RMSE: 23.67376179022041
Average DEs: [0.01654677528351126, 0.01646996469282185, 0.017457898626482077, 0.03558359172498275, 0.033763407241127134, 0.004134494685056277, 0.0055410073594415145]
Average OKS: 0.541443580829174



