In [8]:
import torch
import torchvision
from torchvision.models.detection import keypointrcnn_resnet50_fpn, KeypointRCNN_ResNet50_FPN_Weights

In [9]:
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
import numpy as np
np.random.seed(42)

In [10]:
print(torch.cuda.is_available())

True


In [11]:
model = torchvision.models.detection.keypointrcnn_resnet50_fpn(weights=KeypointRCNN_ResNet50_FPN_Weights.DEFAULT)
model.eval()

KeypointRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(640, 672, 704, 736, 768, 800), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.

In [12]:
data_folder = '/home/oem/Letöltések/Facialexp'
label_path = '/home/oem/Letöltések/Facialexp/labels_processed.csv'

In [13]:
import pandas as pd

labels = pd.read_csv(label_path, index_col='idx', sep=';')
labels.head()

Unnamed: 0_level_0,pth,label,relFCs,nose_x,nose_y,left_eye_x,left_eye_y,right_eye_x,right_eye_y,left_ear_x,...,right_ear_x,right_ear_y,left_sho_x,left_sho_y,right_sho_x,right_sho_y,x1,y1,x2,y2
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,anger/image0000006.jpg,7,0.873142,,,,,,,,...,,,,,,,,,,
1,anger/image0000060.jpg,0,0.852311,,,,,,,,...,,,,,,,,,,
2,anger/image0000061.jpg,0,0.800957,,,,,,,,...,,,,,,,,,,
3,anger/image0000066.jpg,2,0.843079,,,,,,,,...,,,,,,,,,,
4,anger/image0000106.jpg,0,0.849108,,,,,,,,...,,,,,,,,,,


In [14]:
# from https://learnopencv.com/human-pose-estimation-using-keypoint-rcnn-in-pytorch/, rewritten a bit

from torch import Tensor


def filter_keypoints_per_person(all_keypoints, all_scores, confs, bboxes, keypoint_threshold=0.7, conf_threshold=0.75):
    kpts_dict = []
    # iterate for every person detected
    for person_id in range(len(all_keypoints)):
      # check the confidence score of the detected person
      if confs[person_id]>conf_threshold:
        # grab the keypoint-locations for the detected person
        keypoints:Tensor = all_keypoints[person_id, ...]
        # grab the keypoint-scores for the keypoints
        scores: Tensor = all_scores[person_id, ...]
        # iterate for every keypoint-score
        for kp in range(len(scores)):
            # check the confidence score of detected keypoint
            if torch.sigmoid(scores[kp]) < keypoint_threshold:
                # convert the keypoint float-array to a python-list of integers
                keypoints[kp, 2] = 0
        kpts_dict.append({'conf': confs[person_id], 'kpts': keypoints, 'bbox': bboxes[person_id]})
    
    kpts_dict.sort(key=lambda x: x['conf'], reverse=True)
    
    return {'kpts': kpts_dict[0]['kpts'], 'bbox': kpts_dict[0]['bbox']} if len(kpts_dict) > 0 else None

In [15]:
from torch.utils.data import Dataset, DataLoader
import os
from PIL import Image
import torchvision.transforms.functional as TF


class FacialExpressionsDataset(Dataset):

    def __init__(self, csv_file, root_dir, transform=None):
        self.labels = pd.read_csv(csv_file, index_col='idx', sep=';')
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        image_path = os.path.join(self.root_dir, self.labels.iloc[idx, 0])
        image = Image.open(image_path)
        image_tensor = TF.to_tensor(image)
        
        label = self.labels.iloc[idx, 1].astype('int')
        impath = self.labels.iloc[idx, 0]
        
        keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
        keypoints = torch.reshape(keypoints, shape=(7, 2))
        
        bounding_boxes = torch.tensor(data=self.labels.iloc[idx, 17:])

        sample = {'image': image_tensor, 'label': label, 'keypoints': keypoints, 'impath': impath, 'idx': idx, 'bbox': bounding_boxes}

        if self.transform:
            sample = self.transform(sample)

        return sample

In [16]:
dataset = FacialExpressionsDataset(csv_file=label_path, root_dir=data_folder, transform=None)
dataloader = DataLoader(dataset=dataset, batch_size=8, shuffle=False, num_workers=16)

The model output tensor is a 17x3 tensor, where the coordinates are the x, y, and visibility (0 is invisible, 1 is visible). The keypoints are:
- nose
- left eye
- right eye
- left ear
- right ear
- left shoulder
- right shoulder
- left elbow
- right elbow
- left wrist
- right wrist
- left hip
- right hip
- left knee
- right knee
- left ankle
- right ankle

In [17]:
from tqdm import tqdm

model = model.cuda()
keypoint_not_found = 0

with torch.inference_mode():
    for i, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
        images = batch['image'].cuda()
        predictions = model(images)
        for num, prediction in enumerate(predictions):
            pred_kpts: Tensor = prediction['keypoints'] # tensor of shape (N, 17, 3)
            pred_kpts_scores: Tensor = prediction['keypoints_scores']
            pred_scores: Tensor = prediction['scores']
            pred_bboxes: Tensor = prediction['boxes']
            filtered_pred = filter_keypoints_per_person(pred_kpts, pred_kpts_scores, pred_scores, pred_bboxes)
            if filtered_pred is None:
                image_id = batch['impath'][num]
                keypoint_not_found += 1
                #print(f'Filtered kpts is None for image {image_id}')
                continue
            filtered_pred_kpts = filtered_pred['kpts']
            filtered_pred_bbox = filtered_pred['bbox']
            image_id = batch['idx'][num].item()
            row = labels.iloc[image_id]
            for kpt_id in range(7):
                if filtered_pred_kpts[kpt_id][2].item() == 0:
                    continue # keypoint invisible
                labels.iloc[image_id, 3 + 2*kpt_id] = filtered_pred_kpts[kpt_id][0].item() / 96.0 # x
                labels.iloc[image_id, 3 + 2*kpt_id + 1] = filtered_pred_kpts[kpt_id][1].item() / 96.0 # y
            arr = np.array(filtered_pred_bbox.cpu()) / 96.0 # bbox x1, y1, x2, y2
            labels.iloc[image_id, -4:] = arr

print(f"Keypoints not found = {keypoint_not_found}, ratio = {keypoint_not_found / len(dataloader)}")

  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  bounding_boxes = torch.tensor(data=self.labels.iloc[idx, 17:])
  bounding_boxes = torch.tensor(data=self.labels.iloc[idx, 17:])
  bounding_boxes = torch.tensor(data=self.labels.iloc[idx, 17:])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  bounding_boxes = torch.tensor(data=self.labels.iloc[idx, 17:])
  bounding_boxes = torch.tensor(data=self.labels.iloc[idx, 17:])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.

Keypoints not found = 627, ratio = 0.17802385008517888





In [18]:
labels.head()
labels.to_csv(path_or_buf='/home/oem/Letöltések/Facialexp/labels_with_kpts.csv')

In [19]:
dflen = len(labels)
labels_filtered = labels[labels.nose_x.notnull()]
print(f'Dropping a total of {dflen - len(labels_filtered)}')
labels_filtered = labels_filtered[labels_filtered.nose_y.notnull()]
labels_filtered = labels_filtered[labels_filtered.left_eye_x.notnull()]
labels_filtered = labels_filtered[labels_filtered.left_eye_y.notnull()]
labels_filtered = labels_filtered[labels_filtered.right_eye_x.notnull()]
labels_filtered = labels_filtered[labels_filtered.right_eye_y.notnull()]
labels_filtered = labels_filtered[labels_filtered.left_ear_x.notnull()]
labels_filtered = labels_filtered[labels_filtered.left_ear_y.notnull()]
labels_filtered = labels_filtered[labels_filtered.right_ear_x.notnull()]
labels_filtered = labels_filtered[labels_filtered.right_ear_y.notnull()]
print(f'Dropping a total of {dflen - len(labels_filtered)}')

Dropping a total of 696
Dropping a total of 812


In [20]:
labels_filtered.describe()

Unnamed: 0,label,relFCs,nose_x,nose_y,left_eye_x,left_eye_y,right_eye_x,right_eye_y,left_ear_x,left_ear_y,right_ear_x,right_ear_y,left_sho_x,left_sho_y,right_sho_x,right_sho_y,x1,y1,x2,y2
count,27363.0,27363.0,27363.0,27363.0,27363.0,27363.0,27363.0,27363.0,27363.0,27363.0,27363.0,27363.0,7231.0,7231.0,11072.0,11072.0,27363.0,27363.0,27363.0,27363.0
mean,3.612652,0.798932,0.502271,0.575425,0.628848,0.437048,0.378033,0.440502,0.792719,0.533937,0.220019,0.53476,0.848383,0.884613,0.156329,0.896281,0.076231,0.061008,0.925355,0.982712
std,2.342842,0.058639,0.06617,0.063973,0.074071,0.059611,0.07996,0.064732,0.129359,0.10368,0.137101,0.109025,0.134115,0.134917,0.128549,0.122077,0.093367,0.079447,0.093848,0.026382
min,0.0,0.516357,0.000623,0.001655,0.000623,0.000625,0.000624,0.001655,0.000624,0.001973,0.000621,0.001655,0.000623,0.003121,0.000623,0.001872,0.0,0.0,0.091871,0.136066
25%,2.0,0.766139,0.480781,0.554979,0.62134,0.404855,0.351246,0.40743,0.755569,0.475988,0.149587,0.475326,0.810736,0.857546,0.069513,0.876388,0.015373,0.0063,0.898158,0.977322
50%,4.0,0.808588,0.501875,0.58136,0.632963,0.434604,0.369813,0.436637,0.807141,0.535065,0.202727,0.535228,0.884897,0.923363,0.125205,0.935202,0.043029,0.035198,0.955791,0.98652
75%,6.0,0.84175,0.52412,0.604199,0.652838,0.464929,0.38211,0.465447,0.863879,0.58527,0.252899,0.586928,0.929205,0.959375,0.210482,0.958926,0.106067,0.082886,0.984961,0.994555
max,7.0,0.899951,0.999378,0.992028,0.999378,0.978926,0.999377,0.982659,0.999377,0.984907,0.999164,0.996875,0.999377,0.998128,0.996641,0.998128,0.922501,0.864619,1.0,1.0


In [21]:
labels_filtered.to_csv(path_or_buf='/home/oem/Letöltések/Facialexp/labels_with_kpts.csv')