In [1]:
import torch
import torchvision
from torchvision.models.detection import keypointrcnn_resnet50_fpn, KeypointRCNN_ResNet50_FPN_Weights

In [2]:
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
import numpy as np
np.random.seed(42)

In [3]:
print(torch.cuda.is_available())

True


In [4]:
model = torchvision.models.detection.keypointrcnn_resnet50_fpn(weights=KeypointRCNN_ResNet50_FPN_Weights.DEFAULT)
model.eval()

KeypointRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(640, 672, 704, 736, 768, 800), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.

In [5]:
data_folder = '/home/oem/Letöltések/Facialexp'
label_path = '/home/oem/Letöltések/Facialexp/labels_processed.csv'

In [6]:
import pandas as pd

labels = pd.read_csv(label_path, index_col='idx', sep=';')
labels.head()

Unnamed: 0_level_0,pth,label,relFCs,nose_x,nose_y,left_eye_x,left_eye_y,right_eye_x,right_eye_y,left_ear_x,...,right_ear_x,right_ear_y,left_sho_x,left_sho_y,right_sho_x,right_sho_y,x1,y1,x2,y2
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,anger/image0000006.jpg,7,0.873142,,,,,,,,...,,,,,,,,,,
1,anger/image0000060.jpg,0,0.852311,,,,,,,,...,,,,,,,,,,
2,anger/image0000061.jpg,0,0.800957,,,,,,,,...,,,,,,,,,,
3,anger/image0000066.jpg,2,0.843079,,,,,,,,...,,,,,,,,,,
4,anger/image0000106.jpg,0,0.849108,,,,,,,,...,,,,,,,,,,


In [7]:
# from https://learnopencv.com/human-pose-estimation-using-keypoint-rcnn-in-pytorch/, rewritten a bit

from torch import Tensor


def filter_keypoints_per_person(all_keypoints, all_scores, confs, bboxes, keypoint_threshold=0.95, conf_threshold=0.95):
    kpts_dict = []
    # iterate for every person detected
    
    for person_id in range(len(all_keypoints)):
      # check the confidence score of the detected person
      if confs[person_id]>conf_threshold:
        # grab the keypoint-locations for the detected person
        keypoints:Tensor = all_keypoints[person_id, ...]
        # grab the keypoint-scores for the keypoints
        scores: Tensor = all_scores[person_id, ...]
        # iterate for every keypoint-score
        for kp in range(len(scores)):
            # check the confidence score of detected keypoint
            if scores[kp] < keypoint_threshold:
                # convert the keypoint float-array to a python-list of integers
                keypoints[kp, 2] = 0
        kpts_dict.append({'conf': confs[person_id], 'kpts': keypoints, 'bbox': bboxes[person_id]})
    
    kpts_dict.sort(key=lambda x: x['conf'], reverse=True)
    
    return {'kpts': kpts_dict[0]['kpts'], 'bbox': kpts_dict[0]['bbox']} if len(kpts_dict) > 0 else None

In [8]:
from torch.utils.data import Dataset, DataLoader
import os
from PIL import Image
import torchvision.transforms.functional as TF


class FacialExpressionsDataset(Dataset):

    def __init__(self, csv_file, root_dir, transform=None):
        self.labels = pd.read_csv(csv_file, index_col='idx', sep=';')
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        image_path = os.path.join(self.root_dir, self.labels.iloc[idx, 0])
        image = Image.open(image_path)
        image_tensor = TF.to_tensor(image)
        
        label = self.labels.iloc[idx, 1].astype('int')
        impath = self.labels.iloc[idx, 0]
        
        keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
        keypoints = torch.reshape(keypoints, shape=(7, 2))
        
        bounding_boxes = torch.tensor(data=self.labels.iloc[idx, 17:])

        sample = {'image': image_tensor, 'label': label, 'keypoints': keypoints, 'impath': impath, 'idx': idx, 'bbox': bounding_boxes}

        if self.transform:
            sample = self.transform(sample)

        return sample

In [9]:
dataset = FacialExpressionsDataset(csv_file=label_path, root_dir=data_folder, transform=None)
dataloader = DataLoader(dataset=dataset, batch_size=8, shuffle=False, num_workers=16)

The model output tensor is a 17x3 tensor, where the coordinates are the x, y, and visibility (0 is invisible, 1 is visible). The keypoints are:
- nose
- left eye
- right eye
- left ear
- right ear
- left shoulder
- right shoulder
- left elbow
- right elbow
- left wrist
- right wrist
- left hip
- right hip
- left knee
- right knee
- left ankle
- right ankle

In [10]:
# DEBUG, DELETE
model = model.cuda()
with torch.inference_mode():
    for i, batch in enumerate(dataloader):
            images = batch['image'].cuda()
            predictions = model(images)
            for num, prediction in enumerate(predictions):
                pred_kpts: Tensor = prediction['keypoints'] # tensor of shape (N, 17, 3)
                pred_kpts_scores: Tensor = prediction['keypoints_scores']
                print(pred_kpts_scores)
                pred_scores: Tensor = prediction['scores']
                print(pred_scores)
                pred_bboxes: Tensor = prediction['boxes']
                filtered_pred = filter_keypoints_per_person(pred_kpts, pred_kpts_scores, pred_scores, pred_bboxes)
                if filtered_pred is None:
                    continue
                filtered_pred_kpts = filtered_pred['kpts']
                filtered_pred_bbox = filtered_pred['bbox']
                #print(filtered_pred_kpts)
                
            break

  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  bounding_boxes = torch.tensor(data=self.labels.iloc[idx, 17:])
  bounding_boxes = torch.tensor(data=self.labels.iloc[idx, 17:])
  bounding_boxes = torch.tensor(data=self.labels.iloc[idx, 17:])
  bounding_boxes = torch.tensor(data=self.labels.iloc[idx, 17:])
  bounding_boxes = torch.tensor(data=self.labels.iloc[idx, 17:])
  bounding_boxes = torch.tensor(data=self.labels.iloc[idx, 17:])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.label

tensor([[ 1.4249e+01,  1.3056e+01,  1.4464e+01,  1.2611e+01,  1.0596e+01,
         -8.1354e-01,  2.6864e+00, -1.7068e+00, -1.9834e+00,  2.6546e-01,
         -2.5318e+00, -3.6154e+00, -3.6106e+00, -4.6058e+00, -3.8986e+00,
         -3.3201e+00, -3.2944e+00],
        [ 4.7985e-01, -4.7455e-01,  7.7284e-03, -8.4020e-01,  2.1798e+00,
          1.5135e+00,  2.4388e+00, -2.6876e+00,  1.6743e+00, -3.5583e+00,
          1.1989e-01, -1.4984e+00,  2.8405e+00, -3.8037e+00, -1.0989e+00,
         -4.0801e+00, -2.1674e+00],
        [ 2.7010e+00,  5.6879e+00,  1.1195e+01,  1.6592e+00,  6.8692e+00,
         -1.2082e+00,  3.4982e+00, -2.1776e+00, -8.0329e-01, -2.2485e+00,
         -1.9064e+00, -2.7022e+00, -6.9303e-01, -2.4147e+00, -3.6483e-01,
         -3.3556e+00, -2.6812e+00],
        [ 1.1912e+01,  1.4638e+01,  1.0527e+01,  1.4018e+01,  4.7410e+00,
         -7.0299e-01, -2.4510e+00, -3.0048e+00, -3.1451e+00, -1.9247e+00,
         -5.1242e-01, -3.5360e+00, -2.8213e+00, -2.9146e+00, -4.4862e+00,
    

In [10]:
from tqdm import tqdm

model = model.cuda()
keypoint_not_found = 0

with torch.inference_mode():
    for i, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
        images = batch['image'].cuda()
        predictions = model(images)
        for num, prediction in enumerate(predictions):
            pred_kpts: Tensor = prediction['keypoints'] # tensor of shape (N, 17, 3)
            pred_kpts_scores: Tensor = prediction['keypoints_scores']
            pred_scores: Tensor = prediction['scores']
            pred_bboxes: Tensor = prediction['boxes']
            filtered_pred = filter_keypoints_per_person(pred_kpts, pred_kpts_scores, pred_scores, pred_bboxes)
            if filtered_pred is None:
                image_id = batch['impath'][num]
                keypoint_not_found += 1
                #print(f'Filtered kpts is None for image {image_id}')
                continue
            filtered_pred_kpts = filtered_pred['kpts']
            filtered_pred_bbox = filtered_pred['bbox']
            image_id = batch['idx'][num].item()
            row = labels.iloc[image_id]
            for kpt_id in range(7):
                if filtered_pred_kpts[kpt_id][2].item() == 0:
                    continue # keypoint invisible
                labels.iloc[image_id, 3 + 2*kpt_id] = filtered_pred_kpts[kpt_id][0].item() / 96.0 # x
                labels.iloc[image_id, 3 + 2*kpt_id + 1] = filtered_pred_kpts[kpt_id][1].item() / 96.0 # y
            bbox = np.array(filtered_pred_bbox.cpu()) / 96.0 # bbox x1, y1, x2, y2
            labels.iloc[image_id, -4:] = bbox

print(f"Keypoints not found = {keypoint_not_found}, ratio = {keypoint_not_found / len(dataloader)}")

  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  bounding_boxes = torch.tensor(data=self.labels.iloc[idx, 17:])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  bounding_boxes = torch.tensor(data=self.labels.iloc[idx, 17:])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[i

Keypoints not found = 4381, ratio = 1.2438955139125496





In [11]:
labels.head()
labels.to_csv(path_or_buf='/home/oem/Letöltések/Facialexp/labels_with_kpts_v3.csv')

In [12]:
dflen = len(labels)
labels_filtered = labels[labels.nose_x.notnull()]
print(f'Dropping a total of {dflen - len(labels_filtered)}')
labels_filtered = labels_filtered[labels_filtered.nose_y.notnull()]
labels_filtered = labels_filtered[labels_filtered.left_eye_x.notnull()]
labels_filtered = labels_filtered[labels_filtered.left_eye_y.notnull()]
labels_filtered = labels_filtered[labels_filtered.right_eye_x.notnull()]
labels_filtered = labels_filtered[labels_filtered.right_eye_y.notnull()]
labels_filtered = labels_filtered[labels_filtered.left_ear_x.notnull()]
labels_filtered = labels_filtered[labels_filtered.left_ear_y.notnull()]
labels_filtered = labels_filtered[labels_filtered.right_ear_x.notnull()]
labels_filtered = labels_filtered[labels_filtered.right_ear_y.notnull()]
print(f'Dropping a total of {dflen - len(labels_filtered)}')

Dropping a total of 4413
Dropping a total of 4446


In [13]:
labels_filtered.describe()

Unnamed: 0,label,relFCs,nose_x,nose_y,left_eye_x,left_eye_y,right_eye_x,right_eye_y,left_ear_x,left_ear_y,right_ear_x,right_ear_y,left_sho_x,left_sho_y,right_sho_x,right_sho_y,x1,y1,x2,y2
count,23729.0,23729.0,23729.0,23729.0,23729.0,23729.0,23729.0,23729.0,23729.0,23729.0,23729.0,23729.0,6457.0,6457.0,9951.0,9951.0,23729.0,23729.0,23729.0,23729.0
mean,3.687134,0.797975,0.502054,0.577524,0.631283,0.437127,0.373856,0.439711,0.798453,0.533256,0.211911,0.533384,0.852474,0.889832,0.152475,0.900643,0.070029,0.053737,0.929691,0.983591
std,2.310732,0.058665,0.058583,0.05702,0.064396,0.054066,0.069556,0.058136,0.117238,0.099361,0.1242,0.103975,0.126208,0.125977,0.121753,0.114898,0.086475,0.071128,0.087558,0.023861
min,0.0,0.516357,0.000623,0.001655,0.000623,0.000625,0.000624,0.001655,0.001441,0.003122,0.000623,0.001655,0.000623,0.003123,0.000623,0.001872,0.0,0.0,0.091871,0.222502
25%,2.0,0.765205,0.482038,0.556835,0.622274,0.406018,0.352347,0.408378,0.758866,0.478403,0.149212,0.477351,0.813955,0.863396,0.069342,0.879446,0.014152,0.0046,0.90283,0.978066
50%,4.0,0.807858,0.501785,0.582138,0.632676,0.438338,0.36975,0.439669,0.807138,0.535301,0.200779,0.535423,0.885778,0.92539,0.12353,0.936208,0.039338,0.031111,0.957851,0.987011
75%,6.0,0.840562,0.522756,0.604255,0.651165,0.465188,0.38061,0.465625,0.862017,0.583654,0.247366,0.584738,0.928985,0.959988,0.206659,0.959375,0.096576,0.073922,0.985974,0.994863
max,7.0,0.899951,0.999378,0.992028,0.999378,0.978087,0.999377,0.982516,0.999377,0.9844,0.999164,0.996875,0.999377,0.998128,0.996641,0.998128,0.922501,0.864619,1.0,1.0


In [14]:
labels_filtered.to_csv(path_or_buf='/home/oem/Letöltések/Facialexp/labels_with_kpts_v3.csv')