In [1]:
import torch
import torchvision
from torchvision.models.detection import keypointrcnn_resnet50_fpn, KeypointRCNN_ResNet50_FPN_Weights

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
import numpy as np
np.random.seed(42)

In [3]:
print(torch.cuda.is_available())

True


In [4]:
model = torchvision.models.detection.keypointrcnn_resnet50_fpn(weights=KeypointRCNN_ResNet50_FPN_Weights.DEFAULT)
model.eval()

KeypointRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(640, 672, 704, 736, 768, 800), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.

In [5]:
data_folder = '/home/oem/Letöltések/Facialexp'
label_path = '/home/oem/Letöltések/Facialexp/labels_processed.csv'

In [8]:
import pandas as pd

labels = pd.read_csv(label_path, index_col='idx', sep=';')
labels.head()

Unnamed: 0_level_0,pth,label,relFCs,nose_x,nose_y,left_eye_x,left_eye_y,right_eye_x,right_eye_y,left_ear_x,...,right_ear_x,right_ear_y,left_sho_x,left_sho_y,right_sho_x,right_sho_y,x1,y1,x2,y2
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,anger/image0000006.jpg,7,0.873142,,,,,,,,...,,,,,,,,,,
1,anger/image0000060.jpg,0,0.852311,,,,,,,,...,,,,,,,,,,
2,anger/image0000061.jpg,0,0.800957,,,,,,,,...,,,,,,,,,,
3,anger/image0000066.jpg,2,0.843079,,,,,,,,...,,,,,,,,,,
4,anger/image0000106.jpg,0,0.849108,,,,,,,,...,,,,,,,,,,


In [9]:
# from https://learnopencv.com/human-pose-estimation-using-keypoint-rcnn-in-pytorch/, rewritten a bit

from torch import Tensor


def filter_keypoints_per_person(all_keypoints, all_scores, confs, bboxes, keypoint_threshold=0.7, conf_threshold=0.75):
    kpts_dict = []
    # iterate for every person detected
    for person_id in range(len(all_keypoints)):
      # check the confidence score of the detected person
      if confs[person_id]>conf_threshold:
        # grab the keypoint-locations for the detected person
        keypoints:Tensor = all_keypoints[person_id, ...]
        # grab the keypoint-scores for the keypoints
        scores: Tensor = all_scores[person_id, ...]
        # iterate for every keypoint-score
        for kp in range(len(scores)):
            # check the confidence score of detected keypoint
            if torch.sigmoid(scores[kp]) < keypoint_threshold:
                # convert the keypoint float-array to a python-list of integers
                keypoints[kp, 2] = 0
        kpts_dict.append({'conf': confs[person_id], 'kpts': keypoints, 'bbox': bboxes[person_id]})
    
    kpts_dict.sort(key=lambda x: x['conf'], reverse=True)
    
    return {'kpts': kpts_dict[0]['kpts'], 'bbox': kpts_dict[0]['bbox']} if len(kpts_dict) > 0 else None

In [15]:
from torch.utils.data import Dataset, DataLoader
import os
from PIL import Image
import torchvision.transforms.functional as TF


class FacialExpressionsDataset(Dataset):

    def __init__(self, csv_file, root_dir, transform=None):
        self.labels = pd.read_csv(csv_file, index_col='idx', sep=';')
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        image_path = os.path.join(self.root_dir, self.labels.iloc[idx, 0])
        image = Image.open(image_path)
        image_tensor = TF.to_tensor(image)
        
        label = self.labels.iloc[idx, 1].astype('int')
        impath = self.labels.iloc[idx, 0]
        
        keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
        keypoints = torch.reshape(keypoints, shape=(7, 2))
        
        bounding_boxes = torch.tensor(data=self.labels.iloc[idx, 17:])

        sample = {'image': image_tensor, 'label': label, 'keypoints': keypoints, 'impath': impath, 'idx': idx, 'bbox': bounding_boxes}

        if self.transform:
            sample = self.transform(sample)

        return sample

In [34]:
dataset = FacialExpressionsDataset(csv_file=label_path, root_dir=data_folder, transform=None)
dataloader = DataLoader(dataset=dataset, batch_size=8, shuffle=False, num_workers=16)

The model output tensor is a 17x3 tensor, where the coordinates are the x, y, and visibility (0 is invisible, 1 is visible). The keypoints are:
- nose
- left eye
- right eye
- left ear
- right ear
- left shoulder
- right shoulder
- left elbow
- right elbow
- left wrist
- right wrist
- left hip
- right hip
- left knee
- right knee
- left ankle
- right ankle

In [35]:
from tqdm import tqdm

model = model.cuda()
keypoint_not_found = 0

with torch.inference_mode():
    for i, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
        images = batch['image'].cuda()
        predictions = model(images)
        for num, prediction in enumerate(predictions):
            pred_kpts: Tensor = prediction['keypoints'] # tensor of shape (N, 17, 3)
            pred_kpts_scores: Tensor = prediction['keypoints_scores']
            pred_scores: Tensor = prediction['scores']
            pred_bboxes: Tensor = prediction['boxes']
            filtered_pred = filter_keypoints_per_person(pred_kpts, pred_kpts_scores, pred_scores, pred_bboxes)
            if filtered_pred is None:
                image_id = batch['impath'][num]
                keypoint_not_found += 1
                #print(f'Filtered kpts is None for image {image_id}')
                continue
            filtered_pred_kpts = filtered_pred['kpts']
            filtered_pred_bbox = filtered_pred['bbox']
            image_id = batch['idx'][num].item()
            row = labels.iloc[image_id]
            for kpt_id in range(7):
                if filtered_pred_kpts[kpt_id][2].item() == 0:
                    continue # keypoint invisible
                labels.iloc[image_id, 3 + 2*kpt_id] = filtered_pred_kpts[kpt_id][0].item() # x
                labels.iloc[image_id, 3 + 2*kpt_id + 1] = filtered_pred_kpts[kpt_id][1].item() # y
            arr = np.array(filtered_pred_bbox.cpu()) # bbox x1, y1, x2, y2
            labels.iloc[image_id, -4:] = arr

print(f"Keypoints not found = {keypoint_not_found}, ratio = {keypoint_not_found / len(dataloader)}")

  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  keypoints = torch.tensor(data=[self.labels.iloc[idx, 3:17]])
  bounding_boxes = torch.tensor(data=self.labels.iloc[i

Keypoints not found = 627, ratio = 0.17802385008517888





In [36]:
labels.head()
labels.to_csv(path_or_buf='/home/oem/Letöltések/Facialexp/labels_with_kpts.csv')

In [39]:
dflen = len(labels)
labels_filtered = labels[labels.nose_x.notnull()]
print(f'Dropping a total of {dflen - len(labels_filtered)}')
labels_filtered = labels_filtered[labels_filtered.nose_y.notnull()]
labels_filtered = labels_filtered[labels_filtered.left_eye_x.notnull()]
labels_filtered = labels_filtered[labels_filtered.left_eye_y.notnull()]
labels_filtered = labels_filtered[labels_filtered.right_eye_x.notnull()]
labels_filtered = labels_filtered[labels_filtered.right_eye_y.notnull()]
labels_filtered = labels_filtered[labels_filtered.left_ear_x.notnull()]
labels_filtered = labels_filtered[labels_filtered.left_ear_y.notnull()]
labels_filtered = labels_filtered[labels_filtered.right_ear_x.notnull()]
labels_filtered = labels_filtered[labels_filtered.right_ear_y.notnull()]
print(f'Dropping a total of {dflen - len(labels_filtered)}')

Dropping a total of 696
Dropping a total of 812


In [41]:
labels_filtered.describe()

Unnamed: 0,label,relFCs,nose_x,nose_y,left_eye_x,left_eye_y,right_eye_x,right_eye_y,left_ear_x,left_ear_y,right_ear_x,right_ear_y,left_sho_x,left_sho_y,right_sho_x,right_sho_y,x1,y1,x2,y2
count,27363.0,27363.0,27363.0,27363.0,27363.0,27363.0,27363.0,27363.0,27363.0,27363.0,27363.0,27363.0,7231.0,7231.0,11072.0,11072.0,27363.0,27363.0,27363.0,27363.0
mean,3.612652,0.798932,48.21803,55.240801,60.369376,41.956625,36.291123,42.288217,76.101044,51.257967,21.121819,51.336995,81.444746,84.922888,15.007583,86.042993,7.318223,5.856727,88.834068,94.340378
std,2.342842,0.058639,6.352322,6.141401,7.110814,5.722672,7.676114,6.21432,12.418442,9.953237,13.161733,10.466435,12.875064,12.952017,12.340749,11.719368,8.963269,7.626872,9.009366,2.532645
min,0.0,0.516357,0.059808,0.158888,0.059802,0.059991,0.059869,0.158888,0.059907,0.189404,0.059631,0.158888,0.059776,0.299662,0.059788,0.179679,0.0,0.0,8.819621,13.06233
25%,2.0,0.766139,46.155012,53.277985,59.648626,38.866083,33.719639,39.113327,72.534622,45.694881,14.360335,45.63131,77.830639,82.324413,6.673236,84.13327,1.475823,0.604796,86.223171,93.822941
50%,4.0,0.808588,48.18,55.810574,60.764473,41.721992,35.502094,41.917191,77.485573,51.366257,19.46183,51.381924,84.950096,88.642868,12.019644,89.779373,4.130826,3.379025,91.755928,94.705902
75%,6.0,0.84175,50.315552,58.003139,62.672415,44.633158,36.682531,44.682924,82.932369,56.185884,24.278262,56.345064,89.203693,92.099998,20.206264,92.05685,10.182467,7.957049,94.556267,95.477303
max,7.0,0.899951,95.940254,95.234734,95.940254,93.976944,95.940216,94.335266,95.940239,94.551033,95.919724,95.699997,95.94017,95.820274,95.677528,95.82029,88.560104,83.003395,96.0,96.0


In [42]:
labels_filtered.to_csv(path_or_buf='/home/oem/Letöltések/Facialexp/labels_with_kpts.csv')