## Generating the serialized detection results for the target using provided ground truth tracklet labels
This is a pre-processing step for generating the pose representation maps used for action recognition. The output of this code is saved in .pkl format in the output folder.

In [1]:
# import required packages
import sys
import os
import io
import base64
import cPickle as pickle
import os.path as osp
import numpy as np
import pylab
import math
import time
import pylab
import shutil
from copy import deepcopy

### Definition of required functions and variables

In [2]:
def _id_or_index(ix, val):
    if len(val) == 0:
        return val
    else:
        return val[ix]
def robust_pickle_dump(data_dict, file_name):
    """
    writing an object to disk as a pickle, using the highest available protocol
    inputs:
        data_dict: object to be serialized
        file_name: destination path; it is turned into an absolute path before opening
    """
    abs_path = os.path.abspath(file_name)
    with open(abs_path, 'wb') as out_file:
        pickle.dump(data_dict, out_file, pickle.HIGHEST_PROTOCOL)
def find_element_in_list(element, list_element):
    """
    locating the first occurrence of a query element inside a list
    inputs:
        element: query element to look for
        list_element: list that is searched
    outputs:
        position: index of the first occurrence of the query element,
                  or None when the list does not contain it
    """
    try:
        position = list_element.index(element)
    except ValueError:
        # list.index signals a missing element with ValueError
        position = None
    return position


# finding the bounding boxes of the specified target
def _get_target_tracklet(track_id, all_boxes, all_tracks, st_fr=None):
    """
    finding the target tracklet, i.e. the bounding boxes of the given track_id in the
    run of consecutive frames that starts at st_fr
    input:
        track_id: track ID of the target
        all_boxes: per-frame collections of detected bounding boxes; all_boxes[f][k]
                   is the box that belongs to all_tracks[f][k]
        all_tracks: per-frame lists of track IDs, aligned with all_boxes
        st_fr: start frame of the target appearance; when None, the first frame
               whose track list contains track_id is used
    output:
        end_fr: first frame at or after st_fr in which track_id is missing; when the
                target is still present in the last frame, the index of the last frame
                (so end_fr is exclusive in the first case and inclusive in the second)
        st_fr: frame number of the start of the tracklet
        target_box: list of the bounding boxes surrounding the target, one per frame
    raises:
        IndexError: if st_fr is None and track_id is absent from every frame, or if
                    st_fr lies beyond the last frame
    """
    n_frames = len(all_tracks)
    if st_fr is None:
        # bounded search for the first appearance; an unbounded scan would run off the
        # end of all_tracks when the ID never shows up
        st_fr = 0
        while st_fr < n_frames and find_element_in_list(track_id, all_tracks[st_fr]) is None:
            st_fr += 1
        if st_fr == n_frames:
            raise IndexError('track ID {} does not appear in any of the {} frames'
                             .format(track_id, n_frames))
    elif st_fr >= n_frames:
        raise IndexError('start frame {} is beyond the last frame ({})'
                         .format(st_fr, n_frames - 1))

    target_box = []
    end_fr = st_fr
    for count in range(st_fr, n_frames):
        end_fr = count
        indx = find_element_in_list(track_id, all_tracks[count])
        if indx is None:
            # the target disappeared: the tracklet ends just before this frame
            break
        target_box.append(all_boxes[count][indx])
    return end_fr, st_fr, target_box
def boxes_area(boxes):
    """
    calculating the area of the input bounding boxes
    :param boxes: bounding boxes in the form [x0, y0, x1, y1, conf(optional)], one per
                  row; either a 2-D ndarray or a nested list that converts to one
    :return: 1-D ndarray with the area of each bounding box; width and height are
             computed as x1 - x0 + 1 and y1 - y0 + 1
    :raises AssertionError: if any box has a negative width or a negative height
    """
    boxes = np.asarray(boxes)
    w = boxes[:, 2] - boxes[:, 0] + 1
    h = boxes[:, 3] - boxes[:, 1] + 1
    # width and height are checked separately: an inverted box (negative width AND
    # negative height) has a positive product and would pass a check on the area alone
    assert np.all(w >= 0) and np.all(h >= 0), 'Negative areas founds'
    return w * h


def bbox_overlaps(boxes, query_boxes):
    """
    Pairwise intersection-over-union between two sets of boxes.

    Box corners are read from columns 0..3 as [x0, y0, x1, y1]; widths and
    heights are computed as x1 - x0 + 1 and y1 - y0 + 1.

    Parameters
    ----------
    boxes: (N, 4) ndarray of float
    query_boxes: (K, 4) ndarray of float
    Returns
    -------
    overlaps: (N, K) ndarray of dtype DTYPE; entry [n, k] is the overlap between
        boxes[n] and query_boxes[k], and stays 0 when the two do not intersect
    """
    n_boxes = boxes.shape[0]
    n_queries = query_boxes.shape[0]
    overlaps = np.zeros((n_boxes, n_queries), dtype=DTYPE)
    for k in range(n_queries):
        qx0 = query_boxes[k, 0]
        qy0 = query_boxes[k, 1]
        qx1 = query_boxes[k, 2]
        qy1 = query_boxes[k, 3]
        query_area = (qx1 - qx0 + 1) * (qy1 - qy0 + 1)
        for n in range(n_boxes):
            inter_w = min(boxes[n, 2], qx1) - max(boxes[n, 0], qx0) + 1
            if not inter_w > 0:
                # no horizontal overlap
                continue
            inter_h = min(boxes[n, 3], qy1) - max(boxes[n, 1], qy0) + 1
            if not inter_h > 0:
                # no vertical overlap
                continue
            union = float(
                (boxes[n, 2] - boxes[n, 0] + 1) *
                (boxes[n, 3] - boxes[n, 1] + 1) +
                query_area - inter_w * inter_h
            )
            overlaps[n, k] = inter_w * inter_h / union
    return overlaps


# dtype of the overlap matrix returned by bbox_overlaps
DTYPE = np.float32
height = 1080  # frame height in pixel
width = 1920  # frame width in pixel

### Creating a dictionary containing the names of all pose estimation result files for the video dataset

In [3]:
# Result file names for every subject, keyed by subject ID.
# Each value lists two file names; the loading cell indexes it with visit_no - 1,
# so the first entry belongs to visit 1 and the second entry to visit 2.
all_det_track_results = {'03121601001':['Kinect_1484230300411_Video_flipped_detections_withTracks.pkl',
                              'Kinect_1484236642988_Video_flipped_detections_withTracks.pkl'],
               '03121601002':['Kinect_1484837025135_Video_Flipped_detections_withTracks.pkl',
                              'Kinect_1484852850634_Video_flipped_detections_withTracks.pkl'],
               '03121601003':['Kinect_1485442263970_Fixed_Video_Flipped_detections_withTracks.pkl',
                              'Kinect_1485453235022_Video_flipped_detections_withTracks.pkl'],
               '03121601004':['Kinect_1486044609777_Video_flipped_detections_withTracks.pkl',
                              'Kinect_1486049379590_Video_flipped_detections_withTracks.pkl'],
               '03121601005':['Kinect_1486055718051_Video_flipped_detections_withTracks.pkl',
                              'Kinect_1486062723338_Video_flipped_detections_withTracks.pkl'],
               '03121601006':['Kinect_1487170521651_Fixed_Video_Flipped_detections_withTracks.pkl',
                              'Kinect_1487252582676_Fixed_Video_Flipped_detections_withTracks.pkl'],
               '03121601007':['Kinect_1487773684672_Video_flipped_detections_withTracks.pkl',
                              'Kinect_1488375395995_Fixed_Video_Flipped_detections_withTracks.pkl'],
               '03121601008':['Kinect_1487859349689_Video_flipped_detections_withTracks.pkl',
                              'Kinect_1487864851230_Video_flipped_detections_withTracks.pkl'],
               '03121601009':['Kinect_1488985159337_Video_flipped_detections_withTracks.pkl',
                              'Kinect_1488991782221_Video_flipped_detections_withTracks.pkl'],
               '03121601010':['Kinect_1489081164119_Video_flipped_detections_withTracks.pkl',
                              'Kinect_1489582979282_Video_flipped_detections_withTracks.pkl'],
               '03121601011':['Kinect_1489410472438_Video_flipped_detections_withTracks.pkl',
                             'Kinect_1489416963560_Video_flipped_detections_withTracks.pkl'],
               '03121601012':['Kinect_1490279720993_Video_flipped_detections_withTracks.pkl',
                             'Kinect_1490299676346_Video_flipped_detections_withTracks.pkl'],
               '03121601013':['Kinect_1489586406980_Video_Flipped_detections_withTracks.pkl',
                             'Kinect_1489596563256_Video_Flipped_detections_withTracks.pkl'],
               '03121601014':['Kinect_1490793487534_Video_flipped_detections_withTracks.pkl',
                             'Kinect_1491481607487_Video_Flipped_detections_withTracks.pkl'],
               '03121601015':['Kinect_1490879543456_Video_Flipped_detections_withTracks.pkl',
                             'Kinect_1490885684191_video_Flipped_detections_withTracks.pkl'],
               '03121601016':['Kinect_1491399264372_Video_flipped_detections_withTracks.pkl',
                             'Kinect_1491408408518_Video_flipped_detections_withTracks.pkl'],
               '03121601017':['Kinect_1491571272034_Video_flipped_detections_withTracks.pkl',
                             'Kinect_1491576389630_Video_flipped_detections_withTracks.pkl'],
               '03121601018':['Kinect_1492697256228_Video_Flipped_detections_withTracks.pkl',
                             'Kinect_1493126049741_Video_flipped_detections_withTracks.pkl'],
               '03121601019':['Kinect_1493905943020_Video_flipped_detections_withTracks.pkl',
                              'Kinect_1494422856653_Video_Flipped_detections_withTracks.pkl'],
               '03121601020':['Kinect_1492786250052_Video_flipped_detections_withTracks.pkl',
                              'Kinect_1492792155821_Video_flipped_detections_withTracks.pkl'],
               '03121601021':['Kinect_1493214639781_Video_flipped_detections_withTracks.pkl',
                              'Kinect_1493222216696_Video_flipped_detections_withTracks.pkl'],
               '03121601022':['Kinect_1495028284074_Video_flipped_detections_withTracks.pkl',
                              'Kinect_1495037457240_Video_flipped_detections_withTracks.pkl'],
               '03121601023':['Kinect_1495638523488_Video_flipped_detections_withTracks.pkl',
                              'Kinect_1495649327569_Video_flipped_detections_withTracks.pkl'],
               # NOTE(review): the first file of 03121601024 is identical to the first file of
               # 03121601023 - confirm that this is not a copy/paste slip
               '03121601024':['Kinect_1495638523488_Video_flipped_detections_withTracks.pkl',
                              'Kinect_1495646631951_Video_flipped_detections_withTracks.pkl'],
               '03121601025':['Kinect_1495724260698_Video_flipped_detections_withTracks.pkl',
                              'Kinect_1496323552600_Video_flipped_detections_withTracks.pkl'],
               # NOTE(review): there is no entry for 03121601026 - presumably intentional; verify
               '03121601027':['Kinect_1503497547252_Video_Flipped_detections_withTracks.pkl',
                              'Kinect_1504101708237_Video_Flipped_detections_withTracks.pkl'],
               '03121601028':['Kinect_1503581189725_Video_Flipped_detections_withTracks.pkl',
                             'Kinect_1504181872815_Video_Flipped_detections_withTracks.pkl'],
               '03121601029':['Kinect_1504627411616_Video_Flipped_detections_withTracks.pkl',
                             'Kinect_1504635058283_Video_Flipped_detections_withTracks.pkl'],
               '03121601030':['Kinect_1507211606614_Video_Flipped_detections_withTracks.pkl',
                             'Kinect_1507220967931_Video_Flipped_detections_withTracks.pkl'],
               '03121601031':['Kinect_1508335131827_Video_Flipped_detections_withTracks.pkl',
                             'Kinect_1508348636573_Video_Flipped_detections_withTracks.pkl'],
               '03121601032':['Kinect_1510160593496_Video_Flipped_detections_withTracks.pkl',
                             'Kinect_1510168880679_Video_Flipped_detections_withTracks.pkl'],
               '03121601033':['Kinect_1512057807777_Video_Flipped_detections_withTracks.pkl',
                             'Kinect_1512068815485_Video_flipped_detections_withTracks.pkl'],
               '03121601034':['Kinect_1516897009358_Video_Flipped_detections_withTracks.pkl',
                             'Kinect_1516904509468_Video_Flipped_detections_withTracks.pkl'],
               '03121601035':['Kinect_1522328709409_Video_flipped_detections_withTracks.pkl',
                             'Kinect_1522335428199_Video_flipped_detections_withTracks.pkl'],         
               '03121601036':['Kinect_1524143477281_Video_flipped_detections_withTracks.pkl',
                             'Kinect_1524156291481_Video_flipped_detections_withTracks.pkl']
              }

### Reading detection and tracking results

In [4]:
# Location of the detection/tracking outputs and the recording (subject, visit) to process.
base_path = '../../../PoseEstimation/DetectAndTrack/DetectAndTrack/outputs/TuftsVideo/HUBB121601_Tufts_Kinect_Videos_Flipped'
subj_id = '03121601034'
visit_no = 1
file_name = all_det_track_results[subj_id][visit_no - 1]
# .format() must be applied to the string itself; applying it to the result of print()
# only works with the Python 2 print statement
print('processing output results for patient ID:{} saved in {} '.format(subj_id, file_name))
det_track_path = osp.join(base_path, subj_id, 'Visit_%02d' % visit_no, file_name)
# NOTE(review): pickle.load can execute arbitrary code stored in the file - only load
# result files from a trusted source
with open(det_track_path, 'rb') as res:
    dets = pickle.load(res)
# entry 1 is read for boxes, keypoints and tracks alike - presumably the person class; verify
all_boxes = dets['all_boxes'][1] # bbox:[x0, y0, x1, y1, score]: w=x1-x0; h=y1-y0
cfg = dets['cfg']
all_keyps = dets['all_keyps'][1] if 'all_keyps' in dets else None
all_tracks = dets['all_tracks'][1] if 'all_tracks' in dets else None
if all_tracks is None:
    # everything below is driven by the track IDs, so fail here with a clear message
    raise ValueError('{} contains no all_tracks entry; tracking results are required'
                     .format(det_track_path))

all_tracks_np = np.array(all_tracks)
# Track IDs are used as 0-based indices later on, so the number of tracks is the largest
# ID + 1. The largest ID is searched over every ID of every frame: taking np.amax of the
# per-frame ID lists compares whole lists and returns the list with the largest FIRST
# element, which can miss a larger ID stored further back in some frame.
unique_track_ids = set(tid for frame_tracks in all_tracks for tid in frame_tracks)
n_tracks = max(unique_track_ids) + 1 if unique_track_ids else 0
n_frames = len(all_tracks)
print('There are {} individual tracks  in {} total number of frames'.format(n_tracks, n_frames))

processing output results for patient ID:03121601034 saved in Kinect_1516897009358_Video_Flipped_detections_withTracks.pkl 
There are 924 individual tracks  in 109006 total number of frames


### Extract all the tracklets in this format: [[st_fr0, track_id0, best_box_index0, [list of boxes]], [st_fr1, track_id1, best_box_index1, [list of boxes]], …]
### Extract the highest confidence bounding box from each tracklet

In [5]:
tracklet_list =[]
id_list =[]
th_box_conf = 0.94   # the most confident box of a tracklet must score above this value
th_keypt_conf = 2.3  # a keypoint is counted when its value in row 2 exceeds this threshold
min_keypts = 6       # more than this many counted keypoints are required
min_track_len = 8    # end_fr - st_fr must exceed this value

# First frame in which each track ID shows up, collected in one pass over the frames.
# Handing it to _get_target_tracklet avoids rescanning the video from frame 0 for every ID.
first_frame = {}
for frame_idx, frame_tracks in enumerate(all_tracks):
    for tid in frame_tracks:
        first_frame.setdefault(tid, frame_idx)

for i in range(n_tracks):
    if i not in first_frame:
        # this ID is never attached to a detection, so there is no tracklet to extract
        continue
    end_fr, st_fr, target_box = _get_target_tracklet(i, all_boxes, all_tracks, st_fr=first_frame[i])
    box_conf = np.array(target_box)[:, 4]  # column 4 of a box is its score
    hconf_box_ind = np.argmax(box_conf)
    hconf_box = np.max(box_conf)
    # keypoints of the target in the frame that holds its most confident box
    best_fr = st_fr + hconf_box_ind
    key_pts = all_keyps[best_fr][all_tracks[best_fr].index(i)]
    key_pts_conf = key_pts[2,:]  # presumably the per-keypoint confidence row; verify
    key_pts_num = sum(key_pts_conf > th_keypt_conf)
    # pruning based on the detection confidence, number of key points and tracklet length
    length = end_fr - st_fr
    if hconf_box > th_box_conf and length > min_track_len and key_pts_num > min_keypts:
        tracklet_list.append([st_fr, i, hconf_box_ind, target_box])
        id_list.append(i)
# NOTE: a second level of pruning based on overlaps with other bounding boxes is not done here
print('remaining list of IDs after pruning based on track length, number of keypoints in detection '
      'and lowest confidence of {} is:\n{}'.format(th_box_conf, id_list) )

remaining list of IDs after pruning based on track length, number of keypoints in detection and lowest confidence of 0.94 is:
[0, 5, 12, 17, 19, 21, 49, 51, 69, 74, 78, 80, 82, 83, 85, 87, 90, 92, 93, 98, 99, 108, 114, 120, 123, 132, 139, 140, 143, 148, 150, 151, 174, 175, 176, 180, 182, 183, 187, 188, 189, 191, 197, 199, 209, 211, 216, 220, 223, 224, 225, 226, 229, 232, 233, 235, 236, 243, 249, 251, 252, 254, 255, 257, 262, 267, 270, 272, 276, 278, 279, 282, 284, 295, 306, 311, 318, 320, 322, 324, 325, 326, 328, 332, 333, 335, 337, 338, 339, 341, 342, 343, 344, 345, 346, 347, 348, 349, 352, 354, 355, 356, 361, 379, 384, 385, 418, 419, 423, 426, 459, 477, 481, 487, 489, 491, 493, 500, 502, 505, 506, 511, 561, 574, 604, 621, 622, 625, 629, 630, 631, 634, 635, 639, 640, 642, 644, 658, 660, 668, 670, 671, 672, 674, 684, 701, 708, 711, 712, 715, 722, 737, 742, 746, 747, 758, 759, 761, 763, 766, 767, 770, 771, 778, 779, 783, 784, 787, 788, 789, 794, 796, 797, 802, 803, 805, 810, 811, 846, 8

### Create a data list from the tracklet list: [[fr#, track_id, [bbox]], [fr#, track_id, [bbox]], …]

In [6]:
# One record per kept tracklet: [frame number of its most confident box, track ID,
# the first four values (x0, y0, x1, y1) of that box].
data_list = [
    [first_fr + best_ind, tid, boxes[best_ind][0:4]]
    for first_fr, tid, best_ind, boxes in tracklet_list
]
print('{} tracklets are remained after pruning'.format(len(data_list)))

202 tracklets are remained after pruning


### Get the patient related track ids from the ground truth folder

In [7]:
# read the true labels from files
base_path = '../gt_tracking_labels/TuftsVideos'
true_id_path = osp.join(base_path, subj_id, 'Visit_%02d' % visit_no)
# every .jpg in the 'patient' folder starts with '<track_id>_'; that leading number is
# taken as a track ID belonging to the patient
patient_dir = osp.join(true_id_path, 'patient')
patient_list = [fname for fname in os.listdir(patient_dir) if fname.endswith('.jpg')]
true_id = dict((int(fname.split('_')[0]), 0) for fname in patient_list)
# keep, in ascending order, only the labelled IDs that are also present in id_list
true_id_list = [tid for tid in sorted(true_id) if tid in id_list]
print('Traklet IDs corrresponding to the target are:\n{}'
      '\nNumber of generated tracklets for the target is:{}'.format(true_id_list, len(true_id_list)))

Traklet IDs corrresponding to the target are:
[0, 92, 99, 108, 123, 132, 279, 282, 318, 320, 324, 344, 355, 361, 658, 660, 668, 670, 671, 672, 674, 684, 701, 722, 737, 742, 747, 758, 761, 767, 771, 779, 784, 788, 796, 797, 803, 805, 811, 846, 880, 886, 888, 906]
Number of generated tracklets for the target is:44


### Create the final detection tracking results for the selected target
If a frame contains more than one detection associated with the target, only the detection with the largest bounding-box area is kept; the smaller ones are removed.

In [8]:
# For every frame keep only the detections whose track ID belongs to the target.
target_ids = set(true_id_list)  # set membership instead of a list scan per detection
all_tracks_pruned = []
all_boxes_pruned = []
all_keyps_pruned = []
for i in range(n_frames):
    track_ids = all_tracks[i]
    tracks_pruned = []
    boxes_pruned = []
    keyps_pruned = []
    for idx, j in enumerate(track_ids):
        if j in target_ids:
            tracks_pruned.append(j)
            # every box is stored with a leading axis, i.e. as a one-row array
            boxes_pruned.append(np.expand_dims(all_boxes[i][idx],axis=0))
            keyps_pruned.append(all_keyps[i][idx])
    if len(tracks_pruned) > 1:
        # several detections carry a target ID in this frame: keep the largest box.
        # The comprehension uses its own variable name; in Python 2 a comprehension
        # variable named i would overwrite the frame index i of the enclosing loop.
        area_pruned = [boxes_area(box) for box in boxes_pruned]
        keep_id = area_pruned.index(max(area_pruned))
        # NOTE(review): the entries of such a frame become the bare selected element,
        # while frames with a single detection keep a one-element list. This is left
        # unchanged because it defines the saved file format - confirm that the
        # consumers of the .pkl file handle both shapes.
        tracks_pruned = tracks_pruned[keep_id]
        boxes_pruned = boxes_pruned[keep_id]
        keyps_pruned = keyps_pruned[keep_id]
    all_tracks_pruned.append(tracks_pruned)
    all_boxes_pruned.append(boxes_pruned)
    all_keyps_pruned.append(keyps_pruned)

### Writing back the final results of detections for the target 
All results for the detected bounding boxes and estimated keypoints are written to a .pkl file.

In [9]:
out_path = osp.join('../outputs/TuftsVideos',subj_id, 'Visit_%02d' % visit_no)
# destination name: the first two '_'-separated fields of file_name
# (e.g. 'Kinect_1516897009358') followed by '_target_tracking.pkl'
name_parts = file_name.split('_')
dest_name = name_parts[0] + '_' + name_parts[1] + '_target_tracking.pkl'
dest_file = osp.join(out_path, dest_name)
# Only create the folder when it is missing. Removing the whole folder first would also
# delete every other file stored for this visit; the destination file itself is
# overwritten anyway because it is opened in 'wb' mode.
if not osp.exists(out_path):
    os.makedirs(out_path)
# NOTE(review): all_tracks_pruned is computed by the pruning cell but is not part of the
# saved dictionary - confirm whether the track IDs should be stored as well
robust_pickle_dump(
    dict(target_boxes=all_boxes_pruned,
         target_keyps=all_keyps_pruned),
    dest_file)