# Basic Setup

1. Connect Google Drive and setup dependencies

In [0]:
from google.colab import drive
drive.mount('/gdrive')
! ln -s '/gdrive/My Drive/Kitten' /content/
! pip install visdom

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive
Collecting visdom
[?25l  Downloading https://files.pythonhosted.org/packages/97/c4/5f5356fd57ae3c269e0e31601ea6487e0622fedc6756a591e4a5fd66cc7a/visdom-0.1.8.8.tar.gz (1.4MB)
[K     |████████████████████████████████| 1.4MB 9.0MB/s 
Collecting torchfile (from visdom)
  Downloading https://files.pythonhosted.org/packages/91/af/5b305f86f2d218091af657ddb53f984ecbd9518ca9fe8ef4103a007252c9/torchfile-0.1.0.tar.gz
Collecting websocket-client (from visdom)
[?25l  Downloadi

In [0]:
# change to AlphaPose directory for importing modules
% cd '/content/Kitten/AlphaPose'

/gdrive/My Drive/Kitten/AlphaPose


2. Import from everywhere

In [0]:
# import modules

import torch
from torch import optim
from torch.autograd import Variable
import torch.nn.functional as F
import torchvision.transforms as transforms

import torch.nn as nn
import torch.utils.data
import torch.utils.data as data

import os
import sys
import cv2
import json
import time
import numpy as np
import math

from matplotlib import pyplot as plt

from opt import opt
from tqdm import tqdm

from yolo.preprocess import prep_image
from yolo.darknet import Darknet
from yolo.util import dynamic_write_results

from dataloader import Mscoco, crop_from_dets
from SPPE.src.utils.eval import getPrediction
from SPPE.src.utils.img import load_image, cropBox, im_to_torch
from SPPE.src.main_fast_inference import InferenNet
from pPose_nms import pose_nms, write_json
from fn import vis_frame

# from PIL import Image

3. Paths

In [0]:
base_dir = '/content/Kitten/data/merged'
train_folder = os.path.join(base_dir, 'train')
val_folder = os.path.join(base_dir, 'val')

4. Load labels with corresponding index numbers

In [0]:
classes = os.listdir(train_folder)
classes = sorted(classes)
key_dict = {}

for k, v in enumerate(classes):
    key_dict[k] = v

print(key_dict) # ensure that the classes are extracted correctly

{0: 'ChairPose', 1: 'ChildPose', 2: 'Dabbing', 3: 'HandGun', 4: 'HandShake', 5: 'HulkSmash', 6: 'KoreanHeart', 7: 'KungfuCrane', 8: 'KungfuSalute', 9: 'Salute', 10: 'WarriorPose'}


7. Define some helper functions for pose extraction

In [0]:
opt.vis_dim = 64

In [0]:
def load_models():
    # Load yolov3 model
    det_model = Darknet(os.path.join("yolo/cfg/yolov3-spp.cfg"))
    det_model.load_weights(os.path.join('models/yolo/yolov3-spp.weights'))
    det_model.net_info['height'] = opt.inp_dim
    det_model.cuda()
    det_model.eval()

    # Load pose model
    pose_dataset = Mscoco()
    pose_model = InferenNet(4 * 1 + 1, pose_dataset)
    pose_model.cuda()
    pose_model.eval()
    
    return det_model, pose_model

def letterbox_image(img, inp_dim=(opt.vis_dim, opt.vis_dim), bg = 128):
    '''resize image with unchanged aspect ratio using padding'''
    img_w, img_h = img.shape[1], img.shape[0]
    w, h = inp_dim
    new_w = int(img_w * min(w / img_w, h / img_h))
    new_h = int(img_h * min(w / img_w, h / img_h))
    resized_image = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_CUBIC)

    canvas = np.full((inp_dim[1], inp_dim[0], 3), bg)

    canvas[(h - new_h) // 2:(h - new_h) // 2 + new_h, (w - new_w) // 2:(w - new_w) // 2 + new_w, :] = resized_image

    return canvas

def predict_numpy_image(img_np, det_model, pose_model):
    """
    This functions reads from a single numpy array of pixel values and returns 
    all the pose estimations found sucessfully, None otherwise
    """
    
    inp_dim = int(opt.inp_dim)
    
    im_dim = img_np.shape[1], img_np.shape[0]
    im_dim = torch.FloatTensor(im_dim).repeat(1,2)
    
    img = letterbox_image(img_np, (inp_dim, inp_dim))
    img = img.transpose((2, 0, 1)).copy()
    img = torch.from_numpy(img).float().div(255.0).unsqueeze(0)

    det_inp_dim = int(det_model.net_info['height'])

    with torch.no_grad():
        
        im_dim = torch.FloatTensor(im_dim).repeat(1,2)    
        # Human Detection using yolov3
        img = img.cuda()
        prediction = det_model(img, CUDA=True)


        # NMS process
        dets = dynamic_write_results(prediction, opt.confidence,
                            opt.num_classes, nms=True, nms_conf=opt.nms_thesh)
        try:
            dets = dets.cpu()
        except:
            return None
        
        im_dim_list = torch.index_select(im_dim, 0, dets[:, 0].long())
        scaling_factor = torch.min(det_inp_dim / im_dim, 1)[0].view(-1, 1)

        # coordinate transfer
        dets[:, [1, 3]] -= (det_inp_dim - scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2
        dets[:, [2, 4]] -= (det_inp_dim - scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2


        dets[:, 1:5] /= scaling_factor
        for j in range(dets.shape[0]):
            dets[j, [1, 3]] = torch.clamp(dets[j, [1, 3]], 0.0, im_dim_list[j, 0])
            dets[j, [2, 4]] = torch.clamp(dets[j, [2, 4]], 0.0, im_dim_list[j, 1])
        boxes = dets[:, 1:5]
        scores = dets[:, 5:6]

        boxes = boxes[dets[:,0]==0]

        inps = torch.zeros(boxes.size(0), 3, opt.inputResH, opt.inputResW)
        pt1 = torch.zeros(boxes.size(0), 2)
        pt2 = torch.zeros(boxes.size(0), 2)
        scores = scores[dets[:,0]==0]
        
        # pose estimation

        inp = im_to_torch(img_np)
        inps, pt1, pt2 = crop_from_dets(inp, boxes, inps, pt1, pt2)

        inps = inps.cuda()
        hm = pose_model(inps)
        hm = hm.cpu()


    if boxes is not None:
        # filter detections
        preds_hm, preds_img, preds_scores = getPrediction(hm, pt1, pt2, 
                                                          opt.inputResH, opt.inputResW,
                                                          opt.outputResH, opt.outputResW)
        result = pose_nms(boxes, scores, preds_img, preds_scores)

        if len(result):
            return result

def process_results(result, img_dim):
    """
    Due to multiple poses detected, we assume that the keypoints with the 
    largest bounding box is the one of interest
    """
    img_h, img_w = img_dim
    kps = []
    scores = []
    areas = []
    coordinates = []
    for human in result:
        kp_preds = human['keypoints']
        kp_scores = human['kp_score']
        kp_preds = torch.cat((kp_preds, torch.unsqueeze((kp_preds[5,:]+kp_preds[6,:])/2,0)))
        kp_scores = torch.cat((kp_scores, torch.unsqueeze((kp_scores[5,:]+kp_scores[6,:])/2,0)))
        for n in range(kp_scores.shape[0]):
            if kp_scores[n] <= 0.05:
                kp_preds[n] = torch.zeros_like(kp_preds[n])
                continue
        kps.append(kp_preds)
        scores.append(kp_scores)
        
        x_nonzero = kp_preds[:,0]
        y_nonzero = kp_preds[:,1]
        x_nonzero = x_nonzero[x_nonzero != 0]
        y_nonzero = y_nonzero[y_nonzero != 0]
        
        min_x = int(torch.min(x_nonzero))
        min_y = int(torch.min(y_nonzero))
        max_x = int(torch.max(x_nonzero))
        max_y = int(torch.max(y_nonzero))
        
        area = (max_x - min_x) * (max_y - min_y)
        areas.append(float(area))
        
        coor = (min_y, max_y, min_x, max_x)
        coordinates.append(coor)
        
        
    if len(kps):
        # return the set of keypoints covering largest area and corner coordinates
        idx = np.argmax(areas)
        return kps[idx], scores[idx], coordinates[idx]
    
def vis_kps(kp_preds, kp_scores, coor, orig_dim):
    '''
    kps: keypoints of a single person
    coor: (min_y, max_y, min_x, max_x)
    orig_dim: dimensions of the original image
    return rendered letter-boxed image
    '''
    l_pair = [
        (0, 1), (0, 2), (1, 3), (2, 4),  # Head
        (5, 6), (5, 7), (7, 9), (6, 8), (8, 10),
        (17, 11), (17, 12),  # Body
        (11, 13), (12, 14), (13, 15), (14, 16)
    ]

    p_color = [(0, 255, 255), (0, 191, 255),(0, 255, 102),(0, 77, 255), (0, 255, 0), #Nose, LEye, REye, LEar, REar
                (77,255,255), (77, 255, 204), (77,204,255), (191, 255, 77), (77,191,255), (191, 255, 77), #LShoulder, RShoulder, LElbow, RElbow, LWrist, RWrist
                (204,77,255), (77,255,204), (191,77,255), (77,255,191), (127,77,255), (77,255,127), (0, 255, 255)] #LHip, RHip, LKnee, Rknee, LAnkle, RAnkle, Neck
    line_color = [(0, 215, 255), (0, 255, 204), (0, 134, 255), (0, 255, 50), 
                (77,255,222), (77,196,255), (77,135,255), (191,255,77), (77,255,77), 
                (77,222,255), (255,156,127), 
                (0,127,255), (255,127,77), (0,77,255), (255,77,36)]
    
    height,width = orig_dim
    min_y, max_y, min_x, max_x = coor
    
    bleed = 8
    min_x = max(min_x - bleed, 0) // 2
    min_y = max(min_y - bleed, 0) // 2
    max_x = min(max_x + bleed, width) // 2
    max_y = min(max_y + bleed, height) // 2
    
    # create black background
    img = np.zeros((int(height/2), int(width/2), 3), dtype=np.uint8)
    part_line = {}
    # Draw keypoints
    for n in range(kp_scores.shape[0]):
        cor_x, cor_y = int(kp_preds[n, 0]), int(kp_preds[n, 1])
        if kp_scores[n] <= 0.05:
            continue
        part_line[n] = (int(cor_x/2), int(cor_y/2))
        bg = img.copy()
        cv2.circle(bg, (int(cor_x/2), int(cor_y/2)), 2, p_color[n], -1)
        # Now create a mask of logo and create its inverse mask also
        transparency = max(0, min(1, kp_scores[n]))
        img = cv2.addWeighted(bg, transparency, img, 1-transparency, 0)
    # Draw limbs
    for i, (start_p, end_p) in enumerate(l_pair):
        if start_p in part_line and end_p in part_line:
            start_xy = part_line[start_p]
            end_xy = part_line[end_p]
            bg = img.copy()

            X = (start_xy[0], end_xy[0])
            Y = (start_xy[1], end_xy[1])
            mX = np.mean(X)
            mY = np.mean(Y)
            length = ((Y[0] - Y[1]) ** 2 + (X[0] - X[1]) ** 2) ** 0.5
            angle = math.degrees(math.atan2(Y[0] - Y[1], X[0] - X[1]))
            stickwidth = (kp_scores[start_p] + kp_scores[end_p]) + 1
            polygon = cv2.ellipse2Poly((int(mX),int(mY)), 
                                       (int(length/2), stickwidth), 
                                       int(angle), 0, 360, 1)
            cv2.fillConvexPoly(bg, polygon, line_color[i])
            transparency = max(0, min(1, 0.5*(kp_scores[start_p] + 
                                              kp_scores[end_p])))
            img = cv2.addWeighted(bg, transparency, img, 1-transparency, 0)
                
    cropped_img = img[min_y:max_y, min_x:max_x, :]
    vis = letterbox_image(cropped_img, bg=0)
    
    return vis
    
def img_to_vis(img_list, del_list = None):
    """
    Take in an array of numpy images as input and output a tensor with the 
    visualisation of each one
    """
    dim = opt.vis_dim
    
    # load models
    det_model, pose_model = load_models()
    
    vis_list = []
    for i in range(len(img_list)):
        orig_dim = img_list[i].shape[:2]
        result = predict_numpy_image(img_list[i], det_model, pose_model)
        if result is not None:
            # when there is at least one pose detection
            (keypoints, scores, coor) = process_results(result, orig_dim)
            vis = vis_kps(keypoints, scores, coor, orig_dim)
            vis_list.append(vis)
            continue
        else:
            # either no humans are detected successfully
            # or pose estimation failed for detected humans
            vis_list.append(np.zeros((dim, dim, 3), dtype=np.uint8))
            if del_list is not None:
                del_list.append(i)
                
    return np.stack(vis_list)

8. Convert images into keypoint visualisations

In [0]:
train_del = []
val_del = []
if not load_vis:
    x_train_vis = img_to_vis(x_train, train_del)
    x_val_vis = img_to_vis(x_val, val_del)
    # BGR to RGB
    x_train_vis = x_train_vis[:,:,:,::-1].transpose((0, 3, 1, 2)).copy()
    x_val_vis = x_val_vis[:,:,:,::-1].transpose((0, 3, 1, 2)).copy()
    
    # delete non-detections to avoid interference with training
    x_train = [item for itr, item in enumerate(x_train) if itr not in train_del]
    x_train_vis = np.delete(x_train_vis, train_del, 0)
    y_train = np.delete(y_train, train_del, 0)
    x_val = [item for itr, item in enumerate(x_val) if itr not in val_del]
    x_val_vis = np.delete(x_val_vis, val_del, 0)
    y_val = np.delete(y_val, val_del, 0)

NameError: ignored

# Augmentation

In [0]:
import imgaug as ia
from imgaug import augmenters as iaa

augmentation_seq = iaa.Sequential([
    iaa.Multiply((0.9, 1.1)),
    iaa.Fliplr(0.5),
    iaa.Affine(rotate=(-20, 20), scale=(0.9, 1.1), translate_px=(-3, 3)),
    iaa.PerspectiveTransform(scale=(0.0, 0.05), keep_size=True)
])

augmentations = 10
x_train_vis_aug = []

transposed = np.transpose(x_train_vis, (0, 2, 3, 1)).astype(np.uint8)

for i in range(augmentations):
    augmentation_batch = []
    for image in transposed:
        augmentation_batch.append(augmentation_seq.augment_image(image))
    x_train_vis_aug.append(augmentation_batch)

x_train_vis_aug = np.array(x_train_vis_aug)
x_train_vis_aug = np.transpose(x_train_vis_aug, (0, 1, 4, 2, 3))

# Pose Estimation to Pose Classification

All are pytorch tensors

Training keypoints:  `x_train_vis`

Validation keypoints: `x_val_vis`

Training ground truth: `y_train`

Validation ground truth: `y_val`

## Convert to tensors

In [0]:
x_train_vis = torch.tensor(x_train_vis, dtype=torch.float32)
x_train_vis_aug = torch.tensor(x_train_vis_aug, dtype=torch.float32)
x_val_vis = torch.tensor(x_val_vis, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
y_val = torch.tensor(y_val, dtype=torch.long)

In [0]:
x_train_vis /= 256.0
x_train_vis_aug /= 256.0
x_val_vis /= 256.0

## *Define neural network here*

In [0]:
class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(16 * 13 * 13, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(512, 128)
        self.fc4 = nn.Linear(256, 11)
        self.do  = nn.Dropout()

    def forward(self, x):
        x = self.pool(F.leaky_relu(self.conv1(x)))
        x = self.do(x)
        x = self.pool(F.leaky_relu(self.conv2(x)))
        x = self.do(x)
        x = x.view(-1, self.num_flat_features(x))
        x = self.fc1(x)
        x = self.do(x)
        x = F.leaky_relu(x)
        x = self.fc2(x)
        x = self.do(x)
        x = F.leaky_relu(x)
#         x = self.fc3(x)
#         x = self.do(x)
#         x = F.relu(x)
        x = self.fc4(x)
        return x
    
    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features
    
model = Model()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adadelta(model.parameters(), lr=0.1)

## Load model

Method 1

In [0]:
best_model_path = ''
infer_model = torch.load(best_model_path)

Method 2

In [0]:
# infer_model = Model()
# infer_model.load_state_dict(torch.load('/content/Kitten/test.pth'))
# infer_model.eval()

Model(
  (fc1): Linear(in_features=34, out_features=256, bias=True)
  (fc2): Linear(in_features=256, out_features=256, bias=True)
  (fc3): Linear(in_features=512, out_features=128, bias=True)
  (fc4): Linear(in_features=256, out_features=11, bias=True)
  (do): Dropout(p=0.5)
)

## Evaluation

In [0]:
! pip install folium==0.2.1 imgaug==0.2.5 pillow==5.4.1
! pip install --force-reinstall --no-warn-script-location -q https://ai-camp.s3-us-west-2.amazonaws.com/AiCampEval-1.7-py3-none-any.whl

[31mERROR: Invalid requirement: 'pillow=5.4.1'
= is not a valid operator. Did you mean == ?[0m


In [0]:
from AiCampEval import eval_submit

def evaluate_images(list_of_np_arrays):
    new_h = 480
    for img in list_of_np_arrays:
        width, height = img.shape[:2]
        new_w = width * new_h // height
        img = resized_image = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
    preprocessed_imgs = img_to_vis( list_of_np_arrays )
    preprocessed_imgs = preprocessed_imgs[:,:,:,::-1].transpose((0, 3, 1, 2)).copy()
    infer_model.eval()
    preprocessed_imgs = torch.tensor(preprocessed_imgs, dtype=torch.float32)
    preprocessed_imgs = preprocessed_imgs.cuda()
    preprocessed_imgs.div(256.)
    output = infer_model(preprocessed_imgs)
    output = output.cpu()
    pred = torch.argmax(output, 1).numpy()
    # Convert the list of class_indices to a list of predictions (in str)
    # predictions is a list of labels: ['KoreanHeart', 'KoreanHeart','ChairPose',.......]
    predictions = [ key_dict[k] for k in pred ]
    return predictions

In [0]:
TEAM_ID = "Team Chi"
SUBMISSION_TYPE = "testset_11classes_1_01010"

eval_submit(evaluate_images, SUBMISSION_TYPE, TEAM_ID)


Predicting batch 1/1...
Loading pose model from ./models/sppe/duc_se.pth

Total time taken for model evaluation: 40.847s

Submitting predictions
{'accuracy': 0.6014,
 'submission_time': '2019-06-13 09:57:07.789606+08:00',
 'submission_type': 'testset_11classes_1_01010',
 'team_id': 'Team Chi'}
