# Dataset Writer for Norm-Split 모델

In [1]:
import cv2
import copy
import numpy as np
import os, json, glob
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

from tqdm import tqdm
from tensorflow import keras
from matplotlib.pyplot import figure

In [5]:
# Filesystem layout used by the dataset conversion.
# NOTE(review): none of these names are read by the functions visible in this
# section; presumably other cells join them into the actual input/output
# paths (e.g. input_json_dir / input_image_dir) -- confirm.
dir_output = '/mnt/sata3/everyone-tfrecord2'
dir_input = '/mnt/sata4/everyone'
image_dir = 'frames'
json_dir = 'data'
# NOTE(review): "landmarkd_dir" looks like a typo of "landmark_dir"; the name is
# left untouched because other cells may reference it.
landmarkd_dir = 'landmark2'
tfrecord_dir = 'tfrecord2'
temptfrecord_dir = 'temp_record'

# Names of the dataset splits; presumably iterated by the writer loop -- confirm.
phases = ['train', 'val']

In [10]:
def getStandard3DFacePoints():
    """Load the reference 3-D face model from ``standard3DFace.json``.

    The file is looked up relative to the current working directory. Its
    ``eye_right``, ``eye_left``, ``nose``, ``nose_bottom`` and ``jaws`` entries
    are converted to float32 arrays and stacked vertically in that order.

    Returns:
        The stacked float32 array, or ``None`` (after printing a message) when
        the model file does not exist.
    """
    model_path = 'standard3DFace.json'
    if not os.path.isfile(model_path):
        print(model_path, 'not found!')
        return None

    with open(model_path, 'r') as model_file:
        model = json.load(model_file)

    # The stacking order defines the landmark indices used by the callers.
    part_names = ('eye_right', 'eye_left', 'nose', 'nose_bottom', 'jaws')
    parts = [np.array(model[part], dtype='float32') for part in part_names]
    return np.vstack(parts)

In [11]:
# noinspection PyShadowingNames
def estimateHeadPosition(refined_landmarks, position, camera_matrix, camera_distortion):
    """Estimate the head pose with two successive ``cv2.solvePnP`` calls.

    The first call uses the EPnP solver to obtain a starting rotation and
    translation; the second call is seeded with that pose (extrinsic guess
    enabled) and uses solvePnP's default method.

    Args:
        refined_landmarks: image points handed to ``solvePnP``.
        position: object (3-D model) points handed to ``solvePnP``.
        camera_matrix: camera intrinsic matrix.
        camera_distortion: distortion coefficients.

    Returns:
        ``(rvec, tvec)`` as produced by the second solve.
    """
    # Stage 1: EPnP provides the initial pose.
    _, rvec_init, tvec_init = cv2.solvePnP(
        position, refined_landmarks, camera_matrix, camera_distortion,
        flags=cv2.SOLVEPNP_EPNP,
    )
    # Stage 2: re-solve starting from that pose (last argument is useExtrinsicGuess).
    _, rvec, tvec = cv2.solvePnP(
        position, refined_landmarks, camera_matrix, camera_distortion,
        rvec_init, tvec_init, True,
    )
    return rvec, tvec

In [12]:
def get_plane(p1, p2, p3):
    """Return the plane through three 3-D points as ``z = a*x + b*y + c``.

    Args:
        p1, p2, p3: length-3 numpy arrays holding the points.

    Returns:
        The tuple ``(a, b, c)`` of the slope/intercept form. The z component of
        the plane normal is used as a divisor, so a plane parallel to the Z
        axis yields non-finite values.
    """
    # Two edge vectors lie in the plane; their cross product is its normal.
    normal = np.cross(p3 - p1, p2 - p1)
    nx, ny, nz = normal

    # Every point p of the plane satisfies normal . p == offset (evaluated at p3).
    offset = np.dot(normal, p3)

    # Solve nx*x + ny*y + nz*z = offset for z.
    return -nx / nz, -ny / nz, offset / nz

In [13]:
def get_R(rotation_matrix, center):
    """Build a rotation matrix whose third axis points from the origin at ``center``.

    Args:
        rotation_matrix: 3x3 matrix; only its first column is used, as the
            reference x-axis.
        center: 3-element point (any shape reshapeable to ``(3,)``).

    Returns:
        A 3x3 array whose rows are the unit ``right``, ``down`` and ``forward``
        axes, in that order.
    """
    reference_x = rotation_matrix[:, 0]

    # Unit vector from the origin towards the target point.
    forward = (center / np.linalg.norm(center)).reshape(3)

    # "down" is perpendicular to both the viewing direction and the reference x-axis.
    down = np.cross(forward, reference_x)
    down = down / np.linalg.norm(down)

    # "right" completes the orthonormal frame.
    right = np.cross(down, forward)
    right = right / np.linalg.norm(right)

    return np.column_stack((right, down, forward)).T

In [14]:
def calculate_S(pt1, pt2, pt3, R, target_distance):
    """Build a matrix that shifts a plane along Z without scaling x or y.

    The three points are rotated by ``R`` and the plane through them is written
    as ``z = a*x + b*y + c`` (via ``get_plane``). With
    ``k = target_distance - c`` the result is

        S = |    1        0        0    |
            |    0        1        0    |
            | -a*k/c   -b*k/c   1+k/c   |

    S leaves x and y untouched and moves every point (x, y, z) of that plane
    to (x, y, z + k).

    Args:
        pt1, pt2, pt3: 3-D points defining the plane, before rotation.
        R: 3x3 rotation applied to the points first.
        target_distance: desired z-intercept of the plane after applying S.

    Returns:
        The 3x3 matrix S.
    """
    rotated = [np.reshape(np.dot(R, pt), 3) for pt in (pt1, pt2, pt3)]
    a, b, c = get_plane(*rotated)

    # Amount by which the plane has to travel along Z.
    k = target_distance - c

    shift_row = [-a * k / c, -b * k / c, 1 + k / c]
    return np.array([
        [1.0, 0.0, 0.0],
        [0.0, 1.0, 0.0],
        shift_row,
    ])

In [18]:
def read_data(json_path, image_path):
    """Load one frame's image and metadata and prepare the calibration inputs.

    Args:
        json_path: path of the frame's JSON metadata file.
        image_path: path of the frame's image file.

    Returns:
        ``(image, camera_matrix, camera_distortion, landmarks, face_3d, lookat)``
        where ``landmarks`` has shape (-1, 1, 2), ``face_3d`` has shape
        (-1, 1, 3) and ``lookat`` is a (3, 1) column. ``None`` is returned when
        the image is missing or cannot be decoded, when no camera parameters
        are available for the device, or when the JSON holds no landmarks.
    """
    # Check for the file before decoding so a missing image is reported as such.
    if not os.path.isfile(image_path):
        print(image_path, 'not exists!')
        return None

    image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
    if image is None:
        # cv2.imread reports an unreadable file by returning None instead of
        # raising; bail out here rather than crashing on image.shape below.
        print(image_path, 'could not be decoded!')
        return None

    with open(json_path, 'r') as f:
        data = json.load(f)

    params, dists = getFocalLengthAndDistortion(data['deviceName'])
    if params is None:
        return None

    fx, fy = params[0], params[1]
    # The principal point entries are swapped for landscape images.
    if image.shape[1] > image.shape[0]:  # width > height
        cx, cy = params[3], params[2]
    else:
        cx, cy = params[2], params[3]

    # Append one extra zero coefficient to the stored distortion values.
    camera_distortion = np.hstack((dists, 0))

    camera_matrix = np.array([
        [fx,  0, cx],
        [ 0, fy, cy],
        [ 0,  0, 1 ]
    ])

    landmarks = getLandmarksFromJson(data)

    if landmarks is None:
        print(image_path, 'has no landmark information!')
        return None

    # Work on a copy so the shared standardFace model is never modified.
    landmarks, face_3d = removeOutsideImage(image, landmarks, copy.deepcopy(standardFace))
    # make to 3-D array
    landmarks = landmarks.reshape(-1, 1, 2)
    face_3d = face_3d.reshape(-1, 1, 3)

    # Gaze target built from the negated XCam/YCam values with z = 0.
    lookat = np.array([-data['XCam'], -data['YCam'], 0])
    # cm to mm
    lookat = lookat * 10
    lookat = lookat.reshape((3, 1))

    return image, camera_matrix, camera_distortion, landmarks, face_3d, lookat

In [19]:
def normalizeDataAndGaze(image, face, camera_matrix, look_vector, head_rotate, landmarks):
    """Warp the image into normalized eye/face views and rotate the gaze vectors.

    Args:
        image: source image to warp.
        face: 3xN array of 3-D facial landmarks. Columns 0-5 and 6-11 are
            averaged into the two eye positions (presumably right and left
            eye -- confirm against the face model) and column 15 is used as
            the nose tip.
        camera_matrix: 3x3 camera intrinsic matrix.
        look_vector: (3, 1) gaze target.
        head_rotate: 3x3 head rotation matrix.
        landmarks: 2-D landmarks, transformed with the face warp for debugging.

    Returns:
        ``(warped_images, gaze, rotations)``: the warped images for the first
        eye, the second eye and the face; an array with one gaze vector per
        eye (each scaled so that its z component is -1); and the three
        rotation matrices in the same order as the images.
    """
    def landmark_mean(first, last):
        # Mean position of landmark columns [first, last).
        return np.array([sum(row) for row in face[:, first:last]]) / (last - first)

    def warp_matrix(norm_matrix, rotation):
        # Perspective transform: normalized intrinsics * S * R * inverse camera matrix.
        shift = calculate_S(right_eye, left_eye, nose_tip, rotation, normalize_distance)
        return np.dot(np.dot(norm_matrix, shift),
                      np.dot(rotation, np.linalg.inv(camera_matrix)))

    nose_tip = face[:, 15].reshape((3, 1))
    eye_center = landmark_mean(0, 12)
    right_eye = landmark_mean(0, 6)
    left_eye = landmark_mean(6, 12)

    warped_images = []
    gaze_vectors = []
    rotations = []

    # One normalized patch and one gaze vector per eye.
    for eye_position in (right_eye, left_eye):
        rotation = get_R(head_rotate, eye_position)
        eye_warp = warp_matrix(eye_norm, rotation)
        eye_patch = cv2.warpPerspective(image, eye_warp, normalize_eye_size)

        # Gaze direction from the eye to the target, in the rotated frame.
        gaze = np.dot(rotation, look_vector - eye_position.reshape((3, 1)))
        gaze = gaze / (-gaze[2])

        warped_images.append(eye_patch)
        gaze_vectors.append(gaze)
        rotations.append(rotation)
        if debug_print:
            drawCVImage(eye_patch)

    # Face view, centred halfway between the nose tip and the eye centre.
    face_center = (nose_tip + eye_center.reshape((3, 1))) / 2
    face_rotation = get_R(head_rotate, face_center)
    face_warp = warp_matrix(face_norm, face_rotation)
    face_patch = cv2.warpPerspective(image, face_warp, normalize_face_size)
    warped_landmarks = cv2.perspectiveTransform(landmarks, face_warp)

    warped_images.append(face_patch)
    rotations.append(face_rotation)
    if debug_print:
        drawLandmarks(face_patch, warped_landmarks.reshape((-1, 2)))

    return warped_images, np.array(gaze_vectors), rotations

In [20]:
def do_normalize(subject, frame):
    """Normalize one frame of one subject.

    Reads ``<subject>_<frame>.json`` / ``.jpg`` from the subject's folder below
    ``input_json_dir`` / ``input_image_dir``, undistorts the image and
    landmarks, estimates the head pose and produces the normalized views.

    Args:
        subject: subject identifier (also the sub-folder name).
        frame: frame identifier.

    Returns:
        ``(image_warped, gaze_vector, eyes, poses, R_first_eye, R_second_eye,
        R_face, lookat)``, or ``None`` when the frame could not be read.
        ``eyes`` is the two 3-D eye positions concatenated; ``poses`` is the
        three Rodrigues pose vectors (both eyes, then face) flattened.
    """
    name = subject + '_' + frame
    json_path = os.path.join(input_json_dir, subject, name) + '.json'
    image_path = os.path.join(input_image_dir, subject, name) + '.jpg'

    # read_data signals a bad frame either by raising or by returning None;
    # both cases are reported with the frame name and skipped.
    try:
        sample = read_data(json_path, image_path)
    except Exception as err:
        print('Failed to read data : ', name, repr(err))
        return None
    if sample is None:
        print('Failed to read data : ', name)
        return None
    image, camera_matrix, camera_distortion, landmarks, face_3d, lookat = sample

    # undistort landmark points and image
    landmarks = cv2.undistortPoints(landmarks, camera_matrix, camera_distortion, P=camera_matrix)
    image_undistorted = cv2.undistort(image, camera_matrix, camera_distortion)

    # pnp R and T
    # NOTE(review): the landmarks are already undistorted at this point, yet the
    # distortion coefficients are passed on to solvePnP again -- confirm this
    # is intended.
    hr, ht = estimateHeadPosition(landmarks, face_3d, camera_matrix, camera_distortion)
    face_3d = face_3d.reshape(-1, 3).T

    # Rodrigues expression to 3x3 rotation matrix
    hR = cv2.Rodrigues(hr)[0]  # rotation matrix
    translate = ht.reshape((3, 1))
    face = np.dot(hR, face_3d) + translate  # 3D positions of facial landmarks

    # warped images, rotated gaze vectors and the rotation used for each view
    image_warped, gaze_vector, R = normalizeDataAndGaze(image_undistorted,
                                                        face,
                                                        camera_matrix,
                                                        lookat,
                                                        hR,
                                                        landmarks)

    # Eye positions: mean of landmark columns 0-5 and 6-11 respectively.
    eye_right = np.array([sum(x) for x in face[:, 0:6]]) / 6
    eye_left = np.array([sum(x) for x in face[:, 6:12]]) / 6

    # Head rotation expressed in each normalized frame, as Rodrigues vectors.
    right_eye_pose = cv2.Rodrigues(np.dot(R[0], hR))[0]
    left_eye_pose = cv2.Rodrigues(np.dot(R[1], hR))[0]
    face_pose = cv2.Rodrigues(np.dot(R[2], hR))[0]

    eyes = np.append(eye_right, eye_left)
    poses = np.reshape(np.concatenate((right_eye_pose, left_eye_pose, face_pose), axis=0), (-1))

    return image_warped, gaze_vector, eyes, poses, R[0], R[1], R[2], lookat

In [23]:
def _bytes_feature(value):
    """Return a ``tf.train.Feature`` holding a single bytes value.

    Args:
        value: ``bytes``, ``str`` or an eager tensor wrapping one of them.
            Text is encoded as UTF-8 because ``BytesList`` only accepts bytes.

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` contains the one value.
    """
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()  # BytesList won't unpack a string from an EagerTensor.
    if isinstance(value, str):
        # Lets plain-text identifiers (subject / frame names) be passed directly.
        value = value.encode('utf-8')
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _bytes_feature2(value):
    """Return a ``tf.train.Feature`` whose bytes_list is ``value`` itself.

    Unlike ``_bytes_feature`` the argument is not wrapped in a list, so it must
    already be an iterable of byte strings (or an eager tensor, which is
    converted with ``.numpy()`` first).
    """
    eager_tensor_type = type(tf.constant(0))
    # BytesList won't unpack strings from an EagerTensor.
    payload = value.numpy() if isinstance(value, eager_tensor_type) else value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=payload))

def _float_feature(value):
    """Return a ``tf.train.Feature`` with a float_list built from ``value``.

    ``value`` is passed to ``FloatList`` unchanged, so it must be an iterable
    of floats.
    """
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Return a ``tf.train.Feature`` with an int64_list built from ``value``.

    ``value`` is passed to ``Int64List`` unchanged, so it must be an iterable
    of bool / enum / int / uint values.
    """
    int64_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int64_list)

def make_example(subject_name, frame_name, re, le, we, gaze, eyes, poses, rR, lR, cR, gaze2d):
    """Pack one normalized sample into a ``tf.train.Example``.

    ``subject_name``, ``frame_name`` and the three images (``re``, ``le``,
    ``we``) are stored as bytes features; every other argument is flattened to
    a 1-D list and stored as a float feature under its own name.
    """
    bytes_fields = (
        ('subject', subject_name),
        ('frame', frame_name),
        ('img_re', re),
        ('img_le', le),
        ('img_we', we),
    )
    float_fields = (
        ('gaze', gaze),
        ('eyes', eyes),
        ('poses', poses),
        ('rR', rR),
        ('lR', lR),
        ('cR', cR),
        ('gaze2d', gaze2d),
    )

    feature = {key: _bytes_feature(raw) for key, raw in bytes_fields}
    feature.update(
        (key, _float_feature(np.reshape(values, (-1)).tolist()))
        for key, values in float_fields
    )
    return tf.train.Example(features=tf.train.Features(feature=feature))

def _parse_image_function(example_raw):
    """Parse one serialized ``tf.Example`` with ``image_feature_description``.

    ``image_feature_description`` is a module-level feature spec that is not
    defined in this section.
    """
    parsed_example = tf.io.parse_single_example(example_raw, image_feature_description)
    return parsed_example