In [None]:
!pip install --upgrade scipy
!pip install scikit-video
!pip install ffmpeg-python
!pip install scikit-image
!pip uninstall opencv-python opencv-contrib-python -y
!pip install opencv-contrib-python
!pip install pyrender trimesh imageio
!pip install PyOpenGL PyOpenGL_accelerate
!pip install numpy==1.20.3


In [None]:
import numpy as np
import skvideo.io
from skimage import io
import matplotlib.pyplot as plt
import cv2
import pyrender
import trimesh
import imageio

In [None]:
# Display the installed OpenCV version as the cell output.
cv2.__version__

In [None]:
# Decode the whole clip into memory as a list of frames.
# Later cells treat these frames as RGB (they convert with cv2.COLOR_RGB2BGR
# before handing them to OpenCV).
path = 'Data/IMG_3862.mp4'
video = skvideo.io.vreader(path)

# vreader is a lazy generator; materialise it so frames can be indexed and reused.
frames = list(video)

print("# frames: ", len(frames))
print("Frame shape: ", frames[0].shape)

In [None]:
# Choosing keypoints and world coordinate representation
# First frame of the clip: the 2D keypoints below were picked on this image.
image = frames[0]
# io.imsave("firstFrame.jpg", image)  # uncomment to export the frame for picking points

# Hand-picked correspondences: points2d[i] is the pixel (x, y) in the first
# frame of the landmark whose world coordinates (x, y, z) are points3d[i].
# The two lists must stay the same length and in the same order.
points2d = [
    (333, 1271), (323, 1205), (320, 1130),
    (252, 1174), (242, 1105), (239, 1039),
    (484, 1195), (482, 1139), (473, 1057),
    (622, 1127), (621, 1072), (614, 994),
    (747, 1067), (751, 1014), (746, 935),
    (381, 975), (305, 900), (167, 957),
    (516, 919), (433, 850), (648, 865), (560, 802),
]

points3d = [
    (0.0, 0.0, 0.0), (0.0, 0.0, 1.5), (0.0, 0.0, 3.0),
    (0.0, 3.0, 0.0), (0.0, 3.0, 1.5), (0.0, 3.0, 3.0),
    (3.0, 0.0, 0.0), (3.0, 0.0, 1.5), (3.0, 0.0, 3.0),
    (6.0, 0.0, 0.0), (6.0, 0.0, 1.5), (6.0, 0.0, 3.0),
    (9.0, 0.0, 0.0), (9.0, 0.0, 1.5), (9.0, 0.0, 3.0),
    (3.0, 3.0, 3.0), (3.0, 6.0, 3.0), (0.0, 6.0, 3.0),
    (6.0, 3.0, 3.0), (6.0, 6.0, 3.0), (9.0, 3.0, 3.0), (9.0, 6.0, 3.0),
]


In [None]:
# Sanity check: echo the hand-picked 2D keypoints.
print(points2d)

In [None]:
# Implementing tracking
def get_patch(point, patch_size=8):
    """Return a square bounding box centred on ``point``.

    Args:
        point: (x, y) centre of the patch.
        patch_size: side length of the square.

    Returns:
        Tuple ``(x_top_left, y_top_left, width, height)``. The corner
        coordinates are truncated to int; width and height are both
        ``patch_size``.
    """
    half_side = patch_size / 2
    left = int(point[0] - half_side)
    top = int(point[1] - half_side)
    return (left, top, patch_size, patch_size)

# One MedianFlow tracker per keypoint, each seeded with a small patch around
# its point on the first frame (converted to BGR for OpenCV).
first_frame_bgr = cv2.cvtColor(frames[0], cv2.COLOR_RGB2BGR)
trackers = []
for seed_point in points2d:
    new_tracker = cv2.legacy.TrackerMedianFlow_create()
    new_tracker.init(first_frame_bgr, get_patch(seed_point))
    trackers.append(new_tracker)

# updated_trackers[k] holds the keypoint positions for frame k, in the same
# order as points2d. Frame 0 uses the hand-picked points; a lost point is None.
updated_trackers = [points2d]
for rgb_frame in frames[1:]:
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    centers = []
    for active_tracker in trackers:
        found, box = active_tracker.update(bgr_frame)
        if not found:
            centers.append(None)
            continue
        # box is (x, y, w, h); report the centre of the tracked patch.
        box_left, box_top, box_w, box_h = box
        centers.append((int(box_left + box_w / 2), int(box_top + box_h / 2)))

    updated_trackers.append(centers)

In [None]:
# Entry 0 is the hand-picked points2d list (the first frame is not tracked).
print(updated_trackers[0])

In [None]:
# Creating tracking video
def create_tracking_video(frames, tracked_points, out_path, fps=30):
    """Write a video with the tracked keypoints drawn as green dots.

    Args:
        frames: sequence of RGB frames, all the same size as ``frames[0]``.
        tracked_points: one list of points per frame; each point is an
            ``(x, y)`` pair or ``None`` when tracking failed for that point.
        out_path: destination file, encoded with the ``mp4v`` codec.
        fps: frame rate of the output video.

    Raises:
        IOError: if the video writer cannot be opened for ``out_path``.

    The input frames are left untouched: markers are drawn on the BGR copy
    produced by the colour conversion, not on the caller's arrays.
    """
    height, width, _ = frames[0].shape

    writer = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
    if not writer.isOpened():
        raise IOError(f"Could not open video writer for {out_path!r}")

    try:
        for frame, pts in zip(frames, tracked_points):
            # cvtColor returns a new array, so drawing on it does not modify
            # `frame` (which later cells reuse).
            frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            for point in pts:
                if point is not None:
                    cv2.circle(frame_bgr, (int(point[0]), int(point[1])), 5, (0, 255, 0), -1)
            writer.write(frame_bgr)
    finally:
        # Always finalise the file, even if a frame fails to convert or draw.
        writer.release()
    
# Render the tracked keypoints over every frame and save the result.
create_tracking_video(frames, updated_trackers, 'Output/tracking.mp4')
        

In [None]:
# Camera calibration
def solve_projection_matrix(points2D, points3D):
    """Estimate a 3x4 projection matrix from 2D-3D correspondences.

    Solves the linear least-squares system obtained by fixing the last entry
    of the projection matrix to 1, which leaves 11 unknowns and two equations
    per correspondence.

    Args:
        points2D: sequence of ``(u, v)`` pixel coordinates; an entry may be
            ``None`` (point not tracked), in which case the pair is skipped.
        points3D: sequence of ``(X, Y, Z)`` world coordinates, aligned with
            ``points2D``.

    Returns:
        ``numpy`` array of shape (3, 4) whose bottom-right entry is 1.
    """
    rows = []
    targets = []

    for pixel, world in zip(points2D, points3D):
        # Skip correspondences whose 2D point was lost by the tracker.
        if pixel is None:
            continue

        X, Y, Z = world
        u, v = pixel

        # Equation for the u coordinate, then the one for the v coordinate.
        rows.append([X, Y, Z, 1, 0, 0, 0, 0, -u*X, -u*Y, -u*Z])
        rows.append([0, 0, 0, 0, X, Y, Z, 1, -v*X, -v*Y, -v*Z])
        targets.extend((u, v))

    solution = np.linalg.lstsq(np.array(rows), np.array(targets), rcond=None)[0]

    # Re-attach the fixed entry and fold the 12 values into matrix form.
    return np.append(solution, 1).reshape(3, 4)

# One projection matrix per frame, each fitted from that frame's tracked points
# (updated_trackers has one point list per frame).
projection_matrices = [solve_projection_matrix(frame_points, points3d) for frame_points in updated_trackers]

In [None]:
# Sanity check: matrices for the first two frames and the total count.
print(projection_matrices[0])
print(projection_matrices[1])
print(len(projection_matrices))

In [None]:
# Projecting cube onto scene
# Eight corners of a 3x3x3 cube spanning z = 3 to z = 6: first the four corners
# of the lower face, then the matching corners of the upper face.
axis_pts = np.array(
    [[x, y, z]
     for z in (3, 6)
     for x, y in ((0, 0), (0, 3), (3, 3), (3, 0))],
    dtype=np.float32,
)

def draw(img, imgpts):
    """Draw a cuboid from eight projected corner points.

    Args:
        img: image to draw on.
        imgpts: eight 2D points; indices 0-3 are the bottom face, 4-7 the top
            face, and corner ``k`` is joined to corner ``k + 4``. Values are
            cast to int32 pixel coordinates.

    Returns:
        The image returned by the OpenCV drawing calls.

    The colour names in the comments assume ``img`` is in BGR channel order.
    """
    corners = np.int32(imgpts).reshape(-1, 2)
    bottom_face = corners[:4]
    top_face = corners[4:]

    # Bottom face in green; the negative thickness requests a filled contour.
    img = cv2.drawContours(img, [bottom_face], -1, (0, 255, 0), -3)

    # Vertical edges in blue, joining each bottom corner to the one above it.
    for k in range(4):
        img = cv2.line(img, tuple(corners[k]), tuple(corners[k + 4]), (255), 3)

    # Top face outline in red.
    img = cv2.drawContours(img, [top_face], -1, (0, 0, 255), 3)
    return img

def project_points(axis_pts, projection_matrix):
    """Project 3D points into the image with a 3x4 projection matrix.

    Args:
        axis_pts: array of shape (N, 3) with world coordinates.
        projection_matrix: 3x4 projection matrix.

    Returns:
        Array of shape (N, 2) with the projected pixel coordinates.
    """
    point_count = axis_pts.shape[0]

    # Append a column of ones to obtain homogeneous coordinates (N, 4).
    homogeneous = np.hstack([axis_pts, np.ones((point_count, 1))])

    # Project every point; the result has one (x*w, y*w, w) row per point.
    projected = np.dot(projection_matrix, homogeneous.T).T

    # Perspective divide by the homogeneous coordinate w.
    scale = projected[:, 2:3]
    return projected[:, :2] / scale


def create_cube_video(frames, axis_pts, projection_matrices, output_path, fps=30):
    """Write a video with a cube projected into every frame.

    Args:
        frames: sequence of RGB frames, all the same size as ``frames[0]``.
        axis_pts: (8, 3) array with the cube corners in world coordinates.
        projection_matrices: one 3x4 projection matrix per frame.
        output_path: destination file, encoded with the ``mp4v`` codec.
        fps: frame rate of the output video.

    Raises:
        IOError: if the video writer cannot be opened for ``output_path``.
    """
    height, width, _ = frames[0].shape
    video_writer = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
    if not video_writer.isOpened():
        raise IOError(f"Could not open video writer for {output_path!r}")

    try:
        for frame, proj_matrix in zip(frames, projection_matrices):
            projected_pts = project_points(axis_pts, proj_matrix)
            # cv2.VideoWriter expects BGR, and draw() picks its colours in BGR.
            # Convert first; this also yields a fresh array, so the caller's
            # frame is not drawn on.
            frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            frame_with_cube = draw(frame_bgr, projected_pts)
            video_writer.write(frame_with_cube)
    finally:
        # Always finalise the file, even if projection or drawing fails.
        video_writer.release()

# Project the cube into every frame with that frame's matrix and save the result.
create_cube_video(frames, axis_pts, projection_matrices, 'Output/cube.mp4')


# Bells and Whistles

In [None]:
# Reload the clip so this section starts from freshly decoded frames; anything
# drawn in place on the earlier `frames` list does not carry over.
path = 'Data/IMG_3862.mp4'
video = skvideo.io.vreader(path)

# vreader is a lazy generator; materialise it so frames can be indexed.
frames = list(video)

print("# frames: ", len(frames))
print("Frame shape: ", frames[0].shape)

In [None]:
# Loading mesh and setting up render
duck_path = 'Data/Duck.glb'
duck = trimesh.load(duck_path, file_type='glb')

# Only the first geometry stored in the loaded file is rendered.
duck_geometries = list(duck.geometry.values())
duckmesh = pyrender.Mesh.from_trimesh(duck_geometries[0])

# The scene holds just the mesh; a camera is added per frame by the render loop.
scene = pyrender.Scene()
scene.add(duckmesh)

In [None]:
def overlay_frames(original_frame, rendered_frame, alpha=0.5):
    """Blend a rendered image over a video frame.

    Args:
        original_frame: the video frame.
        rendered_frame: the rendered image; it is passed to
            ``cv2.addWeighted`` together with ``original_frame``, so both
            must have the same size and channel count.
        alpha: weight of ``rendered_frame`` in the blend; the video frame
            gets ``1 - alpha``. Defaults to an even 50/50 mix.

    Returns:
        The blended image ``(1 - alpha) * original_frame + alpha * rendered_frame``.
    """
    return cv2.addWeighted(original_frame, 1 - alpha, rendered_frame, alpha, 0)

def save_video(frames, output_path, fps=30, frame_size=(1080, 1920)):
    """Write a sequence of frames to an ``mp4v``-encoded video file.

    Args:
        frames: iterable of images, written unchanged. OpenCV's writer
            expects BGR channel order and frames matching ``frame_size``.
        output_path: destination file.
        fps: frame rate of the output video.
        frame_size: ``(width, height)`` of the output video.

    Raises:
        IOError: if the video writer cannot be opened for ``output_path``.
    """
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video = cv2.VideoWriter(output_path, fourcc, fps, frame_size)
    if not video.isOpened():
        raise IOError(f"Could not open video writer for {output_path!r}")

    try:
        for frame in frames:
            video.write(frame)
    finally:
        # Always finalise the file, even if writing a frame raises.
        video.release()

# Render the mesh from each frame's estimated camera and blend it over the footage.

# Size the viewport from the footage so the render and the frame can be blended
# directly, instead of hard-coding the resolution.
frame_height, frame_width = frames[0].shape[:2]
renderer = pyrender.OffscreenRenderer(viewport_width=frame_width, viewport_height=frame_height)

# OpenCV cameras look along +Z with +Y pointing down; pyrender follows the
# OpenGL convention (camera looks along -Z, +Y up). This flips Y and Z.
cv_to_gl = np.diag([1.0, -1.0, -1.0])

rendered_frames = []
try:
    for frame, P in zip(frames, projection_matrices):
        # P ~ K [R | -R C]: K intrinsics, R world->camera rotation, and the
        # third output is the camera centre C in homogeneous coordinates.
        K, R, t = cv2.decomposeProjectionMatrix(P)[:3]

        # P is only defined up to scale, so normalise K to K[2, 2] == 1 before
        # reading focal lengths and the principal point in pixels.
        K = K / K[2, 2]
        camera_center = (t[:3] / t[3]).reshape(3)

        camera = pyrender.IntrinsicsCamera(fx=K[0, 0], fy=K[1, 1], cx=K[0, 2], cy=K[1, 2])

        # pyrender wants the camera-to-world pose: rotation R^T (inverse of the
        # world->camera rotation) expressed in OpenGL camera axes, positioned
        # at the camera centre.
        camera_pose = np.eye(4)
        camera_pose[:3, :3] = R.T @ cv_to_gl
        camera_pose[:3, 3] = camera_center

        # scene.add returns the Node wrapping the camera; that Node (not the
        # Camera object) is what remove_node needs.
        camera_node = scene.add(camera, pose=camera_pose)
        try:
            color, _ = renderer.render(scene)
        finally:
            scene.remove_node(camera_node)

        combined_frame = overlay_frames(frame, color)
        # Both inputs are RGB; the OpenCV video writer expects BGR.
        rendered_frames.append(cv2.cvtColor(combined_frame, cv2.COLOR_RGB2BGR))
finally:
    # Release the offscreen rendering context.
    renderer.delete()

save_video(rendered_frames, 'Output/duck.mp4', frame_size=(frame_width, frame_height))