# In [None]:
import argparse
import cv2
import sys
import numpy as np

# In [34]:
def get_frame_time(cap) -> int:
    """Return the capture's current position, truncated to whole milliseconds.

    The value is read from ``cap`` through ``cv2.CAP_PROP_POS_MSEC``.
    """
    position_ms = cap.get(cv2.CAP_PROP_POS_MSEC)
    return int(position_ms)

def between(cap, lower: int, upper: int) -> bool:
    """Tell whether the capture's current time lies in ``[lower, upper)``.

    Both bounds are in milliseconds; ``lower`` is inclusive and ``upper`` is
    exclusive.
    """
    now_ms = get_frame_time(cap)
    return now_ms >= lower and now_ms < upper

# In [35]:
def gray_scale(frame):
    """Return a gray rendition of a BGR frame that still has three channels.

    The frame is converted BGR -> gray and then gray -> BGR.
    """
    single_channel = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    return cv2.cvtColor(single_channel, cv2.COLOR_GRAY2BGR)

# In [36]:
def gaussian_blur(frame, kernel_size):
    """Blur ``frame`` with a square Gaussian kernel.

    ``kernel_size`` is used for both kernel dimensions; the sigma argument
    passed to ``cv2.GaussianBlur`` is fixed at 15.
    """
    square_kernel = (kernel_size, kernel_size)
    return cv2.GaussianBlur(frame, square_kernel, 15)

# In [37]:
def bilateral_blur(frame, d):
    """Run ``cv2.bilateralFilter`` on ``frame`` with neighbourhood diameter ``d``.

    The two sigma arguments of the filter are both fixed at 50.
    """
    sigma_color = sigma_space = 50
    return cv2.bilateralFilter(frame, d, sigma_color, sigma_space)

# In [38]:
def bgr_mask(frame, lower_bound, upper_bound):
    """Return the ``cv2.inRange`` mask of ``frame`` for the given bounds.

    ``lower_bound`` and ``upper_bound`` are forwarded unchanged as the
    per-channel limits.
    """
    return cv2.inRange(frame, lower_bound, upper_bound)

# In [39]:
def morphology_operation(mask, method, kernel):
    """Apply the morphological operation ``method`` to ``mask``.

    Thin wrapper around ``cv2.morphologyEx``; ``kernel`` is the structuring
    element handed to it.
    """
    return cv2.morphologyEx(mask, method, kernel)

# In [40]:
def sobel_filter(frame, ksize, scale=1, delta=0):
    """Return a single-channel Sobel edge image of a BGR frame.

    The x and y derivatives are computed separately, each normalised by the
    sum of absolute weights of its own Sobel kernel so that the response does
    not grow with ``ksize``. The two absolute responses are averaged and the
    result is min-max stretched to 0..255.

    Args:
        frame: BGR image.
        ksize: Sobel aperture size, forwarded to ``cv2.getDerivKernels`` and
            ``cv2.Sobel``.
        scale: Extra multiplier applied on top of the kernel normalisation.
            The default of 1 keeps only the normalisation.
        delta: Offset forwarded to ``cv2.Sobel``.

    Returns:
        The combined, rescaled edge image.
    """
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    abs_gradients = []
    # (1, 0) is the derivative along x, (0, 1) the derivative along y.
    for dx, dy in ((1, 0), (0, 1)):
        kx, ky = cv2.getDerivKernels(dx=dx, dy=dy, ksize=ksize, normalize=False, ktype=cv2.CV_64F)
        # The 2-D kernel is the outer product of the two separable 1-D kernels.
        kernel_abs_sum = np.sum(np.abs(np.outer(kx, ky)))
        gradient = cv2.Sobel(
            gray, cv2.CV_64F, dx, dy,
            ksize=ksize,
            scale=scale / kernel_abs_sum,  # Normalize the brightness.
            delta=delta,
        )
        abs_gradients.append(cv2.convertScaleAbs(gradient))

    sobel_combined = cv2.addWeighted(abs_gradients[0], 0.5, abs_gradients[1], 0.5, 0)
    return cv2.normalize(sobel_combined, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX)

# In [41]:
def hough_trans(frame, blur_size, dp, param1, param2):
    """Detect circles with the Hough gradient method and draw them on ``frame``.

    The frame is converted to gray and median-blurred with ``blur_size``
    before detection. Each detected circle is drawn in place as a green
    outline plus a small red dot at its centre. ``frame`` is returned (and
    is left untouched when nothing is detected).
    """
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    # Better than Gaussian, faster than Bilateral.
    smoothed = cv2.medianBlur(gray, blur_size)
    found = cv2.HoughCircles(
        smoothed,
        cv2.HOUGH_GRADIENT,
        dp=dp,
        minDist=100,
        param1=param1,
        param2=param2,
        minRadius=0,
        maxRadius=0,
    )

    if found is None:
        return frame

    rounded = np.uint16(np.around(found))
    for circle in rounded[0, :]:
        centre = (circle[0], circle[1])
        radius = circle[2]
        cv2.circle(frame, centre, radius, (0, 255, 0), 2)
        cv2.circle(frame, centre, 2, (0, 0, 255), 3)

    return frame

# In [42]:
def template_matching(template_path, frame, alpha, probability_mode=0):
    """Locate a template in ``frame`` using blended colour and edge matching.

    Two ``TM_SQDIFF_NORMED`` maps are computed, one on the raw images and one
    on their Sobel edge images, and blended as
    ``alpha * colour + (1 - alpha) * edges``. The minimum of the blended map
    is taken as the best match and a green rectangle of the template's size
    is drawn on ``frame`` in place.

    Args:
        template_path: Path of the template image, read with ``cv2.imread``.
        frame: BGR frame to search; it is modified in place.
        alpha: Weight of the colour match in the blend.
        probability_mode: When truthy, a likelihood map is also produced.

    Returns:
        ``(frame, probability)`` where ``probability`` is ``None`` unless
        ``probability_mode`` is truthy, in which case it is a BGR image of
        ``1 - match score`` resized to the frame size.

    Raises:
        FileNotFoundError: If the template image cannot be read.
    """
    template = cv2.imread(template_path)
    if template is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Template image could not be read: {template_path}")
    tH, tW = template.shape[:2]

    edges_video = sobel_filter(frame, 3)
    edges_template = sobel_filter(template, 5)

    result_color = cv2.matchTemplate(frame, template, cv2.TM_SQDIFF_NORMED)
    result_edges = cv2.matchTemplate(edges_video, edges_template, cv2.TM_SQDIFF_NORMED)

    # The match result considers weighted color and edges matching.
    final_result = alpha * result_color + (1 - alpha) * result_edges
    _, _, minLoc, _ = cv2.minMaxLoc(final_result)
    top_left = minLoc
    bottom_right = (top_left[0] + tW, top_left[1] + tH)  # Rectangular size is set to the template size
    cv2.rectangle(frame, top_left, bottom_right, (0, 255, 0), 3)

    output_frame_probability = None

    if probability_mode:
        # Reverse of match result = probability. Clamp to [0, 1] so that the
        # uint8 cast below cannot wrap around on out-of-range values.
        likelihood = np.clip(1 - final_result, 0.0, 1.0)
        frame_h, frame_w = frame.shape[:2]
        # Resize as the likelihood has a smaller dimension due to correlation operation.
        likelihood_upsampled = cv2.resize(likelihood, (frame_w, frame_h), interpolation=cv2.INTER_LINEAR)
        likelihood_scaled = np.uint8(255 * likelihood_upsampled)  # Change to normal values.
        output_frame_probability = cv2.cvtColor(likelihood_scaled, cv2.COLOR_GRAY2BGR)

    return frame, output_frame_probability
        

# In [43]:
def find_contours(mask, threshold):
    """Return the outermost contours of ``mask`` whose area exceeds ``threshold``.

    Contours with an area of ``threshold`` or less are treated as noise and
    dropped.
    """
    outer_contours, _hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    return [
        candidate
        for candidate in outer_contours
        if cv2.contourArea(candidate) > threshold
    ]

# In [44]:
def collision_pos_cal(prev_detections, detections):
    """Estimate where a collision happened from merged bounding boxes.

    This method is based on the idea that when there is a collision, the
    rectangles of the colliding objects merge. A rectangle with an abnormal
    size therefore indicates the collision. This is only safe when the other
    objects' shapes do not change vastly. The collision position is assumed
    to be the centre of the merged rectangle.

    Args:
        prev_detections: Boxes of the previous frame as
            ``((x0, y0), (x1, y1))`` pairs; they provide the reference
            average width and height.
        detections: Boxes of the current frame in the same format.

    Returns:
        The ``(x, y)`` centre of the first current box that is more than 1.5
        times wider or taller than the previous average, or ``None`` when
        there is no such box or no previous box to compare against.
    """
    if not prev_detections:
        # No reference size available; the averages would be NaN.
        return None

    avg_width = np.mean([br[0] - tl[0] for tl, br in prev_detections])
    avg_height = np.mean([br[1] - tl[1] for tl, br in prev_detections])

    for tl, br in detections:
        curr_width = br[0] - tl[0]
        curr_height = br[1] - tl[1]
        # The merged rectangle is the one which gets bigger.
        if curr_width > 1.5 * avg_width or curr_height > 1.5 * avg_height:
            return ((tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2)

    return None

# In [45]:
def collision_effect(collision_center, frame):
    """Alpha-blend ``explosion.png`` onto ``frame``, centred on a point.

    The sprite is read with its alpha channel and blended in place. The part
    of the sprite that would fall outside the frame is cropped, so a
    collision close to a border no longer fails with a shape mismatch.

    Args:
        collision_center: ``(x, y)`` pixel position of the sprite centre.
        frame: Frame to draw on; it is modified in place.

    Returns:
        ``frame``.

    Raises:
        FileNotFoundError: If ``explosion.png`` cannot be read.
        ValueError: If the sprite has no alpha channel.
    """
    explosion_img = cv2.imread("explosion.png", cv2.IMREAD_UNCHANGED)
    if explosion_img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError("Effect image could not be read: explosion.png")
    if explosion_img.ndim != 3 or explosion_img.shape[2] != 4:
        raise ValueError("explosion.png must have four channels (colour + alpha).")

    eh, ew = explosion_img.shape[:2]
    fh, fw = frame.shape[:2]
    cx, cy = int(collision_center[0]), int(collision_center[1])

    # Unclipped top-left position of the effect; may be negative.
    left = cx - ew // 2
    top = cy - eh // 2

    # Effect region clipped to the frame.
    x0, y0 = max(left, 0), max(top, 0)
    x1, y1 = min(left + ew, fw), min(top + eh, fh)
    if x0 >= x1 or y0 >= y1:
        # The sprite lies completely outside the frame.
        return frame

    frame_roi = frame[y0:y1, x0:x1]
    # Matching crop of the sprite, expressed in sprite coordinates.
    explosion_roi = explosion_img[y0 - top:y1 - top, x0 - left:x1 - left]

    # Normalize the alpha mask to [0.0, 1.0]; keep a trailing axis so it
    # broadcasts over the three colour channels.
    alpha = explosion_roi[:, :, 3:4].astype(float) / 255.0
    inv_alpha = 1.0 - alpha

    # Perform alpha blending; frame_roi is a view, so this writes into frame.
    blended = alpha * explosion_roi[:, :, :3] + inv_alpha * frame_roi[:, :, :3]
    frame_roi[:, :, :3] = blended.astype(frame_roi.dtype)

    return frame

# In [46]:
def wrap_text(text, max_width):
    """Split ``text`` into lines that fit within ``max_width`` pixels.

    Words are added greedily to the current line. The rendered width is
    measured with ``cv2.getTextSize`` for ``FONT_HERSHEY_SIMPLEX`` at scale 1
    and thickness 2, and a new line is started as soon as adding the next
    word would exceed ``max_width``. Text without any word yields ``[]``.
    """
    tokens = text.split()
    if len(tokens) == 0:
        return []

    first, *remaining = tokens
    wrapped = []
    pending = first

    for token in remaining:
        candidate = f"{pending} {token}"
        candidate_size, _baseline = cv2.getTextSize(candidate, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)
        if candidate_size[0] <= max_width:
            pending = candidate
            continue
        # The candidate is too wide: close the current line and start a new one.
        wrapped.append(pending)
        pending = token

    wrapped.append(pending)
    return wrapped

def add_subtitles_to_frame(subtitles, frame, current_time):
    """Draw the subtitles active at ``current_time`` along the frame bottom.

    A subtitle is active when ``start <= current_time <= end``. Its text is
    wrapped to 1500 pixels, and every resulting line is drawn in white,
    centred on its own black box. All boxes share the size of the widest and
    tallest line plus padding, are horizontally centred, and are stacked so
    that the last one ends 10 pixels above the frame bottom. ``frame`` is
    modified in place and returned.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX

    visible_lines = [
        piece
        for entry in subtitles
        if entry["start"] <= current_time <= entry["end"]
        for piece in wrap_text(entry["text"], 1500)
    ]

    # measured[i] = ((width, height), baseline)
    measured = [cv2.getTextSize(piece, font, 1, 2) for piece in visible_lines]

    widest = max((size[0] for size, _ in measured), default=0)
    tallest = max((size[1] for size, _ in measured), default=0)

    # Padding for the boxes behind the captions, which make the captions clearer.
    box_w = widest + 40
    box_h = tallest + 20

    frame_h, frame_w = frame.shape[:2]

    # 10 pixels are kept between the bottom of the last box and the frame bottom.
    block_top = frame_h - box_h * len(visible_lines) - 10
    # Same for every line, so that each box sits in the middle.
    box_left = (frame_w - box_w) // 2

    for row, (piece, ((text_w, text_h), baseline)) in enumerate(zip(visible_lines, measured)):
        box_top = block_top + row * box_h
        cv2.rectangle(
            frame,
            (box_left, box_top),
            (box_left + box_w, box_top + box_h),
            (0, 0, 0),
            thickness=-1
        )

        text_x = box_left + (box_w - text_w) // 2
        text_y = int(box_top + (box_h + text_h) / 2 - baseline / 2)
        cv2.putText(frame, piece, (text_x, text_y), font, 1.0, (255, 255, 255), 2)

    return frame

# In [47]:
def main(input_video_file: str, output_video_file: str) -> None:
    """Apply the time-scheduled effects to a video and write the result.

    The input is opened with ``cv2.VideoCapture`` and the output is written
    as ``mp4v`` with the input's rounded frame rate and frame size. Each
    processed frame is also previewed in a window named "Output"; pressing
    ``q`` stops processing early. The capture, the writer and the preview
    window are released even when processing fails.

    Args:
        input_video_file: Path of the video to read.
        output_video_file: Path of the video to write.

    Raises:
        RuntimeError: If the input video cannot be opened.
    """
    cap = cv2.VideoCapture(input_video_file)

    if cap is None or not cap.isOpened():
        raise RuntimeError('The file was not found or is not a proper video.')

    out = None
    try:
        fps = int(round(cap.get(cv2.CAP_PROP_FPS)))
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_video_file, fourcc, fps, (frame_width, frame_height))

        _process_frames(cap, out)
    finally:
        # Release everything even if an effect raises half-way through.
        cap.release()
        if out is not None:
            out.release()
        cv2.destroyAllWindows()


def _ramped_odd_kernel_size(current_time, start, end, max_size):
    """Ramp a kernel size linearly from 1 to ``max_size`` over ``[start, end)`` and force it odd."""
    size = int(1 + (current_time - start) * (max_size - 1) / (end - start))
    if size % 2 == 0:
        size += 1
    return size


def _process_frames(cap, out) -> None:
    """Read frames from ``cap``, apply the effect scheduled for their timestamp, and write them to ``out``."""
    # BGR range used for the tangerine mask, the first inpainting and the tracking sections.
    tangerine_lower = np.array([0, 0, 99])
    tangerine_upper = np.array([42, 120, 255])

    # State carried across frames by the collision detection section.
    prev_num_objs = 0
    prev_detections = []
    collision_center = None
    collision_frames = 0
    while cap.isOpened():
        ret, frame = cap.read()
        subtitles = []
        if ret:
            # NOTE(review): together with the waitKey(25) below this waits twice
            # per frame; kept as-is to preserve the preview pacing.
            if cv2.waitKey(28) & 0xFF == ord('q'):
                break

            current_time = get_frame_time(cap)

            # Switch between color and gray for the first 4 seconds.
            if between(cap, 0, 500) or between(cap, 1000, 1500) or between(cap, 2000, 2500) or between(cap, 3000, 3500):
                frame = gray_scale(frame)
            subtitles = [
                {
                    "start": 0,
                    "end": 5000,
                    "text": "Switching the color between BGR and gray is done using cv2.cvtColor()."
                }
            ]

            # Gaussian blur.
            if between(cap, 4000, 8000):
                # Change kernel size from 1 to 50.
                kernel_size = _ramped_odd_kernel_size(current_time, 4000, 8000, 50)
                frame = gaussian_blur(frame, kernel_size)

                # A stray " n" after "intensity proximity." was removed from
                # the second subtitle; it was being rendered on screen.
                subtitles = [
                    {
                        "start": 4000,
                        "end": 8000,
                        "text": f'''Gaussian blur, kernel_size = {kernel_size}'''
                    },
                    {
                        "start": 4000,
                        "end": 8000,
                        "text": f'''Gaussian filter calculates weighted averages by only considering spatial proximity, 
                                    but Bilateral filter considers both spatial and intensity proximity.
                                    If two pixels\' intensities have a large difference, then the corresponding weight will be small. 
                                    Therefore, the two pixels do not affect each other very much, preserving the edges in the image.'''
                    }
                ]

            # Bilateral blur.
            if between(cap, 8000, 12000):
                # Change kernel size from 1 to 50.
                kernel_size = _ramped_odd_kernel_size(current_time, 8000, 12000, 50)
                frame = bilateral_blur(frame, kernel_size)

                subtitles = [
                    {
                        "start": 8000,
                        "end": 12000,
                        "text": f'''Bilateral blur, kernel_size = {kernel_size}'''
                    },
                    {
                        "start": 8000,
                        "end": 12000,
                        "text": f'''Gaussian filter calculates weighted averages by only considering spatial proximity, 
                                    but Bilateral filter considers both spatial and intensity proximity.
                                    If two pixels\' intensities have a large difference, then the corresponding weight will be small. 
                                    Therefore, the two pixels do not affect each other very much, preserving the edges in the image.
                                    In terms of the effect, Gaussian blur looks like myopia, Bilateral polishes surfaces.'''
                    }
                ]

            # Grab objects using BGR channels.
            if between(cap, 12000, 20000):
                mask = bgr_mask(frame, tangerine_lower, tangerine_upper)
                frame = cv2.cvtColor(mask, cv2.COLOR_GRAY2BGR)

                kernel = np.ones((10, 10), np.uint8)
                # NOTE(review): both passes are MORPH_OPEN, which can only remove
                # pixels, so added_pixels looks like it will always be empty;
                # confirm whether a closing/dilation pass was intended.
                mask_enhanced = morphology_operation(mask, cv2.MORPH_OPEN, kernel)
                mask_enhanced = morphology_operation(mask_enhanced, cv2.MORPH_OPEN, kernel)

                added_pixels = cv2.subtract(mask_enhanced, mask)
                deleted_pixels = cv2.subtract(mask, mask_enhanced)
                frame[added_pixels == 255] = (0, 255, 0)
                frame[deleted_pixels == 255] = (0, 0, 255)

                subtitles = [
                    {
                        "start": 12000,
                        "end": 20000,
                        "text": f'''Tangerine are moslty identified. Other white places are noises introduced by the light.
                                    To remove the noises, OPENING is used. Red pixels are the pixels removed by erosion and green ones are 
                                    the pixels added by dilation.'''
                    }
                ]

            # Sobel edges filter.
            if between(cap, 20000, 25000):
                kernel_size = _ramped_odd_kernel_size(current_time, 20000, 25000, 31)

                edges = sobel_filter(frame, ksize=kernel_size)
                frame = cv2.applyColorMap(edges, cv2.COLORMAP_DEEPGREEN)

                subtitles = [
                    {
                        "start": 20000,
                        "end": 25000,
                        "text": f'''Sobel_kernel_size = {kernel_size}'''
                    },
                    {
                        "start": 20000,
                        "end": 25000,
                        "text": f'''As the kernel size increases, edges get thicker. They actually get brighter as they calculate more area and the central 
                                    weight are larger, but the brightness is normalized before showing each frame.'''
                    }
                ]

            # Hough transform.
            if between(cap, 25000, 30000):
                blursize1 = 15
                dp1 = 1.3
                param11 = 50
                param21 = 65
                frame = hough_trans(frame, blur_size=blursize1, dp=dp1, param1=param11, param2=param21)
                subtitles = [
                    {
                        "start": 25000,
                        "end": 30000,
                        "text": f'''blur_size = {blursize1}, dp = {dp1}, param1 = {param11}, param2 = {param21}'''
                    }
                ]
            # Try a new set for Hough Transform to explore the effect of different parameters.
            if between(cap, 30000, 35000):
                blursize2 = 11
                dp2 = 1.5
                param12 = 40
                param22 = 60
                frame = hough_trans(frame, blur_size=blursize2, dp=dp2, param1=param12, param2=param22)
                subtitles = [
                    {
                        "start": 30000,
                        "end": 35000,
                        "text": f'''blur_size = {blursize2}, dp = {dp2}, param1 = {param12}, param2 = {param22}'''
                    },
                    {
                        "start": 30000,
                        "end": 35000,
                        "text": f'''For this new set, which has a decrased blur size, param1 and param2 and an increased
                                    dp, more noise circles are shown, vice versa. The new set leaves more noise edges due to
                                    small blur kernel size, and they are not filtered out due to the decreased param1.
                                    Param2 is decreased which also means the voting threshold is easier to be met, leading to more
                                    noise cirlces. And with an increased dp, though it is coarser, noises can affect more buckets, thus 
                                    there are more noise circles.'''
                    }
                ]

            # Template matching.
            if between(cap, 35000, 37000):
                # First two seconds: show the frame with the match rectangle.
                frame, _ = template_matching('template.png', frame, 0.7, 0)
                subtitles = [
                    {
                        "start": 35000,
                        "end": 37000,
                        "text": f'''The template is a tangerine image.'''
                    }
                ]
            if between(cap, 37000, 40000):
                # Then show the likelihood map instead of the frame.
                _, frame = template_matching('template.png', frame, 0.7, 1)
                subtitles = [
                    {
                        "start": 37000,
                        "end": 40000,
                        "text": f'''The template is a tangerine image.'''
                    }
                ]

            # Free-style part.
            if between(cap, 41000, 43000):
                mask = bgr_mask(frame, tangerine_lower, tangerine_upper)
                kernel = np.ones((30, 30), np.uint8)
                # Grow the mask before inpainting over it.
                mask = morphology_operation(mask, cv2.MORPH_DILATE, kernel)
                frame = cv2.inpaint(frame, mask, 3, cv2.INPAINT_TELEA)
                subtitles = [
                    {
                        "start": 41000,
                        "end": 43000,
                        "text": f'''Make tangerines disappear.'''
                    }
                ]

            if between(cap, 43000, 55000):
                mask = bgr_mask(frame, tangerine_lower, tangerine_upper)
                kernel = np.ones((10, 10), np.uint8)
                mask = morphology_operation(mask, cv2.MORPH_OPEN, kernel)
                contours = find_contours(mask, 1000)

                detections = []
                for cnt in contours:
                    x, y, w, h = cv2.boundingRect(cnt)  # Get the object's shape.
                    detections.append(((x, y), (x + w, y + h)))

                # Mark objects using rectangulars.
                for i, (tl, br) in enumerate(detections):
                    cv2.rectangle(frame, tl, br, (0, 255, 0), 2)
                    cv2.putText(frame, f"ID: {i}", tl, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

                # Detect collision by checking whether there is reduction in rectangulars num.
                curr_num_objs = len(detections)
                if curr_num_objs < prev_num_objs:
                    if detections:  # Make sure the reduction in rectangulars num is due to collision not the object left the scene.
                        collision_center = collision_pos_cal(prev_detections, detections)
                        if collision_center:
                            # Show the effect on this frame and the next two.
                            collision_frames = 3
                # Update previous state
                prev_num_objs = curr_num_objs
                prev_detections = detections.copy()  # Store current detections for the next frame.

                # Draw firework effect
                if collision_frames > 0:
                    if collision_center:
                        frame = collision_effect(collision_center, frame)
                    collision_frames -= 1

                subtitles = [
                    {
                        "start": 43000,
                        "end": 55000,
                        "text": f'''Multi-objects detection. Collision detection with effect. Robust to low-brightness.'''
                    }
                ]

            if between(cap, 55000, 58000):
                # This section uses its own BGR range.
                lower_bound = np.array([0, 0, 130])
                upper_bound = np.array([32, 165, 255])
                mask = bgr_mask(frame, lower_bound, upper_bound)
                kernel = np.ones((30, 30), np.uint8)
                mask = morphology_operation(mask, cv2.MORPH_DILATE, kernel)
                frame = cv2.inpaint(frame, mask, 3, cv2.INPAINT_TELEA)

                subtitles = [
                    {
                        "start": 55000,
                        "end": 58000,
                        "text": f'''Make tangerines disappear again.'''
                    }
                ]

            frame = add_subtitles_to_frame(subtitles, frame, current_time)
            out.write(frame)

            if cv2.waitKey(25) & 0xFF == ord('q'):
                break

        # Break the loop
        else:
            break

        cv2.imshow("Output", frame)

# In [48]:
# Default input and output paths used when this file is run directly.
INPUT_FILE_NAME = 'my_video.mp4'
OUTPUT_FILE_NAME = 'my_processed_video.mp4'

# Run the pipeline only when executed as a script or notebook cell, so that
# importing this module does not start video processing.
if __name__ == "__main__":
    main(INPUT_FILE_NAME, OUTPUT_FILE_NAME)