In [56]:
from PIL import Image
import os, sys
import cv2
import numpy as np
from imutils.object_detection import non_max_suppression
import random

In [57]:
# Seed Python's built-in `random` module so the random frame sampling done
# later in the notebook (random.sample) is repeatable after a kernel restart.
random.seed(42)

In [65]:
# Tunable hyperparameters for the whole pipeline.

# Save every `frame_interval`-th video frame to disk (frames 0, 25, 50, ...).
frame_interval = 25
# Minimum EAST score a prediction needs to be kept as a text box.
east_confidence = 0.5
# Optical-flow contours whose area is not above this value are ignored.
opflow_min_area_threshold = 700
# Extra height (in pixels) added to each optical-flow bounding box.
opflow_height_increase = 0
# Number of saved frames randomly sampled when scoring optical-flow boxes
# against the EAST boxes.
num_frames_for_overlap_check = 3
# An EAST box counts as rolling text when (overlap area with the rolling box) /
# (EAST box area) exceeds this ratio.
# NOTE(review): the markdown notes below say ">=" while this comment said ">";
# the comparison itself is not visible in this part of the notebook - confirm.
rolling_text_confidence = 0.2

In [7]:
# Path to the input video (relative path). It is passed to the frame
# extraction, optical-flow and bounding-box steps below.
vidpath = '../data/static_moving.mp4'

### Extract frames

In [46]:
def files(vidpath):
    """Prepare an empty frame directory for a video and open the video.

    The directory is named ``<video file name without extension>_image_frames``
    and is created relative to the current working directory. If something with
    that name already exists it is deleted first, so frames (and any outputs
    stored underneath) from a previous run cannot leak into this one.

    param vidpath: path to the video file
    returns: the ``cv2.VideoCapture`` opened on ``vidpath``
    """
    import shutil  # local import: only this function needs it

    base_name = os.path.basename(vidpath)
    vidname, _ = os.path.splitext(base_name)  # name without the extension
    frames_dir = f"{vidname}_image_frames"

    # os.remove() cannot delete a directory: it raises OSError, which was
    # previously swallowed, so old frames were never actually cleared.
    if os.path.isdir(frames_dir) and not os.path.islink(frames_dir):
        shutil.rmtree(frames_dir)
    elif os.path.lexists(frames_dir):
        os.remove(frames_dir)  # a plain file or symlink is in the way

    os.makedirs(frames_dir, exist_ok=True)

    src_vid = cv2.VideoCapture(vidpath)  # open the video file
    return src_vid
    
def process(src_vid, frame_interval, video_path=None):
    """Save every ``frame_interval``-th frame of an opened video as a PNG.

    Frames are written to ``./<video name>_image_frames/frame<index>.png``,
    where ``<index>`` is the zero-based position of the frame in the video.

    param src_vid: opened cv2.VideoCapture to read from; it is released before returning
    param frame_interval: save a frame whenever ``index % frame_interval == 0``
    param video_path: path of the video behind ``src_vid``, used only to name the
        output directory. Defaults to the notebook-level ``vidpath`` so existing
        two-argument calls keep working.

    returns (saved_frame_nums, video_frames_dir): the indices of the frames that
        were written, and the directory they were written to
    """
    if video_path is None:
        video_path = vidpath  # backward-compatible fallback to the notebook global

    base_name = os.path.basename(video_path)
    vidname, _ = os.path.splitext(base_name)  # name without the extension
    video_frames_dir = f"./{vidname}_image_frames"

    # cv2.imwrite() does not create missing directories; it just returns False.
    os.makedirs(video_frames_dir, exist_ok=True)

    saved_frame_nums = []
    index = 0
    try:
        while src_vid.isOpened():
            ret, frame = src_vid.read()
            if not ret:  # end of the video (ret is True only if a frame was read)
                break

            if index % frame_interval == 0:
                name = f'{video_frames_dir}/frame{index}.png'
                print('Extracting frame...' + name)
                if cv2.imwrite(name, frame):
                    saved_frame_nums.append(index)
                else:
                    # Do not report a frame as saved when nothing was written.
                    print('Failed to write frame...' + name)
            index += 1
    finally:
        # Release the capture even if reading or writing raised.
        src_vid.release()

    cv2.destroyAllWindows()

    return saved_frame_nums, video_frames_dir

In [47]:
# Open the video (files() also prepares the frame directory), then write every
# `frame_interval`-th frame to disk. `saved_frame_nums` and `video_frames_dir`
# are used again by the EAST detection loop further down.
vid = files(vidpath)
saved_frame_nums, video_frames_dir = process(vid, frame_interval)

Extracting frame..../static_moving_image_frames/frame0.png
Extracting frame..../static_moving_image_frames/frame25.png
Extracting frame..../static_moving_image_frames/frame50.png
Extracting frame..../static_moving_image_frames/frame75.png
Extracting frame..../static_moving_image_frames/frame100.png
Extracting frame..../static_moving_image_frames/frame125.png
Extracting frame..../static_moving_image_frames/frame150.png
Extracting frame..../static_moving_image_frames/frame175.png
Extracting frame..../static_moving_image_frames/frame200.png
Extracting frame..../static_moving_image_frames/frame225.png
Extracting frame..../static_moving_image_frames/frame250.png
Extracting frame..../static_moving_image_frames/frame275.png
Extracting frame..../static_moving_image_frames/frame300.png


### Optical Flow

In [28]:
def draw_horizontal_flow(img, flow, step=30):
    """Overlay sampled optical-flow vectors on a grayscale image.

    The flow field is sampled on a regular grid with spacing ``step`` (starting
    at ``step // 2``). Rightward horizontal components are zeroed so that only
    leftward or purely vertical motion is drawn. Each sample becomes a green
    arrow from the grid point to the grid point plus its flow vector.

    param img: single-channel image (it is converted with COLOR_GRAY2BGR)
    param flow: flow array indexed as flow[row, col, 0] (horizontal) and
        flow[row, col, 1] (vertical)
    param step: grid spacing in pixels

    returns a BGR copy of ``img`` with the arrows drawn on it
    """
    height, width = img.shape[:2]
    half_step = step // 2

    # Flattened row / column indices of the sampling grid.
    grid = np.mgrid[half_step:height:step, half_step:width:step]
    rows, cols = grid.reshape(2, -1).astype(int)

    # Clamp positive (rightward) horizontal flow to zero.
    dx = np.minimum(flow[rows, cols, 0], 0)
    dy = flow[rows, cols, 1]

    # One (start, end) pair of (x, y) points per grid sample.
    starts = np.column_stack([cols, rows])
    ends = np.column_stack([cols + dx, rows + dy])
    segments = (np.stack([starts, ends], axis=1) + 0.5).astype(np.int32)

    canvas = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    for start_pt, end_pt in segments:
        cv2.arrowedLine(canvas, (start_pt[0], start_pt[1]), (end_pt[0], end_pt[1]),
                        (0, 255, 0), 1, tipLength=0.5)
    return canvas

def _render_leftward_overlay(frame, gray, flow, leftward_counts, num_frames,
                             consistency_threshold):
    """Build the visualization for one frame: flow arrows blended with a green
    highlight over pixels that moved leftward in more than
    ``consistency_threshold`` of the ``num_frames`` frame pairs seen so far."""
    consistent_regions = (leftward_counts / num_frames) > consistency_threshold

    # Morphological closing fills small gaps in the consistent-motion mask.
    kernel = np.ones((5, 5), np.uint8)
    mask = cv2.morphologyEx(consistent_regions.astype(np.uint8), cv2.MORPH_CLOSE, kernel)

    highlight = np.zeros_like(frame)
    highlight[mask == 1] = [0, 255, 0]  # green

    vis = draw_horizontal_flow(gray, flow)
    return cv2.addWeighted(vis, 0.7, highlight, 0.3, 0)


def detect_and_save_screengrab(video_path):
    """Find consistently leftward-moving regions and save one visualization.

    Dense Farneback optical flow is computed between consecutive frames. For
    every pixel the function counts in how many frame pairs the horizontal flow
    was negative (leftward). When the number of processed frame pairs reaches
    half of the frame count reported by the video, a visualization (flow arrows
    plus a green highlight of the consistent regions) is rendered and written to
    ``<video_path>_screengrab_halfway.png``.

    If the reported frame count is unusable (e.g. 0, or larger than the number
    of frames that can actually be read) the halfway point is never hit; in that
    case the last processed frame is used instead so a screengrab is still
    produced.

    param video_path: path to the video file
    returns None; the result is the PNG written next to ``video_path``
    """
    consistency_threshold = 0.8  # fraction of frame pairs that must show leftward motion

    cap = cv2.VideoCapture(video_path)
    screengrab_frame = None
    try:
        # Total frame count as reported by the container, and its halfway point.
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        halfway_frame = total_frames // 2

        ret, first_frame = cap.read()
        if not ret:
            print("Failed to read the video file.")
            return

        prev_gray = cv2.cvtColor(first_frame, cv2.COLOR_BGR2GRAY)

        leftward_counts = None  # per-pixel count of frame pairs with leftward flow
        num_frames = 0          # number of frame pairs processed so far
        last_inputs = None      # (frame, gray, flow) of the most recent frame pair

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            flow = cv2.calcOpticalFlowFarneback(prev_gray, gray, None, 0.5, 5, 15, 3, 5, 1.2, 0)

            leftward_motion_mask = flow[..., 0] < 0
            if leftward_counts is None:
                leftward_counts = np.zeros_like(leftward_motion_mask, dtype=np.float32)
            leftward_counts[leftward_motion_mask] += 1
            num_frames += 1
            last_inputs = (frame, gray, flow)

            # The overlay is only needed for the frame that is saved, so it is
            # rendered once here instead of on every iteration, and decoding
            # stops as soon as the screengrab has been captured.
            if num_frames == halfway_frame:
                screengrab_frame = _render_leftward_overlay(
                    frame, gray, flow, leftward_counts, num_frames, consistency_threshold)
                break

            prev_gray = gray

        if screengrab_frame is None and last_inputs is not None:
            # Halfway point never reached (unreliable frame count): fall back to
            # the last frame pair that was processed.
            print("Halfway frame not reached; using the last processed frame instead.")
            frame, gray, flow = last_inputs
            screengrab_frame = _render_leftward_overlay(
                frame, gray, flow, leftward_counts, num_frames, consistency_threshold)
    finally:
        # Always release the capture, including on the early return above.
        cap.release()

    if screengrab_frame is not None:
        vidname = str(video_path)
        cv2.imwrite(f'{vidname}_screengrab_halfway.png', screengrab_frame)

    cv2.destroyAllWindows()

In [29]:
########## CALL TO FUNCTION:
# Writes '<vidpath>_screengrab_halfway.png'; extract_bounding_boxes() reads
# that file back in the next step.
detect_and_save_screengrab(vidpath)

#### Draw a bounding box around the detected regions

In [53]:
def extract_bounding_boxes(vidpath, height_increase=0, opflow_min_area_threshold=700):
    """Turn the green regions of the halfway screengrab into bounding boxes.

    Reads ``<vidpath>_screengrab_halfway.png``, masks the pixels whose HSV value
    falls in the green range, finds the external contours of that mask and keeps
    the bounding rectangle of every contour whose area exceeds
    ``opflow_min_area_threshold``. The kept rectangles are drawn on the image,
    which is saved as ``<vidpath>_screengrab_with_bounding_boxes.png``.

    param vidpath: path of the video; used as the prefix of both image files
    param height_increase: pixels added to each box's height (half of it is
        taken from above the box), clipped to the image boundaries
    param opflow_min_area_threshold: contours with an area not above this are skipped

    returns a list of [x_min, y_min, x_max, y_max] boxes. The list is empty when
        the screengrab cannot be read or when no contour passes the threshold,
        so callers can always iterate over the result.
    """
    opflow_box_coordinates = []
    screengrab_path = f'{vidpath}_screengrab_halfway.png'

    screengrab = cv2.imread(screengrab_path)
    if screengrab is None:
        print("Failed to read the screengrab.")
        # Return the (empty) list rather than None: callers iterate the result.
        return opflow_box_coordinates

    # Convert to HSV and create a mask for the green color
    hsv = cv2.cvtColor(screengrab, cv2.COLOR_BGR2HSV)
    lower_green = np.array([40, 40, 40])
    upper_green = np.array([80, 255, 255])
    mask = cv2.inRange(hsv, lower_green, upper_green)

    # Find the outer contours of the green regions
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    image_height = screengrab.shape[0]
    for contour in contours:
        if cv2.contourArea(contour) <= opflow_min_area_threshold:
            continue  # too small to be a significant motion region

        x, y, w, h = cv2.boundingRect(contour)
        # Grow the box vertically, keeping it inside the image.
        y = max(0, y - height_increase // 2)
        h = min(h + height_increase, image_height - y)

        cv2.rectangle(screengrab, (x, y), (x + w, y + h), (0, 255, 0), 2)
        opflow_box_coordinates.append([x, y, x + w, y + h])

    # Save the image with the bounding boxes drawn on it
    cv2.imwrite(f'{vidpath}_screengrab_with_bounding_boxes.png', screengrab)

    return opflow_box_coordinates

########## CALL TO FUNCTION:
# `opflow_height_increase` is the extra height added to every box so that text
# is not cut off at the top/bottom (no extra height is added when it is 0).
# Boxes come back as [x_min, y_min, x_max, y_max].
opflow_box_coordinates = extract_bounding_boxes(vidpath, opflow_height_increase, opflow_min_area_threshold)

### EAST for text detection

In [52]:
def east_detect(image, output_path, east_confidence, net=None,
                model_path="frozen_east_text_detection.pb"):
    ''' uses EAST detection to get x,y coordinates of all text bounding boxes in an image

    param image: the image as an array (BGR or single-channel), e.g. from cv2.imread
    param output_path: path where to save a copy of the original image with the
        bounding boxes drawn on top (written even when no box is found)
    param east_confidence: minimum score a prediction needs in order to be kept
    param net: optional, already-loaded EAST network; pass one to avoid
        re-reading the model file on every call
    param model_path: path of the frozen EAST model, used only when ``net`` is None

    returns list of lists: each inner list is [x_min, y_min, x_max, y_max] of one
        detected bounding box, in the coordinates of the original image

    code from: https://medium.com/technovators/scene-text-detection-in-python-with-east-and-craft-cbe03dda35d5
    '''
    # Output layers: the first yields the scores, the second the box geometry.
    layerNames = [
        "feature_fusion/Conv_7/Sigmoid",
        "feature_fusion/concat_3"]

    # Boxes are drawn on this copy so the caller's array is left untouched.
    orig = image.copy()

    if len(image.shape) == 2:
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)

    (H, W) = image.shape[:2]

    # set the new width and height and then determine the ratio in change
    # for both the width and height: should be a multiple of 32
    (newW, newH) = (672, 672)
    rW = W / float(newW)
    rH = H / float(newH)

    # resize the image and grab the new image dimensions
    image = cv2.resize(image, (newW, newH))
    (H, W) = image.shape[:2]

    if net is None:
        net = cv2.dnn.readNet(model_path)

    blob = cv2.dnn.blobFromImage(image, 1.0, (W, H),
                                 (123.68, 116.78, 103.94), swapRB=True, crop=False)
    net.setInput(blob)
    (scores, geometry) = net.forward(layerNames)

    (numRows, numCols) = scores.shape[2:4]
    rects = []
    confidences = []
    # loop over the rows of the score map
    for y in range(0, numRows):
        # extract the scores, followed by the geometrical data used to derive
        # potential bounding box coordinates that surround text
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]

        for x in range(0, numCols):
            # ignore predictions below the required confidence
            if scoresData[x] < east_confidence:
                continue
            # each feature-map cell is mapped back to input pixels with a
            # factor of 4
            (offsetX, offsetY) = (x * 4.0, y * 4.0)
            # rotation angle of the prediction and its sine / cosine
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)
            # use the geometry volume to derive the width and height of
            # the bounding box
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]
            # compute both the starting and ending (x, y)-coordinates for
            # the text prediction bounding box
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)
            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])

    boxes = non_max_suppression(np.array(rects), probs=confidences)

    coordinates = []
    for (startX, startY, endX, endY) in boxes:
        # scale the box from the resized image back to the original image
        startX = int(startX * rW)
        startY = int(startY * rH)
        endX = int(endX * rW)
        endY = int(endY * rH)
        # (startX, startY) is the top-left corner, (endX, endY) the bottom-right
        cv2.rectangle(orig, (startX, startY), (endX, endY), (0, 255, 0), 2)
        coordinates.append([startX, startY, endX, endY])

    # `orig` is written directly: the previous `out_image` variable was only
    # assigned inside the loop above, so a frame without any detected text
    # raised UnboundLocalError here.
    cv2.imwrite(f"{output_path}", orig)
    return coordinates


In [51]:
# Run EAST on every saved frame. coordinates_list[i] holds the boxes of the
# frame saved_frame_nums[i], so the two lists must stay aligned.
coordinates_list = []

# The output directory is the same for every frame: create it once up front.
east_output_dir = f"{video_frames_dir}/output"
os.makedirs(east_output_dir, exist_ok=True)

for frame_number in saved_frame_nums:
    frame_path = video_frames_dir + f"/frame{frame_number}.png"  # path to the frame being processed

    base_name = os.path.basename(frame_path)
    frame_name, _ = os.path.splitext(base_name)  # frame name without the extension

    image = cv2.imread(frame_path)
    if image is None:
        # cv2.imread() returns None instead of raising. Fail with a clear
        # message; skipping the frame would break the index alignment above.
        raise FileNotFoundError(f"Could not read frame image: {frame_path}")

    # get the bounding boxes and save the annotated image to the output directory
    coordinates = east_detect(image, f"{east_output_dir}/out_{frame_name}_EAST.png", east_confidence)

    coordinates_list.append(coordinates)


### Calculate the overlapping area of optical-flow rectangles and EAST rectangles

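Randomly sample N frames (N = `num_frames_for_overlap_check`) and, over those frames:
* For each optical-flow rectangle, sum its overlapping area with the EAST rectangles.
* The optical-flow rectangle with the largest total overlapping area is taken as the rolling-text region.
* **TODO:** add some sanity check for the rectangles?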

In [54]:
# Optical-flow boxes found above, each as [x_min, y_min, x_max, y_max].
opflow_box_coordinates

[[87, 896, 136, 1000], [146, 888, 1920, 1002]]

In [50]:
# EAST boxes: one inner list per saved frame, each box as [startX, startY, endX, endY].
coordinates_list

[[[1065, 506, 1220, 570],
  [1548, 906, 1862, 975],
  [705, 503, 1077, 570],
  [1062, 901, 1248, 980],
  [1274, 901, 1540, 983]],
 [[1065, 506, 1220, 570],
  [1197, 908, 1520, 977],
  [728, 904, 914, 977],
  [702, 503, 1074, 570],
  [1531, 906, 1702, 981],
  [1734, 912, 1917, 981],
  [937, 896, 1214, 978]],
 [[1762, 917, 1931, 975],
  [1062, 504, 1220, 570],
  [402, 903, 577, 975],
  [877, 904, 1191, 973],
  [702, 503, 1077, 570],
  [1194, 904, 1371, 983],
  [1368, 903, 1751, 990],
  [597, 896, 874, 985],
  [1051, 626, 1105, 657]],
 [[1065, 504, 1222, 570],
  [1425, 919, 1648, 975],
  [868, 906, 1034, 980],
  [1657, 901, 1822, 977],
  [705, 503, 1077, 570],
  [551, 906, 851, 975],
  [262, 901, 534, 983],
  [60, 903, 242, 978],
  [1034, 903, 1411, 991],
  [1054, 626, 1105, 657]],
 [[1085, 914, 1300, 973],
  [1062, 506, 1222, 572],
  [1717, 904, 1908, 985],
  [202, 911, 514, 978],
  [1322, 904, 1485, 975],
  [705, 503, 1077, 570],
  [1494, 909, 1697, 985],
  [0, 903, 182, 975],
  [522, 9

In [59]:
def calculate_overlap_area(rect1, rect2):
    """
    Return the area of the intersection of two axis-aligned rectangles.

    Both rectangles are sequences in the form (startX, startY, endX, endY).
    Rectangles that are disjoint or only touch along an edge give 0.
    """
    # Corners of the candidate intersection rectangle.
    left, top = max(rect1[0], rect2[0]), max(rect1[1], rect2[1])
    right, bottom = min(rect1[2], rect2[2]), min(rect1[3], rect2[3])

    width = right - left
    height = bottom - top

    # A non-positive extent on either axis means there is no overlap there.
    if not width > 0:
        width = 0
    if not height > 0:
        height = 0

    return width * height


def sum_overlapping_areas(main_rect, rectangles):
    """
    Add up how much area each rectangle in ``rectangles`` shares with ``main_rect``.

    main_rect is a sequence in the form (startX, startY, endX, endY);
    rectangles is an iterable of sequences in the same form.
    Each rectangle is measured independently against main_rect, and an empty
    iterable gives 0.
    """
    return sum(calculate_overlap_area(main_rect, other) for other in rectangles)


In [62]:

# Pick the optical-flow box that overlaps most with the EAST text boxes of a
# few randomly sampled frames; that box is treated as the rolling-text region.
if not opflow_box_coordinates:
    raise ValueError("No optical-flow boxes available; cannot select a rolling-text region.")

# random.sample() raises ValueError when asked for more items than exist, so
# never request more frames than were actually saved.
num_sampled_frames = min(num_frames_for_overlap_check, len(saved_frame_nums))
choose_frames = random.sample(list(range(len(saved_frame_nums))), num_sampled_frames)

overlap_areas = []
for opflow_box_coordinate in opflow_box_coordinates:
    total_overlap = 0
    for frame_index in choose_frames:
        # coordinates_list[frame_index] holds the EAST boxes of that saved frame
        total_overlap += sum_overlapping_areas(opflow_box_coordinate, coordinates_list[frame_index])
    overlap_areas.append(total_overlap)

# On ties np.argmax returns the first box with the largest total overlap.
rolling_box_index = int(np.argmax(np.array(overlap_areas)))
rolling_box = opflow_box_coordinates[rolling_box_index]

#### After getting the rolling box

* For every EAST box in each frame, calculate its overlapping area with the rolling box.
* If overlapping area / EAST box area >= `rolling_text_confidence`, the box is part of the rolling text.
* Crop this part and save it to the folder for each frame.
* **TODO:** crop each word from left to right?