In [1]:
from PIL import Image
import os, sys
import cv2
import numpy as np
from imutils.object_detection import non_max_suppression
import random

In [2]:
random.seed(42)

#### You can change some hyperparameters here

In [3]:
# some hyperparameter
frame_interval = 25 
east_confidence = 0.5
opflow_min_area_threshold = 700
opflow_height_increase = 0
num_frames_for_overlap_check = 3
rolling_text_confidence = 0.2 # if the overlapping area / east box area > 0.2, this is a part of the rolling text

#### You can change the video path here

In [4]:
# file path
vidpath = 'seq2.mp4'

### Extract frames

In [5]:
def files(vidpath):
    base_name = os.path.basename(vidpath)
    vidname, _ = os.path.splitext(base_name)  # Split the base name to get the name without the extension
    
    try: # Remove image frames path if it already exists
        os.remove(f"{vidname}_image_frames")
    except OSError:
        pass
    if not os.path.exists(f"{vidname}_image_frames"): # create the directory if it does not already exist
        os.makedirs(f"{vidname}_image_frames")

    src_vid = cv2.VideoCapture(vidpath) # open the video file
    return(src_vid)
    
def process(src_vid, frame_interval):
    base_name = os.path.basename(vidpath)
    vidname, _ = os.path.splitext(base_name)  # Split the base name to get the name without the extension
    saved_frame_nums = []
    
    index = 0
    while src_vid.isOpened():
        ret, frame = src_vid.read() 
        if not ret: # break at the end of the video (ret returns True if frame is succesfully read)
            break
        name = f'./{vidname}_image_frames/frame{str(index)}.png'

        if index % frame_interval == 0: # every {frame_interval} frame will be saved: can adjust this number to capture more or less frames
            print('Extracting frame...' + name)
            cv2.imwrite(name, frame)
            saved_frame_nums.append(index)
        index = index + 1
        
    src_vid.release()
    cv2.destroyAllWindows()
    
    video_frames_dir = f"./{vidname}_image_frames"
    
    return saved_frame_nums, video_frames_dir

In [6]:
vid = files(vidpath)
saved_frame_nums, video_frames_dir = process(vid, frame_interval)

Extracting frame..../seq2_image_frames/frame0.png
Extracting frame..../seq2_image_frames/frame25.png
Extracting frame..../seq2_image_frames/frame50.png
Extracting frame..../seq2_image_frames/frame75.png
Extracting frame..../seq2_image_frames/frame100.png
Extracting frame..../seq2_image_frames/frame125.png
Extracting frame..../seq2_image_frames/frame150.png
Extracting frame..../seq2_image_frames/frame175.png
Extracting frame..../seq2_image_frames/frame200.png
Extracting frame..../seq2_image_frames/frame225.png
Extracting frame..../seq2_image_frames/frame250.png
Extracting frame..../seq2_image_frames/frame275.png
Extracting frame..../seq2_image_frames/frame300.png
Extracting frame..../seq2_image_frames/frame325.png
Extracting frame..../seq2_image_frames/frame350.png
Extracting frame..../seq2_image_frames/frame375.png
Extracting frame..../seq2_image_frames/frame400.png
Extracting frame..../seq2_image_frames/frame425.png
Extracting frame..../seq2_image_frames/frame450.png
Extracting frame.

### Optical Flow

In [7]:
import time

In [8]:
def detect_screengrab_boundingboxes(video_path):
    ''' Detects significant leftward motion in the input video. 
        Returns coordinates of bounding boxes of significant leftward motion
    '''
    vidname = vidname = str(video_path)
    cap = cv2.VideoCapture(video_path)
    
    # Get total number of frames and calculate frame at which point to save
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    save_frame = total_frames // 1.5  # save at ~75%

    ret, first_frame = cap.read() # Get the first frame: if not readable ret returns False
    if not ret:
        print("Failed to read the video.")
        return

    # Convert first frame to grayscale
    prev_gray = cv2.cvtColor(first_frame, cv2.COLOR_BGR2GRAY)

    # Initialize a mask for consistent leftward movement
    consistent_motion_mask = None
    consistency_threshold = 0.8  # Consistent movement threshold (percentage of frames)
    num_frames = 0

    saved_screengrab = False
    screengrab_frame = None
    bounding_boxes = []

    while cap.isOpened(): # Iterate through each frame of the video
        ret, frame = cap.read()
        if not ret:
            break
        
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # Compute dense optical flow between current and previous frames using Farneback method
        # "flow" is a 3d array dimensions (img height, img width, 2)
        # Channel 1: Horizontal Movement. Channel 2: Vertical Movement
        flow = cv2.calcOpticalFlowFarneback(prev_gray, gray, None, 0.5, 5, 15, 3, 5, 1.2, 0)
        
        # Calculate the mask of leftward movement (negative horizontal movement)
        leftward_motion_mask = flow[..., 0] < 0
        
        # Initialize the consistent motion mask
        # with the same shape as leftward_motion_mask to accumulate counts of leftward motion
        if consistent_motion_mask is None:
            consistent_motion_mask = np.zeros_like(leftward_motion_mask, dtype=np.float32)
        
        # Update the consistent leftward movement mask where leftward motion is true
        consistent_motion_mask[leftward_motion_mask] += 1
        num_frames += 1

        # Calculate the ratio of consistent leftward movement
        consistent_ratio = consistent_motion_mask / num_frames
        consistent_leftward_regions = consistent_ratio > consistency_threshold

        # Apply morphological closing to fill in the gaps
        consistent_leftward_mask = consistent_leftward_regions.astype(np.uint8)
        kernel = np.ones((5, 5), np.uint8) 
        consistent_leftward_mask = cv2.morphologyEx(consistent_leftward_mask, cv2.MORPH_CLOSE, kernel)

        # Create an image for highlighting the image where there is consistent leftward motion
        consistent_leftward_img = np.zeros_like(frame)
        consistent_leftward_img[consistent_leftward_mask == 1] = [0, 255, 0]  # Green

        # Use morphological closing (to fill small gaps) and closing (to remove noise) operations to clean leftward-motion mask
        kernel = np.ones((5, 5), np.uint8)
        cleaned_mask = cv2.morphologyEx(consistent_leftward_mask, cv2.MORPH_CLOSE, kernel)
        cleaned_mask = cv2.morphologyEx(cleaned_mask, cv2.MORPH_OPEN, kernel)
        
        # Save screengrab at save_frame point and extract the bounding boxes
        if num_frames == save_frame and not saved_screengrab:
            # Convert original frame to grayscale
            gray_frame = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)

            # Overlay green highlight on grayscale frame
            overlayed_img = cv2.addWeighted(gray_frame, 0.7, consistent_leftward_img, 0.3, 0)
            screengrab_frame = overlayed_img.copy()
            saved_screengrab = True

            contours, _ = cv2.findContours(cleaned_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            for cnt in contours:
                x, y, w, h = cv2.boundingRect(cnt)
                # Only add the bounding box if the width is at least 1.7x the height
                if w >= 1.7 * h:
                    bounding_boxes.append((x, y, w, h))
                    # Draw bounding box on the screengrab
                    cv2.rectangle(screengrab_frame, (x, y), (x+w, y+h), (0, 255, 0), 2)  # Green bounding box

            cv2.imwrite(f'{vidname}_75percent_80threshold.png', screengrab_frame)
    
            return bounding_boxes
    
        prev_gray = gray.copy()
    
    cap.release()
    cv2.destroyAllWindows()

    return "fail"

In [9]:
start = time.time()
opflow_box_coordinates = detect_screengrab_boundingboxes(vidpath)
print(f"{vidpath} time taken: {time.time()-start}")

seq2.mp4 time taken: 15.93065595626831


### EAST for text detection

In [10]:
def east_detect(image, output_path, east_confidence):
    ''' uses EAST detection to get x,y coordinates of all bounding boxes of input image 
    
    param images: paths to the image
    param output_path: path where to save copy of original image with bounding boxes drawn on top
    
    returns list of tuples: each tuple contains (x_min, y_min, x_max, y_max) of each detected bounding box

    code from: https://medium.com/technovators/scene-text-detection-in-python-with-east-and-craft-cbe03dda35d5
    '''
    
    
    layerNames = [
    	"feature_fusion/Conv_7/Sigmoid",
    	"feature_fusion/concat_3"]

    orig = image.copy()
    
    if len(image.shape) == 2:
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    
    (H, W) = image.shape[:2]
    
    # set the new width and height and then determine the ratio in change
    # for both the width and height: Should be multiple of 32
    (newW, newH) = (320, 320)
    
    rW = W / float(newW)
    rH = H / float(newH)
    
    # resize the image and grab the new image dimensions
    image = cv2.resize(image, (newW, newH))
    
    (H, W) = image.shape[:2]
    
    net = cv2.dnn.readNet("frozen_east_text_detection.pb")
    
    blob = cv2.dnn.blobFromImage(image, 1.0, (W, H),
    	(123.68, 116.78, 103.94), swapRB=True, crop=False)
    
    net.setInput(blob)
    
    (scores, geometry) = net.forward(layerNames)
    
    (numRows, numCols) = scores.shape[2:4]
    rects = []
    confidences = []
    # loop over the number of rows
    for y in range(0, numRows):
        # extract the scores (probabilities), followed by the geometrical
        # data used to derive potential bounding box coordinates that
        # surround text
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]
    
        for x in range(0, numCols):
    		# if our score does not have sufficient probability, ignore it
            # Set minimum confidence as required
            if scoresData[x] < east_confidence:
                continue
    		# compute the offset factor as our resulting feature maps will
            #  x smaller than the input image
            (offsetX, offsetY) = (x * 4.0, y * 4.0)
            # extract the rotation angle for the prediction and then
            # compute the sin and cosine
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)
            # use the geometry volume to derive the width and height of
            # the bounding box
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]
            # compute both the starting and ending (x, y)-coordinates for
            # the text prediction bounding box
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)
            # add the bounding box coordinates and probability score to
            # our respective lists
            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])
                        
    boxes = non_max_suppression(np.array(rects), probs=confidences)
    # loop over the bounding boxes
    coordinates = []

    if len(boxes) == 0: # if no text/bounding boxes detected
        return coordinates
        
    if len(boxes) >= 1:
        for (startX, startY, endX, endY) in boxes:
            # scale the bounding box coordinates based on the respective
            # ratios
            startX = int(startX * rW)
            startY = int(startY * rH)
            endX = int(endX * rW)
            endY = int(endY * rH)
            # draw the bounding box on the image
            # startX,startY ------
            # |                 |
            # |                 |
            # |                 |
            # ----------endX,endY
            out_image = cv2.rectangle(orig, (startX, startY), (endX, endY), (0, 255, 0), 2)
            # out_image = cv2.rectangle(out_image, (146, 888), (1920, 1002), (0, 255, 0), 2)
            coordinates += [[startX, startY, endX, endY]]
        
        cv2.imwrite(f"{output_path}", out_image)
        return coordinates


In [11]:
coordinates_list= []

for frame_number in saved_frame_nums:
    frame_path = video_frames_dir + f"/frame{frame_number}.png"  # path to individual frame being processed

    
    base_name = os.path.basename(frame_path)
    frame_name, _ = os.path.splitext(base_name)  # get just the individual frame name without the extension

    
    if not os.path.isdir(f"{video_frames_dir}/output"):
        os.makedirs(f"{video_frames_dir}/output")  # makes output directory for the frame being processed
    
    image = cv2.imread(frame_path)

    coordinates = east_detect(image, f"{video_frames_dir}/output/out_{frame_name}_EAST.png", east_confidence)  # get coordinates of bounding boxes and save image to new output dir for that frame
        
    coordinates_list += [coordinates]


### Calculate the overlapping area of optic flow rectangles and east rectangles.

randomly get {N} frames to do this:
* For each opflow rectangle, get the overlapping area sum 
* The rectangle with the largest overlapping area -> rolling text region \
-> some sanity check for rectangles?

In [12]:
opflow_box_coordinates

[(0, 203, 640, 60)]

In [13]:
coordinates_list

[[],
 [[496, 208, 610, 249]],
 [[510, 207, 636, 249], [398, 211, 502, 247]],
 [[410, 207, 632, 249], [260, 210, 394, 250]],
 [[172, 210, 282, 249], [538, 210, 638, 253], [292, 205, 538, 247]],
 [[426, 208, 538, 247],
  [70, 210, 172, 249],
  [186, 205, 426, 246],
  [540, 208, 636, 247]],
 [[320, 210, 430, 247],
  [434, 214, 534, 249],
  [80, 205, 322, 247],
  [548, 208, 640, 249],
  [6, 216, 62, 249]],
 [[0, 205, 206, 250],
  [326, 210, 424, 247],
  [212, 208, 320, 249],
  [434, 208, 624, 258]],
 [[536, 213, 640, 247],
  [100, 210, 212, 252],
  [336, 210, 520, 253],
  [216, 214, 322, 250],
  [0, 210, 98, 250]],
 [[224, 205, 404, 255],
  [2, 210, 106, 249],
  [108, 210, 206, 249],
  [428, 208, 574, 247],
  [572, 208, 644, 252]],
 [[322, 211, 456, 249],
  [112, 207, 302, 255],
  [468, 205, 638, 249],
  [6, 211, 94, 247]],
 [[212, 211, 348, 247], [368, 210, 616, 247], [12, 205, 198, 256]],
 [[104, 213, 240, 250],
  [260, 210, 514, 249],
  [564, 205, 644, 247],
  [-2, 210, 84, 255]],
 [[13

In [14]:
def calculate_overlap_area(rect1, rect2):
    """
    Calculate the overlapping area between two rectangles.
    
    rect1 and rect2 are tuples in the form (startX, startY, endX, endY)
    """
    x1_max = max(rect1[0], rect2[0])
    y1_max = max(rect1[1], rect2[1])
    x2_min = min(rect1[2], rect2[2])
    y2_min = min(rect1[3], rect2[3])
    
    overlap_width = max(0, x2_min - x1_max)
    overlap_height = max(0, y2_min - y1_max)
    
    if x1_max >= x2_min or y1_max >= y2_min:
        return 0
    
    return overlap_width * overlap_height


def sum_overlapping_areas(main_rect, rectangles):
    """
    Calculate the sum of overlapping areas of multiple rectangles with the main rectangle.
    
    main_rect is a tuple in the form (startX, startY, endX, endY)
    rectangles is a list of tuples in the same form
    """
    total_overlap_area = 0
    for rect in rectangles:
        total_overlap_area += calculate_overlap_area(main_rect, rect)
    
    return total_overlap_area


In [15]:

overlap_areas = []
choose_frames = random.sample(list(range(len(saved_frame_nums))), num_frames_for_overlap_check)

for opflow_box_coordinate in opflow_box_coordinates:
    total_overlap = 0
    for frame_index in choose_frames:
        total_overlap += sum_overlapping_areas(opflow_box_coordinate, coordinates_list[frame_index])
    overlap_areas.append(total_overlap)

rolling_box_index = np.argmax(np.array(overlap_areas))
rolling_box = opflow_box_coordinates[rolling_box_index]

#### After getting the rolling box

* For all the east boxes in each frame, calculate their overlapping area with the rolling box
* If overlapping area / east box area >= rolling_text_confidence, this is a part of the rolling text
* Crop this part and save to the folder in each frame. \
TBD: The cropped text images should follow a left to right sequence

In [16]:
# TODO: The cropped text images should follow a left to right sequence

In [17]:
def crop_and_save_image(image_path, coords, output_path):
    ''' saves crops of original image according to specified coordinates 

    param image_path: path to original image
    param coordinates: coordinates of bounding boxes in tuple (x_min, y_min, x_max, y_max)
    param output_path: path of output file'''
    
    img = cv2.imread(image_path)
    x_min, y_min, x_max, y_max = coords
    cropped_img = img[y_min:y_max, x_min:x_max]

    # Save
    cv2.imwrite(output_path, cropped_img)


In [18]:
for frame_num, rects in zip(saved_frame_nums, coordinates_list):
    frame_path = f"{video_frames_dir}/frame{frame_num}.png"
    output_path = f"{video_frames_dir}/croppedFrame{frame_num}"
    if os.path.isdir(output_path):
        print("The cropped folders are there! You can delete them :)")
        continue
    else:
        os.makedirs(output_path)
        
    for idx, rect in enumerate(rects):
        overlap_area = calculate_overlap_area(rolling_box, rect)
        rect_area = (rect[2] - rect[0]) * (rect[3] - rect[1])
        if rect_area > 0 and (overlap_area / rect_area) >= rolling_text_confidence:
            final_outout_path = f"{output_path}/{idx}.jpg"
            crop_and_save_image(frame_path, rect, final_outout_path)
            print(f"Saved cropped image to {final_outout_path} for rectangle {rect}")