We'll use OCR code from here: https://towardsdatascience.com/optical-character-recognition-ocr-with-less-than-12-lines-of-code-using-python-48404218cccb

And the video code here: https://stackoverflow.com/a/29317298

In [4]:
import os
import cv2
import pytesseract
from uuid import uuid4
import numpy as np
import ffmpeg

In [5]:
# Directory that is scanned for raw input videos.
IN_DIR = '../data/raw/videos'
# Directory that the cut-out match videos are written to.
OUT_DIR = '../data/processed/videos'

The regular OCR language packs for Tesseract are not fast enough, so we'll need to use the fast language packs instead.

In [None]:
# First attempt: OCR the countdown text on every frame. Recording starts on a
# "30" -> "29" transition and stops on a "1" -> "0" transition while the
# background behind the text is red.
# NOTE(review): this path starts with './data' while OUT_DIR/IN_DIR start with
# '../data' -- confirm which working directory the notebook is run from.
cap = cv2.VideoCapture('./data/raw/videos/oregon_cut.avi')
got_frame, _ = cap.read()
if not got_frame:
    raise Exception('No frames :(')

previous = ""
writer = None

total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
frame_count = 0
try:
    while True:
        got_frame, frame = cap.read()
        if not got_frame:
            # End of the video (or a read error): stop instead of slicing None.
            break

        # Crop the frame to just the text
        cropped = frame[910:1020, 910:950]

        # Threshold code from the article. OpenCV decodes frames in BGR
        # channel order, hence COLOR_BGR2GRAY.
        gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
        _, img_bin = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
        inverted = cv2.bitwise_not(img_bin)

        # Tesseract output typically carries trailing whitespace (newline /
        # form feed), so strip it before comparing against the bare digits.
        s = pytesseract.image_to_string(inverted, lang='eng').strip()

        if previous == "30" and s == "29":
            # match start
            if writer is not None:
                # A previous match never saw its end; close that file first
                # so the writer is not leaked.
                writer.release()
            size = (frame.shape[1], frame.shape[0])
            # NOTE(review): fourcc -1 leaves the codec choice to the backend;
            # presumably this only works on some platforms -- confirm.
            writer = cv2.VideoWriter(f'{OUT_DIR}/match-{uuid4()}.avi', -1, 20.0, size)

        # Channel 2 is red in BGR order.
        corner_pixel = cropped[0, 0]
        background_red = corner_pixel[0] < 120 and corner_pixel[1] < 120 and corner_pixel[2] > 240
        if writer is not None and previous == "1" and s == "0" and background_red:
            # match end
            writer.release()
            writer = None

        if writer is not None:
            writer.write(frame)

        previous = s
        print(f'{frame_count}/{total_frames}; Detected: {s};', end='\r')
        frame_count += 1
finally:
    # Always flush a half-written match and free the capture, even on error.
    if writer is not None:
        writer.release()
        writer = None
    cap.release()
    cv2.destroyAllWindows()

Unfortunately, Tesseract is just way too slow. Even using the fast language packs, it ran considerably slower than real time.

In [None]:
# Second attempt: open the video and initialise the state for a scan that
# detects matches from the progress-bar colour instead of OCR.
cap = cv2.VideoCapture('./data/raw/videos/oregon_cut.avi')
got_frame, _ = cap.read()
if not got_frame:
    raise Exception('No frames :(')
# The probe read above consumed frame 0. Rewind so that frame_count equals the
# real frame index; otherwise every recorded start/end is one frame too low.
cap.set(cv2.CAP_PROP_POS_FRAMES, 0)

# Average progress-bar colour of the previous frame.
prev = (0, 0, 0)
# True while we are between a detected match start and its end.
recording = False

total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
frame_count = 0

# One entry per detected match: [start_frame] until the end is seen, then
# [start_frame, end_frame].
matches = []


def in_range(p, r, tolerance=10):
    """Return whether colour ``p`` is close to reference colour ``r``.

    Only the first three channels are compared; each channel of ``p`` must lie
    within ``tolerance`` (inclusive) of the matching channel of ``r``.

    Args:
        p: Indexable with at least three numeric channels (tuple or array).
        r: Reference colour, indexable with at least three numeric channels.
        tolerance: Maximum allowed absolute difference per channel.

    Returns:
        True if all three channels are within tolerance, otherwise False.
    """
    # Written as a range comparison rather than abs(p - r) so that unsigned
    # array values cannot wrap around during the subtraction.
    return all(r[i] - tolerance <= p[i] <= r[i] + tolerance for i in range(3))


# Walk through the frames and record [start, end] frame indices for every
# match, judged by the average colour of a thin slice of the progress bar.
gray_bar = (150, 150, 150)
red_bar = (10, 10, 220)  # OpenCV frames are BGR, so this is red

while True:
    ok, frame = cap.read()
    if not ok:
        break

    # Per-channel mean over a 20x2 pixel patch.
    bar_colour = np.average(frame[928:948, 739:741], axis=(0, 1))

    # A match starts on the frame where the bar stops being gray.
    left_gray = in_range(prev, gray_bar) and not in_range(bar_colour, gray_bar)
    if left_gray and not recording:
        matches.append([frame_count])
        recording = True

    # A match ends once the bar has turned red. This is checked on the same
    # frame as the start test above, so both can fire on one frame.
    if recording and in_range(bar_colour, red_bar):
        matches[-1].append(frame_count)
        recording = False

    prev = bar_colour
    frame_count += 1
    print(f'{frame_count / total_frames * 100:.2f}%', end='\r')

print(matches)

cap.release()
cv2.destroyAllWindows()

As seen above, I inspect the progress bar. Looking at a single pixel turns out to be too unpredictable, so I look at an average instead. Even that has some issues, because initially there is some gray left in the progress bar before it goes fully green, so instead I'm just going to look for shifts away from gray.

In [None]:
# Cut every detected match out of the source video with ffmpeg.
in_file = ffmpeg.input('./data/raw/videos/oregon_cut.avi')
for match in matches:
    if len(match) != 2:
        # The scan appends the end frame separately, so a match that was still
        # being recorded when the video ended has no end frame to cut at.
        print(f'Skipping incomplete match: {match}')
        continue
    start, end = match
    # The parentheses are required: a method chain cannot continue onto a new
    # line that starts with '.' outside brackets.
    (
        in_file
        .trim(start_frame=start, end_frame=end)
        # Reset timestamps so the clip starts at t=0.
        .setpts('PTS-STARTPTS')
        .output(f'{OUT_DIR}/match-oregon-{start}-{end}.avi')
        .run()
    )

In [None]:
def in_range(p, r, tolerance=10):
    """Return whether colour ``p`` is close to reference colour ``r``.

    Only the first three channels are compared; each channel of ``p`` must lie
    within ``tolerance`` (inclusive) of the matching channel of ``r``.

    Args:
        p: Indexable with at least three numeric channels (tuple or array).
        r: Reference colour, indexable with at least three numeric channels.
        tolerance: Maximum allowed absolute difference per channel.

    Returns:
        True if all three channels are within tolerance, otherwise False.
    """
    # Written as a range comparison rather than abs(p - r) so that unsigned
    # array values cannot wrap around during the subtraction.
    return all(r[i] - tolerance <= p[i] <= r[i] + tolerance for i in range(3))


def has_text(cropped, text):
    """Return whether OCR of the image region ``cropped`` reads exactly ``text``.

    The region is converted to grayscale, binarised with Otsu's threshold and
    inverted before being handed to Tesseract.

    Args:
        cropped: Colour image region in OpenCV's BGR channel order (e.g. a
            slice of a frame returned by ``cv2.VideoCapture.read``).
        text: Expected text, without surrounding whitespace.

    Returns:
        True if the recognised text, stripped of surrounding whitespace,
        equals ``text``.
    """
    gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
    # The first return value is the threshold Otsu picked; it is not needed.
    _, img_bin = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    inverted = cv2.bitwise_not(img_bin)
    # Tesseract output typically ends with a newline / form feed, so an
    # unstripped comparison against e.g. "29" would never be true.
    return text == pytesseract.image_to_string(inverted, lang='eng').strip()


# Scan every video in IN_DIR and print the [start, end] frame indices of the
# matches found in it. The cheap progress-bar colour test runs on every frame;
# the slow OCR check only runs on frames that pass the colour test.
for video in os.listdir(IN_DIR):
    cap = cv2.VideoCapture(f'{IN_DIR}/{video}')
    try:
        got_frame, _ = cap.read()
        if not got_frame:
            print(f'Skipping {video}')
            continue
        # The probe read consumed frame 0; rewind so frame_count equals the
        # real frame index of each frame.
        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)

        # Average progress-bar colour of the previous frame.
        prev = (0, 0, 0)
        recording = False

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        frame_count = 0

        matches = []

        while True:
            got_frame, frame = cap.read()

            if not got_frame:
                break

            detection_range = frame[928:948, 739:741]
            text_range = frame[910:1020, 910:950]
            detection_range_avg = np.average(detection_range, axis=(0, 1))

            if not recording \
                    and in_range(prev, (150, 150, 150)) \
                    and not in_range(detection_range_avg, (150, 150, 150)) \
                    and has_text(text_range, "29"):
                # match start
                recording = True
                matches.append([frame_count])

            if recording \
                    and in_range(detection_range_avg, (10, 10, 220)) \
                    and has_text(text_range, "0"):
                # match end
                recording = False
                matches[-1].append(frame_count)

            frame_count += 1
            prev = detection_range_avg
            print(f'{frame_count / total_frames * 100:.2f}%', end='\r')

        print(matches)
    finally:
        # Release each capture as soon as its video is done, including skipped
        # videos and errors, rather than only the last one after the loop.
        cap.release()