In [1]:
from PIL import Image
import os, sys
import cv2
from mmocr.apis import MMOCRInferencer
import numpy as np
from imutils.object_detection import non_max_suppression
import random
import time
import requests
import json


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /Users/aklywtx/opt/anaconda3/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so
'NoneType' object has no attribute 'cadam32bit_grad_fp32'
CUDA SETUP: Loading binary /Users/aklywtx/opt/anaconda3/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so...
dlopen(/Users/aklywtx/opt/anaconda3/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so, 0x0006): tried: '/Users/aklywtx/opt/anaconda3/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so' (not a mach-o file), '/System/Volumes/Preboot/Cryptexes/OS/Users/aklywtx/opt/anaconda3/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so' (no such file), '/Users/aklywtx/opt/anaconda3/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so' (not a mach-o file)


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [2]:
random.seed(42)

#### Tune the hyperparameters here

In [3]:
# hyperparameters
frame_interval = 50 # the interval for extracting frames
east_confidence = 0.5 # the confidence threshold for EAST text detection
num_frames_for_overlap_check = 3 # the number of frames for checking the overlap of text boxes and optical flow boxes

#### Change the video path here

In [4]:
# file path
vidpath = '../data/train5_crop.mp4'

## Step 1: Extract frames

In [5]:
def files(vidpath):
    """
    Extracts frames from the given video path at a specified interval.
    
    Args:
        vidpath (str): Path to the video file.

    Returns:
        tuple: Source video capture object, saved frame numbers, and directory of video frames.
    """
    base_name = os.path.basename(vidpath)
    vidname, _ = os.path.splitext(base_name)  # Split the base name to get the name without the extension
    
    try: # Remove image frames path if it already exists
        os.remove(f"{vidname}_image_frames")
    except OSError:
        pass
    if not os.path.exists(f"{vidname}_image_frames"): # create the directory if it does not already exist
        os.makedirs(f"{vidname}_image_frames")

    src_vid = cv2.VideoCapture(vidpath) # open the video file
    return(src_vid)
    
def process(src_vid, frame_interval):
    """
    Processes video frames, saving every nth frame as specified by the interval.
    
    Args:
        src_vid (cv2.VideoCapture): Source video capture object.
        frame_interval (int): Interval for frame extraction, a hyperparameter

    Returns:
        tuple: List of saved frame numbers and the directory of video frames.
    """
    base_name = os.path.basename(vidpath)
    vidname, _ = os.path.splitext(base_name)  # Split the base name to get the name without the extension
    saved_frame_nums = []
    
    index = 0
    while src_vid.isOpened():
        ret, frame = src_vid.read() 
        if not ret: # break at the end of the video (ret returns True if frame is succesfully read)
            break
        name = f'./{vidname}_image_frames/frame{str(index)}.png'

        if index % frame_interval == 0: # every {frame_interval} frame will be saved: can adjust this number to capture more or less frames
            print('Extracting frame...' + name)
            cv2.imwrite(name, frame)
            saved_frame_nums.append(index)
        index = index + 1
        
    src_vid.release()
    cv2.destroyAllWindows()
    
    video_frames_dir = f"./{vidname}_image_frames"
    
    return saved_frame_nums, video_frames_dir

In [6]:
vid = files(vidpath)
saved_frame_nums, video_frames_dir = process(vid, frame_interval)

Extracting frame..../train5_crop_image_frames/frame0.png
Extracting frame..../train5_crop_image_frames/frame50.png
Extracting frame..../train5_crop_image_frames/frame100.png
Extracting frame..../train5_crop_image_frames/frame150.png


## Step 2: Use Optical Flow to get rolling text bouding boxes candidates

In [7]:
def detect_screengrab_boundingboxes(video_path):
    ''' 
    Detects significant leftward motion in the input video and returns bounding boxes of significant motion.
    
    Args:
        video_path (str): Path to the video file.

    Returns:
        list: Coordinates of bounding boxes of significant leftward motion.
    '''
    vidname = vidname = str(video_path)
    cap = cv2.VideoCapture(video_path)
    
    # Get total number of frames and calculate frame at which point to save
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    save_frame = total_frames // 1.5  # save at ~75%

    ret, first_frame = cap.read() # Get the first frame: if not readable ret returns False
    if not ret:
        print("Failed to read the video.")
        return

    # Convert first frame to grayscale
    prev_gray = cv2.cvtColor(first_frame, cv2.COLOR_BGR2GRAY)

    # Initialize a mask for consistent leftward movement
    consistent_motion_mask = None
    consistency_threshold = 0.8  # Consistent movement threshold (percentage of frames)
    num_frames = 0

    saved_screengrab = False
    screengrab_frame = None
    bounding_boxes = []

    while cap.isOpened(): # Iterate through each frame of the video
        ret, frame = cap.read()
        if not ret:
            break
        
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # Compute dense optical flow between current and previous frames using Farneback method
        # "flow" is a 3d array dimensions (img height, img width, 2)
        # Channel 1: Horizontal Movement. Channel 2: Vertical Movement
        flow = cv2.calcOpticalFlowFarneback(prev_gray, gray, None, 0.5, 5, 15, 3, 5, 1.2, 0)
        
        # Calculate the mask of leftward movement (negative horizontal movement)
        leftward_motion_mask = flow[..., 0] < 0
        
        # Initialize the consistent motion mask
        # with the same shape as leftward_motion_mask to accumulate counts of leftward motion
        if consistent_motion_mask is None:
            consistent_motion_mask = np.zeros_like(leftward_motion_mask, dtype=np.float32)
        
        # Update the consistent leftward movement mask where leftward motion is true
        consistent_motion_mask[leftward_motion_mask] += 1
        num_frames += 1

        # Calculate the ratio of consistent leftward movement
        consistent_ratio = consistent_motion_mask / num_frames
        consistent_leftward_regions = consistent_ratio > consistency_threshold

        # Apply morphological closing to fill in the gaps
        consistent_leftward_mask = consistent_leftward_regions.astype(np.uint8)
        kernel = np.ones((5, 5), np.uint8) 
        consistent_leftward_mask = cv2.morphologyEx(consistent_leftward_mask, cv2.MORPH_CLOSE, kernel)

        # Create an image for highlighting the image where there is consistent leftward motion
        consistent_leftward_img = np.zeros_like(frame)
        consistent_leftward_img[consistent_leftward_mask == 1] = [0, 255, 0]  # Green

        # Use morphological closing (to fill small gaps) and closing (to remove noise) operations to clean leftward-motion mask
        kernel = np.ones((5, 5), np.uint8)
        cleaned_mask = cv2.morphologyEx(consistent_leftward_mask, cv2.MORPH_CLOSE, kernel)
        cleaned_mask = cv2.morphologyEx(cleaned_mask, cv2.MORPH_OPEN, kernel)
        
        # Save screengrab at save_frame point and extract the bounding boxes
        if num_frames == save_frame and not saved_screengrab:
            # Convert original frame to grayscale
            gray_frame = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)

            # Overlay green highlight on grayscale frame
            overlayed_img = cv2.addWeighted(gray_frame, 0.7, consistent_leftward_img, 0.3, 0)
            screengrab_frame = overlayed_img.copy()
            saved_screengrab = True

            contours, _ = cv2.findContours(cleaned_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            for cnt in contours:
                x, y, w, h = cv2.boundingRect(cnt)
                # Only add the bounding box if the width is at least 1.7x the height
                if w >= 1.7 * h:
                    bounding_boxes.append((x, y, x+w, y+h))
                    # Draw bounding box on the screengrab
                    cv2.rectangle(screengrab_frame, (x, y), (x+w, y+h), (0, 255, 0), 2)  # Green bounding box

            cv2.imwrite(f'./{vidname}_75percent_80threshold.png', screengrab_frame)
    
            return bounding_boxes
    
        prev_gray = gray.copy()
    
    cap.release()
    cv2.destroyAllWindows()

    return "fail"

In [8]:
start = time.time()
opflow_box_coordinates = detect_screengrab_boundingboxes(vidpath)
print(f"{vidpath} time taken: {time.time()-start}")

../data/train5_crop.mp4 time taken: 55.684545040130615


## Step 3: Use EAST to choose the best candidate

In [9]:
def east_detect(image, output_path, east_confidence):
    '''
    uses EAST detection to get x,y coordinates of all bounding boxes of input image 
    
    param images: paths to the image
    param output_path: path where to save copy of original image with bounding boxes drawn on top
    
    returns list of tuples: each tuple contains (x_min, y_min, x_max, y_max) of each detected bounding box

    code from: https://medium.com/technovators/scene-text-detection-in-python-with-east-and-craft-cbe03dda35d5
    '''
    
    
    layerNames = [
    	"feature_fusion/Conv_7/Sigmoid",
    	"feature_fusion/concat_3"]

    orig = image.copy()
    
    if len(image.shape) == 2:
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    
    (H, W) = image.shape[:2]
    
    # set the new width and height and then determine the ratio in change
    # for both the width and height: Should be multiple of 32
    # (newW, newH) = (320, 320)
    (newW, newH) = (1024, 1024)
    
    rW = W / float(newW)
    rH = H / float(newH)
    
    # resize the image and grab the new image dimensions
    image = cv2.resize(image, (newW, newH))
    
    (H, W) = image.shape[:2]
    
    net = cv2.dnn.readNet("frozen_east_text_detection.pb")
    
    blob = cv2.dnn.blobFromImage(image, 1.0, (W, H),
    	(123.68, 116.78, 103.94), swapRB=True, crop=False)
    
    net.setInput(blob)
    
    (scores, geometry) = net.forward(layerNames)
    
    (numRows, numCols) = scores.shape[2:4]
    rects = []
    confidences = []
    # loop over the number of rows
    for y in range(0, numRows):
        # extract the scores (probabilities), followed by the geometrical
        # data used to derive potential bounding box coordinates that
        # surround text
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]
    
        for x in range(0, numCols):
    		# if our score does not have sufficient probability, ignore it
            # Set minimum confidence as required
            if scoresData[x] < east_confidence:
                continue
    		# compute the offset factor as our resulting feature maps will
            #  x smaller than the input image
            (offsetX, offsetY) = (x * 4.0, y * 4.0)
            # extract the rotation angle for the prediction and then
            # compute the sin and cosine
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)
            # use the geometry volume to derive the width and height of
            # the bounding box
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]
            # compute both the starting and ending (x, y)-coordinates for
            # the text prediction bounding box
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)
            # add the bounding box coordinates and probability score to
            # our respective lists
            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])
                        
    boxes = non_max_suppression(np.array(rects), probs=confidences)
    # loop over the bounding boxes
    coordinates = []

    if len(boxes) == 0: # if no text/bounding boxes detected
        return coordinates
        
    if len(boxes) >= 1:
        for (startX, startY, endX, endY) in boxes:
            # scale the bounding box coordinates based on the respective
            # ratios
            startX = int(startX * rW)
            startY = int(startY * rH)
            endX = int(endX * rW)
            endY = int(endY * rH)
            # draw the bounding box on the image
            # origin is upper left corner
            # startX,startY ------
            # |                 |
            # |                 |
            # |                 |
            # ----------endX,endY
            startX = max(startX, 0) #TODO: ome starting points of east text bounding boxes are negative numbers. I dont know why
            out_image = cv2.rectangle(orig, (startX, startY), (endX, endY), (0, 255, 0), 2)
            # out_image = cv2.rectangle(out_image, (146, 888), (1920, 1002), (0, 255, 0), 2)
            coordinates += [[startX, startY, endX, endY]]
        
        cv2.imwrite(f"{output_path}", out_image)
        return coordinates


In [10]:
def get_text_coordinates_in_all_frames(saved_frame_nums, east_confidence):
    """
    Get the coordinates of the bounding boxes for text detection in all frames.

    Args:
        saved_frame_nums (list): A list of frame numbers to process.
        east_confidence (float): The confidence threshold for text detection.

    Returns:
        list: A list of coordinates for each frame, where each coordinate is a tuple (x, y, w, h).

    """
    coordinates_list= []

    for frame_number in saved_frame_nums:
        frame_path = video_frames_dir + f"/frame{frame_number}.png"  # path to individual frame being processed

        base_name = os.path.basename(frame_path)
        frame_name, _ = os.path.splitext(base_name)  # get the individual frame name without the extension
        
        if not os.path.isdir(f"{video_frames_dir}/output"):
            os.makedirs(f"{video_frames_dir}/output")  # makes output directory for the frame being processed
        
        image = cv2.imread(frame_path)
        coordinates = east_detect(image, f"{video_frames_dir}/output/out_{frame_name}_EAST.png", east_confidence)  # get coordinates of bounding boxes and save image to new output dir for that frame  
        coordinates_list += [coordinates]
        
    return coordinates_list

In [11]:
coordinates_list = get_text_coordinates_in_all_frames(saved_frame_nums, east_confidence)

In [12]:
opflow_box_coordinates

[(942, 798, 1200, 852), (952, 235, 1194, 275)]

In [13]:
coordinates_list

[[[144, 858, 326, 879],
  [277, 351, 399, 376],
  [30, 577, 153, 605],
  [118, 508, 245, 540],
  [28, 862, 75, 883],
  [108, 242, 219, 268],
  [967, 852, 1020, 875],
  [834, 175, 915, 203],
  [208, 740, 356, 767],
  [403, 340, 688, 374],
  [1, 512, 103, 541],
  [1096, 851, 1173, 874],
  [956, 234, 1076, 256],
  [403, 613, 498, 649],
  [11, 742, 138, 769],
  [31, 465, 166, 490],
  [562, 497, 688, 533],
  [405, 499, 547, 535],
  [161, 288, 288, 311],
  [513, 178, 648, 203],
  [99, 398, 189, 425],
  [393, 565, 558, 603],
  [0, 630, 80, 653],
  [217, 243, 281, 266],
  [99, 625, 217, 650],
  [1023, 853, 1080, 876],
  [586, 852, 706, 877],
  [121, 811, 208, 839],
  [376, 683, 600, 719],
  [54, 352, 253, 379],
  [427, 278, 528, 312],
  [768, 854, 915, 876],
  [980, 177, 1083, 200],
  [1020, 801, 1070, 822],
  [166, 465, 232, 489],
  [223, 577, 273, 602],
  [697, 338, 766, 375],
  [472, 854, 583, 879],
  [84, 861, 140, 883],
  [0, 400, 88, 425],
  [414, 230, 616, 266],
  [309, 861, 363, 881],


In [14]:
def calculate_overlap_area(rect1, rect2):
    """
    Calculate the overlapping area between two rectangles.
    
    Args:
        rect1 (tuple): The coordinates of the first rectangle in the form (startX, startY, endX, endY).
        rect2 (tuple): The coordinates of the second rectangle in the form (startX, startY, endX, endY).
    
    Returns:
        float: The overlapping area between the two rectangles.
    """
    x1_max = max(rect1[0], rect2[0])
    y1_max = max(rect1[1], rect2[1])
    x2_min = min(rect1[2], rect2[2])
    y2_min = min(rect1[3], rect2[3])
    
    overlap_width = max(0, x2_min - x1_max)
    overlap_height = max(0, y2_min - y1_max)
    
    if x1_max >= x2_min or y1_max >= y2_min:
        return 0
    
    return overlap_width * overlap_height


def calculate_total_overlapping_area(main_rect, rectangles):
    """
    Calculate the total overlapping area of multiple rectangles with the main rectangle.
    
    Args:
        main_rect (tuple): A tuple representing the main rectangle in the form (startX, startY, endX, endY).
        rectangles (list): A list of tuples representing the rectangles to calculate the overlapping areas with.
                           Each tuple should be in the form (startX, startY, endX, endY).
    
    Returns:
        float: The total overlapping area between the main rectangle and the given rectangles.
    """
    total_overlap_area = 0
    for rect in rectangles:
        total_overlap_area += calculate_overlap_area(main_rect, rect)
    
    return total_overlap_area


def get_best_opflow_box(opflow_box_coordinates, coordinates_list, num_frames):
    """
    Calculates the rolling box with the maximum overlap area.

    Args:
        opflow_box_coordinates (list): List of opflow box coordinates.
        coordinates_list (list): List of coordinates for each frame.
        num_frames (int): Number of frames to consider for overlap check.

    Returns:
        tuple: The rolling box with the maximum overlap area.

    """
    overlap_areas = []
    chosen_frames = random.sample(list(range(len(saved_frame_nums))), num_frames)
    
    for opflow_box_coordinate in opflow_box_coordinates:
        total_overlap = 0
        for frame_index in chosen_frames:
            total_overlap += calculate_total_overlapping_area(opflow_box_coordinate, coordinates_list[frame_index])
        overlap_areas.append(total_overlap)

    rolling_box_index = np.argmax(np.array(overlap_areas))
    rolling_box = opflow_box_coordinates[rolling_box_index]
    
    return rolling_box

In [15]:
rolling_box = get_best_opflow_box(opflow_box_coordinates, coordinates_list, num_frames_for_overlap_check)

In [16]:
rolling_box

(942, 798, 1200, 852)

## Step 4: Use mmocr to extract texts in rolling boxes

After getting the rolling box:

We crop the image with opflow boxes

but mmocr works badly on them....

Adding a wide border around these boxes can give us good results!

(Maybe because these boxes are too flat)

In [17]:
def crop_and_save_image(image_path, coords, output_path):
    '''Saves crops of the original image according to specified coordinates.

    Args:
        image_path (str): Path to the original image.
        coords (tuple): Coordinates of the bounding box in the format (x_min, y_min, x_max, y_max).
        output_path (str): Path of the output file.

    Returns:
        None

    Raises:
        cv2.error: If there is an error while saving the cropped image.

    '''
    
    img = cv2.imread(image_path)
    x_min, y_min, x_max, y_max = coords
    cropped_img = img[y_min:y_max, x_min:x_max]
    
    # Save
    try:
        cv2.imwrite(output_path, cropped_img)
    except cv2.error as e:
        print(coords)
        print(f"OpenCV error: {e}")


crop and save bounding boxes

In [28]:
# adding borders to the boxes:
def add_borders(input_img_path, output_img_path):
	"""
	Add borders to the input image.

	Parameters:
	input_img_path (str): The file path of the input image.
	output_img_path (str): The file path to save the output image.

	Returns:
	None
	"""
	old_im = Image.open(input_img_path)
	old_size = old_im.size
	
	new_size = (old_size[0] + 320, old_size[1] + 512)
	new_im = Image.new("RGB", new_size, (255, 255, 255))   ## luckily, this is already black!
	box = tuple((n - o) // 2 for n, o in zip(new_size, old_size))
	new_im.paste(old_im, box)
	
	new_im.save(output_img_path)
    
# Load models into memory
def get_ocr_texts(output_path):
	"""
	Extracts and sorts the OCR texts from the given output path.

	Args:
		output_path (str): The path to the OCR output file.

	Returns:
		list: A list of sorted OCR texts.
	"""
	ocr = MMOCRInferencer(det='DBNet', rec='SATRN', device='cpu')

	# Perform inference
	results = ocr(output_path, show=False, print_result=False)

	# Extract the OCR texts as a list of tuples with (word bounding box first coorinate, word)
	tuples_list = [(polygon[0], text) for polygon, text in zip(results["predictions"][0]["det_polygons"], results["predictions"][0]["rec_texts"])]

	# Sort the list of tuples based on the first number(word bounding box first coorinate) in each tuple
	sorted_tuples = sorted(tuples_list, key=lambda x: x[0])

	# Extract the sorted texts
	sorted_ocr_texts = [text for _, text in sorted_tuples]
 
	return sorted_ocr_texts

In [29]:
"""
1. Creates a directory to store cropped opflow images.
2. Iterates over a list of frame numbers and performs the following operations for each frame:
   a. Retrieves the path of the frame image.
   b. Crops and saves the opflow image using a rolling box.
   c. Prints the path of the saved cropped opflow image.
   d. Adds borders to the cropped opflow image.
   e. Extracts OCR texts from the opflow image with borders.
   f. Prints the OCR texts for the current frame.
   g. Appends the OCR texts to a list.

Note: The functions `crop_and_save_image`, `add_borders`, and `get_ocr_texts` are defined above
"""

ocr_texts = []

output_path = os.path.join(video_frames_dir, "croppedOpflow")
if os.path.isdir(output_path):
    print("The cropped folders exists! Please delete them :)")
else:
    os.makedirs(output_path)

for frame_num in saved_frame_nums:
    frame_path = os.path.join(video_frames_dir, f"frame{frame_num}.png")
        
    opflow_crop_path = os.path.join(output_path, f"frame_num_{frame_num}.jpg")
    crop_and_save_image(frame_path, rolling_box, opflow_crop_path)
    
    print(f"Saved cropped opflow box to {opflow_crop_path}")
    
    opflow_crop_with_border_path = os.path.join(output_path, f"frame_num_{frame_num}_border.jpg")
    add_borders(opflow_crop_path, opflow_crop_with_border_path)
    cur_ocr_texts = get_ocr_texts(opflow_crop_with_border_path)
    print(f"Current frame ocr texts are {cur_ocr_texts}")
    ocr_texts.append(cur_ocr_texts)


Saved cropped opflow box to ./train5_crop_image_frames/croppedOpflow/frame_num_0.jpg
Loads checkpoint by http backend from path: https://download.openmmlab.com/mmocr/textdet/dbnet/dbnet_resnet50-oclip_1200e_icdar2015/dbnet_resnet50-oclip_1200e_icdar2015_20221102_115917-bde8c87a.pth




Loads checkpoint by http backend from path: https://download.openmmlab.com/mmocr/textrecog/satrn/satrn_shallow_5e_st_mj/satrn_shallow_5e_st_mj_20220915_152443-5fd04a4c.pth


Output()

The model and loaded state dict do not match exactly

unexpected key in source state_dict: data_preprocessor.mean, data_preprocessor.std



Current frame ocr texts are ['Re:', 'Min.', '+++', 'Versp']
Saved cropped opflow box to ./train5_crop_image_frames/croppedOpflow/frame_num_50.jpg
Loads checkpoint by http backend from path: https://download.openmmlab.com/mmocr/textdet/dbnet/dbnet_resnet50-oclip_1200e_icdar2015/dbnet_resnet50-oclip_1200e_icdar2015_20221102_115917-bde8c87a.pth
Loads checkpoint by http backend from path: https://download.openmmlab.com/mmocr/textrecog/satrn/satrn_shallow_5e_st_mj/satrn_shallow_5e_st_mj_20220915_152443-5fd04a4c.pth


Output()

The model and loaded state dict do not match exactly

unexpected key in source state_dict: data_preprocessor.mean, data_preprocessor.std



Current frame ocr texts are ['Min.', '++', 'Verspatur']
Saved cropped opflow box to ./train5_crop_image_frames/croppedOpflow/frame_num_100.jpg
Loads checkpoint by http backend from path: https://download.openmmlab.com/mmocr/textdet/dbnet/dbnet_resnet50-oclip_1200e_icdar2015/dbnet_resnet50-oclip_1200e_icdar2015_20221102_115917-bde8c87a.pth
Loads checkpoint by http backend from path: https://download.openmmlab.com/mmocr/textrecog/satrn/satrn_shallow_5e_st_mj/satrn_shallow_5e_st_mj_20220915_152443-5fd04a4c.pth


Output()

The model and loaded state dict do not match exactly

unexpected key in source state_dict: data_preprocessor.mean, data_preprocessor.std



Current frame ocr texts are ['Jin.', '+++', 'Verspatung']
Saved cropped opflow box to ./train5_crop_image_frames/croppedOpflow/frame_num_150.jpg
Loads checkpoint by http backend from path: https://download.openmmlab.com/mmocr/textdet/dbnet/dbnet_resnet50-oclip_1200e_icdar2015/dbnet_resnet50-oclip_1200e_icdar2015_20221102_115917-bde8c87a.pth
Loads checkpoint by http backend from path: https://download.openmmlab.com/mmocr/textrecog/satrn/satrn_shallow_5e_st_mj/satrn_shallow_5e_st_mj_20220915_152443-5fd04a4c.pth


Output()

The model and loaded state dict do not match exactly

unexpected key in source state_dict: data_preprocessor.mean, data_preprocessor.std



Current frame ocr texts are ['+++', '-', 'Versplatung', 'the']


In [30]:
for text in ocr_texts:
	print(text)

['Re:', 'Min.', '+++', 'Versp']
['Min.', '++', 'Verspatur']
['Jin.', '+++', 'Verspatung']
['+++', '-', 'Versplatung', 'the']


# Step 5: Use llama to concatenate the texts

In [31]:
def llama3(prompt):
    """
    Sends a prompt to the Llama3.1 model and returns the response.

    Args:
        prompt (str): The prompt to send to the Llama3.1 model.

    Returns:
        str: The response from the Llama3.1 model.
    """
    url = "http://localhost:11434/api/chat"
    data = {
        "model": "llama3.1",
        "messages": [
            {
              "role": "user",
              "content": prompt
            }
        ],
        "stream": False
    }
    
    headers = {
        'Content-Type': 'application/json'
    }
    
    response = requests.post(url, headers=headers, json=data)
    return(response.json()['message']['content'])

In [35]:
prompt = f"Can you please take these word lists extracted from each frame of a rolling text video, concatenate them based on overlapping parts, and provide me with the resulting sentence? Please keep the original language, don't translate. Don't explain; provide only the sentence. {ocr_texts}"
response = llama3(prompt)
print(response)

Der Minister verspätet sich immer wieder.
