In [1]:
#!pip install easyocr

In [2]:
import os
import cv2
import glob
import json
import torch
import easyocr
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# Enable cuDNN's benchmark mode for the rest of the session.
torch.backends.cudnn.benchmark = True
# Best-effort tuning: PyTorch builds that do not provide
# set_float32_matmul_precision raise AttributeError here, which is ignored.
# Any other exception is unexpected and is allowed to surface.
try:
    torch.set_float32_matmul_precision("high")
except AttributeError:
    pass

# Parse data path

In [3]:
# Root folder that holds one sub-folder of keyframes per video. The default is
# the original machine-specific location; set the KEYFRAMES_DIR environment
# variable to point the notebook at another copy of the data.
keyframes_dir = os.environ.get(
    'KEYFRAMES_DIR',
    r'D:\VN_Multi_User_Video_Search\frontend\ai\public\data\Keyframes',
)

# Nested mapping: data part (the "Lxx" half of the folder name)
#   -> video id (the "Vxxx" half) -> sorted list of .jpg keyframe paths.
all_keyframe_paths = dict()
for folder_name in sorted(os.listdir(keyframes_dir)):
    folder_path = os.path.join(keyframes_dir, folder_name)
    # Only directories are keyframe folders; names ending in ".mp4" are skipped too.
    if not os.path.isdir(folder_path) or folder_name.endswith('.mp4'):
        continue
    # Folder names have the form Lxx_Vxxx. Anything else is reported and
    # skipped instead of aborting the scan with an unpacking error.
    name_parts = folder_name.split('_')
    if len(name_parts) != 2:
        print(f'Skipping folder with unexpected name: {folder_name!r}')
        continue
    data_part, video_id = name_parts
    # glob.escape keeps characters such as "[" in the folder path from being
    # interpreted as wildcards.
    keyframe_paths = sorted(glob.glob(os.path.join(glob.escape(folder_path), '*.jpg')))
    all_keyframe_paths.setdefault(data_part, dict())[video_id] = keyframe_paths

# Run inference

In [4]:
# Create the Vietnamese ('vi') OCR reader with GPU enabled. The model is loaded
# into memory here, so this cell needs to run only once.
reader = easyocr.Reader(lang_list=['vi'], gpu=True)

In [5]:
# Number of keyframes sent to the OCR reader in one batch.
bs = 16
# Root folder for the OCR output files.
save_dir = './ocr'
# exist_ok=True replaces the separate existence check: it cannot race with
# another process creating the folder and also creates missing parent folders.
os.makedirs(save_dir, exist_ok=True)

def load_img(path, max_side=1280):
    """Read an image file and shrink it so its longest side is at most ``max_side``.

    The file bytes are read with ``np.fromfile`` and decoded with
    ``cv2.imdecode`` (presumably so that paths ``cv2.imread`` cannot open,
    such as non-ASCII Windows paths, still load -- TODO confirm).

    Args:
        path: Path of the image file to read.
        max_side: Upper bound, in pixels, for the longer image side. Images
            that already fit are returned at their original size.

    Returns:
        The image decoded with ``cv2.IMREAD_COLOR`` (downscaled when needed),
        or ``None`` when the file is empty or cannot be decoded.
    """
    data = np.fromfile(path, dtype=np.uint8)
    # An empty buffer can make cv2.imdecode raise instead of returning None,
    # so a zero-byte file is reported as "no image" up front.
    if data.size == 0:
        return None
    img = cv2.imdecode(data, cv2.IMREAD_COLOR)
    if img is None:
        return None
    h, w = img.shape[:2]
    longest = max(h, w)
    if longest > max_side:
        scale = max_side / longest
        # Clamp each side to at least 1 px: for a very elongated image the
        # truncated short side would otherwise be 0, which cv2.resize rejects.
        new_size = (max(1, int(w * scale)), max(1, int(h * scale)))
        img = cv2.resize(img, new_size, interpolation=cv2.INTER_LINEAR)
    return img

# Run OCR over every video and write one JSON file per video to
# <save_dir>/<data part>/<video id>.json. Each file holds a list with exactly
# one entry per keyframe, in keyframe order; an entry is the list of paragraph
# texts found in that frame ([] when nothing was kept or the image failed to load).
keys = sorted(all_keyframe_paths.keys())
# A single thread pool serves every batch; creating a new pool per batch only
# adds start-up/tear-down overhead.
with ThreadPoolExecutor(max_workers=8) as ex:
    for key in tqdm(keys):
        out_key_dir = os.path.join(save_dir, key)
        os.makedirs(out_key_dir, exist_ok=True)
        video_keyframe_paths = all_keyframe_paths[key]
        for video_id in tqdm(sorted(video_keyframe_paths.keys()), leave=False):
            result_path = f"{out_key_dir}/{video_id}.json"
            # Resume support: a video whose result file already exists is skipped.
            if os.path.exists(result_path):
                continue

            frames = video_keyframe_paths[video_id]
            video_ocr_results = []

            for i in range(0, len(frames), bs):
                batch_paths = frames[i:i+bs]
                loaded = list(ex.map(load_img, batch_paths))

                # Start with an independent empty result for every frame of the
                # batch so frames that fail to load keep their slot and the
                # output stays index-aligned with `frames`.
                batch_texts = [[] for _ in batch_paths]
                valid_idx = [j for j, im in enumerate(loaded) if im is not None]
                if valid_idx:
                    imgs = [loaded[j] for j in valid_idx]
                    results = reader.readtext_batched(imgs, batch_size=len(imgs), detail=1, decoder='greedy')

                    for j, res in zip(valid_idx, results):
                        # Each detection is [bbox, text, conf]; keep those with conf > 0.5.
                        refined = [it for it in res if it[2] > 0.5]
                        # Merge the remaining detections into paragraphs.
                        refined = easyocr.utils.get_paragraph(refined)
                        # Keep only the text of each paragraph.
                        batch_texts[j] = [it[1] for it in refined]

                video_ocr_results.extend(batch_texts)

            # Write to a temporary file and rename it into place, so an
            # interrupted run never leaves a truncated JSON that the resume
            # check above would mistake for a finished video.
            tmp_path = result_path + ".tmp"
            with open(tmp_path, "w", encoding='utf-8') as f:
                json.dump(video_ocr_results, f, ensure_ascii=False)
            os.replace(tmp_path, result_path)

100%|██████████| 30/30 [4:45:26<00:00, 570.90s/it]  
