In [2]:
from PIL import Image
import numpy as np
import pytesseract
import cv2
import os
import re

pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [29]:
def timestamp(image, debug = False):
    stamp = None
    extracted_text = pytesseract.image_to_string(image)
    line = extracted_text.split('\n')
    if debug:
        print(line)

    pattern = r'^\d{2}:\d{2}\.\d{3}$'
    time = line[0]
    if len(time) > 0:
        time = time.replace(',','')
        if re.match(pattern, time):
            stamp = time.replace('.',',')
        else:
            digits_only = re.sub(r'[^0-9]', '', time)
            if len(digits_only) == 7:
                stamp = digits_only[:2] + ':' + digits_only[2:4] + ',' + digits_only[4:]
            elif len(digits_only) == 5:
                stamp = digits_only[:1] + ':' + digits_only[1:3] + ',' + digits_only[3:] + '0**'
            elif len(digits_only) > 7:
                stamp = digits_only[:2] + ':' + digits_only[2:4] + ',' + digits_only[4:] + '0**'
            elif len(digits_only) > 5:
                stamp = digits_only[:2] + ':' + digits_only[2:4] + ',' + digits_only[4:] + '0**'
            else:
                stamp = '*** ' + digits_only
    else:
        stamp = '******'

    return stamp

In [15]:
def crop_image_around_text_lines(image, base_image, pad_extra=10, min_gap=5):    
    if base_image.ndim == 3:
        gray = np.mean(base_image, axis=2).astype(np.uint8)
    else:
        gray = base_image

    projection = np.sum(255 - gray, axis=1)
    threshold = np.max(projection) * 0.2

    row_has_text = projection > threshold
    lines = []
    in_line = False
    for i, has_text in enumerate(row_has_text):
        if has_text and not in_line:
            start = i
            in_line = True
        elif not has_text and in_line:
            end = i
            in_line = False
            lines.append((start, end))
    if in_line:
        lines.append((start, len(row_has_text)))
    
    merged_lines = []
    for line in lines:
        if not merged_lines or line[0] > merged_lines[-1][1] + min_gap:
            merged_lines.append(line)
        else:
            merged_lines[-1] = (merged_lines[-1][0], line[1])

    crops = []
    for start, end in merged_lines:
        center = (start + end) // 2
        height = end - start
        pad = height // 2 + pad_extra
        top = max(center - pad, 0)
        bottom = min(center + pad, image.size[1])
        crop_box = (0, top, image.size[0], bottom)
        crops.append(image.crop(crop_box))
    return crops

In [16]:
def single_run(image_path):
    base_image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    _, binary_image = cv2.threshold(base_image, 200, 255, cv2.THRESH_BINARY)

    image = Image.fromarray(binary_image)

    # cv2.imshow('Binary Image', binary_image)
    scale_factor = 2
    resized_image = image.resize(
        (image.width * scale_factor, image.height * scale_factor),
        resample=Image.LANCZOS
    )

    boxes = crop_image_around_text_lines(image, base_image)

    for line_img in boxes:
        # line_img.show()
        width, height = line_img.size
        left_box = (0, 0, box_width, height)
        middle_box = (box_width, 0, width - box_width - 1, height)
        right_box = (width - box_width, 0, width, height)

        left_section = line_img.crop(left_box)
        middle_section = line_img.crop(middle_box)
        right_section = line_img.crop(right_box) 

        if debug:
            left_section.show()
        start_stamp = timestamp(left_section, debug)
        if check:
            print("start", start_stamp)

        if debug:
            right_section.show()
        end_stamp = timestamp(right_section, debug)       
        if check:
            print("end", end_stamp)

        extracted_text = pytesseract.image_to_string(middle_section, config='psm 7 -l eng')
        if debug_words:
            print(extracted_text)
        line_text = extracted_text.split('\n')[0].replace('|', 'I')
        if check:
            print(line_text)

        if re.sub(r'[^0-9]', '', start_stamp) > re.sub(r'[^0-9]', '', end_stamp):
            stamp = "00:" + start_stamp +' --> 00:' + end_stamp + '&&'
        else:
            stamp = "00:" + start_stamp +' --> 00:' + end_stamp

        if len(line_text) != 0 or start_stamp != '******' or end_stamp != '******':
            start.append(start_stamp)
            end.append(end_stamp)
            text.append(line_text)
            time_stamp.append(stamp)

    return len(start), len(end), len(text), len(boxes)

In [None]:
def check_time():
    if "*" not in end[0]:
        last_end = re.sub(r'[^0-9]', '', end[0])
    elif "*" not in start[0]:
        # end_digits = re.sub(r'[^0-9]', '', end[0])
        last_end = re.sub(r'[^0-9]', '', start[0])
    else:
        last_end = 0000000
    for i in range(1, len(time_stamp)):
        start_time = re.sub(r'[^0-9]', '', start[0])
        end_time = re.sub(r'[^0-9]', '', end[0])
        if start_time < last_end:
            if start_time > end_time:


In [None]:
def check_time():
    last_end = re.sub(r'[^0-9]', '', end[0])
    for i in range(1, len(time_stamp)):
        start_time = re.sub(r'[^0-9]', '', start[0])
        end_time = re.sub(r'[^0-9]', '', end[0])
        


In [18]:
box_width = 78
debug = False
debug_words = False
check = False
folder_dir = "Images"
file_names = os.listdir(folder_dir)

In [23]:
offset = 0
count_file = 0 

In [27]:
start = []
end = []
text = []
time_stamp = []

In [28]:
image_path = folder_dir + '/' + file_names[count_file]
stats = single_run(image_path)
print(file_names[count_file], stats)
for i in range(len(text)):
    print(i)
    print(time_stamp[i])
    print(text[i])

1.png (14, 14, 14, 15)
0
00:00:00,002 --> 00:00:01,301
Hello everyone
1
00:00:01,560 --> 00:00:05,010**
Right now, it is 12 something at night
2
00:0:05,020** --> 00:00:06,010**&&
Middle of the night
3
00:0:06,020** --> 00:00:82,010**
[couldn't control my mouth and hand
4
00:00:08,202 --> 00:00:09,201
Just couldn't resist
5
00:00:11,870 --> 00:0:14,010**
Ordered some pudding Mango Pomelo Sago
6
00:00:14,002 --> 00:00:16,001
Wanted to do a taste test for everyone
7
00:00:16,002 --> 00:00:17,90010**
[Looks good]
8
00:00:17,001 --> 00:00:18,001
Iwill take a bite
9
00:00:19,047 --> 00:00:20,369
Everyone, watch my reaction
10
00:00:24,084 --> 00:00:25,836
[Start chewing]
11
00:00:26,576 --> 00:0:28,260**
[still chewing]
12
00:00:35,850 --> 00:00:37,545
It's not as amazing as I imagined
13
00:00:37,545 --> 00:00:39,578
i: I want to eat
