In [None]:
!pip install pytesseract pillow

#### Import the necessary libraries and Load the Images

In [1]:
import pytesseract
from PIL import Image
import os
import json
import pandas as pd

!export TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata/
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  
os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/4.00/tessdata/'
image_dir = 'bible/'
images = [f for f in os.listdir(image_dir) if f.endswith('.png')]
print(images)

['page_44.png', 'page_60.png', 'page_42.png', 'page_23.png', 'page_18.png', 'page_50.png', 'page_52.png', 'page_63.png', 'page_66.png', 'page_56.png', 'page_26.png', 'page_68.png', 'page_30.png', 'page_14.png', 'page_59.png', 'page_35.png', 'page_46.png', 'page_33.png', 'page_43.png', 'page_16.png', 'page_65.png', 'page_58.png', 'page_28.png', 'page_36.png', 'page_11.png', 'page_25.png', 'page_15.png', 'page_24.png', 'page_40.png', 'page_53.png', 'page_41.png', 'page_70.png', 'page_49.png', 'page_12.png', 'page_47.png', 'page_69.png', 'page_17.png', 'page_48.png', 'page_13.png', 'page_64.png', 'page_61.png', 'page_32.png', 'page_57.png', 'page_27.png', 'page_29.png', 'page_38.png', 'page_45.png', 'page_19.png', 'page_10.png', 'page_9.png', 'page_62.png', 'page_31.png', 'page_34.png', 'page_39.png', 'page_37.png', 'page_67.png', 'page_21.png', 'page_20.png', 'page_55.png', 'page_51.png', 'page_54.png', 'page_22.png']


#### OCR Function

In [3]:
def extract_text_from_region(image_path, region, lang='amh'):
    with Image.open(image_path) as img:
        cropped_img = img.crop(region)
        text = pytesseract.image_to_string(cropped_img, lang=lang)
    return text

#### Extracting Text from Annotated Regions

In [5]:
annotation_dir = 'preprocessed_bible_ann/'
annotations = []


for file_name in os.listdir(annotation_dir):
    if file_name.endswith('.json'):
        file_path = os.path.join(annotation_dir, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            image_path = os.path.join(image_dir, data['imagePath'])
            for shape in data['shapes']:
                label = shape['label']
                points = shape['points']
                x_min = min([p[0] for p in points])
                y_min = min([p[1] for p in points])
                x_max = max([p[0] for p in points])
                y_max = max([p[1] for p in points])
                region = (x_min, y_min, x_max, y_max)
                text = extract_text_from_region(image_path, region)
                annotations.append({
                    'image': image_path,
                    'label': label,
                    'region': region,
                    'text': text
                })

#### Saving the data

In [6]:
df = pd.DataFrame(annotations)
df.to_csv('extracted_texts.csv', index=False, encoding='utf-8')
df.head(10)

Unnamed: 0,image,label,region,text
0,bible/../bible/page_10.png,page_10,"(156.8275862068965, 145.43103448275855, 958.55...",\n \n \n\nቋ ዘሉንዞለ ነኽአኽግይሁ |\n\n \n\n \n\n% ቭ....
1,bible/../bible/page_9.png,page_9,"(36.561403508771946, 143.4298245614036, 880.42...",ያሴስል ክጦቡለኢፅ\nቦቱዞ፦ፀክልኒ 5ዐ.ሆት፡\nወሐየርያፉ አኔክፀቀ፡ክ\n...
2,bible/../bible/page_12.png,page_12,"(185.58490566037744, 90.01886792452837, 1000.6...",\n\nተ ።ቕ\nመ።፡ 5ሃ።2ክ፣ዘህኩሎ።:ወይ\n|ይቃ፡ወለ 5 መጽእ፡...
3,bible/../bible/page_15.png,page_15,"(10.076923076923094, 86.23076923076928, 890.84...",\n \n \n\nህ መጣዘሮድጩ ጳ፣፲ጄቶ\nሀ'ዊጅ፻5 58፡88 ጩ5፣...
4,bible/../bible/page_14.png,page_14,"(108.2962962962963, 80.12962962962969, 978.666...",፡ የመዬ፡ ፀዖዞብኔ ሏትቅፉ\nፎ»፡ ፮፯ጭ፲ብ፡ተዱ፴\n\nዌ፲ወጸጎራቂፄዘሕ...
5,bible/../bible/page_13.png,page_13,"(23.538461538461547, 88.15384615384622, 885.07...",ኃዠ 6በ:ህጢዚየ፡ ጸመ?\n|ኛ ክ ሠሠ-ታሰ-:ግ\n! መሙሽ ሰ 2: ፳ቹሕ...
6,bible/../bible/page_11.png,page_11,"(15.921568627451052, 126.19607843137257, 857.0...",\n\nከር.። ሖ.ዳ . . .9ወ. ህቂ\n፻8፦ ዕን ት፡5ጻ3\nአዥብ፣...
