**Initialization**

In [1]:
import google.generativeai as genai
import json
import os
from dotenv import load_dotenv
import easyocr
import cv2
from rapidfuzz import fuzz

# Load variables from a .env file into the process environment so that the
# os.getenv("GEMINI_API_KEY") lookup in generate() can find the key.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

**Gemini Model**

In [2]:
def generate(prompt, image_path) -> list | dict:
    api_key = os.getenv("GEMINI_API_KEY")
    genai.configure(api_key=api_key)

    model = genai.GenerativeModel(
        model_name="gemini-2.5-flash",
        system_instruction="You are a helpful assistant that extracts newspaper fields from images.",
        generation_config={"response_mime_type": "application/json"}
    )

    with open(image_path, "rb") as image_file:
        image_bytes = image_file.read()

    response = model.generate_content([
        {"text": prompt},
        {"mime_type": "image/png", "data": image_bytes}
    ])

    raw_json = response.text
    data = json.loads(raw_json)

    return data

**Call to generate from Gemini**

In [3]:
# JSON schema the reply must follow: a flat array of headline strings.
headline_schema = {"type": "array", "items": {"type": "string"}}

# Instruction text, with the pretty-printed schema appended at the end.
headline_prompt = "".join([
    "You are given a newspaper image. ",
    "Extract only the main article headlines — ignore advertisements, captions, subheadlines, and any other text. ",
    "Return the result strictly matching this JSON schema:\n\n",
    json.dumps(headline_schema, indent=2),
])

target_image_path = "sample3.png"

headlines = generate(headline_prompt, target_image_path)

# Dev log
print(json.dumps(headlines, indent=2))

[
  "Ex-OWWA chief, 10 others face graft raps",
  "DepEd unveils 10-yr plan to reform basic education",
  "Gov\u2019t set to roll out P20/k rice for 15m families",
  "NBI nabs Chinese posing as Filipino"
]


**EasyOCR reads**

In [4]:
# Load the page first: cv2.imread returns None (it does not raise) when the
# file is missing or cannot be decoded, so check before the OCR run.
image = cv2.imread(target_image_path)
if image is None:
    raise FileNotFoundError(f"Could not read image: {target_image_path}")

reader = easyocr.Reader(['en'])
# Each result is (four corner points, recognised text, confidence).
# NOTE(review): width_ths=1 presumably widens horizontal merging of adjacent
# text boxes compared with the library default; confirm against EasyOCR docs.
results = reader.readtext(image, width_ths=1)

# Draw every raw OCR box on a copy so the original pixels stay untouched.
image_raw = image.copy()
for (top_left, top_right, bottom_right, bottom_left), text, confidence in results:
    tl = (int(top_left[0]), int(top_left[1]))
    br = (int(bottom_right[0]), int(bottom_right[1]))
    cv2.rectangle(image_raw, tl, br, (0, 0, 255), 2)
    coord_label = f"{tl} {br}"
    cv2.putText(image_raw, coord_label, (tl[0], tl[1] - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

# Build the output name from the split extension: a plain
# str.replace(".png", ...) leaves non-PNG paths unchanged and would
# overwrite the source image.
raw_stem, raw_ext = os.path.splitext(target_image_path)
output_path_raw = f"{raw_stem}_ocr_boxes{raw_ext or '.png'}"
cv2.imwrite(output_path_raw, image_raw)

print(results)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


[([[np.int32(34), np.int32(53)], [np.int32(255), np.int32(53)], [np.int32(255), np.int32(139)], [np.int32(34), np.int32(139)]], 'NEWS', np.float64(0.9998408555984497)), ([[np.int32(971), np.int32(113)], [np.int32(1103), np.int32(113)], [np.int32(1103), np.int32(129)], [np.int32(971), np.int32(129)]], 'WEDNESDAY, JULY 30, 2025', np.float64(0.6128967716935996)), ([[np.int32(977), np.int32(127)], [np.int32(1103), np.int32(127)], [np.int32(1103), np.int32(145)], [np.int32(977), np.int32(145)]], 'mst daydesk@gmail,com', np.float64(0.7478406280994535)), ([[np.int32(901), np.int32(143)], [np.int32(1109), np.int32(143)], [np.int32(1109), np.int32(205)], [np.int32(901), np.int32(205)]], 'standard', np.float64(0.9990762593531431)), ([[np.int32(25), np.int32(214)], [np.int32(1107), np.int32(214)], [np.int32(1107), np.int32(306)], [np.int32(25), np.int32(306)]], 'Ex-OWWA chief; 10 others face graft raps', np.float64(0.5661722918463146)), ([[np.int32(33), np.int32(315)], [np.int32(139), np.int32(31

**Find the bounding-box coordinates of the headlines**

In [None]:
def is_close(coordinate1, coordinate2, gap_x=20, gap_y=50):
    """Return True if two boxes share a row, overlap, or lie within the gaps.

    Each box is a ``(top_left, bottom_right)`` pair of ``(x, y)`` points.

    Args:
        coordinate1: First box.
        coordinate2: Second box.
        gap_x: Largest horizontal gap (in pixels) still counted as close.
            It is also the tolerance used when comparing the boxes' top
            y-coordinates for the same-row shortcut.
        gap_y: Largest vertical gap (in pixels) still counted as close.

    Returns:
        True when the boxes' top edges are less than ``gap_x`` apart
        vertically (regardless of horizontal distance), or when the boxes
        overlap or are separated by no more than ``gap_x`` horizontally and
        ``gap_y`` vertically; False otherwise.
    """
    tl_a, br_a = coordinate1
    tl_b, br_b = coordinate2

    # Same-row shortcut: top edges at nearly the same height count as close
    # no matter how far apart the boxes are horizontally.
    # NOTE(review): this compares y-values against gap_x, not gap_y; kept
    # as-is to preserve existing behaviour. Confirm it is intended.
    if abs(tl_a[1] - tl_b[1]) < gap_x:
        return True

    separated = (
        br_a[0] + gap_x < tl_b[0]      # first box is entirely left of second
        or br_b[0] + gap_x < tl_a[0]   # second box is entirely left of first
        or br_a[1] + gap_y < tl_b[1]   # first box is entirely above second
        or br_b[1] + gap_y < tl_a[1]   # second box is entirely above first
    )
    return not separated

# Every headline starts without a box; a value becomes
# ((x1, y1), (x2, y2)) once an OCR fragment has been matched to it.
bounding_box = dict.fromkeys(headlines)

for quad, ocr_text, _ in results:
    for headline in headlines:
        score = fuzz.partial_ratio(headline, ocr_text)
        # This pass only accepts strongly matching, multi-word OCR fragments.
        if score <= 90 or len(ocr_text.split()) <= 1:
            continue

        corner_tl, _, corner_br, _ = quad
        candidate = (
            (int(corner_tl[0]), int(corner_tl[1])),
            (int(corner_br[0]), int(corner_br[1])),
        )

        known = bounding_box[headline]
        if known is None:
            # First fragment seen for this headline seeds its box.
            bounding_box[headline] = candidate
        elif is_close(known, candidate):
            # Grow the stored box to the union of itself and the new fragment.
            (known_x1, known_y1), (known_x2, known_y2) = known
            bounding_box[headline] = (
                (min(known_x1, candidate[0][0]), min(known_y1, candidate[0][1])),
                (max(known_x2, candidate[1][0]), max(known_y2, candidate[1][1])),
            )


**Merge single-word OCR fragments into existing headline boxes**

In [10]:
# Second pass: single-word OCR fragments are merged only into a headline box
# that already exists and that the fragment is close to.
for quad, word, _ in results:
    for headline in headlines:
        score = fuzz.partial_ratio(headline, word)
        if not (score > 90 and len(word.split()) == 1):
            continue

        corner_tl, _, corner_br, _ = quad
        candidate = (
            (int(corner_tl[0]), int(corner_tl[1])),
            (int(corner_br[0]), int(corner_br[1])),
        )

        known = bounding_box[headline]
        if not known or not is_close(known, candidate):
            continue

        # Dev log: box heights serve as a rough font-size indicator.
        print(word)
        font_size = abs(known[0][1] - known[1][1])
        current_font_size = abs(candidate[0][1] - candidate[1][1])
        print(font_size)
        print(current_font_size)
        print(known)
        print(candidate[0], candidate[1])

        # Replace the stored box with the union of it and the fragment.
        (known_x1, known_y1), (known_x2, known_y2) = known
        bounding_box[headline] = (
            (min(known_x1, candidate[0][0]), min(known_y1, candidate[0][1])),
            (max(known_x2, candidate[1][0]), max(known_y2, candidate[1][1])),
        )

Education
80
14
((249, 851), (884, 931))
(393, 917) (449, 931)


**Write the bounding box**

In [11]:
# Draw the merged headline boxes on a fresh copy of the page.
image_merged = image.copy()
for headline, box in bounding_box.items():
    if box is None:
        # No OCR fragment matched this headline; unpacking None would raise
        # TypeError, so report it and move on.
        print(f"No bounding box found for headline: {headline!r}")
        continue
    tl, br = box
    # (255, 0, 0) is blue in OpenCV's BGR channel order; thickness is 5 px.
    cv2.rectangle(image_merged, tl, br, (255, 0, 0), 5)
    print(tl, br)

# Insert "_result" before the extension of the input file name.
merged_stem, merged_ext = os.path.splitext(target_image_path)
output_path_merged = f"{merged_stem}_result{merged_ext}"
cv2.imwrite(output_path_merged, image_merged)

(25, 214) (1107, 306)
(249, 851) (884, 931)
(33, 1111) (229, 1233)
(469, 1660) (883, 1705)


True