**Initialization**

In [1]:
import google.generativeai as genai
import json
import os
from dotenv import load_dotenv
import easyocr
import cv2
from rapidfuzz import fuzz

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

**Gemini Model**

In [2]:
def generate(prompt, image_path) -> list | dict:
    api_key = os.getenv("GEMINI_API_KEY")
    genai.configure(api_key=api_key)

    model = genai.GenerativeModel(
        model_name="gemini-2.5-flash",
        system_instruction="You are a helpful assistant that extracts newspaper fields from images.",
        generation_config={"response_mime_type": "application/json"}
    )

    with open(image_path, "rb") as image_file:
        image_bytes = image_file.read()

    response = model.generate_content([
        {"text": prompt},
        {"mime_type": "image/png", "data": image_bytes}
    ])

    raw_json = response.text
    data = json.loads(raw_json)

    return data

**Call to generate from Gemini**

In [3]:
headline_schema = {
	"type": "array",
	"items": {"type": "string"}
}

headline_prompt = (
	"You are given a newspaper image. "
	"Extract only the main article headlines — ignore advertisements, captions, subheadlines, and any other text. "
	"Return the result strictly matching this JSON schema:\n\n"
	f"{json.dumps(headline_schema, indent=2)}"
)

target_image_path = "page_8.png"

headlines = generate(headline_prompt, target_image_path)

for i, headline in enumerate(headlines):
    headlines[i] = headlines[i].replace("/u", " ")

# Dev log
print(json.dumps(headlines, indent=2))

[
  "Thunderbelles' 5th-set grit turns back Crossovers",
  "Tenorio to follow in Jawo's footsteps as playing coach?",
  "Caloocan, Pangasinan hurdle rivals",
  "Cone lauds CJ Perez for helping Gilas defeat Black Bears",
  "Edoc shines as weather whips up surprises at JPGT Riviera"
]


**EasyOCR reads**

In [4]:
reader = easyocr.Reader(['en'])
image = cv2.imread(target_image_path)
results = reader.readtext(image, width_ths=2, link_threshold=0.3,)

image_raw = image.copy()
for (top_left, top_right, bottom_right, bottom_left), text, confidence in results:
    tl = (int(top_left[0]), int(top_left[1]))
    br = (int(bottom_right[0]), int(bottom_right[1]))
    cv2.rectangle(image_raw, tl, br, (0, 0, 255), 2)
    coord_label = f"{tl} {br}"
    cv2.putText(image_raw, coord_label, (tl[0], tl[1] - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

output_path_raw = target_image_path.replace(".png", "_ocr_boxes.png")
cv2.imwrite(output_path_raw, image_raw)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


True

**Finds the coordinates of the headings**

In [5]:
def is_close(coordinate1, coordinate2, gap_x=20, gap_y=50):
    ex_tl, ex_br = coordinate1
    tl, br = coordinate2

    horizontal_close = abs(ex_tl[1] - tl[1]) < gap_x 
    if horizontal_close:
        return True
    else:
        """
        Returns True if rectangles overlap or are within gap thresholds.
        """
        if ex_br[0] + gap_x < tl[0]:  # a is strictly left of b
            return False
        if br[0] + gap_x < ex_tl[0]:  # b is strictly left of a
            return False
        if ex_br[1] + gap_y < tl[1]:  # a is strictly above b
            return False
        if br[1] + gap_y < ex_tl[1]:  # b is strictly above a
            return False
        return True

bounding_box = {}
for headline in headlines:
    bounding_box[headline] = None


for coordinates, text, _ in results:
    for headline in headlines:
        score = fuzz.partial_ratio(headline, text)

        if score > 90 and len(text) > 3:
            
            top_left, _, bottom_right, _ = coordinates
            current_box = ((int(top_left[0]), int(top_left[1])),
                           (int(bottom_right[0]), int(bottom_right[1])))
            
            

            # Assign the coordinates
            if bounding_box[headline] is None:
                bounding_box[headline] = current_box
            else:
                print(f"{text=} {headline=} {current_box}")
                if is_close(bounding_box[headline], current_box):
                    
                    (ex_tl_x, ex_tl_y), (ex_br_x, ex_br_y) = bounding_box[headline]
                    new_tl = (min(ex_tl_x, current_box[0][0]), min(ex_tl_y, current_box[0][1]))
                    new_br = (max(ex_br_x, current_box[1][0]), max(ex_br_y, current_box[1][1]))
                    bounding_box[headline] = (new_tl, new_br)


text="Thunderbelles'" headline="Thunderbelles' 5th-set grit turns back Crossovers" ((103, 888), (798, 1046))
text='Sth-set grit' headline="Thunderbelles' 5th-set grit turns back Crossovers" ((107, 1035), (652, 1205))
text='turns back' headline="Thunderbelles' 5th-set grit turns back Crossovers" ((98, 1174), (607, 1319))
text='Crossovers' headline="Thunderbelles' 5th-set grit turns back Crossovers" ((105, 1312), (596, 1450))
text='footsteps as playing coach?' headline="Tenorio to follow in Jawo's footsteps as playing coach?" ((794, 2482), (3587, 2862))
text='Tenorio' headline="Tenorio to follow in Jawo's footsteps as playing coach?" ((2348, 3166), (2476, 3203))
text='Pangasinan' headline='Caloocan, Pangasinan hurdle rivals' ((100, 4139), (616, 4311))
text='hurdle rivals' headline='Caloocan, Pangasinan hurdle rivals' ((105, 4276), (638, 4414))
text='Caloocan;' headline='Caloocan, Pangasinan hurdle rivals' ((559, 5251), (753, 5299))
text='surprises at JPGT Riviera' headline='Edoc shines a

**Write the bounding box**

In [6]:
image_merged = image.copy()
for tl, br in bounding_box.values():
    cv2.rectangle(image_merged, tl, br, (0, 255, 0), 5)
    print(tl, br)

output_path_merged = f"{os.path.splitext(target_image_path)[0]}_result{os.path.splitext(target_image_path)[1]}"
cv2.imwrite(output_path_merged, image_merged)

(3335, 803) (3594, 846)
(794, 2240) (3587, 2862)
(100, 3999) (638, 4414)
(826, 3844) (3534, 4051)
(1039, 5301) (2493, 5619)


True