**Initialization**

In [21]:
import json
import mimetypes
import os
import random

import cv2
import easyocr
import google.generativeai as genai
from dotenv import load_dotenv
from fuzzywuzzy import process
from rapidfuzz import fuzz

load_dotenv()

True

**Gemini Model**

In [22]:
def generate(prompt, image_path) -> list | dict:
    api_key = os.getenv("GEMINI_API_KEY")
    genai.configure(api_key=api_key)

    model = genai.GenerativeModel(
        model_name="gemini-2.5-flash",
        system_instruction="You are a helpful assistant that extracts newspaper fields from images.",
        generation_config={"response_mime_type": "application/json"}
    )

    with open(image_path, "rb") as image_file:
        image_bytes = image_file.read()

    response = model.generate_content([
        {"text": prompt},
        {"mime_type": "image/png", "data": image_bytes}
    ])

    raw_json = response.text
    data = json.loads(raw_json)

    return data

**Call to generate headline from Gemini**

In [23]:
# JSON schema the model is asked to follow: a flat array of headline strings.
headline_schema = {
    "type": "array",
    "items": {"type": "string"}
}

headline_prompt = (
    "You are given a newspaper image. "
    "Extract only the article headlines, all of them — ignore advertisements, captions, subheadlines, and any other text. "
    "Return the result strictly matching this JSON schema:\n\n"
    f"{json.dumps(headline_schema, indent=2)}"
)

target_image_path = "page_56.png"

headlines = generate(headline_prompt, target_image_path)

# generate() can return a dict as well as a list, and later cells iterate over
# `headlines` and use each item as a string key, so reject other shapes here.
if not isinstance(headlines, list) or not all(isinstance(item, str) for item in headlines):
    raise ValueError(f"Expected a JSON array of strings, got: {headlines!r}")

# Dev log
print(json.dumps(headlines, indent=2, ensure_ascii=False))

[
  "PH share prices dip; peso climbs to 56.65 a dollar",
  "Metro Pacific unit acquires coconut processor for P1b",
  "Concepcion Industrial booked P355-m net income in second quarter",
  "Manila Water aims to expand Project i-Float",
  "Ayala Corp., four subsidiaries retain spots on FTSE4Good Index"
]


**EasyOCR reads**

In [24]:
# Run EasyOCR on the page. The loop below unpacks each result as
# (four corner points, recognized text, confidence).
reader = easyocr.Reader(['en'])
image = cv2.imread(target_image_path)
if image is None:
    # cv2.imread signals an unreadable or missing file by returning None.
    raise FileNotFoundError(f"Could not read image: {target_image_path}")
results = reader.readtext(image)

# Draw every detected box, labelled with its coordinates, on a copy so the
# original pixels stay available to later cells.
image_raw = image.copy()
for (top_left, top_right, bottom_right, bottom_left), text, confidence in results:
    tl = (int(top_left[0]), int(top_left[1]))
    br = (int(bottom_right[0]), int(bottom_right[1]))
    cv2.rectangle(image_raw, tl, br, (0, 0, 255), 2)
    coord_label = f"{tl} {br}"
    cv2.putText(image_raw, coord_label, (tl[0], tl[1] - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

# Build the output name from the split path: str.replace(".png", ...) leaves a
# non-PNG path unchanged, which would overwrite the source image.
path_stem, path_ext = os.path.splitext(target_image_path)
output_path_raw = f"{path_stem}_ocr_boxes{path_ext or '.png'}"
cv2.imwrite(output_path_raw, image_raw)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


True

**Keep only the OCR text that matches one of the headlines returned by Gemini**

In [25]:
# An OCR fragment is attributed to a headline when its partial_ratio score is
# strictly above this value.
MATCH_THRESHOLD = 80

# One bucket of matching OCR fragments (texts and boxes) per headline
possible_headlines = {
    headline: {"texts": [], "boxes": []}
    for headline in headlines
}

# Fixed-seed generator so every run draws the same color for the same headline
color_rng = random.Random(0)
headline_colors = {
    headline: (color_rng.randint(0, 255), color_rng.randint(0, 255), color_rng.randint(0, 255))
    for headline in headlines
}

# Work on image copy
image_colored = image.copy()

# NOTE(review): partial_ratio also scores very short fragments (e.g. "to")
# highly when they occur inside a headline, so such fragments are collected
# here as well; confirm that later cells are expected to discard them.
for coordinates, text, _confidence in results:
    for headline in possible_headlines:
        score = fuzz.partial_ratio(headline, text)
        if score > MATCH_THRESHOLD:
            top_left, _, bottom_right, _ = coordinates
            tl = (int(top_left[0]), int(top_left[1]))
            br = (int(bottom_right[0]), int(bottom_right[1]))

            # Save text + box
            possible_headlines[headline]["texts"].append(text)
            possible_headlines[headline]["boxes"].append((tl, br))

            # Draw bounding box with the headline's color
            color = headline_colors[headline]
            cv2.rectangle(image_colored, tl, br, color, 2)

            # Put label (headline)
            cv2.putText(image_colored, headline, (tl[0], tl[1] - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

# Build the output name from the split path: str.replace(".png", ...) leaves a
# non-PNG path unchanged, which would overwrite the source image.
path_stem, path_ext = os.path.splitext(target_image_path)
output_path = f"{path_stem}_filtered_boxes{path_ext or '.png'}"
cv2.imwrite(output_path, image_colored)

# Also dump JSON
print(json.dumps(possible_headlines, indent=2, ensure_ascii=False))


{
  "PH share prices dip; peso climbs to 56.65 a dollar": {
    "texts": [
      "PH share prices =",
      "peso",
      "climbs to 56.65a dollar",
      "are",
      "to",
      "to",
      "to",
      "to",
      "sha",
      "dip;"
    ],
    "boxes": [
      [
        [
          101,
          671
        ],
        [
          771,
          852
        ]
      ],
      [
        [
          927,
          706
        ],
        [
          1144,
          838
        ]
      ],
      [
        [
          98,
          815
        ],
        [
          1103,
          966
        ]
      ],
      [
        [
          2918,
          3134
        ],
        [
          2982,
          3172
        ]
      ],
      [
        [
          2919,
          3309
        ],
        [
          2965,
          3340
        ]
      ],
      [
        [
          3150,
          5027
        ],
        [
          3193,
          5065
        ]
      ],
      [
        [
          3224,

**Merging Bounding boxes that are close to each other**

In [26]:
def is_close(coordinate1, coordinate2, gap_x=30, gap_y=20):
    """Decide whether two axis-aligned boxes are near enough to be merged.

    Args:
        coordinate1: First box as ``((x_min, y_min), (x_max, y_max))``.
        coordinate2: Second box, same layout.
        gap_x: Largest horizontal gap / edge offset still counted as close.
        gap_y: Largest vertical gap / edge offset still counted as close.

    Returns:
        True when the boxes intersect, when they sit side by side within
        ``gap_x`` with a top or bottom edge within ``gap_y``, or when they are
        stacked within ``gap_y`` with a left or right edge within ``gap_x``.
        False otherwise.
    """
    (left_a, top_a), (right_a, bottom_a) = coordinate1
    (left_b, top_b), (right_b, bottom_b) = coordinate2

    def _gap_within(distance, limit):
        # A gap counts only if the boxes do not overlap on that axis
        # (distance >= 0) and the distance is within the limit.
        return 0 <= distance <= limit

    # Boxes with a strictly positive intersection area are always close.
    intersects_x = min(right_a, right_b) > max(left_a, left_b)
    intersects_y = min(bottom_a, bottom_b) > max(top_a, top_b)
    if intersects_x and intersects_y:
        return True

    # Horizontal neighbours: small gap on X plus a shared top or bottom edge.
    if _gap_within(left_b - right_a, gap_x) or _gap_within(left_a - right_b, gap_x):
        if abs(top_a - top_b) <= gap_y or abs(bottom_a - bottom_b) <= gap_y:
            return True

    # Vertical neighbours: small gap on Y plus a shared left or right edge.
    if _gap_within(top_b - bottom_a, gap_y) or _gap_within(top_a - bottom_b, gap_y):
        return abs(left_a - left_b) <= gap_x or abs(right_a - right_b) <= gap_x

    return False

def _union_box(box_a, box_b):
    """Return the smallest ((x_min, y_min), (x_max, y_max)) box covering both boxes."""
    (a_left, a_top), (a_right, a_bottom) = box_a
    (b_left, b_top), (b_right, b_bottom) = box_b
    return (
        (min(a_left, b_left), min(a_top, b_top)),
        (max(a_right, b_right), max(a_bottom, b_bottom)),
    )


# fuzz.ratio score at or above which an entry is treated as already matching
# the whole headline.
FULL_MATCH_SCORE = 90

# Maps each headline to a list of dicts with "text" and "box"
new = {}

# Pass 1: fold each OCR fragment into the first existing entry it is close to.
for headline, obj in possible_headlines.items():
    entries = []

    for text, box in zip(obj["texts"], obj["boxes"]):
        for entry in entries:
            if fuzz.ratio(entry["text"], headline) >= FULL_MATCH_SCORE:
                # An entry reached before any close one already matches the
                # headline: the fragment is dropped (neither merged nor added).
                break
            if is_close(entry["box"], box):
                entry["box"] = _union_box(entry["box"], box)
                entry["text"] += " " + text
                break
        else:
            # No entry absorbed the fragment (also the case for the first
            # fragment, when `entries` is still empty): start a new entry.
            entries.append({"text": text, "box": box})

    new[headline] = entries

# Pass 2: merge entries whose boxes have become close after growing. The scan
# restarts after every merge so chains of adjacent boxes collapse completely;
# it terminates because each merge removes one entry.
for key, entries in new.items():
    print(entries)
    merged_any = True
    while merged_any:
        merged_any = False
        for first in range(len(entries)):
            for second in range(first + 1, len(entries)):
                if is_close(entries[first]["box"], entries[second]["box"]):
                    print(f"Merging {entries[first]} and {entries[second]}")
                    entries[first]["box"] = _union_box(
                        entries[first]["box"], entries[second]["box"]
                    )
                    entries[first]["text"] += " " + entries[second]["text"]
                    del entries[second]
                    merged_any = True
                    break
            if merged_any:
                break

print(json.dumps(new, indent=2, ensure_ascii=False))

# Fixed-seed generator so every run draws the same color for the same headline
color_rng = random.Random(0)
headline_colors = {
    headline: (color_rng.randint(0, 255), color_rng.randint(0, 255), color_rng.randint(0, 255))
    for headline in new
}

# Copy original image
image_colored = image.copy()

# Draw merged boxes, each labelled with the headline it belongs to
for headline, items in new.items():
    color = headline_colors[headline]
    for entry in items:
        tl, br = entry["box"]

        # Draw rectangle
        cv2.rectangle(image_colored, tl, br, color, 2)

        # Label with the headline text
        cv2.putText(image_colored, headline, (tl[0], tl[1] - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

# Build the output name from the split path: str.replace(".png", ...) leaves a
# non-PNG path unchanged, which would overwrite the source image.
path_stem, path_ext = os.path.splitext(target_image_path)
output_path = f"{path_stem}_merged_boxes{path_ext or '.png'}"
cv2.imwrite(output_path, image_colored)

# Print JSON for verification
print(json.dumps(new, indent=2, ensure_ascii=False))


[{'text': 'PH share prices = climbs to 56.65a dollar dip;', 'box': ((98, 671), (1103, 966))}, {'text': 'peso', 'box': ((927, 706), (1144, 838))}, {'text': 'are', 'box': ((2918, 3134), (2982, 3172))}, {'text': 'to', 'box': ((2919, 3309), (2965, 3340))}, {'text': 'to', 'box': ((3150, 5027), (3193, 5065))}, {'text': 'to', 'box': ((3224, 5159), (3267, 5197))}, {'text': 'to', 'box': ((3287, 5333), (3330, 5371))}, {'text': 'sha', 'box': ((515, 5425), (559, 5443))}]
Merging {'text': 'PH share prices = climbs to 56.65a dollar dip;', 'box': ((98, 671), (1103, 966))} and {'text': 'peso', 'box': ((927, 706), (1144, 838))}
[{'text': 'Metro Pacific unit acquires coconut processor for P1b', 'box': ((1287, 1875), (3643, 2423))}]
[{'text': 'on', 'box': ((525, 2434), (576, 2464))}, {'text': 'in', 'box': ((703, 2428), (746, 2465))}, {'text': 'on', 'box': ((1010, 2434), (1056, 2464))}, {'text': 'in', 'box': ((1188, 3050), (1231, 3087))}, {'text': 'in', 'box': ((3234, 3087), (3283, 3124))}, {'text': 'in',

**Pick the best match for each headline and draw its bounding box**

In [27]:
image_merged = image.copy()

# For every headline, pick the merged entry whose text matches it best and
# draw that entry's box.
for key in new:
    query = key
    choices = [entry["text"] for entry in new[key]]

    if not choices:
        continue  # skip if no choices

    # get best match
    best_match = process.extractOne(query, choices)
    if best_match:
        text, score = best_match
        # Use the first entry with this text. The previous text -> index dict
        # kept only the last index for duplicate texts, so it could point at a
        # different entry than the one extractOne selected.
        # NOTE(review): assumes extractOne returns the first of equally scored
        # choices — confirm.
        index = choices.index(text)
        tl, br = new[key][index]["box"]
        print(f"{text=} {tl=} {br=}")
        cv2.rectangle(image_merged, tl, br, (0, 255, 0), 5)

# save output
path_stem, path_ext = os.path.splitext(target_image_path)
output_path_merged = f"{path_stem}_result{path_ext}"
cv2.imwrite(output_path_merged, image_merged)

text='PH share prices = climbs to 56.65a dollar dip; peso' tl=(98, 671) br=(1144, 966)
text='Metro Pacific unit acquires coconut processor for P1b' tl=(1287, 1875) br=(3643, 2423)
text='Concepcion Industrial booked P355-m net income in second quarter' tl=(103, 3493) br=(2762, 3656)
text='Manila Water aims to expand Project i-Float' tl=(2983, 3507) br=(3578, 3886)
text='Ayala Corp;, four subsidiaries retain spots on FTSE4Good Index' tl=(1302, 5762) br=(3423, 5926)


True

**Generate Byline from Gemini**

In [None]:
# JSON schema the model is asked to follow: one {headline, byline} object per article.
byline_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "headline": {"type": "string"},
            "byline": {"type": "string"},
        },
        "required": ["headline", "byline"]
    }
}

# Headlines are embedded with ensure_ascii=False, as in the earlier dev log, so
# non-ASCII characters reach the model as written rather than as \uXXXX escapes.
byline_prompt = (
    "You are given a newspaper image. "
    "Extract only the article's byline or author from the given headlines — ignore advertisements, captions, subheadlines, and any other text. "
    "use the following headlines to find the bylines: "
    f"{json.dumps(headlines, indent=2, ensure_ascii=False)}. "
    "Return the result strictly matching this JSON schema:\n\n"
    f"{json.dumps(byline_schema, indent=2)}"
)

bylines = generate(byline_prompt, target_image_path)

print(json.dumps(bylines, indent=2, ensure_ascii=False))