**Initialization**

In [33]:
import json
import mimetypes
import os

import cv2
import easyocr
import google.generativeai as genai
from dotenv import load_dotenv
from fuzzywuzzy import process
from rapidfuzz import fuzz

# Load variables from a .env file into the environment so that
# os.getenv("GEMINI_API_KEY") can find the API key later on.
load_dotenv()

True

**Gemini Model**

In [34]:
def generate(prompt, image_path) -> list | dict:
    """Send a prompt plus one image to Gemini and return the decoded JSON reply.

    The API key is read from the ``GEMINI_API_KEY`` environment variable and
    the model is configured to answer with ``application/json``.

    Args:
        prompt: Instruction text sent alongside the image.
        image_path: Path of the image file to upload.

    Returns:
        The response text parsed with ``json.loads`` (a list or a dict,
        depending on what the prompt asked for).

    Raises:
        OSError: If the image file cannot be opened.
        json.JSONDecodeError: If the response text is not valid JSON.
    """
    api_key = os.getenv("GEMINI_API_KEY")
    genai.configure(api_key=api_key)

    model = genai.GenerativeModel(
        model_name="gemini-2.5-flash",
        system_instruction="You are a helpful assistant that extracts newspaper fields from images.",
        generation_config={"response_mime_type": "application/json"}
    )

    with open(image_path, "rb") as image_file:
        image_bytes = image_file.read()

    # Label the upload with the file's real image type instead of always
    # claiming PNG; keep PNG as the fallback for unknown or non-image extensions.
    guessed_type, _ = mimetypes.guess_type(str(image_path))
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/png"

    response = model.generate_content([
        {"text": prompt},
        {"mime_type": mime_type, "data": image_bytes}
    ])

    return json.loads(response.text)

**Call Gemini to generate the headlines**

In [35]:
# JSON schema the reply must follow: a flat array of headline strings.
headline_schema = {"type": "array", "items": {"type": "string"}}

# The prompt is the instruction sentences followed by the pretty-printed schema.
headline_prompt = "".join([
    "You are given a newspaper image. ",
    "Extract only the article all possible headlines — ignore advertisements, captions, subheadlines, and any other text. ",
    "Return the result strictly matching this JSON schema:\n\n",
    json.dumps(headline_schema, indent=2),
])

# Page image used by this and the following cells.
target_image_path = "page_17.png"

generated_headlines = generate(headline_prompt, target_image_path)

# Dev log
print(json.dumps(generated_headlines, indent=2, ensure_ascii=False))

[
  "OCTA SURVEY SHOWS UPTICK IN PBBM, SPEAKER RATINGS",
  "DILG: Full-blast probe on public funds misuse",
  "Poor flood-control partly due to budget insertions—DPWH",
  "Human skull, teeth from Taal Lake raise hope for DNA—DOJ",
  "Pres'l adviser, 2 others charged with indirect contempt before SC",
  "House elects new committee heads"
]


**Read text and bounding boxes with EasyOCR**

In [36]:
# Run OCR on the target page; each result is (corner points, text, confidence).
reader = easyocr.Reader(['en'])
image = cv2.imread(target_image_path)
if image is None:
    # cv2.imread reports an unreadable or missing file by returning None
    # rather than raising, so fail here with a clear message.
    raise FileNotFoundError(f"Could not read image: {target_image_path}")
results = reader.readtext(image)

# Annotate a copy so `image` stays clean for the later cells.
image_raw = image.copy()
for (top_left, top_right, bottom_right, bottom_left), text, confidence in results:
    tl = (int(top_left[0]), int(top_left[1]))
    br = (int(bottom_right[0]), int(bottom_right[1]))
    cv2.rectangle(image_raw, tl, br, (0, 0, 255), 2)
    cv2.putText(image_raw, text, (tl[0], tl[1] - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

# Build the output name from the stem/extension so a non-".png" input can
# never resolve to the input path itself (which would overwrite the source).
_stem, _ext = os.path.splitext(target_image_path)
output_path_raw = f"{_stem}_ocr_boxes{_ext or '.png'}"
cv2.imwrite(output_path_raw, image_raw)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


True

**Keep only the OCR text that fuzzy-matches one of the headlines from Gemini**

In [37]:
def filter_texts_by_score(texts: list, threshold: int = 80, ocr_results=None) -> dict:
    """Collect, for each query text, the OCR fragments that fuzzy-match it.

    Args:
        texts: Query strings (e.g. headlines returned by Gemini).
        threshold: A fragment is kept when its ``fuzz.partial_ratio`` score
            against the query is strictly greater than this value.
        ocr_results: Iterable of ``(corner_points, text, confidence)`` tuples.
            ``corner_points`` holds four ``(x, y)`` points, of which the first
            and third are used as top-left and bottom-right. Defaults to the
            notebook-level ``results`` variable.

    Returns:
        A dict mapping every query to ``{"texts": [...], "boxes": [...]}``,
        where each box is ``((x_min, y_min), (x_max, y_max))`` with integer
        coordinates. Queries without any match map to empty lists.
    """
    if ocr_results is None:
        # Backward-compatible default: use the OCR output of the earlier cell.
        ocr_results = results

    possible_texts = {query: {"texts": [], "boxes": []} for query in texts}

    for coordinates, text, _ in ocr_results:
        # The box depends only on the OCR entry, so compute it once per entry.
        top_left, _, bottom_right, _ = coordinates
        current_box = ((int(top_left[0]), int(top_left[1])),
                       (int(bottom_right[0]), int(bottom_right[1])))

        for headline, matches in possible_texts.items():
            if fuzz.partial_ratio(headline, text) > threshold:
                matches["texts"].append(text)
                matches["boxes"].append(current_box)

    return possible_texts
        
# Match the Gemini headlines against the OCR output and print the matches for inspection.
possible_headlines = filter_texts_by_score(generated_headlines)
print(json.dumps(possible_headlines, indent=2, ensure_ascii=False))

{
  "OCTA SURVEY SHOWS UPTICK IN PBBM, SPEAKER RATINGS": {
    "texts": [
      "OCTA SURVEY SHOWS UPTICK IN PBBM, SPEAKER RATINGS",
      "OCTA"
    ],
    "boxes": [
      [
        [
          174,
          502
        ],
        [
          3574,
          685
        ]
      ],
      [
        [
          466,
          1146
        ],
        [
          577,
          1189
        ]
      ]
    ]
  },
  "DILG: Full-blast probe on public funds misuse": {
    "texts": [
      "public",
      "DILG: Full-blast probe",
      "on",
      "public funds misuse",
      "Public",
      "a"
    ],
    "boxes": [
      [
        [
          1354,
          1326
        ],
        [
          1476,
          1382
        ]
      ],
      [
        [
          96,
          1455
        ],
        [
          2800,
          1894
        ]
      ],
      [
        [
          95,
          1885
        ],
        [
          468,
          2186
        ]
      ],
      [
        [
         

**Merge bounding boxes that are close to each other**

In [38]:
def merging_bounding_boxes(texts: dict) -> dict:
    """Merge neighbouring OCR boxes (and their texts) for every query.

    Args:
        texts: Mapping of query -> ``{"texts": [...], "boxes": [...]}`` with
            parallel lists; each box is ``((x_min, y_min), (x_max, y_max))``.

    Returns:
        Mapping of query -> list of ``{"text": str, "box": box}`` groups.
        Texts of merged boxes are joined with a single space in merge order,
        and the group's box is the union of the merged boxes.

    Side effects:
        Prints a line for every group-to-group merge and finally prints the
        whole result as JSON.
    """

    def is_close(coordinate1, coordinate2, gap_x=30, gap_y=20):
        """Return True if two boxes intersect or are adjacent and edge-aligned."""
        (x1_min, y1_min), (x1_max, y1_max) = coordinate1
        (x2_min, y2_min), (x2_max, y2_max) = coordinate2

        # Overlaps
        overlap_x = min(x1_max, x2_max) - max(x1_min, x2_min)
        overlap_y = min(y1_max, y2_max) - max(y1_min, y2_min)

        # Edge alignment (corner-based)
        align_x = (abs(x1_min - x2_min) <= gap_x) or (abs(x1_max - x2_max) <= gap_x)
        align_y = (abs(y1_min - y2_min) <= gap_y) or (abs(y1_max - y2_max) <= gap_y)

        # If boxes intersect at all, they're close
        if overlap_x > 0 and overlap_y > 0:
            return True

        # Side-by-side: small gap on X AND top/bottom edges align (corner closeness)
        side_by_side = (
            (0 <= x2_min - x1_max <= gap_x) or
            (0 <= x1_min - x2_max <= gap_x)
        ) and align_y

        # Stacked: small gap on Y AND left/right edges align (corner closeness)
        stacked = (
            (0 <= y2_min - y1_max <= gap_y) or
            (0 <= y1_min - y2_max <= gap_y)
        ) and align_x

        return side_by_side or stacked

    def union(box_a, box_b):
        """Return the smallest box that contains both input boxes."""
        (a_tl_x, a_tl_y), (a_br_x, a_br_y) = box_a
        (b_tl_x, b_tl_y), (b_br_x, b_br_y) = box_b
        return ((min(a_tl_x, b_tl_x), min(a_tl_y, b_tl_y)),
                (max(a_br_x, b_br_x), max(a_br_y, b_br_y)))

    # value is list of dicts with "text" and "box"
    new = {}

    # Pass 1: fold each fragment into the first existing group it is close to,
    # or start a new group when none is close.
    for headline, obj in texts.items():
        groups = []
        for text, box in zip(obj["texts"], obj["boxes"]):
            for group in groups:
                if is_close(group["box"], box):
                    group["box"] = union(group["box"], box)
                    group["text"] += " " + text
                    break
            else:
                groups.append({"text": text, "box": box})
        new[headline] = groups

    # Pass 2: groups that grew during pass 1 may now be close to each other.
    # Repeat until no pair is close; a single sweep would leave a group
    # unmerged when the box it should join only became close after an
    # earlier merge in the same sweep.
    for key, groups in new.items():
        merged_any = True
        while merged_any:
            merged_any = False
            for i in range(len(groups)):
                for j in range(i + 1, len(groups)):
                    if is_close(groups[i]["box"], groups[j]["box"]):
                        print(f"Merging {groups[i]} and {groups[j]}")
                        groups[i]["box"] = union(groups[i]["box"], groups[j]["box"])
                        groups[i]["text"] += " " + groups[j]["text"]
                        del groups[j]
                        merged_any = True
                        break
                if merged_any:
                    # The list changed; rescan from the start.
                    break

    print(json.dumps(new, indent=2, ensure_ascii=False))
    return new

# Merge the matched OCR fragments of each headline into larger boxes (also prints the result).
merged_texts_headlines = merging_bounding_boxes(possible_headlines)

{
  "OCTA SURVEY SHOWS UPTICK IN PBBM, SPEAKER RATINGS": [
    {
      "text": "OCTA SURVEY SHOWS UPTICK IN PBBM, SPEAKER RATINGS",
      "box": [
        [
          174,
          502
        ],
        [
          3574,
          685
        ]
      ]
    },
    {
      "text": "OCTA",
      "box": [
        [
          466,
          1146
        ],
        [
          577,
          1189
        ]
      ]
    }
  ],
  "DILG: Full-blast probe on public funds misuse": [
    {
      "text": "public",
      "box": [
        [
          1354,
          1326
        ],
        [
          1476,
          1382
        ]
      ]
    },
    {
      "text": "DILG: Full-blast probe on public funds misuse",
      "box": [
        [
          95,
          1455
        ],
        [
          2963,
          2252
        ]
      ]
    },
    {
      "text": "Public",
      "box": [
        [
          3393,
          2127
        ],
        [
          3509,
          2170
        ]
      ]
   

**Pick the best match for each headline and draw its bounding box**

In [39]:
# Draw on a copy so the original `image` stays unannotated; draw_boxes writes onto this copy.
result_image = image.copy()
drawn_boxes = []  # store drawn rectangles as (tl, br)

def boxes_overlap(box1, box2):
    """Return True when two ``((x_min, y_min), (x_max, y_max))`` boxes intersect.

    Boxes that merely touch along an edge or at a corner count as overlapping.
    """
    (left_a, top_a), (right_a, bottom_a) = box1
    (left_b, top_b), (right_b, bottom_b) = box2

    # The boxes intersect only if their projections overlap on both axes.
    overlaps_horizontally = right_a >= left_b and right_b >= left_a
    overlaps_vertically = bottom_a >= top_b and bottom_b >= top_a
    return overlaps_horizontally and overlaps_vertically

def draw_boxes(texts: list, color=(0, 255, 0), thickness=5, bylines=False):
    """Outline the best OCR match for every query and save the annotated page.

    Despite the ``list`` annotation, ``texts`` is used as a mapping: it is
    iterated by key and every value is a list of ``{"text", "box"}`` dicts.
    For each key the candidate whose text best matches the key (according to
    ``process.extractOne``) gets a rectangle drawn on the notebook-level
    ``result_image``.

    Args:
        texts: Mapping of query -> candidate groups.
        color: Colour tuple passed to ``cv2.rectangle``.
        thickness: Line thickness passed to ``cv2.rectangle``.
        bylines: When true, a box overlapping any entry of the notebook-level
            ``drawn_boxes`` is skipped, and every drawn box is appended to it.

    Side effects:
        Writes ``result_image`` to ``<stem>_result<ext>``, derived from
        ``target_image_path``, on every call.
    """
    for query, candidates in texts.items():
        if not candidates:
            continue  # nothing was matched for this query

        # Text -> position; for duplicate texts the last position wins.
        position_of = {}
        for position, candidate in enumerate(candidates):
            position_of[candidate["text"]] = position

        best_match = process.extractOne(query, list(position_of))
        if not best_match:
            continue

        matched_text, _score = best_match
        top_left, bottom_right = candidates[position_of[matched_text]]["box"]
        candidate_box = (top_left, bottom_right)

        # In byline mode, never draw on top of a previously drawn box.
        if bylines and any(boxes_overlap(candidate_box, drawn) for drawn in drawn_boxes):
            continue

        cv2.rectangle(result_image, top_left, bottom_right, color, thickness)
        if bylines:
            drawn_boxes.append(candidate_box)

    # Save next to the input, inserting "_result" before the extension.
    stem, extension = os.path.splitext(target_image_path)
    output_path_merged = f"{stem}_result{extension}"
    cv2.imwrite(output_path_merged, result_image)

# Outline the best match per headline; drawn_boxes stays empty here because
# draw_boxes only records boxes when bylines=True.
draw_boxes(merged_texts_headlines)
print(drawn_boxes)


[]


**Generate subheadlines, bodies, and bylines from Gemini**

In [40]:
# JSON schema the reply must follow: one object per article.
article_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "headline": {"type": "string"},
            "subheadline": {"type": "string"},
            "body" : {"type": "string"},
            "byline": {"type": "string"}
        },
        "required": ["subheadline", "body", "byline"]
    }
}

# Adjacent string literals are concatenated verbatim, so every instruction
# ends with ". " to stop the sentences from running into each other.
article_prompt = (
    "You are given a newspaper image. "
    "From the given headlines, extract their subheadline if there is any. "
    "From the given headlines, extract their body. "
    "Also extract the article's byline or author from the given headlines — bylines might appear after your generated body and will never appear on top of the headline. "
    "Use the following headlines to find the subheadline, body and bylines: "
    f"{json.dumps(generated_headlines, indent=2)}. "
    "Return the result strictly matching this JSON schema:\n\n"
    f"{json.dumps(article_schema, indent=2)}"
)

generated_article = generate(article_prompt, target_image_path)

print(json.dumps(generated_article, indent=2, ensure_ascii=False))

[
  {
    "headline": "OCTA SURVEY SHOWS UPTICK IN PBBM, SPEAKER RATINGS",
    "subheadline": "",
    "body": "PRESIDENT Ferdinand Marcos Jr. and House Speaker Martin Romual-dez saw marked improvements in their trust and performance ratings, accord-ing to the latest Tugon ng Masa (TNM) survey released by OCTA Research yesterday.\nAt the same time, Vice President Sara Duterte-Carpio and Senate Presi-dent Francis Escudero logged notable declines.\nMr. Marcos emerged as the top-performing official in the second-quarter survey, with a 4-point increase in trust rating (64%) and a 3-point rise in per-formance rating (62%).\nWhile the latter remains within the margin of error, OCTA noted that the consistent upward trend \"genuine and positive shift\" in public perception.\nThe improvements marked a rever-sal of the downward trend observed since late 2024, highlighting what OCTA describes as a renewed public",
    "byline": "By Rex Espiritu and Maricel V. Cruz"
  },
  {
    "headline": "DILG: 

In [41]:
# Gemini may leave a field blank (e.g. "subheadline": "") or omit it entirely.
# Skip those entries: an empty query has nothing to locate on the page, and a
# missing key would otherwise raise KeyError.
bylines = [article["byline"] for article in generated_article if article.get("byline")]
possible_bylines = filter_texts_by_score(bylines)
merged_bylines = merging_bounding_boxes(possible_bylines)
# bylines=True: skip boxes that overlap an already drawn byline box.
draw_boxes(merged_bylines, color=(0, 0, 255), bylines=True)

subheadlines = [article["subheadline"] for article in generated_article if article.get("subheadline")]
possible_subheadlines = filter_texts_by_score(subheadlines)
merged_subheadlines = merging_bounding_boxes(possible_subheadlines)
draw_boxes(merged_subheadlines, color=(255, 0, 0))

{
  "By Rex Espiritu and Maricel V. Cruz": [
    {
      "text": "By Rex Espiritu and Maricel V. Cruz",
      "box": [
        [
          111,
          733
        ],
        [
          574,
          864
        ]
      ]
    },
    {
      "text": "and",
      "box": [
        [
          113,
          967
        ],
        [
          182,
          1004
        ]
      ]
    },
    {
      "text": "and",
      "box": [
        [
          3002,
          2174
        ],
        [
          3072,
          2212
        ]
      ]
    },
    {
      "text": "By Rex Espiritu,",
      "box": [
        [
          111,
          2494
        ],
        [
          495,
          2562
        ]
      ]
    },
    {
      "text": "an",
      "box": [
        [
          112,
          3270
        ],
        [
          178,
          3320
        ]
      ]
    },
    {
      "text": "a",
      "box": [
        [
          119,
          3409
        ],
        [
          149,
      