**Initialization**

In [75]:
import google.generativeai as genai
import json
import os
from dotenv import load_dotenv
import easyocr
import cv2
from rapidfuzz import fuzz

load_dotenv()

True

**Gemini Model**

In [76]:
def generate(prompt, image_path) -> list | dict:
    api_key = os.getenv("GEMINI_API_KEY")
    genai.configure(api_key=api_key)

    model = genai.GenerativeModel(
        model_name="gemini-2.5-flash",
        system_instruction="You are a helpful assistant that extracts newspaper fields from images.",
        generation_config={"response_mime_type": "application/json"}
    )

    with open(image_path, "rb") as image_file:
        image_bytes = image_file.read()

    response = model.generate_content([
        {"text": prompt},
        {"mime_type": "image/png", "data": image_bytes}
    ])

    raw_json = response.text
    data = json.loads(raw_json)

    return data

**Call to generate headline from Gemini**

In [77]:
headline_schema = {
	"type": "array",
	"items": {"type": "string"}
}

headline_prompt = (
	"You are given a newspaper image. "
	"Extract only the article all possible headlines — ignore advertisements, captions, subheadlines, and any other text. "
	"Return the result strictly matching this JSON schema:\n\n"
	f"{json.dumps(headline_schema, indent=2)}"
)

target_image_path = "page_17.png"

headlines = generate(headline_prompt, target_image_path)

for i, headline in enumerate(headlines):
    headlines[i] = headline
print(headlines)
# Dev log
print(json.dumps(headlines, indent=2, ensure_ascii=False))

['OCTA SURVEY SHOWS UPTICK IN PBBM, SPEAKER RATINGS', 'DILG: Full-blast probe on public funds misuse', 'Poor flood-control partly due to budget insertions—DPWH', 'Human skull, teeth from Taal Lake raise hope for DNA—DOJ', "Pres'l adviser, 2 others charged with indirect contempt before SC", 'House elects new committee heads']
[
  "OCTA SURVEY SHOWS UPTICK IN PBBM, SPEAKER RATINGS",
  "DILG: Full-blast probe on public funds misuse",
  "Poor flood-control partly due to budget insertions—DPWH",
  "Human skull, teeth from Taal Lake raise hope for DNA—DOJ",
  "Pres'l adviser, 2 others charged with indirect contempt before SC",
  "House elects new committee heads"
]


**EasyOCR reads**

In [78]:
reader = easyocr.Reader(['en'])
image = cv2.imread(target_image_path)
results = reader.readtext(image)

image_raw = image.copy()
for (top_left, top_right, bottom_right, bottom_left), text, confidence in results:
    tl = (int(top_left[0]), int(top_left[1]))
    br = (int(bottom_right[0]), int(bottom_right[1]))
    cv2.rectangle(image_raw, tl, br, (0, 0, 255), 2)
    coord_label = f"{tl} {br}"
    cv2.putText(image_raw, coord_label, (tl[0], tl[1] - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

output_path_raw = target_image_path.replace(".png", "_ocr_boxes.png")
cv2.imwrite(output_path_raw, image_raw)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


True

**Filter out text that matches with the list of headlines from Gemini**

In [89]:
bounding_box_text = []
bounding_box_coordinates = []

for coordinates, text, _ in results:
    
    for headline in headlines:
        score = fuzz.partial_ratio(headline, text)
        if score > 80:
            print(text)
            top_left, _, bottom_right, _ = coordinates
            
            current_box = ((int(top_left[0]), int(top_left[1])),
                           (int(bottom_right[0]), int(bottom_right[1])))
            bounding_box_text.append(text)
            bounding_box_coordinates.append(current_box)

OCTA SURVEY SHOWS UPTICK IN PBBM, SPEAKER RATINGS
the
within
OCTA
public
Poor flood-control
DILG: Full-blast probe
partly due to budget
insertions
DPWH
on
on
on
public funds misuse
Public
the
an
a
a
a
a
a
the
Human skull, teeth from Taal
Lake raise hope for DNA-_DOJ
Taal Lake
DNA
from
the
within
Pres'I adviser; 2 others charged
House elects new
with indirect contempt before SC
committeeheads
committees
in
in
contempt


In [80]:
def is_close(coordinate1, coordinate2, gap_x=5, gap_y=20):
    (x1_min, y1_min), (x1_max, y1_max) = coordinate1
    (x2_min, y2_min), (x2_max, y2_max) = coordinate2

    # Expand both boxes by the gap
    x1_min, y1_min, x1_max, y1_max = x1_min - gap_x, y1_min - gap_y, x1_max + gap_x, y1_max + gap_y
    x2_min, y2_min, x2_max, y2_max = x2_min - gap_x, y2_min - gap_y, x2_max + gap_x, y2_max + gap_y

    # Overlap condition (after expansion)
    horizontal_overlap = not (x1_max < x2_min or x2_max < x1_min)
    vertical_overlap = not (y1_max < y2_min or y2_max < y1_min)

    return horizontal_overlap and vertical_overlap

new = {}
print("Headlines:", headlines)
for headline in headlines:
    new[headline] = []

for i, text in enumerate(bounding_box_text):
    box = bounding_box_coordinates[i]
    for headline in new:
        score = fuzz.partial_ratio(headline, text)
        if score > 80:
            headline.replace(text, "")
            print(f"{headline=}")
            if not new[headline]:
                new[headline].append({"text": text, "box": box})
                print("new")
                print(f"{text=}, {box=}")
            else:
                for i, currentBox in enumerate(new[headline]):
                    score = fuzz.ratio(new[headline][i]["text"], headline)
                    print(f"{text=}, {box=}, {currentBox=}, {score=}")
                    if score >= 90:
                        break
                    if is_close(currentBox["box"], box):
                        
                        (ex_tl_x, ex_tl_y), (ex_br_x, ex_br_y) = currentBox["box"]
                        (tl_x, tl_y), (br_x, br_y) = box

                        new_tl = (min(ex_tl_x, tl_x), min(ex_tl_y, tl_y))
                        new_br = (max(ex_br_x, br_x), max(ex_br_y, br_y))

                        new[headline][i]["box"] = (new_tl, new_br)
                        new[headline][i]["text"] += " " + text
                        break
                else:
                    new[headline].append({"text": text, "box": box})

print(json.dumps(new, indent=2, ensure_ascii=False))

Headlines: ['OCTA SURVEY SHOWS UPTICK IN PBBM, SPEAKER RATINGS', 'DILG: Full-blast probe on public funds misuse', 'Poor flood-control partly due to budget insertions—DPWH', 'Human skull, teeth from Taal Lake raise hope for DNA—DOJ', "Pres'l adviser, 2 others charged with indirect contempt before SC", 'House elects new committee heads']
headline='OCTA SURVEY SHOWS UPTICK IN PBBM, SPEAKER RATINGS'
new
text='OCTA SURVEY SHOWS UPTICK IN PBBM, SPEAKER RATINGS', box=((174, 502), (3574, 685))
headline="Pres'l adviser, 2 others charged with indirect contempt before SC"
new
text='the', box=((1014, 967), (1073, 1010))
headline="Pres'l adviser, 2 others charged with indirect contempt before SC"
text='within', box=((1362, 967), (1474, 1004)), currentBox={'text': 'the', 'box': ((1014, 967), (1073, 1010))}, score=8.823529411764708
headline='OCTA SURVEY SHOWS UPTICK IN PBBM, SPEAKER RATINGS'
text='OCTA', box=((466, 1146), (577, 1189)), currentBox={'text': 'OCTA SURVEY SHOWS UPTICK IN PBBM, SPEAKER RA

In [81]:
for key in new:
    print(new[key])
    for i in range(len(new[key])):
        for j in range(i + 1, len(new[key])):
            if is_close(new[key][i]["box"], new[key][j]["box"]):
                print(f"Merging {new[key][i]} and {new[key][j]}")
                (ex_tl_x, ex_tl_y), (ex_br_x, ex_br_y) = new[key][i]["box"]
                (tl_x, tl_y), (br_x, br_y) = new[key][j]["box"]

                new_tl = (min(ex_tl_x, tl_x), min(ex_tl_y, tl_y))
                new_br = (max(ex_br_x, br_x), max(ex_br_y, br_y))

                new[key][i]["box"] = (new_tl, new_br)
                new[key][i]["text"] += " " + new[key][j]["text"]
                del new[key][j]
                break
print(new)

[{'text': 'OCTA SURVEY SHOWS UPTICK IN PBBM, SPEAKER RATINGS', 'box': ((174, 502), (3574, 685))}]
[{'text': 'public', 'box': ((1354, 1326), (1476, 1382))}, {'text': 'DILG: Full-blast probe on on on public funds misuse', 'box': ((95, 1455), (2963, 2252))}]
[{'text': 'Poor flood-control partly due to budget insertions', 'box': ((2986, 1504), (3644, 1848))}]
[{'text': 'an', 'box': ((112, 3270), (178, 3320))}, {'text': 'a a a a a', 'box': ((119, 3409), (149, 3450))}, {'text': 'Human skull, teeth from Taal Lake raise hope for DNA-_DOJ', 'box': ((2275, 4360), (3630, 4664))}]
[{'text': 'the', 'box': ((1014, 967), (1073, 1010))}, {'text': 'within', 'box': ((1362, 967), (1474, 1004))}, {'text': 'on on on', 'box': ((95, 1885), (468, 2186))}, {'text': 'the', 'box': ((3240, 2359), (3298, 2397))}, {'text': 'a a a a a', 'box': ((119, 3409), (149, 3450))}, {'text': 'the', 'box': ((413, 3662), (477, 3699))}, {'text': 'the', 'box': ((2681, 4932), (2745, 4970))}, {'text': 'within', 'box': ((3008, 5054),

In [82]:
from fuzzywuzzy import process

image_merged = image.copy()

for key in new:
    query = key
    choices = [i["text"] for i in new[key]]


    if not choices:
        continue  # skip if no choices

    # map choice → index
    choices_dict = {c: i for i, c in enumerate(choices)}

    # get best match
    best_match = process.extractOne(query, list(choices_dict.keys()))
    if best_match:
        text, score = best_match
        index = choices_dict[text]
        tl, br = new[key][index]["box"]
        cv2.rectangle(image_merged, tl, br, (0, 255, 0), 5)

# save output
output_path_merged = f"{os.path.splitext(target_image_path)[0]}_result{os.path.splitext(target_image_path)[1]}"
cv2.imwrite(output_path_merged, image_merged)



True

**Generate Byline from Gemini**

In [90]:
byline_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "headline": {"type": "string"},
            "byline": {"type": "string"},
        },
        "required": ["headline", "byline"]
    }
}

byline_prompt = (
	"You are given a newspaper image. "
	"Extract only the article's byline or author from the given headlines — ignore advertisements, captions, subheadlines, and any other text. "
    "use the following headlines to find the bylines: "
    f"{json.dumps(headlines, indent=2)}. "
	"Return the result strictly matching this JSON schema:\n\n"
	f"{json.dumps(byline_schema, indent=2)}"
)

bylines = generate(byline_prompt, target_image_path)

print(json.dumps(bylines, indent=2, ensure_ascii=False))

[
  {
    "headline": "OCTA SURVEY SHOWS UPTICK IN PBBM, SPEAKER RATINGS",
    "byline": "By Rex Espiritu and Maricel V. Cruz"
  },
  {
    "headline": "DILG: Full-blast probe on public funds misuse",
    "byline": "By Rex Espiritu, Pot Chavez and Vito Barcelo"
  },
  {
    "headline": "Poor flood-control partly due to budget insertions—DPWH",
    "byline": "By Vito Barcelo, Ram Superable, and Maricel V. Cruz"
  },
  {
    "headline": "Human skull, teeth from Taal Lake raise hope for DNA—DOJ",
    "byline": "By Pot Chavez"
  },
  {
    "headline": "Pres'l adviser, 2 others charged with indirect contempt before SC",
    "byline": "By Pot Chavez, Vito Barcelo, and Ram Superable"
  },
  {
    "headline": "House elects new committee heads",
    "byline": "By Maricel V. Cruz"
  }
]
