**Initialization**

In [1]:
import google.generativeai as genai
import json
import os
from dotenv import load_dotenv
import easyocr
import cv2
from rapidfuzz import fuzz
from fuzzywuzzy import process

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

**Gemini Model**

In [2]:
def generate(prompt, image_path) -> list | dict:
    api_key = os.getenv("GEMINI_API_KEY")
    genai.configure(api_key=api_key)

    model = genai.GenerativeModel(
        model_name="gemini-2.5-flash",
        system_instruction="You are a helpful assistant that extracts newspaper fields from images.",
        generation_config={"response_mime_type": "application/json"}
    )

    with open(image_path, "rb") as image_file:
        image_bytes = image_file.read()

    response = model.generate_content([
        {"text": prompt},
        {"mime_type": "image/png", "data": image_bytes}
    ])

    raw_json = response.text
    data = json.loads(raw_json)

    return data

**Call to generate headline from Gemini**

In [None]:
# JSON schema embedded in the prompt: the model is asked for a flat array of strings.
headline_schema = {
    "type": "array",
    "items": {"type": "string"}
}

# NOTE(review): the second sentence of the prompt reads oddly ("the article all
# possible headlines"); it is left untouched so the model's behaviour does not change.
headline_prompt = (
    "You are given a newspaper image. "
    "Extract only the article all possible headlines — ignore advertisements, captions, subheadlines, and any other text. "
    "Return the result strictly matching this JSON schema:\n\n"
    f"{json.dumps(headline_schema, indent=2)}"
)

# Image processed by this cell and by the later cells of the notebook.
target_image_path = "page_17.png"

# The previous `headlines[i] = headline` loop was removed: it was a no-op for a
# list and raised RuntimeError (dict resized during iteration) for a JSON object.
headlines = generate(headline_prompt, target_image_path)

# Dev log
print(json.dumps(headlines, indent=2, ensure_ascii=False))

[
  "House to open bicam to public",
  "Solons to prioritize social protection measures",
  "DPWH readies list of flood control projects; DepDev draws up protocols",
  "DOH explains zero-billing coverage, where it can be availed",
  "Senate vote to continue or junk VP impeach trial set for Aug. 6",
  "DOJ chief bares new witness on ‘sabungeros’ case",
  "Pag-IBIG special rate for expanded 4PH program"
]


**EasyOCR reads**

In [4]:
# Run EasyOCR (English) over the page and save a debug copy with every detection boxed.
reader = easyocr.Reader(['en'])
image = cv2.imread(target_image_path)
if image is None:
    # cv2.imread reports a missing/unreadable file by returning None, not by raising.
    raise FileNotFoundError(f"Could not read image: {target_image_path}")
results = reader.readtext(image)

# Draw on a copy so `image` stays clean for the later cells.
image_raw = image.copy()
for (top_left, _top_right, bottom_right, _bottom_left), _text, _confidence in results:
    tl = (int(top_left[0]), int(top_left[1]))
    br = (int(bottom_right[0]), int(bottom_right[1]))
    cv2.rectangle(image_raw, tl, br, (0, 0, 255), 2)
    # Label each box with its corner coordinates, just above the top-left corner.
    coord_label = f"{tl} {br}"
    cv2.putText(image_raw, coord_label, (tl[0], tl[1] - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

# Build the output name from the split extension: a plain `.replace(".png", ...)`
# leaves non-PNG paths unchanged and would overwrite the source image.
raw_root, raw_ext = os.path.splitext(target_image_path)
output_path_raw = f"{raw_root}_ocr_boxes{raw_ext}"
cv2.imwrite(output_path_raw, image_raw)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


True

**Keep the OCR text fragments that match the list of headlines from Gemini**

In [5]:
# One bucket per Gemini headline; each bucket collects the OCR fragments (and their
# boxes) whose partial-ratio score against that headline is above the cut-off.
possible_headlines = {
    headline: {"texts": [], "boxes": []}
    for headline in headlines
}

# A fragment is kept only when its score is strictly greater than this value.
# NOTE(review): very short fragments such as "a" or "to" also pass this test
# (see the cell output) — confirm whether that is intended.
PARTIAL_MATCH_CUTOFF = 80

for quad, ocr_text, _ in results:
    for headline, bucket in possible_headlines.items():
        if fuzz.partial_ratio(headline, ocr_text) <= PARTIAL_MATCH_CUTOFF:
            continue

        # Reduce the four-corner quad to an integer (top-left, bottom-right) pair.
        upper_left, _, lower_right, _ = quad
        fragment_box = (
            (int(upper_left[0]), int(upper_left[1])),
            (int(lower_right[0]), int(lower_right[1])),
        )
        bucket["texts"].append(ocr_text)
        bucket["boxes"].append(fragment_box)

print(json.dumps(possible_headlines, indent=2, ensure_ascii=False))

{
  "House to open bicam to public": {
    "texts": [
      "House to open bicam to public",
      "a",
      "lic",
      "to",
      "l",
      "to"
    ],
    "boxes": [
      [
        [
          114,
          1646
        ],
        [
          3690,
          2085
        ]
      ],
      [
        [
          1189,
          2534
        ],
        [
          1220,
          2570
        ]
      ],
      [
        [
          1067,
          3187
        ],
        [
          1120,
          3225
        ]
      ],
      [
        [
          1131,
          4770
        ],
        [
          1172,
          4800
        ]
      ],
      [
        [
          775,
          5837
        ],
        [
          854,
          5905
        ]
      ],
      [
        [
          2512,
          6472
        ],
        [
          2555,
          6510
        ]
      ]
    ]
  },
  "Solons to prioritize social protection measures": {
    "texts": [
      "a",
      "Solons to pr

**Merging Bounding boxes that are close to each other**

In [6]:
def is_close(coordinate1, coordinate2, gap_x=30, gap_y=20):
    """Decide whether two axis-aligned boxes are near enough to be merged.

    Each box is ``((x_min, y_min), (x_max, y_max))``. The boxes count as close when:

    * they intersect with a positive area, or
    * they sit side by side (horizontal gap between 0 and ``gap_x``) and their
      top edges or bottom edges differ by at most ``gap_y``, or
    * they are stacked (vertical gap between 0 and ``gap_y``) and their left
      edges or right edges differ by at most ``gap_x``.

    Returns:
        ``True`` if any of the conditions above holds, otherwise ``False``.
    """
    (left_a, top_a), (right_a, bottom_a) = coordinate1
    (left_b, top_b), (right_b, bottom_b) = coordinate2

    # Positive extent on both axes means the rectangles intersect.
    shared_width = min(right_a, right_b) - max(left_a, left_b)
    shared_height = min(bottom_a, bottom_b) - max(top_a, top_b)
    if shared_width > 0 and shared_height > 0:
        return True

    # Neighbours on the same row: small horizontal gap plus a matching top or bottom edge.
    horizontal_gap_ok = (
        0 <= left_b - right_a <= gap_x
        or 0 <= left_a - right_b <= gap_x
    )
    rows_aligned = (
        abs(top_a - top_b) <= gap_y
        or abs(bottom_a - bottom_b) <= gap_y
    )
    if horizontal_gap_ok and rows_aligned:
        return True

    # Neighbours in the same column: small vertical gap plus a matching left or right edge.
    vertical_gap_ok = (
        0 <= top_b - bottom_a <= gap_y
        or 0 <= top_a - bottom_b <= gap_y
    )
    columns_aligned = (
        abs(left_a - left_b) <= gap_x
        or abs(right_a - right_b) <= gap_x
    )
    return vertical_gap_ok and columns_aligned

def _union_box(box_a, box_b):
    """Return the smallest (top-left, bottom-right) box that covers both inputs."""
    (a_left, a_top), (a_right, a_bottom) = box_a
    (b_left, b_top), (b_right, b_bottom) = box_b
    return (
        (min(a_left, b_left), min(a_top, b_top)),
        (max(a_right, b_right), max(a_bottom, b_bottom)),
    )


# Maps each headline to a list of groups; every group is a dict with "text" and "box".
new = {}

# Pass 1: fold each matched OCR fragment into the first existing group it is close to.
for headline, obj in possible_headlines.items():
    groups = []

    for text, box in zip(obj["texts"], obj["boxes"]):
        for group in groups:
            # A group that already reads like the headline (ratio >= 90) is reached:
            # stop here, and the current fragment is neither merged nor stored.
            if fuzz.ratio(group["text"], headline) >= 90:
                break
            if is_close(group["box"], box):
                group["box"] = _union_box(group["box"], box)
                group["text"] += " " + text
                break
        else:
            # No group claimed the fragment (or there are no groups yet): start a new one.
            groups.append({"text": text, "box": box})

    new[headline] = groups

# Pass 2: merge groups that became close to each other after growing in pass 1.
# The scan restarts after every merge, so a box that has just grown is re-tested
# against all remaining groups; a single sweep would leave chains such as
# A-B-C half merged (AB, C).
for key, groups in new.items():
    print(groups)
    merged_any = True
    while merged_any:
        merged_any = False
        for first in range(len(groups)):
            for second in range(first + 1, len(groups)):
                if is_close(groups[first]["box"], groups[second]["box"]):
                    print(f"Merging {groups[first]} and {groups[second]}")
                    groups[first]["box"] = _union_box(groups[first]["box"], groups[second]["box"])
                    groups[first]["text"] += " " + groups[second]["text"]
                    del groups[second]
                    merged_any = True
                    break
            if merged_any:
                # The list changed size; restart the scan with fresh indices.
                break

print(json.dumps(new, indent=2, ensure_ascii=False))

[{'text': 'House to open bicam to public', 'box': ((114, 1646), (3690, 2085))}]
[{'text': 'a', 'box': ((1189, 2534), (1220, 2570))}, {'text': 'Solons to prioritize socialprotection measures', 'box': ((1537, 2975), (3596, 3172))}]
[{'text': 'a', 'box': ((1189, 2534), (1220, 2570))}, {'text': 'DPWH readies list of flood control projects; DepDevdrawsup protocols', 'box': ((96, 4206), (1529, 4527))}]
[{'text': 'a', 'box': ((1189, 2534), (1220, 2570))}, {'text': 'in an', 'box': ((1178, 4621), (1283, 4659))}, {'text': 'DOH explains zero-billing coverage, where itcan be availed', 'box': ((100, 4942), (2594, 5140))}]
[{'text': 'a', 'box': ((1189, 2534), (1220, 2570))}, {'text': 'for', 'box': ((2269, 3308), (2328, 3346))}, {'text': 'for', 'box': ((2797, 3308), (2855, 3346))}, {'text': 'in', 'box': ((1178, 4621), (1221, 4659))}, {'text': 'to', 'box': ((1131, 4770), (1172, 4800))}, {'text': 'Senate vote to continue orjunk VP impeach trial setfor _', 'box': ((1525, 5491), (2872, 5804))}, {'text': 

**Pick the best match for each headline and draw its bounding box**

In [7]:
# Draw on a copy so the original page image is left untouched.
image_merged = image.copy()

for query, candidates in new.items():
    # Headlines without any merged group have nothing to draw.
    if not candidates:
        continue

    # Text -> position in `candidates`; when a text repeats, the last position wins.
    position_by_text = {}
    for position, candidate in enumerate(candidates):
        position_by_text[candidate["text"]] = position

    # Fuzzy-pick the candidate text that best matches the headline.
    best_match = process.extractOne(query, list(position_by_text))
    if not best_match:
        continue

    text, score = best_match
    tl, br = candidates[position_by_text[text]]["box"]
    print(f"{text=} {tl=} {br=}")
    cv2.rectangle(image_merged, tl, br, (0, 255, 0), 5)

# Save next to the input as "<name>_result<ext>".
result_root, result_ext = os.path.splitext(target_image_path)
output_path_merged = f"{result_root}_result{result_ext}"
cv2.imwrite(output_path_merged, image_merged)

text='House to open bicam to public' tl=(114, 1646) br=(3690, 2085)
text='Solons to prioritize socialprotection measures' tl=(1537, 2975) br=(3596, 3172)
text='DPWH readies list of flood control projects; DepDevdrawsup protocols' tl=(96, 4206) br=(1529, 4527)
text='DOH explains zero-billing coverage, where itcan be availed' tl=(100, 4942) br=(2594, 5140)
text='Senate vote to continue orjunk VP impeach trial setfor _' tl=(1525, 5491) br=(2872, 5804)
text="DOJ chiefbares new witness on 'sabungeros case" tl=(2980, 5010) br=(3614, 5400)
text='Pag-IBIG specialrate for expanded 4PHprogram' tl=(1543, 6148) br=(3612, 6341)


True

**Generate Byline from Gemini**

In [8]:
# JSON schema embedded in the prompt: one {headline, byline} object per article.
byline_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "headline": {"type": "string"},
            "byline": {"type": "string"},
        },
        "required": ["headline", "byline"]
    }
}

# The headlines are embedded with ensure_ascii=False so non-ASCII characters
# (e.g. curly quotes) reach the model verbatim instead of as \uXXXX escapes,
# matching how the notebook serialises JSON everywhere else.
byline_prompt = (
    "You are given a newspaper image. "
    "Extract only the article's byline or author from the given headlines — ignore advertisements, captions, subheadlines, and any other text. "
    "use the following headlines to find the bylines: "
    f"{json.dumps(headlines, indent=2, ensure_ascii=False)}. "
    "Return the result strictly matching this JSON schema:\n\n"
    f"{json.dumps(byline_schema, indent=2)}"
)

bylines = generate(byline_prompt, target_image_path)

print(json.dumps(bylines, indent=2, ensure_ascii=False))

[
  {
    "headline": "House to open bicam to public",
    "byline": "Maricel Cruz, Charles Dantes, Darwin G. Amojelar and Ram Superable"
  },
  {
    "headline": "Solons to prioritize social protection measures",
    "byline": "Maricel V. Cruz"
  },
  {
    "headline": "DPWH readies list of flood control projects; DepDev draws up protocols",
    "byline": "Vito Barcelo"
  },
  {
    "headline": "DOH explains zero-billing coverage, where it can be availed",
    "byline": "Ram Superable"
  },
  {
    "headline": "Senate vote to continue or junk VP impeach trial set for Aug. 6",
    "byline": "Ram Superable"
  },
  {
    "headline": "DOJ chief bares new witness on ‘sabungeros’ case",
    "byline": "Pot Chavez and Vince Lopez"
  },
  {
    "headline": "Pag-IBIG special rate for expanded 4PH program",
    "byline": ""
  }
]
