**Initialization**

In [None]:
import google.generativeai as genai
import json
import os
from dotenv import load_dotenv
import easyocr
import cv2
from rapidfuzz import fuzz
from fuzzywuzzy import process, fuzz

load_dotenv()

**Gemini Model**

In [None]:
def generate(prompt, image_path) -> list | dict:
    """Send a text prompt plus one image to Gemini and return the parsed JSON reply.

    Args:
        prompt: Instruction text sent alongside the image.
        image_path: Path of the image file to upload.

    Returns:
        The decoded JSON payload produced by the model.

    Raises:
        OSError: If the image file cannot be opened.
        json.JSONDecodeError: If the reply text is not valid JSON.
    """
    import mimetypes  # local import so this cell does not depend on the import cell changing

    api_key = os.getenv("GEMINI_API_KEY")
    genai.configure(api_key=api_key)

    model = genai.GenerativeModel(
        model_name="gemini-2.5-flash",
        system_instruction="You are a helpful assistant that extracts newspaper fields from images.",
        generation_config={"response_mime_type": "application/json"}
    )

    with open(image_path, "rb") as image_file:
        image_bytes = image_file.read()

    # Describe the upload with the MIME type implied by its extension instead of
    # always claiming PNG; unknown or non-image extensions keep the old default.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/png"

    response = model.generate_content([
        {"text": prompt},
        {"mime_type": mime_type, "data": image_bytes}
    ])

    # The model is configured to answer with JSON, so the reply text is parsed directly.
    return json.loads(response.text)

**Call Gemini to generate the headlines**

In [None]:
# JSON schema embedded in the prompt: a flat array of headline strings.
headline_schema = {
    "type": "array",
    "items": {"type": "string"}
}

# The extraction sentence was garbled ("Extract only the article all possible
# headlines"); it now states the same request grammatically.
headline_prompt = (
    "You are given a newspaper image. "
    "Extract only the article headlines (all possible headlines) — ignore advertisements, captions, subheadlines, and any other text. "
    "Return the result strictly matching this JSON schema:\n\n"
    f"{json.dumps(headline_schema, indent=2)}"
)

# File name of the page to process; it is combined with the "input/" prefix below.
target_image_path = "page_1.png"

generated_headlines = generate(headline_prompt, "input/" + target_image_path)

# Dev log
print(json.dumps(generated_headlines, indent=2, ensure_ascii=False))

**EasyOCR reads**

In [None]:
# Build an English EasyOCR reader, load the target page and run text detection.
reader = easyocr.Reader(['en'])
image = cv2.imread("input/" + target_image_path)
# cv2.imread reports an unreadable/missing file by returning None rather than
# raising, so fail here with a clear message instead of deep inside the OCR call.
if image is None:
    raise FileNotFoundError(f"Could not read image: input/{target_image_path}")
# Each entry is later unpacked as (four corner points, text, confidence).
results = reader.readtext(image)

**Shows how the OCR reads the newspaper**

In [None]:
# Draw every raw OCR detection on a copy of the page so the original image
# stays untouched for the later stages.
image_raw = image.copy()
for (top_left, top_right, bottom_right, bottom_left), text, confidence in results:
    # Only the top-left and bottom-right corners are needed for a rectangle.
    tl = (int(top_left[0]), int(top_left[1]))
    br = (int(bottom_right[0]), int(bottom_right[1]))
    cv2.rectangle(image_raw, tl, br, (0, 255, 0), 2)

    # Label the box with the recognised text, slightly above its top-left corner.
    cv2.putText(image_raw, text, (tl[0], tl[1] - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

output_path_raw = "ocr_reads/" + target_image_path
# cv2.imwrite does not create missing directories and only returns False on
# failure, so create the folder first and surface a failed write explicitly.
os.makedirs(os.path.dirname(output_path_raw), exist_ok=True)
if not cv2.imwrite(output_path_raw, image_raw):
    raise OSError(f"Could not write image: {output_path_raw}")


**Keep only the OCR text that matches the list of headlines from Gemini**

In [None]:
def filter_texts_by_score(texts: list, threshold: int = 70) -> list:

    possible_texts = {}
    for text in texts:
        if text and text in possible_texts:
            text += "."
        if text:
            possible_texts[text] = {
                "texts": [],
                "boxes": []
            }

    for coordinates, text, _ in results:
        
        for headline in possible_texts:
            score = fuzz.partial_ratio(headline, text)
            if score > threshold:
                top_left, _, bottom_right, _ = coordinates
                
                current_box = ((int(top_left[0]), int(top_left[1])),
                            (int(bottom_right[0]), int(bottom_right[1])))
                possible_texts[headline]["texts"].append(text)
                possible_texts[headline]["boxes"].append(current_box)
    print(possible_texts)
    return possible_texts
        
# Collect, for every Gemini headline, the OCR fragments (and their boxes) that match it.
possible_headlines = filter_texts_by_score(generated_headlines)

**Merge bounding boxes that are close to each other**

In [None]:
def merging_bounding_boxes(texts: dict) -> dict:
    """Merge neighbouring OCR boxes collected for each target string.

    Args:
        texts: Mapping of target string to ``{"texts": [...], "boxes": [...]}``
            where the two lists are parallel and each box is
            ``((x_min, y_min), (x_max, y_max))``.

    Returns:
        Mapping of target string to a list of ``{"text": str, "box": box}``
        groups. Texts of merged fragments are joined with a single space and
        the box is the bounding rectangle of all merged fragments.
    """

    def is_close(coordinate1, coordinate2, gap_x=30, gap_y=20):
        """Return True when two boxes intersect or sit next to each other.

        Boxes count as neighbours when they are side by side (horizontal gap of
        at most ``gap_x`` with top or bottom edges within ``gap_y``) or stacked
        (vertical gap of at most ``gap_y`` with left or right edges within
        ``gap_x``).
        """
        (x1_min, y1_min), (x1_max, y1_max) = coordinate1
        (x2_min, y2_min), (x2_max, y2_max) = coordinate2

        # Positive overlap on both axes means the rectangles intersect.
        overlap_x = min(x1_max, x2_max) - max(x1_min, x2_min)
        overlap_y = min(y1_max, y2_max) - max(y1_min, y2_min)
        if overlap_x > 0 and overlap_y > 0:
            return True

        # Edge alignment (corner-based)
        align_x = (abs(x1_min - x2_min) <= gap_x) or (abs(x1_max - x2_max) <= gap_x)
        align_y = (abs(y1_min - y2_min) <= gap_y) or (abs(y1_max - y2_max) <= gap_y)

        # Side-by-side: small gap on X AND top/bottom edges align
        side_by_side = (
            (0 <= x2_min - x1_max <= gap_x) or
            (0 <= x1_min - x2_max <= gap_x)
        ) and align_y

        # Stacked: small gap on Y AND left/right edges align
        stacked = (
            (0 <= y2_min - y1_max <= gap_y) or
            (0 <= y1_min - y2_max <= gap_y)
        ) and align_x

        return side_by_side or stacked

    def union(box_a, box_b):
        """Return the smallest box that contains both input boxes."""
        (a_tl_x, a_tl_y), (a_br_x, a_br_y) = box_a
        (b_tl_x, b_tl_y), (b_br_x, b_br_y) = box_b
        return ((min(a_tl_x, b_tl_x), min(a_tl_y, b_tl_y)),
                (max(a_br_x, b_br_x), max(a_br_y, b_br_y)))

    # value is list of dicts with "text" and "box"
    new = {}

    for headline, obj in texts.items():
        groups = []

        # Pass 1: attach each fragment to the first existing group it is close
        # to, otherwise start a new group.
        for text, box in zip(obj["texts"], obj["boxes"]):
            for group in groups:
                if is_close(group["box"], box):
                    group["box"] = union(group["box"], box)
                    group["text"] += " " + text
                    break
            else:
                groups.append({"text": text, "box": box})

        # Pass 2: groups grow during pass 1, so groups that started apart may
        # now be neighbours. Repeat until a full sweep merges nothing; a single
        # sweep left a freshly grown group unchecked against the remaining ones.
        merged = True
        while merged:
            merged = False
            for first in range(len(groups)):
                for second in range(first + 1, len(groups)):
                    if is_close(groups[first]["box"], groups[second]["box"]):
                        groups[first]["box"] = union(groups[first]["box"], groups[second]["box"])
                        groups[first]["text"] += " " + groups[second]["text"]
                        del groups[second]
                        merged = True
                        break
                if merged:
                    # Indices are stale after a deletion; restart the sweep.
                    break

        new[headline] = groups

    print(json.dumps(new, indent=2))
    return new

# Fuse neighbouring OCR fragments of each headline into larger boxes.
merged_texts_headlines = merging_bounding_boxes(possible_headlines)

**Call Gemini again for the other parts of the article based on the generated headlines**

In [None]:
# JSON schema embedded in the prompt: one object per article.
# "headline" is required as well, because the drawing step stores headline
# coordinates under each article's "headline" entry.
article_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "headline": {"type": "string"},
            "subheadline": {"type": "string"},
            "body" : {"type": "string"},
            "byline": {"type": "string"}
        },
        "required": ["headline", "subheadline", "body", "byline"]
    }
}

# Adjacent string literals are concatenated verbatim, so every sentence must
# carry its own trailing ". " — without it the instructions ran together
# (e.g. "...compared to body)From the given headlines...").
article_prompt = (
    "You are given a newspaper image. "
    "From the given headlines, extract their subheadline if there is any (required: subheadlines must have higher font size compared to body). "
    "From the given headlines, extract their body. "
    "Also extract the article's byline or author from the given headlines — bylines might appear after your generated body and will never appear on top of the headline. "
    "Use the following headlines to find the subheadline, body and bylines: "
    f"{json.dumps(generated_headlines, indent=2, ensure_ascii=False)}. "
    "Return the result strictly matching this JSON schema:\n\n"
    f"{json.dumps(article_schema, indent=2)}"
)

generated_article = generate(article_prompt, "input/" + target_image_path)

def transform_articles(articles):
    """Wrap every field of each article in a ``{"text", "coordinate"}`` record.

    Args:
        articles: Iterable of dicts mapping a field name to its value.

    Returns:
        A new list with one dict per article; each field maps to
        ``{"text": <original value>, "coordinate": []}`` with a fresh empty list.
    """
    return [
        {
            field: {"text": content, "coordinate": []}
            for field, content in article.items()
        }
        for article in articles
    ]

# Working copy of the articles; each field's "coordinate" list starts empty.
return_json = transform_articles(generated_article)
print(return_json)

**Pick the best match and draw its bounding box**

In [None]:
# Canvas for the final annotated page; a copy keeps the loaded `image` unmodified.
result_image = image.copy()

def boxes_overlap(box1, box2):
    """Tell whether two axis-aligned boxes intersect (touching edges count).

    Args:
        box1: ``((x_min, y_min), (x_max, y_max))`` of the first box.
        box2: ``((x_min, y_min), (x_max, y_max))`` of the second box.

    Returns:
        True when the boxes share at least one point, otherwise False.
    """
    (left_a, top_a), (right_a, bottom_a) = box1
    (left_b, top_b), (right_b, bottom_b) = box2

    # The boxes intersect exactly when their extents overlap on both axes.
    overlaps_horizontally = right_a >= left_b and right_b >= left_a
    overlaps_vertically = bottom_a >= top_b and bottom_b >= top_a
    return overlaps_horizontally and overlaps_vertically



def match_score(query, candidate):
    """Blend three fuzzy-matching scores into one weighted similarity value.

    Args:
        query: Reference string.
        candidate: String compared against the reference.

    Returns:
        ``0.4 * partial_ratio + 0.3 * ratio + 0.3 * token_set_ratio``.
    """
    # (scorer, weight) pairs, evaluated and summed in this order:
    # substring match, whole-string match, then order-insensitive token match.
    weighted_scorers = (
        (fuzz.partial_ratio, 0.4),
        (fuzz.ratio, 0.3),
        (fuzz.token_set_ratio, 0.3),
    )
    return sum(scorer(query, candidate) * weight for scorer, weight in weighted_scorers)


def draw_boxes(texts: dict, color=(0, 255, 0), thickness=5, part="headline"):
    """Draw the best-matching box for each target string and record its coordinates.

    For every key of ``texts`` the candidate group whose text best matches the
    key is selected. Its rectangle is drawn on the module-level ``result_image``
    unless it overlaps a rectangle already drawn by this call. The coordinates
    are stored in the module-level ``return_json`` as ``[x_min, y_min, x_max,
    y_max]``, and the annotated image is written to ``"output/" +
    target_image_path``.

    Args:
        texts: Mapping of target string to a list of ``{"text", "box"}`` groups.
        color: Rectangle colour passed to ``cv2.rectangle``.
        thickness: Rectangle line thickness.
        part: Article field the boxes belong to (``"headline"``, ``"byline"``...).

    Raises:
        OSError: If the annotated image cannot be written.
    """
    drawn_boxes = []  # rectangles drawn so far, as (tl, br)

    for position, query in enumerate(texts):
        # Ignore groups whose exact box has already been drawn.
        candidates = [obj for obj in texts[query] if obj["box"] not in drawn_boxes]
        if not candidates:
            continue  # skip if no choices

        choices = [obj["text"] for obj in candidates]
        cord = [obj["box"] for obj in candidates]

        # De-duplicate choices (order preserved) before asking for the best match.
        best_match = process.extractOne(query, list(dict.fromkeys(choices)))
        if not best_match:
            continue

        # Index instead of unpacking so the result arity does not matter.
        text = best_match[0]
        tl, br = cord[choices.index(text)]

        # skip if overlaps any existing box
        if any(boxes_overlap((tl, br), old) for old in drawn_boxes):
            continue

        cv2.rectangle(result_image, tl, br, color, thickness)
        print("result =", text, (tl, br))
        drawn_boxes.append((tl, br))

        if part != "headline":
            # Attach the box to every article whose field text matches the OCR text.
            for article in return_json:
                field = article.get(part)
                if not field or not isinstance(field.get("text"), str):
                    continue  # article has no usable text for this part
                if match_score(field["text"], text) > 90:
                    print("success")
                    field["coordinate"] = [*tl, *br]
        elif position < len(return_json) and part in return_json[position]:
            # Headline boxes are stored by position.
            # NOTE(review): assumes return_json lists the articles in the same
            # order as the keys of `texts` — confirm against the Gemini output.
            return_json[position][part]["coordinate"] = [*tl, *br]
        else:
            # Previously an IndexError/KeyError; report and keep processing.
            print(f"warning: no article entry to store the {part} box for {query!r}")

    # save output (cv2.imwrite neither creates folders nor raises on failure)
    output_path = "output/" + target_image_path
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    if not cv2.imwrite(output_path, result_image):
        raise OSError(f"Could not write image: {output_path}")

# Draw headline boxes with the default colour and store their coordinates.
draw_boxes(merged_texts_headlines)

**Run the remaining parts of the article (bylines and subheadlines)**

In [None]:
# Bylines: match against OCR text, merge neighbouring boxes, then draw and record.
# .get() is used because the model output is not validated; a missing field
# yields None, which filter_texts_by_score ignores, instead of a KeyError.
bylines = [article.get("byline") for article in generated_article]
possible_bylines = filter_texts_by_score(bylines)
merged_bylines = merging_bounding_boxes(possible_bylines)
draw_boxes(merged_bylines, color=(0, 0, 255), part="byline")

# Subheadlines: same pipeline with a different colour and target field.
subheadlines = [article.get("subheadline") for article in generated_article]
possible_subheadlines = filter_texts_by_score(subheadlines)
merged_subheadlines = merging_bounding_boxes(possible_subheadlines)
draw_boxes(merged_subheadlines, color=(255, 0, 0), part="subheadline")

**Save the details as JSON**

In [None]:
# Name the JSON file after the image. splitext removes only the final extension,
# whereas split('.')[0] truncated names that contain additional dots.
json_path = "json/" + os.path.splitext(target_image_path)[0] + ".json"
# open() does not create missing folders.
os.makedirs(os.path.dirname(json_path), exist_ok=True)
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(return_json, f, indent=2, ensure_ascii=False)