In [20]:
import cv2
import json
import easyocr
import requests
import numpy as np
from detectron2 import model_zoo
from detectron2.config import get_cfg
from pdf2image import convert_from_path
from detectron2.engine import DefaultPredictor


In [21]:
# Path of the PDF handed to summarize_paper() further down in this notebook.
paper_path = "./paper.pdf"

In [None]:
# Endpoint that summarize_paper() POSTs the extracted text and figures to.
# NOTE(review): hard-coded IP over plain HTTP — consider making this configurable.
server_url = "http://130.179.29.185:8000/summarize"
# Weights file and model-zoo config name passed to get_detectron_model().
frcnn_path = "./faster-rcnn.pth"
frcnn_zoo_config_name = 'COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml'
# Assigned to cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST when the detector is built.
prediction_score_threshold = 0.7
# Not referenced by any code visible in this notebook; presumably the detector's
# predicted class ids index into this list — TODO confirm against the trained model.
class_labels = ['text', 'title', 'list', 'table', 'figure']

In [23]:
def process_pages(path):
    """Load the PDF at ``path`` and return its pages as BGR arrays.

    Args:
        path: Location of the PDF, forwarded unchanged to ``convert_from_path``.

    Returns:
        A list with one entry per item yielded by ``convert_from_path``; each
        entry is the item converted to a NumPy array and passed through
        ``cv2.cvtColor`` with ``cv2.COLOR_RGB2BGR``.
    """
    return [
        cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
        for page in convert_from_path(path)
    ]

In [24]:
def get_detectron_model(model_zoo_config_name, model_path, prediction_score_threshold,
                        device="cpu", num_classes=5):
    """Build a Detectron2 ``DefaultPredictor`` from a model-zoo config and local weights.

    Args:
        model_zoo_config_name: Model-zoo config name, resolved to a file with
            ``model_zoo.get_config_file`` and merged into a fresh config.
        model_path: Value stored in ``cfg.MODEL.WEIGHTS``.
        prediction_score_threshold: Value stored in
            ``cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST``.
        device: Value stored in ``cfg.MODEL.DEVICE``. Defaults to ``"cpu"``,
            which was previously hard-coded.
        num_classes: Value stored in ``cfg.MODEL.ROI_HEADS.NUM_CLASSES``.
            Defaults to ``5``, which was previously hard-coded.

    Returns:
        The ``DefaultPredictor`` constructed from the resulting config.
    """
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file(model_zoo_config_name))
    cfg.MODEL.DEVICE = device
    cfg.MODEL.WEIGHTS = model_path
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = prediction_score_threshold
    # The class count has to agree with the head the weights were trained with.
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = num_classes
    return DefaultPredictor(cfg)

In [25]:
def detect_objects(image, detector):
    """Run ``detector`` on ``image`` and return its instances moved to the CPU.

    Args:
        image: Input handed directly to ``detector``.
        detector: Callable whose result is indexable by ``"instances"``.

    Returns:
        The result of calling ``.to("cpu")`` on the ``"instances"`` entry.
    """
    instances = detector(image)["instances"]
    return instances.to("cpu")

In [26]:
def extract_text(image, reader):
    """OCR ``image`` with ``reader`` and return the recognized text as one string.

    Args:
        image: Image passed to ``reader.readtext``.
        reader: Object exposing ``readtext(image)``; element ``[1]`` of each
            returned item is used as the recognized string.

    Returns:
        The recognized strings joined by single spaces, in the order the
        reader returned them (an empty string when nothing was returned).
    """
    fragments = (detection[1] for detection in reader.readtext(image))
    return " ".join(fragments)

In [27]:
def process_page_sections(image, objects, reader):
    """Split one page into OCR'd text and cropped figure images.

    Args:
        image: Page image as a NumPy array, sliced as ``image[rows, cols]``.
        objects: Detection result exposing ``pred_boxes`` (indexable, each item
            having a ``.tensor`` whose first row is ``x1, y1, x2, y2``) and
            ``pred_classes`` (indexable class ids).
        reader: OCR reader forwarded to ``extract_text``.

    Returns:
        ``(page_text, page_figures)`` where ``page_text`` is the OCR output of
        every class-id 0/1 box, each followed by a newline, and
        ``page_figures`` is the list of crops for every box with class id > 2.
    """
    page_text = ""
    page_figures = []
    pred_boxes = objects.pred_boxes
    pred_classes = objects.pred_classes
    # Boxes are handled in the order the detector returned them; no sorting is applied.
    for i in range(len(pred_boxes)):
        x1, y1, x2, y2 = pred_boxes[i].tensor.numpy()[0]
        box_image = image[int(y1):int(y2), int(x1):int(x2)]
        if box_image.size == 0:
            # Truncating the float coordinates can collapse a thin box to zero
            # width/height. An empty crop carries no content, so skip it rather
            # than sending it to OCR or returning it as a figure.
            continue
        class_id = int(pred_classes[i])
        if class_id < 2:
            # Class ids 0 and 1 are OCR'd (presumably 'text' and 'title' —
            # TODO confirm against the label order used at training time).
            page_text += extract_text(box_image, reader) + "\n"
        elif class_id > 2:
            # Class ids above 2 are kept as image crops.
            page_figures.append(box_image)
        # NOTE(review): class id 2 is neither OCR'd nor kept — confirm this is intended.
    return page_text, page_figures

In [28]:
def get_paper_section(paper_path, detector, reader):
    """Run detection and section extraction on every page of a paper.

    Args:
        paper_path: PDF location handed to ``process_pages``.
        detector: Detector forwarded to ``detect_objects``.
        reader: OCR reader forwarded to ``process_page_sections``.

    Returns:
        ``(full_text, figures, figure_pages)``: the per-page text strings, the
        figure crops of all pages concatenated in page order, and the number
        of figure crops found on each page.
    """
    full_text, figures, figure_pages = [], [], []

    for page in process_pages(paper_path):
        detections = detect_objects(page, detector)
        text, crops = process_page_sections(page, detections, reader)
        full_text.append(text)
        figures.extend(crops)
        # Per-page counts let a consumer map the flat figure list back to pages.
        figure_pages.append(len(crops))

    return full_text, figures, figure_pages

In [29]:
def get_figure_files(figures, figure_pages):
    """JPEG-encode the figures and package them as multipart upload tuples.

    Args:
        figures: Flat iterable of figure images, consumed in order.
        figure_pages: Number of figures to take for each page, by page index.

    Returns:
        A list of ``("figures", (filename, jpeg_bytes, "image/jpeg"))`` tuples,
        where ``filename`` is ``"<page index>_<index within page>.jpg"``.
    """
    remaining = iter(figures)
    files = []
    for page_index, count in enumerate(figure_pages):
        for figure_index in range(count):
            _, encoded = cv2.imencode('.jpg', next(remaining))
            filename = f"{page_index}_{figure_index}.jpg"
            files.append(("figures", (filename, encoded.tobytes(), "image/jpeg")))
    return files

In [30]:
def summarize_paper(paper_path, detector, reader, timeout=None):
    """Extract a paper's text and figures and request a summary from the server.

    Args:
        paper_path: PDF location handed to ``get_paper_section``.
        detector: Layout detector forwarded to ``get_paper_section``.
        reader: OCR reader forwarded to ``get_paper_section``.
        timeout: Optional timeout forwarded to ``requests.post``. The default
            ``None`` keeps the previous behaviour of waiting indefinitely.

    Returns:
        The ``"summary"`` field of the server's JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    full_text, figures, figure_pages = get_paper_section(paper_path, detector, reader)
    files = get_figure_files(figures, figure_pages)
    data = {"full_text": json.dumps(full_text), "figure_pages": json.dumps(figure_pages)}
    response = requests.post(server_url, files=files, data=data, timeout=timeout)
    # Surface HTTP failures as such instead of a confusing JSON/KeyError below.
    response.raise_for_status()
    return response.json()["summary"]
    

In [31]:
# Build the layout detector from the configured zoo config, weights and score threshold.
detector = get_detectron_model(frcnn_zoo_config_name, frcnn_path, prediction_score_threshold)
# OCR reader set up for English ('en') only.
reader = easyocr.Reader(['en'])

  return torch.load(f, map_location=torch.device("cpu"))


In [32]:
# Run the whole pipeline on the configured paper; the result is written to disk in the next cell.
summary = summarize_paper(paper_path, detector, reader)

In [33]:
# Write the summary explicitly as UTF-8: the platform default encoding may be
# unable to represent non-ASCII characters in the text, and pandoc (next cell)
# reads its input as UTF-8.
with open("summary.md", "w", encoding="utf-8") as file:
    file.write(summary)


In [34]:
# Shell out to pandoc to convert the Markdown summary written above into summary.pdf.
!pandoc summary.md -o summary.pdf