In [None]:
import collections
from io import BytesIO
from typing import Optional

import pdfplumber
from langchain.tools import BaseTool

from src.utils import find_footnote_bloc, find_blocs, extract_bloc

class PDFLangChainParser(BaseTool):
    """LangChain tool that extracts page text from a PDF using pdfplumber.

    Each page's text is assembled from up to three regions: an upper block,
    the blocs reported by ``find_blocs`` and a footnote block. The regions
    are located with the ``src.utils`` helpers and the document's font sizes.

    Font sizes given to the constructor are honoured; any size left as
    ``None`` is detected for each document from its most frequent character
    sizes.
    """

    # BaseTool is a pydantic model: overrides and extra state must be declared
    # as annotated fields, otherwise they are rejected or cannot be assigned.
    name: str = "PDFLangChainParser"
    description: str = "This tool extracts and processes text from a PDF using custom logic."
    main_size: Optional[float] = None
    footnote_size: Optional[float] = None
    sidetext_size: Optional[float] = None

    def __init__(self, main_size=None, footnote_size=None, sidetext_size=None, **kwargs):
        """Create the tool, optionally pinning some or all font sizes.

        Args:
            main_size: Font size of the main text, or ``None`` to detect it.
            footnote_size: Font size of footnotes, or ``None`` to detect it.
            sidetext_size: Font size of side text, or ``None`` to detect it.
            **kwargs: Forwarded unchanged to ``BaseTool``.
        """
        # Route everything through BaseTool so the pydantic model is fully
        # initialised; plain attribute assignment before that would fail.
        super().__init__(
            main_size=main_size,
            footnote_size=footnote_size,
            sidetext_size=sidetext_size,
            **kwargs,
        )

    def _get_font_sizes(self, pdf) -> collections.Counter:
        """Count the non-space characters of ``pdf`` per font size.

        Sizes are rounded to two decimals before counting.

        Returns:
            A ``Counter`` mapping rounded font size to character count.
        """
        return collections.Counter(
            round(char["size"], 2)
            for page in pdf.pages
            for char in page.chars
            if char["text"] != ' '
        )

    def _set_font_sizes(self, pdf) -> None:
        """Fill in every font size that is still ``None`` from ``pdf``.

        The three most frequent sizes are mapped, in order, to the main,
        footnote and side-text slots. Sizes that are already set are kept.

        Raises:
            ValueError: If detection is needed and the PDF contains fewer
                than three distinct font sizes.
        """
        configured = (self.main_size, self.footnote_size, self.sidetext_size)
        if all(size is not None for size in configured):
            # Nothing to detect; skip scanning every character of the PDF.
            return

        font_sizes = self._get_font_sizes(pdf)
        if len(font_sizes) < 3:
            raise ValueError("Not enough different font sizes found in the PDF.")

        detected = [size for size, _count in font_sizes.most_common(3)]
        self.main_size, self.footnote_size, self.sidetext_size = (
            found if wanted is None else wanted
            for wanted, found in zip(configured, detected)
        )

    def _combine_blocs(self, clean_page) -> str:
        """Build the text of one page from its upper, bloc and footnote regions.

        Args:
            clean_page: A pdfplumber page (already filtered by the caller).

        Returns:
            The region texts joined with newlines, in the order upper block,
            blocs (if any), footnote (if any). Regions whose extraction
            yields nothing contribute an empty string.
        """
        # The layout bounds are used as the page's bounding box.
        # NOTE(review): assumes these coordinates are interchangeable with the
        # bbox convention expected by ``within_bbox`` -- confirm.
        layout = clean_page.layout
        x0, y0, x1, y1 = layout.x0, layout.y0, layout.x1, layout.y1

        footnote_top = find_footnote_bloc(clean_page, self.main_size, self.sidetext_size)
        # ``footnote_top`` may be None here; ``find_blocs`` receives it as-is.
        blocs, x_middle, upper_main = find_blocs(clean_page, footnote_top, self.main_size, y1)

        has_blocs = blocs is not None
        has_footnote = footnote_top is not None

        # The upper block ends where the next region starts: the blocs, else
        # the footnote, else the bottom of the page.
        if has_blocs:
            upper_bottom = upper_main
        elif has_footnote:
            upper_bottom = footnote_top
        else:
            upper_bottom = y1

        parts = [clean_page.within_bbox((x0, y0, x1, upper_bottom)).extract_text() or ""]
        if has_blocs:
            parts.append(extract_bloc(clean_page, blocs, x_middle, x0, x1) or "")
        if has_footnote:
            footnote = clean_page.within_bbox((x0, footnote_top, x1, y1))
            parts.append(footnote.extract_text() or "")

        return "\n".join(parts)

    def _run(self, pdf_file: bytes) -> str:
        """Extract the text of every page of a PDF.

        Args:
            pdf_file: Raw bytes of the PDF document.

        Returns:
            The text of all pages, each followed by a blank line.

        Raises:
            ValueError: If font sizes must be detected and the PDF has fewer
                than three distinct font sizes.
        """
        # Remember the configured sizes so that values detected for this
        # document do not leak into the next call.
        configured = (self.main_size, self.footnote_size, self.sidetext_size)
        pages_text = []
        try:
            with pdfplumber.open(BytesIO(pdf_file)) as pdf:
                self._set_font_sizes(pdf)  # Detect only the sizes not already set
                for page in pdf.pages:
                    # Keep only objects that carry a non-None "mcid" entry.
                    clean_page = page.filter(lambda obj: obj.get("mcid") is not None)
                    pages_text.append(self._combine_blocs(clean_page))
        finally:
            self.main_size, self.footnote_size, self.sidetext_size = configured

        return "".join(f"{text}\n\n" for text in pages_text)

    async def _arun(self, pdf_file: bytes) -> str:
        """Async entry point; not supported by this tool.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("Async method not implemented for PDFLangChainParser.")