In [58]:
from pdfminer.high_level import extract_text, extract_pages
from pdfminer.layout import LAParams, LTTextContainer, LTChar, LTRect, LTFigure, LTTextLine
import markdownify
import os

# version 2
import pdfplumber
from PIL import Image
from pdf2image import convert_from_path

# import pytesseract

In [None]:
# !pip install pytesseract

In [1]:
# version 1 - the logical structure of PDF is not kept
def write_content(input_file_path, markdown_content):
    """Write text to ``../data/processed/<input file name without extension>.md``.

    Args:
        input_file_path: Path of the source document. Only its file name is
            used, with the final extension removed.
        markdown_content: Text to write to the output file.
    """
    # splitext removes only the last extension, so "manual.v2.pdf" becomes
    # "manual.v2" instead of being truncated at the first dot.
    output_file, _ = os.path.splitext(os.path.basename(input_file_path))
    output_dir = '../data/processed'
    # The output directory may not exist yet (e.g. on a fresh checkout).
    os.makedirs(output_dir, exist_ok=True)
    # Explicit UTF-8 so non-ASCII text is written the same way on every platform.
    with open(f'{output_dir}/{output_file}.md', 'w', encoding='utf-8') as f:
        f.write(markdown_content)

def extract_text_from_pdf(pdf_file_path):
    """Return whatever ``extract_text`` produces for the PDF at ``pdf_file_path``."""
    return extract_text(pdf_file_path)

def extract_text_from_pdf_with_layout(pdf_file_path, laparams=None):
    """Extract text from a PDF using explicit layout-analysis parameters.

    Args:
        pdf_file_path: Path of the PDF to read.
        laparams: Layout-analysis parameters passed to ``extract_text``. When
            omitted, a default-constructed ``LAParams()`` is used, which is
            what this function always did before the argument existed.

    Returns:
        The value returned by ``extract_text``.
    """
    if laparams is None:
        laparams = LAParams()
    return extract_text(pdf_file_path, laparams=laparams)

def convert_text_to_markdown(text):
    """Run ``text`` through markdownify with ATX-style headings and return the result."""
    to_markdown = markdownify.markdownify
    return to_markdown(text, heading_style="ATX")

def process_pdf_to_markdown(file_path):
    """Extract a PDF's text, convert it to Markdown and save the result.

    Args:
        file_path: Path of the PDF to process. It is also handed to
            ``write_content``, which derives the output file name from it.

    Returns:
        The converted Markdown text, which is the same text written to disk.
    """
    pdf_content = extract_text_from_pdf_with_layout(file_path)
    markdown_content = convert_text_to_markdown(pdf_content)
    # Save the converted Markdown. The raw extracted text used to be written
    # here, so the .md file did not match the value returned to the caller.
    write_content(file_path, markdown_content)
    return markdown_content

# Run the version-1 pipeline on the sample manual and keep the returned text
# in `markdown_content` for inspection in later cells.
markdown_content = process_pdf_to_markdown("../data/input/usermanual.pdf")

In [None]:
# taken from here https://github.com/pdfminer/pdfminer.six/blob/master/docs/source/howto/character_properties.rst
from pathlib import Path
from typing import Iterable, Any

from pdfminer.high_level import extract_pages


def show_ltitem_hierarchy(o: Any, depth=0):
    """Print one table row for ``o`` and, recursively, for everything it contains.

    The column header is printed only for the top-level call (``depth == 0``).
    Each row shows the indented class name, font info, stroking color and text
    of the item; items that are iterable are then descended into with
    ``depth + 1``.
    """
    is_root = depth == 0
    if is_root:
        print('element                        font                  stroking color  text')
        print('------------------------------ --------------------- --------------  ----------')

    # Columns are truncated/padded to fixed widths; the text column is free-form.
    name_col = get_indented_name(o, depth)
    font_col = get_optional_fontinfo(o)
    color_col = get_optional_color(o)
    text_col = get_optional_text(o)
    print(f'{name_col:<30.30s} {font_col:<20.20s} {color_col:<17.17s}{text_col}')

    if not isinstance(o, Iterable):
        return
    for child in o:
        show_ltitem_hierarchy(child, depth + 1)


def get_indented_name(o: Any, depth: int) -> str:
    """Class name of ``o`` prefixed with two spaces per nesting level."""
    indent = '  ' * depth
    class_name = o.__class__.__name__
    return indent + class_name


def get_optional_fontinfo(o: Any) -> str:
    """Font name and rounded point size of ``o``, or '' when it lacks either attribute."""
    has_font = hasattr(o, 'fontname') and hasattr(o, 'size')
    if not has_font:
        return ''
    return '{} {}pt'.format(o.fontname, round(o.size))

def get_optional_color(o: Any) -> str:
    """Stroking color (``graphicstate.scolor``) of ``o`` as a string, or '' when it has no graphicstate."""
    if not hasattr(o, 'graphicstate'):
        return ''
    stroking_color = o.graphicstate.scolor
    return '{}'.format(stroking_color)


def get_optional_text(o: Any) -> str:
    """Whitespace-stripped ``o.get_text()``, or '' when ``o`` has no ``get_text``."""
    if not hasattr(o, 'get_text'):
        return ''
    raw_text = o.get_text()
    return raw_text.strip()


# Print the layout hierarchy of the sample manual.
path = Path('../data/input/usermanual.pdf').expanduser()
# The result of extract_pages is passed as the root item, so the printed table
# starts with a row for that object and then descends into whatever it yields.
pages = extract_pages(path)
show_ltitem_hierarchy(pages)

### pdfminer links:

- https://github.com/pdfminer/pdfminer.six/blob/master/docs/source/tutorial/extract_pages.rst#id3
- https://towardsdatascience.com/extracting-text-from-pdf-files-with-python-a-comprehensive-guide-9fc4003d517
- https://github.com/g-stavrakis/PDF_Text_Extraction/blob/main/PDF_Reader.ipynb


Fixing the LTChar problem:
https://github.com/pdfminer/pdfminer.six/issues/526

https://medium.com/@mb20261/python-by-examples-extract-pdf-by-pdfminer-six-246cba6f89b3


In [69]:
def extract_text_local(object, size_ndigits=None):
    """Return a text container's text together with the distinct fonts it uses.

    Args:
        object: A layout container that exposes ``get_text()`` and iterates
            over its lines. The name shadows the ``object`` builtin inside
            this function; it is kept because it is part of the existing
            signature.
        size_ndigits: Optional number of decimal digits to round font sizes to
            before de-duplicating. Sizes are compared exactly by default, so
            values that differ only by float noise (e.g. 9.95999999999998 and
            9.960000000000036) count as different formats unless this is set.

    Returns:
        A ``(line_text, unique_line_formats)`` tuple: the container's
        ``get_text()`` value and a list of distinct ``(fontname, size)``
        tuples in order of first appearance.
    """
    # save the text content of the whole container
    line_text = object.get_text()

    # (fontname, size) of every character, duplicates included
    line_formats = []

    for text_line in object:
        # A child may not be a real text line, so check its type before using
        # any text-line API, including the get_text() call in the debug print.
        if not isinstance(text_line, LTTextLine):
            continue
        print(f'+++++++{text_line.get_text()} +++++++++++')

        for character in text_line:
            # Only LTChar items are inspected for font information.
            if not isinstance(character, LTChar):
                continue
            size = character.size
            if size_ndigits is not None:
                size = round(size, size_ndigits)
            line_formats.append((character.fontname, size))

    # dict.fromkeys removes duplicates but keeps first-seen order, so the
    # result is the same on every run (set iteration order is not).
    unique_line_formats = list(dict.fromkeys(line_formats))
    return (line_text, unique_line_formats)

In [70]:
# version 2: walk the layout tree page by page and collect the font formats
# used by every text container.
pdf_file_path = "../data/input/usermanual.pdf"

# every layout element seen so far, across all pages
obj = []
# layout elements of the page currently being processed
page_objects = []

# dictionary containing all pages with all objects: text, images, tables per page
# (not populated by this cell yet)
final_dictionary = {}

# one entry per text container: that container's list of unique (fontname, size) tuples
line_format = []

for page_number, page in enumerate(extract_pages(pdf_file_path)):

    print(f'************{page_number}************')

    # get all objects from the page
    page_objects = list(page)

    # Named `element` rather than `object`: a module-level loop variable called
    # `object` would replace the builtin for the rest of the kernel session.
    for element in page_objects:
        obj.append(element)
        print(f'-------------{element}-------------------')

        if isinstance(element, LTTextContainer):

            # ++++++++++++++ EXTRACT TEXT ++++++++++++++++++
            (line_text, unique_line_formats) = extract_text_local(element)
            print(f'line_text {line_text}')
            print(f'unique_line_formats {unique_line_formats}')
            line_format.append(unique_line_formats)



************0************
-------------<LTTextBoxHorizontal(0) 420.430,772.356,554.109,780.876 'Strength. Performance. Passion. \n'>-------------------
+++++++Strength. Performance. Passion. 
 +++++++++++
line_text Strength. Performance. Passion. 

unique_line_formats [('Helvetica-Bold', 8.519999999999982)]
-------------<LTTextBoxHorizontal(1) 43.920,590.712,431.443,618.672 'User Manual: LOGON 2.2.4.6 \n'>-------------------
+++++++User Manual: LOGON 2.2.4.6 
 +++++++++++
line_text User Manual: LOGON 2.2.4.6 

unique_line_formats [('Helvetica-Bold', 27.960000000000036)]
-------------<LTTextBoxHorizontal(2) 43.920,567.856,145.549,576.376 'Application Area Manual \n'>-------------------
+++++++Application Area Manual 
 +++++++++++
line_text Application Area Manual 

unique_line_formats [('Helvetica-Bold', 8.519999999999982)]
-------------<LTTextBoxHorizontal(3) 43.920,500.986,346.719,521.266 'Authors: \nEnrique Velasco Martín | Jacinto Jiménez Fuentes | Carmen García García  \n'>--------

In [67]:
# Show the per-container font formats collected by the extraction loop.
# `unique_line_formats` holds only the formats of the last container processed,
# whereas `line_format` is the list of lists that the recorded output shows.
line_format


[[('Helvetica-Bold', 8.519999999999982)],
 [('Helvetica-Bold', 27.960000000000036)],
 [('Helvetica-Bold', 8.519999999999982)],
 [('Helvetica-Bold', 8.519999999999982)],
 [('Helvetica', 6.960000000000001)],
 [('Helvetica', 6.960000000000001)],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [('Helvetica', 8.519999999999982)],
 [('Helvetica-Bold', 14.040000000000077)],
 [('Helvetica-Bold', 9.960000000000036)],
 [('Arial,Bold', 9.959999999999923), ('Helvetica-Bold', 9.960000000000036)],
 [('Helvetica', 9.960000000000036)],
 [('Helvetica-Bold', 9.960000000000036)],
 [('Helvetica', 9.960000000000036), ('Helvetica-Bold', 9.960000000000036)],
 [('Helvetica-Bold', 9.960000000000036)],
 [('Helvetica', 9.960000000000036), ('Helvetica-Bold', 9.960000000000036)],
 [('Helvetica', 9.95999999999998)],
 [('Helvetica', 9.95999999999998)],
 [('Helvetica-Bold', 9.960000000000036)],
 [('Helvetica', 9.95999999999998), ('Helvetica', 9.960000000000036)],
 [('Helveti

### Other resources:

https://www.posos.co/blog-articles/how-to-extract-and-structure-text-from-pdf-files-with-python-and-machine-learning