In [1]:
from operator import itemgetter
import fitz
import json

In [2]:
def fonts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.

    Every span of every text block is counted, including spans that hold only
    whitespace. Without ``granularity`` the identifier of a style is its font
    size formatted as a string, and the stored 'font' is the one of the last
    span seen at that size.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(identifier, count), (identifier, count)], dict
    :return: style identifiers sorted by usage count (most used first), style
        information keyed by identifier
    :raises ValueError: if no text span was found in the document
    """
    styles = {}
    font_counts = {}

    for page in doc:
        # PyMuPDF renamed getText() to get_text(); prefer the current name and
        # fall back to the legacy alias so both old and new releases work.
        get_text = getattr(page, "get_text", None) or page.getText
        for block in get_text("dict")["blocks"]:
            if block['type'] != 0:  # only type 0 blocks contain text
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    if granularity:
                        identifier = "{0}_{1}_{2}_{3}".format(
                            span['size'], span['flags'], span['font'], span['color'])
                        styles[identifier] = {'size': span['size'], 'flags': span['flags'],
                                              'font': span['font'], 'color': span['color']}
                    else:
                        identifier = "{0}".format(span['size'])
                        styles[identifier] = {'size': span['size'], 'font': span['font']}

                    # count the usage of this style
                    font_counts[identifier] = font_counts.get(identifier, 0) + 1

    if not font_counts:
        raise ValueError("Zero discriminating fonts found!")

    # most used style first; the sort is stable, so ties keep first-seen order
    return sorted(font_counts.items(), key=itemgetter(1), reverse=True), styles

In [3]:
def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.

    The size of the most used style (the first entry of ``font_counts``) is
    treated as paragraph text and tagged ``<p>``. Larger sizes are tagged
    ``<h1>``, ``<h2>``, ... and smaller sizes ``<s1>``, ``<s2>``, ..., both
    numbered from the largest size downwards.

    Sizes are read from ``styles`` instead of being parsed out of the
    identifier string, so identifiers that are not plain numbers (for example
    ``size_flags_font_color``) are supported. Several styles sharing one size
    yield a single tag.

    :param font_counts: (identifier, count) for all fonts occurring in document,
        most used first
    :type font_counts: list
    :param styles: all styles found in the document, keyed by identifier
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes, keyed by size as float
    """
    p_size = styles[font_counts[0][0]]['size']  # size of the most used style (paragraph)

    def size_of(identifier):
        # Prefer the recorded style; fall back to parsing the identifier for
        # callers that only supplied the style of the most used font.
        if identifier in styles:
            return float(styles[identifier]['size'])
        return float(identifier)

    # unique sizes, high to low, so that the right integer is appended to each tag
    font_sizes = sorted({size_of(identifier) for identifier, _ in font_counts}, reverse=True)

    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0  # restart numbering for the sizes below paragraph size
            size_tag[size] = '<p>'
        elif size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        else:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag

In [4]:
def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.

    Spans whose text is only whitespace are ignored. Consecutive spans of the
    same font size are joined with a space into one tagged string; a change of
    font size closes the current string and starts a new one. A pipe ("|") is
    appended after every line of a text block.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: (list, dict)
    :return: texts with prepended element tags, and for every tag the text of
        the spans that started a new tagged string (concatenated directly when
        a new block starts, separated by a newline when the font size changed)
    :raises KeyError: if a span has a size that is missing from ``size_tag``
    """
    block_texts = {}
    header_para = []  # list with headers and paragraphs
    started = False  # becomes True once the first non-blank span was handled
    previous_size = None  # font size of the previous non-blank span

    def remember(tag, text, separator=""):
        # Accumulate the text that opens a tagged string under its tag.
        if tag in block_texts:
            block_texts[tag] += separator + text
        else:
            block_texts[tag] = text

    for page in doc:
        # PyMuPDF renamed getText() to get_text(); prefer the current name and
        # fall back to the legacy alias so both old and new releases work.
        get_text = getattr(page, "get_text", None) or page.getText
        for block in get_text("dict")["blocks"]:
            if block['type'] != 0:  # only type 0 blocks contain text
                continue

            # REMEMBER: multiple fonts and sizes are possible IN one block
            block_string = ""  # text found in block
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span['text']
                    if not text.strip():  # skip whitespace-only spans
                        continue

                    tag = size_tag[span['size']]
                    if not started:
                        started = True
                        block_string = tag + text
                        remember(tag, text)
                    elif span['size'] != previous_size:
                        # font size changed: close the current string, open a new one
                        header_para.append(block_string)
                        block_string = tag + text
                        remember(tag, text, '\n')
                    elif block_string == "":
                        # new block has started, so prepend the size tag
                        block_string = tag + text
                        remember(tag, text)
                    elif all(c == "|" for c in block_string):
                        # Only blank lines so far in this block: start the string
                        # here. This must be exclusive with the concatenation
                        # below, otherwise the text would be added twice.
                        # block_texts is left untouched here, as before.
                        block_string = tag + text
                    else:  # in the same block, so concatenate strings
                        block_string += " " + text

                    previous_size = span['size']

                # end of a line, indicating with a pipe
                block_string += "|"

            header_para.append(block_string)

    return header_para, block_texts

In [5]:
# Path of the PDF to analyse, relative to the notebook's working directory.
document = 'aditya bhartia (copy).pdf'
doc = fitz.open(document)

# Count how often each font size is used (granularity=False: size only).
font_counts, styles = fonts(doc, granularity=False)

# Map every font size to a tag: '<p>' for the most used size, '<hN>' for
# larger sizes and '<sN>' for smaller ones.
size_tag = font_tags(font_counts, styles)

# elements: tagged text strings in reading order;
# block_texts: text accumulated per tag.
elements, block_texts = headers_para(doc, size_tag)

In [6]:
# Inspect the text collected under the '<p>' tag, i.e. the text set in the
# most frequently used font size, to see which parts of the document the
# font-size heuristic groups together.
print (block_texts['<p>'])

PROFESSIONAL EXPERIENCEEquity research analyst
Held joint responsibility with the team leader for analysing  the UK  Support Services sector, which 
Involved in 
Prepared  
Authored five 
Involved with  Irevna Research Services (subsidiary of S&P), ChennaiOffshore  
Built  
Prepared 
Authored First Call notesAPPRENTICEP. K, Narula & Co, New Delhi 
Conducted internal and statutory audits in different sectors.
Finalized accounts, prepared balance sheets, and filed returns of income.EDUCATION
Associate Chartered Accountant 
 Cleared CA Final examinations with 21
 rank on All India Basis.
 Cleared CA Foundation examination with 19
 rank on All India Basis.
Bachelor of Commerce, Shri Ram College of Commerce, Delhi
 Secured 2
 rank in Delhi University in B. Com (Hons.) 1
 year.
 Awarded UFJ Foundations and SRCC Alumni scholarships for academic excellence. 
CBSE(Class XII) Mahavir Senior Model School, Delhi
 Secured 1
 rank in R. S. Asiads 2000 for Accountancy in Class XI.
CBSE (Class X) Maha

In [7]:
# Show every extracted string with its element tag, in reading order.
print (elements)

['<h1>ADITYA BHARTIA|', '<s1>G-3, Tandon Apartment, Charat Singh Colony, Andheri (E), Mumbai| Ph: +91 9820929220, E-mail: adityabhartia@yahoo.com|', '', '<p>PROFESSIONAL EXPERIENCE| Noble Group/Clear Capital, Mumbai| June 2007 – Till date|', '<p>Equity research analyst  at Noble Group, a UK-based investment bank specializing in small and mid-cap | equities. Noble’s clients include some of the UK’s top institutional investors such as Scottish Widows, Fidelity, | Gartmore, Aberforth, and L&G.|', '', '<s1>\uf0a7|', '<p>Held joint responsibility with the team leader for analysing  the UK  Support Services sector, which | encompasses a  wide array of business models  like equipment rental, accident management, social housing | and engineering consultancy. |', '', '<s1>\uf0a7|', '<p>Involved in  all stages of equity research , starting with company meetings and developing sophisticated | earning models and ending with writing research notes and servicing clients through roadshows, meetings, 