Using pdfminer.six to convert the PDF to HTML

In [1]:
pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/5.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/5.6 MB[0m [31m2.6 MB/s[0m eta [36m0:00:03[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/5.6 MB[0m [31m28.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m5.6/5.6 MB[0m [31m54.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pdfminer.six
Successfully installed pdfminer.six-20221105


In [2]:
import pdfminer

In [3]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import HTMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO

In [5]:
def converting_pdf_to_html(pdf_file):
    """Convert every page of a PDF file into one HTML string.

    Parameters
    ----------
    pdf_file : str or path-like
        Path of the PDF to read; the file is opened in binary mode.

    Returns
    -------
    str
        The HTML written by pdfminer's ``HTMLConverter``, decoded with the
        same codec that was handed to the converter.

    Raises
    ------
    OSError
        If ``pdf_file`` cannot be opened.
    """
    codec = 'utf-8'
    # The resource manager is shared by the converter and the page interpreter.
    rsrcmgr = PDFResourceManager()
    # Layout analysis parameters controlling how characters/words are grouped.
    laparams = LAParams(char_margin=10.0, word_margin=5.0)

    # The buffer is a context manager so it is released even when parsing fails.
    with BytesIO() as retstr:
        device = HTMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            with open(pdf_file, 'rb') as pdf:
                pdf_interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(pdf):
                    pdf_interpreter.process_page(page)
        finally:
            # Always close the converter, including on errors; as in the
            # original flow, it is closed before the buffer is read.
            device.close()
        content = retstr.getvalue()

    # Decode with the codec the converter wrote with, not a second literal.
    return content.decode(codec)


In [4]:
# Location of the input PDF; change this path when the file lives somewhere other than /content.
pdf_file = '/content/CKMourya_Resume.pdf'

In [6]:
# Convert the PDF at `pdf_file` into a single HTML string.
html_resume = converting_pdf_to_html(pdf_file)

In [7]:
from IPython.display import HTML
# Render the converted HTML inline as this cell's output.
HTML(html_resume)

In [8]:
# Show the raw HTML markup (the string itself rather than its rendered form).
html_resume

'<html><head>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n</head><body>\n<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"></span>\n<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>\n<div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:111px; top:67px; width:389px; height:39px;"><span style="font-family: Lato-Hairline; font-size:39px">ChintaKrishna</span><span style="font-family: Lato-Light; font-size:39px">Mourya\n<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:142px; top:113px; width:324px; height:10px;"><span style="font-family: Raleway-Medium; font-size:10px">mouryachinta19@gmail.com|+91-7793981667|+91-9346074972\n<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:35px; top:146px; width:151px; height:68px;"><span style="font-family: Lato-Light; font-size

Parsing the HTML using BeautifulSoup

In [9]:
pip install beautifulsoup4



In [10]:
from bs4 import BeautifulSoup
# Parse the HTML string with the 'html.parser' backend so its elements can be searched.
soup = BeautifulSoup(html_resume, 'html.parser')

In [11]:
# Display the parsed document.
soup

<html><head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
</head><body>
<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"></span>
<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>
<div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:111px; top:67px; width:389px; height:39px;"><span style="font-family: Lato-Hairline; font-size:39px">ChintaKrishna</span><span style="font-family: Lato-Light; font-size:39px">Mourya
<br/></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:142px; top:113px; width:324px; height:10px;"><span style="font-family: Raleway-Medium; font-size:10px">mouryachinta19@gmail.com|+91-7793981667|+91-9346074972
<br/></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:35px; top:146px; width:151px; height:68px;"><span style="font-family: Lato-Light; font-size:15px

Finding the font sizes of the required headings

In [12]:
def _font_sizes_px(style):
    """Return every integer pixel size declared by a `font-size` property in an inline style string."""
    sizes = []
    for declaration in (style or '').split(';'):
        name, separator, value = declaration.partition(':')
        # Require the property name to be exactly `font-size`, so that other
        # properties merely containing that text are not misread as a size.
        if not separator or name.strip().lower() != 'font-size':
            continue
        try:
            # Values look like '11px'; drop the unit and keep the number.
            sizes.append(int(value.strip().replace('px', '')))
        except ValueError:
            # Non-integer or non-px sizes are skipped instead of aborting the scan.
            continue
    return sizes


def find_specific_heading_font_sizes(html_resume, headings):
    """Find the largest font size at which each heading keyword appears.

    Every <span> of the HTML is inspected. A heading matches a span when the
    heading occurs as a substring of the span's text, compared
    case-insensitively on both sides. For each heading the largest
    `font-size` among its matching spans is kept.

    Parameters
    ----------
    html_resume : str
        HTML markup to scan.
    headings : iterable of str
        Heading keywords to look for.

    Returns
    -------
    dict
        Maps each heading that was found (spelled as supplied) to its largest
        font size formatted like '15px'. Headings that were not found, or
        whose spans declare no usable font size, are omitted.
    """
    soup = BeautifulSoup(html_resume, 'html.parser')

    # Largest size seen so far per heading; 0 means "not found yet".
    max_font_sizes = {heading: 0 for heading in headings}

    for each_span in soup.find_all('span'):
        span_text = each_span.get_text().strip().lower()
        matched = [heading for heading in max_font_sizes if heading.lower() in span_text]
        if not matched:
            continue

        sizes = _font_sizes_px(each_span.get('style'))
        if not sizes:
            continue

        largest = max(sizes)
        for heading in matched:
            if largest > max_font_sizes[heading]:
                max_font_sizes[heading] = largest

    # Only headings that were actually found in the resume are reported.
    return {
        heading: f"{font_size}px"
        for heading, font_size in max_font_sizes.items()
        if font_size > 0
    }


In [13]:
# Lowercase keywords to search for; each is matched as a substring of a span's
# text, so 'skill' also covers 'skills', 'project' covers 'projects', and so on.
headings = ['education', 'experience', 'skill','award','project','achievement','internship','activities','link','certification','language','positions','academics']

In [14]:
# Look up the font size of each heading keyword that occurs in the resume HTML;
# the bare `headings_font` on the last line displays the resulting dict.
headings_font  = find_specific_heading_font_sizes(html_resume, headings)
headings_font

{'education': '15px',
 'experience': '15px',
 'skill': '15px',
 'project': '15px',
 'activities': '15px',
 'link': '15px',
 'certification': '15px',
 'language': '15px'}

Extracting the text that follows the education heading

In [15]:
# Font size recorded for the 'education' heading.
# NOTE(review): this lookup raises KeyError if 'education' is not a key of headings_font.
education_font = headings_font['education']
education_font

'15px'

In [16]:
# Every detected heading except 'education', with its font size, in the same order
# as headings_font.
dict_instead_of_education = dict(headings_font)
dict_instead_of_education.pop('education', None)
dict_instead_of_education

{'experience': '15px',
 'skill': '15px',
 'project': '15px',
 'activities': '15px',
 'link': '15px',
 'certification': '15px',
 'language': '15px'}

In [17]:
def _declared_font_size(span):
    """Return the raw font-size value (e.g. '15px') from a span's inline style, or None if absent."""
    declared = None
    # A span without a style attribute yields None from .get(); treat it as an empty style.
    for part in (span.get('style') or '').split(';'):
        if 'font-size' in part:
            pieces = part.split(':')
            if len(pieces) > 1:
                declared = pieces[1].strip()
    return declared


required_text = None
output = ''  # Accumulates the markup of the education heading and the spans after it.

for each_span in soup.find_all('span'):
    text = each_span.get_text().strip().lower()  # Lowercased for case-insensitive comparison
    font_size = _declared_font_size(each_span)

    # Only a span that has the education heading's font size AND mentions
    # 'education' starts a collection run.
    if font_size != education_font or 'education' not in text:
        continue

    # State describing the most recently collected follow-on span.
    new_font_size = None
    text_next = ""

    output += str(each_span)
    next_text = each_span.find_next('span')
    if next_text is None:
        # The heading is the last span in the document: nothing follows it.
        continue
    output += str(next_text)

    # The dict is iterated once, so at most len(dict_instead_of_education)
    # further spans are collected. For each (key, value) the walk advances one
    # span unless the latest collected span has font size `value` or contains
    # `key` in its text.
    for key, value in dict_instead_of_education.items():
        if value != new_font_size and key not in text_next:
            required_text = next_text.find_next('span')
            if required_text is None:
                # Ran out of spans; later keys could not advance either.
                break

            text_next = required_text.get_text().strip().lower()
            declared = _declared_font_size(required_text)
            if declared is not None:
                # Keep the previous size when this span declares none.
                new_font_size = declared
            next_text = required_text
            output += str(required_text)


html_output = HTML(output)
display(html_output)

In [None]:
# Show the raw markup collected in `output` (the string rather than its rendered form).
output

'<span style="font-family: SofiaProBold; font-size:36px">EDUCATION\n<br/></span><span style="font-family: Graphik-Bold; font-size:45px">National University of Singapore</span><span style="font-family: Graphik-Regular; font-size:45px"> — B. Computing\n<br/></span><span style="font-family: Graphik-Regular; font-size:30px">2012 - 2016\n<br/></span><span style="font-family: Graphik-Regular; font-size:38px">Bachelor of Computing (honours with Distinction) in Communications and \n<br/></span>'