# Below is the implementation of a resume reader, developed from scratch without relying on any pre-trained models. This approach focuses on custom extraction techniques to process and analyze resumes effectively.

In [121]:
import pdfplumber
import pytesseract
from PIL import Image
import pdf2image
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os

# One-time setup: the NLTK resources used in this notebook must be downloaded
# before first use ('stopwords' for the list loaded below; 'punkt' is presumably
# what word_tokenize needs). Uncomment and run once per environment:
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# English stopword set; preprocess_text() drops every token found in it.
stop_words = set(stopwords.words('english'))

In [123]:
# Function to preprocess text
def preprocess_text(text):
    """Turn raw text into a list of lowercase tokens with stopwords removed.

    Steps, in order:
      1. lowercase the input;
      2. replace every run of non-word characters (punctuation, whitespace,
         symbols) with a single space;
      3. tokenize the result with ``word_tokenize``;
      4. discard tokens that appear in the module-level ``stop_words`` set.

    Args:
        text: The string to normalise.

    Returns:
        list: The surviving tokens, in their original order.
    """
    normalized = re.sub(r'\W+', ' ', text.lower())
    return [token for token in word_tokenize(normalized) if token not in stop_words]

# # Function to check if a line is likely to be a heading (based on structure)
# def is_heading(line, next_line):
#     # Consider the line a heading if it's shorter (less than 5 words)
#     if len(line.split()) <= 5:
#         # Additionally, headings often have no punctuation and are followed by content
#         if len(next_line.split()) > 5:
#             return True
#     return False

# Function to extract text from text-based PDF using pdfplumber
def extract_text_from_pdf(pdf_path):
    """Read the text layer of every page of a PDF using pdfplumber.

    Pages for which ``extract_text()`` returns nothing (``None`` or an empty
    string) are skipped. Each remaining page's text is followed by a newline.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        str: The concatenated page texts, or ``""`` if no page yielded text.
    """
    with pdfplumber.open(pdf_path) as document:
        page_texts = [page.extract_text() for page in document.pages]
    # Keep only pages that produced text; terminate each with a newline.
    return "".join(page_text + "\n" for page_text in page_texts if page_text)

Note that the heading patterns below are not fully general: they are tailored to the resume used in this notebook. That said, these are the headings most people use when structuring a resume, so they give a broad picture of the kinds of patterns one can expect to find.

Some domain knowledge of how resumes are typically structured is recommended before using this code, since it makes the output much easier to interpret.

In [131]:
# Function to extract headings and descriptions
import re

def extract_headings_and_descriptions(text):
    """Split resume text into sections keyed by the headings found in it.

    A heading is one of a fixed set of keywords (matched case-insensitively)
    that is followed only by optional whitespace and then a newline or the end
    of the text. Everything between one heading and the next becomes that
    heading's content, which is passed through ``preprocess_text``.

    Known limitation: the keyword only has to *end* a line, not occupy it. A
    line such as "TECHNICAL EXPERIENCE" is therefore keyed as "EXPERIENCE",
    and the leading "TECHNICAL" stays at the end of the previous section.

    Args:
        text: Full resume text. ``None`` or an empty string yields ``{}``.

    Returns:
        dict: Maps each heading, exactly as it appears in the text, to the
        list of tokens returned by ``preprocess_text`` for its content.
        Headings with no content after them are omitted. If the same heading
        text occurs more than once, the last occurrence wins.
    """
    if not text:
        return {}

    # Heading keywords (no colon is required after a heading).
    heading_patterns = [
        r"\beducation\b",
        r"\bexperience\b",
        r"\bprojects\b",
        r"\btechnical\s*skills\b",
        r"\bpositions\s*of\s*responsibility\b",
        r"\bcertifications\b",
        r"\bhobbies\b"
    ]

    # Exactly ONE capturing group: re.split() inserts every capturing group
    # into its result, so the inner alternation must be non-capturing or each
    # heading would appear twice and be mistaken for its own content.
    heading_regex = re.compile(
        r"(?P<heading>(?:" + "|".join(heading_patterns) + r"))\s*(?:$|\n)",
        re.IGNORECASE,
    )

    # With one capturing group the split result is
    #   [preamble, heading_1, content_1, heading_2, content_2, ...]
    sections = heading_regex.split(text)

    result = {}
    # Pair headings with content by position rather than re-matching each
    # piece, so preamble/content that merely looks like a heading is never
    # promoted to one.
    for heading, content in zip(sections[1::2], sections[2::2]):
        if not content:
            continue  # heading with nothing after it
        result[heading.strip()] = preprocess_text(content.strip())
    return result

# # Function to dynamically extract headings and descriptions based on text structure
# def extract_dynamic_headings(text_lines):
#     result = {}
#     current_heading = None
#     current_content = []

#     for i in range(len(text_lines) - 1):
#         line = text_lines[i].strip()
#         next_line = text_lines[i + 1].strip()  # Look ahead to the next line

#         # Check if the current line is a heading
#         if is_heading(line, next_line):
#             # If we have accumulated content, store it under the previous heading
#             if current_heading:
#                 result[current_heading] = ' '.join(current_content).strip()
#                 current_content = []  # Reset content buffer
            
#             current_heading = line  # Set new heading
#         else:
#             if current_heading:  # Only accumulate content after a heading
#                 current_content.append(line)

#     # Add the last section
#     if current_heading:
#         result[current_heading] = ' '.join(current_content).strip()

#     return result



In [133]:
# Main function to handle both text-based and image-based PDFs
def process_pdf(pdf_path):
    """Extract text from a PDF resume and print its sections.

    The text is read with ``extract_text_from_pdf`` and split into sections
    with ``extract_headings_and_descriptions``. No OCR fallback is performed
    here, so a PDF that yields no text simply reports that nothing was
    extracted.

    All results are printed; the function returns ``None``. It returns early
    (after printing a message) when the file does not exist or when text
    extraction raises.

    Args:
        pdf_path: Path to the PDF file to process.
    """
    # Check that the file exists before attempting to open it.
    if not os.path.exists(pdf_path):
        print(f"File {pdf_path} not found!")
        return
    print(f"Processing file: {pdf_path}")

    # Try extracting text with pdfplumber (for text-based PDFs).
    try:
        extracted_text = extract_text_from_pdf(pdf_path)
    except Exception as e:
        # Best-effort: report and stop. Without this return, the code below
        # would fail with UnboundLocalError on `extracted_text`.
        print(f"Error extracting text: {e}")
        return
    print(f"Extracted Text: {extracted_text}")

    # Split the text into heading -> token-list sections.
    heading_to_content = extract_headings_and_descriptions(extracted_text)

    # Display the extracted headings and content.
    if heading_to_content:
        for heading, content in heading_to_content.items():
            print(f"Heading: {heading}")
            print(f"Content: {', '.join(content)}")
            print("-" * 40)
    else:
        print("No headings and content extracted.")




In [135]:
# Example usage: run the full pipeline on a single resume PDF.
# NOTE(review): the path below is an absolute, machine-specific location;
# replace it with the path to your own PDF before running this cell.
if __name__ == "__main__":
    pdf_path = "C:/Users/Agam/OneDrive/Desktop/Briefcase/MY PROFILE/Agambir_Singh_Duggal_Resume_nan.pdf"  # path to your PDF
    process_pdf(pdf_path)

Processing file: C:/Users/Agam/OneDrive/Desktop/Briefcase/MY PROFILE/Agambir_Singh_Duggal_Resume_nan.pdf
Extracted Text: AGAMBIR SINGH DUGGAL
Mobile: +91 78887 25690 | Mail: aduggal_be22@thapar.edu |
LinkedIn: www.linkedin.com/in/agambir-singh-duggal-073ba02aa
EDUCATION
Bachelor of Engineering in Computer Engineering
Sep 2022 - Present
. Thapar Institute of Engineering and Technology, Patiala
Secured 8.99 absolute CGPA till 4th semester.
Senior Secondary Education 2020 - 2022
Sanawar Institute For Children, Bathinda
Secured 95% marks in 12th standard.
Secondary Education Pass out 2020
Silver Oaks School, Bathinda
Secured 94.4% marks in 10th standard.
TECHNICAL EXPERIENCE
Competitive Programming (2023- Present)
Regular participant in coding platforms like Leetcode and GeeksforGeeks where I have solved
100+ problems. Consistently solved algorithmic challenges based on sorting, binary search,
greedy algorithms, dynamic programming algorithms and graph with a focus on efficiency and
optimi