In [9]:
import pdfplumber
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os

# NLTK's English stopword list, kept as a set so membership checks stay cheap
stop_words = {*stopwords.words('english')}

# Function to preprocess text (remove punctuation, lowercase, tokenize, and remove stopwords)
def preprocess_text(text):
    """Turn raw text into a list of lowercase tokens with stopwords removed.

    The text is lowercased, every run of characters matched by ``\\W`` is
    collapsed to one space, the result is tokenized with ``word_tokenize``,
    and tokens found in the module-level ``stop_words`` set are dropped.

    Args:
        text: The string to clean.

    Returns:
        The remaining tokens as a list, in their original order.
    """
    normalized = re.sub(r'\W+', ' ', text.lower())
    return [token for token in word_tokenize(normalized) if token not in stop_words]

# Function to extract text from text-based PDF using pdfplumber
def extract_text_from_pdf(pdf_path):
    """Read a text-based PDF and return all of its text as a single string.

    Every page is passed through pdfplumber's ``extract_text``. Pages that
    yield nothing (``None`` or an empty string) are skipped, and each kept
    page's text is followed by a newline.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The concatenated page texts; an empty string if no page had text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    return "".join(page_text + "\n" for page_text in page_texts if page_text)

# Function to extract headings and descriptions from resume text
def extract_headings_and_descriptions(text):
    """Split resume text at known section headings and tokenize each section.

    A heading is one of the keywords below (case-insensitive) that is the last
    thing on its line, i.e. followed only by optional whitespace and a newline
    or the end of the text. The text between one heading and the next is
    passed through ``preprocess_text``.

    Args:
        text: The full resume text as a single string.

    Returns:
        A dict mapping each heading, exactly as it appears in ``text``, to the
        list of tokens of its section. Headings whose section is empty are
        omitted. If the same heading text occurs more than once, the tokens of
        all its sections are concatenated in document order.
    """
    heading_patterns = [
        r"\beducation\b",
        r"\bexperience\b",
        r"\bprojects\b",
        # No dedicated "technical skills" pattern: the unanchored `skills`
        # pattern already matches the tail of such a heading line.
        r"\bskills\b",
        r"\bpositions\s*of\s*responsibility\b",
        r"\bcertifications\b",
        r"\bmiscellaneous\b"
    ]

    # Exactly ONE capturing group. re.split() inserts every capturing group
    # into its result, so an extra inner group would emit each heading twice
    # and make a heading look like its own section content.
    heading_regex = re.compile(
        r"(?P<heading>(?:" + "|".join(heading_patterns) + r"))\s*(?:$|\n)",
        re.IGNORECASE,
    )

    # With one group the split result is [preamble, h1, body1, h2, body2, ...]:
    # odd indices are headings and the following even index is that section.
    parts = heading_regex.split(text)

    result = {}
    for heading, content in zip(parts[1::2], parts[2::2]):
        content = content.strip()
        if not content:
            # A heading directly followed by another heading (or by the end of
            # the text) has nothing to report.
            continue
        # Accumulate rather than assign so a repeated heading does not
        # silently discard the earlier section.
        result.setdefault(heading.strip(), []).extend(preprocess_text(content))
    return result

# Main function to handle text-based PDFs
def process_pdf(pdf_path):
    """Run the PDF pipeline on one file and print what was found.

    Prints a not-found message and stops if ``pdf_path`` does not exist.
    Otherwise extracts the text with ``extract_text_from_pdf``, prints its
    first 500 characters, splits it with ``extract_headings_and_descriptions``
    and prints every heading with its content. Any exception raised while
    extracting is reported on stdout and ends processing.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        None. All results are written to stdout.
    """
    # Guard: nothing to do for a missing file
    if not os.path.exists(pdf_path):
        print(f"File {pdf_path} not found!")
        return
    print(f"Processing file: {pdf_path}")

    # Text extraction (text-based PDFs only)
    try:
        extracted_text = extract_text_from_pdf(pdf_path)
        print(f"Extracted Text: {extracted_text[:500]}...")  # preview only
    except Exception as e:
        print(f"Error extracting text: {e}")
        return

    # Section detection
    sections = extract_headings_and_descriptions(extracted_text)
    if not sections:
        print("No headings and content extracted.")
        return

    # Report every section that was found
    for heading, content in sections.items():
        print(f"Heading: {heading}")
        print(f"Content: {content}")
        print("-" * 40)

# Example usage: run the PDF pipeline on one sample resume. The guard only
# passes when this code's __name__ is "__main__".
if __name__ == "__main__":
    # Relative path, so it is resolved against the current working directory.
    pdf_path = "202308111555929_dice_resume_cv_madhura_bapat.pdf"  # Replace with the path to your PDF
    process_pdf(pdf_path)


Processing file: 202308111555929_dice_resume_cv_madhura_bapat.pdf
Extracted Text: Madhura A. Bapat
Columbus, Ohio Cell: +1 (404) 518 6607
LinkedIn URL E-mail: madhurabapat1108@gmail.com
An IT professional with more than 18 years of experience in managing, driving the delivery and solutions across various large client accounts
through program/project management, technical and solution Architecting. Experienced across Finance, Public Services, Healthcare, Insurance,
Manufacturing, HR domains. Working experience on integrations, transformations, custom developments, upgrade/migr...
Heading: CERTIFICATIONS
Content: ['integration', 'technologies', 'oracle', 'cloud', 'integrations', 'oic', 'databases', 'oracle', 'autonomous', 'database', 'cloud', 'oracle', '19c', 'oci', 'soa', 'cs', 'pcs', 'mft', 'cs', 'oracle', 'cloud', 'erp', 'hcm', 'global', 'hr', 'cs', 'technologies', 'tools', 'aws', 'azure', 'cloud', 'jcs', 'java', 'j2ee', 'premises', 'oracle', 'fusion', 'mw', 'suite', '12c', 'soa', 'os

In [13]:
import pdfplumber
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os

# English stopwords from NLTK, materialized once as a set for quick lookups
stop_words = {*stopwords.words('english')}

# Function to preprocess text (remove punctuation, lowercase, tokenize, and remove stopwords)
def preprocess_text(text):
    """Clean a piece of text and return its meaningful tokens.

    Steps, in order: lowercase the text, replace each run of ``\\W``
    characters with one space, tokenize with ``word_tokenize``, then discard
    tokens contained in the module-level ``stop_words`` set.

    Args:
        text: The string to clean.

    Returns:
        A list of the surviving tokens in their original order.
    """
    lowered = text.lower()
    spaced = re.sub(r'\W+', ' ', lowered)

    kept = []
    for token in word_tokenize(spaced):
        if token in stop_words:
            continue  # stopword: not worth keeping
        kept.append(token)
    return kept

# Function to check if a line is likely to be a heading (based on structure)
def is_heading(line, next_line, max_heading_words=5, min_content_words=6):
    """Heuristically decide whether ``line`` is a section heading.

    A line counts as a heading when it is short but not blank (between 1 and
    ``max_heading_words`` whitespace-separated words) and the line after it
    looks like body text (at least ``min_content_words`` words).

    Args:
        line: The candidate heading line.
        next_line: The line that follows ``line`` in the document.
        max_heading_words: Largest word count a heading may have.
        min_content_words: Smallest word count ``next_line`` must have.

    Returns:
        True if ``line`` looks like a heading, otherwise False.
    """
    heading_words = len(line.split())
    # A blank line has zero words. Without the lower bound it would pass the
    # "short line" test and be reported as a heading with an empty title.
    if not 1 <= heading_words <= max_heading_words:
        return False
    return len(next_line.split()) >= min_content_words

# Function to extract text from text-based PDF using pdfplumber
def extract_text_from_pdf(pdf_path):
    """Read a text-based PDF and return its text as a list of lines.

    Each page's text comes from pdfplumber's ``extract_text`` and is split on
    ``"\\n"``. Pages that yield nothing (``None`` or an empty string) are
    skipped.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A flat list with the lines of all pages in page order.
    """
    lines = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_text in (page.extract_text() for page in pdf.pages):
            if not page_text:
                continue  # page without extractable text
            lines += page_text.split("\n")
    return lines

# Function to dynamically extract headings and descriptions based on text structure
def extract_dynamic_headings(text_lines):
    """Group document lines into sections using the ``is_heading`` heuristic.

    Lines are stripped and blank lines are ignored. Each remaining line is
    tested with ``is_heading(line, next_line)``. A heading opens a new
    section, and every following non-heading line is added to that section.
    Lines that appear before the first heading are discarded.

    Args:
        text_lines: The document as a list of text lines.

    Returns:
        A dict mapping each heading line to its section text (the section's
        lines joined with single spaces). If a heading line repeats, the later
        section replaces the earlier one.
    """
    # Blank lines carry no content and would only confuse the look-ahead.
    lines = [stripped for stripped in (raw.strip() for raw in text_lines) if stripped]

    result = {}
    current_heading = None
    current_content = []

    # Visit EVERY line. Stopping one short, so that a look-ahead line always
    # exists, would silently drop the document's final line from its section.
    for index, line in enumerate(lines):
        # The final line has no successor. is_heading needs a long following
        # line, so an empty look-ahead keeps the final line as content.
        next_line = lines[index + 1] if index + 1 < len(lines) else ""

        if is_heading(line, next_line):
            # Close the section that was open, then start the new one
            if current_heading is not None:
                result[current_heading] = ' '.join(current_content).strip()
                current_content = []
            current_heading = line
        elif current_heading is not None:
            current_content.append(line)

    # Flush the section that was still open at the end of the document
    if current_heading is not None:
        result[current_heading] = ' '.join(current_content).strip()

    return result

# Main function to handle text-based PDFs
def process_pdf(pdf_path):
    """Run the line-based PDF pipeline on one file and print what was found.

    Prints a not-found message and stops if ``pdf_path`` does not exist.
    Otherwise extracts the lines with ``extract_text_from_pdf``, prints the
    first ten, groups them with ``extract_dynamic_headings`` and prints every
    heading with its content. Any exception raised while extracting is
    reported on stdout and ends processing.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        None. All results are written to stdout.
    """
    # Guard: nothing to do for a missing file
    if not os.path.exists(pdf_path):
        print(f"File {pdf_path} not found!")
        return
    print(f"Processing file: {pdf_path}")

    # Line extraction (text-based PDFs only)
    try:
        extracted_lines = extract_text_from_pdf(pdf_path)
        print(f"Extracted Text Lines: {extracted_lines[:10]}...")  # preview only
    except Exception as e:
        print(f"Error extracting text: {e}")
        return

    # Structure-based section detection
    sections = extract_dynamic_headings(extracted_lines)
    if not sections:
        print("No headings and content extracted.")
        return

    # Report every section that was found
    for heading, content in sections.items():
        print(f"Heading: {heading}")
        print(f"Content: {content}")
        print("-" * 40)

# Example usage: run the PDF pipeline on one file. The guard only passes when
# this code's __name__ is "__main__".
if __name__ == "__main__":
    # NOTE(review): this path ends in ".docx", yet process_pdf reads files only
    # through pdfplumber. The recorded run of this cell reports
    # "No /Root object! - Is this really a PDF?". Point this at a PDF file.
    pdf_path = "202308031905911_jessicatucker.docx"  # Replace with the path to your PDF
    process_pdf(pdf_path)


Processing file: 202308031905911_jessicatucker.docx
Error extracting text: No /Root object! - Is this really a PDF?


In [1]:
import pdfplumber
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os
import docx  # For handling Word documents

# Set of English stopwords (from NLTK) used to filter tokens
stop_words = {*stopwords.words('english')}

# Function to preprocess text (remove punctuation, lowercase, tokenize, and remove stopwords)
def preprocess_text(text):
    """Reduce text to its lowercase, non-stopword tokens.

    The input is lowercased, runs of ``\\W`` characters become a single
    space, ``word_tokenize`` splits the result into tokens, and any token
    present in the module-level ``stop_words`` set is filtered out.

    Args:
        text: The string to clean.

    Returns:
        A list of the tokens that remain, in their original order.
    """
    cleaned = re.sub(r'\W+', ' ', text.lower())
    tokens = word_tokenize(cleaned)
    return list(filter(lambda word: word not in stop_words, tokens))

# Function to check if a line is likely to be a heading (based on structure)
def is_heading(line, next_line, max_heading_words=5, min_content_words=6):
    """Heuristically decide whether ``line`` is a section heading.

    A line counts as a heading when it is short but not blank (between 1 and
    ``max_heading_words`` whitespace-separated words) and the line after it
    looks like body text (at least ``min_content_words`` words).

    Args:
        line: The candidate heading line.
        next_line: The line that follows ``line`` in the document.
        max_heading_words: Largest word count a heading may have.
        min_content_words: Smallest word count ``next_line`` must have.

    Returns:
        True if ``line`` looks like a heading, otherwise False.
    """
    heading_words = len(line.split())
    # A blank line has zero words. Without the lower bound it would pass the
    # "short line" test and be reported as a heading with an empty title.
    if not 1 <= heading_words <= max_heading_words:
        return False
    return len(next_line.split()) >= min_content_words

# Function to extract text from text-based PDF using pdfplumber
def extract_text_from_pdf(pdf_path):
    """Return the text of a text-based PDF as a flat list of lines.

    pdfplumber's ``extract_text`` is called on every page. Pages that yield
    nothing (``None`` or an empty string) are skipped, and the text of the
    others is split on ``"\\n"``.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The lines of all pages, in page order.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    return [
        line
        for page_text in page_texts
        if page_text
        for line in page_text.split("\n")
    ]

# Function to extract text from Word document using python-docx
def extract_text_from_docx(docx_path):
    """Return the non-empty paragraphs of a Word document as a list of strings.

    The file is opened with ``docx.Document``. The text of each entry in its
    ``paragraphs`` is stripped of surrounding whitespace, and paragraphs that
    are empty after stripping are left out.

    Args:
        docx_path: Path of the ``.docx`` file to open.

    Returns:
        The stripped paragraph texts, in document order.
    """
    document = docx.Document(docx_path)
    stripped_paragraphs = (paragraph.text.strip() for paragraph in document.paragraphs)
    return [paragraph_text for paragraph_text in stripped_paragraphs if paragraph_text]

# Function to dynamically extract headings and descriptions based on text structure
def extract_dynamic_headings(text_lines):
    """Group document lines into sections using the ``is_heading`` heuristic.

    Lines are stripped and blank lines are ignored. Each remaining line is
    tested with ``is_heading(line, next_line)``. A heading opens a new
    section, and every following non-heading line is added to that section.
    Lines that appear before the first heading are discarded.

    Args:
        text_lines: The document as a list of text lines.

    Returns:
        A dict mapping each heading line to its section text (the section's
        lines joined with single spaces). If a heading line repeats, the later
        section replaces the earlier one.
    """
    # Blank lines carry no content and would only confuse the look-ahead.
    lines = [stripped for stripped in (raw.strip() for raw in text_lines) if stripped]

    result = {}
    current_heading = None
    current_content = []

    # Visit EVERY line. Stopping one short, so that a look-ahead line always
    # exists, would silently drop the document's final line from its section.
    for index, line in enumerate(lines):
        # The final line has no successor. is_heading needs a long following
        # line, so an empty look-ahead keeps the final line as content.
        next_line = lines[index + 1] if index + 1 < len(lines) else ""

        if is_heading(line, next_line):
            # Close the section that was open, then start the new one
            if current_heading is not None:
                result[current_heading] = ' '.join(current_content).strip()
                current_content = []
            current_heading = line
        elif current_heading is not None:
            current_content.append(line)

    # Flush the section that was still open at the end of the document
    if current_heading is not None:
        result[current_heading] = ' '.join(current_content).strip()

    return result

# Main function to handle both text-based PDFs and Word documents
def process_document(file_path):
    """Run the line-based pipeline on a PDF or Word file and print the result.

    Prints a not-found message and stops if ``file_path`` does not exist. The
    lowercased file extension selects the extractor: ``.pdf`` uses
    ``extract_text_from_pdf`` and ``.docx`` uses ``extract_text_from_docx``.
    Any other extension is reported as unsupported. The first ten extracted
    lines are printed, the lines are grouped with ``extract_dynamic_headings``
    and every heading is printed with its content. Any exception raised while
    extracting is reported on stdout and ends processing.

    Args:
        file_path: Path of the ``.pdf`` or ``.docx`` file to process.

    Returns:
        None. All results are written to stdout.
    """
    # Guard: nothing to do for a missing file
    if not os.path.exists(file_path):
        print(f"File {file_path} not found!")
        return
    print(f"Processing file: {file_path}")

    # The extension (case-insensitive) decides which extractor applies
    _, raw_extension = os.path.splitext(file_path)
    file_extension = raw_extension.lower()
    if file_extension not in ('.pdf', '.docx'):
        print(f"Unsupported file format: {file_extension}")
        return
    is_pdf = file_extension == '.pdf'

    # One extraction attempt. The error message names the format that failed.
    try:
        if is_pdf:
            extracted_lines = extract_text_from_pdf(file_path)
        else:
            extracted_lines = extract_text_from_docx(file_path)
        print(f"Extracted Text Lines: {extracted_lines[:10]}...")  # preview only
    except Exception as e:
        if is_pdf:
            print(f"Error extracting text from PDF: {e}")
        else:
            print(f"Error extracting text from Word document: {e}")
        return

    # Structure-based section detection
    sections = extract_dynamic_headings(extracted_lines)
    if not sections:
        print("No headings and content extracted.")
        return

    # Report every section that was found
    for heading, content in sections.items():
        print(f"Heading: {heading}")
        print(f"Content: {content}")
        print("-" * 40)

# Example usage: run the PDF/DOCX pipeline on one sample resume. The guard
# only passes when this code's __name__ is "__main__".
if __name__ == "__main__":
    # Relative path, so it is resolved against the current working directory.
    file_path = "202308031905911_jessicatucker.docx"  # Replace with the path to your file (PDF or DOCX)
    process_document(file_path)


Processing file: 202308031905911_jessicatucker.docx
Extracted Text Lines: ['JESSICA TUCKER', 'EL Dorado Hills, CA 95762', '(916) 542-5545', 'Jessmommie1101@gmail.com', 'OBJECTIVE', 'To obtain a rewarding and challenging position as a Medical Biller/Coder or Office Assistant where a highly motivated and enthusiastic individual as myself can become a valuable member of a health care team.', 'EDUCATION', 'Carrington College\t\t\t\t\t\t\tPresent', 'AA-Associates in Health Studies', 'Carrington College\t\t\t\t\t\t\tNovember 2021']...
Heading: OBJECTIVE
Content: To obtain a rewarding and challenging position as a Medical Biller/Coder or Office Assistant where a highly motivated and enthusiastic individual as myself can become a valuable member of a health care team. EDUCATION Carrington College							Present AA-Associates in Health Studies Carrington College							November 2021 Medical Billing and Coding Certificate DeAnza College							December 2010 Web Design SKILLS/TRAINING/ABILITIES
----