In [1]:
pip install pymupdf pandas

Collecting pymupdf
Note: you may need to restart the kernel to use updated packages.

  Downloading PyMuPDF-1.24.11-cp38-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.11-cp38-abi3-win_amd64.whl (16.0 MB)
   ---------------------------------------- 0.0/16.0 MB ? eta -:--:--
    --------------------------------------- 0.3/16.0 MB ? eta -:--:--
   --- ------------------------------------ 1.6/16.0 MB 6.0 MB/s eta 0:00:03
   ------------- -------------------------- 5.2/16.0 MB 12.2 MB/s eta 0:00:01
   ------------- -------------------------- 5.2/16.0 MB 12.2 MB/s eta 0:00:01
   ------------------- -------------------- 7.9/16.0 MB 8.5 MB/s eta 0:00:01
   ------------------------- -------------- 10.2/16.0 MB 9.1 MB/s eta 0:00:01
   ------------------------------- -------- 12.6/16.0 MB 9.6 MB/s eta 0:00:01
   ---------------------------------- ----- 13.9/16.0 MB 8.9 MB/s eta 0:00:01
   ---------------------------------------  15.7/16.0 MB 9.3 MB/s eta 0:00:01
   ----------------

In [3]:
import fitz  # PyMuPDF for reading PDFs
import re
import pandas as pd
import os

# Function to extract text from PDF file
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, joined in page order.

    The document is opened with ``fitz.open`` inside a ``with`` block, so it is
    closed again before the function returns.
    """
    with fitz.open(pdf_path) as pdf_document:
        page_texts = [pdf_page.get_text() for pdf_page in pdf_document]
    return "".join(page_texts)

# Function to extract the candidate's name
def extract_name(text):
    """Guess the candidate's name from the top of the resume text.

    Looks at the first five non-blank lines and returns the first one that
    starts with two capitalised words separated by whitespace (for example
    ``"John Smith"``), with surrounding whitespace removed.

    Args:
        text (str): Full text extracted from a resume.

    Returns:
        str: The matching line, or ``"Not Found"`` when none of the inspected
        lines looks like a name.
    """
    # Extracted PDF text frequently begins with blank lines or indented
    # headers. Strip and drop empties first, otherwise the anchored match
    # below never sees the name.
    stripped_lines = (line.strip() for line in text.split('\n'))
    candidate_lines = [line for line in stripped_lines if line][:5]
    for line in candidate_lines:
        # Adjust the regex as needed to match typical name patterns
        if re.match(r'^[A-Z][a-z]*\s[A-Z][a-z]*', line):
            return line
    return "Not Found"

# Function to extract the email address
def extract_email(text):
    """Return the first e-mail-like substring found in ``text``.

    Returns the string ``"Not Found"`` when the pattern does not occur.
    """
    found = re.search(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text)
    if found is None:
        return "Not Found"
    return found.group(0)

# Function to extract the phone number
def extract_phone_number(text):
    """Return the first phone-number-like substring of ``text``.

    Recognises ten-digit numbers grouped 3-3-4 with optional ``-``, ``.`` or
    space separators, an optional parenthesised area code, and an optional
    country code of one to three digits with an optional leading ``+``.

    Args:
        text (str): Text to search.

    Returns:
        str: The matched number exactly as written in ``text`` (including a
        leading ``+`` or ``(`` when present), or ``"Not Found"``.
    """
    # A leading \b can never sit in front of '+' or '(' (both are non-word
    # characters), which made the match start at the first digit and drop
    # them. The look-behind expresses "not glued to a preceding word" while
    # still allowing the match to begin on '+' or '('. Separators are only
    # accepted after something has matched, so no leading space is returned.
    phone_pattern = (
        r'(?<!\w)'
        r'(?:\+?\d{1,3}[-. (]*|\()?'  # country code (+ separators) or a bare '('
        r'\d{3}[-. )]*'               # area code
        r'\d{3}[-. ]*'                # exchange
        r'\d{4}\b'                    # line number
    )
    match = re.search(phone_pattern, text)
    return match.group(0) if match else "Not Found"

# Function to extract address (if address keywords like 'Address', 'Street' are present)
def extract_address(text):
    """Return the first line of ``text`` that mentions an address keyword.

    A line qualifies when its lower-cased form contains any of the keywords
    as a substring. The line is returned with surrounding whitespace removed;
    ``"Not Found"`` is returned when no line qualifies.
    """
    markers = ('address', 'street', 'city', 'state', 'zip', 'postal')
    for raw_line in text.split('\n'):
        lowered = raw_line.lower()
        for marker in markers:
            if marker in lowered:
                return raw_line.strip()
    return "Not Found"

# Function to process all PDFs in a folder and save the extracted info in a CSV
def process_resumes(folder_path, output_csv_path):
    """Extract contact details from every PDF in a folder and write them to a CSV.

    Each PDF directly inside ``folder_path`` (sub-folders are not visited) is
    read with ``extract_text_from_pdf`` and passed through the name, e-mail,
    phone and address extractors; one CSV row is written per file.

    Args:
        folder_path (str): Directory containing the PDF resumes.
        output_csv_path (str): Path of the CSV file to write.

    Returns:
        None. A confirmation message is printed once the CSV has been written.
    """
    columns = ['File Name', 'Name', 'Email', 'Phone', 'Address']
    data = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the CSV
    # row order reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        # Compare case-insensitively so files such as 'CV.PDF' are not skipped.
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(folder_path, filename)
        text = extract_text_from_pdf(pdf_path)

        # Add extracted info to data list
        data.append({
            'File Name': filename,
            'Name': extract_name(text),
            'Email': extract_email(text),
            'Phone': extract_phone_number(text),
            'Address': extract_address(text),
        })

    # Passing the column list keeps the header row even when no PDF was found.
    df = pd.DataFrame(data, columns=columns)
    df.to_csv(output_csv_path, index=False)
    print(f'Data saved to {output_csv_path}')

# Example usage:
# The first argument is the folder that contains the PDF resumes (a Windows
# path written with escaped backslashes); the second is the CSV file to write.
# NOTE(review): the folder path is specific to one machine - edit it before re-running.
process_resumes('C:\\Users\\santhoshs.s\\jupyter\\resumes\\data\\data\\BPO\\bb', 'output.csv')


Data saved to output.csv


In [4]:
import fitz  # PyMuPDF for reading PDFs
import re
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize

# Function to extract text from PDF file
def extract_text_from_pdf(pdf_path):
    """Read the PDF at ``pdf_path`` and return all page text as one string.

    Pages are visited in document order and their text is concatenated with
    no separator. The document is closed when the ``with`` block exits.
    """
    collected = []
    with fitz.open(pdf_path) as pdf:
        for sheet in pdf:
            collected.append(sheet.get_text())
    return "".join(collected)

# Function to extract the candidate's name
def extract_name(text):
    """
    Attempts to extract a name from the provided text using regular expressions,
    falling back to NLTK part-of-speech tagging.

    Args:
        text (str): Text content to search for names.

    Returns:
        str: Extracted name (if found), or "Not Found".
    """
    # One capitalised word, optionally continued by hyphenated capitalised
    # parts (e.g. "Anne-Marie").
    name_part = r"[A-Z][a-z]+(?:-[A-Z][a-z]+)*"
    # Try regular expressions first
    name_patterns = [
        # Two name parts separated by a space. Hyphens are accepted inside each
        # part so "Anne-Marie Dupont" is returned whole; a plain two-word
        # pattern would match mid-name and return "Marie Dupont".
        name_part + " " + name_part,
        r"[A-Z][a-z]+-\w+"  # Lone hyphenated names like "Anne-Marie"
    ]

    for pattern in name_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group()

    # If regular expressions fail, fall back to NLTK part-of-speech tagging and
    # return the first token tagged as a proper noun.
    try:
        tokens = word_tokenize(text)
        tagged_text = nltk.pos_tag(tokens)
        for word, tag in tagged_text:
            if tag == 'NNP':  # Proper noun, typically used for names
                return word
    except LookupError:
        # Raised by NLTK when the tokenizer/tagger resources are missing.
        print("NLTK data not downloaded. Install NLTK and run 'nltk.download()' to download required resources.")

    return "Not Found"

# Function to extract the email address
def extract_email(text):
    """Find the first e-mail address in ``text``.

    Returns the matched substring, or ``"Not Found"`` if nothing matches.
    """
    email_regex = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
    hit = email_regex.search(text)
    return "Not Found" if hit is None else hit.group(0)

# Function to extract the phone number
def extract_phone_number(text):
    """Return the first phone-number-like substring of ``text``.

    Recognises ten-digit numbers grouped 3-3-4 with optional ``-``, ``.`` or
    space separators, an optional parenthesised area code, and an optional
    country code of one to three digits with an optional leading ``+``.

    Args:
        text (str): Text to search.

    Returns:
        str: The matched number exactly as written in ``text`` (including a
        leading ``+`` or ``(`` when present), or ``"Not Found"``.
    """
    # A leading \b can never sit in front of '+' or '(' (both are non-word
    # characters), which made the match start at the first digit and drop
    # them. The look-behind expresses "not glued to a preceding word" while
    # still allowing the match to begin on '+' or '('. Separators are only
    # accepted after something has matched, so no leading space is returned.
    phone_pattern = (
        r'(?<!\w)'
        r'(?:\+?\d{1,3}[-. (]*|\()?'  # country code (+ separators) or a bare '('
        r'\d{3}[-. )]*'               # area code
        r'\d{3}[-. ]*'                # exchange
        r'\d{4}\b'                    # line number
    )
    match = re.search(phone_pattern, text)
    return match.group(0) if match else "Not Found"

# Function to extract address (if address keywords like 'Address', 'Street' are present)
def extract_address(text):
    """Pick out the first line that looks address-related.

    A line counts as address-related when, lower-cased, it contains one of the
    keywords as a substring. The stripped line is returned, or ``"Not Found"``
    when no line contains a keyword.
    """
    keywords = ('address', 'street', 'city', 'state', 'zip', 'postal')

    def mentions_address(candidate):
        lowered = candidate.lower()
        return any(word in lowered for word in keywords)

    hit = next((ln for ln in text.split('\n') if mentions_address(ln)), None)
    return "Not Found" if hit is None else hit.strip()

# Function to process all PDFs in a folder and save the extracted info in a CSV
def process_resumes(folder_path, output_csv_path):
    """Extract contact details from every PDF in a folder and write them to a CSV.

    Each PDF directly inside ``folder_path`` (sub-folders are not visited) is
    read with ``extract_text_from_pdf`` and passed through the name, e-mail,
    phone and address extractors; one CSV row is written per file.

    Args:
        folder_path (str): Directory containing the PDF resumes.
        output_csv_path (str): Path of the CSV file to write.

    Returns:
        None. A confirmation message is printed once the CSV has been written.
    """
    columns = ['File Name', 'Name', 'Email', 'Phone', 'Address']
    data = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the CSV
    # row order reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        # Compare case-insensitively so files such as 'CV.PDF' are not skipped.
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(folder_path, filename)
        text = extract_text_from_pdf(pdf_path)

        # Add extracted info to data list
        data.append({
            'File Name': filename,
            'Name': extract_name(text),
            'Email': extract_email(text),
            'Phone': extract_phone_number(text),
            'Address': extract_address(text),
        })

    # Passing the column list keeps the header row even when no PDF was found.
    df = pd.DataFrame(data, columns=columns)
    df.to_csv(output_csv_path, index=False)
    print(f'Data saved to {output_csv_path}')

# Example usage:
# The first argument is the folder that contains the PDF resumes (a Windows
# path written with escaped backslashes); the second is the CSV file to write.
# NOTE(review): the folder path is specific to one machine - edit it before re-running.
process_resumes('C:\\Users\\santhoshs.s\\jupyter\\resumes\\data\\data\\BPO\\bb', 'output1.csv')


Data saved to output1.csv


In [2]:
import fitz  # PyMuPDF for reading PDFs
import re
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize

# Function to extract text from PDF file
def extract_text_from_pdf(pdf_path):
    """Concatenate and return the text of all pages in the PDF at ``pdf_path``.

    The file is opened via ``fitz.open`` as a context manager, so it is closed
    on the way out of the function.
    """
    with fitz.open(pdf_path) as handle:
        return "".join(map(lambda pg: pg.get_text(), handle))

# Function to extract the candidate's name
def extract_name(text):
    """Try to pull a person's name out of ``text``.

    Regular expressions are tried first; if none matches, the text is tokenised
    and part-of-speech tagged with NLTK and the first proper noun is returned.

    Args:
        text (str): Text content to search for names.

    Returns:
        str: The extracted name, or ``"Not Found"``.
    """
    # One capitalised word, optionally continued by hyphenated capitalised
    # parts (e.g. "Anne-Marie").
    name_part = r"[A-Z][a-z]+(?:-[A-Z][a-z]+)*"
    name_patterns = [
        # Two name parts separated by a space. Hyphens are accepted inside each
        # part so "Anne-Marie Dupont" is returned whole; a plain two-word
        # pattern would match mid-name and return "Marie Dupont".
        name_part + " " + name_part,
        r"[A-Z][a-z]+-\w+"  # Lone hyphenated names like "Anne-Marie"
    ]

    for pattern in name_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group()

    # If regular expressions fail, fall back to NLTK part-of-speech tagging and
    # return the first token tagged as a proper noun.
    try:
        tokens = word_tokenize(text)
        tagged_text = nltk.pos_tag(tokens)
        for word, tag in tagged_text:
            if tag == 'NNP':  # Proper noun, typically used for names
                return word
    except LookupError:
        # Raised by NLTK when the tokenizer/tagger resources are missing.
        print("NLTK data not downloaded. Install NLTK and run 'nltk.download()' to download required resources.")

    return "Not Found"

# Function to extract the email address
def extract_email(text):
    """Return the earliest e-mail address occurring in ``text``.

    Falls back to the string ``"Not Found"`` when there is no match.
    """
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # The first item yielded by finditer is the same match re.search would return.
    for candidate in re.finditer(pattern, text):
        return candidate.group(0)
    return "Not Found"

# Function to extract the phone number
def extract_phone_number(text):
    """Return the first phone-number-like substring of ``text``.

    Recognises ten-digit numbers grouped 3-3-4 with optional ``-``, ``.`` or
    space separators, an optional parenthesised area code, and an optional
    country code of one to three digits with an optional leading ``+``.

    Args:
        text (str): Text to search.

    Returns:
        str: The matched number exactly as written in ``text`` (including a
        leading ``+`` or ``(`` when present), or ``"Not Found"``.
    """
    # A leading \b can never sit in front of '+' or '(' (both are non-word
    # characters), which made the match start at the first digit and drop
    # them. The look-behind expresses "not glued to a preceding word" while
    # still allowing the match to begin on '+' or '('. Separators are only
    # accepted after something has matched, so no leading space is returned.
    phone_pattern = (
        r'(?<!\w)'
        r'(?:\+?\d{1,3}[-. (]*|\()?'  # country code (+ separators) or a bare '('
        r'\d{3}[-. )]*'               # area code
        r'\d{3}[-. ]*'                # exchange
        r'\d{4}\b'                    # line number
    )
    match = re.search(phone_pattern, text)
    return match.group(0) if match else "Not Found"

# Function to extract address (if address keywords like 'Address', 'Street' are present)
def extract_address(text):
    """Return the first address-looking line of ``text``, stripped.

    Lines are compared lower-cased against a fixed keyword list using
    substring tests; ``"Not Found"`` is returned if no line contains one.
    """
    terms = ['address', 'street', 'city', 'state', 'zip', 'postal']
    for candidate in text.split('\n'):
        folded = candidate.lower()
        if not any(term in folded for term in terms):
            continue
        return candidate.strip()
    return "Not Found"

# Function to extract profile or summary
def extract_profile(text):
    """Return the profile/summary section heading plus the two lines after it.

    The first line containing one of the profile keywords (case-insensitive
    substring test) is located, and that line together with the next two is
    returned, joined by newlines and stripped.

    Args:
        text (str): Full text extracted from a resume.

    Returns:
        str: Up to three lines in their original casing, or ``"Not Found"``.
    """
    profile_keywords = ['profile', 'summary', 'about me', 'professional summary']
    lines = text.split('\n')
    for i, line in enumerate(lines):
        # Lower-case only for the comparison so the returned text keeps the
        # resume's original capitalisation.
        lowered = line.lower()
        if any(keyword in lowered for keyword in profile_keywords):
            return "\n".join(lines[i:i+3]).strip()  # Get a few lines after the keyword
    return "Not Found"

# Function to extract links (e.g., LinkedIn, GitHub)
def extract_links(text):
    """Return all ``http``/``https`` URLs in ``text`` as a comma-separated string.

    Args:
        text (str): Text to scan.

    Returns:
        str: The URLs in order of appearance joined by ``", "``, or
        ``"Not Found"`` when there are none.
    """
    link_pattern = r'(https?://[^\s]+)'
    # The pattern runs to the next whitespace, so a URL written inside a
    # sentence or brackets ("see https://x.io/me, ...") would otherwise keep
    # the punctuation that follows it.
    trailing_punctuation = '.,;:!?)]}\'"'
    links = [link.rstrip(trailing_punctuation) for link in re.findall(link_pattern, text)]
    return ', '.join(links) if links else "Not Found"

# Function to extract experience
def extract_experience(text):
    experience_keywords = ['experience', 'work history', 'professional experience']
    experience = "Not Found"
    lines = text.lower().split('\n')
    for i, line in enumerate(lines):
        if any(keyword in line for keyword in experience_keywords):
            experience = "\n".join(lines[i:i+5]).strip()  # Get a few lines after the keyword
            break
    return experience

# Function to extract education
def extract_education(text):
    """Return the education section heading plus the four lines after it.

    The first line containing one of the education keywords (case-insensitive
    substring test) is located, and that line together with the next four is
    returned, joined by newlines and stripped.

    Args:
        text (str): Full text extracted from a resume.

    Returns:
        str: Up to five lines in their original casing, or ``"Not Found"``.
    """
    education_keywords = ['education', 'academic background', 'qualifications']
    lines = text.split('\n')
    for i, line in enumerate(lines):
        # Lower-case only for the comparison so the returned text keeps the
        # resume's original capitalisation.
        lowered = line.lower()
        if any(keyword in lowered for keyword in education_keywords):
            return "\n".join(lines[i:i+5]).strip()  # Get a few lines after the keyword
    return "Not Found"

# Function to extract languages
def extract_languages(text):
    """Return the languages section heading plus the two lines after it.

    The first line containing one of the language keywords (case-insensitive
    substring test) is located, and that line together with the next two is
    returned, joined by newlines and stripped.

    Args:
        text (str): Full text extracted from a resume.

    Returns:
        str: Up to three lines in their original casing, or ``"Not Found"``.
    """
    language_keywords = ['languages', 'language proficiency']
    lines = text.split('\n')
    for i, line in enumerate(lines):
        # Lower-case only for the comparison so the returned text keeps the
        # resume's original capitalisation.
        lowered = line.lower()
        if any(keyword in lowered for keyword in language_keywords):
            return "\n".join(lines[i:i+3]).strip()  # Get a few lines after the keyword
    return "Not Found"

# Function to extract certificates
def extract_certificates(text):
    """Return the certificates section heading plus the two lines after it.

    The first line containing one of the certificate keywords
    (case-insensitive substring test) is located, and that line together with
    the next two is returned, joined by newlines and stripped.

    Args:
        text (str): Full text extracted from a resume.

    Returns:
        str: Up to three lines in their original casing, or ``"Not Found"``.
    """
    certificate_keywords = ['certification', 'certificate', 'licenses']
    lines = text.split('\n')
    for i, line in enumerate(lines):
        # Lower-case only for the comparison so the returned text keeps the
        # resume's original capitalisation.
        lowered = line.lower()
        if any(keyword in lowered for keyword in certificate_keywords):
            return "\n".join(lines[i:i+3]).strip()  # Get a few lines after the keyword
    return "Not Found"

# Function to process all PDFs in a folder and save the extracted info in a CSV
def process_resumes(folder_path, output_csv_path):
    """Extract resume fields from every PDF in a folder and write them to a CSV.

    Each PDF directly inside ``folder_path`` (sub-folders are not visited) is
    read with ``extract_text_from_pdf`` and passed through every ``extract_*``
    helper; one CSV row is written per file.

    Args:
        folder_path (str): Directory containing the PDF resumes.
        output_csv_path (str): Path of the CSV file to write.

    Returns:
        None. A confirmation message is printed once the CSV has been written.
    """
    columns = [
        'File Name', 'Name', 'Email', 'Phone', 'Address', 'Profile',
        'Links', 'Experience', 'Education', 'Languages', 'Certificates',
    ]
    data = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the CSV
    # row order reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        # Compare case-insensitively so files such as 'CV.PDF' are not skipped.
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(folder_path, filename)
        text = extract_text_from_pdf(pdf_path)

        # Add extracted info to data list
        data.append({
            'File Name': filename,
            'Name': extract_name(text),
            'Email': extract_email(text),
            'Phone': extract_phone_number(text),
            'Address': extract_address(text),
            'Profile': extract_profile(text),
            'Links': extract_links(text),
            'Experience': extract_experience(text),
            'Education': extract_education(text),
            'Languages': extract_languages(text),
            'Certificates': extract_certificates(text),
        })

    # Passing the column list keeps the header row even when no PDF was found.
    df = pd.DataFrame(data, columns=columns)
    df.to_csv(output_csv_path, index=False)
    print(f'Data saved to {output_csv_path}')

# Example usage:
# The first argument is the folder that contains the PDF resumes (a Windows
# path written with escaped backslashes); the second is the CSV file to write.
# NOTE(review): the folder path is specific to one machine - edit it before re-running.
process_resumes('C:\\Users\\santhoshs.s\\jupyter\\resumes\\data\\data\\BPO\\bb', 'output2.csv')


Data saved to output2.csv
