In [5]:
import PyPDF2
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import csv
import os

import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF document.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Extracted text content.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = ''
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text()
        return text

def extract_name(text):
    """
    Attempts to extract a name from the provided text using regular expressions and NER.

    Args:
        text (str): Text content to search for names.

    Returns:
        str: Extracted name (if found), or an empty string.
    """
    # Try regular expressions first
    name_patterns = [r"[A-Z][a-z]+ [A-Z][a-z]+",  # Two words with capital letters
                   r"[A-Z][a-z]+-\w+"]  # Hyphenated names
    for pattern in name_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group()

    # If regular expressions fail, try NER
    try:
        tokens = word_tokenize(text)
        tagged_text = nltk.pos_tag(tokens)
        for word, tag in tagged_text:
            if tag == 'NNP':  # Proper noun
                return word
    except LookupError:  # NLTK data might not be downloaded
        print("NLTK data not downloaded. Install NLTK and run 'nltk.download()' to download required resources.")

    return ""

def extract_contact_details(text):
    """
    Attempts to extract email, phone number, and address using regular expressions.

    Args:
        text (str): Text content to search for contact information.

    Returns:
        dict: A dictionary containing extracted email, phone number, and address (if found).
    """
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    phone_pattern = r"[0-9]{3}-[0-9]{3}-[0-9]{4}"  # US-style phone number (modify for other formats)
    address_pattern = r"[0-9]+\s?[a-zA-Z]+(?:\s[a-zA-Z]+)?\s+(?:[A-Z][a-z]+\s?)*,\s?[A-Z]{2}\s+\d{5}(?:-\d{4})?"

    matches = {}
    matches["email"] = re.search(email_pattern, text)
    if matches["email"]:
        matches["email"] = matches["email"].group()

    matches["phone_number"] = re.search(phone_pattern, text)
    if matches["phone_number"]:
        matches["phone_number"] = matches["phone_number"].group()

    matches["address"] = re.search(address_pattern, text)
    if matches["address"]:
        matches["address"] = matches["address"].group()

    return matches

def extract_skills_from_text(text):
    """
    Attempts to extract skills using NLTK and Part-of-Speech tagging.

    Args:
        text (str): Text content to search for skills.

    Returns:
        list: A list of extracted skills.
    """
    skills = []
    tokens = word_tokenize(text)
    tagged_text = nltk.pos_tag(tokens)
    stop_words = set(stopwords.words('english'))

    for word, tag in tagged_text:
        if tag == 'NN' or tag == 'VB' and word.lower() not in stop_words:
            skills.append(word)

    return skills

def extract_experience_from_text(text):
    """
    Attempts to extract experience details using regular expressions.

    Args:
        text (str): Text content to search for experience details.

    Returns:
        list: A list of dictionaries, each containing experience details (job title, company, start date, end date).
    """
    experience_pattern = r"([A-Z][a-z]+(?:\s[A-Z][a-z]+)?) at ([A-Z][a-z]+(?:\s[A-Z][a-z]+)?) from ([\d/]+) to ([\d/]+)"
    matches = re.findall(experience_pattern, text)
    experience_list = []
    for match in matches:
        experience = {
            "job_title": match[0],
            "company": match[1],
            "start_date": match[2],
            "end_date": match[3]
        }
        experience_list.append(experience)
    return experience_list

def extract_education_from_text(text):
    """
    Attempts to extract education details using regular expressions.

    Args:
        text (str): Text content to search for education details.

    Returns:
        list: A list of dictionaries, each containing education details (degree, field, institution, year).
    """
    education_pattern = r"([A-Z][a-z]+(?:\s[A-Z][a-z]+)?) in ([A-Z][a-z]+(?:\s[A-Z][a-z]+)?) from ([A-Z][a-z]+(?:\s[A-Z][a-z]+)?) in ([\d]{4})"
    matches = re.findall(education_pattern, text)
    education_list = []
    for match in matches:
        education = {
            "degree": match[0],
            "field": match[1],
            "institution": match[2],
            "year": match[3]
        }
        education_list.append(education)
    return education_list

def extract_information_from_resumes(resume_folder, csv_file):
    """
    Extracts information from PDF resumes in a specified folder and saves it to a CSV file.

    Args:
        resume_folder (str): Path to the folder containing resumes.
        csv_file (str): Path to the CSV file where extracted information will be saved.
    """
    header = ["name", "email", "phone_number", "address", "skills", "experience", "education"]
    with open(csv_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        for filename in os.listdir(resume_folder):
            if filename.endswith('.pdf'):
                pdf_path = os.path.join(resume_folder, filename)
                text = extract_text_from_pdf(pdf_path)
                name = extract_name(text)
                contact_details = extract_contact_details(text)
                skills = extract_skills_from_text(text)
                experience = extract_experience_from_text(text)
                education = extract_education_from_text(text)
                row = [name, contact_details["email"], contact_details["phone_number"], contact_details["address"], skills, experience, education]
                writer.writerow(row)

# Example usage
resume_folder = "C:\\Users\\santhoshs.s\\jupyter\\resumes\\data\\data\\BPO\\bb"
csv_file = "resume_data.csv"
extract_information_from_resumes(resume_folder, csv_file)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\santhoshs.s\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\santhoshs.s\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [51]:
import fitz  
import re
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize
import PyPDF2

def extract_text_from_pdf(pdf_path):
    with fitz.open(pdf_path) as doc:
        text = ""
        for page in doc:
            text += page.get_text()
    return text.lower()  

def extract_name(text):
    name_patterns = [
        r"[A-Z][a-z]+ [A-Z][a-z]+",  # Matches names like "John Doe"
        r"[A-Z][a-z]+-\w+",           # Matches hyphenated names like "Anne-Marie"
        r"\b[A-Z]+(?: [A-Z]+)* ?[A-Z]?\b"  # Matches fully capitalized names like "HARRISH KALYAN V"
    ]

    for pattern in name_patterns:
        match = re.search(pattern, text, re.IGNORECASE)  
        if match:
            return match.group()
            
    try:
        tokens = word_tokenize(text)
        tagged_text = nltk.pos_tag(tokens)
        for word, tag in tagged_text:
            if tag == 'NNP': 
                return word
    except LookupError:
        print("NLTK data not downloaded. Install NLTK and run 'nltk.download()' to download required resources.")

    return "Not Found"


def extract_profile_summary(text):
    keywords = [
        "summary", "profile", "objective", "professional summary",
        "career summary", "executive summary", "personal summary",
        "summary of qualifications", "overview", "Profile"
    ]
    
    lines = text.splitlines()
    summary_start = None

    keyword_patterns = [re.compile(r'\s*'.join(list(keyword.lower()))) for keyword in keywords]

    for pattern in keyword_patterns:
        for idx, line in enumerate(lines):
            if pattern.search(line.lower()):  
                summary_start = idx
                break
        if summary_start is not None:
            break

    if summary_start is not None:
        extracted_lines = []
        
        for i in range(summary_start + 1, len(lines)):
            line = lines[i].strip()

            if line == "" or line.startswith("*") or line.startswith("#"):
                continue
            if line.isupper():
                break
            
            extracted_lines.append(line)

        summary_result = "\n".join(extracted_lines).strip()
        return summary_result if summary_result else "Not Found"
    
    return "Not Found"

# Function to extract email address
def extract_email(text):
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    match = re.search(email_pattern, text)
    return match.group(0) if match else "Not Found"

# Function to extract the phone number
def extract_phone_number(text):
    phone_pattern = r'\b(?:\+?(\d{1,3}))?[-. ()]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})\b'
    match = re.search(phone_pattern, text)
    
    if match:
        raw_number = match.group(0)
        clean_number = re.sub(r'\D', '', raw_number) 
        return clean_number
    
    return "Not Found"

# Function to extract address
def extract_address(text):
    address_keywords = ['address', 'street', 'city', 'state', 'zip', 'postal']
    lines = text.split('\n')
    for line in lines:
        if any(keyword in line.lower() for keyword in address_keywords):
            return line.strip()
    return "Not Found"

# Function to extract links along with associated text from a PDF file
def extract_links(pdf_path):
    links_with_text = []
    
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]

            if '/Annots' in page:
                annotations = page['/Annots']
                for annotation in annotations:
                    annotation_object = annotation.get_object()
                    if '/A' in annotation_object and '/URI' in annotation_object['/A']:
                        link = annotation_object['/A']['/URI']

                        links_with_text.append(link)
    # Join the links, placing each on a new line
    return '\n'.join(links_with_text) if links_with_text else "Not Found"

def extract_skills(text):
    skills_keywords = [
        "python", "java", "c++", "sql", "javascript", "html", "css",
        "data analysis", "machine learning", "deep learning", "project management",
        "excel", "communication", "teamwork", "problem solving", "time management",
        "leadership", "adobe photoshop", "graphic design", "cloud computing",
        "aws", "azure", "docker", "kubernetes", "linux", "windows",
        "networking", "digital marketing", "seo", "content writing", "sql", 
        "nosql", "data visualization", "power bi", "tableau", "salesforce",
        "financial analysis", "r programming", "pandas", "numpy", "tensorflow", "keras",
        "project planning", "agile", "scrum", "big data", "hadoop", "spark", "Food preparation",
        "Kitchen maintenance", "Kitchen equipment", "operation", "Food sanitation"

    ]
    
    text_lower = text.lower()

    found_skills = set()

    for skill in skills_keywords:
        pattern = r'\b' + re.escape(skill.lower()) + r'\b'
        if re.search(pattern, text_lower):
            found_skills.add(skill)

    return sorted(found_skills) if found_skills else "Not Found"

import re

def extract_experience(text):
    # List of job roles to search for
    job_roles = [
        "Staff Accountant", "Bookkeeper", "Accounts Payable Specialist", "Accounts Receivable Specialist",
        "Payroll Specialist", "General Ledger Accountant", "Cash Applications Specialist", "Fixed Assets Accountant",
        "Revenue Recognition Specialist", "Financial Reporting Analyst", "Financial Accountant", "Cost Accountant",
        "Tax Accountant", "Forensic Accountant", "Financial Reporting Manager", "Financial Analyst", "Budget Analyst",
        "Internal Auditor", "Compliance Officer", "Risk Analyst", "Controller", "Financial Controller",
        "Corporate Controller", "Divisional Controller", "Financial Planning and Analysis Manager", "Business Analyst",
        "Management Accountant", "Cost Control Analyst", "Profitability Analyst", "Performance Analyst",
        "Auditing Manager", "Tax Manager", "International Accountant", "Government Accountant", "Non-profit Accountant",
        "Healthcare Accountant", "Manufacturing Accountant", "Retail Accountant", "Construction Accountant",
        "Information Technology Accountant", "Real Estate Accountant", "Mergers and Acquisitions Accountant",
        "Fraud Examiner", "Forensic Auditor", "Valuation Specialist", "Financial Advisor", "Financial Consultant",
        "CFO (Chief Financial Officer)", "Treasurer", "Financial Operations Manager",
        # Additional job roles created for enhancement
        "Accounts Manager", "Finance Director", "Revenue Analyst", "Cash Management Specialist",
        "Business Development Manager", "Risk Management Consultant", "Tax Compliance Analyst",
        "Investment Analyst", "Corporate Finance Manager", "Cost Analyst", "Budget Officer",
        "Financial Systems Analyst", "Procurement Analyst", "Loan Officer", "Equity Analyst",
        "Financial Planning Manager", "Audit Associate", "Financial Operations Analyst", "Risk Management Analyst",
        "Tax Associate", "Compliance Analyst", "Financial Reporting Officer", "Strategic Finance Analyst",
        "Credit Analyst", "Accounts Supervisor", "Payment Analyst", "Financial Risk Manager",
        "Real Estate Financial Analyst", "Insurance Accountant", "Treasury Analyst", "Securities Analyst",
        "Fixed Income Analyst", "Equity Research Analyst", "Actuarial Analyst", "Real Estate Investment Analyst",
        "Performance Measurement Analyst", "Corporate Treasury Manager", "Investment Banking Analyst",
        "Project Finance Analyst", "Financial Product Manager", "Operations Accountant", "Compliance Manager",
        "Cost Estimator", "Business Intelligence Analyst"
    ]

    # Define patterns for dates
    date_patterns = [
        r"\b(19[9]\d|20[0-2]\d)\b",  # Years from 1990 to 2029
        r"\b(19[9]\d|20[0-2]\d)[-–](19[9]\d|20[0-2]\d)\b",  # Year ranges like "2020-2022"
        r"\b(19[9]\d|20[0-2]\d)[-–](Present|present)\b",  # Year ranges like "2021-Present"
        r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{4}\b",  # Month Year
        r"\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{4}[-–] (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{4}\b"  # Month Year - Month Year
    ]

    # Combine all date patterns into a single regex pattern
    date_pattern = re.compile("|".join(date_patterns), re.IGNORECASE)

    # Split the text into lines for processing
    lines = text.splitlines()

    # Storage for job entries
    experience_list = []
    current_experience = {}

    # Iterate through lines to extract experience data
    for i, line in enumerate(lines):
        line = line.strip()

        # Skip empty lines
        if not line:
            continue

        # Check for job roles mentioned in the line
        for job in job_roles:
            if job.lower() in line.lower():
                # If a job title is found, initiate job extraction
                current_experience["Job Title"] = job
                
                # Look for dates in the line
                date_match = date_pattern.search(line)
                if date_match:
                    current_experience["Dates"] = date_match.group()

                # Look for the company name (assumes it's mentioned after 'at' or similar)
                company_match = re.search(r"(?:at|for|with) ([A-Z][a-zA-Z]+(?: [A-Z][a-zA-Z]+)*)", line, re.IGNORECASE)
                if company_match:
                    current_experience["Company"] = company_match.group(1)
                
                # Look for the place (assumes it's mentioned after 'in' or 'at')
                place_match = re.search(r"(?:in|at) ([A-Z][a-zA-Z]+(?: [A-Z][a-zA-Z]+)*)", line, re.IGNORECASE)
                if place_match:
                    current_experience["Place"] = place_match.group(1)
                
                # Add to the experience list if we have essential details
                if current_experience.get("Company") and current_experience.get("Job Title") and current_experience.get("Dates"):
                    company = current_experience.get("Company", "")
                    role = current_experience.get("Job Title", "")
                    dates = current_experience.get("Dates", "")
                    place = current_experience.get("Place", "")
                    experience_entry = f"{company}-{role}-{dates}-{place}"
                    experience_list.append(experience_entry)
                    current_experience = {}  # Reset for the next job

                break  # Stop checking further job roles in this line if one is found

        # Look for dates in the line if no job roles are found
        if not current_experience and date_pattern.search(line):
            # Handle cases where we need to find company and place based on earlier logic
            current_experience["Dates"] = date_pattern.search(line).group()

            # Look for job titles and companies around this line
            job_description = [line]

            # Check the next 2 lines for more job details (job title, company)
            for next_line in lines[i+1:i+3]:
                next_line = next_line.strip()
                if next_line:
                    job_description.append(next_line)

            combined_description = " ".join(job_description)

            # Extract job title based on capitalized words pattern
            job_title_match = re.search(r"([A-Z][a-zA-Z]+(?: [A-Z][a-zA-Z]+)*)", combined_description)
            if job_title_match:
                current_experience["Job Title"] = job_title_match.group()

            # Extract company name based on context
            company_match = re.search(r"(?:at|for|with) ([A-Z][a-zA-Z]+(?: [A-Z][a-zA-Z]+)*)", combined_description, re.IGNORECASE)
            if company_match:
                current_experience["Company"] = company_match.group(1)

            # Extract place
            place_match = re.search(r"(?:in|at) ([A-Z][a-zA-Z]+(?: [A-Z][a-zA-Z]+)*)", combined_description, re.IGNORECASE)
            if place_match:
                current_experience["Place"] = place_match.group(1)

    # Append the last found experience if it exists
    if current_experience:
        company = current_experience.get("Company", "")
        role = current_experience.get("Job Title", "")
        dates = current_experience.get("Dates", "")
        place = current_experience.get("Place", "")
        experience_entry = f"{company}-{role}-{dates}-{place}"
        experience_list.append(experience_entry)

    # Filter out any experience entries with dates before 1990
    experience_list = [entry for entry in experience_list if not re.search(r"\b(19[0-8]\d)\b", entry)]

    # Return the extracted experience data or 'Not Found' if nothing was found
    return experience_list if experience_list else ["Not Found"]



# Function to extract education details
def extract_education(text):
    education_list = []
    education_keywords = ["education", "academic background"]
    for keyword in education_keywords:
        education_start = text.find(keyword)
        if education_start != -1:
            lines = text.splitlines()[education_start:]  
            for line in lines:
                if line.strip() == "":
                    continue  
                university_match = re.search(r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)?", line, re.IGNORECASE)
                if university_match:
                    university = university_match.group()
                    degree_match = re.search(r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)?", line, re.IGNORECASE)
                    degree = degree_match.group() if degree_match else ""
                    year_match = re.search(r"\d{4}", line)
                    year = year_match.group() if year_match else ""
                    education_list.append({
                        "University": university,
                        "Degree": degree,
                        "Year": year
                    })
            break
    return education_list

# Function to extract languages
def extract_languages(text):
    language_keywords = ["languages", "skills", "proficiency"]
    for keyword in language_keywords:
        language_start = text.find(keyword)
        if language_start != -1:
            lines = text.splitlines()[language_start:]
            languages = []
            for line in lines:
                if line.strip() == "":
                    continue
                language_match = re.search(r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)?", line, re.IGNORECASE)
                if language_match:
                    languages.append(language_match.group())
            return languages
    return []

# Function to extract certificates
def extract_certificates(text):
    certificate_keywords = ["certifications", "accreditations"]
    for keyword in certificate_keywords:
        certificate_start = text.find(keyword)
        if certificate_start != -1:
            lines = text.splitlines()[certificate_start:]
            certificates = []
            for line in lines:
                if line.strip() == "":
                    continue
                certificate_match = re.search(r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)?", line, re.IGNORECASE)
                if certificate_match:
                    certificates.append(certificate_match.group())
            return certificates
    return []




# Function to process all PDFs in a folder and save the extracted info in a CSV
def process_resumes(folder_path, output_csv_path):
    data = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            text = extract_text_from_pdf(pdf_path)

            name = extract_name(text)
            profile_summary = extract_profile_summary(text)
            email = extract_email(text)
            phone = extract_phone_number(text)
            address = extract_address(text)
            links = extract_links(pdf_path)
            experience = extract_experience(text)
            education = extract_education(text)
            languages = extract_languages(text)
            certificates = extract_certificates(text)
            skills = extract_skills(text)

            data.append({
                'File Name': filename,
                'Name': name,
                'Profile Summary': profile_summary,
                'Email': email,
                'Phone': phone,
                'Address': address,
                'Links': links,
                'Skills': skills,
                'Experience': experience,
                'Education': education,
                'Languages': languages,
                'Certificates': certificates,  
            })

    # Create a DataFrame from the collected data
    df = pd.DataFrame(data)

    # Clear previous data in the CSV file
    if os.path.exists(output_csv_path):
        os.remove(output_csv_path)  # Remove the existing file

    # Save the new data to the CSV file
    df.to_csv(output_csv_path, index=False)
    print(f'Data saved to {output_csv_path}')


    # Create a DataFrame and save it to a CSV file
    df = pd.DataFrame(data)
    df.to_csv(output_csv_path, index=False)
    print(f'Data saved to {output_csv_path}')

process_resumes('C:\\Users\\santhoshs.s\\jupyter\\resumes\\data\\data\\BPO\\bb', 'output3.csv')

Data saved to output3.csv
Data saved to output3.csv
