# In [1]:
import fitz  # PyMuPDF for reading PDFs
import re
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize

# Function to extract text from PDF file and convert it to lowercase
def extract_text_from_pdf(pdf_path):
    """Return the lowercased text of every page in the PDF at ``pdf_path``.

    The pages are read in document order with PyMuPDF and their text is
    concatenated without any separator before being lowercased.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts).lower()

# Function to extract the candidate's name
def extract_name(text):
    """Guess the candidate's name from the resume text.

    Two regular expressions are tried in order, both case-insensitively:
    a pair of alphabetic words separated by one space, then a hyphenated
    word such as "Anne-Marie". If neither matches, the text is POS-tagged
    with NLTK and the first token tagged ``NNP`` is returned.

    Returns:
        The matched text, or "Not Found" when nothing qualifies.
    """
    two_words = r"[A-Z][a-z]+ [A-Z][a-z]+"
    hyphenated = r"[A-Z][a-z]+-\w+"

    for name_regex in (two_words, hyphenated):
        found = re.search(name_regex, text, re.IGNORECASE)
        if found is not None:
            return found.group()

    # Regex lookup failed: fall back to the first proper noun NLTK can tag.
    try:
        for token, pos in nltk.pos_tag(word_tokenize(text)):
            if pos == 'NNP':
                return token
    except LookupError:
        # Raised by NLTK when the tokenizer/tagger resources are missing.
        print("NLTK data not downloaded. Install NLTK and run 'nltk.download()' to download required resources.")

    return "Not Found"

# Function to extract the profile summary
def extract_profile_summary(text):
    """Return the lines that follow the first summary-style heading.

    Headings are tried in the order listed below; the first heading that
    matches any line wins, and the earliest line matching it becomes the
    start. Letters of a heading may be separated by whitespace in the text
    (e.g. "S U M M A R Y"). Collection skips blank lines and lines starting
    with "*" or "#", and stops at the first all-uppercase line.

    Returns:
        The collected lines joined with newlines, or "Not Found".
    """
    headings = [
        "summary", "profile", "objective", "professional summary",
        "career summary", "executive summary", "personal summary",
        "summary of qualifications", "overview", "Profile"
    ]

    rows = text.splitlines()
    # Allow optional whitespace between every character of a heading.
    heading_regexes = [re.compile(r'\s*'.join(heading.lower())) for heading in headings]

    start = None
    for regex in heading_regexes:
        start = next(
            (number for number, row in enumerate(rows) if regex.search(row.lower())),
            None,
        )
        if start is not None:
            break

    if start is None:
        return "Not Found"

    kept = []
    for raw in rows[start + 1:]:
        candidate = raw.strip()
        if not candidate or candidate.startswith(("*", "#")):
            continue
        if candidate.isupper():
            # An all-caps line is treated as the next section heading.
            break
        kept.append(candidate)

    joined = "\n".join(kept).strip()
    return joined or "Not Found"

# Function to extract email address
def extract_email(text):
    """Return the first email-looking substring of ``text``, or "Not Found"."""
    found = re.search(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text)
    if found is None:
        return "Not Found"
    return found.group(0)

# Function to extract the phone number
def extract_phone_number(text):
    """Return the digits of the first phone number found in ``text``.

    The pattern accepts an optional 1-3 digit country code followed by a
    3-3-4 digit number, with "-", ".", spaces and parentheses allowed as
    separators. Only the digits of the whole match are returned.

    Returns:
        A digits-only string, or "Not Found" when no number matches.
    """
    phone_regex = r'\b(?:\+?(\d{1,3}))?[-. ()]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})\b'
    found = re.search(phone_regex, text)
    if not found:
        return "Not Found"

    # Keep only the digit characters of the full match (separators dropped).
    return "".join(re.findall(r'\d', found.group(0)))

# Function to extract address
def extract_address(text):
    """Return the first line mentioning an address-related keyword.

    Matching is a case-insensitive substring test, so a keyword embedded in
    a longer word (e.g. "state" in "statement") also counts.

    Returns:
        The stripped line, or "Not Found" when no line qualifies.
    """
    markers = ('address', 'street', 'city', 'state', 'zip', 'postal')
    for candidate in text.split('\n'):
        lowered = candidate.lower()
        for marker in markers:
            if marker in lowered:
                return candidate.strip()
    return "Not Found"

import PyPDF2

# Function to extract links and their associated text from a PDF
def extract_links_from_pdf(pdf_path):
    """
    Collect URI link annotations from a PDF and group them by site.

    Each link is stored as the string "<label> - <uri>", where the label is
    the annotation's '/T' entry, falling back to '/Contents', then to
    'Unknown Text'. Grouping is done by substring tests on that combined
    string, so one link may appear in several groups.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        dict: Lists keyed by 'LinkedIn', 'GitHub', 'Email', 'Kaggle',
        'Portfolio', 'Facebook', 'Instagram', 'Git', 'Monster' and 'Other'
        ('Other' holds the links that fell into none of the other groups).
    """
    labelled_links = []

    with open(pdf_path, 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)
        for page in reader.pages:
            # Pages without annotations carry no links.
            if '/Annots' not in page:
                continue
            for reference in page['/Annots']:
                annot = reference.get_object()
                if '/A' not in annot or '/URI' not in annot['/A']:
                    continue
                uri = annot['/A']['/URI']
                label = annot.get('/T', '') or annot.get('/Contents', 'Unknown Text')
                labelled_links.append(f"{label} - {uri}")

    def containing(fragment):
        # Links whose combined "<label> - <uri>" text includes the fragment.
        return [entry for entry in labelled_links if fragment in entry]

    links_dict = {
        'LinkedIn': containing('linkedin.com'),
        'GitHub': containing('github.com'),
        'Email': containing('@'),
        'Kaggle': containing('kaggle.com'),
        'Portfolio': [
            entry for entry in labelled_links
            if any(domain in entry for domain in ['.com', '.net', '.org', '.io'])
        ],
        'Facebook': containing('facebook.com'),
        'Instagram': containing('instagram.com'),
        'Git': containing('git'),
        'Monster': containing('monster.com'),
    }

    # Whatever landed in none of the groups above goes to 'Other'.
    categorised = set().union(*links_dict.values())
    links_dict['Other'] = [entry for entry in labelled_links if entry not in categorised]

    return links_dict


# Updated Function to extract experience details
def extract_experience(text):
    """Extract job entries from the experience section of a resume.

    The section starts at the first experience heading found
    (case-insensitive). Inside it, every date range ("Jan 2019 - Dec 2020",
    "2017 - 2018", "Mar 2021 - Present", ...) is located and the text is
    split on those ranges. Each date range is paired with the text segment
    that precedes it, and title, company, location and bullet lines are
    pulled from that segment.

    Date matching is case-insensitive so that lowercased input still yields
    month names and "present".

    Args:
        text: Full resume text.

    Returns:
        A list of dicts with the keys "Company", "Job Title", "Location",
        "Dates" and "Responsibilities"; every value is a string (empty when
        nothing was found). An empty list is returned when no experience
        heading exists.
    """
    experience_keywords = [
        "work history", "employment history", "work experience",
        "professional experience", "career summary", "professional background",
        "job experience", "internships", "relevant experience",
        "contract work", "military experience", "volunteer work"
    ]

    # Non-capturing groups are essential here: with capturing groups
    # re.findall returns tuples of the groups instead of the matched date,
    # and re.split inserts the groups (possibly None) between the segments.
    month = r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*"
    separator = r"\s*[-–—to]+\s*"
    date_patterns = [
        r"\b" + month + r"\s*\d{4}" + separator + month + r"\s*\d{4}",
        r"\b\d{4}" + separator + r"\d{4}\b",
        r"\b" + month + r"\s*\d{4}" + separator + r"Present",
        r"\b\d{4}" + separator + r"Present\b",
    ]
    date_regex = re.compile('|'.join(date_patterns), re.IGNORECASE)

    job_title_keywords = [
        "manager", "director", "engineer", "analyst", "consultant",
        "assistant", "coordinator", "specialist", "developer", "designer",
        "executive", "advisor", "technician", "officer", "intern", "trainee"
    ]
    job_title_regex = re.compile('|'.join(job_title_keywords), re.IGNORECASE)

    company_regex = re.compile(r"(at|with|for)\s+([A-Z][\w&,. ]+)")
    location_patterns = [
        r"\bin\s+([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)\b",
        r"\b([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)\s+(USA|Inc|Ltd|LLC|Co)\b"
    ]
    location_regex = re.compile('|'.join(location_patterns))
    responsibility_regex = re.compile(r"•\s*.*|(?<=\n)-\s*.*")

    experience_start = re.search('|'.join(experience_keywords), text, re.IGNORECASE)
    if not experience_start:
        return []

    content = text[experience_start.start():]
    dates = date_regex.findall(content)
    segments = date_regex.split(content)
    experiences = []

    # NOTE(review): date i is paired with the text BEFORE it, so the segment
    # after the final date is never inspected; confirm that this matches the
    # resume layout this parser is meant for.
    for date, job in zip(dates, segments):
        job_title_match = job_title_regex.search(job)
        company_match = company_regex.search(job)
        location_match = location_regex.search(job)

        location = ''
        if location_match:
            # Group 1 belongs to the "in <Place>" alternative and group 2 to
            # the "<Place> USA/Inc/..." one; only one of them is ever set.
            location = location_match.group(1) or location_match.group(2) or ''

        experiences.append({
            "Company": company_match.group(2) if company_match else '',
            "Job Title": job_title_match.group(0) if job_title_match else '',
            "Location": location,
            "Dates": date,
            "Responsibilities": ' '.join(responsibility_regex.findall(job)).strip()
        })

    return experiences

import re

def extract_education(text):
    """Extract education details line by line.

    Every non-empty line is scanned independently for a degree, a
    university-like institution, a school, a time period (or a lone year)
    and a score (CGPA/SGPA, "x/10" or a percentage). A line contributes an
    entry as soon as at least one of those fields is found.

    Degree keywords must start on a word boundary and degree abbreviations
    must be whole words, so that "ba", "ma" or "ms" inside ordinary words
    ("database", "managed", "items") are not reported as degrees and "BSc"
    is not cut down to "BS".

    Args:
        text: Full resume text.

    Returns:
        A list of dicts with the keys "Degree", "University", "School",
        "Year/Time Period" and "Score"; missing fields hold "Not Found".
    """
    flags = re.IGNORECASE
    degree_regexes = [
        re.compile(
            r"\b(Bachelor|Master|Associate|Doctorate|Ph\.?D|Diploma|Certificate)\s*(of\s*(Arts|Science|Business|Engineering|Technology|Management|Education|Commerce|Law|Medicine|Fine Arts|Social Work|Computer Science|Psychology|Nursing|Physics|Chemistry|Biology))?",
            flags,
        ),
        # "MSc" is listed explicitly because whole-word matching no longer
        # lets the shorter "MS" match inside it.
        re.compile(r"\b(BA|BS|BSc|MA|MS|MSc|MBA|MFA|LLB|LLM|PhD|M\.Ed|B\.Ed|BBA)\b", flags),
    ]
    university_regex = re.compile(
        r"[A-Z][a-zA-Z\s&]*\s*(University|Institute|College|Academy|Polytechnic)", flags
    )
    school_regex = re.compile(
        r"[A-Z][a-zA-Z\s&]*\s*(High School|Secondary School|Senior School|School)", flags
    )
    time_period_regex = re.compile(
        r"((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}\s*[-–to]+\s*(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)?[a-z]*\s*\d{4})",
        flags,
    )
    year_regex = re.compile(r"\b(19|20)\d{2}\b")
    score_regexes = [
        re.compile(r"(CGPA|SGPA)\s*[:\-]?\s*(\d\.\d{1,2})", flags),  # "CGPA: 8.5"
        re.compile(r"(\d{1,2}\.\d{1,2})\s*\/\s*10", flags),  # "8.5/10"
        # Whole number with optional decimals; the lookbehind keeps the match
        # from starting in the middle of a number ("92.4%" must not give "4%").
        re.compile(r"(?<![\d.])(\d{1,3}(?:\.\d+)?)\s*%", flags),
    ]

    def first_match(regexes, line):
        # Text matched by the first regex that hits, else the "Not Found" marker.
        for regex in regexes:
            found = regex.search(line)
            if found:
                return found.group(0)
        return "Not Found"

    education_list = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue  # Skip empty lines

        entry = {
            "Degree": first_match(degree_regexes, line),
            "University": first_match([university_regex], line),
            "School": first_match([school_regex], line),
            # A full "Aug 2016 - May 2020" range wins over a standalone year.
            "Year/Time Period": first_match([time_period_regex, year_regex], line),
            "Score": first_match(score_regexes, line),
        }

        # Only keep lines on which at least one field was actually found.
        if any(value != "Not Found" for value in entry.values()):
            education_list.append(entry)

    return education_list


# Function to extract languages
def extract_languages(text):
    """Extract language entries listed under a languages/skills heading.

    Keywords are tried in order ("languages", "skills", "proficiency"); for
    the first keyword that occurs, the earliest line containing it is the
    heading. Whatever follows the keyword on the heading line and every
    later line each contribute their first one- or two-word match.

    The heading is located by line (case-insensitively). Using the character
    offset returned by ``str.find`` as an index into the list of lines would
    skip far past the section, so it is not used.

    Args:
        text: Full resume text.

    Returns:
        A list of matched strings; empty when no keyword is present.
    """
    language_keywords = ["languages", "skills", "proficiency"]
    entry_regex = re.compile(r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)?", re.IGNORECASE)
    lines = text.splitlines()

    for keyword in language_keywords:
        keyword_regex = re.compile(re.escape(keyword), re.IGNORECASE)
        for index, line in enumerate(lines):
            heading = keyword_regex.search(line)
            if heading is None:
                continue

            # Entries may share the heading line ("languages: english, ...").
            candidates = [line[heading.end():]] + lines[index + 1:]
            languages = []
            for candidate in candidates:
                language_match = entry_regex.search(candidate)
                if language_match:
                    languages.append(language_match.group())
            return languages
    return []

# Function to extract certificates
def extract_certificates(text):
    """Extract certificate entries listed under a certifications heading.

    Keywords are tried in order ("certifications", "accreditations"); for
    the first keyword that occurs, the earliest line containing it is the
    heading. Whatever follows the keyword on the heading line and every
    later line each contribute their first one- or two-word match.

    The heading is located by line (case-insensitively). Using the character
    offset returned by ``str.find`` as an index into the list of lines would
    skip far past the section, so it is not used.

    Args:
        text: Full resume text.

    Returns:
        A list of matched strings; empty when no keyword is present.
    """
    certificate_keywords = ["certifications", "accreditations"]
    entry_regex = re.compile(r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)?", re.IGNORECASE)
    lines = text.splitlines()

    for keyword in certificate_keywords:
        keyword_regex = re.compile(re.escape(keyword), re.IGNORECASE)
        for index, line in enumerate(lines):
            heading = keyword_regex.search(line)
            if heading is None:
                continue

            # Entries may share the heading line ("certifications: pmp").
            candidates = [line[heading.end():]] + lines[index + 1:]
            certificates = []
            for candidate in candidates:
                certificate_match = entry_regex.search(candidate)
                if certificate_match:
                    certificates.append(certificate_match.group())
            return certificates
    return []
def extract_skills(text):
    """Return the known skills mentioned in ``text``.

    Three passes feed one candidate list, each compared case-insensitively
    against a fixed catalogue of skills:
      1. every NLTK token of the text,
      2. labels that precede a run of proficiency symbols (○ / ●),
      3. the comma-separated pieces of letter/space/"+"/"#"/"&" runs.

    Returns:
        The distinct matches in arbitrary order (they pass through a set),
        or ["Not Found"] when nothing matched.
    """
    known_skills = [
        "C", "C++", "Python", "Java", "SQL", "JavaScript", "HTML", "CSS", "React", "Angular",
        "Machine Learning", "Deep Learning", "Data Structures", "Data Analysis", "Project Management",
        "Leadership", "Communication", "Teamwork", "Problem Solving", "Critical Thinking",
        "Agile", "Scrum", "AWS", "Azure", "Docker", "Kubernetes", "DevOps", "Data Science",
        "R", "Excel", "Power BI", "Tableau", "Automation", "Selenium", "REST APIs", 
        "Networking", "Cybersecurity", "Cloud Computing", "Blockchain", "UI/UX Design",
        "Customer Service", "Salesforce", "SAP", "Digital Marketing", "SEO", "Content Writing"
    ]
    lookup = {entry.lower() for entry in known_skills}

    # Pass 1: single tokens that are themselves catalogue entries.
    found = [token for token in word_tokenize(text) if token.lower() in lookup]

    # Pass 2: "<label> ●●●○○"-style lines; only the label is compared.
    for label, _symbols in re.findall(r"([A-Za-z\s,]+)\s*([○●]{1,5})", text):
        label = label.strip()
        if label.lower() in lookup:
            found.append(label)

    # Pass 3: comma-separated lists, which is how multi-word skills get caught.
    for run in re.findall(r"([A-Za-z\s\+\#\&]+(?:,\s*[A-Za-z\s\+\#\&]+)*)", text):
        for piece in run.split(','):
            piece = piece.strip()
            if piece.lower() in lookup:
                found.append(piece)

    deduped = list(set(found))
    if not deduped:
        return ["Not Found"]
    return deduped


# Function to process all PDFs in a folder and save the extracted info in a CSV
def process_resumes(folder_path, output_csv_path):
    """Run every extractor over each '.pdf' file in a folder and save a CSV.

    Args:
        folder_path: Directory whose '.pdf' files (by exact lowercase
            suffix) are processed.
        output_csv_path: Destination CSV; an existing file at this path is
            removed before the new one is written.
    """
    rows = []
    for filename in os.listdir(folder_path):
        if not filename.endswith('.pdf'):
            continue

        pdf_path = os.path.join(folder_path, filename)
        text = extract_text_from_pdf(pdf_path)

        # One CSV row per resume; links come from the PDF annotations, every
        # other field from the extracted text.
        rows.append({
            'File Name': filename,
            'Name': extract_name(text),
            'Profile Summary': extract_profile_summary(text),
            'Email': extract_email(text),
            'Phone': extract_phone_number(text),
            'Address': extract_address(text),
            'Links': extract_links_from_pdf(pdf_path),
            'Experience': extract_experience(text),
            'Education': extract_education(text),
            'Languages': extract_languages(text),
            'Certificates': extract_certificates(text),
            'Skills': extract_skills(text)
        })

    frame = pd.DataFrame(rows)

    if os.path.exists(output_csv_path):
        os.remove(output_csv_path)
    frame.to_csv(output_csv_path, index=False)
    print(f'Data saved to {output_csv_path}')


# Script entry point: parse every resume PDF in the hard-coded local folder
# and write the results to 'chatgpt1.csv' (a relative path).
process_resumes('C:\\Users\\santhoshs.s\\jupyter\\resumes\\data\\data\\BPO\\bb', 'chatgpt1.csv')

# Output: Data saved to chatgpt1.csv
