In [5]:
import fitz  # PyMuPDF for reading PDFs
import re
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize

# Function to extract text from PDF file and convert it to lowercase
def extract_text_from_pdf(pdf_path):
    with fitz.open(pdf_path) as doc:
        text = ""
        for page in doc:
            text += page.get_text()
    return text.lower()  # Convert the entire text to lowercase

# Function to extract the candidate's name
def extract_name(text):
    name_patterns = [
        r"[A-Z][a-z]+ [A-Z][a-z]+",  # Two words with capital letters (first name + last name)
        r"[A-Z][a-z]+-\w+"  # Hyphenated names like "Anne-Marie"
    ]

    for pattern in name_patterns:
        match = re.search(pattern, text, re.IGNORECASE)  # Case-insensitive matching
        if match:
            return match.group()

    # If regular expressions fail, try using NLTK for Named Entity Recognition (NER)
    try:
        tokens = word_tokenize(text)
        tagged_text = nltk.pos_tag(tokens)
        for word, tag in tagged_text:
            if tag == 'NNP':  # Proper noun, typically used for names
                return word
    except LookupError:
        print("NLTK data not downloaded. Install NLTK and run 'nltk.download()' to download required resources.")

    return "Not Found"

# Function to extract the profile summary
def extract_profile_summary(text):
    keywords = [
        "summary", "profile", "objective", "professional summary",
        "career summary", "executive summary", "personal summary",
        "summary of qualifications", "overview", "Profile"
    ]
    
    lines = text.splitlines()  # Split the text into lines
    summary_start = None
    keyword_patterns = [re.compile(r'\s*'.join(list(keyword.lower()))) for keyword in keywords]

    for pattern in keyword_patterns:
        for idx, line in enumerate(lines):
            if pattern.search(line.lower()):
                summary_start = idx
                break
        if summary_start is not None:
            break

    if summary_start is not None:
        extracted_lines = []
        for i in range(summary_start + 1, len(lines)):
            line = lines[i].strip()
            if line == "" or line.startswith("*") or line.startswith("#"):
                continue
            if line.isupper():
                break
            extracted_lines.append(line)
        summary_result = "\n".join(extracted_lines).strip()
        return summary_result if summary_result else "Not Found"
    
    return "Not Found"

# Function to extract email address
def extract_email(text):
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    match = re.search(email_pattern, text)
    return match.group(0) if match else "Not Found"

# Function to extract the phone number
def extract_phone_number(text):
    # Regular expression pattern for phone numbers, allowing symbols like (), -, and spaces
    phone_pattern = r'\b(?:\+?(\d{1,3}))?[-. ()]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})\b'
    match = re.search(phone_pattern, text)
    
    # If a match is found, remove all non-digit characters to get a clean number
    if match:
        # Combine all groups into a single string and remove any non-numeric characters
        raw_number = match.group(0)
        clean_number = re.sub(r'\D', '', raw_number)  # Remove all non-digit characters
        return clean_number
    
    return "Not Found"

# Function to extract address
def extract_address(text):
    address_keywords = ['address', 'street', 'city', 'state', 'zip', 'postal']
    lines = text.split('\n')
    for line in lines:
        if any(keyword in line.lower() for keyword in address_keywords):
            return line.strip()
    return "Not Found"

def extract_links(text):
    """
    Extracts hyperlinks from text, including standard URLs, HTML-like links, and markdown links, and saves them in separate columns based on their type.

    Args:
        text (str): The text from which to extract links.

    Returns:
        dict: A dictionary containing separate lists for LinkedIn, GitHub, email, Kaggle, portfolio, Facebook, Instagram, Git, Monster, and other links.
    """

    # Standard URLs
    standard_link_pattern = r'https?://[^\s]+'

    # LinkedIn links
    linkedin_link_pattern = r'https?://www\.linkedin\.com/in/[a-zA-Z0-9-]*'

    # GitHub links
    github_link_pattern = r'https?://github\.com/[a-zA-Z0-9-]*'

    # Email links
    email_link_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

    # Kaggle links
    kaggle_link_pattern = r'https?://www\.kaggle\.com/([a-zA-Z0-9-]*|profile/[a-zA-Z0-9-]*)'

    # Portfolio links
    portfolio_link_pattern = r'https?://(www\.)?[a-zA-Z0-9-]*\.(com|net|org|io|in)/?'

    # Facebook links
    facebook_link_pattern = r'https?://www\.facebook\.com/([a-zA-Z0-9-]*|profile\.php\?id=[0-9]+)'

    # Instagram links
    instagram_link_pattern = r'https?://www\.instagram\.com/([a-zA-Z0-9_]*)'

    # Git links
    git_link_pattern = r'https?://(git|gitlab|bitbucket)\.(com|org)/[a-zA-Z0-9-]*/[a-zA-Z0-9-]*'

    # Monster links
    monster_link_pattern = r'https?://www\.monster\.com/jobs/search/?'

    # HTML links
    html_link_pattern = r'<a\s+(?:[^>]*?\s+)?href=["\'](https?://[^\s"\']+)["\']'

    # Markdown links
    markdown_link_pattern = r'\[.*?\]\((https?://[^\s]+)\)'

    # Plain links like www.example.com or example.com
    plain_link_pattern = r'\b(?:www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:/[^\s]*)?\b'

    # Find all matches for each pattern
    standard_links = re.findall(standard_link_pattern, text)
    linkedin_links = re.findall(linkedin_link_pattern, text)
    github_links = re.findall(github_link_pattern, text)
    email_links = re.findall(email_link_pattern, text)
    kaggle_links = re.findall(kaggle_link_pattern, text)
    portfolio_links = re.findall(portfolio_link_pattern, text)
    facebook_links = re.findall(facebook_link_pattern, text)
    instagram_links = re.findall(instagram_link_pattern, text)
    git_links = re.findall(git_link_pattern, text)
    monster_links = re.findall(monster_link_pattern, text)
    html_links = re.findall(html_link_pattern, text)
    markdown_links = re.findall(markdown_link_pattern, text)
    plain_links = re.findall(plain_link_pattern, text)

    # Combine all found links into a single list
    hyperlinks = standard_links + html_links + markdown_links + plain_links

    # Remove duplicates by converting to a set, then back to a list
    unique_links = list(set(hyperlinks))

    # Create a dictionary to store links by type
    links_dict = {
        'LinkedIn': linkedin_links,
        'GitHub': github_links,
        'Email': email_links,
        'Kaggle': kaggle_links,
        'Portfolio': portfolio_links,
        'Facebook': facebook_links,
        'Instagram': instagram_links,
        'Git': git_links,
        'Monster': monster_links,
        'Other': [link for link in unique_links if link not in linkedin_links and link not in github_links and link not in email_links and link not in kaggle_links and link not in portfolio_links and link not in facebook_links and link not in instagram_links and link not in git_links and link not in monster_links]
    }

    return links_dict

# Updated Function to extract experience details
def extract_experience(text):
    experience_keywords = [
        "work history", "employment history", "work experience",
        "professional experience", "career summary", "professional background",
        "job experience", "internships", "relevant experience",
        "contract work", "military experience", "volunteer work"
    ]
    
    date_patterns = [
        r"\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s*\d{4}\s*[-–—to]+\s*(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s*\d{4}",
        r"\b\d{4}\s*[-–—to]+\s*\d{4}\b",
        r"\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s*\d{4}\s*[-–—to]+\s*Present",
        r"\b\d{4}\s*[-–—to]+\s*Present\b"
    ]

    job_title_keywords = [
        "manager", "director", "engineer", "analyst", "consultant",
        "assistant", "coordinator", "specialist", "developer", "designer",
        "executive", "advisor", "technician", "officer", "intern", "trainee"
    ]

    company_patterns = r"(at|with|for)\s+([A-Z][\w&,. ]+)"
    location_patterns = [
        r"\bin\s+([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)\b",
        r"\b([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)\s+(USA|Inc|Ltd|LLC|Co)\b"
    ]

    location_regex = '|'.join(location_patterns)
    experience_start = re.search('|'.join(experience_keywords), text, re.IGNORECASE)
    if not experience_start:
        return []

    content = text[experience_start.start():]
    dates = re.findall('|'.join(date_patterns), content)
    job_entries = re.split('|'.join(date_patterns), content)
    experiences = []

    for date, job in zip(dates, job_entries):
        job_title_match = re.search('|'.join(job_title_keywords), job, re.IGNORECASE)
        job_title = job_title_match.group(0) if job_title_match else ''
        company_match = re.search(company_patterns, job)
        company = company_match.group(2) if company_match else ''
        location_match = re.search(location_regex, job)
        location = location_match.group(1) if location_match else ''
        responsibilities = re.findall(r"•\s*.*|(?<=\n)-\s*.*", job)
        responsibilities_text = ' '.join(responsibilities).strip()

        experiences.append({
            "Company": company,
            "Job Title": job_title,
            "Location": location,
            "Dates": date,
            "Responsibilities": responsibilities_text
        })

    return experiences

import re

def extract_education(text):
    education_list = []
    
    # Define regex patterns for degrees, universities, schools, CGPA/SGPA/percentage, and dates
    degree_patterns = [
        r"(Bachelor|Master|Associate|Doctorate|Ph\.?D|Diploma|Certificate)\s*(of\s*(Arts|Science|Business|Engineering|Technology|Management|Education|Commerce|Law|Medicine|Fine Arts|Social Work|Computer Science|Psychology|Nursing|Physics|Chemistry|Biology))?",
        r"(BA|BS|BSc|MA|MS|MBA|MFA|LLB|LLM|PhD|M\.Ed|B\.Ed|BBA)"
    ]
    university_patterns = r"[A-Z][a-zA-Z\s&]*\s*(University|Institute|College|Academy|Polytechnic)"
    school_patterns = r"[A-Z][a-zA-Z\s&]*\s*(High School|Secondary School|Senior School|School)"
    time_period_pattern = r"((?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}\s*[-–to]+\s*(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)?[a-z]*\s*\d{4})"
    year_pattern = r"\b(19|20)\d{2}\b"
    score_patterns = [
        r"(CGPA|SGPA)\s*[:\-]?\s*(\d\.\d{1,2})",  # CGPA/SGPA with possible decimals
        r"(\d{1,2}\.\d{1,2})\s*\/\s*10",  # e.g., "8.5/10"
        r"(\d{1,2})\s*%",  # Percentage
    ]
    
    # Find the lines that might contain education details
    lines = text.splitlines()
    for line in lines:
        line = line.strip()
        if not line:
            continue  # Skip empty lines

        # Extract degree details
        degree_match = None
        for pattern in degree_patterns:
            degree_match = re.search(pattern, line, re.IGNORECASE)
            if degree_match:
                degree = degree_match.group(0)
                break
        degree = degree if degree_match else "Not Found"
        
        # Extract university details
        university_match = re.search(university_patterns, line, re.IGNORECASE)
        university = university_match.group(0) if university_match else "Not Found"
        
        # Extract school details if no university found
        school_match = re.search(school_patterns, line, re.IGNORECASE)
        school = school_match.group(0) if school_match else "Not Found"
        
        # Extract time period (e.g., "August 2016 - May 2020") or standalone year
        time_period_match = re.search(time_period_pattern, line, re.IGNORECASE)
        time_period = time_period_match.group(0) if time_period_match else "Not Found"
        
        # Extract standalone year if no time period was found
        if time_period == "Not Found":
            year_match = re.search(year_pattern, line)
            time_period = year_match.group(0) if year_match else "Not Found"
        
        # Extract CGPA, SGPA, or percentage if present
        score = "Not Found"
        for pattern in score_patterns:
            score_match = re.search(pattern, line, re.IGNORECASE)
            if score_match:
                score = score_match.group(0)
                break
        
        # Only add an entry if it contains more than just "Not Found"
        if degree != "Not Found" or university != "Not Found" or school != "Not Found" or time_period != "Not Found" or score != "Not Found":
            education_list.append({
                "Degree": degree,
                "University": university,
                "School": school,
                "Year/Time Period": time_period,
                "Score": score
            })
    
    return education_list


# Function to extract languages
def extract_languages(text):
    language_keywords = ["languages", "skills", "proficiency"]
    for keyword in language_keywords:
        language_start = text.find(keyword)
        if language_start != -1:
            lines = text.splitlines()[language_start:]
            languages = []
            for line in lines:
                if line.strip() == "":
                    continue
                language_match = re.search(r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)?", line, re.IGNORECASE)
                if language_match:
                    languages.append(language_match.group())
            return languages
    return []

# Function to extract certificates
def extract_certificates(text):
    certificate_keywords = ["certifications", "accreditations"]
    for keyword in certificate_keywords:
        certificate_start = text.find(keyword)
        if certificate_start != -1:
            lines = text.splitlines()[certificate_start:]
            certificates = []
            for line in lines:
                if line.strip() == "":
                    continue
                certificate_match = re.search(r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)?", line, re.IGNORECASE)
                if certificate_match:
                    certificates.append(certificate_match.group())
            return certificates
    return []
def extract_skills(text):
    # Predefined list of common technical and soft skills
    skills_list = [
        "C", "C++", "Python", "Java", "SQL", "JavaScript", "HTML", "CSS", "React", "Angular",
        "Machine Learning", "Deep Learning", "Data Structures", "Data Analysis", "Project Management",
        "Leadership", "Communication", "Teamwork", "Problem Solving", "Critical Thinking",
        "Agile", "Scrum", "AWS", "Azure", "Docker", "Kubernetes", "DevOps", "Data Science",
        "R", "Excel", "Power BI", "Tableau", "Automation", "Selenium", "REST APIs", 
        "Networking", "Cybersecurity", "Cloud Computing", "Blockchain", "UI/UX Design",
        "Customer Service", "Salesforce", "SAP", "Digital Marketing", "SEO", "Content Writing"
    ]
    
    # Convert the skills list to lowercase for case-insensitive matching
    skills_set = set(skill.lower() for skill in skills_list)
    
    # Tokenize the text into words
    words = word_tokenize(text)
    
    # Extract the skills by checking if any word in the text matches the skills list
    extracted_skills = [word for word in words if word.lower() in skills_set]
    
    # Extract skill lines that might include proficiency symbols (e.g., "○ ○ ○ ○ ○")
    proficiency_pattern = r"([A-Za-z\s,]+)\s*([○●]{1,5})"
    proficiency_matches = re.findall(proficiency_pattern, text)
    
    # Add skills from proficiency matches
    for match in proficiency_matches:
        skill = match[0].strip()
        if skill.lower() in skills_set:
            extracted_skills.append(skill)

    # Extract skills listed in a comma-separated format
    skills_pattern = r"([A-Za-z\s\+\#\&]+(?:,\s*[A-Za-z\s\+\#\&]+)*)"
    skills_matches = re.findall(skills_pattern, text)
    for match in skills_matches:
        for skill in match.split(','):
            cleaned_skill = skill.strip()
            if cleaned_skill.lower() in skills_set:
                extracted_skills.append(cleaned_skill)

    # Remove duplicates by converting the list to a set and back to a list
    unique_skills = list(set(extracted_skills))
    
    return unique_skills if unique_skills else ["Not Found"]


# Function to process all PDFs in a folder and save the extracted info in a CSV
def process_resumes(folder_path, output_csv_path):
    data = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            text = extract_text_from_pdf(pdf_path)

            name = extract_name(text)
            profile_summary = extract_profile_summary(text)
            email = extract_email(text)
            phone = extract_phone_number(text)
            address = extract_address(text)
            links = extract_links(text)
            experience = extract_experience(text)
            education = extract_education(text)
            languages = extract_languages(text)
            certificates = extract_certificates(text)
            skills = extract_skills(text)

            data.append({
                'File Name': filename,
                'Name': name,
                'Profile Summary': profile_summary,
                'Email': email,
                'Phone': phone,
                'Address': address,
                'Links': links,
                'Experience': experience,
                'Education': education,
                'Languages': languages,
                'Certificates': certificates,
                'Skills': skills
            })

    df = pd.DataFrame(data)

    if os.path.exists(output_csv_path):
        os.remove(output_csv_path)
    df.to_csv(output_csv_path, index=False)
    print(f'Data saved to {output_csv_path}')

process_resumes('C:\\Users\\santhoshs.s\\jupyter\\resumes\\data\\data\\BPO\\bb', 'chatgpt.csv')

Data saved to chatgpt.csv


In [6]:
import fitz  # PyMuPDF for reading PDFs
import re
from nltk.tokenize import word_tokenize

# Function to extract text from PDF file and convert it to lowercase
def extract_text_from_pdf(pdf_path):
    with fitz.open(pdf_path) as doc:
        text = ""
        for page in doc:
            text += page.get_text()
    return text.lower()  # Convert the entire text to lowercase

# Function to extract skills from the text
def extract_skills(text):
    # Predefined list of common technical and soft skills
    skills_list = [
        "C", "C++", "Python", "Java", "SQL", "JavaScript", "HTML", "CSS", "React", "Angular",
        "Machine Learning", "Deep Learning", "Data Structures", "Data Analysis", "Project Management",
        "Leadership", "Communication", "Teamwork", "Problem Solving", "Critical Thinking",
        "Agile", "Scrum", "AWS", "Azure", "Docker", "Kubernetes", "DevOps", "Data Science",
        "R", "Excel", "Power BI", "Tableau", "Automation", "Selenium", "REST APIs", 
        "Networking", "Cybersecurity", "Cloud Computing", "Blockchain", "UI/UX Design",
        "Customer Service", "Salesforce", "SAP", "Digital Marketing", "SEO", "Content Writing"
    ]
    
    # Convert the skills list to lowercase for case-insensitive matching
    skills_set = set(skill.lower() for skill in skills_list)
    
    # Tokenize the text into words
    words = word_tokenize(text)
    
    # Extract the skills by checking if any word in the text matches the skills list
    extracted_skills = [word for word in words if word.lower() in skills_set]
    
    # Extract skill lines that might include proficiency symbols (e.g., "○ ○ ○ ○ ○")
    proficiency_pattern = r"([A-Za-z\s,]+)\s*([○●]{1,5})"
    proficiency_matches = re.findall(proficiency_pattern, text)
    
    # Add skills from proficiency matches
    for match in proficiency_matches:
        skill = match[0].strip()
        if skill.lower() in skills_set:
            extracted_skills.append(skill)

    # Extract skills listed in a comma-separated format
    skills_pattern = r"([A-Za-z\s\+\#\&]+(?:,\s*[A-Za-z\s\+\#\&]+)*)"
    skills_matches = re.findall(skills_pattern, text)
    for match in skills_matches:
        for skill in match.split(','):
            cleaned_skill = skill.strip()
            if cleaned_skill.lower() in skills_set:
                extracted_skills.append(cleaned_skill)

    # Remove duplicates by converting the list to a set and back to a list
    unique_skills = list(set(extracted_skills))
    
    return unique_skills if unique_skills else ["Not Found"]

# Function to parse a PDF file and display extracted skills
def parse_pdf_and_extract_skills(pdf_path):
    # Extract text from the given PDF file
    text = extract_text_from_pdf(pdf_path)
    
    # Extract skills from the text
    skills = extract_skills(text)
    
    # Display the extracted skills
    print(f"Extracted Skills from {pdf_path}:")
    print(skills)

# Example usage:
# Replace 'sample_resume.pdf' with the path to your PDF file
pdf_path = 'C:/Users/santhoshs.s/jupyter/resumes/data/data/BPO/bb/Amsterdam-Modern-Resume-Template.pdf'
parse_pdf_and_extract_skills(pdf_path)


Extracted Skills from C:/Users/santhoshs.s/jupyter/resumes/data/data/BPO/bb/Amsterdam-Modern-Resume-Template.pdf:
['c', 'r']


In [7]:
import fitz  # PyMuPDF for reading PDFs

# Function to extract text from PDF file and convert it to lowercase
def extract_text_from_pdf(pdf_path):
    with fitz.open(pdf_path) as doc:
        text = ""
        for page in doc:
            text += page.get_text()
    return text.lower()  # Convert the entire text to lowercase

# Path to the provided PDF file
pdf_path = 'C:/Users/santhoshs.s/jupyter/resumes/data/data/BPO/bb/Amsterdam-Modern-Resume-Template.pdf'

# Extract text from the PDF file
extracted_text = extract_text_from_pdf(pdf_path)

# Display the extracted text
extracted_text[:1000]  # Displaying only the first 1000 characters for brevity


'julie monroe\nnutrition consultant\nd e t a i l s\naddress\n1515 pacific ave\nlos angeles, ca 90291\nunited states\nphone\n3868683442\nemail\nemail@email.com\nplace of birth\nsan antonio\ndriving license\nfull\nl i n k s\nlinkedin\npinterest\nresume templates\nbuild this template\ns k i l l s\nfood preparation\nkitchen maintenance\nkitchen equipment \noperation\nfood sanitation\nnutrition\nh o b b i e s\nsoccer, rugby, tennis\nl a n g u a g e s\nenglish\np r o f i l e\ntalented nutrition consultant with three years of experience. skilled in nutrition \nand food preparation and looking to deliver healthy, delicious meals at woodacre \nnursing home. at 7-star senior living, cheerfully cleaned kitchens and prepared \nthree meals daily for 120+ residents. received a promotion to head nutrition \nconsultant within five months of hiring due to efficiency and interpersonal skills.\ne m p l o y m e n t  h i s t o r y\nnutritional consultant (part-time) , wic\nport washington\njan 2021 — prese

In [2]:
import fitz  # PyMuPDF for reading PDFs
import re
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize

# Predefined lists and compiled regex patterns
name_patterns = [
    re.compile(r"[A-Z][a-z]+ [A-Z][a-z]+"),  # First + Last name
    re.compile(r"[A-Z][a-z]+-\w+")  # Hyphenated names
]

email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
phone_pattern = re.compile(r'\b(?:\+?(\d{1,3}))?[-. ()]*(\d{3})[-. )]*(\d{3})[-. ]*(\d{4})\b')

address_keywords = ['address', 'street', 'city', 'state', 'zip', 'postal']

keywords = [
    "summary", "profile", "objective", "professional summary",
    "career summary", "executive summary", "personal summary",
    "summary of qualifications", "overview", "Profile"
]

experience_keywords = [
    "work history", "employment history", "work experience",
    "professional experience", "career summary", "professional background",
    "job experience", "internships", "relevant experience",
    "contract work", "military experience", "volunteer work"
]

date_patterns = [
    re.compile(r"\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s*\d{4}\s*[-–—to]+\s*(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s*\d{4}"),
    re.compile(r"\b\d{4}\s*[-–—to]+\s*\d{4}\b"),
    re.compile(r"\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s*\d{4}\s*[-–—to]+\s*Present"),
    re.compile(r"\b\d{4}\s*[-–—to]+\s*Present\b")
]

# Function to extract text from PDF file
def extract_text_from_pdf(pdf_path):
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc).lower()

# Function to extract candidate's name
def extract_name(text):
    for pattern in name_patterns:
        match = pattern.search(text)
        if match:
            return match.group()

    try:
        tokens = word_tokenize(text)
        tagged_text = nltk.pos_tag(tokens)
        for word, tag in tagged_text:
            if tag == 'NNP':  # Proper noun, typically used for names
                return word
    except LookupError:
        print("NLTK data not downloaded. Install NLTK and run 'nltk.download()' to download required resources.")

    return "Not Found"

# Function to extract profile summary
def extract_profile_summary(text):
    lines = text.splitlines()
    for idx, line in enumerate(lines):
        if any(keyword.lower() in line.lower() for keyword in keywords):
            summary_start = idx + 1
            break
    else:
        return "Not Found"

    summary_lines = []
    for line in lines[summary_start:]:
        line = line.strip()
        if line == "" or line.startswith(("*", "#")) or line.isupper():
            break
        summary_lines.append(line)

    return "\n".join(summary_lines).strip() or "Not Found"

# Function to extract email address
def extract_email(text):
    match = email_pattern.search(text)
    return match.group(0) if match else "Not Found"

# Function to extract phone number
def extract_phone_number(text):
    match = phone_pattern.search(text)
    return re.sub(r'\D', '', match.group(0)) if match else "Not Found"

# Function to extract address
def extract_address(text):
    lines = text.split('\n')
    for line in lines:
        if any(keyword in line.lower() for keyword in address_keywords):
            return line.strip()
    return "Not Found"

# Function to extract links
def extract_links(text):
    link_patterns = {
        'LinkedIn': r'https?://www\.linkedin\.com/in/[a-zA-Z0-9-]*',
        'GitHub': r'https?://github\.com/[a-zA-Z0-9-]*',
        'Kaggle': r'https?://www\.kaggle\.com/([a-zA-Z0-9-]*|profile/[a-zA-Z0-9-]*)',
        'Portfolio': r'https?://(www\.)?[a-zA-Z0-9-]*\.(com|net|org|io|in)/?',
        'Facebook': r'https?://www\.facebook\.com/([a-zA-Z0-9-]*|profile\.php\?id=\d+)',
        'Instagram': r'https?://www\.instagram\.com/([a-zA-Z0-9_]*)',
        'Git': r'https?://(git|gitlab|bitbucket)\.(com|org)/[a-zA-Z0-9-]*/[a-zA-Z0-9-]*',
        'Monster': r'https?://www\.monster\.com/jobs/search/?',
        'Email': r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}',
        'Standard URLs': r'https?://[^\s]+',
        'HTML Links': r'<a\s+(?:[^>]*?\s+)?href=["\'](https?://[^\s"\']+)["\']',
        'Markdown Links': r'\[.*?\]\((https?://[^\s]+)\)',
        'Plain Links': r'\b(?:www\.)?[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:/[^\s]*)?\b'
    }

    links_dict = {key: re.findall(pattern, text) for key, pattern in link_patterns.items()}

    # Combine all found links into a single list and remove duplicates
    all_links = set()
    for links in links_dict.values():
        all_links.update(links)

    links_dict['Other'] = list(all_links - set(links_dict['LinkedIn']) - set(links_dict['GitHub']) - set(links_dict['Email']))
    
    return links_dict

# Function to extract experience details
def extract_experience(text):
    if not re.search('|'.join(experience_keywords), text, re.IGNORECASE):
        return []

    content = text
    dates = [date_pattern.findall(content) for date_pattern in date_patterns]
    dates = [date for sublist in dates for date in sublist]  # Flatten the list

    job_entries = re.split('|'.join(date_patterns), content)[1:]  # Split content and skip the first part
    experiences = []

    for date, job in zip(dates, job_entries):
        job_title = next((title for title in re.findall(r'\b\w+\b', job) if title.lower() in job_title_keywords), '')
        company_match = re.search(r"(?:at|with|for)\s+([A-Z][\w&,. ]+)", job)
        location_match = re.search(r"\bin\s+([A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*)\b", job)

        experiences.append({
            "Company": company_match.group(1) if company_match else '',
            "Job Title": job_title,
            "Location": location_match.group(1) if location_match else '',
            "Dates": date,
            "Responsibilities": ' '.join(re.findall(r"•\s*.*|(?<=\n)-\s*.*", job)).strip()
        })

    return experiences

# Function to extract education details
def extract_education(text):
    education_list = []
    
    degree_patterns = re.compile(r"(Bachelor|Master|Associate|Doctorate|Ph\.?D|Diploma|Certificate)\s*(of\s*(Arts|Science|Business|Engineering|Technology|Management|Education|Commerce|Law|Medicine|Fine Arts|Social Work|Computer Science|Psychology|Nursing|Physics|Chemistry|Biology))?|(?:BA|BS|BSc|MA|MS|MBA|MFA|LLB|LLM|PhD|M\.Ed|B\.Ed|BBA)")
    university_patterns = re.compile(r"[A-Z][a-zA-Z\s&]*\s*(University|Institute|College|Academy|Polytechnic)")
    time_period_pattern = re.compile(r"\b\d{4}\s*[-–—to]+\s*(?:Present|\d{4})\b")

    for line in text.splitlines():
        if degree_patterns.search(line):
            degree = degree_patterns.search(line).group()
            university = university_patterns.search(line)
            time_period = time_period_pattern.search(line)

            education_list.append({
                'Degree': degree,
                'University': university.group() if university else 'Not Found',
                'Time Period': time_period.group() if time_period else 'Not Found'
            })

    return education_list

# Main function to extract all details from the resume
def extract_resume_details(resume_path):
    try:
        text = extract_text_from_pdf(resume_path)
        return {
            "Name": extract_name(text),
            "Profile Summary": extract_profile_summary(text),
            "Email": extract_email(text),
            "Phone": extract_phone_number(text),
            "Address": extract_address(text),
            "Links": extract_links(text),
            "Experience": extract_experience(text),
            "Education": extract_education(text)
        }
    except Exception as e:
        print(f"Error processing {resume_path}: {e}")
        return None

# Example usage
if __name__ == "__main__":
    resume_path = "C:\\Users\\santhoshs.s\\jupyter\\resumes\\data\data\\BPO\\bb"
    details = extract_resume_details(resume_path)
    print(details)


  resume_path = "C:\\Users\\santhoshs.s\\jupyter\\resumes\\data\data\\BPO\\bb"


Error processing C:\Users\santhoshs.s\jupyter\resumes\data\data\BPO\bb: 'C:\Users\santhoshs.s\jupyter\resumes\data\data\BPO\bb' is no file
None
