In [1]:
%pip install spacy
!python -m spacy download en_core_web_sm



Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -------- ------------------------------- 2.6/12.8 MB 15.1 MB/s eta 0:00:01
     ---------------- ----------------------- 5.2/12.8 MB 13.3 MB/s eta 0:00:01
     ----------------------- ---------------- 7.6/12.8 MB 12.4 MB/s eta 0:00:01
     ------------------------------- ------- 10.5/12.8 MB 12.6 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 12.5 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 12.0 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
import os
import re
import pdfplumber
import docx
import pandas as pd
import spacy

In [5]:
nlp = spacy.load("en_core_web_sm")


In [6]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

def extract_text_from_docx(docx_path):
    doc = docx.Document(docx_path)
    text = "\n".join([para.text for para in doc.paragraphs])
    return text


In [7]:
def extract_info_nlp(text):
    doc = nlp(text)
    
    # Extracting names
    names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    
    # Extracting email
    email_pattern = r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'
    emails = re.findall(email_pattern, text)
    
    # Extracting phone numbers
    phone_pattern = r'(\+?\d{1,3}[-.\s]?(\(?\d{1,4}?\)?[-.\s]?)?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9})'
    phones = re.findall(phone_pattern, text)

    # Extracting skills
    skills = []  # Placeholder for skills extraction; this could be improved with more context
    
    # You might consider using a predefined list of skills to match against or using additional NER
    skill_keywords = ['Python', 'Java', 'SQL', 'Machine Learning', 'Data Analysis']  # Extend as needed
    for keyword in skill_keywords:
        if keyword.lower() in text.lower():
            skills.append(keyword)

    return {
        'Name': names[0] if names else None,
        'Phone': phones[0][0] if phones else None,
        'Email': emails[0] if emails else None,
        'Skills': skills if skills else None,
    }


In [11]:
from pdfplumber.exception

def process_resumes_nlp(folder_path):
    data = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        
        if filename.endswith('.pdf'):
            try:
                text = extract_text_from_pdf(file_path)
            except PDFSyntaxError:
                print(f"Skipping invalid PDF file: {filename}")
                continue
        elif filename.endswith('.docx'):
            text = extract_text_from_docx(file_path)
        else:
            continue
        
        extracted_info = extract_info_nlp(text)
        data.append(extracted_info)
    
    return pd.DataFrame(data)



ModuleNotFoundError: No module named 'pdfplumber.exceptions'

In [9]:
folder_path = 'D:\\Project\\New folder (2)\\resumes'  # Update this to your folder path
resumes_df = process_resumes_nlp(folder_path)
resumes_df.to_csv('extracted_resumes_nlp.csv', index=False)  # Save extracted data to CSV


PDFSyntaxError: No /Root object! - Is this really a PDF?

In [12]:
!pip list | grep pdfplumber


'grep' is not recognized as an internal or external command,
operable program or batch file.


In [13]:
!pip list


Package                   Version
------------------------- --------------
annotated-types           0.7.0
anyio                     4.5.0
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
asttokens                 2.4.1
async-lru                 2.0.4
attrs                     24.2.0
babel                     2.16.0
beautifulsoup4            4.12.3
bleach                    6.1.0
blinker                   1.8.2
blis                      0.7.11
catalogue                 2.0.10
certifi                   2024.8.30
cffi                      1.17.1
chardet                   5.2.0
charset-normalizer        3.3.2
click                     8.1.7
cloudpathlib              0.19.0
colorama                  0.4.6
comm                      0.2.2
confection                0.1.5
cryptography              43.0.1
cymem                     2.0.8
debugpy                   1.8.5
decorator                 5.1.1
defusedxml                0.7.1
docx2txt       

In [1]:
import os
import re
import pdfplumber
import docx
import pandas as pd
import spacy
import fitz  # PyMuPDF

# Load the spaCy model for NLP tasks
nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf_mupdf(pdf_path):
    text = ""
    try:
        doc = fitz.open(pdf_path)
        for page in doc:
            text += page.get_text()
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return None
    return text

def extract_text_from_docx(docx_path):
    text = ""
    try:
        doc = docx.Document(docx_path)
        for para in doc.paragraphs:
            text += para.text + "\n"
    except Exception as e:
        print(f"Error extracting text from {docx_path}: {e}")
    return text

def extract_phone_email(text):
    phone_pattern = r'(\+?\d[\d -]{7,}\d)'  # Match phone numbers
    email_pattern = r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'  # Match email addresses
    
    phone_numbers = re.findall(phone_pattern, text)
    emails = re.findall(email_pattern, text)
    
    return phone_numbers, emails

def extract_address(text):
    address_pattern = r'([\w\s,]+(?:\s*dt\s*[\w\s]+)?(?:\s*tk\s*[\w\s]+)?(?:\s*st\s*[\w\s]+)?(?:\s+\d{6})?)'
    addresses = re.findall(address_pattern, text, re.IGNORECASE)
    return addresses

def extract_skills(text):
    skills = [
        "Python", "Flask", "Django", "FastAPI", "Pandas", "NumPy", "SciPy", "Matplotlib", "Seaborn", 
        "TensorFlow", "Keras", "PyTorch", "Scikit-learn", "NLTK", "SpaCy", "SQLAlchemy", "Celery",
        "Java", "Spring", "Spring Boot", "Hibernate", "Maven", "Gradle", "JUnit", "JSP", "JSF", 
        "JavaFX", "Swing", "JDBC", "Apache Camel", "Kotlin",
        "JavaScript", "Node.js", "React", "Angular", "Vue.js", "Next.js", "Express.js", "TypeScript", 
        "Redux", "jQuery", "Electron", "Gatsby", "ES6", "Svelte",
        "C++", "Boost", "Qt", "OpenCV", "STL", "CUDA", "OpenMP", "CMake", "GTest", 
        "C#", ".NET", "ASP.NET", "Entity Framework", "Blazor", "Xamarin", "Unity", "WPF", "WinForms", 
        "LINQ", "Razor Pages", "MVC", "NUnit", "Azure Functions", "SignalR",
        "SQL", "MySQL", "PostgreSQL", "SQLite", "MongoDB", "Redis", "Oracle", "Cassandra", "Elasticsearch", 
        "Microsoft SQL Server", "PL/SQL", "T-SQL",
        "Machine Learning", "Deep Learning", "NLP", "BERT", "Reinforcement Learning", "Computer Vision", 
        "XGBoost", "LightGBM", "CatBoost"
    ]
    extracted_skills = [skill for skill in skills if skill.lower() in text.lower()]
    return extracted_skills

def extract_name(text, filename):
    normalized_filename = os.path.splitext(filename)[0].lower()
    lines = text.split('\n')
    skills = extract_skills(text)
    skills_pattern = r'|'.join(map(re.escape, skills))

    # Define relevant patterns
    name_patterns = [
        r'^[A-Z][a-z]+\s[A-Z][a-z]+',  # First Last
        r'[A-Z]\.\s[A-Z][a-z]+',  # Initial. Last
        r'[A-Z][a-z]+\s[A-Z]\.'  # First Initial.
    ]
    
    job_title_pattern = r'(?i)\b(Engineer|Developer|Manager|Analyst|Specialist|Consultant|Designer|Lead|Officer)\b'
    phone_pattern = r'(\+?\d[\d -]{7,}\d)'
    email_pattern = r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'

    for line in lines:
        line = line.strip()
        # Check if the line contains a valid name
        if any(re.search(pattern, line) for pattern in name_patterns) and \
           not re.search(email_pattern, line) and \
           not re.search(phone_pattern, line) and \
           not re.search(job_title_pattern, line):
            return line

    # Check if the normalized filename is part of the text
    if normalized_filename in text.lower():
        return normalized_filename

    return None

def process_resumes_nlp(folder_path):
    data = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            text = extract_text_from_pdf_mupdf(os.path.join(folder_path, filename))
        elif filename.endswith('.docx'):
            text = extract_text_from_docx(os.path.join(folder_path, filename))
        else:
            continue

        if text:  # Check if text extraction was successful
            name = extract_name(text, filename)
            phone_numbers, emails = extract_phone_email(text)
            phone = phone_numbers[0] if phone_numbers else None
            email = emails[0] if emails else None
            addresses = extract_address(text)

            data.append({
                'filename': filename,
                'name': name,
                'email': email,
                'phone': phone,
                'addresses': addresses,
                'text': text
            })

    return pd.DataFrame(data)

# Example usage
folder_path = 'D:\\Project\\New folder (2)\\resumes'
resumes_df = process_resumes_nlp(folder_path)
print(resumes_df)


                        filename  \
0                          A.pdf   
1                         A2.pdf   
2                Alex Rivera.pdf   
3   Alphine_Patrick_resume 1.pdf   
4     Alphine_Patrick_resume.pdf   
5                David White.pdf   
6              Emily Davis 1.pdf   
7                Emily Davis.pdf   
8              Emily Sanders.pdf   
9               Emily Taylor.pdf   
10               Ethan Clark.pdf   
11       HEPZI CHANDRA J (3).pdf   
12              Jane Smith 3.pdf   
13       Jesper_deni_Resume1.pdf   
14   Jesvin Nithish's Resume.pdf   
15                John Doe 1.pdf   
16                 John Doe1.pdf   
17               John Martin.pdf   
18      LALITH UIUX DESIGNER.pdf   
19                Liam Baker.pdf   
20             Michael Davis.pdf   
21         Michael Johnson 1.pdf   
22         Michael Johnson 2.pdf   
23         Michael Johnson 3.pdf   
24           Michael Johnson.pdf   
25          Michael Robinson.pdf   
26           Monish-K Resume

In [4]:
import os
import re
import spacy
from PyPDF2 import PdfReader

# Load the spaCy NLP model (English small model)
nlp = spacy.load('en_core_web_sm')

def clean_text(text):
    """Cleans up text by removing extra spaces, newlines, and unwanted characters."""
    return re.sub(r'\s+', ' ', text).strip()

def extract_name_from_filename(filename):
    """Extracts probable name from the filename if text-based name extraction fails."""
    name_parts = re.sub(r'[_\-\.]', ' ', filename).split()
    return ' '.join([word for word in name_parts if word.isalpha()])

def extract_name(text, filename=None):
    """Extracts the name from resume text or filename, handling underlined/formatted text."""
    cleaned_text = clean_text(text)

    # Define common regex patterns for detecting names
    name_patterns = [
        r'([A-Z][a-z]+(?: [A-Z][a-z]+)+)',  # Capitalized words pattern (First Last)
        r'([A-Z]+ [A-Z]+)',  # Uppercase names, often seen in underlined or header text
        r'([A-Z]\. [A-Z][a-z]+)',  # Patterns like "A. Lastname"
        r'([A-Z][a-z]+ [A-Z]\.)'  # Patterns like "Firstname L."
    ]

    # Try to match one of the name patterns
    for pattern in name_patterns:
        name_match = re.search(pattern, cleaned_text)
        if name_match:
            return name_match.group(0)

    # If no match is found, use spaCy's NLP for name entity recognition (NER)
    doc = nlp(cleaned_text)
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            return ent.text

    # If no name found in the text, fallback to the filename
    if filename:
        name_from_filename = extract_name_from_filename(filename)
        if name_from_filename:
            return name_from_filename
    
    return None  # No name could be extracted

def extract_email(text):
    """Extract email from the resume text."""
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    email_match = re.search(email_pattern, text)
    return email_match.group(0) if email_match else None

def extract_phone(text):
    """Extract phone number from the resume text."""
    phone_pattern = r'\+?\d[\d\s\-\(\)]{7,}\d'
    phone_match = re.search(phone_pattern, text)
    return phone_match.group(0) if phone_match else None

def extract_resume_text(file_path):
    """Reads the text from a PDF file."""
    text = ''
    with open(file_path, 'rb') as f:
        reader = PdfReader(f)
        for page_num in range(len(reader.pages)):
            text += reader.pages[page_num].extract_text()
    return text

def process_resumes_from_folder(folder_path):
    """Processes all the resumes from the specified folder."""
    resume_data_list = []
    
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            file_path = os.path.join(folder_path, filename)
            text = extract_resume_text(file_path)
            resume_data = {
                'filename': filename,
                'text': text
            }
            resume_data_list.append(resume_data)
    
    return resume_data_list

def extract_resume_data(resume):
    """Extracts name, email, phone, and other key details from a resume."""
    filename = resume.get('filename', None)
    text = resume.get('text', '')

    # Extract key information
    name = extract_name(text, filename)
    email = extract_email(text)
    phone = extract_phone(text)
    
    return {
        'filename': filename,
        'name': name,
        'email': email,
        'phone': phone,
    }

# Folder path where the resumes are located
folder_path = 'D:/Project/New folder (2)/resumes'

# Process resumes from the folder
resume_data_list = process_resumes_from_folder(folder_path)

# Extract information from each resume
for resume_data in resume_data_list:
    extracted_info = extract_resume_data(resume_data)
    print(f"Extracted Info from {resume_data['filename']}:")
    print(extracted_info)
    print("-" * 40)


Extracted Info from A.pdf:
{'filename': 'A.pdf', 'name': 'Level Java', 'email': 'emily.harris@example.com', 'phone': '310) 123 -4567'}
----------------------------------------
Extracted Info from A2.pdf:
{'filename': 'A2.pdf', 'name': 'Level Java', 'email': None, 'phone': '310) \n123-4567'}
----------------------------------------
Extracted Info from Alex Rivera.pdf:
{'filename': 'Alex Rivera.pdf', 'name': 'Alex Rivera', 'email': 'alex.rivera@example.com', 'phone': '312) 123 -4567'}
----------------------------------------
Extracted Info from Alphine_Patrick_resume 1.pdf:
{'filename': 'Alphine_Patrick_resume 1.pdf', 'name': 'Madurai Road', 'email': 'alphinepatrickf@gmail.com', 'phone': '+91 6380 293 207'}
----------------------------------------
Extracted Info from Alphine_Patrick_resume.pdf:
{'filename': 'Alphine_Patrick_resume.pdf', 'name': 'Madurai Road', 'email': None, 'phone': '+91 6380  293 207'}
----------------------------------------
Extracted Info from David White.pdf:
{'file

In [8]:
import os
import re
import glob
import pdfplumber
import fitz
import smtplib
from email.mime.text import MIMEText

# List of common skills to match
COMMON_SKILLS = [
    "Python", "Flask", "Django", "FastAPI", "Pandas", "NumPy", "SciPy", "Matplotlib", "Seaborn", 
    "TensorFlow", "Keras", "PyTorch", "Scikit-learn", "NLTK", "SpaCy", "SQLAlchemy", "Celery",
    "Java", "Spring", "Spring Boot", "Hibernate", "Maven", "Gradle", "JUnit", "JSP", "JSF", 
    "JavaFX", "Swing", "JDBC", "Apache Camel", "Kotlin",
    "JavaScript", "Node.js", "React", "Angular", "Vue.js", "Next.js", "Express.js", "TypeScript", 
    "Redux", "jQuery", "Electron", "Gatsby", "ES6", "Svelte",
    "C++", "Boost", "Qt", "OpenCV", "STL", "CUDA", "OpenMP", "CMake", "GTest", 
    "C#", ".NET", "ASP.NET", "Entity Framework", "Blazor", "Xamarin", "Unity", "WPF", "WinForms", 
    "LINQ", "Razor Pages", "MVC", "NUnit", "Azure Functions", "SignalR",
    "SQL", "MySQL", "PostgreSQL", "SQLite", "MongoDB", "Redis", "Oracle", "Cassandra", "Elasticsearch", 
    "Microsoft SQL Server", "PL/SQL", "T-SQL",
    "R", "ggplot2", "Shiny", "dplyr", "tidyverse", "caret", "data.table", "rpart", "lubridate", "forecast", 
    "R Markdown", "R Shiny",
    "HTML", "CSS", "Sass", "LESS", "Tailwind CSS", "Bootstrap", "Bulma", "Materialize", "WebAssembly", 
    "Webpack", "Gulp", "JAMstack", "Grunt", "Pug", "Handlebars",
    "Docker", "Kubernetes", "AWS", "Azure", "Google Cloud Platform", "Terraform", "Ansible", "Jenkins", 
    "CI/CD", "Helm", "OpenShift", "Serverless", "CloudFormation", "Chef", "Puppet", "Vagrant",
    "Hadoop", "Spark", "Kafka", "Airflow", "Hive", "Pig", "HBase", "Presto", "Dask", "PySpark", "Google BigQuery",
    "Machine Learning", "Deep Learning", "NLP", "OpenAI GPT", "BERT", "AutoML", "Reinforcement Learning", 
    "Computer Vision", "Generative Adversarial Networks (GANs)", "XGBoost", "LightGBM", "CatBoost",
    "Swift", "Objective-C", "Kotlin", "React Native", "Flutter", "Dart", "Xcode", "Android Studio", 
    "iOS Development", "Android Development",
    "Git", "GitHub", "GitLab", "Bitbucket", "Subversion", "Travis CI", "CircleCI", "Jenkins", "GitFlow", 
    "Perforce",
    "Selenium", "Cypress", "Appium", "Postman", "JMeter", "JUnit", "Mockito", "Cucumber", "PyTest", 
    "Robot Framework", "TestNG", "Jest", "Mocha", "Chai",
    "Agile", "Scrum", "Kanban", "JIRA", "Confluence", "Trello", "Slack", "Basecamp", "Microsoft Project", 
    "Asana", "Monday.com", "ClickUp",
    "GraphQL", "REST API", "SOAP", "gRPC", "Microservices", "Event-Driven Architecture", "Apache Kafka", 
    "RabbitMQ", "Message Queues", "OpenAPI", "Swagger", "OAuth", "JWT", "Firebase", "Heroku",
    "Bash", "PowerShell", "Perl", "Ruby", "Shell Scripting", "Groovy", "Lua",
    "Istio", "Envoy", "Linkerd", "Consul", "Prometheus", "Grafana", "Jaeger", "Fluentd", "Elastic Stack (ELK)", 
    "Logstash", "ECS", "EKS", "Fargate",
]

# Function to extract text from a PDF using PyMuPDF (fitz)
def extract_text_with_pymupdf(file_path):
    text = ""
    try:
        pdf_document = fitz.open(file_path)
        for page in pdf_document:
            text += page.get_text()
        pdf_document.close()
    except Exception as e:
        print("Error reading PDF:", e)
    return text.strip()

# Function to extract candidate's name from resume text
def extract_name_from_text(text):
    name_patterns = [
        re.compile(r'^[A-Z][a-zA-Z\s\-\.]+$', re.MULTILINE),  # Names typically start with a capital letter
        re.compile(r'Name:\s*([A-Za-z\s\-\.]+)', re.IGNORECASE),  # Format "Name: John Doe"
        re.compile(r'([A-Za-z\s\-\.]+)\s+CV', re.IGNORECASE),  # Names followed by "CV" or "Resume"
        re.compile(r'([A-Za-z\s\-\.]+)\s+Resume', re.IGNORECASE),  # Names followed by "Resume"
        re.compile(r'([A-Za-z\s\-\.]+)\s+Profile', re.IGNORECASE),  # Names followed by "Profile"
        re.compile(r'([A-Za-z\s\-\.]+)\s+Summary', re.IGNORECASE),  # Names followed by "Summary"
    ]
    
    for pattern in name_patterns:
        matches = pattern.findall(text)
        for match in matches:
            if match and match.strip():
                return match.strip()
    
    return "Name not found"

# Function to extract skills using basic text matching (or NLP if needed)
def extract_skills_with_nlp(resume_text):
    # Basic skill extraction by matching against the COMMON_SKILLS list
    extracted_skills = [skill for skill in COMMON_SKILLS if skill.lower() in resume_text.lower()]
    return extracted_skills

# Function to extract email from resume text
def extract_email(resume_text):
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    emails = re.findall(email_pattern, resume_text)
    return emails[0] if emails else None

# Function to generate personalized suggestions based on candidate's skills
SKILL_IMPROVEMENT_MAPPING = {
    "Python": {"improvements": ["Learn advanced Python libraries"], "roles": ["Python Developer"]},
    "Machine Learning": {"improvements": ["Practice more ML projects"], "roles": ["Data Scientist"]},
    # Add more mappings for skills
}

def generate_suggestions(extracted_skills):
    improvement_suggestions = set()
    suggested_roles = set()

    for skill in extracted_skills:
        if skill in SKILL_IMPROVEMENT_MAPPING:
            improvement_suggestions.update(SKILL_IMPROVEMENT_MAPPING[skill]["improvements"])
            suggested_roles.update(SKILL_IMPROVEMENT_MAPPING[skill]["roles"])

    return improvement_suggestions, suggested_roles

# Function to send an email with personalized suggestions for rejected candidates
def send_email(recipient_email, candidate_name, match_percentage, extracted_skills, improvement_suggestions, suggested_roles):
    sender_email = "jesperdeni002@gmail.com"
    sender_password = "dhex qzvo tpwq zkyf"  # Use environment variables for security

    subject = "Job Application Update"
    body = f"""
    Dear {candidate_name},

    Thank you for your application. 
    Your skills matched {match_percentage:.2f}% with our job requirements.

    Extracted Skills: {', '.join(extracted_skills)}

    Unfortunately, we cannot move forward with your application at this time. However, we encourage you to consider the following suggestions to enhance your skills:

    Suggestions for Improvement:
    {', '.join(improvement_suggestions) if improvement_suggestions else "No suggestions available."}

    Suggested Roles for Consideration:
    {', '.join(suggested_roles) if suggested_roles else "No roles suggested."}

    Best regards,
    [Your Name]
    """

    # Setting up the email
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = sender_email
    msg['To'] = recipient_email

    try:
        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
            server.login(sender_email, sender_password)
            server.sendmail(sender_email, recipient_email, msg.as_string())
        print(f"Email sent to {recipient_email} successfully!")
    except Exception as e:
        print(f"Failed to send email to {recipient_email}: {e}")

# Function to extract skills from job description
def extract_skills_from_job_description(job_description):
    return [skill for skill in COMMON_SKILLS if skill.lower() in job_description.lower()]

# Main function to process resumes and send emails
def process_resumes(resume_folder, job_description):
    # Extract skills from job description
    job_skills = extract_skills_from_job_description(job_description)

    for resume_file in glob.glob(os.path.join(resume_folder, '*.pdf')):
        resume_text = extract_text_with_pymupdf(resume_file)
        
        candidate_name = extract_name_from_text(resume_text)
        extracted_skills = extract_skills_with_nlp(resume_text)
        email = extract_email(resume_text)

        if email:
            match_percentage = (len(set(extracted_skills) & set(job_skills)) / len(job_skills)) * 100
            
            if match_percentage < 75:  # Adjust the percentage threshold as needed
                improvement_suggestions, suggested_roles = generate_suggestions(extracted_skills)
                send_email(email, candidate_name, match_percentage, extracted_skills, improvement_suggestions, suggested_roles)
        else:
            print(f"No valid email found in {resume_file}")

# Input your job description here
job_description = input()
# Set the folder path where the resumes are located
resume_folder = 'D:\\Project\\New folder (2)\\resumes'

# Start processing the resumes
process_resumes(resume_folder, job_description)


Email sent to emily.harris@example.com successfully!
Email sent to j87221304@gmail.com successfully!
Email sent to alex.rivera@example.com successfully!
Email sent to david.white@example.com successfully!
Email sent to emily.sanders@example.com successfully!
Email sent to john.martin@example.com successfully!
Email sent to michael.davis@example.com successfully!
Email sent to michael.johnson@example.com successfully!
Email sent to michael.johnson@example.com successfully!
Email sent to monishprabhu31@gmail.co successfully!
Email sent to rachel.turner@example.com successfully!
Email sent to raguramthirunavukkarasu@gmail.com successfully!
Email sent to stanjose24@gmail.com successfully!


In [14]:
import os
import re
import glob
import pdfplumber
import fitz
import smtplib
from email.mime.text import MIMEText

# List of common skills to match
COMMON_SKILLS = [
    "Python", "Flask", "Django", "FastAPI", "Pandas", "NumPy", "SciPy", "Matplotlib", "Seaborn", 
    "TensorFlow", "Keras", "PyTorch", "Scikit-learn", "NLTK", "SpaCy", "SQLAlchemy", "Celery",
    "Java", "Spring", "Spring Boot", "Hibernate", "Maven", "Gradle", "JUnit", "JSP", "JSF", 
    "JavaFX", "Swing", "JDBC", "Apache Camel", "Kotlin",
    "JavaScript", "Node.js", "React", "Angular", "Vue.js", "Next.js", "Express.js", "TypeScript", 
    "Redux", "jQuery", "Electron", "Gatsby", "ES6", "Svelte",
    "C++", "Boost", "Qt", "OpenCV", "STL", "CUDA", "OpenMP", "CMake", "GTest", 
    "C#", ".NET", "ASP.NET", "Entity Framework", "Blazor", "Xamarin", "Unity", "WPF", "WinForms", 
    "LINQ", "Razor Pages", "MVC", "NUnit", "Azure Functions", "SignalR",
    "SQL", "MySQL", "PostgreSQL", "SQLite", "MongoDB", "Redis", "Oracle", "Cassandra", "Elasticsearch", 
    "Microsoft SQL Server", "PL/SQL", "T-SQL",
    "R", "ggplot2", "Shiny", "dplyr", "tidyverse", "caret", "data.table", "rpart", "lubridate", "forecast", 
    "R Markdown", "R Shiny",
    "HTML", "CSS", "Sass", "LESS", "Tailwind CSS", "Bootstrap", "Bulma", "Materialize", "WebAssembly", 
    "Webpack", "Gulp", "JAMstack", "Grunt", "Pug", "Handlebars",
    "Docker", "Kubernetes", "AWS", "Azure", "Google Cloud Platform", "Terraform", "Ansible", "Jenkins", 
    "CI/CD", "Helm", "OpenShift", "Serverless", "CloudFormation", "Chef", "Puppet", "Vagrant",
    "Hadoop", "Spark", "Kafka", "Airflow", "Hive", "Pig", "HBase", "Presto", "Dask", "PySpark", "Google BigQuery",
    "Machine Learning", "Deep Learning", "NLP", "OpenAI GPT", "BERT", "AutoML", "Reinforcement Learning", 
    "Computer Vision", "Generative Adversarial Networks (GANs)", "XGBoost", "LightGBM", "CatBoost",
    "Swift", "Objective-C", "Kotlin", "React Native", "Flutter", "Dart", "Xcode", "Android Studio", 
    "iOS Development", "Android Development",
    "Git", "GitHub", "GitLab", "Bitbucket", "Subversion", "Travis CI", "CircleCI", "Jenkins", "GitFlow", 
    "Perforce",
    "Selenium", "Cypress", "Appium", "Postman", "JMeter", "JUnit", "Mockito", "Cucumber", "PyTest", 
    "Robot Framework", "TestNG", "Jest", "Mocha", "Chai",
    "Agile", "Scrum", "Kanban", "JIRA", "Confluence", "Trello", "Slack", "Basecamp", "Microsoft Project", 
    "Asana", "Monday.com", "ClickUp",
    "GraphQL", "REST API", "SOAP", "gRPC", "Microservices", "Event-Driven Architecture", "Apache Kafka", 
    "RabbitMQ", "Message Queues", "OpenAPI", "Swagger", "OAuth", "JWT", "Firebase", "Heroku",
    "Bash", "PowerShell", "Perl", "Ruby", "Shell Scripting", "Groovy", "Lua",
    "Istio", "Envoy", "Linkerd", "Consul", "Prometheus", "Grafana", "Jaeger", "Fluentd", "Elastic Stack (ELK)", 
    "Logstash", "ECS", "EKS", "Fargate",
]

JOB_ROLES = [
    "Developer", "Intern", "Engineer", "Manager", "Entry-Level",
    "Senior", "Junior", "Specialist", "Associate", "Lead",
    "Staff", "Principal", "Director", "Executive", "Consultant",
    "Analyst", "Programmer", "Designer", "Coordinator", "Architect",
    "Technician", "Administrator", "Data Scientist", "Data Analyst"
]

# Function to extract text from a PDF using PyMuPDF (fitz)
def extract_text_with_pymupdf(file_path):
    text = ""
    try:
        pdf_document = fitz.open(file_path)
        for page in pdf_document:
            text += page.get_text()
        pdf_document.close()
    except Exception as e:
        print("Error reading PDF:", e)
    return text.strip()
# Function to check if a string is a job role
# def is_job_role(candidate_name):
#     name_words = candidate_name.split()
#     for word in name_words:
#         if word in JOB_ROLES:
#             return True
#     return False

# # Function to check if a string is a common skill
# def is_common_skill(candidate_name):
#     # Split the name into words and check if any matches the common skills
#     name_words = candidate_name.split()
#     for word in name_words:
#         if word in COMMON_SKILLS:
#             return True
#     return False

# def extract_name_from_text(text):
#     name_patterns = [
#         re.compile(r'^[A-Z][a-zA-Z\s\-\.]+$', re.MULTILINE),
#         re.compile(r'\b[A-Z][a-zA-Z\s\-\.]+\b', re.IGNORECASE)
#     ]

#     # Split the text into lines
#     lines = text.split('\n')
    
#     # Check the first line separately for names
#     first_line = lines[0] if lines else ""

#     # Check if the first line is neither a job role nor a skill
#     if not is_job_role(first_line) and not is_common_skill(first_line):
#         for pattern in name_patterns:
#             match = pattern.search(first_line)
#             if match:
#                 return match.group(0).strip()

#     # Use regex patterns to find names in all lines
#     for line in lines:
#         for pattern in name_patterns:
#             match = pattern.search(line)
#             if match and not is_job_role(line) and not is_common_skill(line):
#                 return match.group(0).strip()

#     # Optionally use NLP for better name extraction
#     # Uncomment and define `nlp` if you plan to use NLP to extract names
#     # doc = nlp(text)
#     # names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
#     # for name in names:
#     #     if not is_job_role(name) and not is_common_skill(name):
#     #         return name

#     return "Name not found"

# import re

# def extract_name_from_text(text):
#     name_patterns = [
#         re.compile(r'^[A-Z][a-zA-Z\s\-\.]+$', re.MULTILINE),  # Standard names
#         re.compile(r'\b[A-Z\s\.\-]+[A-Z]\b', re.IGNORECASE),  # Names with uppercase letters and spaces (like "A L P H I N E P A T R I C K")
#         re.compile(r'\b[A-Z]+\s+[A-Z]+\.[A-Z]+\b', re.IGNORECASE)  # Names with initials and periods (like "LALITH AKKASH.V")
#     ]

#     # Split the text into lines
#     lines = text.split('\n')
    
#     # Check the first line separately for names
#     first_line = lines[0] if lines else ""

#     # Check if the first line is neither a job role nor a skill
#     if not is_job_role(first_line) and not is_common_skill(first_line):
#         for pattern in name_patterns:
#             match = pattern.search(first_line)
#             if match:
#                 return match.group(0).strip()

#     # Use regex patterns to find names in all lines
#     for line in lines:
#         for pattern in name_patterns:
#             match = pattern.search(line)
#             if match and not is_job_role(line) and not is_common_skill(line):
#                 return match.group(0).strip()

#     # Optionally use NLP for better name extraction
#     # Uncomment and define `nlp` if you plan to use NLP to extract names
#     # doc = nlp(text)
#     # names = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
#     # for name in names:
#     #     if not is_job_role(name) and not is_common_skill(name):
#     #         return name

#     return "Name not found"

# def is_job_role(line):
#     job_roles = [
#         "Developer", "Intern", "Engineer", "Manager", "Entry", "Level",
#         "Senior", "Junior", "Specialist", "Associate", "Lead", "Staff",
#         "Principal", "Director", "Executive", "Consultant", "Analyst",
#         "Programmer", "Designer", "Coordinator", "Architect", "Technician", "Administrator"
#     ]
#     return any(role in line for role in job_roles)

# def is_common_skill(line):
#     common_skills = ["Python", "Java", "SQL", "C++", "JavaScript", "HTML", "CSS"]
#     return any(skill in line for skill in common_skillimport re

def extract_name_from_text(text):
    # Regex patterns to capture names in various formats
    name_patterns = [
        re.compile(r'^[A-Z][a-zA-Z\s\.\-]+$', re.MULTILINE),  # Standard names
        re.compile(r'\b[A-Z]{1,3}(?:\s+[A-Z]{1,3})*\s+[A-Z][a-zA-Z\.]*$', re.MULTILINE),  # Initials followed by name
        re.compile(r'\b[A-Z]+\s+[A-Z]+\.[A-Z]+\b', re.IGNORECASE),  # Names with initials and a period (like "LALITH AKKASH.V")
        re.compile(r'(?<!\w)([A-Z\s\.\-]+)(?=\s+|$)', re.IGNORECASE)  # Captures uppercase names before space or end of line
    ]

    # Split the text into lines
    lines = text.split('\n')

    # Check the first line separately for names
    first_line = lines[0] if lines else ""

    # Debug: Print the first line being checked
    print(f"Checking first line for name: {first_line}")

    # Check if the first line is neither a job role nor a skill
    if not is_job_role(first_line) and not is_common_skill(first_line):
        for pattern in name_patterns:
            match = pattern.search(first_line)
            if match:
                name = match.group(0).strip()
                # Debug: Print what is matched
                print(f"Matched name from first line: {name}")
                return name

    # Use regex patterns to find names in all lines
    for line in lines:
        for pattern in name_patterns:
            match = pattern.search(line)
            if match and not is_job_role(line) and not is_common_skill(line):
                name = match.group(0).strip()
                # Debug: Print what is matched
                print(f"Matched name from line: {name}")
                return name

    return "Name not found"

def is_job_role(line):
    job_roles = [
        "Developer", "Intern", "Engineer", "Manager", "Entry", "Level",
        "Senior", "Junior", "Specialist", "Associate", "Lead", "Staff",
        "Principal", "Director", "Executive", "Consultant", "Analyst",
        "Programmer", "Designer", "Coordinator", "Architect", "Technician", "Administrator"
    ]
    return any(role in line for role in job_roles)

def is_common_skill(line):
    common_skills = ["Python", "Java", "SQL", "C++", "JavaScript", "HTML", "CSS"]
    return any(skill in line for skill in common_skills)






# Function to extract skills using basic text matching
def extract_skills_with_nlp(resume_text):
    extracted_skills = [skill for skill in COMMON_SKILLS if skill.lower() in resume_text.lower()]
    return extracted_skills

# Function to extract email from resume text
def extract_email(resume_text):
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    emails = re.findall(email_pattern, resume_text)
    return emails[0] if emails else None

# Function to generate personalized suggestions based on candidate's skills
SKILL_IMPROVEMENT_MAPPING = {
    "Python": {
        "improvements": ["Practice advanced Python concepts", "Contribute to open-source projects"],
        "roles": ["Data Scientist", "Backend Developer"]
    },
    "Machine Learning": {
        "improvements": ["Study ML algorithms", "Work on Kaggle competitions"],
        "roles": ["Machine Learning Engineer", "Data Analyst"]
    },
    # Add more skills, improvements, and roles as necessary
}

def generate_suggestions(extracted_skills):
    improvement_suggestions = set()
    suggested_roles = set()

    for skill in extracted_skills:
        if skill in SKILL_IMPROVEMENT_MAPPING:
            improvement_suggestions.update(SKILL_IMPROVEMENT_MAPPING[skill]["improvements"])
            suggested_roles.update(SKILL_IMPROVEMENT_MAPPING[skill]["roles"])

    return improvement_suggestions, suggested_roles

# Function to send an email with personalized suggestions for rejected candidates
def send_email(recipient_email, candidate_name, match_percentage, extracted_skills, improvement_suggestions, suggested_roles):
    sender_email = "jesperdeni002@gmail.com"
    sender_password = "dhex qzvo tpwq zkyf"  # Use environment variables for security

    subject = "Job Application Update"
    body = f"""
    Dear {candidate_name},

    Thank you for your application. 
    Your skills matched {match_percentage:.2f}% with our job requirements.

    Extracted Skills: {', '.join(extracted_skills)}

    Unfortunately, we cannot move forward with your application at this time. However, we encourage you to consider the following suggestions to enhance your skills:

    Suggestions for Improvement:
    {', '.join(improvement_suggestions) if improvement_suggestions else "No suggestions available."}

    Suggested Roles for Consideration:
    {', '.join(suggested_roles) if suggested_roles else "No roles suggested."}

    Best regards,
    [Your Name]
    """

    # Print extracted details for verification
    print(f"\nExtracted Details for {candidate_name}:")
    print(f"Email: {recipient_email}")
    print(f"Match Percentage: {match_percentage:.2f}%")
    print(f"Extracted Skills: {', '.join(extracted_skills)}")
    print(f"Improvement Suggestions: {', '.join(improvement_suggestions) if improvement_suggestions else 'None'}")
    print(f"Suggested Roles: {', '.join(suggested_roles) if suggested_roles else 'None'}")
    print("="*50)  # Separator for better readability

    # Setting up the email
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = sender_email
    msg['To'] = recipient_email

    try:
        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
            server.login(sender_email, sender_password)
            server.sendmail(sender_email, recipient_email, msg.as_string())
        print(f"Email sent to {recipient_email} successfully!")
    except Exception as e:
        print(f"Failed to send email to {recipient_email}: {e}")

# Function to extract skills from job description
def extract_skills_from_job_description(job_description):
    return [skill for skill in COMMON_SKILLS if skill.lower() in job_description.lower()]

# Function to match extracted skills with job description skills and calculate match percentage
def match_skills(extracted_skills, job_skills):
    matched_skills = set(extracted_skills) & set(job_skills)
    match_percentage = len(matched_skills) / len(job_skills) * 100 if job_skills else 0
    return matched_skills, match_percentage

# Main processing function
def process_resumes(resume_folder_path, job_description):
    job_skills = extract_skills_from_job_description(job_description)

    for resume_path in glob.glob(os.path.join(resume_folder_path, '*.pdf')):
        print(f"\nProcessing resume: {resume_path}")

        resume_text = extract_text_with_pymupdf(resume_path)
        candidate_name = extract_name_from_text(resume_text)
        extracted_skills = extract_skills_with_nlp(resume_text)
        recipient_email = extract_email(resume_text)

        matched_skills, match_percentage = match_skills(extracted_skills, job_skills)

        # Prepare suggestions if the candidate is not selected
        improvement_suggestions, suggested_roles = generate_suggestions(extracted_skills)

        if recipient_email:
            # Send email if a valid email is found
            send_email(recipient_email, candidate_name, match_percentage, extracted_skills, improvement_suggestions, suggested_roles)
        else:
            print(f"No email found for {candidate_name}")

# Example usage:
# Input your job description here
job_description = input()
# Set the folder path where the resumes are located
resume_folder = r'D:/Project/New folder (2)/resumes'

# Start processing the resumes
process_resumes(resume_folder, job_description)



Processing resume: D:/Project/New folder (2)/resumes\A.pdf
Checking first line for name: A. Entry-Level JavaScript Developer 
Matched name from line: Emily Harris

Extracted Details for Emily Harris:
Email: emily.harris@example.com
Match Percentage: 25.00%
Extracted Skills: Java, JavaScript, Node.js, React, R, HTML, CSS, Webpack, Git
Improvement Suggestions: None
Suggested Roles: None
Email sent to emily.harris@example.com successfully!

Processing resume: D:/Project/New folder (2)/resumes\A2.pdf
Checking first line for name: A. Entry-Level JavaScript Developer  
Matched name from line: Emily Harris

Extracted Details for Emily Harris:
Email: j87221304@gmail.com
Match Percentage: 25.00%
Extracted Skills: Java, JavaScript, Node.js, React, R, HTML, CSS, Webpack, Git
Improvement Suggestions: None
Suggested Roles: None
Email sent to j87221304@gmail.com successfully!

Processing resume: D:/Project/New folder (2)/resumes\Alex Rivera.pdf
Checking first line for name: Alex Rivera 
Matched nam

In [15]:
import os
import re
import glob
import pdfplumber
import fitz
import smtplib
from email.mime.text import MIMEText

# List of common skills to match
COMMON_SKILLS = [
    "Python", "Flask", "Django", "FastAPI", "Pandas", "NumPy", "SciPy", "Matplotlib", "Seaborn", 
    "TensorFlow", "Keras", "PyTorch", "Scikit-learn", "NLTK", "SpaCy", "SQLAlchemy", "Celery",
    "Java", "Spring", "Spring Boot", "Hibernate", "Maven", "Gradle", "JUnit", "JSP", "JSF", 
    "JavaFX", "Swing", "JDBC", "Apache Camel", "Kotlin",
    "JavaScript", "Node.js", "React", "Angular", "Vue.js", "Next.js", "Express.js", "TypeScript", 
    "Redux", "jQuery", "Electron", "Gatsby", "ES6", "Svelte",
    "C++", "Boost", "Qt", "OpenCV", "STL", "CUDA", "OpenMP", "CMake", "GTest", 
    "C#", ".NET", "ASP.NET", "Entity Framework", "Blazor", "Xamarin", "Unity", "WPF", "WinForms", 
    "LINQ", "Razor Pages", "MVC", "NUnit", "Azure Functions", "SignalR",
    "SQL", "MySQL", "PostgreSQL", "SQLite", "MongoDB", "Redis", "Oracle", "Cassandra", "Elasticsearch", 
    "Microsoft SQL Server", "PL/SQL", "T-SQL",
    "R", "ggplot2", "Shiny", "dplyr", "tidyverse", "caret", "data.table", "rpart", "lubridate", "forecast", 
    "R Markdown", "R Shiny",
    "HTML", "CSS", "Sass", "LESS", "Tailwind CSS", "Bootstrap", "Bulma", "Materialize", "WebAssembly", 
    "Webpack", "Gulp", "JAMstack", "Grunt", "Pug", "Handlebars",
    "Docker", "Kubernetes", "AWS", "Azure", "Google Cloud Platform", "Terraform", "Ansible", "Jenkins", 
    "CI/CD", "Helm", "OpenShift", "Serverless", "CloudFormation", "Chef", "Puppet", "Vagrant",
    "Hadoop", "Spark", "Kafka", "Airflow", "Hive", "Pig", "HBase", "Presto", "Dask", "PySpark", "Google BigQuery",
    "Machine Learning", "Deep Learning", "NLP", "OpenAI GPT", "BERT", "AutoML", "Reinforcement Learning", 
    "Computer Vision", "Generative Adversarial Networks (GANs)", "XGBoost", "LightGBM", "CatBoost",
    "Swift", "Objective-C", "Kotlin", "React Native", "Flutter", "Dart", "Xcode", "Android Studio", 
    "iOS Development", "Android Development",
    "Git", "GitHub", "GitLab", "Bitbucket", "Subversion", "Travis CI", "CircleCI", "Jenkins", "GitFlow", 
    "Perforce",
    "Selenium", "Cypress", "Appium", "Postman", "JMeter", "JUnit", "Mockito", "Cucumber", "PyTest", 
    "Robot Framework", "TestNG", "Jest", "Mocha", "Chai",
    "Agile", "Scrum", "Kanban", "JIRA", "Confluence", "Trello", "Slack", "Basecamp", "Microsoft Project", 
    "Asana", "Monday.com", "ClickUp",
    "GraphQL", "REST API", "SOAP", "gRPC", "Microservices", "Event-Driven Architecture", "Apache Kafka", 
    "RabbitMQ", "Message Queues", "OpenAPI", "Swagger", "OAuth", "JWT", "Firebase", "Heroku",
    "Bash", "PowerShell", "Perl", "Ruby", "Shell Scripting", "Groovy", "Lua",
    "Istio", "Envoy", "Linkerd", "Consul", "Prometheus", "Grafana", "Jaeger", "Fluentd", "Elastic Stack (ELK)", 
    "Logstash", "ECS", "EKS", "Fargate",
]

# Function to extract text from a PDF using PyMuPDF (fitz)
def extract_text_with_pymupdf(file_path):
    text = ""
    try:
        pdf_document = fitz.open(file_path)
        for page in pdf_document:
            text += page.get_text()
        pdf_document.close()
    except Exception as e:
        print("Error reading PDF:", e)
    return text.strip()

def extract_name_from_text(text):
    name_patterns = [
        re.compile(r'^[A-Z][a-zA-Z\s\-\.]+$', re.MULTILINE),  # Standard names
        re.compile(r'\b[A-Z][a-zA-Z\s\.\-]+\s+[A-Z][a-zA-Z\s\.\-]+\b', re.MULTILINE),  # Full names (First Last)
        re.compile(r'\b[A-Z]+\s+[A-Z]+\.[A-Z]+\b', re.IGNORECASE)  # Names with initials and periods (like "LALITH AKKASH.V")
    ]

    # Split the text into lines
    lines = text.split('\n')
    
    # Check the first line for a valid name
    first_line = lines[0] if lines else ""

    if not is_job_role(first_line) and not is_common_skill(first_line):
        for pattern in name_patterns:
            match = pattern.search(first_line)
            if match:
                return match.group(0).strip()

    # Use regex patterns to find names in all lines, avoiding common skill and job role lines
    for line in lines:
        # Skip lines that look like addresses (i.e., contain numbers or common address terms)
        if re.search(r'\d', line) or 'Street' in line or 'Avenue' in line or 'Road' in line:
            continue
        for pattern in name_patterns:
            match = pattern.search(line)
            if match and not is_job_role(line) and not is_common_skill(line):
                return match.group(0).strip()

    return "Name not found"

def is_job_role(line):
    job_roles = [
        "Developer", "Intern", "Engineer", "Manager", "Entry", "Level",
        "Senior", "Junior", "Specialist", "Associate", "Lead", "Staff",
        "Principal", "Director", "Executive", "Consultant", "Analyst",
        "Programmer", "Designer", "Coordinator", "Architect", "Technician", "Administrator"
    ]
    return any(role in line for role in job_roles)

def is_common_skill(line):
    common_skills = ["Python", "Java", "SQL", "C++", "JavaScript", "HTML", "CSS"]
    return any(skill in line for skill in common_skills)

# Function to extract skills using basic text matching
def extract_skills_with_nlp(resume_text):
    extracted_skills = [skill for skill in COMMON_SKILLS if skill.lower() in resume_text.lower()]
    return extracted_skills

# Function to extract email from resume text
def extract_email(resume_text):
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    emails = re.findall(email_pattern, resume_text)
    return emails[0] if emails else None

# Function to generate personalized suggestions based on candidate's skills
SKILL_IMPROVEMENT_MAPPING = {
    "Python": {
        "improvements": ["Practice advanced Python concepts", "Contribute to open-source projects"],
        "roles": ["Data Scientist", "Backend Developer"]
    },
    "Machine Learning": {
        "improvements": ["Study ML algorithms", "Work on Kaggle competitions"],
        "roles": ["Machine Learning Engineer", "Data Analyst"]
    },
    # Add more skills, improvements, and roles as necessary
}

def generate_suggestions(extracted_skills):
    improvement_suggestions = set()
    suggested_roles = set()

    for skill in extracted_skills:
        if skill in SKILL_IMPROVEMENT_MAPPING:
            improvement_suggestions.update(SKILL_IMPROVEMENT_MAPPING[skill]["improvements"])
            suggested_roles.update(SKILL_IMPROVEMENT_MAPPING[skill]["roles"])

    return improvement_suggestions, suggested_roles

# Function to send an email with personalized suggestions for rejected candidates
def send_email(recipient_email, candidate_name, match_percentage, extracted_skills, improvement_suggestions, suggested_roles):
    sender_email = "jesperdeni002@gmail.com"
    sender_password = "dhex qzvo tpwq zkyf"  # Use environment variables for security

    # Create the email content
    email_subject = f"Application Status for {candidate_name}"
    email_body = (
        f"Dear {candidate_name},\n\n"
        f"Thank you for applying for the position. "
        f"Unfortunately, your application did not meet the required match percentage of 75%.\n"
        f"Your match percentage was {match_percentage:.2f}%.\n\n"
        f"Here are your extracted skills: {', '.join(extracted_skills)}\n\n"
        f"Suggestions for improvement:\n"
        f"{'\n'.join(improvement_suggestions)}\n\n"
        f"Suggested Roles:\n"
        f"{'\n'.join(suggested_roles)}\n\n"
        "Best regards,\n"
        "Your Company"
    )

    # Send the email
    try:
        msg = MIMEText(email_body)
        msg['Subject'] = email_subject
        msg['From'] = sender_email
        msg['To'] = recipient_email

        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
            server.login(sender_email, sender_password)
            server.sendmail(sender_email, recipient_email, msg.as_string())
            print(f"Email sent to {recipient_email}")
    except Exception as e:
        print(f"Error sending email: {e}")

# Main function to process resumes with a given job description
def process_resumes(job_description, resumes_folder_path='D:/Project/New folder (2)/resumes'):
    resumes = glob.glob(os.path.join(resumes_folder_path, "*.pdf"))
    for resume_path in resumes:
        # Extract text from the resume
        resume_text = extract_text_with_pymupdf(resume_path)
        
        # Extract candidate name
        candidate_name = extract_name_from_text(resume_text)

        # Extract email
        candidate_email = extract_email(resume_text)

        # Extract skills
        extracted_skills = extract_skills_with_nlp(resume_text)

        # Calculate match percentage (dummy logic; replace with actual logic as necessary)
        match_percentage = len(extracted_skills) / len(COMMON_SKILLS) * 100

        # Check if match percentage is below 75%
        if match_percentage < 75:
            improvement_suggestions, suggested_roles = generate_suggestions(extracted_skills)
            if candidate_email:
                send_email(candidate_email, candidate_name, match_percentage, extracted_skills, improvement_suggestions, suggested_roles)

# Sample usage
job_description = input()

process_resumes(job_description)


Email sent to emily.harris@example.com
Email sent to j87221304@gmail.com
Email sent to alex.rivera@example.com
Email sent to alphinepatrickf@gmail.com
Email sent to alphinepatrickf@gmail.com


KeyboardInterrupt: 

In [16]:
import os
import re
import glob
import pdfplumber
import fitz
import smtplib
from email.mime.text import MIMEText

# List of common skills to match
COMMON_SKILLS = [
    "Python", "Flask", "Django", "FastAPI", "Pandas", "NumPy", "SciPy", "Matplotlib", "Seaborn", 
    "TensorFlow", "Keras", "PyTorch", "Scikit-learn", "NLTK", "SpaCy", "SQLAlchemy", "Celery",
    "Java", "Spring", "Spring Boot", "Hibernate", "Maven", "Gradle", "JUnit", "JSP", "JSF", 
    "JavaFX", "Swing", "JDBC", "Apache Camel", "Kotlin",
    "JavaScript", "Node.js", "React", "Angular", "Vue.js", "Next.js", "Express.js", "TypeScript", 
    "Redux", "jQuery", "Electron", "Gatsby", "ES6", "Svelte",
    "C++", "Boost", "Qt", "OpenCV", "STL", "CUDA", "OpenMP", "CMake", "GTest", 
    "C#", ".NET", "ASP.NET", "Entity Framework", "Blazor", "Xamarin", "Unity", "WPF", "WinForms", 
    "LINQ", "Razor Pages", "MVC", "NUnit", "Azure Functions", "SignalR",
    "SQL", "MySQL", "PostgreSQL", "SQLite", "MongoDB", "Redis", "Oracle", "Cassandra", "Elasticsearch", 
    "Microsoft SQL Server", "PL/SQL", "T-SQL",
    "R", "ggplot2", "Shiny", "dplyr", "tidyverse", "caret", "data.table", "rpart", "lubridate", "forecast", 
    "R Markdown", "R Shiny",
    "HTML", "CSS", "Sass", "LESS", "Tailwind CSS", "Bootstrap", "Bulma", "Materialize", "WebAssembly", 
    "Webpack", "Gulp", "JAMstack", "Grunt", "Pug", "Handlebars",
    "Docker", "Kubernetes", "AWS", "Azure", "Google Cloud Platform", "Terraform", "Ansible", "Jenkins", 
    "CI/CD", "Helm", "OpenShift", "Serverless", "CloudFormation", "Chef", "Puppet", "Vagrant",
    "Hadoop", "Spark", "Kafka", "Airflow", "Hive", "Pig", "HBase", "Presto", "Dask", "PySpark", "Google BigQuery",
    "Machine Learning", "Deep Learning", "NLP", "OpenAI GPT", "BERT", "AutoML", "Reinforcement Learning", 
    "Computer Vision", "Generative Adversarial Networks (GANs)", "XGBoost", "LightGBM", "CatBoost",
    "Swift", "Objective-C", "Kotlin", "React Native", "Flutter", "Dart", "Xcode", "Android Studio", 
    "iOS Development", "Android Development",
    "Git", "GitHub", "GitLab", "Bitbucket", "Subversion", "Travis CI", "CircleCI", "Jenkins", "GitFlow", 
    "Perforce",
    "Selenium", "Cypress", "Appium", "Postman", "JMeter", "JUnit", "Mockito", "Cucumber", "PyTest", 
    "Robot Framework", "TestNG", "Jest", "Mocha", "Chai",
    "Agile", "Scrum", "Kanban", "JIRA", "Confluence", "Trello", "Slack", "Basecamp", "Microsoft Project", 
    "Asana", "Monday.com", "ClickUp",
    "GraphQL", "REST API", "SOAP", "gRPC", "Microservices", "Event-Driven Architecture", "Apache Kafka", 
    "RabbitMQ", "Message Queues", "OpenAPI", "Swagger", "OAuth", "JWT", "Firebase", "Heroku",
    "Bash", "PowerShell", "Perl", "Ruby", "Shell Scripting", "Groovy", "Lua",
    "Istio", "Envoy", "Linkerd", "Consul", "Prometheus", "Grafana", "Jaeger", "Fluentd", "Elastic Stack (ELK)", 
    "Logstash", "ECS", "EKS", "Fargate",
]

JOB_ROLES = [
    "Developer", "Intern", "Engineer", "Manager", "Entry-Level",
    "Senior", "Junior", "Specialist", "Associate", "Lead",
    "Staff", "Principal", "Director", "Executive", "Consultant",
    "Analyst", "Programmer", "Designer", "Coordinator", "Architect",
    "Technician", "Administrator", "Data Scientist", "Data Analyst"
]

# Function to extract text from a PDF using PyMuPDF (fitz)
def extract_text_with_pymupdf(file_path):
    text = ""
    try:
        pdf_document = fitz.open(file_path)
        for page in pdf_document:
            text += page.get_text()
        pdf_document.close()
    except Exception as e:
        print("Error reading PDF:", e)
    return text.strip()
def extract_name_from_text(text):
    name_patterns = [
        re.compile(r'^[A-Z][a-zA-Z\s\-\.]+$', re.MULTILINE),  # Standard names
        re.compile(r'\b[A-Z][a-zA-Z\s\.\-]+\s+[A-Z][a-zA-Z\s\.\-]+\b', re.MULTILINE),  # Full names (First Last)
        re.compile(r'\b[A-Z]+\s+[A-Z]+\.[A-Z]+\b', re.IGNORECASE)  # Names with initials and periods (like "LALITH AKKASH.V")
    ]

    # Split the text into lines
    lines = text.split('\n')
    
    # Check the first line for a valid name
    first_line = lines[0] if lines else ""

    if not is_job_role(first_line) and not is_common_skill(first_line):
        for pattern in name_patterns:
            match = pattern.search(first_line)
            if match:
                return match.group(0).strip()

    # Use regex patterns to find names in all lines, avoiding common skill and job role lines
    for line in lines:
        # Skip lines that look like addresses (i.e., contain numbers or common address terms)
        if re.search(r'\d', line) or 'Street' in line or 'Avenue' in line or 'Road' in line:
            continue
        for pattern in name_patterns:
            match = pattern.search(line)
            if match and not is_job_role(line) and not is_common_skill(line):
                return match.group(0).strip()

    return "Name not found"
def is_job_role(line):
    job_roles = [
        "Developer", "Intern", "Engineer", "Manager", "Entry", "Level",
        "Senior", "Junior", "Specialist", "Associate", "Lead", "Staff",
        "Principal", "Director", "Executive", "Consultant", "Analyst",
        "Programmer", "Designer", "Coordinator", "Architect", "Technician", "Administrator"
    ]
    return any(role in line for role in job_roles)

def is_common_skill(line):
    common_skills = ["Python", "Java", "SQL", "C++", "JavaScript", "HTML", "CSS"]
    return any(skill in line for skill in common_skills)



# Function to extract skills using basic text matching
def extract_skills_with_nlp(resume_text):
    extracted_skills = [skill for skill in COMMON_SKILLS if skill.lower() in resume_text.lower()]
    return extracted_skills

# Function to extract email from resume text
def extract_email(resume_text):
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    emails = re.findall(email_pattern, resume_text)
    return emails[0] if emails else None

# Function to generate personalized suggestions based on candidate's skills
SKILL_IMPROVEMENT_MAPPING = {
    "Python": {
        "improvements": ["Practice advanced Python concepts", "Contribute to open-source projects"],
        "roles": ["Data Scientist", "Backend Developer"]
    },
    "Machine Learning": {
        "improvements": ["Study ML algorithms", "Work on Kaggle competitions"],
        "roles": ["Machine Learning Engineer", "Data Analyst"]
    },
    # Add more skills, improvements, and roles as necessary
}

def generate_suggestions(extracted_skills):
    improvement_suggestions = set()
    suggested_roles = set()

    for skill in extracted_skills:
        if skill in SKILL_IMPROVEMENT_MAPPING:
            improvement_suggestions.update(SKILL_IMPROVEMENT_MAPPING[skill]["improvements"])
            suggested_roles.update(SKILL_IMPROVEMENT_MAPPING[skill]["roles"])

    return improvement_suggestions, suggested_roles

# Function to send an email with personalized suggestions for rejected candidates
def send_email(recipient_email, candidate_name, match_percentage, extracted_skills, improvement_suggestions, suggested_roles):
    sender_email = "jesperdeni002@gmail.com"
    sender_password = "dhex qzvo tpwq zkyf"  # Use environment variables for security

    subject = "Job Application Update"
    body = f"""
    Dear {candidate_name},

    Thank you for your application. 
    Your skills matched {match_percentage:.2f}% with our job requirements.

    Extracted Skills: {', '.join(extracted_skills)}

    Unfortunately, we cannot move forward with your application at this time. However, we encourage you to consider the following suggestions to enhance your skills:

    Suggestions for Improvement:
    {', '.join(improvement_suggestions) if improvement_suggestions else "No suggestions available."}

    Suggested Roles for Consideration:
    {', '.join(suggested_roles) if suggested_roles else "No roles suggested."}

    Best regards,
    [Your Name]
    """

    # Print extracted details for verification
    print(f"\nExtracted Details for {candidate_name}:")
    print(f"Email: {recipient_email}")
    print(f"Match Percentage: {match_percentage:.2f}%")
    print(f"Extracted Skills: {', '.join(extracted_skills)}")
    print(f"Improvement Suggestions: {', '.join(improvement_suggestions) if improvement_suggestions else 'None'}")
    print(f"Suggested Roles: {', '.join(suggested_roles) if suggested_roles else 'None'}")
    print("="*50)  # Separator for better readability

    # Setting up the email
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = sender_email
    msg['To'] = recipient_email

    try:
        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
            server.login(sender_email, sender_password)
            server.sendmail(sender_email, recipient_email, msg.as_string())
        print(f"Email sent to {recipient_email} successfully!")
    except Exception as e:
        print(f"Failed to send email to {recipient_email}: {e}")

# Function to extract skills from job description
def extract_skills_from_job_description(job_description):
    return [skill for skill in COMMON_SKILLS if skill.lower() in job_description.lower()]

# Function to match extracted skills with job description skills and calculate match percentage
def match_skills(extracted_skills, job_skills):
    matched_skills = set(extracted_skills) & set(job_skills)
    match_percentage = len(matched_skills) / len(job_skills) * 100 if job_skills else 0
    return matched_skills, match_percentage

# Main processing function
def process_resumes(resume_folder_path, job_description):
    job_skills = extract_skills_from_job_description(job_description)

    for resume_path in glob.glob(os.path.join(resume_folder_path, '*.pdf')):
        print(f"\nProcessing resume: {resume_path}")

        resume_text = extract_text_with_pymupdf(resume_path)
        candidate_name = extract_name_from_text(resume_text)
        extracted_skills = extract_skills_with_nlp(resume_text)
        recipient_email = extract_email(resume_text)

        matched_skills, match_percentage = match_skills(extracted_skills, job_skills)

        # Prepare suggestions if the candidate is not selected
        improvement_suggestions, suggested_roles = generate_suggestions(extracted_skills)

        if recipient_email:
            # Send email if a valid email is found
            send_email(recipient_email, candidate_name, match_percentage, extracted_skills, improvement_suggestions, suggested_roles)
        else:
            print(f"No email found for {candidate_name}")

resume_folder = r'D:/Project/New folder (2)/resumes' # Update this with your resumes folder path
job_description_file = input()  # Update this with your job description file path

process_resumes(resume_folder, job_description_file)


Processing resume: D:/Project/New folder (2)/resumes\A.pdf

Extracted Details for Emily Harris:
Email: emily.harris@example.com
Match Percentage: 25.00%
Extracted Skills: Java, JavaScript, Node.js, React, R, HTML, CSS, Webpack, Git
Improvement Suggestions: None
Suggested Roles: None
Email sent to emily.harris@example.com successfully!

Processing resume: D:/Project/New folder (2)/resumes\A2.pdf

Extracted Details for Emily Harris:
Email: j87221304@gmail.com
Match Percentage: 25.00%
Extracted Skills: Java, JavaScript, Node.js, React, R, HTML, CSS, Webpack, Git
Improvement Suggestions: None
Suggested Roles: None
Email sent to j87221304@gmail.com successfully!

Processing resume: D:/Project/New folder (2)/resumes\Alex Rivera.pdf

Extracted Details for Alex Rivera:
Email: alex.rivera@example.com
Match Percentage: 50.00%
Extracted Skills: Python, C++, Boost, STL, CMake, R, Git
Improvement Suggestions: Contribute to open-source projects, Practice advanced Python concepts
Suggested Roles: Da

In [13]:
import os
import re
import glob
import fitz  # PyMuPDF
import smtplib
from email.mime.text import MIMEText
from PIL import Image
import pytesseract

# List of common skills to match
COMMON_SKILLS = [
    "Python", "Flask", "Django", "FastAPI", "Pandas", "NumPy", "SciPy", "Matplotlib", "Seaborn", 
    "TensorFlow", "Keras", "PyTorch", "Scikit-learn", "NLTK", "SpaCy", "SQLAlchemy", "Celery",
    "Java", "Spring", "Spring Boot", "Hibernate", "Maven", "Gradle", "JUnit", "JSP", "JSF", 
    "JavaFX", "Swing", "JDBC", "Apache Camel", "Kotlin",
    "JavaScript", "Node.js", "React", "Angular", "Vue.js", "Next.js", "Express.js", "TypeScript", 
    "Redux", "jQuery", "Electron", "Gatsby", "ES6", "Svelte",
    "C++", "Boost", "Qt", "OpenCV", "STL", "CUDA", "OpenMP", "CMake", "GTest", 
    "C#", ".NET", "ASP.NET", "Entity Framework", "Blazor", "Xamarin", "Unity", "WPF", "WinForms", 
    "LINQ", "Razor Pages", "MVC", "NUnit", "Azure Functions", "SignalR",
    "SQL", "MySQL", "PostgreSQL", "SQLite", "MongoDB", "Redis", "Oracle", "Cassandra", "Elasticsearch", 
    "Microsoft SQL Server", "PL/SQL", "T-SQL",
    "R", "ggplot2", "Shiny", "dplyr", "tidyverse", "caret", "data.table", "rpart", "lubridate", "forecast", 
    "R Markdown", "R Shiny",
    "HTML", "CSS", "Sass", "LESS", "Tailwind CSS", "Bootstrap", "Bulma", "Materialize", "WebAssembly", 
    "Webpack", "Gulp", "JAMstack", "Grunt", "Pug", "Handlebars",
    "Docker", "Kubernetes", "AWS", "Azure", "Google Cloud Platform", "Terraform", "Ansible", "Jenkins", 
    "CI/CD", "Helm", "OpenShift", "Serverless", "CloudFormation", "Chef", "Puppet", "Vagrant",
    "Hadoop", "Spark", "Kafka", "Airflow", "Hive", "Pig", "HBase", "Presto", "Dask", "PySpark", "Google BigQuery",
    "Machine Learning", "Deep Learning", "NLP", "OpenAI GPT", "BERT", "AutoML", "Reinforcement Learning", 
    "Computer Vision", "Generative Adversarial Networks (GANs)", "XGBoost", "LightGBM", "CatBoost",
    "Swift", "Objective-C", "Kotlin", "React Native", "Flutter", "Dart", "Xcode", "Android Studio", 
    "iOS Development", "Android Development",
    "Git", "GitHub", "GitLab", "Bitbucket", "Subversion", "Travis CI", "CircleCI", "Jenkins", "GitFlow", 
    "Perforce",
    "Selenium", "Cypress", "Appium", "Postman", "JMeter", "JUnit", "Mockito", "Cucumber", "PyTest", 
    "Robot Framework", "TestNG", "Jest", "Mocha", "Chai",
    "Agile", "Scrum", "Kanban", "JIRA", "Confluence", "Trello", "Slack", "Basecamp", "Microsoft Project", 
    "Asana", "Monday.com", "ClickUp",
    "GraphQL", "REST API", "SOAP", "gRPC", "Microservices", "Event-Driven Architecture", "Apache Kafka", 
    "RabbitMQ", "Message Queues", "OpenAPI", "Swagger", "OAuth", "JWT", "Firebase", "Heroku",
    "Bash", "PowerShell", "Perl", "Ruby", "Shell Scripting", "Groovy", "Lua",
    "Istio", "Envoy", "Linkerd", "Consul", "Prometheus", "Grafana", "Jaeger", "Fluentd", "Elastic Stack (ELK)", 
    "Logstash", "ECS", "EKS", "Fargate",
]

# Job roles to match
JOB_ROLES = [
    "Developer", "Intern", "Engineer", "Manager", "Entry-Level",
    "Senior", "Junior", "Specialist", "Associate", "Lead",
    "Staff", "Principal", "Director", "Executive", "Consultant",
    "Analyst", "Programmer", "Designer", "Coordinator", "Architect",
    "Technician", "Administrator", "Data Scientist", "Data Analyst"
]

# Function to extract text from a PDF using PyMuPDF (fitz)
def extract_text_with_pymupdf(file_path):
    text = ""
    try:
        pdf_document = fitz.open(file_path)
        for page in pdf_document:
            text += page.get_text()
        pdf_document.close()
    except Exception as e:
        print("Error reading PDF:", e)
    return text.strip()

# Function to extract text from images using pytesseract
def extract_text_from_image(file_path):
    try:
        image = Image.open(file_path)
        text = pytesseract.image_to_string(image)
        return text.strip()
    except Exception as e:
        print("Error reading image:", e)
        return ""
class NameExtractor:
    def __init__(self):
        # Define patterns for name extraction, including spaced, underscored, and initial-based names
        self.name_patterns = [
            r'^[A-Z][a-zA-Z\s\-\.]+<span class="math-inline">',  # Standard names \(e\.g\., "First Last"\)
r'^\[A\-Z\]\+\\s\+\[A\-Z\]\+\\\.\[A\-Z\]\+\\s\*</span>',  # Names with initials (e.g., "A. Jesper")
            r'^[A-Z\s]+<span class="math-inline">',  # Fully uppercase spaced names \(e\.g\., "A L P H I N E P A T R I C K F"\)
r'\\b\[A\-Z\]\[a\-zA\-Z\]\+\\\.?\\s\[A\-Z\]\[a\-zA\-Z\]\+\\\.?\\b',  # Names with periods \(e\.g\., "Jesper Deni A\."\)
r'^\[A\-Z\]\+\\s\[A\-Z\]\+\\b',  # Names without punctuation or initial format
r'^\[A\-Z\]\[a\-z\]\+\\s\[A\-Z\]\[a\-z\]\+</span>',  # Standard names (capitalized) "John Doe"
    r'\b[A-Z]+\s+[A-Z]+\.[A-Z]+\b', re.IGNORECASE]

    def is_valid_name(self, line):
        """Helper function to filter out lines that likely do not contain names."""
        # Ignore lines containing numbers, email addresses, job titles, or qualifications
        if re.search(r'\d', line) or re.search(r'@\S+', line) or re.search(r'\b(Street|Avenue|Developer|Engineer|Manager|BSc|MSc)\b', line, re.IGNORECASE):
            return False
        return True

    def extract_name(self, text):
        lines = text.split('\n')[:4]  # Check only the first 4 lines

        for line in lines:
            line = line.strip()

            # Validate if the line is likely a name
            if not self.is_valid_name(line):
                continue

            # Try each pattern to extract the name
            for pattern in self.name_patterns:
                match = re.search(pattern, line)
                if match:
                    return match.group(0).strip()

        # If no name found from patterns, try extracting from email
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        email_match = re.search(email_pattern, text)
        if email_match:
            return email_match.group(0).split('@')[0]

        # If still no name found, return default
        return "Name not found"

# Function to check if a line contains a job role
def is_job_role(line):
    return any(role in line for role in JOB_ROLES)

# Function to check if a line contains a common skill
def is_common_skill(line):
    return any(skill in line for skill in COMMON_SKILLS)

# Function to extract skills using basic text matching
def extract_skills_with_nlp(resume_text):
    extracted_skills = [skill for skill in COMMON_SKILLS if skill.lower() in resume_text.lower()]
    return extracted_skills

# Function to extract email from resume text
def extract_email(resume_text):
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    emails = re.findall(email_pattern, resume_text)
    return emails[0] if emails else None

# Function to generate personalized suggestions based on candidate's skills
SKILL_IMPROVEMENT_MAPPING = {
    "Python": {
        "improvements": ["Practice advanced Python concepts", "Contribute to open-source projects"],
        "roles": ["Data Scientist", "Backend Developer"]
    },
    "Machine Learning": {
        "improvements": ["Study ML algorithms", "Work on Kaggle competitions"],
        "roles": ["Machine Learning Engineer", "Data Analyst"]
    },
    # Add more skills, improvements, and roles as necessary
}

def generate_suggestions(extracted_skills):
    improvement_suggestions = set()
    suggested_roles = set()

    for skill in extracted_skills:
        if skill in SKILL_IMPROVEMENT_MAPPING:
            improvement_suggestions.update(SKILL_IMPROVEMENT_MAPPING[skill]["improvements"])
            suggested_roles.update(SKILL_IMPROVEMENT_MAPPING[skill]["roles"])

    return improvement_suggestions, suggested_roles

# Function to send an email with personalized suggestions for rejected candidates
def send_email(recipient_email, candidate_name, match_percentage, extracted_skills, improvement_suggestions, suggested_roles):
    sender_email = "your_email@gmail.com"  # Replace with your email
    sender_password = "your_password"  # Replace with your password

    subject = "Job Application Update"
    body = f"""
    Dear {candidate_name},

    Thank you for your application. 
    Your skills matched {match_percentage:.2f}% with our job requirements.

    Extracted Skills: {', '.join(extracted_skills)}

    Unfortunately, we cannot move forward with your application at this time. However, we encourage you to consider the following suggestions to enhance your skills:

    Suggestions for Improvement:
    {', '.join(improvement_suggestions) if improvement_suggestions else "No suggestions available."}

    Suggested Roles for Consideration:
    {', '.join(suggested_roles) if suggested_roles else "No roles suggested."}

    Best regards,
    [Your Name]
    """

    # Print extracted details for verification
    print(f"\nExtracted Details for {candidate_name}:")
    print(f"Email: {recipient_email}")
    print(f"Match Percentage: {match_percentage:.2f}%")
    print(f"Extracted Skills: {', '.join(extracted_skills)}")
    print(f"Improvement Suggestions: {', '.join(improvement_suggestions) if improvement_suggestions else 'None'}")
    print(f"Suggested Roles: {', '.join(suggested_roles) if suggested_roles else 'None'}")
    print("="*50)

    # Setting up the email
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = sender_email
    msg['To'] = recipient_email

    try:
        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
            server.login(sender_email, sender_password)
            server.sendmail(sender_email, recipient_email, msg.as_string())
        print(f"Email sent to {recipient_email} successfully!")
    except Exception as e:
        print(f"Failed to send email to {recipient_email}: {e}")

# Function to extract skills from job description
def extract_skills_from_job_description(job_description):
    return [skill for skill in COMMON_SKILLS if skill.lower() in job_description.lower()]

# Function to match extracted skills with job description skills and calculate match percentage
def match_skills(extracted_skills, job_skills):
    matched_skills = set(extracted_skills) & set(job_skills)
    match_percentage = len(matched_skills) / len(job_skills) * 100 if job_skills else 0
    return matched_skills, match_percentage

# Main processing function
def process_resumes(resume_folder_path, job_description):
    job_skills = extract_skills_from_job_description(job_description)

    # Process PDF and image resumes
    for resume_path in glob.glob(os.path.join(resume_folder_path, '*.pdf')) + \
                      glob.glob(os.path.join(resume_folder_path, '*.jpg')) + \
                      glob.glob(os.path.join(resume_folder_path, '*.png')):
        print(f"\nProcessing resume: {resume_path}")

        if resume_path.endswith('.pdf'):
            resume_text = extract_text_with_pymupdf(resume_path)
        else:  # Assume it's an image
            resume_text = extract_text_from_image(resume_path)

        candidate_name = extract_name_from_text(resume_text)
        extracted_skills = extract_skills_with_nlp(resume_text)
        recipient_email = extract_email(resume_text)

        matched_skills, match_percentage = match_skills(extracted_skills, job_skills)

        # Prepare suggestions if the candidate is not selected
        improvement_suggestions, suggested_roles = generate_suggestions(extracted_skills)

        if recipient_email:
            # Send email if a valid email is found
            send_email(recipient_email, candidate_name, match_percentage, extracted_skills, improvement_suggestions, suggested_roles)
        else:
            print(f"No email found for {candidate_name}")

# Update these paths accordingly
resume_folder = r'D:/Project/New folder (2)/resumes'  # Path to your resumes folder
job_description_file = input()  # Path to your job description

process_resumes(resume_folder, job_description_file)



Processing resume: D:/Project/New folder (2)/resumes\A.pdf

Extracted Details for Emily Harris:
Email: emily.harris@example.com
Match Percentage: 25.00%
Extracted Skills: Java, JavaScript, Node.js, React, R, HTML, CSS, Webpack, Git
Improvement Suggestions: None
Suggested Roles: None
Failed to send email to emily.harris@example.com: (535, b'5.7.8 Username and Password not accepted. For more information, go to\n5.7.8  https://support.google.com/mail/?p=BadCredentials d9443c01a7336-20d1804b734sm36597875ad.222 - gsmtp')

Processing resume: D:/Project/New folder (2)/resumes\A2.pdf

Extracted Details for Emily Harris:
Email: j87221304@gmail.com
Match Percentage: 25.00%
Extracted Skills: Java, JavaScript, Node.js, React, R, HTML, CSS, Webpack, Git
Improvement Suggestions: None
Suggested Roles: None
Failed to send email to j87221304@gmail.com: (535, b'5.7.8 Username and Password not accepted. For more information, go to\n5.7.8  https://support.google.com/mail/?p=BadCredentials d9443c01a7336-2

In [24]:
import os
import re
import glob
import fitz  # PyMuPDF
import smtplib
from email.mime.text import MIMEText
from PIL import Image
import pytesseract
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# List of common skills to match
COMMON_SKILLS = [
    "Python", "Flask", "Django", "FastAPI", "Pandas", "NumPy", "SciPy", "Matplotlib", "Seaborn",
    "TensorFlow", "Keras", "PyTorch", "Scikit-learn", "NLTK", "SpaCy", "SQLAlchemy", "Celery",
    "Java", "Spring", "Spring Boot", "Hibernate", "Maven", "Gradle", "JUnit", "JSP", "JSF",
    "JavaFX", "Swing", "JDBC", "Apache Camel", "Kotlin",
    "JavaScript", "Node.js", "React", "Angular", "Vue.js", "Next.js", "Express.js", "TypeScript",
    "Redux", "jQuery", "Electron", "Gatsby", "ES6", "Svelte",
    "C++", "Boost", "Qt", "OpenCV", "STL", "CUDA", "OpenMP", "CMake", "GTest",
    "C#", ".NET", "ASP.NET", "Entity Framework", "Blazor", "Xamarin", "Unity", "WPF", "WinForms",
    "LINQ", "Razor Pages", "MVC", "NUnit", "Azure Functions", "SignalR",
    "SQL", "MySQL", "PostgreSQL", "SQLite", "MongoDB", "Redis", "Oracle", "Cassandra", "Elasticsearch",
    "Microsoft SQL Server", "PL/SQL", "T-SQL",
    "R", "ggplot2", "Shiny", "dplyr", "tidyverse", "caret", "data.table", "rpart", "lubridate", "forecast",
    "R Markdown", "R Shiny",
    "HTML", "CSS", "Sass", "LESS", "Tailwind CSS", "Bootstrap", "Bulma", "Materialize", "WebAssembly",
    "Webpack", "Gulp", "JAMstack", "Grunt", "Pug", "Handlebars",
    "Docker", "Kubernetes", "AWS", "Azure", "Google Cloud Platform", "Terraform", "Ansible", "Jenkins",
    "CI/CD", "Helm", "OpenShift", "Serverless", "CloudFormation", "Chef", "Puppet", "Vagrant",
    "Hadoop", "Spark", "Kafka", "Airflow", "Hive", "Pig", "HBase", "Presto", "Dask", "PySpark", "Google BigQuery",
    "Machine Learning", "Deep Learning", "NLP", "OpenAI GPT", "BERT", "AutoML", "Reinforcement Learning",
    "Computer Vision", "Generative Adversarial Networks (GANs)", "XGBoost", "LightGBM", "CatBoost",
    "Swift", "Objective-C", "Kotlin", "React Native", "Flutter", "Dart", "Xcode", "Android Studio",
    "iOS Development", "Android Development",
    "Git", "GitHub", "GitLab", "Bitbucket", "Subversion", "Travis CI", "CircleCI", "Jenkins", "GitFlow",
    "Perforce",
    "Selenium", "Cypress", "Appium", "Postman", "JMeter", "JUnit", "Mockito", "Cucumber", "PyTest",
    "Robot Framework", "TestNG", "Jest", "Mocha", "Chai",
    "Agile", "Scrum", "Kanban", "JIRA", "Confluence", "Trello", "Slack", "Basecamp", "Microsoft Project",
    "Asana", "Monday.com", "ClickUp",
    "GraphQL", "REST API", "SOAP", "gRPC", "Microservices", "Event-Driven Architecture", "Apache Kafka",
    "RabbitMQ", "Message Queues", "OpenAPI", "Swagger", "OAuth", "JWT", "Firebase", "Heroku",
    "Bash", "PowerShell", "Perl", "Ruby", "Shell Scripting", "Groovy", "Lua",
    "Istio", "Envoy", "Linkerd", "Consul", "Prometheus", "Grafana", "Jaeger", "Fluentd", "Elastic Stack (ELK)",
    "Logstash", "ECS", "EKS", "Fargate",
]

# Job roles to match
JOB_ROLES = [
    "Developer", "Intern", "Engineer", "Manager", "Entry-Level",
    "Senior", "Junior", "Specialist", "Associate", "Lead",
    "Staff", "Principal", "Director", "Executive", "Consultant",
    "Analyst", "Programmer", "Designer", "Coordinator", "Architect",
    "Technician", "Administrator", "Data Scientist", "Data Analyst"
]

# Specify folder path for resumes
RESUME_FOLDER_PATH = 'D:/Project/New folder (2)/resumes'

# Function to extract text from a PDF using PyMuPDF (fitz)
def extract_text_with_pymupdf(file_path):
    text = ""
    try:
        pdf_document = fitz.open(file_path)
        for page in pdf_document:
            text += page.get_text()
        pdf_document.close()
    except Exception as e:
        print("Error reading PDF:", e)
    return text.strip()

# Function to extract text from images using pytesseract (with enhanced image quality)
def extract_text_from_image(file_path):
    try:
        image = Image.open(file_path)
        # Enhance image quality (optional, adjust enhancement factor as needed)
        enhanced_image = image.enhance(3.0)  # Adjust factor based on image quality
        text = pytesseract.image_to_string(enhanced_image)
        return text.strip()
    except Exception as e:
        print("Error reading image:", e)
        return ""

import re

class NameExtractor:
    def __init__(self):
        # Define patterns for name extraction
        self.name_patterns = [
            r'^[A-Z][a-zA-Z\s\-\.]+',  # Standard names (e.g., "First Last")
            r'^[A-Z]+\s[A-Z]+',         # Fully uppercase names (e.g., "A L P H I N E P A T R I C K F")
            r'\b[A-Z][a-z]*\.?\s[A-Z][a-z]*\.?\b',  # Names with periods (e.g., "Jesper Deni A.")
        ]

    def extract_name(self, text, email):
        # Normalize text and email for case-insensitive comparison
        normalized_text = text.lower()
        email_name = email.split('@')[0]  # Get the part before '@'
        normalized_email_name = email_name.lower()  # Normalize the email name

        # Return matched text if the email name is found in text (case-insensitive)
        if normalized_email_name in normalized_text:
            # Find and return the matched portion of the text excluding the email
            matched_line = re.search(r'\b' + re.escape(normalized_email_name) + r'\b', normalized_text)
            if matched_line:
                # Return the matched line in its original form
                start_index = matched_line.start()
                end_index = matched_line.end()
                return text[start_index:end_index].strip()

        # Try regular expression patterns first for names
        for pattern in self.name_patterns:
            match = re.search(pattern, text)
            if match:
                return match.group(0).strip()

        # If not found, return default
        return "Name not found"

    def extract_phone(self, text):
    # Normalize text for phone number extraction
        normalized_text = text.lower()
        
        # Try to match phone number patterns with country code and split digits
        patterns = [
            r'\+91\s*\d{10}',  # Format like +91 1234567890 or +91 123-456-7890
            r'\+91[-.\s]?\d{3}[-.\s]?\d{3}[-.\s]?\d{4}',  # Format like +91-123-456-7890 or +91.123.456.7890
            r'\b\d{10}\b',  # 10-digit number
            r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b',  # Standard formats
            r'\b\(\d{3}\)\s*\d{3}[-.\s]?\d{4}\b',  # Formats like (123) 456-7890
            r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}\b'  # International formats
        ]
        
        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                phone_number = match.group(0).strip()
                # If it starts with +91, extract the last 10 digits
                if phone_number.startswith('+91'):
                    return phone_number[-10:]  # Return only the last 10 digits
                
                return phone_number  # Return matched phone number as is

        # If no phone number is found, return default
        return "Phone number not found"

# Function to check if a line contains a job role
def is_job_role(line):
    return any(role in line for role in JOB_ROLES)

# Function to check if a line contains a common skill
def is_common_skill(line):
    return any(skill in line for skill in COMMON_SKILLS)

# Function to extract skills using basic text matching
def extract_skills_with_nlp(resume_text):
    extracted_skills = [skill for skill in COMMON_SKILLS if skill.lower() in resume_text.lower()]
    return extracted_skills

# Function to extract email from resume text
def extract_email(resume_text):
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    emails = re.findall(email_pattern, resume_text)
    return emails[0] if emails else None

# Function to send an email with personalized suggestions
def send_email(recipient_email, candidate_name, candidate_phone, match_percentage, extracted_skills, improvement_suggestions, suggested_roles):
    sender_email = "your_email@gmail.com"  # Replace with your email
    sender_password = "your_password"  # Replace with your email password
    subject = "Application Status and Improvement Suggestions"

    body = f"Dear {candidate_name},\n\n" \
           f"Thank you for applying. Based on your resume, you have a match percentage of {match_percentage}%.\n\n" \
           f"Your Phone Number: {candidate_phone}\n\n" \
           f"Extracted Skills: {', '.join(extracted_skills) if extracted_skills else 'None'}" \
           f"\n\nImprovement Suggestions:\n" + "\n".join(improvement_suggestions) + \
           f"\n\nSuggested Roles:\n" + "\n".join(suggested_roles) + \
           f"\n\nBest Regards,\nYour Company"

    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = sender_email
    msg['To'] = recipient_email

    try:
        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
            server.login(sender_email, sender_password)
            server.sendmail(sender_email, recipient_email, msg.as_string())
            print(f"Email sent to {recipient_email}")
    except Exception as e:
        print(f"Failed to send email to {recipient_email}: {e}")

# Function to process resumes in the specified folder
# Function to process resumes in the specified folder
def process_resumes(job_description):
    for file_path in glob.glob(os.path.join(RESUME_FOLDER_PATH, '*.pdf')):  # Change to *.pdf if only PDF files
        resume_text = extract_text_with_pymupdf(file_path)
        
        # Email extraction first to use in name extraction
        candidate_email = extract_email(resume_text)

        # Name extraction logic
        name_extractor = NameExtractor()
        candidate_name = name_extractor.extract_name(resume_text, candidate_email)  # Pass email as well
        
        # Phone extraction
        candidate_phone = extract_phone_number(resume_text)

        # Skill extraction
        extracted_skills = extract_skills_with_nlp(resume_text)

        # Dummy match percentage for demonstration
        match_percentage = 50  # You can implement an actual calculation based on skills and job description

        # Check if the candidate is suitable
        if match_percentage < 75:  # Assuming 75% is the threshold for selection
            improvement_suggestions = ["Improve your Python skills.", "Gain experience with Django."]
            suggested_roles = ["Junior Developer", "Internship Opportunities"]

            # Print candidate details
            print(f"Candidate Name: {candidate_name}")
            print(f"Candidate Email: {candidate_email}")
            print(f"Candidate Phone: {candidate_phone}")
            print(f"Match Percentage: {match_percentage}%")
            print(f"Extracted Skills: {', '.join(extracted_skills) if extracted_skills else 'None'}\n")

            # Send email if a candidate email is found
            if candidate_email:
                send_email(candidate_email, candidate_name, candidate_phone, match_percentage, extracted_skills, improvement_suggestions, suggested_roles)

# Main function to run the process
if __name__ == "__main__":
    job_description = input("Enter the job description for the role: ")
    process_resumes(job_description)


Candidate Name: emily.harris
Candidate Email: emily.harris@example.com
Candidate Phone: 90001
Match Percentage: 50%
Extracted Skills: Java, JavaScript, Node.js, React, R, HTML, CSS, Webpack, Git

Failed to send email to emily.harris@example.com: (535, b'5.7.8 Username and Password not accepted. For more information, go to\n5.7.8  https://support.google.com/mail/?p=BadCredentials 98e67ed59e1d1-2e3e08b17c2sm1138167a91.3 - gsmtp')
Candidate Name: j87221304
Candidate Email: j87221304@gmail.com
Candidate Phone: 90001
Match Percentage: 50%
Extracted Skills: Java, JavaScript, Node.js, React, R, HTML, CSS, Webpack, Git

Failed to send email to j87221304@gmail.com: (535, b'5.7.8 Username and Password not accepted. For more information, go to\n5.7.8  https://support.google.com/mail/?p=BadCredentials d9443c01a7336-20d1806b5d5sm38651565ad.291 - gsmtp')
Candidate Name: alex.rivera
Candidate Email: alex.rivera@example.com
Candidate Phone: 60601
Match Percentage: 50%
Extracted Skills: Python, C++, Bo