In [2]:
import os
import re
import glob
import pdfplumber
import fitz
import smtplib
from email.mime.text import MIMEText

# List of common skills to match
COMMON_SKILLS = [
    "Python", "Flask", "Django", "FastAPI", "Pandas", "NumPy", "SciPy", "Matplotlib", "Seaborn", 
    "TensorFlow", "Keras", "PyTorch", "Scikit-learn", "NLTK", "SpaCy", "SQLAlchemy", "Celery",
    "Java", "Spring", "Spring Boot", "Hibernate", "Maven", "Gradle", "JUnit", "JSP", "JSF", 
    "JavaFX", "Swing", "JDBC", "Apache Camel", "Kotlin",
    "JavaScript", "Node.js", "React", "Angular", "Vue.js", "Next.js", "Express.js", "TypeScript", 
    "Redux", "jQuery", "Electron", "Gatsby", "ES6", "Svelte",
    "C++", "Boost", "Qt", "OpenCV", "STL", "CUDA", "OpenMP", "CMake", "GTest", 
    "C#", ".NET", "ASP.NET", "Entity Framework", "Blazor", "Xamarin", "Unity", "WPF", "WinForms", 
    "LINQ", "Razor Pages", "MVC", "NUnit", "Azure Functions", "SignalR",
    "SQL", "MySQL", "PostgreSQL", "SQLite", "MongoDB", "Redis", "Oracle", "Cassandra", "Elasticsearch", 
    "Microsoft SQL Server", "PL/SQL", "T-SQL",
    "R", "ggplot2", "Shiny", "dplyr", "tidyverse", "caret", "data.table", "rpart", "lubridate", "forecast", 
    "R Markdown", "R Shiny",
    "HTML", "CSS", "Sass", "LESS", "Tailwind CSS", "Bootstrap", "Bulma", "Materialize", "WebAssembly", 
    "Webpack", "Gulp", "JAMstack", "Grunt", "Pug", "Handlebars",
    "Docker", "Kubernetes", "AWS", "Azure", "Google Cloud Platform", "Terraform", "Ansible", "Jenkins", 
    "CI/CD", "Helm", "OpenShift", "Serverless", "CloudFormation", "Chef", "Puppet", "Vagrant",
    "Hadoop", "Spark", "Kafka", "Airflow", "Hive", "Pig", "HBase", "Presto", "Dask", "PySpark", "Google BigQuery",
    "Machine Learning", "Deep Learning", "NLP", "OpenAI GPT", "BERT", "AutoML", "Reinforcement Learning", 
    "Computer Vision", "Generative Adversarial Networks (GANs)", "XGBoost", "LightGBM", "CatBoost",
    "Swift", "Objective-C", "Kotlin", "React Native", "Flutter", "Dart", "Xcode", "Android Studio", 
    "iOS Development", "Android Development",
    "Git", "GitHub", "GitLab", "Bitbucket", "Subversion", "Travis CI", "CircleCI", "Jenkins", "GitFlow", 
    "Perforce",
    "Selenium", "Cypress", "Appium", "Postman", "JMeter", "JUnit", "Mockito", "Cucumber", "PyTest", 
    "Robot Framework", "TestNG", "Jest", "Mocha", "Chai",
    "Agile", "Scrum", "Kanban", "JIRA", "Confluence", "Trello", "Slack", "Basecamp", "Microsoft Project", 
    "Asana", "Monday.com", "ClickUp",
    "GraphQL", "REST API", "SOAP", "gRPC", "Microservices", "Event-Driven Architecture", "Apache Kafka", 
    "RabbitMQ", "Message Queues", "OpenAPI", "Swagger", "OAuth", "JWT", "Firebase", "Heroku",
    "Bash", "PowerShell", "Perl", "Ruby", "Shell Scripting", "Groovy", "Lua",
    "Istio", "Envoy", "Linkerd", "Consul", "Prometheus", "Grafana", "Jaeger", "Fluentd", "Elastic Stack (ELK)", 
    "Logstash", "ECS", "EKS", "Fargate",
]

JOB_ROLES = [
    "Developer", "Intern", "Engineer", "Manager", "Entry-Level",
    "Senior", "Junior", "Specialist", "Associate", "Lead",
    "Staff", "Principal", "Director", "Executive", "Consultant",
    "Analyst", "Programmer", "Designer", "Coordinator", "Architect",
    "Technician", "Administrator", "Data Scientist", "Data Analyst"
]

# Function to extract text from a PDF using PyMuPDF (fitz)
def extract_text_with_pymupdf(file_path):
    text = ""
    try:
        pdf_document = fitz.open(file_path)
        for page in pdf_document:
            text += page.get_text()
        pdf_document.close()
    except Exception as e:
        print("Error reading PDF:", e)
    return text.strip()
def extract_name_from_text(text):
    name_patterns = [
        re.compile(r'^[A-Z][a-zA-Z\s\-\.]+$', re.MULTILINE),  # Standard names
        re.compile(r'\b[A-Z][a-zA-Z\s\.\-]+\s+[A-Z][a-zA-Z\s\.\-]+\b', re.MULTILINE),  # Full names (First Last)
        re.compile(r'\b[A-Z]+\s+[A-Z]+\.[A-Z]+\b', re.IGNORECASE)  # Names with initials and periods (like "LALITH AKKASH.V")
    ]

    # Split the text into lines
    lines = text.split('\n')
    
    # Check the first line for a valid name
    first_line = lines[0] if lines else ""

    if not is_job_role(first_line) and not is_common_skill(first_line):
        for pattern in name_patterns:
            match = pattern.search(first_line)
            if match:
                return match.group(0).strip()

    # Use regex patterns to find names in all lines, avoiding common skill and job role lines
    for line in lines:
        # Skip lines that look like addresses (i.e., contain numbers or common address terms)
        if re.search(r'\d', line) or 'Street' in line or 'Avenue' in line or 'Road' in line:
            continue
        for pattern in name_patterns:
            match = pattern.search(line)
            if match and not is_job_role(line) and not is_common_skill(line):
                return match.group(0).strip()

    return "Name not found"
def is_job_role(line):
    job_roles = [
        "Developer", "Intern", "Engineer", "Manager", "Entry", "Level",
        "Senior", "Junior", "Specialist", "Associate", "Lead", "Staff",
        "Principal", "Director", "Executive", "Consultant", "Analyst",
        "Programmer", "Designer", "Coordinator", "Architect", "Technician", "Administrator"
    ]
    return any(role in line for role in job_roles)

def is_common_skill(line):
    common_skills = ["Python", "Java", "SQL", "C++", "JavaScript", "HTML", "CSS"]
    return any(skill in line for skill in common_skills)



# Function to extract skills using basic text matching
def extract_skills_with_nlp(resume_text):
    extracted_skills = [skill for skill in COMMON_SKILLS if skill.lower() in resume_text.lower()]
    return extracted_skills

# Function to extract email from resume text
def extract_email(resume_text):
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    emails = re.findall(email_pattern, resume_text)
    return emails[0] if emails else None

# Function to generate personalized suggestions based on candidate's skills
SKILL_IMPROVEMENT_MAPPING = {
    "Python": {
        "improvements": ["Practice advanced Python concepts", "Contribute to open-source projects"],
        "roles": ["Data Scientist", "Backend Developer"]
    },
    "Machine Learning": {
        "improvements": ["Study ML algorithms", "Work on Kaggle competitions"],
        "roles": ["Machine Learning Engineer", "Data Analyst"]
    },
    # Add more skills, improvements, and roles as necessary
}

def generate_suggestions(extracted_skills):
    improvement_suggestions = set()
    suggested_roles = set()

    for skill in extracted_skills:
        if skill in SKILL_IMPROVEMENT_MAPPING:
            improvement_suggestions.update(SKILL_IMPROVEMENT_MAPPING[skill]["improvements"])
            suggested_roles.update(SKILL_IMPROVEMENT_MAPPING[skill]["roles"])

    return improvement_suggestions, suggested_roles

# Function to send an email with personalized suggestions for rejected candidates
def send_email(recipient_email, candidate_name, match_percentage, extracted_skills, improvement_suggestions, suggested_roles):
    sender_email = "jesperdeni002@gmail.com"
    sender_password = "dhex qzvo tpwq zkyf"  # Use environment variables for security

    subject = "Job Application Update"
    body = f"""
    Dear {candidate_name},

    Thank you for your application. 
    Your skills matched {match_percentage:.2f}% with our job requirements.

    Extracted Skills: {', '.join(extracted_skills)}

    Unfortunately, we cannot move forward with your application at this time. However, we encourage you to consider the following suggestions to enhance your skills:

    Suggestions for Improvement:
    {', '.join(improvement_suggestions) if improvement_suggestions else "No suggestions available."}

    Suggested Roles for Consideration:
    {', '.join(suggested_roles) if suggested_roles else "No roles suggested."}

    Best regards,
    [Your Name]
    """

    # Print extracted details for verification
    print(f"\nExtracted Details for {candidate_name}:")
    print(f"Email: {recipient_email}")
    print(f"Match Percentage: {match_percentage:.2f}%")
    print(f"Extracted Skills: {', '.join(extracted_skills)}")
    print(f"Improvement Suggestions: {', '.join(improvement_suggestions) if improvement_suggestions else 'None'}")
    print(f"Suggested Roles: {', '.join(suggested_roles) if suggested_roles else 'None'}")
    print("="*50)  # Separator for better readability

    # Setting up the email
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = sender_email
    msg['To'] = recipient_email

    try:
        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
            server.login(sender_email, sender_password)
            server.sendmail(sender_email, recipient_email, msg.as_string())
        print(f"Email sent to {recipient_email} successfully!")
    except Exception as e:
        print(f"Failed to send email to {recipient_email}: {e}")

# Function to extract skills from job description
def extract_skills_from_job_description(job_description):
    return [skill for skill in COMMON_SKILLS if skill.lower() in job_description.lower()]

# Function to match extracted skills with job description skills and calculate match percentage
def match_skills(extracted_skills, job_skills):
    matched_skills = set(extracted_skills) & set(job_skills)
    match_percentage = len(matched_skills) / len(job_skills) * 100 if job_skills else 0
    return matched_skills, match_percentage

# Main processing function
def process_resumes(resume_folder_path, job_description):
    job_skills = extract_skills_from_job_description(job_description)

    for resume_path in glob.glob(os.path.join(resume_folder_path, '*.pdf')):
        print(f"\nProcessing resume: {resume_path}")

        resume_text = extract_text_with_pymupdf(resume_path)
        candidate_name = extract_name_from_text(resume_text)
        extracted_skills = extract_skills_with_nlp(resume_text)
        recipient_email = extract_email(resume_text)

        matched_skills, match_percentage = match_skills(extracted_skills, job_skills)

        # Prepare suggestions if the candidate is not selected
        improvement_suggestions, suggested_roles = generate_suggestions(extracted_skills)

        if recipient_email:
            # Send email if a valid email is found
            send_email(recipient_email, candidate_name, match_percentage, extracted_skills, improvement_suggestions, suggested_roles)
        else:
            print(f"No email found for {candidate_name}")

resume_folder = r'D:\\praveen\\vdart\\Resume_full'  # Update this with your resumes folder path
job_description_file = r'D:\\praveen\\vdart\\machine learning_vv.txt'  # Update this with your job description file path

process_resumes(resume_folder, job_description_file)


Processing resume: D:\\praveen\\vdart\\Resume_full\A.pdf

Extracted Details for Emily Harris:
Email: emily.harris@example.com
Match Percentage: 33.33%
Extracted Skills: Java, JavaScript, Node.js, React, R, HTML, CSS, Webpack, Git
Improvement Suggestions: None
Suggested Roles: None
Email sent to emily.harris@example.com successfully!

Processing resume: D:\\praveen\\vdart\\Resume_full\Alex Rivera.pdf

Extracted Details for Alex Rivera:
Email: alex.rivera@example.com
Match Percentage: 33.33%
Extracted Skills: Python, C++, Boost, STL, CMake, R, Git
Improvement Suggestions: Practice advanced Python concepts, Contribute to open-source projects
Suggested Roles: Data Scientist, Backend Developer
Email sent to alex.rivera@example.com successfully!

Processing resume: D:\\praveen\\vdart\\Resume_full\Alphine_Patrick_resume 1.pdf

Extracted Details for A L P H I N E P A T R I C K F:
Email: alphinepatrickf@gmail.com
Match Percentage: 66.67%
Extracted Skills: Python, Java, JavaScript, C++, Unity, 

In [None]:
import re
import pdfplumber
import pytesseract
from PIL import Image
import os
import spacy
import smtplib  # For sending emails
from email.mime.multipart import MIMEMultipart  # For email structure
from email.mime.text import MIMEText  # For adding text to emails

# Set the path for the Tesseract OCR executable
pytesseract.pytesseract.tesseract_cmd = r'C:\\Users\\praveenkumar.s\\AppData\\Local\\Programs\\Tesseract-OCR\\tesseract.exe'

# Load spaCy's English NLP model
nlp = spacy.load("en_core_web_sm")

def is_job_role(line):
    """Check if the given line contains a known job role."""
    job_roles = ["Software Engineer", "Data Scientist", "Project Manager"]
    return any(role.lower() in line.lower() for role in job_roles)

def is_common_skill(line):
    """Check if the given line contains a common skill."""
    common_skills = ["JavaScript", "Python", "Java", "C++", "HTML", "CSS"]
    return any(skill.lower() in line.lower() for skill in common_skills)
def extract_phone_numbers(text):
    # Regular expression pattern to match common phone number formats
    phone_pattern = r'\+?\d{1,3}[-.\s]??\(?\d{1,4}\)?[-.\s]??\d{1,4}[-.\s]??\d{1,9}'
    
    # Find all matches for the pattern
    phone_numbers = re.findall(phone_pattern, text)

    # Filter to ensure realistic lengths (e.g., 10-15 characters for international numbers)
    phone_numbers = [num for num in phone_numbers if 10 <= len(re.sub(r'\D', '', num)) <= 15]

    # Sort phone numbers (optional, could be sorted by length or numerically)
    phone_numbers.sort()

    return phone_numbers

def extract_text_with_pdfminer(file_path):
    # This is a placeholder function for extracting text from PDF using pdfminer or another library
    # You'll need to implement the actual text extraction logic here
    return "Sample text from PDF with phone number: +1 (234) 567-8901"

# Example of processing PDF files
def process_pdfs(resume_folder_path, files):
    for file_name in files:
        if file_name.endswith('.pdf'):  # Check if the file is a PDF
            file_path = os.path.join(resume_folder_path, file_name)  # Get the full file path
            extracted_text = extract_text_with_pdfminer(file_path)

            # Extract phone numbers from the extracted text
            phone_numbers = extract_phone_numbers(extracted_text)

            # Print the extracted phone numbers
            print(f"Extracted Phone Numbers from {file_name}: {phone_numbers}")

def extract_name_from_text(text):
    """Extract potential names from text using regex patterns and filters."""
    if text is None:
        print("No text provided for name extraction.")
        return None
    name_patterns = [
        re.compile(r'^[A-Z][a-zA-Z\s\-\.]+$', re.MULTILINE),
        re.compile(r'\b[A-Z][a-zA-Z\s\.\-]+\s+[A-Z][a-zA-Z\s\.\-]+\b', re.MULTILINE),
        re.compile(r'\b[A-Z]+\s+[A-Z]+\.[A-Z]+\b', re.IGNORECASE)
    ]
    lines = text.split('\n')
    for line in lines:
        if re.search(r'\d', line) or 'Street' in line or 'Avenue' in line or 'Road' in line:
            continue
        for pattern in name_patterns:
            match = pattern.search(line)
            if match and not is_job_role(line) and not is_common_skill(line):
                return match.group(0).strip()
    return "Name not found"

def extract_emails(text):
    """Extract email addresses from text using enhanced regex."""
    email_pattern = r'([a-zA-Z0-9._%+-]+)(?:\s*|\s*)(@)(?:\s*|\s*)([a-zA-Z0-9.-]+)(?:\s*|\s*)(\.[a-zA-Z]{2,})'
    matches = re.findall(email_pattern, text)
    emails = [f"{username}{at}{domain}{tld}" for username, at, domain, tld in matches]
    emails = [email.replace(" ", "") for email in emails]
    if not emails:
        print("No emails found in the provided text.")
    return emails

def extract_skills(text):
    """Extract common skills from text using regex for specified skills."""
    skills_pattern = r'\b(?:JavaScript|Python|Java|C\+\+|C|HTML|CSS|React\.js|Node\.js|Git|SQL|Tableau|Machine Learning|Keras|TensorFlow|Photoshop|PowerPoint|Visual Studio|Premiere Pro|MySQL|Excel)\b'
    skills = re.findall(skills_pattern, text, flags=re.IGNORECASE)
    return list(set(skills))

def process_pdf(file_path):
    """Extract text from a PDF file."""
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

def process_image(file_path):
    """Extract text from an image file using OCR."""
    image = Image.open(file_path)
    text = pytesseract.image_to_string(image)
    return text

def extract_resume_details(file_path):
    """Extract details from a resume file, either in PDF or image format."""
    if file_path.lower().endswith('.pdf'):
        text = process_pdf(file_path)
    elif file_path.lower().endswith(('.jpg', '.jpeg', '.png')):
        text = process_image(file_path)
    else:
        return None
    
    names = extract_name_from_text(text)
    emails = extract_emails(text)
    phone_numbers = extract_phone_numbers(text)
    skills = extract_skills(text)

    return {
        'File': file_path,
        'Names': names,
        'Emails': emails,
        'Phone Numbers': phone_numbers,
        'Skills': skills
    }

def generate_improvement_suggestions(skills):
    """Generate improvement suggestions based on missing skills."""
    essential_skills = {"Python", "SQL", "Machine Learning", "Data Analysis"}  # Add desired skills as needed
    missing_skills = essential_skills - set(skills)
    if missing_skills:
        return f"Consider improving the following skills: {', '.join(missing_skills)}"
    return "No major skill improvements needed."

def send_email(to_address, name, suggestions):
    """Send an email with improvement suggestions to the candidate."""
    from_address = "youremail@example.com"  # Update with your email
    password = "yourpassword"  # Update with your email password

    # Set up email content
    subject = "Resume Analysis and Improvement Suggestions"
    body = f"Dear {name},\n\nWe have reviewed your resume and have some suggestions:\n\n{suggestions}\n\nBest regards,\nYour Company"

    # Set up the MIME structure
    msg = MIMEMultipart()
    msg['From'] = from_address
    msg['To'] = to_address
    msg['Subject'] = subject
    msg.attach(MIMEText(body, 'plain'))

    try:
        # Establish a secure session with Gmail's SMTP server
        with smtplib.SMTP_SSL("smtp.gmail.com", 465) as server:
            server.login(from_address, password)
            server.send_message(msg)
        print(f"Email sent to {to_address}")
    except Exception as e:
        print(f"Failed to send email to {to_address}: {e}")

def main():
    """Main function to process all resumes in a folder, display details, and send email with suggestions."""
    resume_folder = r"D:\praveen\vdart\Resume_full"  # Update the folder path to the directory containing resumes
    resume_details = []

    for filename in os.listdir(resume_folder):
        file_path = os.path.join(resume_folder, filename)
        details = extract_resume_details(file_path)
        if details:
            resume_details.append(details)
    
    # Display extracted details and send email with suggestions for each resume
    for detail in resume_details:
        print(f"File: {detail['File']}")
        print(f"Names: {detail['Names']}")
        print(f"Emails: {detail['Emails']}")
        print(f"Phone Numbers: {detail['Phone Numbers']}")
        print(f"Skills: {detail['Skills']}")
        print("-" * 50)

        # Generate improvement suggestions based on skills
        suggestions = generate_improvement_suggestions(detail['Skills'])

        # Send email if an email address is extracted
        if detail['Emails']:
            send_email(detail['Emails'][0], detail['Names'], suggestions)

if __name__ == "__main__":
    main()


No emails found in the provided text.
No emails found in the provided text.
File: D:\praveen\vdart\Resume_full\A.pdf
Names: Emily Harris
Emails: ['emily.harris@example.com']
Phone Numbers: ['310) 123-4567']
Skills: ['HTML', 'JavaScript', 'CSS', 'React.js', 'Git', 'Node.js']
--------------------------------------------------
Failed to send email to emily.harris@example.com: (535, b'5.7.8 Username and Password not accepted. For more information, go to\n5.7.8  https://support.google.com/mail/?p=BadCredentials d9443c01a7336-210bbf6d5d9sm61016285ad.75 - gsmtp')
File: D:\praveen\vdart\Resume_full\Alex Rivera.pdf
Names: Alex Rivera
Emails: ['alex.rivera@example.com']
Phone Numbers: ['312) 123-4567']
Skills: ['Python', 'Git', 'Visual Studio', 'C']
--------------------------------------------------
Failed to send email to alex.rivera@example.com: (535, b'5.7.8 Username and Password not accepted. For more information, go to\n5.7.8  https://support.google.com/mail/?p=BadCredentials d2e1a72fcca58-

In [3]:
import re
import pdfplumber
import pytesseract
from PIL import Image
import os
import spacy
import smtplib  # For sending emails
from email.mime.multipart import MIMEMultipart  # For email structure
from email.mime.text import MIMEText  # For adding text to emails
import glob

# Set the path for the Tesseract OCR executable
pytesseract.pytesseract.tesseract_cmd = r'C:\\Users\\praveenkumar.s\\AppData\\Local\\Programs\\Tesseract-OCR\\tesseract.exe'

# Load spaCy's English NLP model
nlp = spacy.load("en_core_web_sm")

def is_job_role(line):
    """Check if the given line contains a known job role."""
    job_roles = ["Software Engineer", "Data Scientist", "Project Manager"]
    return any(role.lower() in line.lower() for role in job_roles)

def is_common_skill(line):
    """Check if the given line contains a common skill."""
    common_skills = ["JavaScript", "Python", "Java", "C++", "HTML", "CSS"]
    return any(skill.lower() in line.lower() for skill in common_skills)

def extract_phone_numbers(text):
    """Extract phone numbers from the text using regex."""
    phone_pattern = r'\+?\d{1,3}[-.\s]??\(?\d{1,4}\)?[-.\s]??\d{1,4}[-.\s]??\d{1,9}'
    phone_numbers = re.findall(phone_pattern, text)
    phone_numbers = [num for num in phone_numbers if 10 <= len(re.sub(r'\D', '', num)) <= 15]
    return phone_numbers

def extract_name_from_text(text):
    """Extract potential names from text using regex patterns and filters."""
    if text is None:
        print("No text provided for name extraction.")
        return None
    name_patterns = [
        re.compile(r'^[A-Z][a-zA-Z\s\-\.]+$', re.MULTILINE),
        re.compile(r'\b[A-Z][a-zA-Z\s\.\-]+\s+[A-Z][a-zA-Z\s\.\-]+\b', re.MULTILINE),
        re.compile(r'\b[A-Z]+\s+[A-Z]+\.[A-Z]+\b', re.IGNORECASE)
    ]
    lines = text.split('\n')
    for line in lines:
        if re.search(r'\d', line) or 'Street' in line or 'Avenue' in line or 'Road' in line:
            continue
        for pattern in name_patterns:
            match = pattern.search(line)
            if match and not is_job_role(line) and not is_common_skill(line):
                return match.group(0).strip()
    return "Name not found"

def extract_emails(text):
    """Extract email addresses from text using enhanced regex."""
    email_pattern = r'([a-zA-Z0-9._%+-]+)(?:\s*|\s*)(@)(?:\s*|\s*)([a-zA-Z0-9.-]+)(?:\s*|\s*)(\.[a-zA-Z]{2,})'
    matches = re.findall(email_pattern, text)
    emails = [f"{username}{at}{domain}{tld}" for username, at, domain, tld in matches]
    emails = [email.replace(" ", "") for email in emails]
    if not emails:
        print("No emails found in the provided text.")
    return emails

def extract_skills(text):
    """Extract common skills from text using regex for specified skills."""
    skills_pattern = r'\b(?:JavaScript|Python|Java|C\+\+|C|HTML|CSS|React\.js|Node\.js|Git|SQL|Tableau|Machine Learning|Keras|TensorFlow|Photoshop|PowerPoint|Visual Studio|Premiere Pro|MySQL|Excel)\b'
    skills = re.findall(skills_pattern, text, flags=re.IGNORECASE)
    return list(set(skills))

def process_pdf(file_path):
    """Extract text from a PDF file."""
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

def process_image(file_path):
    """Extract text from an image file using OCR."""
    image = Image.open(file_path)
    text = pytesseract.image_to_string(image)
    return text

def extract_resume_details(file_path):
    """Extract details from a resume file, either in PDF or image format."""
    if file_path.lower().endswith('.pdf'):
        text = process_pdf(file_path)
    elif file_path.lower().endswith(('.jpg', '.jpeg', '.png')):
        text = process_image(file_path)
    else:
        return None
    
    names = extract_name_from_text(text)
    emails = extract_emails(text)
    phone_numbers = extract_phone_numbers(text)
    skills = extract_skills(text)

    return {
        'File': file_path,
        'Names': names,
        'Emails': emails,
        'Phone Numbers': phone_numbers,
        'Skills': skills
    }

def generate_improvement_suggestions(skills):
    """Generate improvement suggestions based on missing skills."""
    essential_skills = {"Python", "SQL", "Machine Learning", "Data Analysis"}  # Add desired skills as needed
    missing_skills = essential_skills - set(skills)
    if missing_skills:
        return f"Consider improving the following skills: {', '.join(missing_skills)}"
    return "No major skill improvements needed."

# Skill improvement mapping for detailed suggestions
SKILL_IMPROVEMENT_MAPPING = {
    "Python": {
        "improvements": ["Practice advanced Python concepts", "Contribute to open-source projects"],
        "roles": ["Data Scientist", "Backend Developer"]
    },
    "Machine Learning": {
        "improvements": ["Study ML algorithms", "Work on Kaggle competitions"],
        "roles": ["Machine Learning Engineer", "Data Analyst"]
    },
    # Add more skills, improvements, and roles as necessary
}

def generate_suggestions(extracted_skills):
    improvement_suggestions = set()
    suggested_roles = set()

    for skill in extracted_skills:
        if skill in SKILL_IMPROVEMENT_MAPPING:
            improvement_suggestions.update(SKILL_IMPROVEMENT_MAPPING[skill]["improvements"])
            suggested_roles.update(SKILL_IMPROVEMENT_MAPPING[skill]["roles"])

    return improvement_suggestions, suggested_roles

def send_email(to_address, name, match_percentage, extracted_skills, improvement_suggestions, suggested_roles):
    """Send an email with improvement suggestions to the candidate."""
    from_address = "youremail@example.com"  # Update with your email
    password = "yourpassword"  # Update with your email password

    # Set up email content
    subject = "Resume Analysis and Improvement Suggestions"
    body = f"""
    Dear {name},

    Thank you for your application. Your skills matched {match_percentage:.2f}% with our job requirements.

    Extracted Skills: {', '.join(extracted_skills)}

    Unfortunately, we cannot move forward with your application at this time. However, we encourage you to consider the following suggestions to enhance your skills:

    Suggestions for Improvement:
    {', '.join(improvement_suggestions) if improvement_suggestions else "No suggestions available."}

    Suggested Roles for Consideration:
    {', '.join(suggested_roles) if suggested_roles else "No roles suggested."}

    Best regards,
    [Your Name]
    """

    # Setting up the email
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = from_address
    msg['To'] = to_address

    try:
        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
            server.login(from_address, password)
            server.sendmail(from_address, to_address, msg.as_string())
        print(f"Email sent to {to_address} successfully!")
    except Exception as e:
        print(f"Failed to send email to {to_address}: {e}")

def extract_skills_from_job_description(job_description):
    """Extract skills from the job description."""
    # Assuming COMMON_SKILLS is defined somewhere
    return [skill for skill in COMMON_SKILLS if skill.lower() in job_description.lower()]

def match_skills(extracted_skills, job_skills):
    """Match extracted skills with job description skills and calculate match percentage."""
    matched_skills = set(extracted_skills) & set(job_skills)
    match_percentage = len(matched_skills) / len(job_skills) * 100 if job_skills else 0
    return matched_skills, match_percentage

def process_resumes(resume_folder_path, job_description):
    """Process resumes in the specified folder and match them with the job description."""
    job_skills = extract_skills_from_job_description(job_description)

    for resume_path in glob.glob(os.path.join(resume_folder_path, '*.pdf')):
        print(f"\nProcessing resume: {resume_path}")

        resume_text = extract_text_with_pymupdf(resume_path)
        candidate_name = extract_name_from_text(resume_text)
        extracted_skills = extract_skills(resume_text)
        recipient_email = extract_emails(resume_text)

        matched_skills, match_percentage = match_skills(extracted_skills, job_skills)

        # Prepare suggestions if the candidate is not selected
        improvement_suggestions, suggested_roles = generate_suggestions(extracted_skills)

        # Display extracted details for clarity
        print("\n--- Extracted Details ---")
        print(f"Candidate Name: {candidate_name if candidate_name else 'Not Found'}")
        print(f"Extracted Skills: {', '.join(extracted_skills) if extracted_skills else 'None'}")
        print(f"Matched Skills: {', '.join(matched_skills) if matched_skills else 'None'}")
        print(f"Match Percentage: {match_percentage:.2f}%")
        
        if improvement_suggestions or suggested_roles:
            print("\n--- Improvement Suggestions ---")
            print(f"Suggestions for Improvement: {', '.join(improvement_suggestions) if improvement_suggestions else 'None'}")
            print(f"Suggested Roles: {', '.join(suggested_roles) if suggested_roles else 'None'}")

        print("------------------------------")

        if recipient_email:
            # Send email if a valid email is found
            send_email(recipient_email[0], candidate_name, match_percentage, extracted_skills, improvement_suggestions, suggested_roles)
        else:
            print(f"No email found for {candidate_name}")

# Example paths (update these as needed)
resume_folder = r'D:\\praveen\\vdart\\Resume_full'  # Resumes folder path
job_description_file = r'D:\\praveen\\vdart\\\machine_learning_vv.txt'  # Job description file path

# Load job description text
with open(job_description_file, 'r') as f:
    job_description = f.read()

process_resumes(resume_folder, job_description)


FileNotFoundError: [Errno 2] No such file or directory: 'D:\\\\praveen\\\\vdart\\\\machine_learning_vv.txt'

In [5]:
import re
import pdfplumber
import pytesseract
from PIL import Image
import os
import spacy
import smtplib  # For sending emails
from email.mime.multipart import MIMEMultipart  # For email structure
from email.mime.text import MIMEText  # For adding text to emails
import glob

# Set the path for the Tesseract OCR executable
pytesseract.pytesseract.tesseract_cmd = r'C:\\Users\\praveenkumar.s\\AppData\\Local\\Programs\\Tesseract-OCR\\tesseract.exe'

# Load spaCy's English NLP model
nlp = spacy.load("en_core_web_sm")

def is_job_role(line):
    """Check if the given line contains a known job role."""
    job_roles = ["Software Engineer", "Data Scientist", "Project Manager"]
    return any(role.lower() in line.lower() for role in job_roles)

def is_common_skill(line):
    """Check if the given line contains a common skill."""
    common_skills = ["JavaScript", "Python", "Java", "C++", "HTML", "CSS"]
    return any(skill.lower() in line.lower() for skill in common_skills)

def extract_phone_numbers(text):
    """Extract phone numbers from the text using regex."""
    phone_pattern = r'\+?\d{1,3}[-.\s]??\(?\d{1,4}\)?[-.\s]??\d{1,4}[-.\s]??\d{1,9}'
    phone_numbers = re.findall(phone_pattern, text)
    phone_numbers = [num for num in phone_numbers if 10 <= len(re.sub(r'\D', '', num)) <= 15]
    return phone_numbers

def extract_name_from_text(text):
    """Extract potential names from text using regex patterns and filters."""
    if text is None:
        print("No text provided for name extraction.")
        return None
    name_patterns = [
        re.compile(r'^[A-Z][a-zA-Z\s\-\.]+$', re.MULTILINE),
        re.compile(r'\b[A-Z][a-zA-Z\s\.\-]+\s+[A-Z][a-zA-Z\s\.\-]+\b', re.MULTILINE),
        re.compile(r'\b[A-Z]+\s+[A-Z]+\.[A-Z]+\b', re.IGNORECASE)
    ]
    lines = text.split('\n')
    for line in lines:
        if re.search(r'\d', line) or 'Street' in line or 'Avenue' in line or 'Road' in line:
            continue
        for pattern in name_patterns:
            match = pattern.search(line)
            if match and not is_job_role(line) and not is_common_skill(line):
                return match.group(0).strip()
    return "Name not found"

def extract_emails(text):
    """Extract email addresses from text using enhanced regex."""
    email_pattern = r'([a-zA-Z0-9._%+-]+)(?:\s*|\s*)(@)(?:\s*|\s*)([a-zA-Z0-9.-]+)(?:\s*|\s*)(\.[a-zA-Z]{2,})'
    matches = re.findall(email_pattern, text)
    emails = [f"{username}{at}{domain}{tld}" for username, at, domain, tld in matches]
    emails = [email.replace(" ", "") for email in emails]
    if not emails:
        print("No emails found in the provided text.")
    return emails

def extract_skills(text):
    """Extract common skills from text using regex for specified skills."""
    skills_pattern = r'\b(?:JavaScript|Python|Java|C\+\+|C|HTML|CSS|React\.js|Node\.js|Git|SQL|Tableau|Machine Learning|Keras|TensorFlow|Photoshop|PowerPoint|Visual Studio|Premiere Pro|MySQL|Excel)\b'
    skills = re.findall(skills_pattern, text, flags=re.IGNORECASE)
    return list(set(skills))

def process_pdf(file_path):
    """Extract text from a PDF file."""
    text = ""
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text() + "\n"
    except Exception as e:
        print(f"Error processing PDF file {file_path}: {e}")
    return text

def process_image(file_path):
    """Extract text from an image file using OCR."""
    text = ""
    try:
        image = Image.open(file_path)
        text = pytesseract.image_to_string(image)
    except Exception as e:
        print(f"Error processing image file {file_path}: {e}")
    return text

def extract_resume_details(file_path):
    """Extract details from a resume file, either in PDF or image format."""
    text = ""
    if file_path.lower().endswith('.pdf'):
        text = process_pdf(file_path)
    elif file_path.lower().endswith(('.jpg', '.jpeg', '.png')):
        text = process_image(file_path)
    else:
        print(f"Unsupported file type: {file_path}")
        return None
    
    names = extract_name_from_text(text)
    emails = extract_emails(text)
    phone_numbers = extract_phone_numbers(text)
    skills = extract_skills(text)

    return {
        'File': file_path,
        'Names': names,
        'Emails': emails,
        'Phone Numbers': phone_numbers,
        'Skills': skills
    }

def generate_improvement_suggestions(skills):
    """Generate improvement suggestions based on missing skills."""
    essential_skills = {"Python", "SQL", "Machine Learning", "Data Analysis"}  # Add desired skills as needed
    missing_skills = essential_skills - set(skills)
    if missing_skills:
        return f"Consider improving the following skills: {', '.join(missing_skills)}"
    return "No major skill improvements needed."

# Skill improvement mapping for detailed suggestions
SKILL_IMPROVEMENT_MAPPING = {
    "Python": {
        "improvements": ["Practice advanced Python concepts", "Contribute to open-source projects"],
        "roles": ["Data Scientist", "Backend Developer"]
    },
    "Machine Learning": {
        "improvements": ["Study ML algorithms", "Work on Kaggle competitions"],
        "roles": ["Machine Learning Engineer", "Data Analyst"]
    },
    # Add more skills, improvements, and roles as necessary
}

def generate_suggestions(extracted_skills):
    improvement_suggestions = set()
    suggested_roles = set()

    for skill in extracted_skills:
        if skill in SKILL_IMPROVEMENT_MAPPING:
            improvement_suggestions.update(SKILL_IMPROVEMENT_MAPPING[skill]["improvements"])
            suggested_roles.update(SKILL_IMPROVEMENT_MAPPING[skill]["roles"])

    return improvement_suggestions, suggested_roles

def send_email(to_address, name, match_percentage, extracted_skills, improvement_suggestions, suggested_roles):
    """Send an email with improvement suggestions to the candidate."""
    from_address = "youremail@example.com"  # Update with your email
    password = "yourpassword"  # Update with your email password

    # Set up email content
    subject = "Resume Analysis and Improvement Suggestions"
    body = f"""
    Dear {name},

    Thank you for your application. Your skills matched {match_percentage:.2f}% with our job requirements.

    Extracted Skills: {', '.join(extracted_skills)}

    Unfortunately, we cannot move forward with your application at this time. However, we encourage you to consider the following suggestions to enhance your skills:

    Suggestions for Improvement:
    {', '.join(improvement_suggestions) if improvement_suggestions else "No suggestions available."}

    Suggested Roles for Consideration:
    {', '.join(suggested_roles) if suggested_roles else "No roles suggested."}

    Best regards,
    [Your Name]
    """

    # Print extracted details for verification
    print(f"File: {extracted_details['File']}")
    print(f"Names: {extracted_details['Names']}")
    print(f"Emails: {extracted_details['Emails']}")
    print(f"Phone Numbers: {extracted_details['Phone Numbers']}")
    print(f"Skills: {extracted_details['Skills']}")
  

    print(f"Match Percentage: {match_percentage:.2f}%")
    print(f"Improvement Suggestions: {', '.join(improvement_suggestions) if improvement_suggestions else 'None'}")
    print(f"Suggested Roles: {', '.join(suggested_roles) if suggested_roles else 'None'}")
    print("=" * 50)

    # Setting up the email
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = from_address
    msg['To'] = to_address

    try:
        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
            server.login(from_address, password)
            server.sendmail(from_address, to_address, msg.as_string())
        print(f"Email sent to {to_address} successfully!")
    except Exception as e:
        print(f"Failed to send email to {to_address}: {e}")

def extract_skills_from_job_description(job_description):
    """Extract skills from the job description."""
    common_skills = ["Python", "SQL", "Machine Learning", "Data Analysis"]  # Add your job description skills
    return [skill for skill in common_skills if skill.lower() in job_description.lower()]

def match_skills(extracted_skills, job_skills):
    """Match extracted skills with job description skills and calculate match percentage."""
    matched_skills = set(extracted_skills) & set(job_skills)
    match_percentage = len(matched_skills) / len(job_skills) * 100 if job_skills else 0
    return matched_skills, match_percentage

def process_resumes(resume_folder_path, job_description):
    """Process resumes in the specified folder and match them with the job description."""
    job_skills = extract_skills_from_job_description(job_description)
    for file_path in glob.glob(os.path.join(resume_folder_path, '*')):
        extracted_details = extract_resume_details(file_path)
        if extracted_details:
            matched_skills, match_percentage = match_skills(extracted_details['Skills'], job_skills)
            improvement_suggestions, suggested_roles = generate_suggestions(extracted_details['Skills'])

            # Send email to the candidate with suggestions
            if extracted_details['Emails']:
                for email in extracted_details['Emails']:
                    send_email(email, extracted_details['Names'], match_percentage, extracted_details['Skills'], improvement_suggestions, suggested_roles)
            else:
                print(f"No valid email found in resume: {file_path}")

if __name__ == "__main__":
    resume_folder_path = r'C:\\path\\to\\your\\resume\\folder'  # Update with your resumes folder path
    job_description = "We are looking for a Data Scientist proficient in Python, SQL, and Machine Learning."  # Sample job description
    process_resumes(resume_folder_path, job_description)


In [8]:
import re
import pdfplumber
import pytesseract
from PIL import Image
import os
import spacy
import smtplib  # For sending emails
from email.mime.multipart import MIMEMultipart  # For email structure
from email.mime.text import MIMEText  # For adding text to emails
import glob

# Set the path for the Tesseract OCR executable
pytesseract.pytesseract.tesseract_cmd = r'C:\\Users\\praveenkumar.s\\AppData\\Local\\Programs\\Tesseract-OCR\\tesseract.exe'

# Load spaCy's English NLP model
nlp = spacy.load("en_core_web_sm")

def is_job_role(line):
    """Check if the given line contains a known job role."""
    job_roles = ["Software Engineer", "Data Scientist", "Project Manager"]
    return any(role.lower() in line.lower() for role in job_roles)

def is_common_skill(line):
    """Check if the given line contains a common skill."""
    common_skills = ["JavaScript", "Python", "Java", "C++", "HTML", "CSS"]
    return any(skill.lower() in line.lower() for skill in common_skills)

def extract_phone_numbers(text):
    """Extract phone numbers from the text using regex."""
    phone_pattern = r'\+?\d{1,3}[-.\s]??\(?\d{1,4}\)?[-.\s]??\d{1,4}[-.\s]??\d{1,9}'
    phone_numbers = re.findall(phone_pattern, text)
    phone_numbers = [num for num in phone_numbers if 10 <= len(re.sub(r'\D', '', num)) <= 15]
    return phone_numbers

def extract_name_from_text(text):
    """Extract potential names from text using regex patterns and filters."""
    if text is None:
        print("No text provided for name extraction.")
        return None
    name_patterns = [
        re.compile(r'^[A-Z][a-zA-Z\s\-\.]+$', re.MULTILINE),
        re.compile(r'\b[A-Z][a-zA-Z\s\.\-]+\s+[A-Z][a-zA-Z\s\.\-]+\b', re.MULTILINE),
        re.compile(r'\b[A-Z]+\s+[A-Z]+\.[A-Z]+\b', re.IGNORECASE)
    ]
    lines = text.split('\n')
    for line in lines:
        if re.search(r'\d', line) or 'Street' in line or 'Avenue' in line or 'Road' in line:
            continue
        for pattern in name_patterns:
            match = pattern.search(line)
            if match and not is_job_role(line) and not is_common_skill(line):
                return match.group(0).strip()
    return "Name not found"

def extract_emails(text):
    """Extract email addresses from text using enhanced regex."""
    email_pattern = r'([a-zA-Z0-9._%+-]+)(?:\s*|\s*)(@)(?:\s*|\s*)([a-zA-Z0-9.-]+)(?:\s*|\s*)(\.[a-zA-Z]{2,})'
    matches = re.findall(email_pattern, text)
    emails = [f"{username}{at}{domain}{tld}" for username, at, domain, tld in matches]
    emails = [email.replace(" ", "") for email in emails]
    if not emails:
        print("No emails found in the provided text.")
    return emails

def extract_skills(text):
    """Extract common skills from text using regex for specified skills."""
    skills_pattern = r'\b(?:JavaScript|Python|Java|C\+\+|C|HTML|CSS|React\.js|Node\.js|Git|SQL|Tableau|Machine Learning|Keras|TensorFlow|Photoshop|PowerPoint|Visual Studio|Premiere Pro|MySQL|Excel)\b'
    skills = re.findall(skills_pattern, text, flags=re.IGNORECASE)
    return list(set(skills))

def process_pdf(file_path):
    """Extract text from a PDF file."""
    text = ""
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text() + "\n"
    except Exception as e:
        print(f"Error processing PDF file {file_path}: {e}")
    return text

def process_image(file_path):
    """Extract text from an image file using OCR."""
    text = ""
    try:
        image = Image.open(file_path)
        text = pytesseract.image_to_string(image)
    except Exception as e:
        print(f"Error processing image file {file_path}: {e}")
    return text

def extract_resume_details(file_path):
    """Extract details from a resume file, either in PDF or image format."""
    text = ""
    if file_path.lower().endswith('.pdf'):
        text = process_pdf(file_path)
    elif file_path.lower().endswith(('.jpg', '.jpeg', '.png')):
        text = process_image(file_path)
    else:
        print(f"Unsupported file type: {file_path}")
        return None
    
    names = extract_name_from_text(text)
    emails = extract_emails(text)
    phone_numbers = extract_phone_numbers(text)
    skills = extract_skills(text)

    return {
        'File': file_path,
        'Names': names,
        'Emails': emails,
        'Phone Numbers': phone_numbers,
        'Skills': skills
    }

def generate_improvement_suggestions(skills):
    """Generate improvement suggestions based on missing skills."""
    essential_skills = {"Python", "SQL", "Machine Learning", "Data Analysis"}  # Add desired skills as needed
    missing_skills = essential_skills - set(skills)
    if missing_skills:
        return f"Consider improving the following skills: {', '.join(missing_skills)}"
    return "No major skill improvements needed."

# Skill improvement mapping for detailed suggestions
SKILL_IMPROVEMENT_MAPPING = {
    "Python": {
        "improvements": ["Practice advanced Python concepts", "Contribute to open-source projects"],
        "roles": ["Data Scientist", "Backend Developer"]
    },
    "Machine Learning": {
        "improvements": ["Study ML algorithms", "Work on Kaggle competitions"],
        "roles": ["Machine Learning Engineer", "Data Analyst"]
    },
    # Add more skills, improvements, and roles as necessary
}

def generate_suggestions(extracted_skills):
    improvement_suggestions = set()
    suggested_roles = set()

    for skill in extracted_skills:
        if skill in SKILL_IMPROVEMENT_MAPPING:
            improvement_suggestions.update(SKILL_IMPROVEMENT_MAPPING[skill]["improvements"])
            suggested_roles.update(SKILL_IMPROVEMENT_MAPPING[skill]["roles"])

    return improvement_suggestions, suggested_roles

def send_email(to_address, name, match_percentage, extracted_skills, improvement_suggestions, suggested_roles):
    """Send an email with improvement suggestions to the candidate."""
    from_address = "youremail@example.com"  # Update with your email
    password = "yourpassword"  # Update with your email password

    # Set up email content
    subject = "Resume Analysis and Improvement Suggestions"
    body = f"""
    Dear {name},

    Thank you for your application. Your skills matched {match_percentage:.2f}% with our job requirements.

    Extracted Skills: {', '.join(extracted_skills)}

    Unfortunately, we cannot move forward with your application at this time. However, we encourage you to consider the following suggestions to enhance your skills:

    Suggestions for Improvement:
    {', '.join(improvement_suggestions) if improvement_suggestions else "No suggestions available."}

    Suggested Roles for Consideration:
    {', '.join(suggested_roles) if suggested_roles else "No roles suggested."}

    Best regards,
    [Your Name]
    """

    # Print extracted details for verification
    print(f"File: {extract_resume_details['File']}")
    print(f"Names: {extract_name_from_text['Names']}")
    print(f"Emails: {extract_emails['Emails']}")
    print(f"Phone Numbers: {extract_phone_numbers['Phone Numbers']}")
    print(f"Skills: {extract_skills['Skills']}")
    print(f"Match Percentage: {match_percentage:.2f}%")
    print(f"Improvement Suggestions: {', '.join(improvement_suggestions) if improvement_suggestions else 'None'}")
    print(f"Suggested Roles: {', '.join(suggested_roles) if suggested_roles else 'None'}")
    print("=" * 50)

    # Setting up the email
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = from_address
    msg['To'] = to_address

    try:
        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
            server.login(from_address, password)
            server.sendmail(from_address, to_address, msg.as_string())
        print(f"Email sent to {to_address} successfully!")
    except Exception as e:
        print(f"Failed to send email to {to_address}: {e}")

def extract_skills_from_job_description(job_description):
    """Extract skills from the job description."""
    common_skills = ["Python", "SQL", "Machine Learning", "Data Analysis"]  # Add your job description skills
    return [skill for skill in common_skills if skill.lower() in job_description.lower()]

def match_skills(extracted_skills, job_skills):
    """Calculate match percentage between extracted skills and job description skills."""
    match_count = sum(1 for skill in extracted_skills if skill in job_skills)
    return (match_count / len(job_skills)) * 100 if job_skills else 0

def process_resumes_from_folder(folder_path):
    """Process all resumes in the specified folder."""
    for file_path in glob.glob(os.path.join(folder_path, '*')):
        extracted_details = extract_resume_details(file_path)
        if extracted_details:
            job_description = "Job description with required skills."  # Placeholder for actual job description
            job_skills = extract_skills_from_job_description(job_description)
            match_percentage = match_skills(extracted_details['Skills'], job_skills)

            improvement_suggestions, suggested_roles = generate_suggestions(extracted_details['Skills'])

            if extracted_details['Names'] != "Name not found":
                for name in extracted_details['Names']:
                    send_email("candidate@example.com", name, match_percentage, extracted_details['Skills'],
                               improvement_suggestions, suggested_roles)

# Example usage
if __name__ == "__main__":
    resume_folder_path = r'C:\\path\\to\\your\\resume\\folder'  # Update with your resumes folder path
    job_description = "We are looking for a Data Scientist proficient in Python, SQL, and Machine Learning."  # Sample job description
    process_resumes(resume_folder_path, job_description)



In [16]:
import re
import pdfplumber
import pytesseract
from PIL import Image
import os
import spacy
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import glob

# Set the path for the Tesseract OCR executable
pytesseract.pytesseract.tesseract_cmd = r'C:\\Users\\praveenkumar.s\\AppData\\Local\\Programs\\Tesseract-OCR\\tesseract.exe'

# Load spaCy's English NLP model
nlp = spacy.load("en_core_web_sm")

def is_job_role(line):
    job_roles = ["Software Engineer", "Data Scientist", "Project Manager"]
    return any(role.lower() in line.lower() for role in job_roles)

def is_common_skill(line):
    common_skills = ["JavaScript", "Python", "Java", "C++", "HTML", "CSS"]
    return any(skill.lower() in line.lower() for skill in common_skills)

def extract_phone_numbers(text):
    phone_pattern = r'\+?\d{1,3}[-.\s]??\(?\d{1,4}\)?[-.\s]??\d{1,4}[-.\s]??\d{1,9}'
    phone_numbers = re.findall(phone_pattern, text)
    phone_numbers = [num for num in phone_numbers if 10 <= len(re.sub(r'\D', '', num)) <= 15]
    return phone_numbers

def extract_name_from_text(text):
    if text is None:
        print("No text provided for name extraction.")
        return None
    name_patterns = [
        re.compile(r'^[A-Z][a-zA-Z\s\-\.]+$', re.MULTILINE),
        re.compile(r'\b[A-Z][a-zA-Z\s\.\-]+\s+[A-Z][a-zA-Z\s\.\-]+\b', re.MULTILINE),
        re.compile(r'\b[A-Z]+\s+[A-Z]+\.[A-Z]+\b', re.IGNORECASE)
    ]
    lines = text.split('\n')
    for line in lines:
        if re.search(r'\d', line) or 'Street' in line or 'Avenue' in line or 'Road' in line:
            continue
        for pattern in name_patterns:
            match = pattern.search(line)
            if match and not is_job_role(line) and not is_common_skill(line):
                return match.group(0).strip()
    return "Name not found"

def extract_emails(text):
    email_pattern = r'([a-zA-Z0-9._%+-]+)(?:\s*|\s*)(@)(?:\s*|\s*)([a-zA-Z0-9.-]+)(?:\s*|\s*)(\.[a-zA-Z]{2,})'
    matches = re.findall(email_pattern, text)
    emails = [f"{username}{at}{domain}{tld}" for username, at, domain, tld in matches]
    emails = [email.replace(" ", "") for email in emails]
    return emails

def extract_skills(text):
    skills_pattern = r'\b(?:JavaScript|Python|Java|C\+\+|C|HTML|CSS|React\.js|Node\.js|Git|SQL|Tableau|Machine Learning|Keras|TensorFlow|Photoshop|PowerPoint|Visual Studio|Premiere Pro|MySQL|Excel)\b'
    skills = re.findall(skills_pattern, text, flags=re.IGNORECASE)
    return list(set(skills))

def process_pdf(file_path):
    text = ""
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text() + "\n"
    except Exception as e:
        print(f"Error processing PDF file {file_path}: {e}")
    return text

def process_image(file_path):
    text = ""
    try:
        image = Image.open(file_path)
        text = pytesseract.image_to_string(image)
    except Exception as e:
        print(f"Error processing image file {file_path}: {e}")
    return text

def extract_resume_details(file_path):
    text = ""
    if file_path.lower().endswith('.pdf'):
        text = process_pdf(file_path)
    elif file_path.lower().endswith(('.jpg', '.jpeg', '.png')):
        text = process_image(file_path)
    else:
        print(f"Unsupported file type: {file_path}")
        return None
    
    names = extract_name_from_text(text)
    emails = extract_emails(text)
    phone_numbers = extract_phone_numbers(text)
    skills = extract_skills(text)

    return {
        'File': file_path,
        'Names': names,
        'Emails': emails,
        'Phone Numbers': phone_numbers,
        'Skills': skills
    }

def generate_improvement_suggestions(skills):
    essential_skills = {"Python", "SQL", "Machine Learning", "Data Analysis"}
    missing_skills = essential_skills - set(skills)
    if missing_skills:
        return f"Consider improving the following skills: {', '.join(missing_skills)}"
    return "No major skill improvements needed."

SKILL_IMPROVEMENT_MAPPING = {
    "Python": {
        "improvements": ["Practice advanced Python concepts", "Contribute to open-source projects"],
        "roles": ["Data Scientist", "Backend Developer"]
    },
    "Machine Learning": {
        "improvements": ["Study ML algorithms", "Work on Kaggle competitions"],
        "roles": ["Machine Learning Engineer", "Data Analyst"]
    },
}

def print_extracted_details(file_path, extracted_details, match_percentage, improvement_suggestions, suggested_roles):
    """Print extracted resume details in a structured format."""
    print(f"File: {file_path}")
    print(f"Names: {extracted_details['Names']}")
    print(f"Emails: {extracted_details['Emails']}")
    print(f"Phone Numbers: {extracted_details['Phone Numbers']}")
    print(f"Skills: {extracted_details['Skills']}")
    print(f"Match Percentage: {match_percentage:.2f}%")
    print(f"Improvement Suggestions: {', '.join(improvement_suggestions) if improvement_suggestions else 'None'}")
    print(f"Suggested Roles: {', '.join(suggested_roles) if suggested_roles else 'None'}")
    print("="*40)

def generate_suggestions(extracted_skills):
    improvement_suggestions = set()
    suggested_roles = set()

    for skill in extracted_skills:
        if skill in SKILL_IMPROVEMENT_MAPPING:
            improvement_suggestions.update(SKILL_IMPROVEMENT_MAPPING[skill]["improvements"])
            suggested_roles.update(SKILL_IMPROVEMENT_MAPPING[skill]["roles"])

    return improvement_suggestions, suggested_roles

def send_email(to_address, name, match_percentage, extracted_skills, improvement_suggestions, suggested_roles):
    from_address = "youremail@example.com"  # Update with your email
    password = "yourpassword"  # Update with your email password

    subject = "Resume Analysis and Improvement Suggestions"
    body = f"""
    Dear {name},

    Thank you for your application. Your skills matched {match_percentage:.2f}% with our job requirements.

    Extracted Skills: {', '.join(extracted_skills)}

    Unfortunately, we cannot move forward with your application at this time. However, we encourage you to consider the following suggestions to enhance your skills:

    Suggestions for Improvement:
    {', '.join(improvement_suggestions) if improvement_suggestions else "No suggestions available."}

    Suggested Roles for Consideration:
    {', '.join(suggested_roles) if suggested_roles else "No roles suggested."}

    Best regards,
    [Your Name]
    """


    # Setting up the email
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = from_address
    msg['To'] = to_address

    try:
        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
            server.login(from_address, password)
            server.sendmail(from_address, to_address, msg.as_string())
        print(f"Email sent to {to_address} successfully!")
    except Exception as e:
        print(f"Failed to send email to {to_address}: {e}")

def extract_skills_from_job_description(job_description):
    common_skills = ["Python", "SQL", "Machine Learning", "Data Analysis"]
    return [skill for skill in common_skills if skill.lower() in job_description.lower()]

def match_skills(extracted_skills, job_skills):
    match_count = sum(1 for skill in extracted_skills if skill in job_skills)
    return (match_count / len(job_skills)) * 100 if job_skills else 0

def process_resumes_from_folder(folder_path, job_description):
    for file_path in glob.glob(os.path.join(folder_path, '*')):
        extracted_details = extract_resume_details(file_path)
        if extracted_details:
            job_skills = extract_skills_from_job_description(job_description)
            match_percentage = match_skills(extracted_details['Skills'], job_skills)

            improvement_suggestions, suggested_roles = generate_suggestions(extracted_details['Skills'])

            if extracted_details['Names'] != "Name not found":
                for name in extracted_details['Names']:
                    send_email("candidate@example.com", name, match_percentage, extracted_details['Skills'],
                               improvement_suggestions, suggested_roles)

# Example usage
if __name__ == "__main__":
    resume_folder_path = r'D:\\praveen\\vdart\\Resume_full' # Update with your resumes folder path
    job_description = "D:\\praveen\\vdart\\machine learning_vv.txt"  # Sample job description
    process_resumes_from_folder(resume_folder_path, job_description)


Failed to send email to candidate@example.com: (535, b'5.7.8 Username and Password not accepted. For more information, go to\n5.7.8  https://support.google.com/mail/?p=BadCredentials d9443c01a7336-210bbf434adsm60541615ad.10 - gsmtp')
Failed to send email to candidate@example.com: (535, b'5.7.8 Username and Password not accepted. For more information, go to\n5.7.8  https://support.google.com/mail/?p=BadCredentials d2e1a72fcca58-7205794fd41sm7026123b3a.92 - gsmtp')
Failed to send email to candidate@example.com: (535, b'5.7.8 Username and Password not accepted. For more information, go to\n5.7.8  https://support.google.com/mail/?p=BadCredentials d9443c01a7336-210bc02f5dbsm60064985ad.224 - gsmtp')
Failed to send email to candidate@example.com: (535, b'5.7.8 Username and Password not accepted. For more information, go to\n5.7.8  https://support.google.com/mail/?p=BadCredentials d2e1a72fcca58-72057920335sm6792713b3a.28 - gsmtp')
Failed to send email to candidate@example.com: (535, b'5.7.8 Us

KeyboardInterrupt: 

In [13]:
import re
import pdfplumber
import pytesseract
from PIL import Image
import os
import spacy
import smtplib  # For sending emails
from email.mime.multipart import MIMEMultipart  # For email structure
from email.mime.text import MIMEText  # For adding text to emails
import glob

# Set the path for the Tesseract OCR executable
pytesseract.pytesseract.tesseract_cmd = r'C:\\Users\\praveenkumar.s\\AppData\\Local\\Programs\\Tesseract-OCR\\tesseract.exe'

# Load spaCy's English NLP model
nlp = spacy.load("en_core_web_sm")

def is_job_role(line):
    """Check if the given line contains a known job role."""
    job_roles = ["Software Engineer", "Data Scientist", "Project Manager"]
    return any(role.lower() in line.lower() for role in job_roles)

def is_common_skill(line):
    """Check if the given line contains a common skill."""
    common_skills = ["JavaScript", "Python", "Java", "C++", "HTML", "CSS"]
    return any(skill.lower() in line.lower() for skill in common_skills)

def extract_phone_numbers(text):
    """Extract phone numbers from the text using regex."""
    phone_pattern = r'\+?\d{1,3}[-.\s]??\(?\d{1,4}\)?[-.\s]??\d{1,4}[-.\s]??\d{1,9}'
    phone_numbers = re.findall(phone_pattern, text)
    phone_numbers = [num for num in phone_numbers if 10 <= len(re.sub(r'\D', '', num)) <= 15]
    print(f"Extracted phone numbers: {phone_numbers}")  # Debug print
    return phone_numbers

def extract_name_from_text(text):
    """Extract potential names from text using regex patterns and filters."""
    if text is None:
        print("No text provided for name extraction.")
        return None
    name_patterns = [
        re.compile(r'^[A-Z][a-zA-Z\s\-\.]+$', re.MULTILINE),
        re.compile(r'\b[A-Z][a-zA-Z\s\.\-]+\s+[A-Z][a-zA-Z\s\.\-]+\b', re.MULTILINE),
        re.compile(r'\b[A-Z]+\s+[A-Z]+\.[A-Z]+\b', re.IGNORECASE)
    ]
    lines = text.split('\n')
    for line in lines:
        if re.search(r'\d', line) or 'Street' in line or 'Avenue' in line or 'Road' in line:
            continue
        for pattern in name_patterns:
            match = pattern.search(line)
            if match and not is_job_role(line) and not is_common_skill(line):
                print(f"Extracted name: {match.group(0).strip()}")  # Debug print
                return match.group(0).strip()
    print("Name not found")
    return "Name not found"

def extract_emails(text):
    """Extract email addresses from text using enhanced regex."""
    email_pattern = r'([a-zA-Z0-9._%+-]+)(?:\s*|\s*)(@)(?:\s*|\s*)([a-zA-Z0-9.-]+)(?:\s*|\s*)(\.[a-zA-Z]{2,})'
    matches = re.findall(email_pattern, text)
    emails = [f"{username}{at}{domain}{tld}" for username, at, domain, tld in matches]
    emails = [email.replace(" ", "") for email in emails]
    print(f"Extracted emails: {emails}")  # Debug print
    return emails

def extract_skills(text):
    """Extract common skills from text using regex for specified skills."""
    skills_pattern = r'\b(?:JavaScript|Python|Java|C\+\+|C|HTML|CSS|React\.js|Node\.js|Git|SQL|Tableau|Machine Learning|Keras|TensorFlow|Photoshop|PowerPoint|Visual Studio|Premiere Pro|MySQL|Excel)\b'
    skills = re.findall(skills_pattern, text, flags=re.IGNORECASE)
    print(f"Extracted skills: {skills}")  # Debug print
    return list(set(skills))

def process_pdf(file_path):
    """Extract text from a PDF file."""
    text = ""
    try:
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                print(f"Extracted text from page: {page_text}")  # Debug print
                text += page_text + "\n"
    except Exception as e:
        print(f"Error processing PDF file {file_path}: {e}")
    return text

def process_image(file_path):
    """Extract text from an image file using OCR."""
    text = ""
    try:
        image = Image.open(file_path)
        text = pytesseract.image_to_string(image)
        print(f"Extracted text from image: {text}")  # Debug print
    except Exception as e:
        print(f"Error processing image file {file_path}: {e}")
    return text

def extract_resume_details(file_path):
    """Extract details from a resume file, either in PDF or image format."""
    text = ""
    if file_path.lower().endswith('.pdf'):
        text = process_pdf(file_path)
    elif file_path.lower().endswith(('.jpg', '.jpeg', '.png')):
        text = process_image(file_path)
    else:
        print(f"Unsupported file type: {file_path}")
        return None
    
    names = extract_name_from_text(text)
    emails = extract_emails(text)
    phone_numbers = extract_phone_numbers(text)
    skills = extract_skills(text)

    return {
        'File': file_path,
        'Names': names,
        'Emails': emails,
        'Phone Numbers': phone_numbers,
        'Skills': skills
    }

def generate_improvement_suggestions(skills):
    """Generate improvement suggestions based on missing skills."""
    essential_skills = {"Python", "SQL", "Machine Learning", "Data Analysis"}  # Add desired skills as needed
    missing_skills = essential_skills - set(skills)
    if missing_skills:
        return f"Consider improving the following skills: {', '.join(missing_skills)}"
    return "No major skill improvements needed."

# Skill improvement mapping for detailed suggestions
SKILL_IMPROVEMENT_MAPPING = {
    "Python": {
        "improvements": ["Practice advanced Python concepts", "Contribute to open-source projects"],
        "roles": ["Data Scientist", "Backend Developer"]
    },
    "Machine Learning": {
        "improvements": ["Study ML algorithms", "Work on Kaggle competitions"],
        "roles": ["Machine Learning Engineer", "Data Analyst"]
    },
    # Add more skills, improvements, and roles as necessary
}

def generate_suggestions(extracted_skills):
    improvement_suggestions = set()
    suggested_roles = set()

    for skill in extracted_skills:
        if skill in SKILL_IMPROVEMENT_MAPPING:
            improvement_suggestions.update(SKILL_IMPROVEMENT_MAPPING[skill]["improvements"])
            suggested_roles.update(SKILL_IMPROVEMENT_MAPPING[skill]["roles"])

    return improvement_suggestions, suggested_roles

def send_email(to_address, name, match_percentage, extracted_skills, improvement_suggestions, suggested_roles, file_path):
    """Send an email with improvement suggestions to the candidate."""
    from_address = "youremail@example.com"  # Update with your email
    password = "yourpassword"  # Update with your email password

    # Set up email content
    subject = "Resume Analysis and Improvement Suggestions"
    body = f"""
    Dear {name},

    Thank you for your application. Your skills matched {match_percentage:.2f}% with our job requirements.

    Extracted Skills: {', '.join(extracted_skills)}

    Unfortunately, we cannot move forward with your application at this time. However, we encourage you to consider the following suggestions to enhance your skills:

    Suggestions for Improvement:
    {', '.join(improvement_suggestions) if improvement_suggestions else "No suggestions available."}

    Suggested Roles for Consideration:
    {', '.join(suggested_roles) if suggested_roles else "No roles suggested."}

    Best regards,
    [Your Name]
    """

    # Print extracted details for verification
    print_extracted_details(file_path, extracted_details, match_percentage, improvement_suggestions, suggested_roles)

    # Setting up the email
    msg = MIMEText(body)
    msg['Subject'] = subject
    msg['From'] = from_address
    msg['To'] = to_address

    try:
        with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
            server.login(from_address, password)
            server.sendmail(from_address, to_address, msg.as_string())
        print(f"Email sent to {to_address} successfully!")
    except Exception as e:
        print(f"Failed to send email to {to_address}: {e}")

def print_extracted_details(file_path, extracted_details, match_percentage, improvement_suggestions, suggested_roles):
    """Print extracted resume details in a structured format."""
    print(f"File: {file_path}")
    print(f"Names: {extracted_details['Names']}")
    print(f"Emails: {extracted_details['Emails']}")
    print(f"Phone Numbers: {extracted_details['Phone Numbers']}")
    print(f"Skills: {extracted_details['Skills']}")
    print(f"Match Percentage: {match_percentage:.2f}%")
    print(f"Improvement Suggestions: {', '.join(improvement_suggestions) if improvement_suggestions else 'None'}")
    print(f"Suggested Roles: {', '.join(suggested_roles) if suggested_roles else 'None'}")
    print("="*40)

def main(folder_path):
    """Main function to process resumes in the specified folder."""
    resume_files = glob.glob(os.path.join(folder_path, '*.[pj]*[np]*'))  # Match PDF and image files
    for resume_file in resume_files:
        extracted_details = extract_resume_details(resume_file)

        if extracted_details:
            # Simulate a match percentage (for example purposes)
            match_percentage = 85.0  # Replace with actual logic

            # Generate suggestions
            improvement_suggestions, suggested_roles = generate_suggestions(extracted_details['Skills'])

            # Send email with suggestions
            if extracted_details['Emails']:
                for email in extracted_details['Emails']:
                    send_email(email, extracted_details['Names'], match_percentage, extracted_details['Skills'], improvement_suggestions, suggested_roles, resume_file)

if __name__ == "__main__":
    folder_path = 'path_to_your_resumes_folder'  # Update with your folder path
    main(folder_path)


In [3]:
import os
import re
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import spacy

# Load the NLP model for name entity recognition
nlp = spacy.load("en_core_web_sm")

def extract_text_from_image(image_path):
    # Extract text from an image using Tesseract OCR
    return pytesseract.image_to_string(Image.open(image_path))

def extract_text_with_pymupdf(file_path):
    text = ""
    try:
        pdf_document = fitz.open(file_path)
        for page in pdf_document:
            text += page.get_text()
        pdf_document.close()
    except Exception as e:
        print("Error reading PDF:", e)
    return text.strip()

def extract_names(text):
    # Use SpaCy to process the text and extract names
    doc = nlp(text)
    names = []

    for ent in doc.ents:
        if ent.label_ == "PERSON":
            names.append(ent.text)

    return names

def extract_name_from_file(file_path):
    text = ""

    if file_path.lower().endswith('.pdf'):
        text = extract_text_with_pymupdf(file_path)
    elif file_path.lower().endswith(('.png', '.jpg', '.jpeg')):
        text = extract_text_from_image(file_path)
    else:
        print(f"Unsupported file format: {file_path}")
        return []

    # Extract names from the text
    names = extract_names(text)
    
    return names

# Main processing function
def process_resumes(resume_folder_path):
    for resume_path in os.listdir(resume_folder_path):
        full_path = os.path.join(resume_folder_path, resume_path)

        if os.path.isfile(full_path):
            print(f"\nProcessing resume: {full_path}")
            names = extract_name_from_file(full_path)

            if names:
                print(f"Extracted Names: {', '.join(names)}")
            else:
                print("No names found.")

# Example usage
resume_folder = r'D:\\praveen\\vdart\\Resume_full'  # Update this with your resumes folder path
process_resumes(resume_folder)



Processing resume: D:\\praveen\\vdart\\Resume_full\A.pdf
Extracted Names: Emily Harris, CA 
, Relevant Coursework, Weather App, React.js, JavaScript Algorithms

Processing resume: D:\\praveen\\vdart\\Resume_full\Alex Rivera.pdf
Extracted Names: Alex Rivera, Detail, Visual Studio, Algorithms, Multithreading, C++

Processing resume: D:\\praveen\\vdart\\Resume_full\Alex Rivera_extracted.txt
Unsupported file format: D:\\praveen\\vdart\\Resume_full\Alex Rivera_extracted.txt
No names found.

Processing resume: D:\\praveen\\vdart\\Resume_full\Alex Rivera_extracted_extracted.txt
Unsupported file format: D:\\praveen\\vdart\\Resume_full\Alex Rivera_extracted_extracted.txt
No names found.

Processing resume: D:\\praveen\\vdart\\Resume_full\Alphine_Patrick_resume 1.pdf
Extracted Names: Getsketched Software, Trichy Full, Design Team|NIT Trichy, Learn Student, C++, Tina, Microsoft Word, Photoshop, Learn Student, B. Tech - Instrumentation

Processing resume: D:\\praveen\\vdart\\Resume_full\Alphine_P

TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.