# AI Resume Screening & Candidate Ranking System

This notebook implements an AI-based resume screener that scores PDF resumes against a job description and ranks the candidates by relevance.

**Features:**
- Extract text from PDF resumes
- Extract technical skills
- Rank candidates using TF-IDF and Cosine Similarity

In [None]:
# Install necessary libraries
!pip install pdfplumber pandas scikit-learn fpdf

In [None]:
import os
import re
import pdfplumber
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fpdf import FPDF

# ==========================================
# 1. Resume Screener Class
# ==========================================
class ResumeScreener:
    """Score PDF resumes against a job description and rank them.

    Each resume is converted to text, cleaned, and compared with the job
    description using TF-IDF vectors and cosine similarity. Skills from a
    caller-supplied list are detected by keyword matching.
    """

    # Column layout shared by populated and empty result frames.
    _RESULT_COLUMNS = ['Candidate Name', 'Match Score', 'Extracted Skills']

    # Skills made only of letters and separator characters; these can be
    # compared in cleaned form without losing meaning.
    _WORDS_AND_SEPARATORS = re.compile(r'[a-z\s\-_/]+')

    def __init__(self, job_description, skill_list):
        """
        Initialize the screener with a job description and a list of skills to look for.

        Skills are stored lowercased so matching is case-insensitive.
        """
        self.job_description = job_description
        self.skill_list = [skill.lower() for skill in skill_list]
        self.vectorizer = TfidfVectorizer(stop_words='english')

    def extract_text_from_pdf(self, pdf_path):
        """
        Extracts text from a single PDF file using pdfplumber.

        Pages without extractable text are skipped. Reading is best-effort:
        any error is printed and whatever text was collected so far is
        returned (possibly an empty string).
        """
        page_texts = []
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    extracted = page.extract_text()
                    if extracted:
                        page_texts.append(extracted + "\n")
        except Exception as e:
            # Deliberately broad: one unreadable file must not abort a batch.
            print(f"Error reading {pdf_path}: {e}")
        return "".join(page_texts)

    def clean_text(self, text):
        """
        Simple preprocessing: lowercase, remove special characters.

        Everything except the letters a-z and whitespace (digits,
        punctuation, non-ASCII characters) is replaced by a space, then runs
        of whitespace are collapsed. Empty or None input yields "".
        """
        if not text:
            return ""
        text = text.lower()
        # Remove special characters and numbers, keep only letters and spaces
        text = re.sub(r'[^a-z\s]', ' ', text)
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def _skill_in_text(self, skill, cleaned_text, lowered_text):
        """Return True if `skill` occurs as a whole term in the resume text.

        `cleaned_text` is the output of clean_text(); `lowered_text` is the
        lowercased raw text with whitespace collapsed.
        """
        skill = re.sub(r'\s+', ' ', skill.lower()).strip()
        if not skill:
            return False

        if self._WORDS_AND_SEPARATORS.fullmatch(skill):
            # The text was cleaned (hyphens etc. became spaces), so the skill
            # must be cleaned the same way or "scikit-learn" can never match.
            cleaned_skill = self.clean_text(skill)
            if not cleaned_skill:
                return False
            pattern = r'\b' + re.escape(cleaned_skill) + r'\b'
            return re.search(pattern, cleaned_text) is not None

        # Skills carrying digits or symbols ("c++", "c#", "python3") would be
        # destroyed by cleaning, so match them literally. \b does not work
        # next to symbols, hence the explicit non-word lookarounds.
        pattern = r'(?<!\w)' + re.escape(skill) + r'(?!\w)'
        return re.search(pattern, lowered_text) is not None

    def extract_skills(self, text):
        """
        Extracts skills from the text based on the provided skill list.

        Returns the matching entries of self.skill_list, without duplicates,
        in the order they appear in self.skill_list.
        """
        cleaned_text = self.clean_text(text)
        lowered_text = re.sub(r'\s+', ' ', (text or '').lower())

        found_skills = [
            skill for skill in self.skill_list
            if self._skill_in_text(skill, cleaned_text, lowered_text)
        ]
        # dict.fromkeys de-duplicates while keeping a stable order; a set
        # would reorder the skills differently from run to run.
        return list(dict.fromkeys(found_skills))

    def rank_candidates(self, pdf_files):
        """
        Process a list of PDF files, extract text, calculate similarity, and rank them.

        Files that yield no text are skipped with a warning. Returns a
        DataFrame with the columns 'Candidate Name', 'Match Score' (a
        percentage string such as '42.5%') and 'Extracted Skills' (a
        comma-separated string), sorted by score in descending order. When no
        resume could be read, the frame is empty but has the same columns.
        """
        resume_texts = []
        resume_names = []
        extracted_skills_list = []

        print("Processing Resumes...")
        for pdf_file in pdf_files:
            file_name = os.path.basename(pdf_file)
            print(f"  - Reading {file_name}...")
            text = self.extract_text_from_pdf(pdf_file)

            if not text.strip():
                print(f"    Warning: No text found in {pdf_file}")
                continue

            resume_texts.append(self.clean_text(text))
            resume_names.append(file_name)
            extracted_skills_list.append(", ".join(self.extract_skills(text)))

        if not resume_texts:
            # Keep the schema so callers can still select columns safely.
            return pd.DataFrame(columns=self._RESULT_COLUMNS)

        # TF-IDF Vectorization
        # Combine Job Description with Resume Texts so they share one vocabulary
        cleaned_jd = self.clean_text(self.job_description)
        all_texts = [cleaned_jd] + resume_texts

        matrix = self.vectorizer.fit_transform(all_texts)

        # Calculate Cosine Similarity
        # The first vector is the Job Description
        jd_vector = matrix[0]
        resume_vectors = matrix[1:]

        # cosine_similarity gives one row per resume; flatten to a 1-D array
        # of length n_resumes
        similarity_scores = cosine_similarity(resume_vectors, jd_vector).flatten()

        # Create Results DataFrame
        results = pd.DataFrame({
            'Candidate Name': resume_names,
            'Match Score': similarity_scores,
            'Extracted Skills': extracted_skills_list
        })

        # Sort numerically before the score is turned into a string
        results = results.sort_values(by='Match Score', ascending=False).reset_index(drop=True)

        # Format score as percentage
        results['Match Score'] = (results['Match Score'] * 100).round(2).astype(str) + '%'

        return results

In [None]:
# ==========================================
# 2. Dummy Data Generator
# ==========================================
def create_dummy_pdf(filename, content):
    """Write `content` as a single-page PDF to `filename`.

    The text is rendered with the Arial font at 12pt and wrapped across the
    full page width. A confirmation line is printed once the file is written.
    """
    # The built-in (core) FPDF fonts such as Arial cover only Latin-1.
    # Replace anything outside that range with '?' instead of letting the
    # PDF generation fail on, e.g., smart quotes or emoji.
    printable_content = content.encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    # Width 0 lets the cell extend to the right margin; 10 is the line height.
    pdf.multi_cell(0, 10, printable_content)
    pdf.output(filename)
    print(f"Created dummy PDF: {filename}")

def generate_test_data(directory):
    """Create three sample resume PDFs in `directory` and return their paths.

    The directory is created if it does not exist. The samples are meant to
    be a strong, a moderate and a weak match for an AI/ML job description.
    Returns the list of file paths in the order alice, bob, charlie.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)

    # Single source of truth for file names and contents, so the returned
    # paths can never drift away from the files actually written.
    sample_resumes = [
        # Candidate 1: Strong Match (Python, ML)
        ("candidate_alice.pdf",
         """Alice
                     Software Engineer with 5 years of experience in Python and Machine Learning.
                     Skilled in: Python, Scikit-learn, Pandas, TensorFlow.
                     Passionate about building AI models and Data Analysis.
                     Education: MS in Computer Science."""),
        # Candidate 2: Moderate Match (Java, some Data)
        ("candidate_bob.pdf",
         """Bob
                     Backend Developer focused on Java and Spring Boot.
                     Experience with SQL databases and API development.
                     Some exposure to basic Data Analysis and Python scripting.
                     Looking to transition into AI."""),
        # Candidate 3: Weak Match (Sales)
        ("candidate_charlie.pdf",
         """Charlie
                     Sales Manager with a track record of exceeding targets.
                     Expert in CRM software, Client Relations, and Negotiation.
                     Strong communication skills and team leadership.
                     """),
    ]

    pdf_paths = []
    for file_name, content in sample_resumes:
        pdf_path = os.path.join(directory, file_name)
        create_dummy_pdf(pdf_path, content)
        pdf_paths.append(pdf_path)

    return pdf_paths


In [None]:
# ==========================================
# 3. Run the Screener
# ==========================================

# Job description every resume is scored against
job_description = (
    "\n"
    "We are looking for an AI/ML Engineer to join our team.\n"
    "The ideal candidate should have strong experience in Python, Machine Learning, and NLP.\n"
    "Proficiency in libraries like Scikit-learn, Pandas, and TensorFlow is required.\n"
    "knowledge of Data Analysis and model deployment is a plus.\n"
)

# Keywords the screener searches for in each resume
skill_list = [
    'Python',
    'Java',
    'Machine Learning',
    'Data Analysis',
    'NLP',
    'Scikit-learn',
    'Pandas',
    'TensorFlow',
    'SQL',
    'Communication',
]

print("--- AI Resume Screening System ---")

# Step 1: write the sample resumes to disk
test_dir = "test_resumes"
pdf_files = generate_test_data(test_dir)

# Step 2: build the screener and rank the generated resumes
screener = ResumeScreener(job_description, skill_list)
results = screener.rank_candidates(pdf_files)

# Step 3: show the ranking (the bare expression renders the table in a notebook)
print("\n--- Ranking Results ---")
results