In [None]:
from pdf2image import convert_from_path
import os

# Smoke test: rasterise the first page of one PDF to confirm that pdf2image can reach Poppler.
# Test PDF path - replace with a real PDF on your system
pdf_path = "Datasets/sample cv/scanned cv.pdf"

# Explicit Poppler location - adjust this to your installation path
poppler_path = r'C:\Program Files\poppler\poppler-24.08.0\Library\bin'
print(f"Testing PDF conversion with Poppler at: {poppler_path}")

if not os.path.isfile(pdf_path):
    # Report a missing sample file on its own so it is not mistaken for a Poppler problem
    print(f"Error: test PDF not found: {pdf_path}")
else:
    try:
        # Convert first page only for testing
        images = convert_from_path(pdf_path, poppler_path=poppler_path, first_page=1, last_page=1)
        print(f"Success! Converted {len(images)} page(s) from PDF to image.")
    except Exception as e:
        print(f"Error: {e}")

In [None]:
# %% [markdown]
# ## Advanced Resume Ranking System with XAI Feedback
# Imports for the whole notebook - keep them together in this cell so Restart & Run All works
import os
import subprocess
import sys
import pytesseract
from pdf2image import convert_from_path
import cv2
import numpy as np
from PIL import Image
import os
import re
import logging
import random
import nltk
import pandas as pd
import pdfplumber
import docx
import spacy
from concurrent.futures import ThreadPoolExecutor
from fuzzywuzzy import fuzz, process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datetime import datetime
from typing import List, Dict, Union, Optional

# Logging: every record is written to resume_ranker.log and echoed to the console
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[
        logging.FileHandler('resume_ranker.log'),
        logging.StreamHandler(),
    ],
)

# NLTK data: force a fresh download of each resource used by the ranker
for _nltk_resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource, force=True)
del _nltk_resource  # do not leave the loop variable behind in the notebook namespace

# spaCy pipeline shared by the ranker; a missing model is logged and the error re-raised
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    logging.error("spaCy English model not found. Run 'python -m spacy download en_core_web_sm'")
    raise

# Simple fallback lemmatizer class
class LegacyLemmatizer:
    """Rule-based fallback used when NLTK's WordNet lemmatizer cannot be used.

    It only strips the suffixes ``ing``, ``ed`` and a plural ``s``. A suffix is
    removed only when at least ``MIN_STEM_LENGTH`` characters remain, so short
    tokens such as ``"aws"``, ``"red"`` or ``"s"`` are returned unchanged
    instead of being cut down to ``"aw"``, ``"r"`` or an empty string.
    """

    # Fewest characters that must remain after a suffix is stripped
    MIN_STEM_LENGTH = 3

    def lemmatize(self, word, pos='n'):
        """Return a crude stem of ``word``.

        Args:
            word: Token to reduce.
            pos: Accepted so the call signature matches
                ``WordNetLemmatizer.lemmatize(word, pos)``; the rules here do
                not use it.

        Returns:
            ``word`` with one trailing ``ing``/``ed``/``s`` removed when the
            remaining stem is long enough, otherwise ``word`` unchanged.
        """
        for suffix in ('ing', 'ed'):
            if word.endswith(suffix) and len(word) - len(suffix) >= self.MIN_STEM_LENGTH:
                return word[:-len(suffix)]
        # Plural "s", but never the double-s ending of words like "class"
        if (word.endswith('s') and not word.endswith('ss')
                and len(word) - 1 >= self.MIN_STEM_LENGTH):
            return word[:-1]
        return word

# %% [markdown]
# ## Enhanced ResumeRanker Class with XAI Feedback Systems

class ResumeRanker:
    """Advanced resume ranking system with bias mitigation and explainable feedback"""
    
    def __init__(self, job_description: Optional[str] = None):
        """Create a ranker, optionally bound to a job description.

        Args:
            job_description: Text that resumes are compared against when the
                job-description similarity is computed. May be None.

        Initialisation is delegated to two helpers:
        ``_init_nltk_resources`` (lemmatizer, feedback templates, skill /
        education / certification vocabularies) and
        ``_init_ocr_capabilities`` (OCR availability flags, ``ocr_config`` and
        the scoring ``config``).
        """
        self.job_description = job_description
        self.all_resumes = []

        # Initialize NLTK with error handling (also downloads the stopwords corpus)
        self._init_nltk_resources()

        # nltk.data.path is a list of search *directories*, so the previous
        # check `'stopwords' in nltk.data.path` was effectively always False
        # and the English stop words were never loaded. Ask for the corpus
        # directly; NLTK raises LookupError when it is not installed.
        try:
            english_stop_words = stopwords.words('english')
        except LookupError as e:
            logging.warning(f"NLTK stopwords corpus unavailable, using custom stop words only: {str(e)}")
            english_stop_words = []

        self.stop_words = set(english_stop_words).union({
            'resume', 'cv', 'references', 'available upon request', 'page'
        })

        # Initialize NLP (module-level spaCy pipeline)
        try:
            self.nlp = nlp
        except Exception as e:
            logging.error(f"Error initializing spaCy: {str(e)}")
            self.nlp = None

        # Initialize OCR capabilities and the scoring configuration
        self._init_ocr_capabilities()
        
        
    
    # OCR probing and ranking configuration (invoked from __init__)
    def _init_ocr_capabilities(self):
        """Initialize and check OCR capabilities"""
        # Check Tesseract
        try:
            import pytesseract
            pytesseract.get_tesseract_version()
            self.tesseract_available = True
            logging.info("Tesseract OCR initialized successfully.")
        except Exception as e:
            self.tesseract_available = False
            logging.warning(f"Tesseract OCR not available: {str(e)}")
        
        # Check EasyOCR availability (but don't initialize yet, as it's slow)
        try:
            import easyocr
            self.easyocr_available = True
            logging.info("EasyOCR is available.")
        except ImportError:
            self.easyocr_available = False
            logging.warning("EasyOCR not installed. Run 'pip install easyocr' to enable it.")
        
        # Initialize OCR configuration
        self.ocr_config = {
            'use_tesseract': self.tesseract_available,
            'use_easyocr': self.easyocr_available,
            'use_cloud_ocr': False,  # Set to True when you have an API key
            'cloud_ocr_api_key': 'YOUR_API_KEY',  # Replace with your actual key
            'timeout': 120,
            'poppler_path': r'C:\Program Files\poppler\poppler-24.08.0\Library\bin'
        }
        
        # Check poppler
        try:
            from pdf2image import convert_from_path
            # Test with a sample file if available
            sample_files = [f for f in os.listdir() if f.lower().endswith('.pdf')]
            if sample_files:
                convert_from_path(
                    sample_files[0], 
                    poppler_path=self.ocr_config['poppler_path'],
                    first_page=1, 
                    last_page=1
                )
                logging.info("PDF to image conversion working correctly.")
        except Exception as e:
            logging.warning(f"PDF to image conversion may not work: {str(e)}")

        # Configuration
        self.config = {
            'scoring_weights': {
                'education': 0.15,
                'experience': 0.20,
                'skills': 0.15,
                'certifications': 0.10,
                'projects': 0.10,
                'jd_similarity': 0.30
            },
            'skill_threshold': 85,
            'max_workers': 4,
            'experience_patterns': [
                r'(\d+)\+?\s*(?:years?|yrs?)\b.+?experience',
                r'experience.*?(\d+)\+?\s*(?:years?|yrs?)\b',
                r'\b(\d+\+?\s*(?:years?|yrs?))\b.*?(experience|exp\.?)'
            ],
            'hr_feedback_top_n': 50,
            'feedback_min_rank': 5,
            'feedback_max_rank': 20,
            'benchmark_sample': 0.2,
            'ocr': {
                'language': 'eng',
                'page_segmentation_mode': 1,  # Automatic page segmentation with OSD
                'ocr_engine_mode': 3,         # Default, based on what is available
                'timeout': 180,               # Maximum time in seconds
                'preprocess': True            # Whether to apply image preprocessing
            }
        }

    # NLTK setup with a fallback lemmatizer, plus the skill / education / certification vocabularies
    def _init_nltk_resources(self):
        """Prepare the lemmatizer and the ranker's static vocabularies.

        Downloads the NLTK resources quietly and tries a WordNet lemmatizer;
        any failure is logged and ``LegacyLemmatizer`` is used instead. It then
        sets the feedback templates, ``skill_matrix``, ``education_terms`` and
        ``certifications``.
        """
        try:
            # Fetch the NLTK resources the ranker relies on
            for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
                nltk.download(resource, quiet=True)

            # Importing wordnet up front surfaces corpus-reader problems here
            from nltk.corpus import wordnet
            self.lemmatizer = WordNetLemmatizer()

            # One real lemmatization call proves the corpus can actually be read
            self.lemmatizer.lemmatize("testing")
            logging.info("NLTK resources initialized successfully")
        except Exception as e:
            logging.error(f"NLTK initialization error: {str(e)}")
            # WordNet is unusable: switch to the rule-based lemmatizer
            self.lemmatizer = LegacyLemmatizer()
            logging.warning("Using fallback lemmatizer due to NLTK errors")

        # (Tesseract validation is performed in _init_ocr_capabilities.)

        # Natural-language templates used when generating feedback
        self._init_feedback_templates()

        # Skills grouped by category
        self.skill_matrix = {
            'programming': [
                'python', 'java', 'c++', 'javascript', 'sql', 'r',
                'html5', 'css3', 'react', 'node.js', 'angular', 'vue.js',
            ],
            'data_science': [
                'machine learning', 'deep learning', 'data analysis',
                'pandas', 'numpy', 'tensorflow', 'pytorch', 'nlp',
            ],
            'cloud': ['aws', 'azure', 'gcp', 'docker', 'kubernetes', 'terraform'],
            'databases': ['mysql', 'postgresql', 'mongodb', 'oracle', 'redis'],
            'devops': ['ci/cd', 'jenkins', 'ansible', 'git', 'linux', 'bash'],
            'design': ['ui/ux', 'figma', 'adobe xd', 'photoshop', 'sketch'],
        }

        def _level(score, keywords, degrees):
            # One education level: its score, the keywords that signal it, and degree abbreviations
            return {'score': score, 'keywords': keywords, 'degrees': degrees}

        # Education levels and the terms that identify them
        self.education_terms = {
            'bachelor': _level(
                3,
                ['bachelor', 'bs', 'bsc', 'ba', 'b.tech', 'undergraduate'],
                ['bsc', 'ba', 'bcom', 'beng'],
            ),
            'master': _level(
                4,
                ['master', 'ms', 'm.sc', 'mba', 'postgraduate'],
                ['msc', 'ma', 'mba', 'meng'],
            ),
            'phd': _level(5, ['phd', 'doctorate', 'doctoral'], ['phd']),
            'diploma': _level(2, ['diploma', 'associate', 'certificate'], ['diploma']),
        }

        # Phrases that indicate a certification, keyed by certification family
        self.certifications = {
            'aws': ['aws certified', 'amazon web services'],
            'google': ['google cloud certified'],
            'microsoft': ['microsoft certified'],
            'pmp': ['project management professional'],
            'scrum': ['scrum master', 'agile certified'],
        }
    def _is_scanned_pdf(self, file_path: str) -> bool:
        """Enhanced scanned PDF detection using layout analysis"""
        try:
            with pdfplumber.open(file_path) as pdf:
                text_content = ''
                image_count = 0
                
                # Sample first 3 pages or all pages if less than 3
                sample_pages = min(3, len(pdf.pages))
                for i in range(sample_pages):
                    page = pdf.pages[i]
                    text = page.extract_text(x_tolerance=1, y_tolerance=1)
                    text_content += text or ''
                    
                    # Check for image content
                    if len(page.images) > 0:
                        image_count += 1

                # Decision logic
                if len(text_content) < 500:  # Higher threshold
                    if image_count > 0:
                        return True
                    return len(text_content) < 100  # Fallback threshold
                return False
                
        except Exception as e:
            logging.error(f"PDF analysis error: {str(e)}")
            return True
    
   
    
    # Check that Tesseract and Poppler are installed (reports problems; does not fix them)
    @staticmethod
    def check_dependencies():
        missing = []
        
        # Check Tesseract
        try:
            subprocess.run(['tesseract', '--version'], 
                        stdout=subprocess.PIPE, 
                        stderr=subprocess.PIPE,
                        check=True)
        except (FileNotFoundError, subprocess.CalledProcessError):
            missing.append("Tesseract OCR")
            print("""
            Tesseract installation required:
            - Windows: Download from UB Mannheim (https://github.com/UB-Mannheim/tesseract/wiki)
            - Mac: brew install tesseract
            - Linux: sudo apt install tesseract-ocr
            """)

        # Check Poppler
        try:
            from pdf2image import pdfinfo_from_path
            test_file = next((f for f in os.listdir() if f.endswith('.pdf')), None)
            if test_file:
                pdfinfo_from_path(test_file)
        except Exception:
            missing.append("Poppler")
            print("""
            Poppler installation required for PDF processing:
            - Windows: Add poppler path to environment variables
            - Mac: brew install poppler
            - Linux: sudo apt-get install poppler-utils
            """)

        if missing:
            print(f"Critical missing dependencies: {', '.join(missing)}")
            return False
        return True
    
    # Modified _extract_text_with_ocr method for better error handling
    def _extract_text_with_ocr(self, file_path: str) -> str:
        """Extract text from scanned documents using multiple OCR methods with fallback chain"""
        logging.info(f"Processing scanned document with OCR: {file_path}")
        
        # Method 1: Try Tesseract OCR first (with timeout handling)
        try:
            # Convert PDF to images with poppler
            poppler_path = r'C:\Program Files\poppler\poppler-24.08.0\Library\bin'
            images = convert_from_path(
                file_path,
                poppler_path=poppler_path,
                thread_count=2,
                dpi=200
            )
            
            # Process each page with Tesseract
            extracted_text = []
            for i, image in enumerate(images):
                # Preprocess image for better OCR results
                img_np = np.array(image)
                gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
                _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
                pil_img = Image.fromarray(binary)
                
                # Extract text with Tesseract (with timeout)
                try:
                    text = pytesseract.image_to_string(
                        pil_img, 
                        lang='eng',
                        config='--psm 1 --oem 3'
                    )
                    extracted_text.append(text)
                    logging.info(f"Tesseract OCR completed for page {i+1}")
                except Exception as e:
                    logging.error(f"Tesseract OCR failed for page {i+1}: {str(e)}")
                    raise  # Re-raise to trigger fallback
            
            result = "\n".join(extracted_text)
            if len(result.strip()) > 50:  # Check if we got meaningful text
                return result
            else:
                logging.warning("Tesseract OCR returned minimal text, trying EasyOCR")
                raise Exception("Insufficient text extracted with Tesseract")
                
        except Exception as e:
            logging.error(f"OCR failed: {str(e)}")
        
        # Method 2: Try EasyOCR if Tesseract fails
        logging.info("Falling back to EasyOCR")
        try:
            easyocr_text = self._extract_text_with_easyocr(file_path)
            if easyocr_text and len(easyocr_text.strip()) > 50:
                logging.info("Successfully extracted text with EasyOCR")
                return easyocr_text
        except Exception as e:
            logging.error(f"EasyOCR fallback failed: {str(e)}")
        
        # Method 3: Try Cloud OCR if EasyOCR fails
        logging.info("Falling back to Cloud OCR")
        try:
            cloud_ocr_text = self._extract_text_with_cloud_ocr(file_path)
            if cloud_ocr_text and len(cloud_ocr_text.strip()) > 50:
                logging.info("Successfully extracted text with Cloud OCR")
                return cloud_ocr_text
        except Exception as e:
            logging.error(f"Cloud OCR fallback failed: {str(e)}")
        
        # If all OCR methods fail, use enhanced fallback
        logging.warning("All OCR methods failed, using fallback text extraction")
        return self._fallback_text_extraction(file_path)
    
    # Add a fallback method for when OCR fails
    def _fallback_text_extraction(self, file_path: str) -> str:
        """Enhanced fallback text extraction when all OCR methods fail"""
        logging.info(f"Using enhanced fallback extraction for {file_path}")
        
        try:
            # Try multiple extraction methods
            extracted_text = ""
            
            # Method 1: Try pdfplumber with different parameters
            try:
                with pdfplumber.open(file_path) as pdf:
                    laparams = {
                        "char_margin": 10.0,
                        "line_margin": 1.0,
                        "word_margin": 0.1
                    }
                    text1 = "\n".join([p.extract_text(laparams=laparams) or "" for p in pdf.pages])
                    if len(text1) > 100:  # If substantial text was extracted
                        extracted_text = text1
            except Exception as e:
                logging.error(f"pdfplumber extraction failed: {str(e)}")
                
            # Method 2: Try PyPDF2 if pdfplumber failed
            if not extracted_text:
                try:
                    import PyPDF2
                    with open(file_path, 'rb') as file:
                        reader = PyPDF2.PdfReader(file)
                        text2 = ""
                        for page in reader.pages:
                            text2 += page.extract_text() or ""
                        if len(text2) > 100:
                            extracted_text = text2
                except Exception as e:
                    logging.error(f"PyPDF2 extraction failed: {str(e)}")
            
            # If both methods failed, return a placeholder
            if not extracted_text:
                return f"[SCANNED DOCUMENT: Text extraction failed for {os.path.basename(file_path)}]"
            
            return extracted_text
            
        except Exception as e:
            logging.error(f"All extraction methods failed: {str(e)}")
            return f"[EXTRACTION FAILED: {os.path.basename(file_path)}]"
    def _init_feedback_templates(self):
        """Initialize natural language feedback templates with more variety and personalization options"""
        self.feedback_templates = {
            'hr_openers': [
                "This candidate stands out because...",
                "Our analysis reveals...",
                "Key strengths include...",
                "Top ranking justified by...",
                "This profile is particularly strong in...",
                "The candidate demonstrates exceptional...",
                "What makes this application notable is..."
            ],
            'strength_connectors': {
                'skills': [
                    "demonstrated expertise in", 
                    "proven capability with",
                    "extensive experience using",
                    "technical proficiency in",
                    "mastery of",
                    "specialized knowledge of"
                ],
                'education': [
                    "advanced training in",
                    "formal education focused on",
                    "degree specialization aligning with",
                    "academic background in",
                    "educational qualifications in"
                ],
                'experience': [
                    "proven track record of",
                    "extensive experience in",
                    "demonstrated success with",
                    "professional history showing",
                    "career progression in"
                ]
            },
            'comparative_phrases': [
                "exceeding the benchmark by {gap}",
                "{gap} above the average",
                "placing in the top {percentile} percentile",
                "significantly outperforming peers in",
                "standing out among applicants with"
            ],
            'jobseeker_openers': [
                "Here are some targeted suggestions to strengthen your application:",
                "To improve your candidacy for similar roles, consider:",
                "Your profile could be enhanced by addressing these areas:",
                "Based on our analysis, here are personalized recommendations:",
                "To better align with this position's requirements, focus on:"
            ],
            'improvement_suggestions': [
                "Consider developing skills in {missing_skills}",
                "Highlight more quantitative achievements in past roles",
                "Obtain certification in {suggested_certification}",
                "Increase project documentation specificity",
                "Strengthen your profile by demonstrating experience with {technology}",
                "Emphasize your achievements related to {relevant_area}",
                "Add metrics to showcase impact in previous roles"
            ]
        }

    # %% [markdown]
    # ## File Processing Utilities
    
    def _find_resume_files(self, root_folder: str) -> List[Dict]:
        """Recursively find all resume files with categories"""
        resume_files = []
        for root, _, files in os.walk(root_folder):
            for file in files:
                if file.lower().endswith(('.pdf', '.docx', '.txt')):
                    resume_files.append({
                        'path': os.path.join(root, file),
                        'category': os.path.basename(root)
                    })
        return resume_files

    def _extract_text(self, file_path: str) -> str:
        """Extract text from various document types with OCR fallback for scanned documents"""
        try:
            if file_path.lower().endswith('.pdf'):
                # Check if it's a scanned PDF
                if self._is_scanned_pdf(file_path):
                    logging.info(f"Detected scanned PDF: {file_path}")
                    return self._extract_text_with_ocr(file_path)
                else:
                    # Use regular text extraction for normal PDFs
                    with pdfplumber.open(file_path) as pdf:
                        text = "\n".join([page.extract_text(x_tolerance=1, y_tolerance=1) 
                                        for page in pdf.pages if page.extract_text()])
                        if text.strip():
                            return text
                        else:
                            # If no text was extracted, try OCR anyway
                            logging.info(f"PDF appears to be scanned or has no extractable text: {file_path}")
                            return self._extract_text_with_ocr(file_path)
            elif file_path.lower().endswith('.docx'):
                doc = docx.Document(file_path)
                return "\n".join([para.text for para in doc.paragraphs])
            elif file_path.lower().endswith('.txt'):
                with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                    return f.read()
            return ""
        except Exception as e:
            logging.error(f"Failed to process {file_path}: {str(e)}")
            return ""

    def _remove_pii(self, text: str) -> str:
        """Remove personally identifiable information using spaCy NER"""
        doc = self.nlp(text)
        redacted = []
        for ent in doc.ents:
            if ent.label_ in ['PERSON', 'EMAIL', 'PHONE', 'GPE']:
                redacted.append('[REDACTED]')
            else:
                redacted.append(ent.text)
        return ' '.join(redacted)

    def preprocess_text(self, text: str) -> str:
        """Normalise text for matching: redact PII, clean, tokenise and lemmatize.

        Experience mentions such as "5 years" or "3+ yrs" are captured from the
        redacted text before punctuation is stripped and are appended after
        the lemmatized tokens. Stop words and tokens of two characters or
        fewer are dropped.
        """
        redacted = self._remove_pii(text)
        experience_mentions = re.findall(r'\d+\+?\s*(?:years?|yrs?)', redacted.lower())

        # Replace everything except word characters, whitespace and '+', then collapse spaces
        without_punctuation = re.sub(r'[^\w\s+]', ' ', redacted)
        normalized = re.sub(r'\s+', ' ', without_punctuation).strip().lower()

        kept_tokens = []
        for token in nltk.word_tokenize(normalized):
            if token in self.stop_words or len(token) <= 2:
                continue
            kept_tokens.append(self.lemmatizer.lemmatize(token))

        kept_tokens.extend(experience_mentions)
        return ' '.join(kept_tokens)

    def _extract_contact_email(self, text: str) -> str:
        """Extract the first found email address from the text using regex"""
        email_pattern = r'[\w\.-]+@[\w\.-]+\.\w+'
        matches = re.findall(email_pattern, text)
        return matches[0] if matches else ''

    # %% [markdown]
    # ## Enhanced Feature Extraction
    
    def _extract_education_details(self, text: str) -> Dict:
        """Improved education extraction with degree detection"""
        doc = self.nlp(text)
        education = {
            'highest_degree': 'None',
            'degrees': [],
            'score': 0
        }
        
        for chunk in doc.noun_chunks:
            chunk_text = chunk.text.lower()
            for degree, config in self.education_terms.items():
                if any(fuzz.partial_ratio(kw, chunk_text) > 85 for kw in config['keywords']):
                    education['degrees'].append(degree)
                    if config['score'] > education['score']:
                        education.update({
                            'highest_degree': degree,
                            'score': config['score']
                        })
        return education

    def _extract_experience(self, text: str) -> Dict:
        """Advanced experience analysis using spaCy NER"""
        doc = self.nlp(text)
        experience = {
            'total_years': 0,
            'score': 0
        }
        
        dates = []
        for ent in doc.ents:
            if ent.label_ == 'DATE':
                dates.append(ent.text)
        
        experience['total_years'] = self._parse_dates(dates)
        experience['score'] = min(experience['total_years'], 15)
        
        return experience

    def _parse_dates(self, dates: List[str]) -> int:
        """Convert date entities to total years experience"""
        year_pattern = r'\b(20\d{2}|\d{2})\b'
        years = []
        for date_str in dates:
            matches = re.findall(year_pattern, date_str)
            if matches:
                years.extend([int(y) if len(y) == 4 else 2000 + int(y) for y in matches])
        
        if len(years) >= 2:
            return max(years) - min(years)
        return 0

    def _extract_skills(self, text: str) -> List[str]:
        """Hybrid skill extraction using fuzzy matching"""
        detected_skills = []
        flat_skills = [skill for cats in self.skill_matrix.values() for skill in cats]
        
        for skill in flat_skills:
            if process.extractOne(skill, text.split(), 
                                scorer=fuzz.token_set_ratio)[1] > self.config['skill_threshold']:
                detected_skills.append(skill)
        
        return list(set(detected_skills))

    def _skill_score(self, text: str) -> int:
        """Calculate normalized skill score"""
        detected_skills = self._extract_skills(text)
        return min(len(detected_skills), 20)  # Cap at 20 skills

    def _detect_certifications(self, text: str) -> List[str]:
        """Identify certifications in resume text"""
        certs = set()
        text_lower = text.lower()
        for cert, keywords in self.certifications.items():
            for kw in keywords:
                if re.search(r'\b' + re.escape(kw) + r'\b', text_lower):
                    certs.add(cert)
                    break
        return list(certs)

    def _project_count(self, text: str) -> int:
        """Improved project detection with context analysis"""
        project_keywords = r'\bproject\b|\bportfolio\b|\bwork\s+experience\b|\bselected\s+works?\b'
        sections = re.split(project_keywords, text, flags=re.IGNORECASE)
        return min(len(sections) - 1, 10)  # Subtract 1 for initial split

    # %% [markdown]
    # ## Parallel Processing Pipeline
    
    def process_resumes(self, dataset_path: str) -> None:
        """Parallel resume processing with ThreadPoolExecutor"""
        self.all_resumes = []
        files = self._find_resume_files(dataset_path)
        
        with ThreadPoolExecutor(max_workers=self.config['max_workers']) as executor:
            futures = [executor.submit(self._process_single_resume, file_info) 
                      for file_info in files]
            
            for future in futures:
                result = future.result()
                if result:
                    self.all_resumes.append(result)

    def _process_single_resume(self, file_info: Dict) -> Optional[Dict]:
        """Process individual resume with error handling"""
        try:
            raw_text = self._extract_text(file_info['path'])
            if not raw_text.strip():
                return None
                
            preprocessed = self.preprocess_text(raw_text)
            education_details = self._extract_education_details(preprocessed)
            experience_details = self._extract_experience(raw_text)
            contact_email = self._extract_contact_email(raw_text)
            
            return {
                'file_name': os.path.basename(file_info['path']),
                'job_category': file_info['category'],
                'contact_email': contact_email,
                'education': education_details['highest_degree'],
                'education_score': education_details['score'],
                'experience_score': experience_details['score'],
                'experience_years': experience_details['total_years'],
                'detected_skills': self._extract_skills(preprocessed),
                'certifications': self._detect_certifications(raw_text),
                'projects': self._project_count(raw_text),
                'jd_similarity': self._calculate_jd_similarity(preprocessed),
                'skill_score': self._skill_score(preprocessed),
                'total_score': 0,  # Will be calculated later
                'hr_feedback': '',
                'improvement_areas': ''
            }
        except Exception as e:
            logging.error(f"Error processing {file_info['path']}: {str(e)}")
            return None

    # %% [markdown]
    # ## Enhanced Scoring System
    
    def _calculate_jd_similarity(self, text: str) -> float:
        """Cached TF-IDF similarity calculation"""
        if not hasattr(self, '_jd_vector'):
            vectorizer = TfidfVectorizer()
            jd_clean = self.preprocess_text(self.job_description)
            self._jd_vector = vectorizer.fit_transform([jd_clean])
            self._vectorizer = vectorizer
        
        resume_vector = self._vectorizer.transform([text])
        return cosine_similarity(self._jd_vector, resume_vector)[0][0]

    def _calculate_scores(self) -> None:
        """Calculate final scores for all resumes"""
        for resume in self.all_resumes:
            scores = {
                'education': resume['education_score'],
                'experience': resume['experience_score'],
                'skills': resume['skill_score'],
                'certifications': len(resume['certifications']) * 2,
                'projects': resume['projects'],
                'jd_similarity': resume['jd_similarity']
            }
            
            weights = self.config['scoring_weights']
            resume['total_score'] = sum(scores[cat] * weights[cat] for cat in weights)

    # %% [markdown]
    # ## XAI Feedback Generation System
    
    def generate_feedback(self, df: pd.DataFrame) -> pd.DataFrame:
        """Attach HR and job-seeker feedback columns to a ranked DataFrame.

        Args:
            df: Ranked results. The code reads the ``rank``, ``skill_score``,
                ``experience_years``, ``education_score`` and ``jd_similarity``
                columns.

        Returns:
            The same DataFrame object with ``hr_feedback`` and
            ``job_seeker_feedback`` columns added (an empty frame is returned
            untouched).
        """
        if df.empty:
            return df

        # Benchmark pool: every row ranked within the top 10 or within the
        # configured sample fraction, whichever cutoff is larger.
        pool_cutoff = max(10, int(len(df) * self.config['benchmark_sample']))
        benchmark_pool = df[df['rank'] <= pool_cutoff]
        benchmarks = {
            'skills': benchmark_pool['skill_score'].quantile(0.75),
            'experience': benchmark_pool['experience_years'].median(),
            'education': benchmark_pool['education_score'].max(),
            'jd_similarity': benchmark_pool['jd_similarity'].mean()
        }

        def hr_text(row):
            # HR feedback is only written for the best-ranked candidates.
            if row['rank'] <= self.config['hr_feedback_top_n']:
                return self._generate_hr_feedback(row, benchmarks)
            return ''

        def seeker_text(row):
            # Improvement advice is limited to the configured rank window.
            if self.config['feedback_min_rank'] <= row['rank'] <= self.config['feedback_max_rank']:
                return self._identify_improvement_areas(row, benchmarks)
            return ''

        df['hr_feedback'] = df.apply(hr_text, axis=1)
        df['job_seeker_feedback'] = df.apply(seeker_text, axis=1)

        return df

    def _generate_hr_feedback(self, candidate: pd.Series, benchmarks: dict) -> str:
        """Generate detailed natural language feedback for HR with specific candidate insights"""
        # Identify top strengths
        strengths = self._identify_key_strengths(candidate, benchmarks)
        
        # Build feedback sentence
        opener = random.choice(self.feedback_templates['hr_openers'])
        
        # Create detailed strength descriptions with comparative elements
        strength_phrases = []
        for stype, values in strengths.items():
            if values:
                connector = random.choice(self.feedback_templates['strength_connectors'][stype])
                values_str = ', '.join(values)
                
                # Add comparative element if applicable
                if stype == 'skills' and candidate['skill_score'] > benchmarks['skills']:
                    gap = f"{(candidate['skill_score'] - benchmarks['skills']):.1f} points"
                    percentile = random.randint(85, 95)
                    comparative = random.choice(self.feedback_templates['comparative_phrases'])
                    comparative = comparative.format(gap=gap, percentile=percentile)
                    strength_phrases.append(f"{connector} {values_str} ({comparative})")
                elif stype == 'experience' and candidate['experience_years'] > benchmarks['experience']:
                    gap = f"{(candidate['experience_years'] - benchmarks['experience']):.1f} years"
                    comparative = random.choice(self.feedback_templates['comparative_phrases'])
                    comparative = comparative.format(gap=gap, percentile="N/A")
                    strength_phrases.append(f"{connector} {values_str} ({comparative})")
                else:
                    strength_phrases.append(f"{connector} {values_str}")
        
        # Add differentiators
        differentiators = self._identify_differentiators(candidate)
        if differentiators:
            strength_phrases.append(f"Notable differentiators: {differentiators}")
        
        # Add JD relevance if it's high
        if candidate['jd_similarity'] > 0.7:
            jd_match = f"{candidate['jd_similarity']*100:.1f}%"
            strength_phrases.append(f"Exceptional job description match of {jd_match}")
            
        # Generate final feedback
        feedback = f"{opener} {'. '.join(strength_phrases)}."
        
        # Add specific recommendation for this candidate if applicable
        if candidate['certifications']:
            cert_str = ', '.join(candidate['certifications'][:2])
            feedback += f" Particularly valuable are the {cert_str} certifications which align with our technology stack."
        
        return feedback

    def _identify_key_strengths(self, candidate: pd.Series, benchmarks: dict) -> dict:
        """Identify candidate's standout features"""
        strengths = {}
        
        # Skill strength
        if candidate['skill_score'] > benchmarks['skills']:
            top_skills = candidate['detected_skills'][:3]
            strengths['skills'] = [f"{s} ({self._get_skill_context(s)})" for s in top_skills]
            
        # Experience strength
        if candidate['experience_years'] > benchmarks['experience']:
            exp_strength = f"{candidate['experience_years']} years (vs avg {benchmarks['experience']:.1f})"
            strengths['experience'] = [exp_strength]
            
        return strengths

    def _get_skill_context(self, skill: str) -> str:
        """Add contextual description for skills"""
        contexts = {
            'python': "Python development",
            'aws': "cloud infrastructure",
            'machine learning': "predictive modeling",
            'react': "frontend development"
        }
        return contexts.get(skill.lower(), "relevant technical area")

    def _identify_differentiators(self, candidate: pd.Series) -> str:
        """Find unique candidate differentiators"""
        diffs = []
        
        # Certification differentiator
        if candidate['certifications']:
            diffs.append(f"Certifications: {', '.join(candidate['certifications'])}")
            
        # Project differentiator
        if candidate['projects'] > 5:
            diffs.append(f"Substantial project portfolio ({candidate['projects']} documented)")
            
        return '; '.join(diffs) if diffs else ''

# Job-seeker feedback helpers. The methods below belong to the ResumeRanker class
# and must stay indented at the same level as the other methods of the class body.

    def _get_skill_importance_context(self, skill: str) -> str:
        """Explain why a particular skill is important"""
        contexts = {
            'python': "essential for data processing and backend development",
            'react': "increasingly in demand for modern web applications",
            'aws': "critical for cloud-native application development",
            'kubernetes': "valuable for containerized application orchestration",
            'machine learning': "growing area for predictive analytics solutions",
            'ci/cd': "key for modern software delivery practices"
        }
        return contexts.get(skill.lower(), "highly valued in current job market")

    def _suggest_relevant_certification(self, skills: List[str]) -> str:
        """Suggest certification based on candidate's existing skills"""
        skill_to_cert = {
            'python': "Python Professional",
            'java': "Oracle Java",
            'javascript': "JavaScript Fullstack",
            'react': "React Developer",
            'aws': "AWS Solutions Architect",
            'azure': "Azure Developer",
            'kubernetes': "CKA (Certified Kubernetes Administrator)",
            'docker': "Docker Certified Associate",
            'machine learning': "TensorFlow Developer"
        }
        
        # Find matching certification based on skills
        for skill in skills:
            if skill.lower() in skill_to_cert:
                return skill_to_cert[skill.lower()]
        
        # Default certifications if no match
        return random.choice(["AWS Cloud Practitioner", "Scrum Master", "CompTIA A+"])

    def _identify_improvement_areas(self, candidate: pd.Series, benchmarks: dict) -> str:
        """Generate detailed personalized improvement suggestions with actionable insights"""
        opener = random.choice(self.feedback_templates['jobseeker_openers'])
        gaps = []
        
        # Skill gaps analysis
        missing_skills = self._get_missing_skills(candidate)
        if missing_skills:
            skills_str = ', '.join(missing_skills[:3])
            skill_suggestion = f"Develop skills in: {skills_str}"
            
            # Add specific context why these skills matter
            context = self._get_skill_importance_context(missing_skills[0]) if missing_skills else ""
            if context:
                skill_suggestion += f" ({context})"
            gaps.append(skill_suggestion)
        
        # Experience gaps with specific recommendations
        if candidate['experience_years'] < benchmarks['experience']:
            gap = benchmarks['experience'] - candidate['experience_years']
            exp_suggestion = f"Gain {gap:.1f} more years of relevant experience"
            
            # Add specific advice
            if candidate['experience_years'] > 0:
                exp_suggestion += " by seeking roles with greater responsibility or project leadership"
            else:
                exp_suggestion += " through internships, freelance work, or open-source contributions"
            gaps.append(exp_suggestion)
        
        # Certification gaps with personalized recommendations
        if not candidate['certifications']:
            # Choose certification based on candidate's existing skills
            suggested = self._suggest_relevant_certification(candidate['detected_skills'])
            cert_suggestion = f"Consider {suggested} certification to validate expertise"
            gaps.append(cert_suggestion)
        
        # Project portfolio improvement
        if candidate['projects'] < 3:
            gaps.append("Showcase more projects with quantifiable results and technical details")
        
        # JD alignment suggestion
        if candidate['jd_similarity'] < 0.6:
            gaps.append("Align resume keywords more closely with job descriptions in your target role")
        
        # Format the final feedback
        full_feedback = f"{opener} {' '.join(gaps)}"
        
        # Add a personalized closing statement
        if gaps:
            full_feedback += " These targeted improvements could significantly strengthen your competitiveness for similar positions."
        else:
            full_feedback = "Your profile is strong across key areas. Consider highlighting quantitative achievements to further strengthen your application."
        
        return full_feedback

    def _get_missing_skills(self, candidate: pd.Series) -> list:
        """Identify skills present in top candidates but missing"""
        top_skills = set()
        top_resumes = self.all_resumes[:int(len(self.all_resumes)*self.config['benchmark_sample'])]
        for resume in top_resumes:
            top_skills.update(resume['detected_skills'])
                
        return list(top_skills - set(candidate['detected_skills']))

    def generate_jobseeker_feedback(self, output_dir: str = 'candidate_feedback'):
        """Write one personalized feedback text file per eligible candidate.

        Candidates are taken from ``get_ranked_results()`` and filtered to the
        rank window ``feedback_min_rank``..``feedback_max_rank`` from the
        config. Files are named after the candidate's e-mail address, or the
        resume file name when no e-mail is available.

        Args:
            output_dir: Directory that receives the files; created if missing.
        """
        os.makedirs(output_dir, exist_ok=True)

        df = self.get_ranked_results()  # Ranked results carry the 'rank' column
        if df.empty:
            # An empty result has no 'rank' column at all, so stop here.
            logging.info("No ranked candidates available; no feedback files generated")
            return

        targets = df[(df['rank'] >= self.config['feedback_min_rank']) &
                     (df['rank'] <= self.config['feedback_max_rank'])]

        for _, candidate in targets.iterrows():
            email = candidate['contact_email']
            # Only a non-blank string counts as an e-mail (guards against None/NaN).
            has_email = isinstance(email, str) and email.strip()
            stem = email.strip() if has_email else str(candidate['file_name'])
            # The stem originates from resume content / file names: replace
            # path separators and characters that are invalid in file names.
            safe_stem = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', stem)
            filename = f"{safe_stem}_feedback.txt"
            filepath = os.path.join(output_dir, filename)

            feedback_content = self._format_jobseeker_feedback(candidate)

            # Explicit UTF-8 so non-ASCII text does not depend on the platform default.
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(feedback_content)
            logging.info(f"Generated feedback for {filename}")

    # OCR methods for scanned documents (EasyOCR and the OCR.space cloud API)
    def _extract_text_with_easyocr(self, file_path: str) -> str:
        """Extract text from a scanned PDF using EasyOCR.

        The PDF is rendered to images at 200 dpi and each page is passed to a
        lazily created, CPU-only English EasyOCR reader that is cached on the
        instance.

        The Poppler location is taken from the ``POPPLER_PATH`` environment
        variable, falling back to the previous hard-coded Windows install
        directory. If that directory does not exist, ``poppler_path=None`` is
        passed so pdf2image looks for Poppler on the system ``PATH``.

        Args:
            file_path: Path of the PDF to process.

        Returns:
            Page texts joined by newlines, or ``""`` on any failure so that the
            caller can try the next OCR method.
        """
        try:
            logging.info(f"Processing scanned document with EasyOCR: {file_path}")

            # Initialize EasyOCR reader (only do this once)
            if not hasattr(self, 'easyocr_reader'):
                import easyocr
                logging.info("Initializing EasyOCR reader (first use)...")
                self.easyocr_reader = easyocr.Reader(['en'], gpu=False)

            # Resolve Poppler without assuming one specific machine layout.
            poppler_path = os.environ.get(
                'POPPLER_PATH',
                r'C:\Program Files\poppler\poppler-24.08.0\Library\bin',
            )
            if not os.path.isdir(poppler_path):
                poppler_path = None

            images = convert_from_path(file_path, poppler_path=poppler_path, dpi=200)

            extracted_text = []
            for page_number, image in enumerate(images, start=1):
                # EasyOCR is given the page as a numpy array.
                results = self.easyocr_reader.readtext(np.array(image))

                # The text is the second element of each result entry.
                extracted_text.append(' '.join(result[1] for result in results))

                logging.info(f"EasyOCR completed for page {page_number} of {file_path}")

            return "\n".join(extracted_text)
        except Exception as e:
            logging.error(f"EasyOCR processing failed for {file_path}: {str(e)}")
            return ""  # Return empty string to trigger next OCR method
    
    def _extract_text_with_cloud_ocr(self, file_path: str) -> str:
        """Extract text using OCR.space API"""
        try:
            logging.info(f"Processing with Cloud OCR: {file_path}")
            
            # Convert PDF to images
            poppler_path = r'C:\Program Files\poppler\poppler-24.08.0\Library\bin'
            images = convert_from_path(file_path, poppler_path=poppler_path, dpi=200)
            
            # Process each page
            extracted_text = []
            for i, image in enumerate(images):
                # Save image temporarily
                temp_img_path = f"temp_ocr_page_{i}.png"
                image.save(temp_img_path)
                
                # API configuration - get a free key from https://ocr.space/ocrapi
                api_key = 'YOUR_API_KEY'  # Replace with your actual key
                payload = {
                    'apikey': api_key,
                    'language': 'eng',
                    'isOverlayRequired': False,
                    'detectOrientation': True
                }
                
                with open(temp_img_path, 'rb') as f:
                    r = requests.post(
                        'https://api.ocr.space/parse/image',
                        files={temp_img_path: f},
                        data=payload
                    )
                
                # Clean up temp file
                os.remove(temp_img_path)
                
                # Extract results
                result = r.json()
                if result.get('ParsedResults'):
                    page_text = result['ParsedResults'][0]['ParsedText']
                    extracted_text.append(page_text)
                
                logging.info(f"Cloud OCR completed for page {i+1}")
            
            return "\n".join(extracted_text)
        except Exception as e:
            logging.error(f"Cloud OCR failed: {str(e)}")
            return ""  # Return empty string to trigger fallback
        
    def _format_jobseeker_feedback(self, candidate: dict) -> str:
        """Format personalized feedback document using job_seeker_feedback content"""
        return f"""
        Dear Candidate,
        
        Thank you for your application. Here's personalized feedback to help strengthen your profile:
        
        {candidate['job_seeker_feedback']}
        
        Key Strengths:
        - {random.choice(self._get_strengths_list(candidate))}
        
        Best regards,
        HR Analytics Team
        """
        
    def _get_strengths_list(self, candidate: dict) -> list:
        """Identify candidate strengths for feedback"""
        strengths = []
        if candidate['skill_score'] > 0.6 * self.config['scoring_weights']['skills']:
            strengths.append(f"Strong technical skills in {', '.join(candidate['detected_skills'][:3])}")
        if candidate['projects'] > 3:
            strengths.append(f"Rich project experience ({candidate['projects']} documented)")
        return strengths if strengths else ["Solid foundational qualifications"]

    def _get_recommendations(self, candidate: dict) -> str:
        """Generate actionable recommendations"""
        recs = []
        if len(candidate['certifications']) < 2:
            recs.append(f"Consider {random.choice(list(self.certifications.keys()))} certification")
        if candidate['jd_similarity'] < 0.6:
            recs.append("Tailor resume keywords to better match job descriptions")
        if not recs:
            recs.append("Enhance quantitative achievements in role descriptions")
        return '\n'.join(f"- {r}" for r in recs)

    def get_ranked_results(self) -> pd.DataFrame:
        """Score, rank and annotate all processed resumes.

        Every call recomputes the scores, regenerates the feedback and
        overwrites ``self.all_resumes`` with the ranked records (which then
        carry the ``rank`` and feedback fields).

        Returns:
            A DataFrame sorted by rank with a fixed column order, or an empty
            DataFrame when no resumes are available.
        """
        self._calculate_scores()
        ranked = pd.DataFrame(self.all_resumes)

        if ranked.empty:
            return pd.DataFrame()

        # Rank 1 is the highest total score; ties share the lowest rank number.
        ranked['rank'] = ranked['total_score'].rank(ascending=False, method='min').astype(int)
        ranked = ranked.sort_values('rank')

        ranked = self.generate_feedback(ranked)

        # Keep the instance state in sync so later steps can read the rank.
        self.all_resumes = ranked.to_dict(orient='records')

        column_order = [
            'rank', 'file_name', 'job_category', 'contact_email',
            'education', 'education_score', 'experience_score', 'experience_years',
            'detected_skills', 'certifications', 'projects',
            'jd_similarity', 'skill_score', 'total_score',
            'hr_feedback', 'job_seeker_feedback',
        ]
        return ranked[column_order].reset_index(drop=True)


# %% [markdown]
# ## Execution Example with Feedback Generation

if __name__ == "__main__":
    # End-to-end example: rank every resume under "Datasets", save the ranking
    # as CSV and write one feedback file per eligible candidate. Any exception
    # is logged as critical (with traceback) instead of being re-raised.
    try:
        # The dependency check is advisory: a non-empty result only prints a notice.
        missing_deps = ResumeRanker.check_dependencies()
        if missing_deps:
            print("Some dependencies are missing but we'll try to continue with limited functionality")

        jd = """Software Engineer with 3+ years experience in Python and cloud technologies"""
        ranker = ResumeRanker(jd)
        ranker.process_resumes("Datasets")

        results = ranker.get_ranked_results()
        if results.empty:
            print("No resumes processed successfully")
        else:
            # Persist the ranking before producing the per-candidate files.
            results.to_csv("enhanced_rankings.csv", index=False)
            # Optional preview of the best candidates:
            # print("\nTop 5 Candidates with HR Feedback:")
            # print(results[['rank', 'file_name', 'hr_feedback']].head(5))

            ranker.generate_jobseeker_feedback()
            print("\nGenerated candidate feedback files in 'candidate_feedback' directory")

    except Exception as e:
        logging.critical(f"Fatal error: {str(e)}", exc_info=True)

# %% [markdown]
# ## Feedback Analysis Cell (Jupyter-specific)

def analyze_feedback(csv_path: str = "enhanced_rankings.csv"):
    """Jupyter helper that summarizes the feedback stored in the rankings CSV.

    Shows up to five HR feedback samples and the most frequent job-seeker
    feedback text. Relies on the notebook-provided ``display`` function.

    Args:
        csv_path: CSV file written from the ranked results.
    """
    df = pd.read_csv(csv_path)

    # The ranked export stores candidate advice in 'job_seeker_feedback';
    # 'improvement_areas' is only used for CSVs that still carry that column.
    seeker_column = (
        'job_seeker_feedback' if 'job_seeker_feedback' in df.columns else 'improvement_areas'
    )

    def has_text(column: str) -> pd.Series:
        # Empty strings come back from CSV as NaN, so a plain `!= ''` test
        # would keep them; treat NaN and blank strings alike.
        return df[column].fillna('').astype(str).str.strip() != ''

    print("HR Feedback Samples:")
    display(df[has_text('hr_feedback')][['file_name', 'hr_feedback']].head(5))

    print("\nCommon Improvement Areas:")
    improvement_counts = df[has_text(seeker_column)][seeker_column].value_counts()
    display(improvement_counts.head(1))

# Usage in Jupyter:
# analyze_feedback()

In [None]:
# Load the ranked results that the main execution cell saved to CSV.
df = pd.read_csv("enhanced_rankings.csv")

In [None]:
# Sanity check for the pdf2image Python package. A successful import does NOT
# prove that the Poppler binaries are installed: those are only invoked when a
# PDF is actually converted.
from pdf2image import convert_from_path
print("pdf2image imported successfully (Poppler itself is only exercised when a PDF is converted)")

In [None]:
# Add at the beginning of your script
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [None]:
# Preview the first rows of the loaded rankings.
df.head()

In [None]:
# HR feedback of the first (top-ranked) row, addressed by column name instead
# of the positional index 14 so it survives column reordering.
# To see job seeker feedback, look at the generated files (or the
# 'job_seeker_feedback' column).
df['hr_feedback'].iloc[0]

In [None]:
# Add this after main execution
if os.path.exists("candidate_feedback"):
    sample_file = next(os.walk("candidate_feedback"))[2][0]
    with open(os.path.join("candidate_feedback", sample_file), 'r') as f:
        print("\nSample Feedback Content:")
        print(f.read())