# 1. Imports and Setup

In [1]:
import os
import re
import nltk
import pandas as pd
import PyPDF2
import docx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### Download NLTK resources

In [2]:
# Add this to the beginning of your code after importing nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Aman
[nltk_data]     NSU\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Aman
[nltk_data]     NSU\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# 2. ResumeRanker Class Definition

In [3]:
class ResumeRanker:
    # ... (keep all previous methods and init)

    def load_resumes(self, folder_path):
        """Load resumes from a folder into the analyzer."""
        self.resumes = []
        
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            
            # Skip directories
            if not os.path.isfile(file_path):
                continue
                
            # Extract text based on file type
            text = ""
            if filename.endswith('.pdf'):
                text = self.extract_text_from_pdf(file_path)
            elif filename.endswith('.docx'):
                text = self.extract_text_from_docx(file_path)
            elif filename.endswith('.txt'):
                text = self.extract_text_from_txt(file_path)
            
            # Store valid resumes
            if text.strip():
                self.resumes.append({
                    'filename': filename,
                    'text': text,
                    'processed_text': self.preprocess_text(text)
                })
        
        print(f"Loaded {len(self.resumes)} resumes from {folder_path}")
        return len(self.resumes)

# 3. Text Extraction Methods

In [4]:
    def extract_text_from_pdf(self, pdf_path):
        """Extracts text content from PDF files using PyPDF2."""
        text = ""
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                for page in reader.pages:
                    text += page.extract_text() + "\n"
        except Exception as e:
            print(f"PDF read error ({pdf_path}): {e}")
        return text

    def extract_text_from_docx(self, docx_path):
        """Extracts text from DOCX files using python-docx."""
        text = ""
        try:
            doc = docx.Document(docx_path)
            for para in doc.paragraphs:
                text += para.text + "\n"
        except Exception as e:
            print(f"DOCX read error ({docx_path}): {e}")
        return text

    def extract_text_from_txt(self, txt_path):
        """Extracts text from plain text files."""
        try:
            with open(txt_path, 'r', encoding='utf-8') as file:
                return file.read()
        except UnicodeDecodeError:
            try:
                with open(txt_path, 'r', encoding='latin-1') as file:
                    return file.read()
            except Exception as e:
                print(f"TXT read error ({txt_path}): {e}")
                return ""
        except Exception as e:
            print(f"TXT read error ({txt_path}): {e}")
            return ""


# 4. Text Processing Methods


In [5]:
    def preprocess_text(self, text):
        """Cleans and normalizes text for analysis."""
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'[^\w\s]', ' ', text)  # Remove special characters
        text = re.sub(r'\d+', ' ', text)  # Remove numbers
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
        words = text.split()  # Simple tokenization
        return ' '.join([w for w in words if w not in self.stop_words])  # Remove stopwords


# 5. Feature Extraction Methods


In [6]:
    def extract_education(self, text):
        """Identifies highest education level from resume text."""
        education_level = 0
        text_lower = text.lower()
        for degree, score in self.education_scores.items():
            if degree in text_lower:
                education_level = max(education_level, score)
        return education_level

    def extract_experience_years(self, text):
        """Extracts total years of experience using regex patterns."""
        text_lower = text.lower()
        experience_patterns = [
            r'(\d+)\+?\s*years?\s+(?:of\s+)?experience',
            r'experience\s+(?:of\s+)?(\d+)\+?\s*years?',
            r'worked\s+(?:for\s+)?(\d+)\+?\s*years?'
        ]
        max_years = 0
        for pattern in experience_patterns:
            matches = re.findall(pattern, text_lower)
            if matches:
                for match in matches:
                    max_years = max(max_years, int(match))
        return max_years

    def count_skill_keywords(self, text):
        """Counts skill keywords in resume text."""
        text_lower = text.lower()
        skill_counts = {category: 0 for category in self.skill_keywords}
        for category, keywords in self.skill_keywords.items():
            for keyword in keywords:
                pattern = r'\b' + re.escape(keyword) + r'\b'
                matches = re.findall(pattern, text_lower)
                skill_counts[category] += len(matches)
        return skill_counts

# 6. Scoring and Ranking Methods

In [7]:
    def calculate_keyword_score(self, skill_counts):
        """Calculates weighted keyword score."""
        weights = {
            'programming': 0.25,
            'data_science': 0.2,
            'cloud': 0.15,
            'databases': 0.15,
            'soft_skills': 0.25
        }
        score = 0
        total_keywords = sum(skill_counts.values())
        for category, count in skill_counts.items():
            if total_keywords > 0:
                score += (count / total_keywords) * weights[category] * 10
        return score

    def calculate_jd_similarity(self, resume_text):
        """Calculates cosine similarity between resume and job description."""
        if not self.job_description:
            return 0
        vectorizer = TfidfVectorizer()
        try:
            vectors = vectorizer.fit_transform([
                self.preprocess_text(self.job_description),
                self.preprocess_text(resume_text)
            ])
            return cosine_similarity(vectors[0:1], vectors[1:2])[0][0] * 10
        except:
            return 0


# 7. Resume Loading and Analysis


In [8]:
    def load_resumes(self, folder_path):
        """Loads resumes from a folder and processes them."""
        self.resumes = []
        self.resume_data = []
        
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            if not os.path.isfile(file_path):
                continue
                
            text = ""
            if filename.endswith('.pdf'):
                text = self.extract_text_from_pdf(file_path)
            elif filename.endswith('.docx'):
                text = self.extract_text_from_docx(file_path)
            elif filename.endswith('.txt'):
                text = self.extract_text_from_txt(file_path)
                
            if text:
                self.resumes.append({
                    'filename': filename,
                    'text': text,
                    'processed_text': self.preprocess_text(text)
                })
        
        print(f"Loaded {len(self.resumes)} resumes from {folder_path}")
        return len(self.resumes)

    def analyze_resumes(self):
        """Analyzes all loaded resumes and calculates scores."""
        self.resume_data = []
        for resume in self.resumes:
            text = resume['text']
            education_score = self.extract_education(text)
            experience_years = self.extract_experience_years(text)
            skill_counts = self.count_skill_keywords(text)
            keyword_score = self.calculate_keyword_score(skill_counts)
            jd_similarity = self.calculate_jd_similarity(text)
            experience_score = min(10, experience_years)
            
            total_score = (
                education_score * 0.2 +
                experience_score * 0.3 +
                keyword_score * 0.25 +
                jd_similarity * 0.25
            )
            
            self.resume_data.append({
                'filename': resume['filename'],
                'education_score': education_score,
                'experience_years': experience_years,
                'experience_score': experience_score,
                'keyword_score': keyword_score,
                'jd_similarity': jd_similarity,
                'total_score': total_score,
                'skills': skill_counts
            })

    def rank_resumes(self, ascending=True):
        """Ranks resumes based on calculated scores."""
        if not self.resume_data:
            print("No resumes analyzed yet. Call analyze_resumes() first.")
            return pd.DataFrame()
        df = pd.DataFrame(self.resume_data)
        return df.sort_values('total_score', ascending=ascending)

    def get_top_resumes(self, n=10, ascending=False):
        """Returns top N resumes based on ranking."""
        ranked_df = self.rank_resumes(ascending=not ascending)
        return ranked_df.head(n)

    def export_results(self, output_path, ascending=True):
        """Exports ranking results to CSV file."""
        ranked_df = self.rank_resumes(ascending=ascending)
        ranked_df.to_csv(output_path, index=False)
        print(f"Results exported to {output_path}")


# 8. Batch Processing Function

In [9]:
def process_all_roles(base_path="Datasets/data/data", output_dir="results"):
    """
    Processes resumes in all job role folders.
    
    Args:
        base_path (str): Root directory containing job role folders
        output_dir (str): Directory to save ranking results
    """
    os.makedirs(output_dir, exist_ok=True)
    
    # Get list of all job role folders
    job_roles = [d for d in os.listdir(base_path) 
                if os.path.isdir(os.path.join(base_path, d))]
    
    for role in job_roles:
        print(f"\nProcessing resumes for: {role}")
        role_path = os.path.join(base_path, role)
        
        # Initialize ranker with role name as JD
        ranker = ResumeRanker(job_description=role)
        
        # Load and process resumes
        if ranker.load_resumes(role_path) > 0:
            ranker.analyze_resumes()
            output_path = os.path.join(output_dir, f"{role}_rankings.csv")
            ranker.export_results(output_path)
            print(f"Saved results for {role} to {output_path}")
        else:
            print(f"No resumes found in {role_path}")

# 9. Main Execution

In [10]:
if __name__ == "__main__":
    # Process all job roles in the dataset
    process_all_roles()


Processing resumes for: ACCOUNTANT


TypeError: ResumeRanker() takes no arguments

In [11]:
# %% [markdown]
# ## 1. Imports and NLTK Setup

import os
import re
import nltk
import pandas as pd
import PyPDF2
import docx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# %% [markdown]
# ## 2. ResumeRanker Class Definition

class ResumeRanker:
    """Class to parse, analyze, and rank resumes based on job descriptions."""
    
    def __init__(self, job_description=None):
        """
        Initialize the ResumeRanker with optional job description.
        
        Args:
            job_description (str, optional): Job description text for comparison
        """
        self.job_description = job_description
        self.resumes = []  # Stores raw resume data
        self.resume_data = []  # Stores analyzed metrics
        self.stop_words = set(stopwords.words('english'))
        
        # Skill keywords configuration
        self.skill_keywords = {
            'programming': ['python', 'java', 'javascript', 'c++', 'ruby', 'php', 'sql', 'r', 
                           'html', 'css', 'react', 'node', 'angular', 'vue', 'django', 'flask'],
            'data_science': ['machine learning', 'data analysis', 'statistics', 'pandas', 
                            'numpy', 'tensorflow', 'scikit-learn', 'keras', 'pytorch', 'nlp'],
            'cloud': ['aws', 'azure', 'gcp', 'docker', 'kubernetes', 'terraform', 'ci/cd'],
            'databases': ['mysql', 'postgresql', 'mongodb', 'oracle', 'redis', 'elasticsearch'],
            'soft_skills': ['leadership', 'teamwork', 'communication', 'problem solving', 
                           'time management', 'project management']
        }
        
        # Education scoring system
        self.education_scores = {
            'bachelor': 3, 'bs': 3, 'ba': 3,
            'master': 4, 'ms': 4, 'ma': 4, 'mba': 4,
            'phd': 5, 'doctorate': 5,
            'associate': 2
        }

    # %% [markdown]
    # ## 3. Text Extraction Methods
    
    def extract_text_from_pdf(self, pdf_path):
        """Extract text content from PDF files."""
        text = ""
        try:
            with open(pdf_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                for page in reader.pages:
                    text += page.extract_text() + "\n"
        except Exception as e:
            print(f"Error reading PDF {pdf_path}: {e}")
        return text

    def extract_text_from_docx(self, docx_path):
        """Extract text from DOCX files."""
        text = ""
        try:
            doc = docx.Document(docx_path)
            for para in doc.paragraphs:
                text += para.text + "\n"
        except Exception as e:
            print(f"Error reading DOCX {docx_path}: {e}")
        return text

    def extract_text_from_txt(self, txt_path):
        """Extract text from plain text files."""
        try:
            with open(txt_path, 'r', encoding='utf-8') as file:
                return file.read()
        except UnicodeDecodeError:
            try:
                with open(txt_path, 'r', encoding='latin-1') as file:
                    return file.read()
            except Exception as e:
                print(f"Error reading TXT {txt_path}: {e}")
                return ""
        except Exception as e:
            print(f"Error reading TXT {txt_path}: {e}")
            return ""

    # %% [markdown]
    # ## 4. Text Processing
    
    def preprocess_text(self, text):
        """Clean and normalize text for analysis."""
        # Convert to lowercase
        text = text.lower()
        # Remove special characters and numbers
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\d+', ' ', text)
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        # Tokenize and remove stopwords
        words = text.split()
        return ' '.join([w for w in words if w not in self.stop_words])

    # %% [markdown]
    # ## 5. Feature Extraction
    
    def extract_education(self, text):
        """Identify highest education level from text."""
        education_level = 0
        text_lower = text.lower()
        for degree, score in self.education_scores.items():
            if degree in text_lower:
                education_level = max(education_level, score)
        return education_level

    def extract_experience_years(self, text):
        """Extract years of experience using regex patterns."""
        text_lower = text.lower()
        experience_patterns = [
            r'(\d+)\+?\s*years?\s+experience',
            r'experience\s+of\s+(\d+)\+?\s*years?',
            r'worked\s+for\s+(\d+)\+?\s*years?'
        ]
        max_years = 0
        for pattern in experience_patterns:
            matches = re.findall(pattern, text_lower)
            if matches:
                max_years = max(max_years, *map(int, matches))
        return max_years

    def count_skill_keywords(self, text):
        """Count skill keywords in resume text."""
        text_lower = text.lower()
        skill_counts = {category: 0 for category in self.skill_keywords}
        for category, keywords in self.skill_keywords.items():
            for keyword in keywords:
                pattern = r'\b' + re.escape(keyword) + r'\b'
                skill_counts[category] += len(re.findall(pattern, text_lower))
        return skill_counts

    # %% [markdown]
    # ## 6. Scoring & Ranking
    
    def calculate_keyword_score(self, skill_counts):
        """Calculate weighted keyword score."""
        weights = {
            'programming': 0.25,
            'data_science': 0.2,
            'cloud': 0.15,
            'databases': 0.15,
            'soft_skills': 0.25
        }
        total = sum(skill_counts.values())
        return sum((count/total)*weights[cat]*10 for cat, count in skill_counts.items()) if total > 0 else 0

    def calculate_jd_similarity(self, resume_text):
        """Calculate similarity between resume and job description."""
        if not self.job_description:
            return 0
        try:
            vectorizer = TfidfVectorizer()
            vectors = vectorizer.fit_transform([
                self.preprocess_text(self.job_description),
                self.preprocess_text(resume_text)
            ])
            return cosine_similarity(vectors[0:1], vectors[1:2])[0][0] * 10
        except:
            return 0

    # %% [markdown]
    # ## 7. Resume Processing
    
    def load_resumes(self, folder_path):
        """Load resumes from a folder."""
        self.resumes = []
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            if not os.path.isfile(file_path):
                continue
                
            text = ""
            if filename.endswith('.pdf'):
                text = self.extract_text_from_pdf(file_path)
            elif filename.endswith('.docx'):
                text = self.extract_text_from_docx(file_path)
            elif filename.endswith('.txt'):
                text = self.extract_text_from_txt(file_path)
            
            if text.strip():
                self.resumes.append({
                    'filename': filename,
                    'text': text,
                    'processed_text': self.preprocess_text(text)
                })
        print(f"Loaded {len(self.resumes)} resumes from {folder_path}")
        return len(self.resumes)

    def analyze_resumes(self):
        """Analyze all loaded resumes."""
        self.resume_data = []
        for resume in self.resumes:
            features = {
                'filename': resume['filename'],
                'education_score': self.extract_education(resume['text']),
                'experience_years': self.extract_experience_years(resume['text']),
                'skills': self.count_skill_keywords(resume['text'])
            }
            features['experience_score'] = min(10, features['experience_years'])
            features['keyword_score'] = self.calculate_keyword_score(features['skills'])
            features['jd_similarity'] = self.calculate_jd_similarity(resume['text'])
            features['total_score'] = (
                features['education_score'] * 0.2 +
                features['experience_score'] * 0.3 +
                features['keyword_score'] * 0.25 +
                features['jd_similarity'] * 0.25
            )
            self.resume_data.append(features)

    def rank_resumes(self, ascending=True):
        """Return sorted DataFrame of resumes."""
        if not self.resume_data:
            return pd.DataFrame()
        return pd.DataFrame(self.resume_data).sort_values('total_score', ascending=ascending)

    def export_results(self, output_path):
        """Export results to CSV file."""
        self.rank_resumes().to_csv(output_path, index=False)
        print(f"Results saved to {output_path}")

# %% [markdown]
# ## 8. Batch Processing Function

def process_all_roles(base_path="Datasets/data/data", output_dir="results"):
    """Process all job role folders in the dataset."""
    os.makedirs(output_dir, exist_ok=True)
    
    # Get list of job role folders
    job_roles = [d for d in os.listdir(base_path) 
                if os.path.isdir(os.path.join(base_path, d))]
    
    for role in job_roles:
        print(f"\n{'='*40}\nProcessing: {role}\n{'='*40}")
        role_path = os.path.join(base_path, role)
        ranker = ResumeRanker(job_description=role)
        
        if ranker.load_resumes(role_path) > 0:
            ranker.analyze_resumes()
            output_path = os.path.join(output_dir, f"{role}_rankings.csv")
            ranker.export_results(output_path)
            print(f"✅ Saved {role} results to {output_path}")
        else:
            print(f"⚠️ No resumes found in {role_path}")

# %% [markdown]
# ## 9. Main Execution

if __name__ == "__main__":
    process_all_roles()
    print("\nProcessing completed!")

[nltk_data] Downloading package punkt to C:\Users\Aman
[nltk_data]     NSU\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Aman
[nltk_data]     NSU\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Processing: ACCOUNTANT
Loaded 118 resumes from Datasets/data/data\ACCOUNTANT
Results saved to results\ACCOUNTANT_rankings.csv
✅ Saved ACCOUNTANT results to results\ACCOUNTANT_rankings.csv

Processing: ADVOCATE
Loaded 118 resumes from Datasets/data/data\ADVOCATE
Results saved to results\ADVOCATE_rankings.csv
✅ Saved ADVOCATE results to results\ADVOCATE_rankings.csv

Processing: AGRICULTURE
Loaded 63 resumes from Datasets/data/data\AGRICULTURE
Results saved to results\AGRICULTURE_rankings.csv
✅ Saved AGRICULTURE results to results\AGRICULTURE_rankings.csv

Processing: APPAREL
Loaded 97 resumes from Datasets/data/data\APPAREL
Results saved to results\APPAREL_rankings.csv
✅ Saved APPAREL results to results\APPAREL_rankings.csv

Processing: ARTS
Loaded 103 resumes from Datasets/data/data\ARTS
Results saved to results\ARTS_rankings.csv
✅ Saved ARTS results to results\ARTS_rankings.csv

Processing: AUTOMOBILE
Loaded 36 resumes from Datasets/data/data\AUTOMOBILE
Results saved to results\AUTOM