In [5]:
import os
import pandas as pd
import PyPDF2
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download NLTK resources (you might need to run this line once)
nltk.download('punkt')
nltk.download('stopwords')

# Define the desired skills as keywords
desired_skills = ['Machine Learning', 'Data Analysis', 'Python', 'Statistics', 'SQL', 'Data Visualization']

# Function to extract text from PDFs in a folder
def extract_text_from_pdfs(folder_path):
    data = {'filename': [], 'text': []}
    
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'rb') as doc:
                pdf_reader = PyPDF2.PdfReader(doc)
                text = ""
                for i in range(len(pdf_reader.pages)):
                    page = pdf_reader.pages[i]
                    page_text = page.extract_text()
                    text += page_text
                data['filename'].append(filename)
                data['text'].append(text)
    
    resumes_df = pd.DataFrame(data)
    return resumes_df

# Provide the path to the folder containing PDF files
folder_path = "E:\Data Science Resumes"
resumes_df = extract_text_from_pdfs(folder_path)

# Function to calculate a basic score for each resume
def calculate_resume_score(text):
    score = 0
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.isalpha()]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    for skill in desired_skills:
        if skill.lower() in tokens:
            score += 1
    
    # Calculate education level
    if re.search('Bachelor', text, re.IGNORECASE):
        score += 1  # Increment for a bachelor's degree
    
    # Calculate work experience (extract years from text)
    work_experience = re.search(r'(\d+)\s*(?:year|yr)s?\s*of experience', text, re.IGNORECASE)
    if work_experience:
        experience = int(work_experience.group(1))
        score += experience
    
    return score

# Apply the function to calculate scores
resumes_df['score'] = resumes_df['text'].apply(calculate_resume_score)

# Specify the number of resumes to shortlist
num_to_shortlist = 10

# Shortlist resumes based on score
shortlisted_resumes = resumes_df.nlargest(num_to_shortlist, 'score')

# Display the shortlisted resumes
for index, row in shortlisted_resumes.iterrows():
    print(f"Filename: {row['filename']} | Score: {row['score']}")
    print("------")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Filename: Aditi_Resume.pdf | Score: 4
------
Filename: Ali_Resume.pdf | Score: 4
------
Filename: Harshit_Resume.pdf | Score: 4
------
Filename: Purva_Resume.pdf | Score: 4
------
Filename: Adarsh_Resume.pdf | Score: 3
------
Filename: Aditya_Resume.pdf | Score: 3
------
Filename: Anjor_Resume.pdf | Score: 3
------
Filename: Bhushan's_Resume .pdf | Score: 3
------
Filename: Gauri_Resume.pdf | Score: 3
------
Filename: Harshad_Resume.pdf | Score: 3
------
