# In [None]:  (Jupyter cell marker left over from the notebook export)
import os
import docx2txt
import PyPDF2
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Define functions for parsing resumes
def extract_text_from_docx(file_path):
    """Return the text that docx2txt extracts from the file at *file_path*."""
    return docx2txt.process(file_path)

def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of the PDF at *file_path*.

    The ``PdfReader`` / ``pages`` / ``extract_text`` API is used whenever the
    installed PyPDF2 offers it, because the legacy ``PdfFileReader`` /
    ``numPages`` / ``getPage`` / ``extractText`` names were removed in
    PyPDF2 3.0.0 and raise there. Older releases that lack ``PdfReader``
    fall back to the legacy names.
    """
    with open(file_path, 'rb') as file:
        # Pages are read from the open file, so extraction has to finish
        # before the ``with`` block closes it.
        if hasattr(PyPDF2, 'PdfReader'):
            pdf_reader = PyPDF2.PdfReader(file)
            page_texts = [page.extract_text() for page in pdf_reader.pages]
        else:
            pdf_reader = PyPDF2.PdfFileReader(file)
            page_texts = [
                pdf_reader.getPage(page_num).extractText()
                for page_num in range(pdf_reader.numPages)
            ]
    # ``or ''`` keeps the join safe should a page yield no text object.
    return ''.join(page_text or '' for page_text in page_texts)

# Step 2: Define function to preprocess text
def preprocess_text(text):
    """Normalize *text* into a space-separated string of keyword tokens.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphanumeric, or that appear in NLTK's English stop-word
    list, are discarded. The remaining tokens are joined with single spaces.
    """
    lowered_tokens = word_tokenize(text.lower())
    english_stop_words = set(stopwords.words('english'))

    kept_tokens = []
    for token in lowered_tokens:
        if not token.isalnum():
            continue
        if token in english_stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Step 3: Define function to extract features from resumes
def extract_features(resume_path):
    """Load the document at *resume_path* and return its preprocessed text.

    The loader is chosen from the file extension, compared
    case-insensitively so that e.g. ``CV.PDF`` is accepted:

    * ``.docx`` - read with ``extract_text_from_docx``
    * ``.pdf``  - read with ``extract_text_from_pdf``
    * ``.txt``  - read directly as UTF-8 text

    Raises:
        ValueError: if the extension is none of the above.
    """
    lowered_path = resume_path.lower()
    if lowered_path.endswith('.docx'):
        text = extract_text_from_docx(resume_path)
    elif lowered_path.endswith('.pdf'):
        text = extract_text_from_pdf(resume_path)
    elif lowered_path.endswith('.txt'):
        # Undecodable bytes are replaced rather than raising, so one stray
        # byte in a plain-text file does not abort the whole run.
        with open(resume_path, encoding='utf-8', errors='replace') as file:
            text = file.read()
    else:
        raise ValueError("Unsupported file format")

    return preprocess_text(text)

# Step 4: Define function to compare resumes with job description and rank candidates
def rank_candidates(resume_paths, job_description_path):
    """Score every resume against the job description, best match first.

    A ``TfidfVectorizer`` is fitted on the preprocessed job description
    alone; each resume is then transformed with that fitted vectorizer and
    compared to the job-description vector by cosine similarity.

    Returns:
        A list of ``(resume_path, similarity_score)`` tuples sorted by
        descending score.
    """
    job_text = extract_features(job_description_path)
    vectorizer = TfidfVectorizer()
    job_vector = vectorizer.fit_transform([job_text])

    def score(path):
        # Similarity between the job description and one resume.
        candidate_vector = vectorizer.transform([extract_features(path)])
        return cosine_similarity(job_vector, candidate_vector)[0][0]

    scored = [(path, score(path)) for path in resume_paths]
    return sorted(scored, key=lambda pair: pair[1], reverse=True)

# Sample usage
if __name__ == "__main__":
    # Tokenizer models and the stop-word list needed for preprocessing.
    nltk.download('punkt')
    # Newer NLTK releases load word_tokenize's models from 'punkt_tab'
    # instead of 'punkt'; on older releases this request just reports that
    # the package is unknown and returns False.
    nltk.download('punkt_tab')
    nltk.download('stopwords')

    resumes_directory = 'resumes'
    job_description_file = 'job_description.txt'

    # Collect only regular .docx / .pdf files so that sub-directories and
    # stray entries (e.g. hidden files) cannot abort the ranking with
    # "Unsupported file format". Sorting makes the input order, and thus
    # the order of equally scored resumes, reproducible across platforms.
    resume_paths = []
    for file_name in sorted(os.listdir(resumes_directory)):
        candidate_path = os.path.join(resumes_directory, file_name)
        if os.path.isfile(candidate_path) and file_name.endswith(('.docx', '.pdf')):
            resume_paths.append(candidate_path)

    ranked_candidates = rank_candidates(resume_paths, job_description_file)

    print("Ranked candidates:")
    for idx, (resume_path, similarity_score) in enumerate(ranked_candidates, start=1):
        print(f"{idx}. {resume_path} - Similarity Score: {similarity_score}")
