# The first step is to import the required libraries

In [13]:
"""This module provides a portable way of using operating system dependent functionality. If you
just want to read or write a file see open(), if you want to manipulate paths, see the os.path 
module, and if you want to read all the lines in all the files on the command line see the 
fileinput module"""
import os
"""
pdfplumber is a powerful library that allows for easy extraction of text and data from PDFs, 
making it a valuable tool for data analysis and automation tasks.
"""
import pdfplumber
"""The Natural Language Toolkit (NLTK) is an open source Python library for Natural Language Processing"""
import nltk
from nltk.corpus import stopwords
"""scikit-learn is a Python module for machine learning built on top of SciPy and is distributed
under the 3-Clause BSD license."""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
"""
The csv library provides functionality to both read from and write to CSV files. Designed to work out
of the box with Excel-generated CSV files, it is easily adapted to work with a variety of CSV formats.
The csv library contains objects and other code to read, write, and process data from and to CSV files.
"""
import csv

# Extract the text from the job description and remove stop words from it

In [14]:
# Load the job description from a PDF
with pdfplumber.open(r"C:\Users\kabir\Desktop\Projects\Resumes_Ranking_Scranning\data\raw\Data\JobDescription\10399912.pdf") as pdf:
    # A page without extractable text (e.g. a scanned image) may yield None or an empty string;
    # `or ""` keeps the concatenation from failing with "can only concatenate str" on such pages.
    job_description_text = "".join(page.extract_text() or "" for page in pdf.pages)
# Process the job description text: English stop words from NLTK
# (the NLTK "stopwords" corpus must already be available locally).
stop_words = stopwords.words('english')
# Create a TfidfVectorizer that ignores the stop words, then fit it on the job description and
# transform the text into a TF-IDF vector. The vocabulary is learned from the job description
# only, so any text passed to `tfidf_vectorizer.transform` later is projected onto these terms.
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words)
job_description_tfidf = tfidf_vectorizer.fit_transform([job_description_text])

In [15]:
# Explanatory cell only: the string below is not assigned to anything. Because it is the last
# expression in the cell, the notebook echoes its repr as the cell output.
"""
 returns the term-document matrix that you want to obtain. So save what it returns, and use todense, 
 as it will be in sparse format:
Returns: X : sparse matrix, [n_samples, n_features]. Tf-idf-weighted document-term matrix.
"""
# Uncomment to inspect the sparse TF-IDF matrix as a dense matrix.
#job_description_tfidf.todense()

'\n returns the term-document matrix that you want to obtain. So save what it returns, and use todense, \n as it will be in sparse format:\nReturns: X : sparse matrix, [n_samples, n_features]. Tf-idf-weighted document-term matrix.\n'

In [16]:
# Load the job description from a PDF
with pdfplumber.open(r"C:\Users\kabir\Desktop\Projects\Resumes_Ranking_Scranning\data\raw\Data\JobDescription\10399912.pdf") as pdf:
    # A page without extractable text (e.g. a scanned image) may yield None or an empty string;
    # `or ""` keeps the concatenation from failing with "can only concatenate str" on such pages.
    job_description_text = "".join(page.extract_text() or "" for page in pdf.pages)

In [18]:
# Folder containing resume files
resume_folder = r"C:\Users\kabir\Desktop\Projects\Resumes_Ranking\data\raw\Data\HR"

# Score every PDF resume against the job description.
# Maps resume file name -> cosine similarity with the job-description TF-IDF vector.
similarities = {}
# os.listdir() returns entries in arbitrary order; iterating in sorted order keeps the ranking of
# equally-scored resumes reproducible between runs (the final sort below is stable).
for resume_file in sorted(os.listdir(resume_folder)):
    # Case-insensitive extension check so files named e.g. "CV.PDF" are not silently skipped.
    if not resume_file.lower().endswith('.pdf'):
        continue
    try:
        with pdfplumber.open(os.path.join(resume_folder, resume_file)) as pdf:
            # Pages with no extractable text contribute nothing to the resume text.
            resume_text = "".join(page.extract_text() or "" for page in pdf.pages)

        if not resume_text.strip():  # Nothing usable was extracted from this file
            print(f"Warning: Empty text in {resume_file}")
            continue

        # Project the resume onto the vocabulary learned from the job description
        resume_tfidf = tfidf_vectorizer.transform([resume_text])

        # cosine_similarity returns a 1x1 matrix here (one job description vs. one resume)
        similarity = cosine_similarity(job_description_tfidf, resume_tfidf)
        similarities[resume_file] = similarity[0][0]
    except Exception as e:
        # Best effort: report the unreadable/corrupt file and carry on with the remaining resumes.
        print(f"Error processing {resume_file}: {e}")

# Sort and rank resumes based on cosine similarity.
# sorted() parameters:
#   iterable  Required. The sequence to sort: list, dictionary, tuple etc.
#   key       Optional. A function to execute to decide the order. Default is None.
#   reverse   Optional. A Boolean. False sorts ascending, True sorts descending. Default is False.
# Result: list of (resume_file, similarity) tuples, highest similarity first.
CVs = sorted(similarities.items(),key=lambda x: x[1],  reverse=True)

In [21]:
def save_as_csv_File(csv_file_name,fieldnames,sorted_resumes):
    """Write the ranked resumes to a CSV file.

    Parameters
    ----------
    csv_file_name : str or path-like
        Path of the CSV file to create. It is opened in 'w' mode, so an existing
        file with the same name is overwritten.
    fieldnames : sequence of str
        Column names written as the header row. Rows are written with the keys
        'Rank', 'Resume File' and 'Cosine Similarity', so all three must be
        present in ``fieldnames`` (``csv.DictWriter`` raises ``ValueError`` for
        row keys that are missing from it).
    sorted_resumes : iterable of (resume_file, similarity) pairs
        Resumes already in ranking order; ranks are numbered from 1 in
        iteration order.
    """
    # Write to the path given by the caller. Previously the parameter was ignored
    # and the global `output_csv_file` was used instead.
    with open(csv_file_name, mode='w', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        # Header row
        writer.writeheader()
        # One row per resume, ranked by position in `sorted_resumes`
        for idx, (resume_file, similarity) in enumerate(sorted_resumes, start=1):
            writer.writerow({'Rank': idx, 'Resume File': resume_file, 'Cosine Similarity': similarity})

    print(f"Results saved to {csv_file_name}")
    
# Output location and header columns for the ranking export
output_csv_file = 'similarity_scores.csv'
Rank_ResumeFile_CosineSimilarity = ['Rank', 'Resume File', 'Cosine Similarity']
# (file name, score) pairs ordered from the highest to the lowest cosine similarity
CVs = sorted(similarities.items(), key=lambda name_score: name_score[1], reverse=True)
save_as_csv_File(
    csv_file_name=output_csv_file,
    fieldnames=Rank_ResumeFile_CosineSimilarity,
    sorted_resumes=CVs,
)


Results saved to similarity_scores.csv
