In [1]:
#!pip install pdfplumber

In [1]:
import pdfplumber
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# Fetch the NLTK data used by preprocess_text: tokenizer models, the English
# stopword list and the WordNet lemmatizer corpus.
# 'punkt_tab' is requested alongside 'punkt' because newer NLTK releases load
# the tokenizer from 'punkt_tab', while older releases still look for 'punkt'.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\santhoshs.s\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\santhoshs.s\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\santhoshs.s\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Step 1: Extract text from PDF

In [3]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        The page texts joined with a newline, so the last word of one page
        does not fuse with the first word of the next. Pages whose
        ``extract_text()`` result is falsy (``None`` or ``""``, e.g. an
        image-only page) are skipped. The result is ``""`` when no page
        has extractable text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # `or ""` guards against extract_text() returning None, which would
        # otherwise make string concatenation raise TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_text for page_text in page_texts if page_text)

### Step 2: Extract "Objective" section from the resume

In [4]:
# Heuristic matcher for an "Objective" / "Career Objective" heading and the
# text that follows it. Group 1 is the section body. The body ends at the
# first blank line, at the next line written entirely in capitals (treated as
# the next section heading), or at the end of the text, whichever comes first.
# Only the heading part is case-insensitive; the ALL-CAPS heading test must
# stay case-sensitive to work.
_OBJECTIVE_SECTION_RE = re.compile(
    r"\b(?i:(?:career\s+)?objectives?)\b"   # the heading itself
    r"\s*[:\-\u2013\u2014]?\s*"              # optional ':' / dash after the heading
    r"(.*?)"                                 # group 1: section body (lazy)
    r"(?=\n\s*\n"                            # stop at a blank line ...
    r"|\n[ \t]*[A-Z][A-Z &/]{2,}[ \t]*:?[ \t]*(?:\n|\Z)"  # ... or an ALL-CAPS line
    r"|\Z)",                                 # ... or the end of the text
    re.DOTALL,
)


def extract_objective(text):
    """Return the body of the resume's objective section.

    The previous implementation called ``match.group(2)`` on a pattern with a
    single group, so it raised ``IndexError`` whenever a heading was present.
    This version captures the text after the heading instead.

    Args:
        text: Full resume text. ``None`` is treated as empty text.

    Returns:
        The stripped text following the first "Objective" or
        "Career Objective" heading, or the sentinel string
        ``"Objective not found"`` when there is no such heading or nothing
        follows it. Callers compare against that exact sentinel.
    """
    match = _OBJECTIVE_SECTION_RE.search(text or "")
    objective = match.group(1).strip() if match else ""
    return objective or "Objective not found"

In [5]:
def preprocess_text(text):
    """Normalise free text into a space-separated string of lemmas.

    The text is lower-cased and tokenised with ``word_tokenize``. Tokens that
    are not purely alphabetic or that appear in NLTK's English stopword list
    are dropped, and each remaining token is passed through
    ``WordNetLemmatizer.lemmatize``.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving lemmas joined by single spaces ("" if none survive).
    """
    raw_tokens = word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))
    lemmatize = WordNetLemmatizer().lemmatize

    lemmas = []
    for token in raw_tokens:
        # Skip punctuation/numbers and common function words.
        if not token.isalpha() or token in english_stopwords:
            continue
        lemmas.append(lemmatize(token))

    return ' '.join(lemmas)

In [6]:
# Ask for the resume location. Pasted paths often carry stray whitespace or
# surrounding quotes (Windows "Copy as path" adds double quotes); strip both
# so the value can be opened as a file path.
resume_pdf_path = input("Enter the resume[pdf] path: ").strip().strip('"\'')

Enter the resume[pdf] path:  C:\Users\santhoshs.s\jupyter\resumes\data\data\ACCOUNTANT\10554236.pdf


In [7]:
# Read the resume and try to isolate its objective section.
resume_text = extract_text_from_pdf(resume_pdf_path)
objective_text = extract_objective(resume_text)

# extract_objective signals a miss with this exact sentinel string; on a miss
# the whole resume is analysed instead of just the objective.
if objective_text == "Objective not found":
    preprocessed_resume = preprocess_text(resume_text)
    print("Using whole resume for analysis.")
else:
    preprocessed_resume = preprocess_text(objective_text)
    print("Extracted Objective:", objective_text)

Using whole resume for analysis.


In [8]:
# Prompt for the job posting text that the resume will be scored against.
job_description = input("Enter the job description :")

Enter the job description : Responsible for analyzing the Time sheet and generate invoices  Responsible for getting approval for generated invoices  Responsible for Sending the invoices to the client and Vendors  Responsible for follow-up with client and vendors for payment and update the same in our DB.  Must possess Good Communication and Willing to work in Night Shift.


In [9]:
# Clean the job description with preprocess_text, the same function applied to
# the resume text, so both documents are tokenised and lemmatised alike.
preprocessed_job_description = preprocess_text(job_description)

### Step 3: TF-IDF Vectorization

In [10]:
# Row 0 is the job description and row 1 is the resume; the similarity cell
# relies on this order.
documents = [preprocessed_job_description, preprocessed_resume]

# The TF-IDF vocabulary and weights are fitted on just these two documents.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

### Step 4: Calculate Cosine Similarity

In [12]:
# Slice (rather than index) so each operand stays a 2-D, single-row matrix.
job_vector = tfidf_matrix[0:1]
resume_vector = tfidf_matrix[1:2]
cosine_sim = cosine_similarity(job_vector, resume_vector)

# cosine_sim is a 1x1 array; report its only entry as a percentage.
similarity_percentage = cosine_sim[0][0] * 100
print(f"Resume is {similarity_percentage:.2f}% similar to the job description.")

Resume is 10.81% similar to the job description.
