In [None]:
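# Uncomment to install the third-party dependencies into the active kernel's environment:
# %pip install pdfplumber nltk scikit-learn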

In [None]:
import pdfplumber
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK resources used by preprocess_text(): the tokenizer models,
# the English stop-word list and the WordNet data for lemmatization.
nltk.download('punkt')
# Recent NLTK releases make word_tokenize() load 'punkt_tab' instead of the
# pickled 'punkt' models; without it tokenization fails with LookupError.
# Downloading both keeps the notebook working on old and new NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\santhoshs.s\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\santhoshs.s\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\santhoshs.s\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Step 1: Extract text from PDF

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        The text of all pages that produced any text, joined with a newline
        between pages. An empty string is returned when no page yields text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may hand back None (or "") for a page with no text
        # layer, e.g. a scanned image; normalise to "" so such pages cannot
        # raise TypeError during concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Separate pages with a newline so the last word of one page is not fused
    # with the first word of the next (which would also hide a section
    # heading that starts a page).
    return "\n".join(chunk for chunk in page_texts if chunk)

### Step 2: Extract "Objective" section from the resume

In [None]:
def extract_objective(text):
    """Extract the body of the "Objective" / "Career Objective" section.

    The header is matched case-insensitively. The body runs from the header
    up to the next line that looks like a section heading, or to the end of
    the text. A heading is recognised heuristically as a line made only of
    capitalised words (plus "of", "and", "&"), either alone on its line or
    followed by a colon, e.g. "EDUCATION", "Work Experience", "Skills: ...".

    Args:
        text: Full resume text. ``None`` or an empty string is accepted.

    Returns:
        The stripped objective text, or the sentinel string
        "Objective not found" when there is no header or its body is empty.
    """
    not_found = "Objective not found"
    if not text:
        return not_found

    objective_regex = re.compile(
        r"""
        (?i:\b(?:career\s+)?objectives?\b)   # header; only this part ignores case
        [ \t]*[:\-]?[ \t]*                   # optional separator after the header
        (.*?)                                # objective body (lazy)
        (?=
            \n[ \t]*                         # next section heading: a line of
            [A-Z][A-Za-z&/]*                 # capitalised words ...
            (?:[ \t]+(?:[A-Z&][A-Za-z&/]*|of|and))*
            [ \t]*(?::|\n|\Z)                # ... ending the line or with a colon
          | \Z                               # or simply the end of the text
        )
        """,
        re.DOTALL | re.VERBOSE,
    )

    # The terminator must stay case-sensitive: with a global IGNORECASE flag
    # "[A-Z]" matches any letter and the body would be cut at its first line
    # break. Skip header hits whose body is empty (e.g. the word "objective"
    # directly followed by another heading) and keep looking.
    for match in objective_regex.finditer(text):
        body = match.group(1).strip()
        if body:
            return body
    return not_found

In [None]:
def preprocess_text(text):
    """Normalise free text into a space-separated string of lemmas.

    The text is lower-cased and tokenized; tokens that are not purely
    alphabetic or that are English stop words are dropped, and each remaining
    token is lemmatized with WordNet.

    Args:
        text: Raw text to normalise.

    Returns:
        The surviving lemmas joined by single spaces (possibly an empty string).
    """
    raw_tokens = word_tokenize(text.lower())
    english_stopwords = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    lemmas = []
    for token in raw_tokens:
        # Discard punctuation/numbers and words that carry no content.
        if not token.isalpha() or token in english_stopwords:
            continue
        lemmas.append(wordnet.lemmatize(token))

    return ' '.join(lemmas)

In [None]:
# Paths pasted from a file manager often arrive with surrounding whitespace or
# wrapped in quotes (e.g. Windows "Copy as path"); remove both so the file can
# be opened as typed.
resume_pdf_path = input("Enter the resume[pdf] path: ").strip().strip('"\'')

Enter the resume[pdf] path:  C:\\Users\\santhoshs.s\\Downloads\\oliver.pdf


In [None]:
resume_text = extract_text_from_pdf(resume_pdf_path)
objective_text = extract_objective(resume_text)

# Only trust the objective when something was actually captured: a blank
# match, or one that preprocesses to nothing, would otherwise become an empty
# document and force the similarity score to 0.
preprocessed_resume = ""
if objective_text != "Objective not found" and objective_text.strip():
    preprocessed_resume = preprocess_text(objective_text)

if preprocessed_resume:
    print("Extracted Objective:", objective_text)
else:
    preprocessed_resume = preprocess_text(resume_text)  # Fallback to whole resume if no usable objective
    print("Using whole resume for analysis.")

# Make the failure mode visible instead of silently reporting 0% later on
# (typical cause: a scanned/image-only PDF with no extractable text).
if not preprocessed_resume:
    print("Warning: no usable text was extracted from the resume; the similarity score will be 0.")

Using whole resume for analysis.


In [None]:
# Prompt for the job posting text that the resume will be compared against.
job_description = input(
    "Enter the job description :"
)

Enter the job description : CA Finalist Minimum of 5 years of professional bookkeeping or accounting experience. Strong proficiency in SAP (two years working experience required). Strong proficiency in Zoho (two years working experience required). Strong English Communication skills Solid understanding of accounting principles and financial reporting. Excellent attention to detail and accuracy in data entry and reporting. Strong organizational and time management skills. Ability to work independently and collaboratively within a team. Proficiency in Microsoft Excel and other accounting software is a plus. Should be able to work during US time zone.  Job Description: As a Bookkeeper/Accountant, you will be responsible for managing the financial records, ensuring accuracy and efficiency in all bookkeeping activities. You will play a critical role in maintaining the integrity of our financial data and supporting financial decision-making.  Key Responsibilities:  Maintain accurate and up-t

In [None]:
# Run the job description through the same normalisation as the resume so
# both documents are reduced to comparable lemmas.
preprocessed_job_description = preprocess_text(text=job_description)

### Step 3: TF-IDF Vectorization

In [None]:
# Row 0 is the job description, row 1 is the resume; both are fitted together
# so they share a single vocabulary.
documents = [
    preprocessed_job_description,
    preprocessed_resume,
]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

### Step 4: Calculate Cosine Similarity

In [None]:
# Compare the two TF-IDF rows; slicing keeps each operand two-dimensional.
cosine_sim = cosine_similarity(
    tfidf_matrix[0:1],  # job description
    tfidf_matrix[1:2],  # resume
)

# Report the single similarity value as a percentage.
similarity_percentage = 100 * cosine_sim[0, 0]
print(f"Resume is {similarity_percentage:.2f}% similar to the job description.")

Resume is 0.00% similar to the job description.
