## Note: some of this code was generated with help from ChatGPT

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install sentence_transformers
!pip install pipeline
!pip install fitz
!pip install frontend
!pip install docx2txt
!pip install PyPDF2
!pip install keybert

Collecting sentence_transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.1.1
Collecting pipeline
  Downloading pipeline-0.1.0-py3-none-any.whl.metadata (483 bytes)
Downloading pipeline-0.1.0-py3-none-any.whl (2.6 kB)
Installing collected packages: pipeline
Successfully installed pipeline-0.1.0
Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting configobj (from fitz)
  Downloading configobj-5.0.9.tar.gz (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.5/101.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to bui

In [None]:
import docx2txt
import PyPDF2
from transformers import pipeline
import re
from keybert import KeyBERT

## Extract text from the resume and job description files

In [None]:
def extract_text(file_path):
    """Return the plain text of a ``.docx`` or ``.pdf`` document.

    Args:
        file_path: Location of the document (a ``str`` or any object whose
            ``str()`` is the path). The file type is chosen from the
            extension, compared case-insensitively.

    Returns:
        str: The extracted text. For PDFs the text of every page is joined
        with a newline; a page that yields no text contributes an empty
        string.

    Raises:
        ValueError: If the extension is neither ``.docx`` nor ``.pdf``.
    """
    path = str(file_path)
    # Compare against a lower-cased copy so "CV.PDF" or "CV.Docx" are accepted.
    lowered = path.lower()

    if lowered.endswith(".docx"):
        # Extract text from DOCX file
        return docx2txt.process(path)

    if lowered.endswith(".pdf"):
        # Extract text from PDF file
        with open(path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # `or ""` guards against a page for which no text comes back.
            # Joining with "\n" keeps the last word of one page from fusing
            # with the first word of the next.
            return "\n".join((page.extract_text() or "") for page in reader.pages)

    raise ValueError(f"Unsupported file type: {file_path!r}")

In [None]:

def clean_extracted_text(text):
    """Normalise text extracted from a resume or job description.

    Steps, in order: normalise line endings, collapse repeated spaces/tabs,
    re-join words hyphenated across a break, turn single line breaks into
    spaces while keeping blank-line section breaks, strip URLs, e-mail
    addresses and 10-digit phone numbers, then tidy the whitespace left over.

    Args:
        text: Raw extracted text. ``None`` or an empty string yields ``""``.

    Returns:
        str: Cleaned text with one line per section (sections are the chunks
        that were separated by one or more blank lines in the input).
    """
    if not text:
        return ""

    # Treat Windows / old-Mac line endings the same as "\n".
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # Collapse runs of horizontal whitespace only. Matching "\n" here would
    # swallow the blank lines that mark section breaks before the steps
    # below get a chance to preserve them.
    text = re.sub(r'[^\S\n]{2,}', ' ', text)
    # Re-join words split by a trailing hyphen ("develop- ment" -> "development")
    text = re.sub(r'\b-\s+', '', text)
    # Remove line breaks if they are not ending sections
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    # Join two adjacent single-character tokens on the same line ("a b" -> "ab").
    # NOTE(review): this pattern does not repair splits such as "D atabase";
    # doing that safely would need a dictionary, so it is left as is.
    text = re.sub(r'\b(\w)[^\S\n]+(\w)\b', r'\1\2', text)
    # Remove any email, phone, and URLs if needed
    text = re.sub(r'http\S+', '', text)  # URLs
    text = re.sub(r'\S+@\S+', '', text)  # Email addresses
    text = re.sub(r'\b\d{3}[-.\s]??\d{3}[-.\s]??\d{4}\b', '', text)  # Phone numbers
    # The removals above leave double spaces behind; collapse them again and
    # trim spaces that now sit next to a line break.
    text = re.sub(r'[^\S\n]{2,}', ' ', text)
    text = re.sub(r'[^\S\n]*\n[^\S\n]*', '\n', text)
    # Remove extra line breaks
    text = re.sub(r'\n+', '\n', text).strip()

    return text

# Example usage


In [None]:

# Pull the raw text out of the two Word documents stored on Drive.
Resume_text = extract_text("/content/drive/MyDrive/Resume/Anushka_Bhat_latest.docx")
job_des = extract_text("/content/drive/MyDrive/Resume/Job_description.docx")

# Run both documents through the same cleaning routine.
cleaned_text, cleaned_text_jd = (
    clean_extracted_text(raw_document) for raw_document in (Resume_text, job_des)
)




## Extract keywords using KeyBERT and check the similarity score

In [None]:
# KeyBERT backed by the all-MiniLM-L6-v2 sentence embedding model.
kw_model = KeyBERT('all-MiniLM-L6-v2')

# Top 15 one- or two-word keyphrases per document, English stop words removed.
# The same settings are applied to the resume first, then the job description.
resume_keywords, job_description_keywords = (
    kw_model.extract_keywords(
        document,
        keyphrase_ngram_range=(1, 2),
        stop_words='english',
        top_n=15,
    )
    for document in (cleaned_text, cleaned_text_jd)
)

# Keep only the phrase (first item of each result) and join the phrases
# into one space-separated string per document.
focused_resume_text = ' '.join(entry[0] for entry in resume_keywords)
focused_jd_text = ' '.join(entry[0] for entry in job_description_keywords)

In [None]:
# Load the Sentence-BERT model whose embeddings are compared with cosine
# similarity in the cells below.
from sentence_transformers import SentenceTransformer, util
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


In [None]:
# Embed the keyword string extracted from the job description.
job_embedding = sbert_model.encode(focused_jd_text)

In [None]:
# Embed the keyword string extracted from the resume.
resume_embedding = sbert_model.encode(focused_resume_text)

In [None]:
# Step 7: Compare the two embeddings with a single cosine similarity.
# NOTE(review): `similarity_scores` is initialised but never filled in the
# cells visible here — confirm whether a per-section breakdown is still planned.
similarity_scores = {}

# Calculate cosine similarity between job description and resume embeddings
similarity_score = util.pytorch_cos_sim(job_embedding, resume_embedding).item()

# Print similarity score
print(f"Similarity score between job description and resume: {similarity_score:.2f}")

Similarity score between job description and resume: 0.27


In [None]:
# Inspect the keyword string built from the job description.
focused_jd_text

'manufacturing engineering engineering manufacturing processes manufacturing manufacturing equipment industrial engineering engineering industrial new manufacturing manufacturing eng lean manufacturing op manufacturing mechanical engineering job requirements manufacturing fit engineering manufacturing techniques'

In [None]:
# Inspect the keyword string built from the resume.
focused_resume_text

'database skills technology intern gaining expertise expertise machine business intelligence data engineers utilize ai learning capabilities currently graduate bhat graduate business engine skills knowledge computer science software engineer expertise'

In [None]:
# Inspect the cleaned resume text.
# NOTE(review): this displays the whole document, which can include personal
# details — consider clearing the output before sharing the notebook.
cleaned_text

'Anushka Bhat Graduate Student, Department of Computer Science Purdue University, Fort Wayne, IN Email:  LinkedIn:anushkabhat07 Phone: 1- SUMMARY Over 7.5 years of experience in developing Database and Business Intelligence applications. Proven track in delivering high quality technical deliverables in data intensive environments. Currently a graduate student in Department of Computer Science at Purdue University Fort Wayne, with the motivation of further expanding my skills set with different tools and technologies. My current focus is on gaining expertise in Machine Learning and Deep Learning through targeted coursework. ACADEMIC COURSEWORK Master of science in Computer Science – Machine learning, Natural Language Processing, Software Engineering, Web Development, Database Management Systems, Corporate Partners | GPA 4.0 2023-2025 Bachelor of Engineering in Computer Science | First class 2011-2015 LANGUAGES and TECHNOLOGIES Azure SQL, T-SQL, Python, HTML, CSS, JavaScript Azure Data F

## Check cosine similarity using a TF-IDF vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

In [None]:

# Resume goes first and the job description second, which fixes the row order
# of the matrix built below.
corpus = [cleaned_text, cleaned_text_jd]

# Learn the TF-IDF vocabulary from both documents and vectorise them in one pass.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(corpus)

# Row 0 (resume) against row 1 (job description); the score is the single
# entry of the resulting matrix.
similarity_matrix = cosine_similarity(tfidf_matrix[:1], tfidf_matrix[1:2])
tfidf_similarity_score = similarity_matrix[0][0]

print(f"TF-IDF Cosine Similarity Score: {tfidf_similarity_score:.2f}")

TF-IDF Cosine Similarity Score: 0.09


## Check weighted cosine similarity with SBERT and the TF-IDF vectorizer

In [None]:
job_embedding = sbert_model.encode(cleaned_text)

In [None]:

resume_embedding = sbert_model.encode(cleaned_text_jd)

In [None]:
# Cosine similarity between the SBERT embeddings of the two full cleaned documents.
Sbert_similarity_score = util.pytorch_cos_sim(job_embedding, resume_embedding).item()

In [None]:
# Weighted combination of TF-IDF and SBERT scores:
# 70% from the TF-IDF cosine similarity, 30% from the SBERT cosine similarity.
final_similarity_score = 0.7 * tfidf_similarity_score + 0.3 * Sbert_similarity_score
print(f"Combined Similarity Score: {final_similarity_score:.2f}")

Combined Similarity Score: 0.17
