In [None]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.3


In [None]:
import fitz  # PyMuPDF
import os
import re
import spacy
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering
from transformers import pipeline


In [None]:
# Keyword vocabulary for substring-based skill matching
# (you can load from CSV if you have a larger list).
COMMON_SKILLS = {
    "python",
    "sql",
    "machine learning",
    "deep learning",
    "tensorflow",
    "keras",
    "scikit-learn",
    "pytorch",
    "nlp",
    "pandas",
    "numpy",
    "matplotlib",
    "seaborn",
    "data preprocessing",
    "model evaluation",
    "transformers",
    "data visualization",
}

# Model handles are created once here and shared by the helper functions.
nlp = spacy.load("en_core_web_sm")
embedder = SentenceTransformer("all-MiniLM-L6-v2")
ner_pipe = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple",
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
def extract_text_from_pdfs(pdf_dir):
    """Read every PDF located directly inside ``pdf_dir`` and return its text.

    Args:
        pdf_dir: Path of the directory to scan (sub-directories are not
            traversed).

    Returns:
        dict mapping each PDF's file name to the text of all its pages joined
        with newlines. Empty when the directory contains no PDF files.

    Raises:
        OSError: If ``pdf_dir`` cannot be listed.
    """
    extracted_data = {}
    # Sorted so the result order does not depend on filesystem listing order.
    for file in sorted(os.listdir(pdf_dir)):
        path = os.path.join(pdf_dir, file)
        # Match the extension case-insensitively (".PDF" is common) and skip
        # directories that merely happen to be named like a PDF.
        if not file.lower().endswith('.pdf') or not os.path.isfile(path):
            continue
        with fitz.open(path) as doc:
            text = "\n".join(page.get_text() for page in doc)
        extracted_data[file] = text
    return extracted_data


In [None]:
def preprocess_text(text):
    """Normalise text for embedding.

    Lowercases the input, deletes every character that is neither a word
    character nor whitespace, runs the result through the spaCy pipeline and
    keeps the lemma of each token that is not a stop word or punctuation.

    Returns:
        The kept lemmas joined by single spaces.
    """
    stripped = re.sub(r'[^\w\s]', '', text.lower())
    lemmas = []
    for tok in nlp(stripped):
        if tok.is_stop or tok.is_punct:
            continue
        lemmas.append(tok.lemma_)
    return " ".join(lemmas)


In [None]:
def split_into_sections(text):
    """Return the stripped lines of ``text`` that are longer than 10 characters."""
    kept = []
    for raw_line in text.split('\n'):
        candidate = raw_line.strip()
        if len(candidate) > 10:
            kept.append(candidate)
    return kept

def detect_semantic_sections(text):
    """Group the lines of ``text`` into clusters of similar sentence embeddings.

    The text is split with ``split_into_sections``, each line is embedded with
    ``embedder`` and the embeddings are clustered agglomeratively with a
    distance threshold of 1.5 (the number of clusters is not fixed).

    Args:
        text: Text whose newline-separated lines should be grouped.

    Returns:
        dict mapping ``"Section_<label>"`` to the newline-joined lines of that
        cluster, in ascending label order. Empty when no line qualifies.
    """
    lines = split_into_sections(text)
    if not lines:
        return {}
    if len(lines) == 1:
        # AgglomerativeClustering rejects inputs with fewer than two samples,
        # so a lone line simply forms its own section.
        return {"Section_0": lines[0]}

    embeddings = embedder.encode(lines)
    labels = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5).fit(embeddings).labels_

    sections = {}
    # np.unique returns the labels sorted, which makes the section order
    # (and anything built by joining the sections) deterministic.
    for label in np.unique(labels):
        indices = np.where(labels == label)[0]
        sections[f"Section_{label}"] = "\n".join(lines[i] for i in indices)
    return sections


In [None]:
def extract_named_entities(text):
    """Return the distinct, lowercased ORG and MISC entities ``ner_pipe`` finds in ``text``."""
    wanted_groups = ['ORG', 'MISC']
    found = set()
    for entity in ner_pipe(text):
        if entity['entity_group'] in wanted_groups:
            found.add(entity['word'].lower())
    return list(found)

def extract_skills_by_keywords(text):
    """Return the entries of ``COMMON_SKILLS`` that occur in ``text``.

    Matching is a case-insensitive substring test, so a skill also matches
    inside a longer word.

    Args:
        text: Text to search.

    Returns:
        Alphabetically sorted list of matched skills, so the result is
        identical from run to run.
    """
    haystack = text.lower()  # lowercase once instead of once per skill
    return sorted(skill for skill in COMMON_SKILLS if skill in haystack)

def combined_skill_extraction(text):
    """Merge NER-derived and keyword-derived skills into one duplicate-free list."""
    candidates = [*extract_named_entities(text), *extract_skills_by_keywords(text)]
    return list(set(candidates))


In [None]:
# A number (optionally with a decimal part and a trailing "+") followed by
# "year(s)" or "yr(s)". The lookbehind stops the match from starting inside a
# larger token, e.g. at the "5" of "3.5".
_EXPERIENCE_RE = re.compile(r'(?<![\w.])(\d+)(?:\.\d+)?\+?\s*(?:years?|yrs?)\b')


def extract_experience_years(text):
    """Return the first "<n> years" figure mentioned in ``text``.

    Recognises forms such as "5 years", "5+ years", "5years", "2 yrs" and
    "3.5 years".

    Args:
        text: Text to search (matched case-insensitively).

    Returns:
        The whole-year part of the first match as an ``int`` (a decimal
        figure is truncated, so "3.5 years" gives 3), or 0 when nothing
        matches.
    """
    match = _EXPERIENCE_RE.search(text.lower())
    return int(match.group(1)) if match else 0

# Bachelor-level degree mentions. The lookbehinds keep "b.e" from matching in
# the middle of unrelated text such as "web.example.com".
_BACHELOR_DEGREE_RE = re.compile(
    r'(?<![\w.])b\.?\s?tech\b'        # b.tech, btech, b tech, b. tech
    r'|\bbachelor'                     # bachelor, bachelors, bachelor's
    r'|(?<![\w.])b\.\s?e(?:ng)?\b'     # b.e, b.e., b. e, b.eng
)


def extract_education(text):
    """Flag whether ``text`` mentions a bachelor-level degree.

    Args:
        text: Text to search (matched case-insensitively).

    Returns:
        1 if a B.Tech, B.E./B.Eng or "bachelor" mention is found, else 0.
    """
    return 1 if _BACHELOR_DEGREE_RE.search(text.lower()) else 0


In [None]:
def score_resume(resume_embedding, jd_embedding, resume_skills, jd_skills, exp_years, education_flag,
                 alpha=0.5, beta=0.3, gamma=0.1, delta=0.1):
    """Combine four signals into one weighted resume score.

    Args:
        resume_embedding: Embedding vector of the resume.
        jd_embedding: Embedding vector of the job description.
        resume_skills: Skills found in the resume.
        jd_skills: Skills requested by the job description.
        exp_years: Years of experience; capped at 10 for scoring.
        education_flag: 1 if the education requirement is met, else 0.
        alpha: Weight of the cosine similarity between the two embeddings.
        beta: Weight of the fraction of distinct JD skills found in the resume.
        gamma: Weight of the normalised experience.
        delta: Weight of the education flag.

    Returns:
        The weighted sum as a ``float`` rounded to 3 decimals.
    """
    semantic_score = cosine_similarity([resume_embedding], [jd_embedding])[0][0]

    # Count each required skill once so a repeated JD skill cannot shrink the ratio.
    required_skills = set(jd_skills)
    skill_score = len(set(resume_skills) & required_skills) / max(1, len(required_skills))
    experience_score = min(exp_years / 10, 1.0)  # Normalize to [.., 1.0]

    total = alpha * semantic_score + beta * skill_score + gamma * experience_score + delta * education_flag
    # float() turns the NumPy scalar into a plain Python float for callers.
    return round(float(total), 3)


In [None]:
# Job description that every resume is ranked against.
JD_TEXT = """
We are hiring a Data Science Intern with experience in NLP, Machine Learning and Deep Learning with knowledge of data preprocessing and building end-to-end models.
"""

# Embedding of the job description, computed once with the shared embedder.
JD_EMBEDDING = embedder.encode(JD_TEXT)
# Skills the job description asks for, found by keyword lookup against COMMON_SKILLS.
JD_SKILLS = extract_skills_by_keywords(JD_TEXT)


In [None]:
# Directory scanned for resumes; adjust here rather than inside the loop.
PDF_DIR = '/data/'

pdf_texts = extract_text_from_pdfs(PDF_DIR)
ranking_results = []

for filename, raw_text in pdf_texts.items():
    # The normalised text (lowercased, punctuation removed, lemmatised) is
    # used only for the embedding comparison.
    cleaned_text = preprocess_text(raw_text)
    sections = detect_semantic_sections(cleaned_text)
    combined_section_text = " ".join(sections.values())
    resume_embedding = embedder.encode(combined_section_text)

    # Skills, experience and education are read from the raw text:
    # preprocess_text strips punctuation, so patterns such as "b.tech",
    # "b.e" or "scikit-learn" can never match its output. Lowercasing
    # presumably also removes capitalisation cues the NER model relies on.
    extracted_skills = combined_skill_extraction(raw_text)
    exp_years = extract_experience_years(raw_text)
    education_flag = extract_education(raw_text)

    score = score_resume(resume_embedding, JD_EMBEDDING, extracted_skills, JD_SKILLS, exp_years, education_flag)

    ranking_results.append({
        "filename": filename,
        "score": score,
        # Sorted so the column is identical from run to run.
        "skills_matched": sorted(set(extracted_skills) & set(JD_SKILLS)),
        "exp_years": exp_years,
        "education": "Yes" if education_flag else "No"
    })


In [None]:
# Naming the columns explicitly keeps the frame sortable even when no resume
# was processed (an empty frame without a "score" column raises KeyError).
RESULT_COLUMNS = ["filename", "score", "skills_matched", "exp_years", "education"]

df = (
    pd.DataFrame(ranking_results, columns=RESULT_COLUMNS)
    .sort_values(by="score", ascending=False)
    .reset_index(drop=True)
)

# Display top 5 resumes
print(df[["filename", "score", "skills_matched", "exp_years", "education"]].head())

# Save full results
df.to_csv("ranked_resumes.csv", index=False)


                    filename  score                     skills_matched  \
0                Atharva.pdf  0.342  [machine learning, deep learning]   
1    Atharva_Atterkar_DS.pdf  0.330  [machine learning, deep learning]   
2             Atharva_CV.pdf  0.170                                 []   
3  Atharva_Atterkar_Viit.pdf  0.160                                 []   

   exp_years education  
0          0        No  
1          0        No  
2          0        No  
3          0        No  
