In [None]:
!pip install pdfplumber python-docx sentence-transformers spacy matplotlib plotly wordcloud
!python -m spacy download en_core_web_sm


Collecting pdfplumber
  Downloading pdfplumber-0.11.8-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting pdfminer.six==20251107 (from pdfplumber)
  Downloading pdfminer_six-20251107-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.1.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.7/67.7 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.8-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloadin

In [2]:
import io, os, json
from collections import Counter


In [3]:
# extract helpers - copy to VS Code later
import pdfplumber
from docx import Document

def extract_text_from_pdf_bytes(file_bytes):
    text = []
    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
        for page in pdf.pages:
            p = page.extract_text()
            if p:
                text.append(p)
    return "\n".join(text)

def extract_text_from_docx_bytes(file_bytes):
    doc = Document(io.BytesIO(file_bytes))
    return "\n".join([p.text for p in doc.paragraphs])


In [4]:
import spacy
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    doc = nlp(text)
    tokens = [t.lemma_.lower() for t in doc if not t.is_stop and t.is_alpha]
    return " ".join(tokens)

def extract_noun_phrases(text, top_k=30):
    doc = nlp(text)
    phrases = [chunk.text.lower().strip() for chunk in doc.noun_chunks if len(chunk.text.strip())>1]
    c = Counter(phrases)
    return [p for p,_ in c.most_common(top_k)]


In [5]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')  # small & accurate for prototyping


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
def similarity_score(resume_text, jd_text):
    emb_r = model.encode(resume_text, convert_to_tensor=True)
    emb_j = model.encode(jd_text, convert_to_tensor=True)
    score = util.cos_sim(emb_r, emb_j).item()  # [-1,1]
    score = max(0, score)  # avoid negatives
    return round(score * 100, 1)  # 0-100

# Predefined skill list (augment this for better results)
PREDEFINED_SKILLS = ["python","java","c++","sql","docker","kubernetes","aws","azure","gcp","tensorflow","pytorch","machine learning","data analysis","react","node.js","django","flask","rest api"]

def matched_skills(resume_text, jd_text, threshold=0.6):
    results = {"matched": [], "missing": []}
    r_emb = model.encode(resume_text, convert_to_tensor=True)
    jd_emb = model.encode(jd_text, convert_to_tensor=True)
    for s in PREDEFINED_SKILLS:
        s_emb = model.encode(s, convert_to_tensor=True)
        sim_resume = util.cos_sim(r_emb, s_emb).item()
        sim_jd = util.cos_sim(jd_emb, s_emb).item()
        if sim_jd > 0.45:  # JD likely requires this skill
            if sim_resume > threshold:
                results["matched"].append(s)
            else:
                results["missing"].append(s)
    return results


In [7]:
def analyze_resume_jd(resume_text, jd_text):
    pr_resume = preprocess(resume_text)
    pr_jd = preprocess(jd_text)
    overall = similarity_score(pr_resume, pr_jd)
    skills = matched_skills(pr_resume, pr_jd)
    noun_phrases = extract_noun_phrases(resume_text, top_k=30)
    chart_data = {"matched": len(skills["matched"]), "missing": len(skills["missing"])}
    return {
        "overall_match": overall,
        "matched_skills": skills["matched"],
        "missing_skills": skills["missing"],
        "top_phrases": noun_phrases,
        "chart_data": chart_data
    }


In [8]:
import plotly.graph_objects as go
def plot_skill_match(matched, missing):
    labels = ["Matched", "Missing"]
    values = [len(matched), len(missing)]
    fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=0.4)])
    fig.update_layout(title="Skill Match")
    fig.show()

In [13]:
with open("/content/Aadhithya R resume  (15).pdf","rb") as f:
    resume_bytes = f.read()
resume_text = extract_text_from_pdf_bytes(resume_bytes)

# Create a placeholder sample_jd.txt if it doesn't exist
if not os.path.exists("sample_jd.txt"):
    with open("sample_jd.txt", "w") as f:
        f.write("This is a placeholder job description. Please replace with actual job description content.")

jd_text = open("sample_jd.txt").read()
analysis = analyze_resume_jd(resume_text, jd_text)
print(json.dumps(analysis, indent=2))
plot_skill_match(analysis["matched_skills"], analysis["missing_skills"])

{
  "overall_match": 8.6,
  "matched_skills": [],
  "missing_skills": [],
  "top_phrases": [
    "pollachi",
    "react",
    "flask",
    "a predictive web application",
    "traffic volume",
    "weather",
    "date",
    "holiday data",
    "an mlpregressor model",
    "real-time inputs",
    "apis",
    "model interaction",
    "pandas",
    "visualization",
    "happygarden",
    "suleswaranpatti",
    "+91 9629628246\naadhithyaa120@gmail.com\nportfolio\nsummary\naspiring full stack developer",
    "machine learning enthusiast",
    "strong expertise",
    "full-stack\nweb and app development",
    "scalable applications",
    "intelligent solutions",
    "a passion",
    "ai",
    "data-driven insights",
    "an\nopportunity",
    "technical expertise",
    "a dynamic organization",
    "education\nb.tech",
    "artificial intelligence and data science"
  ],
  "chart_data": {
    "matched": 0,
    "missing": 0
  }
}
