<a href="https://colab.research.google.com/github/amanchauhan786/Unthinkable_Resume_ScreenResumer/blob/main/unthinkable_ScreenResume.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Install required libraries (run once)
!pip install --quiet pdfplumber python-docx spacy sentence-transformers faiss-cpu fastapi uvicorn pydantic requests sqlalchemy sqlite-utils streamlit
!python -m spacy download en_core_web_sm


# 2. Imports & basic helpers

# %%
import os
import json
import uuid
import sqlite3
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any

import pdfplumber
import docx
import re
import spacy
import numpy as np

nlp = spacy.load("en_core_web_sm")


# 3. File -> Text extraction utilities

# %%
def pdf_to_text(path: str) -> str:
    """Return the text of every page of the PDF at ``path``, joined by newlines.

    A page whose ``extract_text()`` result is falsy contributes an empty string.
    """
    with pdfplumber.open(path) as pdf:
        pages = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(pages)


def docx_to_text(path: str) -> str:
    """Return the paragraph texts of the Word document at ``path``, one per line."""
    document = docx.Document(path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)


def txt_to_text(path: str) -> str:
    """Read ``path`` as UTF-8 text, silently dropping bytes that cannot be decoded."""
    return Path(path).read_text(encoding='utf-8', errors='ignore')


def file_to_text(path: str) -> str:
    """Extract plain text from ``path``, choosing the reader by file extension.

    ``.pdf`` is handled by ``pdf_to_text``, ``.docx`` and ``.doc`` by
    ``docx_to_text``; any other extension is read as a plain-text file.
    The extension comparison is case-insensitive.
    """
    suffix = Path(path).suffix.lower()
    readers = {
        '.pdf': pdf_to_text,
        '.docx': docx_to_text,
        '.doc': docx_to_text,
    }
    reader = readers.get(suffix, txt_to_text)
    return reader(path)


# 4. Simple parsing: name/contact/education/experience/skills (rule-based + spaCy)

# %%
# Path of the canonical skill list file (one skill per line).
SKILL_MASTER_PATH = 'skill_master_list.txt'  # we will create a small starter list below

# Quick starter skill list (you should replace/extend with a larger canonical list)
starter_skills = [
    'python','java','c++','c','sql','postgresql','mongodb','tensorflow','pytorch','keras',
    'scikit-learn','pandas','numpy','opencv','aws','azure','docker','kubernetes','react','node.js',
    'fastapi','flask','git','linux','spark','hadoop'
]

# Write explicitly as UTF-8: the file is read back as UTF-8, so relying on the
# platform default encoding here could corrupt non-ASCII skill names.
with open(SKILL_MASTER_PATH, 'w', encoding='utf-8') as f:
    f.write('\n'.join(starter_skills))


def load_skill_master(path: str = SKILL_MASTER_PATH) -> List[str]:
    """Load the skill list stored at ``path``.

    The file is read as UTF-8; each line is stripped of surrounding
    whitespace and lines that are empty after stripping are skipped.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return list(filter(None, map(str.strip, handle)))

# Module-level skill vocabulary loaded from the default path.
skill_master = load_skill_master()

# %%
# Basic skill extraction using token matching + fuzzy matching helpers
from difflib import get_close_matches


def extract_skills(text: str, skill_master: List[str], cutoff: float = 0.85) -> List[str]:
    """Return the skills from ``skill_master`` that are mentioned in ``text``.

    A skill counts as mentioned when it occurs in the text as a whole term
    (case-insensitive), or when some token of the text is a close fuzzy match
    for it according to ``difflib.get_close_matches``.

    Args:
        text: Free text to scan (resume or job description).
        skill_master: Canonical skill names to look for.
        cutoff: Minimum difflib similarity ratio (0-1) for a fuzzy token match.

    Returns:
        The matched skills, spelled as in ``skill_master``, sorted and de-duplicated.
    """
    text_low = text.lower()
    # Leading/trailing dots are sentence punctuation ("sql."), not part of a term.
    tokens = {tok.strip('.') for tok in re.findall(r"[a-zA-Z+.#]+", text_low)}
    tokens.discard('')

    found = set()
    for skill in skill_master:
        sk = skill.lower()
        if not sk:
            continue
        # Whole-term match: a plain substring test would find 'c' inside any
        # word containing that letter, 'java' inside 'javascript' and 'git'
        # inside 'digital'.  '+' and '#' are treated as term characters so that
        # 'c' is not reported for 'c++' or 'c#'.
        whole_term = r"(?<![a-z0-9+#])" + re.escape(sk) + r"(?![a-z0-9+#])"
        if re.search(whole_term, text_low):
            found.add(skill)
        elif get_close_matches(sk, tokens, n=1, cutoff=cutoff):
            # Token-level fuzzy match tolerates small typos/plurals.
            found.add(skill)
    return sorted(found)


# Simple NER and heuristics for experience & education

def extract_entities(text: str) -> Dict[str, Any]:
    """Extract rough candidate facts from resume text using spaCy NER and regex heuristics.

    Args:
        text: Raw resume text.

    Returns:
        A dict with:
        - ``person_candidates``: up to 3 distinct PERSON entity texts, first-seen order.
        - ``organizations``: up to 6 distinct ORG/GPE entity texts, first-seen order.
        - ``education``: up to 6 stripped lines that mention a degree keyword.
        - ``experience_snippets``: up to 20 stripped lines that contain a
          four-digit number together with a dash or the word "to".
    """
    doc = nlp(text)
    names = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
    orgs = [ent.text for ent in doc.ents if ent.label_ in ('ORG', 'GPE')]

    # Short abbreviations need word boundaries on both sides; otherwise "bs"/"ms"
    # match the tail of words such as "jobs" or "systems" and "mba" matches "Mumbai".
    education_re = re.compile(
        r"b\.sc|m\.sc|bachelor|master|phd|degree|\bbs\b|\bms\b|\bmba\b", re.I)
    four_digits_re = re.compile(r"\d{4}")
    # "to" must be a whole word: a bare substring test also hits "pytorch", "tools", ...
    range_marker_re = re.compile(r"[-\u2013\u2014]|\bto\b", re.I)

    education_lines = []
    experience_lines = []
    for line in text.splitlines():
        if education_re.search(line):
            education_lines.append(line.strip())
        if four_digits_re.search(line) and range_marker_re.search(line):
            experience_lines.append(line.strip())

    return {
        'person_candidates': list(dict.fromkeys(names))[:3],
        'organizations': list(dict.fromkeys(orgs))[:6],
        'education': education_lines[:6],
        'experience_snippets': experience_lines[:20]
    }


# 5. Embeddings (placeholder for Gemini). Replace with proper Gemini embeddings calls.
# NOTE: You must use your Gemini key and Google client library / REST calls in Colab.

# %%

_EMBEDDING_MODEL = None  # SentenceTransformer instance, created on first use and then reused


def get_embedding_placeholder(text: str) -> np.ndarray:
    """Placeholder embedding generator: use sentence-transformers in Colab for fast experiments.
    Replace this with Gemini embeddings REST or client code when ready.

    The 'all-MiniLM-L6-v2' model is loaded once on the first call and cached in
    a module-level variable; constructing it on every call repeats an expensive
    model load for each text that is embedded.

    Args:
        text: Text to embed.

    Returns:
        The embedding produced by ``model.encode`` wrapped in ``np.array``.
    """
    global _EMBEDDING_MODEL
    if _EMBEDDING_MODEL is None:
        from sentence_transformers import SentenceTransformer
        _EMBEDDING_MODEL = SentenceTransformer('all-MiniLM-L6-v2')
    vec = _EMBEDDING_MODEL.encode(text, show_progress_bar=False)
    return np.array(vec)


def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity of ``a`` and ``b``.

    Returns 0.0 when either argument is None or has zero norm.
    """
    if a is None or b is None:
        return 0.0
    norm_a, norm_b = np.linalg.norm(a), np.linalg.norm(b)
    if not (norm_a != 0 and norm_b != 0):
        return 0.0
    return float(np.dot(a, b) / (norm_a * norm_b))


# 6. Scoring function that combines skill overlap, semantic similarity, experience heuristic

# %%

def compute_match_score(job_text: str, resume_text: str, job_skills: List[str]=None, required_years: int=None) -> Dict[str, Any]:
    """Score a resume against a job description.

    Four components, each on a 0-100 scale, are combined with fixed weights
    (skills 0.4, semantic 0.3, experience 0.2, education 0.1).

    Args:
        job_text: Job description text.
        resume_text: Resume text.
        job_skills: Skills required by the job; when falsy they are extracted
            from ``job_text`` using the module-level ``skill_master``.
        required_years: Years of experience that map to 100%; when falsy,
            10 years map to 100%.

    Returns:
        Dict with the rounded component percentages, the weighted total
        (``final_score_0_100``), the total mapped to 1-10, and the skill lists.
    """
    # Semantic component: cosine similarity of the two embeddings, clamped to [0, 1].
    job_vec = get_embedding_placeholder(job_text)
    resume_vec = get_embedding_placeholder(resume_text)
    similarity = cosine_sim(job_vec, resume_vec)
    semantic_pct = max(0, min(1, similarity)) * 100

    # Skill component: share of the job's skills that also appear in the resume.
    jm_skills = job_skills or extract_skills(job_text, skill_master)
    res_skills = extract_skills(resume_text, skill_master)
    common = set(jm_skills).intersection(res_skills)
    if jm_skills:
        skill_overlap_pct = (len(common) / max(1, len(jm_skills))) * 100
    else:
        skill_overlap_pct = 0

    # Experience component: estimated years relative to the target, capped at 100.
    years = estimate_years_experience(resume_text)
    target_years = required_years if required_years else 10
    exp_pct = min(100, (years / target_years) * 100)

    # Education component: 50 when any education line was detected, otherwise 25.
    has_education = bool(extract_entities(resume_text)['education'])
    edu_pct = 50 if has_education else 25

    final = 0.4*skill_overlap_pct + 0.3*semantic_pct + 0.2*exp_pct + 0.1*edu_pct
    final_1_10 = max(1, min(10, round(final/10)))

    report = {}
    report['skill_overlap_pct'] = round(skill_overlap_pct, 2)
    report['semantic_pct'] = round(semantic_pct, 2)
    report['experience_pct'] = round(exp_pct, 2)
    report['education_pct'] = round(edu_pct, 2)
    report['final_score_0_100'] = round(final, 2)
    report['final_score_1_10'] = int(final_1_10)
    report['matched_skills'] = sorted(common)[:20]
    report['resume_skills'] = res_skills
    report['job_skills'] = jm_skills
    return report


# helper to estimate years of experience (very simple heuristic)
def estimate_years_experience(text: str) -> float:
    """Estimate years of experience from free text (very simple heuristic).

    1. If the text states durations such as "6 years" or "5+ years", return
       the largest stated number.
    2. Otherwise, if at least two calendar years (19xx/20xx) appear, return
       the span between the earliest and the latest.
    3. Otherwise fall back to 2.0.
    """
    stated = re.findall(r"(\d{1,2})\+?\s+years", text, re.I)
    if stated:
        return float(max(int(n) for n in stated))

    # The century prefix must be a NON-capturing group: with a capturing group
    # re.findall returns only "19"/"20", so every span collapsed to 0 or 1.
    years = [int(y) for y in re.findall(r"\b(?:19|20)\d{2}\b", text)]
    if len(years) >= 2:
        return float(max(years) - min(years))
    return 2.0  # fallback small experience

# %% [markdown]
# 7. Local storage: simple SQLite persistence for parsed resumes & scores

# %%
# SQLite file that stores parsed resumes and their computed scores.
DB_PATH = 'resumes.db'

# Module-level connection and cursor; save_resume() and save_score() use both.
conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()
# resumes: one row per stored resume, keyed by a text id; parsed_json holds
# the JSON-encoded parse result.
cur.execute('''
CREATE TABLE IF NOT EXISTS resumes (
    id TEXT PRIMARY KEY,
    filename TEXT,
    uploaded_at TEXT,
    source_text TEXT,
    parsed_json TEXT
)
''')
# scores: one row per (resume, job) scoring run; score_json holds the
# JSON-encoded score dict.  resume_id is not declared as a foreign key.
cur.execute('''
CREATE TABLE IF NOT EXISTS scores (
    id TEXT PRIMARY KEY,
    resume_id TEXT,
    job_id TEXT,
    score_json TEXT,
    created_at TEXT
)
''')
conn.commit()


def save_resume(filename: str, source_text: str, parsed_json: Dict[str,Any]) -> str:
    """Insert a resume row into the ``resumes`` table and commit.

    Args:
        filename: Name of the source file.
        source_text: Extracted resume text.
        parsed_json: Parsed fields; stored JSON-encoded.

    Returns:
        The generated UUID4 string used as the row id.
    """
    from datetime import timezone  # the file-level import only brings in ``datetime``

    rid = str(uuid.uuid4())
    # datetime.utcnow() is deprecated; derive the same naive-UTC ISO string
    # from an aware timestamp so stored values keep their existing format.
    uploaded_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    cur.execute('INSERT INTO resumes (id,filename,uploaded_at,source_text,parsed_json) VALUES (?,?,?,?,?)',
                (rid, filename, uploaded_at, source_text, json.dumps(parsed_json)))
    conn.commit()
    return rid


def save_score(resume_id: str, job_id: str, score_json: Dict[str,Any]) -> str:
    """Insert a score row into the ``scores`` table and commit.

    Args:
        resume_id: Id of the scored resume.
        job_id: Identifier of the job the resume was scored against.
        score_json: Score details; stored JSON-encoded.

    Returns:
        The generated UUID4 string used as the row id.
    """
    from datetime import timezone  # the file-level import only brings in ``datetime``

    sid = str(uuid.uuid4())
    # datetime.utcnow() is deprecated; derive the same naive-UTC ISO string
    # from an aware timestamp so stored values keep their existing format.
    created_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    cur.execute('INSERT INTO scores (id,resume_id,job_id,score_json,created_at) VALUES (?,?,?,?,?)',
                (sid, resume_id, job_id, json.dumps(score_json), created_at))
    conn.commit()
    return sid

# %% [markdown]
# 8. Demo: run the pipeline on a sample text

# %%
# Sample job description used to exercise the pipeline end to end.
SAMPLE_JOB = """
We are hiring a Senior Data Scientist with 5+ years experience in Python, PyTorch, and production ML systems. Experience with AWS, Docker, and SQL required. Must be able to lead an ML team.
"""

# Sample resume; the "\n" escapes below become real newlines because the
# literal is not a raw string.
SAMPLE_RESUME_TEXT = """
John Doe\nSenior ML Engineer\nExperience: Worked on production ML using Python, PyTorch, Docker. 6 years total experience. Worked at Acme Corp from 2018-2024. Skills: Python, PyTorch, TensorFlow, SQL, AWS, Docker.
Education: B.Sc. Computer Science.
"""

# Parse the resume, attach the extracted skills, and persist it.
parsed = extract_entities(SAMPLE_RESUME_TEXT)
parsed['skills'] = extract_skills(SAMPLE_RESUME_TEXT, skill_master)
resume_id = save_resume('sample_resume.txt', SAMPLE_RESUME_TEXT, parsed)
print('Saved resume id:', resume_id)

# Score with the defaults: job skills are extracted from SAMPLE_JOB and no
# required_years is given, so the 10-year experience scale applies.
score = compute_match_score(SAMPLE_JOB, SAMPLE_RESUME_TEXT)
print('Computed score:', score)
save_score(resume_id, 'sample_job_1', score)

# %% [markdown]
# 9. LLM integration (Gemini) — prompt examples & placeholders
# Replace with real Gemini calls. Use the Gemini embedding API for embeddings when ready.

# %%
# Prompt template for an LLM-based comparison; nothing in this file fills or sends it.
# NOTE(review): the template mixes literal JSON braces with the {job} and
# {resume_json} placeholders, so calling str.format() on it would fail;
# presumably it is meant to be filled with str.replace() — confirm before use.
LLM_JSON_PROMPT = '''
Compare the following resume (JSON fields) with the job description. Return a JSON object with:
{
  "score": integer 1-10,
  "score_breakdown": {"skill_overlap":0-100, "semantic":0-100, "experience":0-100, "education":0-100},
  "justification": "Short 1-2 sentence justification.",
  "top_skills": ["skill1","skill2"]
}

Job Description:\n{job}

Resume JSON:\n{resume_json}
'''

print('Prompt template ready. Use LLM client to call Gemini with your key and parse JSON.')

# %% [markdown]
# 10. Streamlit quick preview (run locally or in Colab with ngrok if needed)

# %%
# Source of a small Streamlit demo app, written to streamlit_demo.py below.
# NOTE(review): the generated script imports only streamlit, sqlite3 and json.
# file_to_text, extract_entities, extract_skills, skill_master, save_resume,
# save_score and compute_match_score are neither defined nor imported in it,
# so it will presumably raise NameError when run on its own — confirm and add
# the missing imports/definitions.
# NOTE(review): each upload is written to disk under the client-supplied
# file name (f.name) — verify that this is acceptable.
STREAMLIT_APP = '''
import streamlit as st
import sqlite3, json
st.title('Smart Resume Screener — Demo')
st.write('Upload resumes and a job description to score candidates.')

uploaded = st.file_uploader('Upload resume (PDF/DOCX/TXT)', type=['pdf','docx','txt'], accept_multiple_files=True)
job_text = st.text_area('Job description')
if st.button('Process'):
    for f in uploaded:
        bytes_data = f.read()
        path = f.name
        with open(path,'wb') as out:
            out.write(bytes_data)
        # naive: call file_to_text
        txt = file_to_text(path)
        parsed = extract_entities(txt)
        parsed['skills'] = extract_skills(txt, skill_master)
        rid = save_resume(path, txt, parsed)
        sc = compute_match_score(job_text, txt)
        save_score(rid, 'job_demo', sc)
        st.write('Processed', f.name, 'Score:', sc['final_score_1_10'])
'''
# Write the script next to the notebook (platform default text encoding).
with open('streamlit_demo.py','w') as f:
    f.write(STREAMLIT_APP)
print('Streamlit demo script written: streamlit_demo.py')

# %% [markdown]
# 11. Next steps & checklist
# - Replace get_embedding_placeholder with Gemini embeddings client (or use Google SDK). Store embeddings in DB as arrays (or use vector DB).
# - Integrate Gemini (or other LLM) for the final justification JSON using the LLM_JSON_PROMPT.
# - Expand the SKILL_MASTER list (upload a CSV of canonical skills).
# - Add more robust experience parsing (company/role/date extraction using regex & heuristics).
# - Write unit tests for parsing & scoring.
# - Move FastAPI/production code into /app and containerize for deployment.

# %% [markdown]
# 12. Save a copy of the notebook state (optional)
# No state is saved by this cell; it only prints a completion message.

print('Starter Colab notebook script complete. Run cells in Colab and iterate.')


In [None]:
# 1. Install required libraries (run once)
!pip install --quiet pdfplumber python-docx spacy sentence-transformers faiss-cpu fastapi uvicorn pydantic requests sqlalchemy sqlite-utils streamlit
!python -m spacy download en_core_web_sm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m843.1 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.2/68.2 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m114.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:

import os
import json
import uuid
import sqlite3
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any


import pdfplumber
import docx
import re
import spacy
import numpy as np


nlp = spacy.load("en_core_web_sm")

In [None]:
def pdf_to_text(path: str) -> str:
    """Return the text of every page of the PDF at ``path``, joined by newlines.

    A page whose ``extract_text()`` result is falsy contributes an empty string.
    """
    with pdfplumber.open(path) as pdf:
        pages = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(pages)




def docx_to_text(path: str) -> str:
    """Return the paragraph texts of the Word document at ``path``, one per line."""
    document = docx.Document(path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)




def txt_to_text(path: str) -> str:
    """Read ``path`` as UTF-8 text, silently dropping bytes that cannot be decoded."""
    return Path(path).read_text(encoding='utf-8', errors='ignore')




def file_to_text(path: str) -> str:
    """Extract plain text from ``path``, choosing the reader by file extension.

    ``.pdf`` is handled by ``pdf_to_text``, ``.docx`` and ``.doc`` by
    ``docx_to_text``; any other extension is read as a plain-text file.
    The extension comparison is case-insensitive.
    """
    suffix = Path(path).suffix.lower()
    readers = {
        '.pdf': pdf_to_text,
        '.docx': docx_to_text,
        '.doc': docx_to_text,
    }
    reader = readers.get(suffix, txt_to_text)
    return reader(path)

In [None]:
# 4. Simple parsing: name/contact/education/experience/skills (rule-based + spaCy)

# %%
# Path of the canonical skill list file (one skill per line).
SKILL_MASTER_PATH = 'skill_master_list.txt'  # we will create a small starter list below

# Quick starter skill list (you should replace/extend with a larger canonical list)
starter_skills = [
    'python','java','c++','c','sql','postgresql','mongodb','tensorflow','pytorch','keras',
    'scikit-learn','pandas','numpy','opencv','aws','azure','docker','kubernetes','react','node.js',
    'fastapi','flask','git','linux','spark','hadoop'
]

# Write explicitly as UTF-8: the file is read back as UTF-8, so relying on the
# platform default encoding here could corrupt non-ASCII skill names.
with open(SKILL_MASTER_PATH, 'w', encoding='utf-8') as f:
    f.write('\n'.join(starter_skills))


def load_skill_master(path: str = SKILL_MASTER_PATH) -> List[str]:
    """Load the skill list stored at ``path``.

    The file is read as UTF-8; each line is stripped of surrounding
    whitespace and lines that are empty after stripping are skipped.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return list(filter(None, map(str.strip, handle)))

# Module-level skill vocabulary loaded from the default path.
skill_master = load_skill_master()

# %%
# Basic skill extraction using token matching + fuzzy matching helpers
from difflib import get_close_matches


def extract_skills(text: str, skill_master: List[str], cutoff: float = 0.85) -> List[str]:
    """Return the skills from ``skill_master`` that are mentioned in ``text``.

    A skill counts as mentioned when it occurs in the text as a whole term
    (case-insensitive), or when some token of the text is a close fuzzy match
    for it according to ``difflib.get_close_matches``.

    Args:
        text: Free text to scan (resume or job description).
        skill_master: Canonical skill names to look for.
        cutoff: Minimum difflib similarity ratio (0-1) for a fuzzy token match.

    Returns:
        The matched skills, spelled as in ``skill_master``, sorted and de-duplicated.
    """
    text_low = text.lower()
    # Leading/trailing dots are sentence punctuation ("sql."), not part of a term.
    tokens = {tok.strip('.') for tok in re.findall(r"[a-zA-Z+.#]+", text_low)}
    tokens.discard('')

    found = set()
    for skill in skill_master:
        sk = skill.lower()
        if not sk:
            continue
        # Whole-term match: a plain substring test would find 'c' inside any
        # word containing that letter, 'java' inside 'javascript' and 'git'
        # inside 'digital'.  '+' and '#' are treated as term characters so that
        # 'c' is not reported for 'c++' or 'c#'.
        whole_term = r"(?<![a-z0-9+#])" + re.escape(sk) + r"(?![a-z0-9+#])"
        if re.search(whole_term, text_low):
            found.add(skill)
        elif get_close_matches(sk, tokens, n=1, cutoff=cutoff):
            # Token-level fuzzy match tolerates small typos/plurals.
            found.add(skill)
    return sorted(found)


# Simple NER and heuristics for experience & education

def extract_entities(text: str) -> Dict[str, Any]:
    """Extract rough candidate facts from resume text using spaCy NER and regex heuristics.

    Args:
        text: Raw resume text.

    Returns:
        A dict with:
        - ``person_candidates``: up to 3 distinct PERSON entity texts, first-seen order.
        - ``organizations``: up to 6 distinct ORG/GPE entity texts, first-seen order.
        - ``education``: up to 6 stripped lines that mention a degree keyword.
        - ``experience_snippets``: up to 20 stripped lines that contain a
          four-digit number together with a dash or the word "to".
    """
    doc = nlp(text)
    names = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
    orgs = [ent.text for ent in doc.ents if ent.label_ in ('ORG', 'GPE')]

    # Short abbreviations need word boundaries on both sides; otherwise "bs"/"ms"
    # match the tail of words such as "jobs" or "systems" and "mba" matches "Mumbai".
    education_re = re.compile(
        r"b\.sc|m\.sc|bachelor|master|phd|degree|\bbs\b|\bms\b|\bmba\b", re.I)
    four_digits_re = re.compile(r"\d{4}")
    # "to" must be a whole word: a bare substring test also hits "pytorch", "tools", ...
    range_marker_re = re.compile(r"[-\u2013\u2014]|\bto\b", re.I)

    education_lines = []
    experience_lines = []
    for line in text.splitlines():
        if education_re.search(line):
            education_lines.append(line.strip())
        if four_digits_re.search(line) and range_marker_re.search(line):
            experience_lines.append(line.strip())

    return {
        'person_candidates': list(dict.fromkeys(names))[:3],
        'organizations': list(dict.fromkeys(orgs))[:6],
        'education': education_lines[:6],
        'experience_snippets': experience_lines[:20]
    }

In [None]:
# 5. Embeddings (placeholder for Gemini). Replace with proper Gemini embeddings calls.
# NOTE: You must use your Gemini key and Google client library / REST calls in Colab.

# %%

_EMBEDDING_MODEL = None  # SentenceTransformer instance, created on first use and then reused


def get_embedding_placeholder(text: str) -> np.ndarray:
    """Placeholder embedding generator: use sentence-transformers in Colab for fast experiments.
    Replace this with Gemini embeddings REST or client code when ready.

    The 'all-MiniLM-L6-v2' model is loaded once on the first call and cached in
    a module-level variable; constructing it on every call repeats an expensive
    model load for each text that is embedded.

    Args:
        text: Text to embed.

    Returns:
        The embedding produced by ``model.encode`` wrapped in ``np.array``.
    """
    global _EMBEDDING_MODEL
    if _EMBEDDING_MODEL is None:
        from sentence_transformers import SentenceTransformer
        _EMBEDDING_MODEL = SentenceTransformer('all-MiniLM-L6-v2')
    vec = _EMBEDDING_MODEL.encode(text, show_progress_bar=False)
    return np.array(vec)


def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity of ``a`` and ``b``.

    Returns 0.0 when either argument is None or has zero norm.
    """
    if a is None or b is None:
        return 0.0
    norm_a, norm_b = np.linalg.norm(a), np.linalg.norm(b)
    if not (norm_a != 0 and norm_b != 0):
        return 0.0
    return float(np.dot(a, b) / (norm_a * norm_b))


In [None]:
# 6. Scoring function that combines skill overlap, semantic similarity, experience heuristic

# %%

def compute_match_score(job_text: str, resume_text: str, job_skills: List[str]=None, required_years: int=None) -> Dict[str, Any]:
    """Score a resume against a job description.

    Four components, each on a 0-100 scale, are combined with fixed weights
    (skills 0.4, semantic 0.3, experience 0.2, education 0.1).

    Args:
        job_text: Job description text.
        resume_text: Resume text.
        job_skills: Skills required by the job; when falsy they are extracted
            from ``job_text`` using the module-level ``skill_master``.
        required_years: Years of experience that map to 100%; when falsy,
            10 years map to 100%.

    Returns:
        Dict with the rounded component percentages, the weighted total
        (``final_score_0_100``), the total mapped to 1-10, and the skill lists.
    """
    # Semantic component: cosine similarity of the two embeddings, clamped to [0, 1].
    job_vec = get_embedding_placeholder(job_text)
    resume_vec = get_embedding_placeholder(resume_text)
    similarity = cosine_sim(job_vec, resume_vec)
    semantic_pct = max(0, min(1, similarity)) * 100

    # Skill component: share of the job's skills that also appear in the resume.
    jm_skills = job_skills or extract_skills(job_text, skill_master)
    res_skills = extract_skills(resume_text, skill_master)
    common = set(jm_skills).intersection(res_skills)
    if jm_skills:
        skill_overlap_pct = (len(common) / max(1, len(jm_skills))) * 100
    else:
        skill_overlap_pct = 0

    # Experience component: estimated years relative to the target, capped at 100.
    years = estimate_years_experience(resume_text)
    target_years = required_years if required_years else 10
    exp_pct = min(100, (years / target_years) * 100)

    # Education component: 50 when any education line was detected, otherwise 25.
    has_education = bool(extract_entities(resume_text)['education'])
    edu_pct = 50 if has_education else 25

    final = 0.4*skill_overlap_pct + 0.3*semantic_pct + 0.2*exp_pct + 0.1*edu_pct
    final_1_10 = max(1, min(10, round(final/10)))

    report = {}
    report['skill_overlap_pct'] = round(skill_overlap_pct, 2)
    report['semantic_pct'] = round(semantic_pct, 2)
    report['experience_pct'] = round(exp_pct, 2)
    report['education_pct'] = round(edu_pct, 2)
    report['final_score_0_100'] = round(final, 2)
    report['final_score_1_10'] = int(final_1_10)
    report['matched_skills'] = sorted(common)[:20]
    report['resume_skills'] = res_skills
    report['job_skills'] = jm_skills
    return report


# helper to estimate years of experience (very simple heuristic)
def estimate_years_experience(text: str) -> float:
    """Estimate years of experience from free text (very simple heuristic).

    1. If the text states durations such as "6 years" or "5+ years", return
       the largest stated number.
    2. Otherwise, if at least two calendar years (19xx/20xx) appear, return
       the span between the earliest and the latest.
    3. Otherwise fall back to 2.0.
    """
    stated = re.findall(r"(\d{1,2})\+?\s+years", text, re.I)
    if stated:
        return float(max(int(n) for n in stated))

    # The century prefix must be a NON-capturing group: with a capturing group
    # re.findall returns only "19"/"20", so every span collapsed to 0 or 1.
    years = [int(y) for y in re.findall(r"\b(?:19|20)\d{2}\b", text)]
    if len(years) >= 2:
        return float(max(years) - min(years))
    return 2.0  # fallback small experience

In [None]:
# 7. Local storage: simple SQLite persistence for parsed resumes & scores

# %%
# SQLite file that stores parsed resumes and their computed scores.
DB_PATH = 'resumes.db'

# Module-level connection and cursor; save_resume() and save_score() use both.
conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()
# resumes: one row per stored resume, keyed by a text id; parsed_json holds
# the JSON-encoded parse result.
cur.execute('''
CREATE TABLE IF NOT EXISTS resumes (
    id TEXT PRIMARY KEY,
    filename TEXT,
    uploaded_at TEXT,
    source_text TEXT,
    parsed_json TEXT
)
''')
# scores: one row per (resume, job) scoring run; score_json holds the
# JSON-encoded score dict.  resume_id is not declared as a foreign key.
cur.execute('''
CREATE TABLE IF NOT EXISTS scores (
    id TEXT PRIMARY KEY,
    resume_id TEXT,
    job_id TEXT,
    score_json TEXT,
    created_at TEXT
)
''')
conn.commit()


def save_resume(filename: str, source_text: str, parsed_json: Dict[str,Any]) -> str:
    """Insert a resume row into the ``resumes`` table and commit.

    Args:
        filename: Name of the source file.
        source_text: Extracted resume text.
        parsed_json: Parsed fields; stored JSON-encoded.

    Returns:
        The generated UUID4 string used as the row id.
    """
    from datetime import timezone  # the file-level import only brings in ``datetime``

    rid = str(uuid.uuid4())
    # datetime.utcnow() is deprecated; derive the same naive-UTC ISO string
    # from an aware timestamp so stored values keep their existing format.
    uploaded_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    cur.execute('INSERT INTO resumes (id,filename,uploaded_at,source_text,parsed_json) VALUES (?,?,?,?,?)',
                (rid, filename, uploaded_at, source_text, json.dumps(parsed_json)))
    conn.commit()
    return rid


def save_score(resume_id: str, job_id: str, score_json: Dict[str,Any]) -> str:
    """Insert a score row into the ``scores`` table and commit.

    Args:
        resume_id: Id of the scored resume.
        job_id: Identifier of the job the resume was scored against.
        score_json: Score details; stored JSON-encoded.

    Returns:
        The generated UUID4 string used as the row id.
    """
    from datetime import timezone  # the file-level import only brings in ``datetime``

    sid = str(uuid.uuid4())
    # datetime.utcnow() is deprecated; derive the same naive-UTC ISO string
    # from an aware timestamp so stored values keep their existing format.
    created_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
    cur.execute('INSERT INTO scores (id,resume_id,job_id,score_json,created_at) VALUES (?,?,?,?,?)',
                (sid, resume_id, job_id, json.dumps(score_json), created_at))
    conn.commit()
    return sid


In [None]:
# 8. Demo: run the pipeline on a sample text

# %%
# Sample job description used to exercise the pipeline end to end.
SAMPLE_JOB = """
We are hiring a Senior Data Scientist with 5+ years experience in Python, PyTorch, and production ML systems. Experience with AWS, Docker, and SQL required. Must be able to lead an ML team.
"""

# Sample resume; the "\n" escapes below become real newlines because the
# literal is not a raw string.
SAMPLE_RESUME_TEXT = """
John Doe\nSenior ML Engineer\nExperience: Worked on production ML using Python, PyTorch, Docker. 6 years total experience. Worked at Acme Corp from 2018-2024. Skills: Python, PyTorch, TensorFlow, SQL, AWS, Docker.
Education: B.Sc. Computer Science.
"""

# Parse the resume, attach the extracted skills, and persist it.
parsed = extract_entities(SAMPLE_RESUME_TEXT)
parsed['skills'] = extract_skills(SAMPLE_RESUME_TEXT, skill_master)
resume_id = save_resume('sample_resume.txt', SAMPLE_RESUME_TEXT, parsed)
print('Saved resume id:', resume_id)

# Score with the defaults: job skills are extracted from SAMPLE_JOB and no
# required_years is given, so the 10-year experience scale applies.
score = compute_match_score(SAMPLE_JOB, SAMPLE_RESUME_TEXT)
print('Computed score:', score)
save_score(resume_id, 'sample_job_1', score)

# %% [markdown]
# 9. LLM integration (Gemini) — prompt examples & placeholders
# Replace with real Gemini calls. Use the Gemini embedding API for embeddings when ready.

# %%
# Prompt template for an LLM-based comparison; nothing in this file fills or sends it.
# NOTE(review): the template mixes literal JSON braces with the {job} and
# {resume_json} placeholders, so calling str.format() on it would fail;
# presumably it is meant to be filled with str.replace() — confirm before use.
LLM_JSON_PROMPT = '''
Compare the following resume (JSON fields) with the job description. Return a JSON object with:
{
  "score": integer 1-10,
  "score_breakdown": {"skill_overlap":0-100, "semantic":0-100, "experience":0-100, "education":0-100},
  "justification": "Short 1-2 sentence justification.",
  "top_skills": ["skill1","skill2"]
}

Job Description:\n{job}

Resume JSON:\n{resume_json}
'''

print('Prompt template ready. Use LLM client to call Gemini with your key and parse JSON.')

  (rid, filename, datetime.utcnow().isoformat(), source_text, json.dumps(parsed_json)))


Saved resume id: 5289aedc-8a59-4053-a9a2-9703ea73a0c1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computed score: {'skill_overlap_pct': 100.0, 'semantic_pct': 79.53, 'experience_pct': 60.0, 'education_pct': 50, 'final_score_0_100': 80.86, 'final_score_1_10': 8, 'matched_skills': ['aws', 'c', 'docker', 'python', 'pytorch', 'sql'], 'resume_skills': ['aws', 'c', 'docker', 'python', 'pytorch', 'sql', 'tensorflow'], 'job_skills': ['aws', 'c', 'docker', 'python', 'pytorch', 'sql']}


  (sid, resume_id, job_id, json.dumps(score_json), datetime.utcnow().isoformat()))


Prompt template ready. Use LLM client to call Gemini with your key and parse JSON.


In [None]:
# ============================================================
# 🚀 Smart Resume Screener with Gemini Integration (Colab Ready)
# ============================================================

# Install required libraries
!pip install pdfplumber python-docx spacy sentence-transformers google-generativeai fuzzywuzzy python-Levenshtein sqlite-utils tqdm

# Download spaCy model
!python -m spacy download en_core_web_sm


Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)

In [None]:
import os
import pdfplumber
import docx
import spacy
import sqlite3
import json
from tqdm import tqdm
from fuzzywuzzy import fuzz
from sentence_transformers import SentenceTransformer, util
import google.generativeai as genai
from IPython.display import clear_output

clear_output()
print("✅ Libraries loaded successfully!")


✅ Libraries loaded successfully!


In [None]:
nlp = spacy.load("en_core_web_sm")
embed_model = SentenceTransformer('all-MiniLM-L6-v2')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def extract_text_from_pdf(path):
    """Return the text of all pages of the PDF at ``path``, each page followed by a newline.

    A page for which ``extract_text()`` returns nothing (e.g. ``None``)
    contributes an empty line instead of raising ``TypeError`` on the string
    concatenation, consistent with ``pdf_to_text`` elsewhere in this file.
    """
    with pdfplumber.open(path) as pdf:
        # join() avoids rebuilding the accumulated string once per page.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

def extract_text_from_docx(path):
    """Return the paragraph texts of the Word document at ``path``, one per line."""
    lines = []
    for paragraph in docx.Document(path).paragraphs:
        lines.append(paragraph.text)
    return "\n".join(lines)

def read_text_file(path):
    """Return the full contents of the file at ``path``, decoded strictly as UTF-8."""
    with open(path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents


In [None]:
from google.colab import files

# Upload the resume and pick the extractor from its extension.
print("📄 Upload your Resume file (PDF/DOCX/TXT)")
resume_file = files.upload()

resume_path = list(resume_file.keys())[0]
# Compare extensions case-insensitively: "CV.PDF" or "CV.Docx" would otherwise
# fall through to the plain-text reader and fail to decode as UTF-8.
resume_ext = resume_path.lower()
if resume_ext.endswith(".pdf"):
    resume_text = extract_text_from_pdf(resume_path)
elif resume_ext.endswith(".docx"):
    resume_text = extract_text_from_docx(resume_path)
else:
    resume_text = read_text_file(resume_path)

# Upload the job description (PDF, otherwise treated as plain text).
print("\n💼 Upload Job Description file (PDF or TXT)")
jd_file = files.upload()

jd_path = list(jd_file.keys())[0]
if jd_path.lower().endswith(".pdf"):
    jd_text = extract_text_from_pdf(jd_path)
else:
    jd_text = read_text_file(jd_path)

print("\n✅ Resume and JD loaded successfully!")


📄 Upload your Resume file (PDF/DOCX/TXT)




Saving 22BCE0476_AmanChauhan_VIT_Vellore.pdf to 22BCE0476_AmanChauhan_VIT_Vellore.pdf





💼 Upload Job Description file (PDF or TXT)


Saving Digital Enablement_FS Tech.pdf to Digital Enablement_FS Tech.pdf

✅ Resume and JD loaded successfully!


In [None]:
def compute_local_score(resume_text, jd_text):
    """Score how well a resume matches a job description without calling an LLM.

    The score averages two signals, each on a 0-1 scale, and maps the result
    to 0-10:
    - semantic similarity: cosine similarity of sentence embeddings from the
      module-level ``embed_model``, clamped to [0, 1];
    - fuzzy match: ``fuzz.token_sort_ratio`` of the lower-cased texts / 100.

    Args:
        resume_text: Resume text.
        jd_text: Job description text.

    Returns:
        Dict with ``semantic_similarity`` (0-1, rounded to 2 places),
        ``fuzzy_match`` (0-100) and ``final_score_1_10``.
    """
    # The spaCy pipeline loaded in this notebook (en_core_web_sm) ships without
    # word vectors, so Doc.similarity() emits a warning and is not a meaningful
    # semantic measure.  Use the already-loaded sentence-embedding model instead.
    resume_vec, jd_vec = embed_model.encode([resume_text, jd_text], convert_to_tensor=True)
    similarity = float(util.cos_sim(resume_vec, jd_vec).item())
    # Cosine similarity can be slightly negative; keep the component in [0, 1].
    similarity = max(0.0, min(1.0, similarity))

    # Simple fuzzy match score
    fuzzy_score = fuzz.token_sort_ratio(resume_text.lower(), jd_text.lower()) / 100

    # Average the two
    final_score = round(((similarity + fuzzy_score) / 2) * 10, 2)

    return {
        "semantic_similarity": round(similarity, 2),
        "fuzzy_match": round(fuzzy_score * 100, 2),
        "final_score_1_10": final_score
    }


In [None]:
from getpass import getpass

# Read the key without echoing it: input() writes the typed secret into the
# cell output, which is then saved (and shared) with the notebook.
GEMINI_API_KEY = getpass("🔑 Enter your Gemini API key: ")
genai.configure(api_key=GEMINI_API_KEY)

def gemini_resume_match(resume_text, jd_text):
    """Ask Gemini to rate a resume against a job description.

    Only the first 4000 characters of each text are sent.  The first ``{`` to
    the last ``}`` of the reply is parsed as JSON and normalised so callers can
    rely on the result shape.

    Args:
        resume_text: Resume text.
        jd_text: Job description text.

    Returns:
        Dict that always contains ``fit_score`` (int) and ``justification``
        (str).  On success ``fit_score`` is clamped to 1-10; on any failure
        (API error, no JSON in the reply, missing or non-numeric
        ``fit_score``) it is 0 and ``justification`` carries the error text.
    """
    model = genai.GenerativeModel("gemini-2.0-flash")

    prompt = f"""
    You are a professional recruiter. Evaluate the candidate's resume below against the given job description.
    Provide a JSON output with:
    - fit_score: integer from 1-10
    - justification: one paragraph summary of your reasoning

    Resume:
    {resume_text[:4000]}

    Job Description:
    {jd_text[:4000]}
    """

    try:
        response = model.generate_content(prompt)
        response_text = response.text

        # Try to parse JSON
        start = response_text.find("{")
        end = response_text.rfind("}") + 1
        if start == -1 or end <= start:
            raise ValueError("no JSON object found in model response")
        result = json.loads(response_text[start:end])
        if not isinstance(result, dict):
            raise ValueError("model response is not a JSON object")

        # Callers do arithmetic on fit_score, so coerce it to an int in 1-10;
        # a missing or non-numeric value raises and is reported below.
        result["fit_score"] = max(1, min(10, round(float(result["fit_score"]))))
        result["justification"] = str(result.get("justification", ""))
        return result
    except Exception as e:
        # Best-effort boundary: report the failure in-band instead of crashing the notebook.
        return {"fit_score": 0, "justification": f"Error: {str(e)}"}


🔑 Enter your Gemini API key: [REDACTED — an API key was captured in this saved output; revoke and rotate it]


In [None]:
print("🧠 Analyzing Resume vs JD...")
# Local (non-LLM) score and Gemini's score for the uploaded resume/JD pair.
local_result = compute_local_score(resume_text, jd_text)
gemini_result = gemini_resume_match(resume_text, jd_text)

# Combined score: plain average of the two scores, rounded to 2 places.
# gemini_resume_match returns fit_score 0 on failure, which lowers this average.
final_score = round((local_result["final_score_1_10"] + gemini_result["fit_score"]) / 2, 2)


🧠 Analyzing Resume vs JD...


  similarity = doc_resume.similarity(doc_jd)


In [None]:
# Print a summary of the local score, the Gemini score, their average and
# Gemini's justification text.
print("\n==================== RESULTS ====================")
print(f"🧩 Local NLP Match Score: {local_result['final_score_1_10']} / 10")
print(f"🤖 Gemini Fit Score: {gemini_result['fit_score']} / 10")
print(f"🎯 Final Combined Score: {final_score} / 10")
print("\n📋 Gemini Justification:")
print(gemini_result['justification'])
print("=================================================")



🧩 Local NLP Match Score: 5.9 / 10
🤖 Gemini Fit Score: 6 / 10
🎯 Final Combined Score: 5.95 / 10

📋 Gemini Justification:
The candidate demonstrates a strong technical background with experience in AI/ML, data science, and full-stack development. They have project experience relevant to digital enablement, including AI-powered systems, data analysis pipelines, and predictive modeling. However, the resume lacks explicit experience within the financial services industry, IT strategy, governance, or specific financial technology applications (Core banking solutions, Treasury applications, loan processing applications, risk management platforms, asset liability systems), which are critical requirements for the role. The resume is also poorly formatted.


In [None]:
# Persist the scores for this resume/JD pair in resume_results.db.
# A dedicated name is used for this connection: rebinding the module-level
# ``conn`` (which save_resume/save_score commit on for resumes.db) and then
# closing it would break those helpers later in the same session.
results_conn = sqlite3.connect("resume_results.db")
try:
    results_cur = results_conn.cursor()

    results_cur.execute("""
CREATE TABLE IF NOT EXISTS results (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    resume_name TEXT,
    jd_name TEXT,
    local_score REAL,
    gemini_score REAL,
    final_score REAL,
    justification TEXT
)
""")

    results_cur.execute("INSERT INTO results (resume_name, jd_name, local_score, gemini_score, final_score, justification) VALUES (?, ?, ?, ?, ?, ?)",
              (resume_path, jd_path, local_result['final_score_1_10'], gemini_result['fit_score'], final_score, gemini_result['justification']))

    results_conn.commit()
finally:
    # Close the connection even when the insert fails.
    results_conn.close()

print("\n💾 Result saved to database successfully!")



💾 Result saved to database successfully!
