In [None]:
import pdfplumber
import re
import spacy
import json
from spacy.matcher import PhraseMatcher
``
# Load spaCy's small English pipeline once; the script uses it both to find
# named entities in the resume text and to supply the vocab for the skill matcher.
nlp = spacy.load("en_core_web_sm")

# Skill phrases searched for in the resume text.
SKILLS_DB = [
    # Languages and web basics
    "Python",
    "JavaScript",
    "HTML",
    "CSS",
    # Machine-learning libraries and topics
    "TensorFlow",
    "PyTorch",
    "Scikit-learn",
    "OpenCV",
    "BERT",
    "Neural Networks",
    "Computer Vision",
    "NLP",
    # Backend and data storage
    "Django",
    "REST API",
    "MongoDB",
    # Tooling and infrastructure
    "Git",
    "AWS",
    "Jupyter",
    "Postman",
    "CI/CD",
]

# Substrings that mark a line of the education section as a degree line.
degree_keywords = [
    "B.Tech",
    "Bachelor",
    "Master",
    "M.Tech",
    "PhD",
    "Diploma",
]

# Resume PDF to parse.
pdf_path = "ML_Resume.pdf"

# Read the PDF page by page and build one newline-terminated text blob.
# Pages for which extract_text() returns nothing (None or "") are skipped.
page_texts = []
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        extracted = page.extract_text()
        if not extracted:
            continue
        page_texts.append(extracted)

# Each kept page contributes its text followed by a single "\n".
text = "".join(f"{page_text}\n" for page_text in page_texts)

# --- Contact details -------------------------------------------------------
# The first e-mail-looking token in the resume wins; None when there is none.
email_pattern = r"[\w\.-]+@[\w\.-]+\.\w+"
emails = re.findall(email_pattern, text)
email = None
if emails:
    email = emails[0]

# The phone pattern has two capture groups (optional country prefix, number),
# so findall() yields (prefix, number) tuples; the prefix is "" when absent.
phone_pattern = r"(\+?\d{1,4}[\s-]?)?(\d{10}|\d{4}[\s-]?\d{6}|\d{5}[\s-]?\d{5})"
phones = re.findall(phone_pattern, text)
phone = None
if phones:
    country_prefix, local_number = phones[0]
    phone = (country_prefix + local_number).strip()

# Run the spaCy pipeline over the whole resume; `doc` is reused further down
# for skill matching.
doc = nlp(text)

# Candidate name: the first PERSON entity spaCy reports.
# Entity spans taken from PDF text can carry layout whitespace (a trailing
# newline, or a line break inside the name), so runs of whitespace are
# collapsed to single spaces and the ends trimmed before the value is stored.
# Entities that are blank after cleaning are skipped.
name = None
for ent in doc.ents:
    if ent.label_ != "PERSON":
        continue
    cleaned_name = " ".join(ent.text.split())
    if cleaned_name:
        name = cleaned_name
        break

# --- Education -------------------------------------------------------------
# Capture the text between an education-style heading and the next section
# heading (Experience / Projects / Skills) or the end of the text.
edu_pattern = re.compile(r"(Education|Academic Background|Qualifications)(.*?)(Experience|Projects|Skills|$)", re.S | re.I)
match = edu_pattern.search(text)

degree = None
institution = None
institution_markers = ("College", "University", "Institute", "Engineering", "Saveetha")
lowered_degree_keywords = [keyword.lower() for keyword in degree_keywords]

if match is not None:
    edu_text = match.group(2).strip()
    lines = edu_text.split('\n')
    for line in lines:
        # Degree keywords are compared case-insensitively; when several lines
        # qualify, the last one is kept.
        lowered_line = line.lower()
        if any(keyword in lowered_line for keyword in lowered_degree_keywords):
            degree = line.strip()
        # Institution markers are compared case-sensitively; again the last
        # qualifying line is kept.
        for marker in institution_markers:
            if marker in line:
                institution = line.strip()
                break

education_info = {'Degree': degree, 'Institution': institution}

# --- Skills ----------------------------------------------------------------
# Match every SKILLS_DB phrase against the resume, ignoring case (attr="LOWER").
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# Matching on LOWER only needs tokenisation, so make_doc() is enough and
# avoids running the tagger/parser/NER on every pattern.
patterns = [nlp.make_doc(skill) for skill in SKILLS_DB]
matcher.add("SKILLS", None, *patterns)
matches = matcher(doc)

# Matching is case-insensitive, so the same skill can appear in the document
# with different casing ("computer vision" and "Computer Vision"). Map each hit
# back to its SKILLS_DB spelling so every skill is reported exactly once; fall
# back to the document text if the hit cannot be mapped.
canonical_skill = {skill.lower(): skill for skill in SKILLS_DB}
found_skills = set()
for match_id, start, end in matches:
    span_text = doc[start:end].text
    found_skills.add(canonical_skill.get(span_text.lower(), span_text))

# Sort so the output is stable between runs (set iteration order is not).
skills = sorted(found_skills, key=str.lower)

# --- Work experience -------------------------------------------------------
# Capture the text between an experience-style heading and the next section
# heading (Education / Projects / Skills) or the end of the text, then walk it
# line by line to build a list of entries with the optional keys
# 'Duration', 'Role', 'Company' and 'Description'.
exp_pattern = re.compile(r"(Experience|Work Experience|Professional Experience)(.*?)(Education|Projects|Skills|$)", re.S | re.I)
work_experience = []
match = exp_pattern.search(text)
if match:
    exp_text = match.group(2).strip()
    lines = exp_text.split('\n')

    # A line mentioning a 20xx year or "Present" starts a new entry.
    date_marker = re.compile(r"\b(20\d{2}|Present)\b")
    # "Role at Company": "at" must be a standalone word. A plain substring test
    # would also fire on words such as "Integrated" or "data" and file ordinary
    # description bullets under 'Role'.
    at_separator = re.compile(r"\s+at\s+", re.I)

    current_exp = {}
    for line in lines:
        line = line.strip()
        if not line:
            continue

        if date_marker.search(line):
            # Flush the entry collected so far before starting the next one.
            if current_exp:
                work_experience.append(current_exp)
                current_exp = {}
            current_exp['Duration'] = line
            continue

        parts = at_separator.split(line)
        # Only the first "... at ..." line of an entry is taken as the role;
        # later ones are kept as description text instead of overwriting it.
        if len(parts) > 1 and 'Role' not in current_exp:
            if len(parts) == 2:
                current_exp['Role'] = parts[0].strip()
                current_exp['Company'] = parts[1].strip()
            else:
                # Several " at " separators: ambiguous, keep the whole line.
                current_exp['Role'] = line
            continue

        if 'Description' not in current_exp:
            current_exp['Description'] = line
        else:
            current_exp['Description'] += " " + line

    # Do not lose the final entry.
    if current_exp:
        work_experience.append(current_exp)

# --- Result ----------------------------------------------------------------
# Gather every extracted field into a single JSON-serialisable record.
degree_value = education_info.get('Degree')
institution_value = education_info.get('Institution')
parsed_data = {
    "Name": name,
    "Email": email,
    "Phone": phone,
    "Degree": degree_value,
    "Institution": institution_value,
    "Skills": skills,
    "Work_Experience": work_experience,
}

# Serialise once, then show the result and persist the same text to disk.
serialized = json.dumps(parsed_data, indent=4)
print(serialized)
with open("parsed_resume.json", "w") as f_out:
    f_out.write(serialized)


{
    "Name": "Yuvaraj S\n",
    "Email": "ai.yuvaraj21@gmail.com",
    "Phone": "+91 9384137766",
    "Degree": "B.Tech in Artificial Intelligence and Machine Learning Expected: May 2026",
    "Institution": "Saveetha Engineering College Chennai, India",
    "Skills": [
        "Scikit-learn",
        "Neural Networks",
        "MongoDB",
        "TensorFlow",
        "REST API",
        "computer vision",
        "Computer Vision",
        "OpenCV",
        "JavaScript",
        "AWS",
        "Git",
        "CSS",
        "NLP",
        "Postman",
        "BERT",
        "Python",
        "Django",
        "PyTorch",
        "HTML",
        "Jupyter",
        "CI/CD"
    ],
    "Work_Experience": [
        {
            "Duration": "Machine Learning Engineer July 2024 \u2013 Present",
            "Description": "Techso IT LLC, USA \u2013 Developed real-time audio transcription models with 80% accuracy using TensorFlow and Python",
            "Role": "\u2013 Integrated ML pipelines 