STEP 1: Install Required Libraries

In [None]:
!pip install spacy pdfplumber docx2txt
!python -m spacy download en_core_web_sm



Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


STEP 2: Upload Resume File (PDF; DOCX / TXT are not yet handled by Step 3)

In [None]:
from google.colab import files

# Open Colab's upload dialog; the result maps each saved filename to its bytes.
uploaded = files.upload()

# A cancelled dialog yields an empty mapping; fail with a clear message
# instead of an IndexError from indexing an empty list.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a resume file.")

# Only the first uploaded file is parsed by the rest of the notebook.
resume_file = next(iter(uploaded))
resume_file


Saving Yash Resume.pdf to Yash Resume (1).pdf


'Yash Resume (1).pdf'

STEP 3: Extract Text from Resume

In [None]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, one newline after each page.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        A single string made of each page's extracted text followed by
        ``"\\n"``. Pages whose extraction yields nothing (``None`` or an empty
        string) are skipped, so a PDF without extractable text gives ``""``.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Consume the pages while the file is still open.
        extracted = (page.extract_text() for page in pdf.pages)
        chunks = [chunk + "\n" for chunk in extracted if chunk]
    return "".join(chunks)

# Extract the uploaded resume's text and preview its first 800 characters.
# NOTE(review): the PDF extractor is always used, whatever the extension of
# the uploaded file is.
resume_text = extract_text_from_pdf(resume_file)
print(resume_text[:800])


YASH KEDAR
+91 7249611609 yashkedar19@gmail.com
linkedin.com/in/yashkedarj0909
Sawant Chowk, Pathardi Phata, Nashik-422010
OBJECTIVE
To secure a position in the field of Information Technology that offers exposure to the latest technologies, allowing
me to enhance my skills and apply my knowledge effectively. I aim to contribute meaningfully while continuously
growing both personally and professionally.
EDUCATION
Bachelor of Technology in Information Technology 2022 - 2026
Yeshwantrao Chavan College Of Engineering Nagpur CGPA: 8.11
Higher Secondary Education(12th Grade) 2021 - 2022
Shri.Pramod Patil Junior College Nashik 76.17%
Secondary School Education(10th Grade) 2019 - 2020
St. Francis High School Rane Nagar Nashik 86.60%
SKILLS
Technical Skills: Java, Python, MySQL, Machine Learning
S


STEP 4: Load spaCy NLP Model

In [None]:
import spacy

# Load the small English pipeline and run it over the whole resume text.
# `doc` is later handed to extract_name, which reads its named entities.
nlp = spacy.load("en_core_web_sm")
doc = nlp(resume_text)

STEP 5: Extract Name (Rule-Based, with spaCy NER Fallback)

In [None]:
import re

# Words that mark a section heading or document title rather than a person's
# name; a candidate containing any of them is rejected.
_NON_NAME_WORDS = frozenset({
    "resume", "curriculum", "vitae", "cv", "profile", "objective", "summary",
    "career", "professional", "personal", "contact", "details", "information",
    "education", "academic", "qualifications", "skills", "technical",
    "work", "experience", "employment", "internship", "internships",
    "projects", "certifications", "achievements", "awards", "languages",
    "interests", "hobbies", "references", "declaration",
})


def _looks_like_heading(candidate):
    """Return True if any word of ``candidate`` is a known heading word."""
    return any(word.lower() in _NON_NAME_WORDS for word in candidate.split())


def extract_name(resume_text, doc):
    """Guess the candidate's name from the top of a resume.

    Three rules are tried in order and the first hit wins:

    1. A line among the first 15 non-empty lines that is entirely upper case
       and consists of 2-3 purely alphabetic words (returned title-cased).
    2. A line among the first 15 non-empty lines made of 2-3 capitalised
       words, e.g. ``"Jane Doe"`` (returned unchanged).
    3. The first spaCy ``PERSON`` entity of 2-3 words that does not contain
       the address fragments "phata" or "chowk".

    Every rule additionally rejects candidates that contain a section-heading
    word (see ``_NON_NAME_WORDS``), so headings such as "WORK EXPERIENCE" or
    "CURRICULUM VITAE" are never returned as the name.

    Args:
        resume_text: Full plain text of the resume.
        doc: Object exposing ``ents``, an iterable of entities with ``label_``
            and ``text`` attributes (a spaCy ``Doc`` in this notebook).

    Returns:
        The detected name as a string, or ``None`` if no rule matched.
    """
    lines = [line.strip() for line in resume_text.split("\n") if line.strip()]
    header_lines = lines[:15]

    # Rule 1: ALL-CAPS name line.
    for line in header_lines:
        words = line.split()
        if (
            line.isupper()
            and 2 <= len(words) <= 3
            and all(word.isalpha() for word in words)
            and not _looks_like_heading(line)
        ):
            return line.title()

    # Rule 2: capitalised name line matched by regex.
    for line in header_lines:
        if (
            re.match(r'^[A-Z][a-z]+(?:\s[A-Z][a-z]+){1,2}$', line)
            and not _looks_like_heading(line)
        ):
            return line

    # Rule 3: spaCy NER fallback, filtered for address fragments and headings.
    for ent in doc.ents:
        ent_lower = ent.text.lower()
        if (
            ent.label_ == "PERSON"
            and 2 <= len(ent.text.split()) <= 3
            and "phata" not in ent_lower
            and "chowk" not in ent_lower
            and not _looks_like_heading(ent.text)
        ):
            return ent.text

    return None

# Run the name extractor on the resume; the bare `name` displays the result.
name = extract_name(resume_text, doc)
name


'Yash Kedar'

STEP 6: Extract Email

In [None]:
import re

# Take the first e-mail-shaped token in the resume, or None when there is none.
email_match = re.search(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", resume_text)
if email_match:
    email = email_match.group(0)
else:
    email = None
email


'yashkedar19@gmail.com'

STEP 7: Extract Phone Number

In [None]:
# First phone number in the resume, or None when there is none.
# The country code (optional "+", 1-3 digits, optional space/hyphen) is itself
# optional, so a bare 10-digit number is found too. The look-arounds keep the
# match from starting or ending in the middle of a longer run of digits.
phone_match = re.search(r"(?<!\d)(?:\+?\d{1,3}[\s-]?)?\d{10}(?!\d)", resume_text)
phone = phone_match.group(0) if phone_match else None
phone



'+91 7249611609'

STEP 8: Extract Skills

In [None]:
import re

# Lower-case skill names to look for; results are reported in this order.
SKILLS_DB = [
    "python", "java", "mysql", "machine learning",
    "deep learning", "nlp", "sql",
    "tensorflow", "pytorch", "scikit-learn"
]


def _skill_pattern(skill):
    """Compile a whole-term regex for one skill, tolerant of any whitespace
    (including line breaks) between the words of a multi-word skill."""
    body = r"\s+".join(re.escape(part) for part in skill.split())
    return re.compile(r"(?<!\w)" + body + r"(?!\w)")


def extract_skills(text):
    """Return the skills from ``SKILLS_DB`` that are mentioned in ``text``.

    Matching is case-insensitive and on whole terms only, so "java" is not
    reported for "javascript" and "sql" is not reported for "mysql".

    Args:
        text: Plain text to scan.

    Returns:
        A list of matching skill names (lower case, as spelled in
        ``SKILLS_DB``), without duplicates and in ``SKILLS_DB`` order.
    """
    text = text.lower()
    found = (skill for skill in SKILLS_DB if _skill_pattern(skill).search(text))
    # dict.fromkeys de-duplicates while keeping order, should the DB ever
    # list the same skill twice.
    return list(dict.fromkeys(found))

# Match the resume text against SKILLS_DB; the bare `skills` displays the list.
skills = extract_skills(resume_text)
skills


['mysql', 'sql', 'python', 'machine learning', 'java']

STEP 9: Extract Education

In [None]:
EDU_KEYWORDS = ["bachelor", "b.tech", "higher secondary", "secondary"]

# Keep every resume line that mentions an education keyword (case-insensitive),
# stripped of surrounding whitespace.
matching_lines = (
    raw_line.strip()
    for raw_line in resume_text.split("\n")
    if any(keyword in raw_line.lower() for keyword in EDU_KEYWORDS)
)

# dict.fromkeys drops repeated lines while preserving first-seen order.
education = list(dict.fromkeys(matching_lines))

education




['Bachelor of Technology in Information Technology 2022 - 2026',
 'Higher Secondary Education(12th Grade) 2021 - 2022',
 'Secondary School Education(10th Grade) 2019 - 2020']

STEP 10: Extract Experience

In [None]:
import re

def extract_experience(text):
    """Find year ranges and "N years" phrases in ``text``.

    Recognised forms (matched case-insensitively):
      * ``YYYY - YYYY``, with a hyphen, en dash or em dash and optional spaces
      * ``YYYY - present`` / ``YYYY - current``
      * ``N years`` / ``N+ years`` / ``1 year``

    NOTE: matching is purely textual, so year ranges from any section of the
    text (for example education dates) are returned as well.

    Args:
        text: Plain text to scan.

    Returns:
        A list of the distinct matches, lower-cased, in the order they first
        appear in ``text``.
    """
    pattern = re.compile(
        # \u2013 / \u2014 are the en and em dashes PDF text often contains.
        r"\b\d{4}\s*[-\u2013\u2014]\s*(?:\d{4}|present|current)\b"
        r"|\b\d+\+?\s+years?\b"
    )
    matches = pattern.findall(text.lower())
    # dict.fromkeys removes duplicates but, unlike set(), keeps document order.
    return list(dict.fromkeys(matches))

# Scan the resume for date ranges / "N years" phrases; the bare `experience`
# displays the result.
experience = extract_experience(resume_text)
experience



['2019 - 2020', '2021 - 2022', '2022 - 2026']

STEP 11: Store Extracted Data in JSON

In [None]:
import json

# Gather every extracted field into one JSON-serialisable record.
parsed_resume = {
    "name": name,
    "email": email,
    "phone": phone,
    "skills": skills,
    "education": education,
    "experience": experience
}

# Write with an explicit UTF-8 encoding (not the platform default) and keep
# non-ASCII characters, e.g. accented names, readable instead of \uXXXX-escaped.
with open("parsed_resume.json", "w", encoding="utf-8") as f:
    json.dump(parsed_resume, f, indent=4, ensure_ascii=False)

parsed_resume


{'name': 'Yash Kedar',
 'email': 'yashkedar19@gmail.com',
 'phone': '+91 7249611609',
 'skills': ['mysql', 'sql', 'python', 'machine learning', 'java'],
 'education': ['Bachelor of Technology in Information Technology 2022 - 2026',
  'Higher Secondary Education(12th Grade) 2021 - 2022',
  'Secondary School Education(10th Grade) 2019 - 2020'],
 'experience': ['2019 - 2020', '2021 - 2022', '2022 - 2026']}

STEP 12: Download JSON File

In [None]:
# Hand the JSON file written in Step 11 to Colab's download helper.
files.download("parsed_resume.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>