In [2]:
!pip -q install spacy pdfplumber pytesseract opencv-python pillow pandas numpy
!python -m spacy download en_core_web_sm


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m92.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en

In [3]:
from google.colab import files

# Open Colab's upload widget; the returned mapping is keyed by uploaded filename.
uploaded = files.upload()

# Preview (at most) the first five uploaded filenames as the cell output.
list(uploaded)[:5]


Saving cv_moka.pdf to cv_moka.pdf


['cv_moka.pdf']

Data Extraction:

In [4]:
import re
import json
import pandas as pd
import numpy as np

import pdfplumber
from PIL import Image
import pytesseract
import spacy

# Load the small English spaCy pipeline once at module level; the helpers that
# call nlp(...) share this single instance.
nlp = spacy.load("en_core_web_sm")

# ---------- Helpers ----------
# E-mail: local part, "@", then one or more dot-separated domain labels.
# Each label excludes ".", so a sentence-ending period after the address
# ("... at a@b.com.") is not captured as part of the match.
EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+")
# Phone: optional country code, 3-digit area code (optionally parenthesised),
# then 3 + 4 digits, with optional space/dash/dot separators.
# The (?<!\d) / (?!\d) guards stop the pattern from matching a slice of a longer
# digit run (IDs, account numbers). Group numbering is unchanged.
PHONE_RE = re.compile(
    r"(?<!\d)(\+?\d{1,3}[\s\-\.]?)?(\(?\d{3}\)?[\s\-\.]?)\d{3}[\s\-\.]?\d{4}(?!\d)"
)

def clean_text(text: str) -> str:
    """Normalise whitespace in ``text``.

    Form feeds (emitted by OCR) become spaces, every run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is removed.
    Any falsy input (``None`` or ``""``) yields the empty string.
    """
    if text:
        without_form_feeds = text.replace("\x0c", " ")  # form feed from OCR
        return re.sub(r"\s+", " ", without_form_feeds).strip()
    return ""

def extract_text_from_pdf(path: str) -> str:
    """Return the whitespace-normalised text of every page of the PDF at ``path``.

    Uses pdfplumber's regular text extraction; a page for which
    ``extract_text()`` returns nothing contributes an empty string.
    """
    with pdfplumber.open(path) as pdf:
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Pages are joined with newlines, which clean_text then collapses to spaces.
    return clean_text("\n".join(page_texts))

def extract_text_from_image(path: str) -> str:
    """OCR the image at ``path`` with Tesseract and return normalised text.

    Args:
        path: Filesystem path of the image to read.

    Returns:
        The recognised text after ``clean_text`` whitespace normalisation.
    """
    # Image.open keeps the underlying file open until the image is closed;
    # the context manager releases the handle even if OCR raises.
    with Image.open(path) as img:
        text = pytesseract.image_to_string(img)
    return clean_text(text)

def get_email(text: str):
    """Return the first substring of ``text`` matched by ``EMAIL_RE``, or ``None``."""
    match = EMAIL_RE.search(text)
    if match is None:
        return None
    return match.group(0)

def get_phone(text: str):
    """Return the first ``PHONE_RE`` match in ``text`` (whitespace-normalised), or ``None``."""
    hit = PHONE_RE.search(text)
    return clean_text(hit.group(0)) if hit else None

def get_name_spacy(text: str):
    """Guess the candidate's name from the start of the resume.

    Heuristic: run spaCy over the first 2000 characters only (for speed) and
    return the first PERSON entity made of at least two whitespace-separated
    words, stripped. Returns ``None`` when no such entity is found.
    """
    head = text[:2000]
    multi_word_people = (
        ent.text.strip()
        for ent in nlp(head).ents
        # single-word PERSON hits are skipped: too often ordinary words
        if ent.label_ == "PERSON" and len(ent.text.split()) >= 2
    )
    return next(multi_word_people, None)

# Minimal skills list (you can expand later)
SKILLS = [
    "python","sql","excel","power bi","tableau","machine learning","deep learning",
    "pandas","numpy","scikit-learn","tensorflow","keras","nlp","spacy","opencv",
    "git","github","aws","linux","statistics","data analysis","data visualization"
]

def get_skills(text: str):
    """Return the sorted, title-cased entries of ``SKILLS`` mentioned in ``text``.

    Matching is case-insensitive and token-bounded: a skill only counts when it
    is not embedded in a longer word. Plain substring search reported "Excel"
    for "excellent", "Git" for "digital" and "Sql" for "mysql".

    Args:
        text: Resume text; ``None`` or ``""`` is treated as no text.

    Returns:
        Alphabetically sorted list of unique matched skills, each passed
        through ``str.title()`` (e.g. ``"sql"`` -> ``"Sql"``).
    """
    haystack = (text or "").lower()
    found = set()
    for skill in SKILLS:
        # No letter/digit directly before the skill and no letter directly
        # after it; trailing digits stay allowed so "python3" still counts.
        pattern = r"(?<![a-z0-9])" + re.escape(skill) + r"(?![a-z])"
        if re.search(pattern, haystack):
            found.add(skill.title())
    return sorted(found)

def parse_resume(text: str, filename: str):
    """Build one flat record of the fields extracted from a resume.

    ``text`` is whitespace-normalised first; every extractor then runs on the
    normalised text. The returned dict has the keys ``file``, ``name``,
    ``email``, ``phone``, ``skills`` and ``raw_text_preview``.
    """
    cleaned = clean_text(text)
    record = {"file": filename}
    record["name"] = get_name_spacy(cleaned)
    record["email"] = get_email(cleaned)
    record["phone"] = get_phone(cleaned)
    record["skills"] = get_skills(cleaned)
    # preview only: the first 400 characters of the normalised text
    record["raw_text_preview"] = cleaned[:400]
    return record

def extract_text_auto(filename: str) -> str:
    """Extract text from ``filename``, choosing the extractor by file extension.

    ``.pdf`` goes to ``extract_text_from_pdf``; ``.png`` / ``.jpg`` / ``.jpeg``
    go to ``extract_text_from_image``. The extension check is case-insensitive.
    Any other extension yields ``""``.
    """
    name_lc = filename.lower()
    if name_lc.endswith((".png", ".jpg", ".jpeg")):
        return extract_text_from_image(filename)
    if name_lc.endswith(".pdf"):
        # A scanned PDF (no text layer) is NOT sent through OCR here: PDF pages
        # are never rendered to images. Simple approach for Phase 1: the user
        # is expected to upload an image, or a PDF that already contains text.
        return extract_text_from_pdf(filename)
    return ""


Run extraction on uploaded file:

In [5]:
# Parse every uploaded file into one record each.
results = []
for fname in uploaded:  # iterating the mapping yields the uploaded filenames
    text = extract_text_auto(fname)
    record = parse_resume(text, fname)
    results.append(record)

# One row per resume; displayed as the cell output.
df = pd.DataFrame(results)
df


Unnamed: 0,file,name,email,phone,skills,raw_text_preview
0,cv_moka.pdf,Khalil Mokraoui,khalil.mokraoui@protonmail.com,506 012 1234,[Excel],Khalil Mokraoui 506 012 1234 khalil.mokraoui@p...


In [6]:
# ---------- Phase 3: Named Entity Recognition ----------

# Only entities carrying one of these spaCy labels are kept.
TARGET_LABELS = {"PERSON", "ORG", "GPE", "DATE"}

def extract_entities_spacy(text):
    """Run the spaCy pipeline over ``text`` and return its entities of interest.

    Returns a list of ``{"entity_text", "entity_label"}`` dicts, in document
    order, for each entity whose label is in ``TARGET_LABELS``. The entity text
    is stripped of surrounding whitespace.
    """
    return [
        {"entity_text": ent.text.strip(), "entity_label": ent.label_}
        for ent in nlp(text).ents
        if ent.label_ in TARGET_LABELS
    ]

# Apply NER on the full resume text.
# `raw_text_preview` holds only the first 400 characters (see parse_resume), so
# running NER on it misses every entity past the resume header. The complete
# text is not stored in `df`, so it is re-extracted from each source file.
df["entities"] = df["file"].apply(
    lambda fname: extract_entities_spacy(clean_text(extract_text_auto(fname)))
)

# Show result
df[["file", "name", "entities"]]


Unnamed: 0,file,name,entities
0,cv_moka.pdf,Khalil Mokraoui,"[{'entity_text': 'Khalil Mokraoui', 'entity_la..."


Save Output:

In [5]:
# CSV export of the results table (the index is omitted).
df.to_csv("parsed_resumes.csv", index=False)

# Serialise from `df`, not from `results`: columns added to `df` after `results`
# was built (such as `entities`) would otherwise appear in the CSV but be
# missing from the JSON.
with open("parsed_resumes.json", "w") as f:
    json.dump(df.to_dict(orient="records"), f, indent=2)

# Trigger browser downloads of both exports from the Colab runtime.
from google.colab import files
files.download("parsed_resumes.csv")
files.download("parsed_resumes.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>