## Phase 1 - Setup & Data Ingestion

In [1]:
# Loading dependencies

import fitz  # PyMuPDF
import docx
import os
import pandas as pd

In [2]:
# Project structure
os.makedirs("data/raw", exist_ok=True)   # raw resumes (PDF/DOCX)
os.makedirs("data/processed", exist_ok=True)  # cleaned extracted text

In [3]:
# Extract Text from pdf
def extract_text_from_pdf(file_path: str) -> str:
    """Extract text from a PDF file using PyMuPDF."""
    text = ""
    with fitz.open(file_path) as doc:
        for page in doc:
            text += page.get_text("text") + "\n"
    return text.strip()

In [4]:
# Extract Text from docx
def extract_text_from_docx(file_path: str) -> str:
    """Extract text from a DOCX file using python-docx."""
    doc = docx.Document(file_path)
    text = "\n".join([para.text for para in doc.paragraphs])
    return text.strip()

In [5]:
# Function to handle multiple file types
def extract_resume_text(file_path: str) -> str:
    """Extract text depending on file type (PDF/DOCX)."""
    ext = os.path.splitext(file_path)[1].lower()
    
    if ext == ".pdf":
        return extract_text_from_pdf(file_path)
    elif ext == ".docx":
        return extract_text_from_docx(file_path)
    else:
        raise ValueError(f"Unsupported file type: {ext}")

In [6]:
# Load and process multiple files

# Example: Process sample resumes in data/raw directory
resume_dir = "data/raw"

extracted_data = []

for filename in os.listdir(resume_dir):
    file_path = os.path.join(resume_dir, filename)
    try:
        text = extract_resume_text(file_path)
        extracted_data.append({"filename": filename, "text": text})
    except Exception as e:
        print(f"Error processing {filename}: {e}")

# Convert to DataFrame for inspection
df_resumes = pd.DataFrame(extracted_data)
df_resumes.head()

Unnamed: 0,filename,text
0,candidate_018.pdf,EVELYNN ADAMS\nGRADUATE FRESHER\nPROFESSIONAL ...
1,1901841_RESUME.pdf,ANUVA GOYAL \n \nD.O.B.: 1st October 2000 \nGe...
2,AnuvaGoyal_Latex.pdf,ANUVA GOYAL\n[ anuvagoyal111@gmail.com\n½ Agra...
3,candidate_056.pdf,Christian Von\nKelin\nJ U N I O R A N A L Y S...
4,candidate_042.pdf,Ryan Nelson\nF R E S H E R S O F T W A R E D...


In [7]:
# Save the cleaned text

# Save processed results
output_path = "data/processed/resumes.csv"
df_resumes.to_csv(output_path, index=False)
print(f"✅ Processed resumes saved at {output_path}")

✅ Processed resumes saved at data/processed/resumes.csv


## Phase 2 - Information Extraction (NLP - Pipeline)

In [8]:
import re
import spacy

# Load spaCy model 
nlp = spacy.load("en_core_web_trf")

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Extract emails and phone numbers with regex

def extract_email(text: str):
    """Extract the first email found in the text."""
    match = re.search(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
    return match.group(0) if match else None


def extract_phone(text: str):
    """Extract the first phone number (very general pattern)."""
    match = re.search(r"(\+?\d{1,3})?[\s\-]?\(?\d{2,4}\)?[\s\-]?\d{3,4}[\s\-]?\d{3,4}", text)
    return match.group(0) if match else None

In [10]:
# Extract candidate name with spaCY NER

def extract_name(text: str):
    """Extract a candidate's name using spaCy NER (PERSON entity)."""
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            return ent.text
    return None

In [11]:
# Extract skills with dictionary lookup and semantic approach

# Example skills dictionary 
skills_list = [
    "Python", "Java", "C++", "SQL", "Machine Learning", "Deep Learning",
    "NLP", "Computer Vision", "TensorFlow", "PyTorch",
    "React", "Node.js", "AWS", "Docker", "Kubernetes"
]

def extract_skills(text: str, skills=skills_list):
    """Extract matching skills from text (case-insensitive)."""
    found_skills = []
    for skill in skills:
        pattern = r"\b" + re.escape(skill) + r"\b"
        if re.search(pattern, text, flags=re.IGNORECASE):
            found_skills.append(skill)
    return list(set(found_skills))

In [12]:
# Apply extraction pipeline to resumes

def parse_resume(text: str):
    """Run the full information extraction pipeline on resume text."""
    return {
        "name": extract_name(text),
        "email": extract_email(text),
        "phone": extract_phone(text),
        "skills": extract_skills(text)
    }

# Apply to our processed dataframe
parsed_data = []

for _, row in df_resumes.iterrows():
    details = parse_resume(row["text"])
    details["filename"] = row["filename"]
    parsed_data.append(details)

df_parsed = pd.DataFrame(parsed_data)
df_parsed.head()

  with torch.cuda.amp.autocast(self._mixed_precision):


Unnamed: 0,name,email,phone,skills,filename
0,,,,"[Machine Learning, Deep Learning, Python, Java...",candidate_018.pdf
1,ANUVA GOYAL,anuvagoyal111@gmail.com,+91 9520349542,"[Computer Vision, SQL, Machine Learning, NLP, ...",1901841_RESUME.pdf
2,,anuvagoyal111@gmail.com,,"[Computer Vision, SQL, Machine Learning, NLP, ...",AnuvaGoyal_Latex.pdf
3,Christian Von,,,"[Python, Machine Learning]",candidate_056.pdf
4,Ryan Nelson,,,"[Python, Java, Machine Learning]",candidate_042.pdf


In [13]:
# Save parsed resume information

output_parsed_path = "data/processed/resumes_parsed.csv"
df_parsed.to_csv(output_parsed_path, index=False)
print(f"✅ Parsed resume info saved at {output_parsed_path}")

✅ Parsed resume info saved at data/processed/resumes_parsed.csv


## Phase 3 - Embeddings and Semantic Search

In [14]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm

In [15]:
# 1) Prepare data: combine raw text + any important parsed fields into a single "document" to embed
# Merging the two files: df_resumes has columns ['filename','text'] 
# and df_parsed has ['filename','name','email','phone','skills']

# Merge on filename (if not already merged)
df = df_resumes.merge(df_parsed, on="filename", how="left")

# Create a single field to embed (you can tweak how much metadata to include)
def make_document(row):
    parts = []
    if pd.notna(row.get("name")): parts.append(str(row["name"]))
    parts.append(row["text"][:2000])   # truncate to reasonable length for speed
    if pd.notna(row.get("skills")):
        # skills might be list-like or string; ensure string
        skills = row["skills"]
        if isinstance(skills, (list, tuple)):
            parts.append("Skills: " + ", ".join(skills))
        else:
            parts.append("Skills: " + str(skills))
    return "\n".join(parts)

df["document"] = df.apply(make_document, axis=1)
len(df), df.columns

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
# 2) Load sentence-transformers model
# I recommend "all-MiniLM-L6-v2" for good speed/quality trade-off
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

In [None]:
# 3) Create embeddings (batching for speed)
batch_size = 64
documents = df["document"].tolist()
embeddings = []

for i in tqdm(range(0, len(documents), batch_size)):
    batch = documents[i:i+batch_size]
    emb = model.encode(batch, show_progress_bar=False, convert_to_numpy=True, normalize_embeddings=True)
    embeddings.append(emb)

embeddings = np.vstack(embeddings)   # shape: (N, D)
embeddings.shape

In [None]:
# 4) Build FAISS index (cosine similarity via normalized vectors -> inner product)
d = embeddings.shape[1]   # embedding dimension
index = faiss.IndexFlatIP(d)   # inner-product index (works with normalized vectors)
index.add(embeddings)         # add vectors

print("FAISS index: n_vectors =", index.ntotal)

In [None]:
# 5) Save index and metadata for persistence
os.makedirs("models/faiss", exist_ok=True)
faiss.write_index(index, "models/faiss/resumes_index.faiss")
np.save("models/faiss/resume_embeddings.npy", embeddings)

# Save metadata (filenames + parsed fields)
df_meta = df[["filename", "name", "email", "phone", "skills", "document"]].copy()
df_meta.to_pickle("models/faiss/resume_metadata.pkl")
print("Saved FAISS index and metadata.")

In [None]:
# 6) Search function: embed query, query FAISS, return ranked results
def search_resumes(query: str, top_k: int = 5):
    # 1) embed query (and normalize)
    q_emb = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    
    # 2) search
    scores, idxs = index.search(q_emb, top_k)
    scores = scores[0]      # shape (top_k,)
    idxs = idxs[0]          # shape (top_k,)
    results = []
    
    for score, i in zip(scores, idxs):
        if i == -1:
            continue
        row = df_meta.iloc[i]
        results.append({
            "filename": row["filename"],
            "name": row["name"],
            "email": row["email"],
            "phone": row["phone"],
            "skills": row["skills"],
            "score": float(score),
            "snippet": row["document"][:500]   # short preview
        })
    return results

In [None]:
# 7) Example queries
queries = [
    "machine learning engineer with PyTorch and NLP experience",
    "frontend engineer with React and Node.js",
    "data scientist with SQL and deep learning"
]

for q in queries:
    print("\nQuery:", q)
    res = search_resumes(q, top_k=3)
    for i, r in enumerate(res, 1):
        print(f"{i}. {r['filename']} | name: {r['name']} | score: {r['score']:.4f}")
        print("   skills:", r["skills"])
        print("   snippet:", r["snippet"][:200].replace("\n"," ") + "...")

In [None]:
# 8) Add new resume(s) incrementally (without rebuilding entire index)

# - Extract text -> build document -> embed -> add to index -> append to df_meta and save embeddings

def add_new_resume(file_path: str, filename: str=None):
    # extract text using your earlier function
    text = extract_resume_text(file_path)   # from Phase 1
    
    # parse details using parse_resume (from Phase 2)
    parsed = parse_resume(text)
    
    # build document
    row = {
        "filename": filename or os.path.basename(file_path),
        "name": parsed.get("name"),
        "email": parsed.get("email"),
        "phone": parsed.get("phone"),
        "skills": parsed.get("skills"),
        "document": make_document({"name": parsed.get("name"), "text": text, "skills": parsed.get("skills")})
    }
    
    # embed
    emb = model.encode([row["document"]], convert_to_numpy=True, normalize_embeddings=True)
    
    # add to index
    index.add(emb)
    
    # append to metadata dataframe
    global df_meta
    df_meta = pd.concat([df_meta, pd.DataFrame([row])], ignore_index=True)
    
    # optionally save updated index + metadata
    faiss.write_index(index, "models/faiss/resumes_index.faiss")
    np.save("models/faiss/resume_embeddings.npy", np.vstack([embeddings, emb]))
    df_meta.to_pickle("models/faiss/resume_metadata.pkl")
    print(f"Added {row['filename']} to index.")