
# Rocket Matching — Interview + Resume Ingestion (Role/Interests/Skills/YearsExp)

This notebook extends the hybrid matcher by:
- Capturing **role, interests, skills, years_exp** during the interview
- **Resume ingestion** (PDF/DOCX) → text extraction → skills/interests/orgs/locations enrichment
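- Merging interview and resume signals with **interview-first priority**: interview answers are listed first, resume-derived items are appended after them, and each list is capped at 20 entries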


In [1]:

# !pip install numpy pandas scikit-learn networkx geopy spacy keybert sentence-transformers scipy pdfplumber python-docx
# !python -m spacy download en_core_web_sm


In [2]:

import numpy as np, pandas as pd, os
from typing import List, Tuple, Dict, Any, Optional
import importlib
# Fix the seed of NumPy's global random state so that any NumPy-based
# randomness is repeatable from one run of the notebook to the next.
np.random.seed(42)


## 1) Interview schema

In [3]:

# Interview prompts, keyed by the name of the answer field each one fills.
# Insertion order is the order in which the questions are listed here.
INTERVIEW_QUESTIONS = dict(
    human="Tell me about yourself: age range, city/country, lifestyle, hobbies, pets.",
    role="What best describes your role (Founder, Engineer, Designer, Researcher, etc.)?",
    interests="List a few interests (comma-separated).",
    skills="List your key skills (comma-separated).",
    years_exp="How many years of professional experience? (integer)",
    professional="Notable employers/clients and things you can produce?",
    contributor="How do you like to work? Past projects? What do you bring?",
    reason="Why are you joining Rocket? (Expand network, find projects, collaborators, build a dream...)",
)


## 2) Resume ingestion utils (PDF/DOCX → text)

In [4]:

def read_pdf_text(file_path: str) -> str:
    """Return the text of a PDF, one entry per page, joined with newlines.

    Args:
        file_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        The extracted page texts joined by ``"\\n"``. A page whose extraction
        yields nothing contributes an empty string; a page whose extraction
        raises is left out entirely.

    Raises:
        ImportError: If ``pdfplumber`` cannot be imported.
    """
    try:
        import pdfplumber
    except Exception:
        raise ImportError("pdfplumber not installed. pip install pdfplumber")

    page_texts = []
    with pdfplumber.open(file_path) as document:
        for pdf_page in document.pages:
            try:
                extracted = pdf_page.extract_text()
            except Exception:
                # Best effort: one unreadable page must not lose the whole resume.
                continue
            page_texts.append(extracted or "")
    return "\n".join(page_texts)

def read_docx_text(file_path: str) -> str:
    """Return the paragraph text of a DOCX file, joined with newlines.

    Only ``Document.paragraphs`` is read; nothing else in the document is
    consulted.

    Args:
        file_path: Path of the ``.docx`` file, passed to ``docx.Document``.

    Returns:
        The ``text`` of every paragraph joined by ``"\\n"``.

    Raises:
        ImportError: If the ``docx`` module (python-docx) cannot be imported.
    """
    try:
        import docx
    except Exception:
        raise ImportError("python-docx not installed. pip install python-docx")

    paragraphs = docx.Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

def read_resume_text(file_path: Optional[str]) -> str:
    """Read a resume file and return its text, choosing the reader by extension.

    Args:
        file_path: Path to a ``.pdf`` or ``.docx`` file. ``None`` or an empty
            string means "no resume".

    Returns:
        The extracted text, or ``""`` when no path is given.

    Raises:
        ValueError: If the (case-insensitive) extension is neither ``.pdf``
            nor ``.docx``.
    """
    if not file_path:
        return ""

    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()
    if suffix not in (".pdf", ".docx"):
        raise ValueError("Unsupported resume type. Please upload .pdf or .docx")
    return read_pdf_text(file_path) if suffix == ".pdf" else read_docx_text(file_path)


## 3) NLP extraction (spaCy + KeyBERT)

In [5]:

# Loaded spaCy pipelines keyed by model name, so the model is read from disk
# once per kernel session instead of on every extraction call.
_SPACY_PIPELINES: Dict[str, Any] = {}

def ensure_loaded_spacy():
    """Return the ``en_core_web_sm`` spaCy pipeline, loading it on first use.

    The loaded pipeline is cached at module level and the same object is
    returned on later calls.

    Returns:
        The object returned by ``spacy.load("en_core_web_sm")``.

    Raises:
        ImportError: If the ``spacy`` package cannot be found.
        RuntimeError: If ``spacy.load`` fails for the model (chained to the
            original exception).
    """
    # A bare `import importlib` does not guarantee that the `importlib.util`
    # submodule is loaded, so import it explicitly before using it.
    import importlib.util

    model_name = "en_core_web_sm"
    cached = _SPACY_PIPELINES.get(model_name)
    if cached is not None:
        return cached

    if importlib.util.find_spec("spacy") is None:
        raise ImportError("spaCy not installed. Run: pip install spacy && python -m spacy download en_core_web_sm")
    import spacy
    try:
        nlp = spacy.load(model_name)
    except Exception as e:
        raise RuntimeError("spaCy model not found. Run: python -m spacy download en_core_web_sm") from e

    _SPACY_PIPELINES[model_name] = nlp
    return nlp

# Constructed KeyBERT models keyed by embedding-model name, so the sentence
# transformer is built once per kernel session instead of on every call.
_KEYBERT_MODELS: Dict[str, Any] = {}

def ensure_loaded_keybert():
    """Return a KeyBERT model backed by ``all-MiniLM-L6-v2``, built on first use.

    The constructed model is cached at module level and the same object is
    returned on later calls.

    Returns:
        ``KeyBERT(model=SentenceTransformer("all-MiniLM-L6-v2"))``.

    Raises:
        ImportError: If the ``keybert`` package cannot be found.
    """
    # A bare `import importlib` does not guarantee that the `importlib.util`
    # submodule is loaded, so import it explicitly before using it.
    import importlib.util

    embedding_name = "all-MiniLM-L6-v2"
    cached = _KEYBERT_MODELS.get(embedding_name)
    if cached is not None:
        return cached

    if importlib.util.find_spec("keybert") is None:
        raise ImportError("KeyBERT not installed. Run: pip install keybert sentence-transformers")
    from keybert import KeyBERT
    from sentence_transformers import SentenceTransformer

    kw_model = KeyBERT(model=SentenceTransformer(embedding_name))
    _KEYBERT_MODELS[embedding_name] = kw_model
    return kw_model

# Default vocabulary used to decide whether a keyphrase describes a skill.
DEFAULT_SKILL_TERMS = (
    "python", "ml", "machine learning", "nlp", "design", "product", "kubernetes", "aws",
    "go", "django", "pytorch", "data", "ux", "branding", "marketing", "growth", "strategy", "react", "sql",
)

# Terms this short must match a whole word: as raw substrings they fire inside
# unrelated words ("go" in "google", "ml" in "html", "ux" in "linux").
_WHOLE_WORD_MAX_LEN = 3


def _partition_keyphrases(keyphrases, skill_terms):
    """Split keyphrases into ``(skills, interests)``, preserving their order."""
    import re

    terms = [term.lower() for term in skill_terms if term]
    skills, interests = [], []
    for phrase in keyphrases:
        lowered = phrase.lower()
        words = set(re.split(r"[^a-z0-9+#]+", lowered))
        is_skill = any(
            (term in words) if len(term) <= _WHOLE_WORD_MAX_LEN else (term in lowered)
            for term in terms
        )
        (skills if is_skill else interests).append(phrase)
    return skills, interests


def extract_fields_from_text(text: str, top_k=15, skill_terms: Optional[List[str]] = None):
    """Extract locations, organisations, skills and interests from free text.

    Named entities come from the spaCy pipeline (labels ``GPE``/``LOC`` for
    locations, ``ORG`` for organisations). Keyphrases come from KeyBERT and
    are split into skills and interests: a keyphrase is a skill when it
    mentions one of ``skill_terms``, otherwise it is an interest.

    Skill terms of up to three characters must match a whole word of the
    keyphrase; longer terms match as substrings (so "designer" still counts
    for "design").

    Args:
        text: The text to analyse (e.g. resume text).
        top_k: Number of keyphrases requested from KeyBERT (``top_n``).
        skill_terms: Optional vocabulary overriding ``DEFAULT_SKILL_TERMS``.

    Returns:
        A dict with keys ``locations``, ``orgs``, ``skills``, ``interests``
        (each de-duplicated in first-seen order; skills and interests capped
        at 20) and ``keyphrases`` (the raw KeyBERT phrases, in order).
        Blank text yields empty lists without loading any model.

    Raises:
        ImportError, RuntimeError: Propagated from ``ensure_loaded_spacy`` /
            ``ensure_loaded_keybert`` when a dependency is missing.
    """
    if not text or not text.strip():
        # Nothing to analyse; skip the (expensive) model loading entirely.
        return {"locations": [], "orgs": [], "skills": [], "interests": [], "keyphrases": []}

    nlp = ensure_loaded_spacy()
    kw_model = ensure_loaded_keybert()
    doc = nlp(text)

    locations = [ent.text for ent in doc.ents if ent.label_ in {"GPE", "LOC"}]
    orgs = [ent.text for ent in doc.ents if ent.label_ in {"ORG"}]

    kp = kw_model.extract_keywords(
        text, keyphrase_ngram_range=(1,3), stop_words="english",
        top_n=top_k, use_mmr=True, diversity=0.6
    )
    keyphrases = [phrase for phrase, _score in kp]

    terms = DEFAULT_SKILL_TERMS if skill_terms is None else tuple(skill_terms)
    skills, interests = _partition_keyphrases(keyphrases, terms)

    def dedupe(seq):
        # dict keys keep first-seen order, giving an O(n) ordered de-duplication.
        return list(dict.fromkeys(seq))

    return {
        "locations": dedupe(locations),
        "orgs": dedupe(orgs),
        "skills": dedupe(skills)[:20],
        "interests": dedupe(interests)[:20],
        "keyphrases": keyphrases
    }


## 4) Interview normalizer + resume merge

In [6]:

def parse_comma_list(s: str) -> List[str]:
    """Split a comma-separated string into trimmed, non-empty items.

    A falsy ``s`` (``None`` or ``""``) yields an empty list.
    """
    items = []
    for piece in (s or "").split(","):
        cleaned = piece.strip()
        if cleaned:
            items.append(cleaned)
    return items

def _parse_years_exp(raw: Any) -> int:
    """Best-effort conversion of a 'years of experience' answer to an int.

    Takes the integer part of the first unsigned number found in the answer
    ("6 years" -> 6, "5.5" -> 5, "6+" -> 6) and returns 0 when there is none.
    """
    import re

    match = re.search(r"(\d+)(?:\.\d+)?", "" if raw is None else str(raw))
    return int(match.group(1)) if match else 0


def normalize_interview(answers: Dict[str,str]) -> Dict[str,Any]:
    """Turn raw interview answers into a normalized profile dict.

    Missing or ``None`` answers are treated as empty. Only the ``role``,
    ``interests``, ``skills``, ``years_exp``, ``professional`` and ``reason``
    answers are read.

    Args:
        answers: Mapping from interview question key to the free-text answer.

    Returns:
        A dict with:
        - ``role``: stripped answer, or ``"Undecided"`` when empty.
        - ``interests`` / ``skills``: at most 20 trimmed items, re-joined
          with ``", "``.
        - ``years_exp``: non-negative int parsed leniently from the answer
          (0 when no number is present), so free-text replies such as
          "6 years" do not raise.
        - ``bio``: the stripped ``professional`` answer, cut to 200 characters.
        - ``reason_for_joining``: the stripped ``reason`` answer.
    """
    def text_of(key: str) -> str:
        # Treat a missing key and an explicit None the same way.
        value = answers.get(key)
        return "" if value is None else str(value).strip()

    return {
        "role": text_of("role") or "Undecided",
        "interests": ", ".join(parse_comma_list(text_of("interests"))[:20]),
        "skills": ", ".join(parse_comma_list(text_of("skills"))[:20]),
        "years_exp": _parse_years_exp(answers.get("years_exp")),
        "bio": text_of("professional")[:200],
        "reason_for_joining": text_of("reason"),
    }

def _merge_unique(primary: List[str], secondary: List[str], limit: int = 20) -> List[str]:
    """Concatenate two lists, dropping case-insensitive repeats, keeping order."""
    seen = set()
    merged = []
    for item in list(primary) + list(secondary):
        # Compare on a case- and whitespace-normalized key, keep first spelling.
        key = " ".join(item.split()).casefold()
        if key in seen:
            continue
        seen.add(key)
        merged.append(item)
    return merged[:limit]


def merge_interview_resume(interview: Dict[str,Any], resume_text: str):
    """Enrich a normalized interview profile with fields extracted from a resume.

    Interview items come first and resume-derived items are appended after
    them, so interview answers take precedence when the 20-item cap applies.
    Duplicates are removed case-insensitively (e.g. "Python" from the
    interview and "python" from the resume count once, keeping the
    interview's spelling).

    Args:
        interview: Profile dict whose ``skills`` and ``interests`` values are
            comma-separated strings. It is not modified.
        resume_text: Resume text; when empty, no extraction is attempted.

    Returns:
        A copy of ``interview`` with ``skills`` and ``interests`` replaced by
        the merged ``", "``-joined lists, plus ``resume_locations``,
        ``resume_orgs`` and ``resume_keyphrases`` (empty lists when there is
        no resume text).
    """
    if resume_text:
        extracted = extract_fields_from_text(resume_text)
    else:
        extracted = {"skills": [], "interests": [], "locations": [], "orgs": [], "keyphrases": []}

    skills = _merge_unique(
        parse_comma_list(interview.get("skills", "")),
        extracted.get("skills", []),
    )
    interests = _merge_unique(
        parse_comma_list(interview.get("interests", "")),
        extracted.get("interests", []),
    )

    merged = dict(interview)
    merged.update({
        "skills": ", ".join(skills),
        "interests": ", ".join(interests),
        "resume_locations": extracted.get("locations", []),
        "resume_orgs": extracted.get("orgs", []),
        "resume_keyphrases": extracted.get("keyphrases", [])
    })
    return merged


## 5) Demo usage

In [7]:

# Sample interview answers, one entry per interview question key.
demo_answers = dict(
    human="32, Toronto; runs & cooks; has a dog.",
    role="Engineer",
    interests="creator tools, healthcare AI, community projects, open source",
    skills="python, django, react, aws, ml, data",
    years_exp="6",
    professional="Built recommender POCs and full‑stack MVPs; ex‑fintech.",
    contributor="Async, milestone‑driven; I bring velocity and reliability.",
    reason="Find collaborators and expand my network",
)

# Normalize the interview first; it does not depend on the resume.
interview_norm = normalize_interview(demo_answers)

# Optional resume: leave as None to run the demo on interview answers alone.
resume_path = None  # set to '/mnt/data/resume.pdf' or '/mnt/data/resume.docx' if you upload one
if resume_path:
    resume_text = read_resume_text(resume_path)
else:
    resume_text = ""

merged_profile = merge_interview_resume(interview_norm, resume_text)
merged_profile


{'role': 'Engineer',
 'interests': 'creator tools, healthcare AI, community projects, open source',
 'skills': 'python, django, react, aws, ml, data',
 'years_exp': 6,
 'bio': 'Built recommender POCs and full‑stack MVPs; ex‑fintech.',
 'reason_for_joining': 'Find collaborators and expand my network',
 'resume_locations': [],
 'resume_orgs': [],
 'resume_keyphrases': []}