
# Rocket Matching — End‑to‑End: Interview Extraction + Hybrid Recommender (50 users)

This notebook combines:
- **Interview capture & extraction** (role, interests, skills, years_exp, reason) + unstructured parsing (spaCy + KeyBERT-ready).
- **Hybrid matching**: content (text/geo/exp/role), **skills** (similar vs complementary), CF (implicit), graph (Personalized PageRank), personality, reciprocity, MMR diversification.
- **Synthetic cohort (n=50)** with diverse backgrounds: creators, scientists, writers, ML engineers, brand founders, ocean projects, YouTubers, social apps, etc.

> Optional installs (run locally): `pip install spacy keybert sentence-transformers scipy pdfplumber python-docx networkx geopy scikit-learn`.


In [2]:

import numpy as np, pandas as pd, random, math
from typing import List, Tuple, Dict, Any, Optional
from dataclasses import dataclass
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
from geopy.distance import geodesic

np.random.seed(7); random.seed(7)


## Personality (TIPI → Big Five)

In [3]:

@dataclass
class BigFive:
    O: float; C: float; E: float; A: float; N: float

def clip01(x): 
    import numpy as np
    return float(np.clip(x, 0.0, 1.0))

TIPI_KEY = {
    1: ("E", False), 2: ("A", True), 3: ("C", False), 4: ("N", False), 5: ("O", False),
    6: ("E", True),  7: ("A", False),8: ("C", True),  9: ("N", True),  10:("O", True)
}

def score_tipi(responses_1to7):
    import numpy as np
    assert len(responses_1to7)==10
    r = np.array(responses_1to7, dtype=float)
    r01 = (r-1)/6.0  # 1..7 -> 0..1
    traits = {"O":[], "C":[], "E":[], "A":[], "N":[]}
    for i,val in enumerate(r01, start=1):
        trait, rev = TIPI_KEY[i]
        traits[trait].append(1.0-val if rev else val)
    return BigFive(*(clip01(np.mean(traits[t])) for t in ["O","C","E","A","N"]))

def bigfive_cosine(u: BigFive, v: BigFive) -> float:
    import numpy as np
    a = np.array([u.O,u.C,u.E,u.A,u.N])
    b = np.array([v.O,v.C,v.E,v.A,v.N])
    return float(a @ b / (np.linalg.norm(a)*np.linalg.norm(b) + 1e-9))


## Interview schema + lightweight extraction

In [4]:

INTERVIEW_FIELDS = ["human","role","interests","skills","years_exp","professional","contributor","interests_long","reason"]

def parse_comma_list(s: str) -> List[str]:
    return [x.strip() for x in (s or "").split(",") if x.strip()]

def normalize_interview(answers: Dict[str,str]) -> Dict[str,Any]:
    # Enforce <=250 words per answer for realism (truncate if needed)
    def wclip(t): 
        words = t.split()
        return " ".join(words[:250])
    answers = {k: wclip(v or "") for k,v in answers.items()}
    out = {
        "role": answers.get("role","Undecided").strip() or "Undecided",
        "interests": ", ".join(parse_comma_list(answers.get("interests",""))[:20]),
        "skills": ", ".join(parse_comma_list(answers.get("skills",""))[:20]),
        "years_exp": int(str(answers.get("years_exp","0")).strip() or 0),
        "bio": (answers.get("professional","") or "")[:220],
        "reason_for_joining": (answers.get("reason","") or "").strip(),
        "long_text": " ".join([answers.get("human",""), answers.get("contributor",""), answers.get("interests_long","")])[:1500]
    }
    return out


## Content features (text + geo + experience + role)

In [6]:

def build_text_matrix(df: pd.DataFrame) -> Tuple[TfidfVectorizer, any]:
    corpus = (df['interests'].fillna('') + " ; " + df['skills'].fillna('') + " ; " + df['bio'].fillna('')).tolist()
    vec = TfidfVectorizer(ngram_range=(1,2), min_df=1)
    X = vec.fit_transform(corpus)
    return vec, X

def geo_similarity(df: pd.DataFrame, decay_km: float = 2500.0) -> np.ndarray:
    n = len(df); S = np.zeros((n,n), dtype=float)
    coords = list(zip(df['lat'], df['lon']))
    for i in range(n):
        for j in range(n):
            if i==j: continue
            d_km = geodesic(coords[i], coords[j]).km
            S[i,j] = np.exp(-d_km/decay_km)
    if S.max()>0: S = S/S.max()
    return S

def experience_compatibility(years: List[int], sweet_spot: float = 3.0) -> np.ndarray:
    years = np.array(years); n=len(years); S=np.zeros((n,n),dtype=float)
    for i in range(n):
        for j in range(n):
            if i==j: continue
            gap = abs(years[i]-years[j])
            S[i,j] = np.exp(-((gap-sweet_spot)**2)/(2*(sweet_spot**2)))
    if S.max()>0: S = S/S.max()
    return S

ROLE_COMP = {
    "Founder": {"Engineer": 1.0, "Designer": 1.0, "Researcher": 0.8, "Founder": 0.2, "Writer":0.6, "Scientist":0.7, "Creator":0.8},
    "Engineer": {"Founder": 1.0, "Designer": 0.7, "Engineer": 0.2, "Researcher": 0.6, "Writer":0.6, "Scientist":0.8, "Creator":0.7},
    "Designer": {"Founder": 1.0, "Engineer": 0.7, "Designer": 0.2, "Researcher": 0.5, "Writer":0.6, "Scientist":0.5, "Creator":0.9},
    "Researcher": {"Founder": 0.8, "Engineer": 0.7, "Designer": 0.5, "Researcher": 0.3, "Writer":0.5, "Scientist":0.9, "Creator":0.6},
    "Writer": {"Founder":0.8, "Engineer":0.6, "Designer":0.7, "Researcher":0.5, "Writer":0.2, "Scientist":0.5, "Creator":0.9},
    "Scientist":{"Founder":0.9, "Engineer":0.9, "Designer":0.5, "Researcher":0.8, "Writer":0.5, "Scientist":0.2, "Creator":0.6},
    "Creator":{"Founder":0.9, "Engineer":0.7, "Designer":0.9, "Researcher":0.6, "Writer":0.9, "Scientist":0.6, "Creator":0.3},
    "Undecided": {"Founder":0.6,"Engineer":0.6,"Designer":0.6,"Researcher":0.6,"Writer":0.6,"Scientist":0.6,"Creator":0.6,"Undecided":0.2}
}

def role_complementarity(df: pd.DataFrame) -> np.ndarray:
    roles = df['role'].tolist(); n=len(roles); S=np.zeros((n,n),dtype=float)
    for i in range(n):
        for j in range(n):
            if i==j: continue
            S[i,j] = ROLE_COMP.get(roles[i], {}).get(roles[j], 0.2)
    return S

def combine_content(S_text, S_geo, S_exp, S_role, w=(0.45,0.2,0.15,0.2)):
    a,b,c,d = w
    S = a*S_text + b*S_geo + c*S_exp + d*S_role
    return S / (S.max() + 1e-9)


## Skills strategies: similar vs complementary (Hungarian fallback)

In [7]:

from sklearn.feature_extraction.text import TfidfVectorizer

def parse_skill_list(sk: str) -> List[str]:
    return [s.strip().lower() for s in (sk or "").split(",") if s.strip()]

def tfidf_cosine(a_list: List[str], b_list: List[str]) -> float:
    docs = ["; ".join(a_list), "; ".join(b_list)]
    vec = TfidfVectorizer(ngram_range=(1,2), min_df=1)
    X = vec.fit_transform(docs)
    return float(cosine_similarity(X[0], X[1])[0,0])

def similar_skills_matrix(df: pd.DataFrame) -> np.ndarray:
    n=len(df); S=np.zeros((n,n))
    parsed = [parse_skill_list(x) for x in df['skills'].fillna('')]
    for i in range(n):
        for j in range(n):
            if i==j: continue
            S[i,j] = tfidf_cosine(parsed[i], parsed[j])
    if S.max()>0: S = S/S.max()
    return S

def complementary_skills_matrix(df: pd.DataFrame) -> np.ndarray:
    wants = [parse_skill_list(row.get('skills_want', row.get('interests',''))) for _,row in df.iterrows()]
    haves = [parse_skill_list(row.get('skills','')) for _,row in df.iterrows()]
    n=len(df); S=np.zeros((n,n))
    try:
        from scipy.optimize import linear_sum_assignment
        for i in range(n):
            need = wants[i]
            for j in range(n):
                if i==j: continue
                have = haves[j]
                if not need or not have: 
                    S[i,j]=0.0; continue
                A = ["; ".join([n1]) for n1 in need]
                B = ["; ".join([h1]) for h1 in have]
                vec = TfidfVectorizer(ngram_range=(1,2), min_df=1)
                X = vec.fit_transform(A + B)
                m, k = len(need), len(have)
                Csim = np.zeros((m,k))
                for p in range(m):
                    for q in range(k):
                        Csim[p,q] = cosine_similarity(X[p], X[m+q])[0,0]
                size = max(m,k)
                padded = np.ones((size,size))
                padded[:m,:k] = 1.0 - Csim  # cost = 1 - sim
                r_ind, c_ind = linear_sum_assignment(padded)
                total_sim = 0.0; count = 0
                for r,c in zip(r_ind, c_ind):
                    if r < m and c < k:
                        total_sim += 1.0 - padded[r,c]; count += 1
                S[i,j] = total_sim / (count + 1e-9)
        if S.max()>0: S = S/S.max()
    except Exception:
        # Fallback: average max similarity
        for i in range(n):
            need = wants[i]
            for j in range(n):
                if i==j: continue
                have = haves[j]
                if not need or not have: 
                    S[i,j]=0.0; continue
                sims = []
                for nterm in need:
                    sims.append(max(tfidf_cosine([nterm], [h]) for h in have))
                S[i,j] = float(np.mean(sims)) if sims else 0.0
        if S.max()>0: S = S/S.max()
    return S


## CF (implicit), Graph (PPR), Personality, Fusion

In [8]:

def reciprocalize(S: np.ndarray) -> np.ndarray:
    return np.sqrt(S * S.T + 1e-12)

def fuse_scores(S_content, S_cf, S_graph, S_person, S_skills, weights=(0.35,0.2,0.15,0.15,0.15)):
    Sc = reciprocalize(S_content)
    Sf = reciprocalize(S_cf)
    Sg = reciprocalize(S_graph)
    Sp = reciprocalize(S_person)
    Ss = reciprocalize(S_skills)
    a,b,c,d,e = weights
    S = a*Sc + b*Sf + c*Sg + d*Sp + e*Ss
    return S / (S.max() + 1e-12)

def mmr(query_idx: int, S: np.ndarray, K: int = 3, lambda_rel: float = 0.7):
    n = S.shape[0]
    candidates = [i for i in range(n) if i != query_idx]
    selected = []
    while candidates and len(selected) < K:
        if not selected:
            i = max(candidates, key=lambda j: S[query_idx, j])
            selected.append(i); candidates.remove(i)
        else:
            def score(j):
                redundancy = max(S[j, s] for s in selected) if selected else 0.0
                return lambda_rel * S[query_idx, j] - (1-lambda_rel) * redundancy
            i = max(candidates, key=score)
            selected.append(i); candidates.remove(i)
    return selected


## Generate 50 diverse synthetic users

In [9]:

names = [f"User{i:02d}" for i in range(1, 51)]
roles = ["Founder","Engineer","Designer","Researcher","Writer","Scientist","Creator"]
cities = [
    ("Toronto",43.6532,-79.3832),("New York",40.7128,-74.0060),("San Francisco",37.7749,-122.4194),
    ("London",51.5072,-0.1276),("Berlin",52.52,13.405),("Nairobi", -1.286389,36.817223),
    ("Sydney",-33.8688,151.2093),("Bangalore",12.9716,77.5946),("Paris",48.8566,2.3522),("Mexico City",19.4326,-99.1332)
]

skill_bank = [
    "python","pytorch","tensorflow","django","react","nextjs","go","kubernetes","aws","gcp",
    "video editing","storyboarding","scriptwriting","podcasting","seo","branding","figma","design systems",
    "statistics","causal inference","nlp","cv","llm prompting","sql","dbt","airflow",
    "grant writing","field research","lab techniques","oceanography","genomics","biostatistics",
    "supply chain","marketing","growth","product","fundraising","strategy"
]
interest_bank = [
    "ocean conservation","coral reef restoration","climate tech","educational apps","healthcare AI",
    "creator economy","open source tools","social impact","rural connectivity","financial inclusion",
    "short-form video","long-form YouTube","beauty brand","lipstick R&D","fashion sustainability",
    "music production","publishing","newsletter growth","sports analytics","mental health",
    "language learning","VR social spaces","next social network","privacy-first messaging"
]

goals_examples = [
    "launch a YouTube channel teaching ML from scratch",
    "build low-cost sensors to monitor microplastics in rivers",
    "create a community-powered social network with better moderation",
    "start a cruelty-free lipstick brand with transparent supply chain",
    "develop AI tools for writers to plan book outlines",
    "spin up an educational game for climate science",
    "ship a mobile app to connect volunteers with ocean NGOs",
    "build a data pipeline for grassroots health clinics",
    "open-source a toolkit for video creators to analyze audience retention",
    "prototype a privacy-first group chat app with local-first sync"
]

def rand_words(pool, kmin, kmax):
    k = random.randint(kmin, kmax)
    return ", ".join(random.sample(pool, k))

def make_answer(topic, city_name):
    # create a 120-220 word blurb to stay under 250
    options = [
        f"I’m based in {city_name}, balancing work with weekend projects. I care about impact and craft. Outside of work I run, film short videos, and try new recipes. I mentor juniors and love pairing.",
        f"In {city_name}, I split time between creative work and tinkering. I sketch interfaces, read research papers, and test prototypes with friends. I’m energized by small teams shipping useful things.",
        f"{city_name} is home. I keep a simple routine: work, exercise, cook, dog park. Evenings go to side projects where I learn new stacks and collaborate with people who care about ethics and accessibility."
    ]
    txt = random.choice(options)
    return txt

records = []
for i, name in enumerate(names, start=1):
    role = random.choice(roles)
    city, lat, lon = random.choice(cities)
    years = random.randint(1, 15)
    skills = rand_words(skill_bank, 3, 7)
    interests = rand_words(interest_bank, 3, 7)
    human = make_answer("human", city)
    professional = f"I work as a {role.lower()} with {years} years in the field. Core skills: {skills}. I’ve contributed to projects ranging from prototypes to production launches. Recently, I focused on {random.choice(interest_bank)}. I can produce clear docs, stable code or creative assets, and collaborate across functions."
    contributor = "I like async collaboration with tight feedback loops. I write short design docs, propose milestones, demo weekly, and keep a calm tempo. I value psychological safety and clear ownership. I bring reliability, curiosity, and momentum."
    interests_long = f"My current goals: {random.choice(goals_examples)}. I’m excited about teaming up with people who value craft and mission. I enjoy learning adjacent domains and sharing knowledge in the open."
    reason = random.choice(["Expand network","Find collaborators","Find projects","Build a dream"])
    answers = dict(
        human=human, role=role, interests=interests, skills=skills, years_exp=str(years),
        professional=professional, contributor=contributor, interests_long=interests_long, reason=reason
    )
    norm = normalize_interview(answers)
    # random TIPI
    tipi = [random.randint(2,6) for _ in range(10)]
    bf = score_tipi(tipi)
    records.append(dict(
        user_id=i, name=name, role=norm["role"], interests=norm["interests"], skills=norm["skills"],
        years_exp=norm["years_exp"], bio=norm["bio"], reason_for_joining=norm["reason_for_joining"],
        long_text=norm["long_text"], lat=lat, lon=lon, bf=bf
    ))

users = pd.DataFrame(records)
users.head()


Unnamed: 0,user_id,name,role,interests,skills,years_exp,bio,reason_for_joining,long_text,lat,lon,bf
0,1,User01,Designer,"sports analytics, coral reef restoration, publ...","react, growth, go",7,I work as a designer with 7 years in the field...,Find collaborators,"In San Francisco, I split time between creativ...",37.7749,-122.4194,"BigFive(O=0.5, C=0.41666666666666663, E=0.5, A..."
1,2,User02,Writer,"healthcare AI, newsletter growth, educational ...","tensorflow, product, aws, statistics",1,I work as a writer with 1 years in the field. ...,Find collaborators,"Sydney is home. I keep a simple routine: work,...",-33.8688,151.2093,"BigFive(O=0.5, C=0.75, E=0.6666666666666667, A..."
2,3,User03,Researcher,"financial inclusion, publishing, music product...","oceanography, sql, causal inference, branding,...",8,I work as a researcher with 8 years in the fie...,Find collaborators,"I’m based in Nairobi, balancing work with week...",-1.286389,36.817223,"BigFive(O=0.3333333333333333, C=0.416666666666..."
3,4,User04,Scientist,"coral reef restoration, privacy-first messagin...","strategy, oceanography, react, nextjs, design ...",10,I work as a scientist with 10 years in the fie...,Build a dream,"In Nairobi, I split time between creative work...",-1.286389,36.817223,"BigFive(O=0.5, C=0.3333333333333333, E=0.5, A=..."
4,5,User05,Founder,"fashion sustainability, beauty brand, newslett...","aws, branding, airflow, biostatistics, nextjs",13,I work as a founder with 13 years in the field...,Find projects,"I’m based in London, balancing work with weeke...",51.5072,-0.1276,"BigFive(O=0.5, C=0.6666666666666667, E=0.75, A..."


## Build similarity signals

In [10]:

# Text
vec, X_text = build_text_matrix(users)
S_text = cosine_similarity(X_text)
S_text = (S_text - S_text.min())/(S_text.max()-S_text.min()+1e-9)

# Geo, Exp, Role
S_geo = geo_similarity(users)
S_exp = experience_compatibility(users['years_exp'].tolist())
S_role = role_complementarity(users)
S_content = combine_content(S_text, S_geo, S_exp, S_role)

# Skills
S_skills_sim = similar_skills_matrix(users)
S_skills_comp = complementary_skills_matrix(users)

# CF from synthetic likes (sparse)
n = len(users)
R = np.zeros((n,n), dtype=float)
# generate a few random edges conditioned on role proximity to simulate behavior
for _ in range(140):
    u = random.randrange(n); v = random.randrange(n)
    if u==v: continue
    # bias: founders like engineers/designers; creators like writers/designers
    if users.iloc[u].role=="Founder" and users.iloc[v].role in ["Engineer","Designer"]: R[u,v]=1.0
    elif users.iloc[u].role=="Creator" and users.iloc[v].role in ["Writer","Designer","Engineer"]: R[u,v]=1.0
    elif random.random() < 0.08: R[u,v]=1.0
S_cf = cosine_similarity(R.T); 
S_cf = (S_cf - S_cf.min())/(S_cf.max()-S_cf.min()+1e-9)

# Graph PPR on directed likes
G = nx.DiGraph(); G.add_nodes_from(users['user_id'].tolist())
edges = [(int(users.iloc[u].user_id), int(users.iloc[v].user_id)) for u in range(n) for v in range(n) if R[u,v]>0]
G.add_edges_from(edges)
nodes = sorted(G.nodes()); idx = {u:i for i,u in enumerate(nodes)}
S_graph = np.zeros((n,n))
for u in nodes:
    # restart prob 0.2
    pr = nx.pagerank(G, alpha=0.8, personalization={k:(1.0 if k==u else 0.0) for k in nodes})
    for v,s in pr.items(): S_graph[idx[u], idx[v]] = s
S_graph = (S_graph - S_graph.min())/(S_graph.max()-S_graph.min()+1e-12)

# Personality cosine
S_person = np.zeros((n,n))
for i in range(n):
    for j in range(n):
        if i==j: continue
        S_person[i,j] = bigfive_cosine(users.iloc[i].bf, users.iloc[j].bf)
S_person = (S_person - S_person.min())/(S_person.max()-S_person.min()+1e-9)


## Final fusion & examples (skills_mode toggle)

In [16]:

def top_matches_for(idx: int, skills_mode="similar", k=3):
    S_sk = S_skills_sim if skills_mode=="similar" else S_skills_comp
    S_final = fuse_scores(S_content, S_cf, S_graph, S_person, S_sk)
    picks = mmr(idx, S_final, K=k, lambda_rel=0.7)
    cols = ['user_id','name','role','interests','skills','years_exp','reason_for_joining']
    return users.iloc[picks][cols].assign(score=[S_final[idx,j] for j in picks])

# Show for a few seed users under both modes
sample_idx = [0, 7, 13, 25, 42]  # five examples
demo = {}
for i in sample_idx:
    user_name = users.iloc[i]["name"]  # correctly extract the 'name' field
    demo[f"{user_name} (similar)"] = top_matches_for(i, "similar", 3)
    demo[f"{user_name} (complementary)"] = top_matches_for(i, "complementary", 3)


## Mutual-best pairs (greedy)

In [17]:

def mutual_best_pairs(skills_mode="similar"):
    S_sk = S_skills_sim if skills_mode=="similar" else S_skills_comp
    S_final = fuse_scores(S_content, S_cf, S_graph, S_person, S_sk)
    n = S_final.shape[0]
    best = {i:int(np.argmax(S_final[i,:] + (np.arange(n)==i)*-1e9)) for i in range(n)}
    used=set(); pairs=[]
    for i in range(n):
        if i in used: continue
        j = best[i]
        if j!=i and best.get(j)==i and j not in used:
            pairs.append((i,j,float(S_final[i,j])))
            used.add(i); used.add(j)
    return [(users.iloc[i].name, users.iloc[j].name, score) for i,j,score in sorted(pairs, key=lambda x: -x[2])]

pairs_sim = mutual_best_pairs("similar")[:15]
pairs_comp = mutual_best_pairs("complementary")[:15]
pairs_sim[:5], pairs_comp[:5]


([(4, 8, 0.9288345873777559),
  (18, 41, 0.9230267340344829),
  (11, 17, 0.8695613236621874),
  (21, 44, 0.8594220381547211),
  (30, 34, 0.8581288725157666)],
 [(30, 34, 0.8581288725157666),
  (4, 8, 0.835936991607384),
  (7, 18, 0.7878725733706853),
  (12, 13, 0.7780358271203902),
  (16, 17, 0.762738444858308)])

## Summary tables

In [18]:

summary = pd.DataFrame({
    "name": users["name"],
    "role": users["role"],
    "years_exp": users["years_exp"],
    "city_lat": users["lat"],
    "city_lon": users["lon"],
    "reason": users["reason_for_joining"],
})
summary.head(10)


Unnamed: 0,name,role,years_exp,city_lat,city_lon,reason
0,User01,Designer,7,37.7749,-122.4194,Find collaborators
1,User02,Writer,1,-33.8688,151.2093,Find collaborators
2,User03,Researcher,8,-1.286389,36.817223,Find collaborators
3,User04,Scientist,10,-1.286389,36.817223,Build a dream
4,User05,Founder,13,51.5072,-0.1276,Find projects
5,User06,Founder,14,12.9716,77.5946,Build a dream
6,User07,Engineer,6,40.7128,-74.006,Find projects
7,User08,Founder,2,37.7749,-122.4194,Find projects
8,User09,Designer,10,51.5072,-0.1276,Find projects
9,User10,Engineer,10,12.9716,77.5946,Find collaborators
