
# Rocket — Unified Matching (STRICT Local Gating Fix)

**What's new in this fix**
- **Strict candidate filtering** in *local* mode: before ranking, we **compute allowed IDs** using `(distance <= radius)` **and** shared language (if required). We then rank **only** those IDs.  
- This prevents remote users with `0.0` scores from being selected to pad results.
- `mmr_rank`/`dpp_greedy` now accept an `allowed` list.

See citations inline (MMR, YouTube's two-stage architecture, haversine/geodesic distance) for rationale.


## Imports & setup

In [8]:

import numpy as np, pandas as pd, random, math, importlib, re
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple
from datetime import date, datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from geopy.distance import geodesic
import networkx as nx

np.random.seed(777); random.seed(777)


## Embeddings & utilities (same as unified notebook)

In [9]:

class Embedder:
    def __init__(self, model_names: List[str] = None):
        self.model = None
        self.sbert_ok = False
        self.tfidf = None
        self.model_names = model_names or [
            "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
            "sentence-transformers/distiluse-base-multilingual-cased-v2",
            "sentence-transformers/all-MiniLM-L6-v2"
        ]
        if importlib.util.find_spec("sentence_transformers") is not None:
            from sentence_transformers import SentenceTransformer
            for name in self.model_names:
                try:
                    self.model = SentenceTransformer(name)
                    self.sbert_ok = True
                    break
                except Exception:
                    continue

    def fit(self, corpus: List[str]):
        if self.sbert_ok:
            return self
        self.tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=1)
        self.tfidf.fit(corpus)
        return self

    def encode(self, items: List[str]) -> np.ndarray:
        if self.sbert_ok:
            return np.array(self.model.encode(items, show_progress_bar=False, normalize_embeddings=True))
        X = self.tfidf.transform(items)
        X = X.astype(np.float64)
        norms = np.sqrt((X.power(2)).sum(axis=1))
        norms[norms==0] = 1.0
        return (X / norms).toarray()

def parse_dob(dob_str: str) -> date:
    return datetime.strptime(dob_str, "%Y-%m-%d").date()

def compute_age(dob: date, today: Optional[date] = None) -> int:
    today = today or date.today()
    years = today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day))
    return max(0, years)

def age_band(age: int) -> str:
    for lo, hi in [(18,24),(25,34),(35,44),(45,54)]:
        if lo <= age <= hi: return f"{lo}-{hi}"
    return "55+"

@dataclass
class BigFive:
    O: float; C: float; E: float; A: float; N: float

def clip01(x): 
    import numpy as np
    return float(np.clip(x, 0.0, 1.0))

TIPI_KEY = {
    1: ("E", False), 2: ("A", True), 3: ("C", False), 4: ("N", False), 5: ("O", False),
    6: ("E", True),  7: ("A", False),8: ("C", True),  9: ("N", True),  10:("O", True)
}

def score_tipi(responses_1to7):
    import numpy as np
    assert len(responses_1to7)==10
    r = np.array(responses_1to7, dtype=float)
    r01 = (r-1)/6.0
    traits = {"O":[], "C":[], "E":[], "A":[], "N":[]}
    for i,val in enumerate(r01, start=1):
        trait, rev = TIPI_KEY[i]
        traits[trait].append(1.0-val if rev else val)
    return BigFive(*(clip01(np.mean(traits[t])) for t in ["O","C","E","A","N"]))

def bigfive_cosine(u: BigFive, v: BigFive) -> float:
    import numpy as np
    a = np.array([u.O,u.C,u.E,u.A,u.N])
    b = np.array([v.O,v.C,v.E,v.A,v.N])
    return float(a @ b / (np.linalg.norm(a)*np.linalg.norm(b) + 1e-9))


## Intake + features

In [10]:

def normalize_intake(row: Dict[str, Any]) -> Dict[str, Any]:
    def wclip(t): 
        ws = (t or "").split()
        return " ".join(ws[:250])
    dob_str = row.get("dob","1989-01-01")
    try:
        dob = parse_dob(dob_str)
    except Exception:
        dob = date(1989,1,1); dob_str="1989-01-01"
    age_val = compute_age(dob)
    return {
        "name": row.get("name","Unnamed"),
        "dob": dob_str, "age": age_val, "age_band": age_band(age_val),
        "location_city": row.get("location_city",""), "location_country": row.get("location_country",""),
        "lat": float(row.get("lat", 43.6532)), "lon": float(row.get("lon", -79.3832)),
        "tz_offset": int(row.get("tz_offset", -5)),
        "languages": row.get("languages","en"),
        "availability_hours": row.get("availability_hours","5-10"),
        "energy_1to5": int(row.get("energy_1to5",3)),
        "collab_style": row.get("collab_style","hybrid"),
        "role": row.get("role","Undecided"),
        "seniority": row.get("seniority","Mid"),
        "years_exp": int(row.get("years_exp",3)),
        "skills_have": row.get("skills_have",""),
        "skills_want": row.get("skills_want",row.get("interests","")),
        "interests": row.get("interests",""),
        "human": wclip(row.get("human","")),
        "professional": wclip(row.get("professional","")),
        "contributor": wclip(row.get("contributor","")),
        "interests_long": wclip(row.get("interests_long","")),
        "reason": wclip(row.get("reason","")),
    }

def parse_list_csv(s: str) -> List[str]:
    return [x.strip() for x in (s or "").split(",") if x.strip()]

def build_text_similarity(df, embedder):
    corpus = (df['interests'].fillna('') + " ; " + df['skills_have'].fillna('') + " ; " + df['professional'].fillna('')).tolist()
    embedder.fit(corpus)
    X = embedder.encode(corpus)
    S = (X @ X.T)
    S = (S - S.min())/(S.max()-S.min()+1e-9)
    return S

def language_overlap(df: pd.DataFrame) -> np.ndarray:
    n=len(df); S=np.zeros((n,n))
    langs = [set([x.strip().lower() for x in (l or '').split(',') if x.strip()]) for l in df['languages'].fillna('')]
    for i in range(n):
        for j in range(n):
            if i==j: continue
            inter = langs[i] & langs[j]
            uni = langs[i] | langs[j]
            S[i,j] = len(inter)/float(len(uni) + 1e-9)
    return S

def geo_distance_matrix(df: pd.DataFrame) -> np.ndarray:
    n=len(df); D=np.zeros((n,n))
    coords = list(zip(df['lat'], df['lon']))
    for i in range(n):
        for j in range(n):
            if i==j: continue
            D[i,j] = geodesic(coords[i], coords[j]).km
    return D

def geo_similarity_from_D(D: np.ndarray, decay_km: float = 1200.0) -> np.ndarray:
    S = np.exp(-D/decay_km)
    if S.max()>0: S = S/S.max()
    np.fill_diagonal(S, 0.0)
    return S

def experience_compatibility(years: List[int], sweet_spot: float = 3.0) -> np.ndarray:
    years = np.array(years); n=len(years); S=np.zeros((n,n),dtype=float)
    for i in range(n):
        for j in range(n):
            if i==j: continue
            gap = abs(years[i]-years[j])
            S[i,j] = np.exp(-((gap-sweet_spot)**2)/(2*(sweet_spot**2)))
    if S.max()>0: S = S/S.max()
    return S

ROLE_COMP = {
    "Founder": {"Engineer": 1.0, "Designer": 1.0, "Researcher": 0.8, "Founder": 0.2, "Writer":0.6, "Scientist":0.7, "Creator":0.8},
    "Engineer": {"Founder": 1.0, "Designer": 0.7, "Engineer": 0.2, "Researcher": 0.6, "Writer":0.6, "Scientist":0.8, "Creator":0.7},
    "Designer": {"Founder": 1.0, "Engineer": 0.7, "Designer": 0.2, "Researcher": 0.5, "Writer":0.6, "Scientist":0.5, "Creator":0.9},
    "Researcher": {"Founder": 0.8, "Engineer": 0.7, "Designer": 0.5, "Researcher": 0.3, "Writer":0.5, "Scientist":0.9, "Creator":0.6},
    "Writer": {"Founder":0.8, "Engineer":0.6, "Designer":0.7, "Researcher":0.5, "Writer":0.2, "Scientist":0.5, "Creator":0.9},
    "Scientist":{"Founder":0.9, "Engineer":0.9, "Designer":0.5, "Researcher":0.8, "Writer":0.5, "Scientist":0.2, "Creator":0.6},
    "Creator":{"Founder":0.9, "Engineer":0.7, "Designer":0.9, "Researcher":0.6, "Writer":0.9, "Scientist":0.6, "Creator":0.3},
    "Undecided": {"Founder":0.6,"Engineer":0.6,"Designer":0.6,"Researcher":0.6,"Writer":0.6,"Scientist":0.6,"Creator":0.6,"Undecided":0.2}
}

def role_complementarity(df: pd.DataFrame) -> np.ndarray:
    roles = df['role'].tolist(); n=len(roles); S=np.zeros((n,n),dtype=float)
    for i in range(n):
        for j in range(n):
            if i==j: continue
            S[i,j] = ROLE_COMP.get(roles[i], {}).get(roles[j], 0.2)
    return S

def energy_compatibility(energies: List[int], target_gap=0):
    e = np.array(energies); n=len(e); S=np.zeros((n,n))
    for i in range(n):
        for j in range(n):
            if i==j: continue
            gap = abs(e[i]-e[j])
            S[i,j] = np.exp(-((gap-target_gap)**2)/(2*(1.25**2)))
    if S.max()>0: S = S/S.max()
    return S

COLLAB_COMP = {
    "async": {"async":1.0, "hybrid":0.7, "sync":0.3},
    "hybrid":{"async":0.7, "hybrid":1.0, "sync":0.7},
    "sync":  {"async":0.3, "hybrid":0.7, "sync":1.0},
}
def collab_style_compatibility(styles: List[str]) -> np.ndarray:
    n=len(styles); S=np.zeros((n,n))
    for i in range(n):
        for j in range(n):
            if i==j: continue
            S[i,j] = COLLAB_COMP.get(styles[i],{}).get(styles[j], 0.5)
    return S

def availability_overlap(avails: List[str]) -> np.ndarray:
    map_mid = {"2-5":3.5,"5-10":7.5,"10-20":15.0,"20+":25.0}
    v = np.array([map_mid.get(a,7.5) for a in avails])
    n=len(v); S=np.zeros((n,n))
    for i in range(n):
        for j in range(n):
            if i==j: continue
            gap = abs(v[i]-v[j])
            S[i,j] = np.exp(-gap/15.0)
    if S.max()>0: S = S/S.max()
    return S

def time_zone_overlap(tz_list: List[int]) -> np.ndarray:
    tz = np.array(tz_list); n=len(tz); S=np.zeros((n,n))
    for i in range(n):
        for j in range(n):
            if i==j: continue
            diff = abs(tz[i]-tz[j])
            S[i,j] = np.exp(-diff/6.0)
    if S.max()>0: S = S/S.max()
    return S


## Skills & CF/Graph/Fusion

In [11]:

def parse_skill_list(sk: str) -> List[str]:
    return [s.strip().lower() for s in (sk or "").split(",") if s.strip()]

def tfidf_cosine(a_list: List[str], b_list: List[str]) -> float:
    docs = ["; ".join(a_list), "; ".join(b_list)]
    vec = TfidfVectorizer(ngram_range=(1,2), min_df=1)
    X = vec.fit_transform(docs)
    return float(cosine_similarity(X[0], X[1])[0,0])

def similar_skills_matrix(df: pd.DataFrame, embedder: Optional[Embedder] = None) -> np.ndarray:
    n=len(df); S=np.zeros((n,n))
    parsed = [parse_skill_list(x) for x in df['skills_have'].fillna('')]
    for i in range(n):
        for j in range(n):
            if i==j: continue
            S[i,j] = tfidf_cosine(parsed[i], parsed[j])
    if S.max()>0: S = S/S.max()
    return S

def complementary_skills_matrix(df: pd.DataFrame) -> np.ndarray:
    wants = [parse_skill_list(row.get('skills_want', row.get('interests',''))) for _,row in df.iterrows()]
    haves = [parse_skill_list(row.get('skills_have','')) for _,row in df.iterrows()]
    n=len(df); S=np.zeros((n,n))
    try:
        from scipy.optimize import linear_sum_assignment
        for i in range(n):
            need = wants[i]
            for j in range(n):
                if i==j: continue
                have = haves[j]
                if not need or not have: 
                    S[i,j]=0.0; continue
                A = ["; ".join([n1]) for n1 in need]
                B = ["; ".join([h1]) for h1 in have]
                vec = TfidfVectorizer(ngram_range=(1,2), min_df=1)
                X = vec.fit_transform(A + B)
                m, k = len(need), len(have)
                Csim = np.zeros((m,k))
                for p in range(m):
                    for q in range(k):
                        Csim[p,q] = cosine_similarity(X[p], X[m+q])[0,0]
                size = max(m,k)
                padded = np.ones((size,size))
                padded[:m,:k] = 1.0 - Csim  # cost = 1 - sim
                r_ind, c_ind = linear_sum_assignment(padded)
                total_sim = 0.0; count = 0
                for r,c in zip(r_ind, c_ind):
                    if r < m and c < k:
                        total_sim += 1.0 - padded[r,c]; count += 1
                S[i,j] = total_sim / (count + 1e-9)
        if S.max()>0: S = S/S.max()
    except Exception:
        for i in range(n):
            need = wants[i]
            for j in range(n):
                if i==j: continue
                have = haves[j]
                if not need or not have: 
                    S[i,j]=0.0; continue
                sims = []
                for nterm in need:
                    sims.append(max(tfidf_cosine([nterm], [h]) for h in have))
                S[i,j] = float(np.mean(sims)) if sims else 0.0
        if S.max()>0: S = S/S.max()
    return S

def mf_als(R: np.ndarray, k: int = 16, alpha: float = 40.0, reg: float = 0.1, iters: int = 6):
    n_users, n_items = R.shape
    C = 1 + alpha * R  # confidence
    X = np.random.normal(scale=0.1, size=(n_users, k))
    Y = np.random.normal(scale=0.1, size=(n_items, k))
    eye = np.eye(k)
    for _ in range(iters):
        for u in range(n_users):
            Cu = np.diag(C[u])
            YTCuY = Y.T @ Cu @ Y
            YTCuPu = Y.T @ (Cu @ R[u])
            X[u] = np.linalg.solve(YTCuY + reg*eye, YTCuPu)
        for i in range(n_items):
            Ci = np.diag(C[:,i])
            XTCiX = X.T @ Ci @ X
            XTCiPi = X.T @ (Ci @ R[:,i])
            Y[i] = np.linalg.solve(XTCiX + reg*eye, XTCiPi)
    Xn = X / (np.linalg.norm(X, axis=1, keepdims=True) + 1e-9)
    S = Xn @ Xn.T
    S = (S - S.min())/(S.max()-S.min()+1e-9)
    np.fill_diagonal(S, 0.0)
    return S

def build_graph_ppr(R: np.ndarray, alpha=0.85) -> np.ndarray:
    n = R.shape[0]
    G = nx.DiGraph(); G.add_nodes_from(range(n))
    edges = [(u,v) for u in range(n) for v in range(n) if R[u,v]>0]
    G.add_edges_from(edges)
    S = np.zeros((n,n))
    for u in range(n):
        pr = nx.pagerank(G, alpha=alpha, personalization={k:(1.0 if k==u else 0.0) for k in range(n)})
        for v,s in pr.items(): S[u,v] = s
    S = (S - S.min())/(S.max()-S.min()+1e-12)
    np.fill_diagonal(S, 0.0)
    return S

def reciprocalize(S: np.ndarray) -> np.ndarray:
    return np.sqrt(S * S.T + 1e-12)

def combine_content(S_text, S_geo, S_exp, S_role, S_energy, S_collab, S_avail, S_tz, S_lang, w):
    a,b,c,d,e,f,g,h,l = w
    S = a*S_text + b*S_geo + c*S_exp + d*S_role + e*S_energy + f*S_collab + g*S_avail + h*S_tz + l*S_lang
    return S / (S.max() + 1e-9)

def fuse_scores(S_content, S_cf, S_graph, S_person, S_skills, weights=(0.30,0.18,0.16,0.10,0.26)):
    Sc = reciprocalize(S_content)
    Sf = reciprocalize(S_cf)
    Sg = reciprocalize(S_graph)
    Sp = reciprocalize(S_person)
    Ss = reciprocalize(S_skills)
    a,b,c,d,e = weights
    S = a*Sc + b*Sf + c*Sg + d*Sp + e*Ss
    return S / (S.max() + 1e-12)


## Diversification (now with `allowed` candidates)

In [12]:

def mmr_rank(query_idx: int, S: np.ndarray, K: int = 5, lambda_rel: float = 0.7, allowed: Optional[List[int]] = None):
    n = S.shape[0]
    base_candidates = [i for i in range(n) if i != query_idx]
    candidates = [c for c in (allowed if allowed is not None else base_candidates) if c != query_idx]
    selected = []
    while candidates and len(selected) < K:
        if not selected:
            i = max(candidates, key=lambda j: S[query_idx, j])
            selected.append(i); candidates.remove(i)
        else:
            def score(j):
                redundancy = max(S[j, s] for s in selected) if selected else 0.0
                return lambda_rel * S[query_idx, j] - (1-lambda_rel) * redundancy
            i = max(candidates, key=score)
            selected.append(i); candidates.remove(i)
    return selected

def dpp_greedy(query_idx: int, S: np.ndarray, K: int = 5, allowed: Optional[List[int]] = None):
    n = S.shape[0]
    items = [i for i in range(n) if i != query_idx]
    items = [i for i in (allowed if allowed is not None else items) if i != query_idx]
    quality = S[query_idx].copy()
    q = quality / (quality.max() + 1e-9)
    selected = []
    remaining = items.copy()
    while remaining and len(selected) < K:
        if not selected:
            idx = int(np.argmax([q[i] for i in remaining]))
            chosen = remaining[idx]
        else:
            scores = []
            for r in remaining:
                max_sim = max(S[r, s] for s in selected) if selected else 0.0
                scores.append(q[r] - max_sim)
            idx = int(np.argmax(scores))
            chosen = remaining[idx]
        selected.append(chosen); remaining.pop(idx)
    return selected


## Strict local gating + APIs

In [13]:

def build_signals(df: pd.DataFrame, embedder: Embedder):
    D = geo_distance_matrix(df)
    S_geo = geo_similarity_from_D(D, decay_km=1200.0)
    S_text = build_text_similarity(df, embedder)
    S_lang = language_overlap(df)
    S_exp  = experience_compatibility(df['years_exp'].tolist())
    S_role = role_complementarity(df)
    S_energy = energy_compatibility(df['energy_1to5'].tolist())
    S_collab = collab_style_compatibility(df['collab_style'].tolist())
    S_avail  = availability_overlap(df['availability_hours'].tolist())
    S_tz     = time_zone_overlap(df['tz_offset'].tolist())
    S_sk_sim = similar_skills_matrix(df)
    S_sk_comp = complementary_skills_matrix(df)
    # Synthetic implicit likes -> CF + graph
    n = len(df); R = np.zeros((n,n), dtype=float)
    for _ in range(900):
        u = random.randrange(n); v = random.randrange(n)
        if u==v: continue
        if df.iloc[u].role=="Founder" and df.iloc[v].role in ["Engineer","Designer"]: R[u,v]=1.0
        elif df.iloc[u].role=="Creator" and df.iloc[v].role in ["Writer","Designer","Engineer"]: R[u,v]=1.0
        elif df.iloc[u].location_city==df.iloc[v].location_city and random.random()<0.25: R[u,v]=1.0
        elif random.random() < 0.04: R[u,v]=1.0
    S_cf = mf_als(R, k=16)
    S_graph = build_graph_ppr(R, alpha=0.82)
    # Personality
    S_person = np.zeros((n,n))
    for i in range(n):
        for j in range(n):
            if i==j: continue
            S_person[i,j] = bigfive_cosine(df.iloc[i].bf, df.iloc[j].bf)
    S_person = (S_person - S_person.min())/(S_person.max()-S_person.min()+1e-9)
    return {"D":D,"S_text":S_text,"S_lang":S_lang,"S_geo":S_geo,"S_exp":S_exp,"S_role":S_role,
            "S_energy":S_energy,"S_collab":S_collab,"S_avail":S_avail,"S_tz":S_tz,
            "S_sk_sim":S_sk_sim,"S_sk_comp":S_sk_comp,"S_cf":S_cf,"S_graph":S_graph,"S_person":S_person}

def fused_matrix(signals: Dict[str, np.ndarray],
                 content_weights=(0.24,0.16,0.10,0.10,0.10,0.10,0.10,0.05,0.05),
                 blend_weights=(0.34,0.18,0.14,0.12,0.22),
                 skills_mode="similar"):
    S_content = combine_content(signals["S_text"], signals["S_geo"], signals["S_exp"], signals["S_role"],
                                signals["S_energy"], signals["S_collab"], signals["S_avail"], signals["S_tz"],
                                signals["S_lang"], content_weights)
    S_sk = signals["S_sk_sim"] if skills_mode=="similar" else signals["S_sk_comp"]
    S_final = fuse_scores(S_content, signals["S_cf"], signals["S_graph"], signals["S_person"], S_sk, weights=blend_weights)
    return S_final, S_content

def local_allowed_ids(df: pd.DataFrame, signals: Dict[str,np.ndarray], query_idx: int, radius_km: float, require_shared_language: bool):
    Drow = signals["D"][query_idx]
    allowed = [i for i in range(len(df)) if i!=query_idx and Drow[i] <= radius_km]
    if require_shared_language:
        Lq = set([x.strip().lower() for x in df.iloc[query_idx].languages.split(",") if x.strip()])
        allowed = [i for i in allowed if len(Lq & set([x.strip().lower() for x in df.iloc[i].languages.split(",") if x.strip()]))>0]
    return allowed

def network_expansion(df: pd.DataFrame, signals: Dict[str,np.ndarray], query_idx: int,
                      k:int=10, skills_mode="similar", search_mode="local", local_radius_km=50.0,
                      require_shared_language=True, diversifier="mmr"):
    S_final, _ = fused_matrix(signals, skills_mode=skills_mode)
    if search_mode == "local":
        allowed = local_allowed_ids(df, signals, query_idx, local_radius_km, require_shared_language)
    else:
        allowed = [i for i in range(len(df)) if i != query_idx]
    if not allowed:
        return pd.DataFrame(columns=["name","location_city","languages","role","score"])

    if diversifier=="mmr":
        picks = mmr_rank(query_idx, S_final, K=min(k,len(allowed)), lambda_rel=0.72, allowed=allowed)
    elif diversifier=="dpp":
        picks = dpp_greedy(query_idx, S_final, K=min(k,len(allowed)), allowed=allowed)
    else:
        picks = sorted(allowed, key=lambda j: -S_final[query_idx,j])[:k]

    cols = ["name","location_city","languages","role","seniority","interests","skills_have","skills_want","years_exp",
            "age","age_band","energy_1to5","collab_style","availability_hours","reason"]
    out = df.iloc[picks][cols].copy()
    out["score"] = [S_final[query_idx,j] for j in picks]
    return out

def team_build(df: pd.DataFrame, signals: Dict[str,np.ndarray], query_idx: int,
               skills_need_text:str, K:int=4, search_mode="local", local_radius_km=50.0,
               require_shared_language=True):
    S_final, _ = fused_matrix(signals, skills_mode="complementary")
    need = [s.strip() for s in skills_need_text.split(",") if s.strip()]
    if search_mode=="local":
        allowed = set(local_allowed_ids(df, signals, query_idx, local_radius_km, require_shared_language))
    else:
        allowed = set([i for i in range(len(df)) if i != query_idx])
    # Greedy selection over allowed only
    selected = []
    def team_score(set_ids: List[int]):
        if not set_ids: return 0.0
        rel = np.mean([S_final[query_idx, j] for j in set_ids])
        need_set = set([s.strip().lower() for s in need if s.strip()])
        have = set()
        for j in set_ids:
            have |= set([s.strip().lower() for s in df.iloc[j].skills_have.split(",") if s.strip()])
        coverage = len(need_set & have) / (len(need_set) + 1e-9)
        if len(set_ids) > 1:
            pair_sims = []
            for a in range(len(set_ids)):
                for b in range(a+1, len(set_ids)):
                    pair_sims.append(S_final[set_ids[a], set_ids[b]])
            div = 1.0 - float(np.mean(pair_sims))
        else:
            div = 1.0
        return 0.55*rel + 0.30*coverage + 0.15*div
    candidates = list(allowed)
    while candidates and len(selected) < K:
        base = team_score(selected)
        best_j, best_gain = None, -1
        for j in candidates:
            gain = team_score(selected+[j]) - base
            if gain > best_gain:
                best_gain, best_j = gain, j
        if best_j is None: break
        selected.append(best_j); candidates.remove(best_j)

    cols = ['name','location_city','languages','role','seniority','skills_have','years_exp','tz_offset','availability_hours','collab_style']
    team_df = df.iloc[selected][cols].copy()
    team_df['match_score'] = [S_final[query_idx,j] for j in selected]
    return team_df


## Synthetic cohort & demo

In [14]:

roles = ["Founder","Engineer","Designer","Researcher","Writer","Scientist","Creator"]
seniorities = ["Junior","Mid","Senior","Lead/Principal","Executive/Founder"]
cities = [
    ("Toronto",43.6532,-79.3832,-5, ["en","fr"]),
    ("New York",40.7128,-74.0060,-5, ["en","es"]),
    ("San Francisco",37.7749,-122.4194,-8, ["en","zh"]),
    ("London",51.5072,-0.1276,0, ["en","fr","de"]),
    ("Berlin",52.52,13.405,1, ["de","en"]),
    ("Nairobi",-1.286389,36.817223,3, ["en","sw"]),
    ("Sydney",-33.8688,151.2093,10, ["en"]),
    ("Bangalore",12.9716,77.5946,5, ["en","hi"]),
    ("Paris",48.8566,2.3522,1, ["fr","en"]),
    ("Mexico City",19.4326,-99.1332,-6, ["es","en"])
]

def rand_words(pool, kmin, kmax):
    k = random.randint(kmin, kmax)
    return ", ".join(random.sample(list(pool), k))

SKILL_LEXICON = set([
    "python","pytorch","tensorflow","django","react","nextjs","go","kubernetes","aws","gcp",
    "video editing","storyboarding","scriptwriting","podcasting","seo","branding","figma","design systems",
    "statistics","causal inference","nlp","cv","prompt engineering","sql","dbt","airflow",
    "grant writing","field research","lab techniques","oceanography","genomics","biostatistics",
    "supply chain","marketing","growth","product","fundraising","strategy"
])
interest_bank = [
    "ocean conservation","coral reef restoration","climate tech","educational apps","healthcare AI",
    "creator economy","open source tools","social impact","rural connectivity","financial inclusion",
    "short-form video","long-form YouTube","beauty brand","lipstick R&D","fashion sustainability",
    "music production","publishing","newsletter growth","sports analytics","mental health",
    "language learning","VR social spaces","next social network","privacy-first messaging"
]

def random_dob():
    y = random.randint(1961, 2004)
    m = random.randint(1,12); d = random.randint(1,28)
    return f"{y:04d}-{m:02d}-{d:02d}"

def mk_user(i):
    name = f"User{i:03d}"
    (city, lat, lon, tz, langs_base) = random.choice(cities)
    role = random.choice(roles)
    seniority = random.choice(seniorities)
    skills_have = rand_words(SKILL_LEXICON, 3, 7)
    skills_want = rand_words(SKILL_LEXICON, 2, 5)
    interests = rand_words(interest_bank, 3, 7)
    years = random.randint(1, 18)
    human = f"I live in {city}. I like calm schedules and meetups; I enjoy running and cooking."
    professional = f"As a {role.lower()} with {years} years, I worked across startups and labs."
    contributor = "I prefer weekly demos and short design docs. I bring reliability and momentum."
    interests_long = f"Goals: {random.choice(['launch a YouTube channel on ML','build ocean microplastics sensors','start a cruelty-free lipstick brand','prototype a privacy-first social app'])}."
    reason = random.choice(["Find projects","Expand network","Find collaborators","Build a dream"])
    langs = random.sample(langs_base, min(len(langs_base), random.choice([1,1,2])))
    row = dict(
        name=name, dob=random_dob(), location_city=city, location_country="",
        lat=lat, lon=lon, tz_offset=tz, languages=", ".join(langs),
        availability_hours=random.choice(["2-5","5-10","10-20","20+"]),
        energy_1to5=random.randint(1,5), collab_style=random.choice(["async","hybrid","sync"]),
        role=role, seniority=seniority, years_exp=years,
        skills_have=skills_have, skills_want=skills_want, interests=interests,
        human=human, professional=professional, contributor=contributor, interests_long=interests_long, reason=reason
    )
    return normalize_intake(row)

records = [mk_user(i) for i in range(1,151)]
tipi_all = [[random.randint(2,6) for _ in range(10)] for __ in range(150)]
bfs = [score_tipi(t) for t in tipi_all]
users = pd.DataFrame(records)
users['bf'] = bfs

embedder = Embedder()
signals = build_signals(users, embedder)

# Toronto demo (STRICT): 50km radius + shared language
tor_idx = users.index[users.location_city=="Toronto"][0]
local_net = network_expansion(users, signals, tor_idx, k=8, skills_mode="similar",
                              search_mode="local", local_radius_km=50.0, require_shared_language=True, diversifier="mmr")

print("=== Network expansion (LOCAL Toronto, strict) ===")
print(local_net[['name','location_city','languages','role','score']].to_string(index=False))

# If you want to see fallback behavior (e.g., fewer than K available), uncomment:
# print(f"Returned {len(local_net)} rows (strict mode does not pad with remote users).")


=== Network expansion (LOCAL Toronto, strict) ===
   name location_city languages       role    score
User144       Toronto    en, fr Researcher 0.781384
User069       Toronto    fr, en  Scientist 0.695438
User099       Toronto    fr, en    Creator 0.703937
User092       Toronto    fr, en  Scientist 0.708148
User061       Toronto        fr     Writer 0.705436
User081       Toronto        fr   Engineer 0.665076
User130       Toronto    fr, en    Creator 0.639348


In [15]:
users.loc[users['name'].isin(local_net['name'])]

Unnamed: 0,name,dob,age,age_band,location_city,location_country,lat,lon,tz_offset,languages,...,years_exp,skills_have,skills_want,interests,human,professional,contributor,interests_long,reason,bf
60,User061,1963-11-26,61,55+,Toronto,,43.6532,-79.3832,-5,fr,...,7,"growth, podcasting, field research, design sys...","growth, strategy, storyboarding, fundraising, ...","privacy-first messaging, next social network, ...",I live in Toronto. I like calm schedules and m...,"As a writer with 7 years, I worked across star...",I prefer weekly demos and short design docs. I...,Goals: prototype a privacy-first social app.,Find projects,"BigFive(O=0.5, C=0.6666666666666667, E=0.66666..."
68,User069,1964-09-02,60,55+,Toronto,,43.6532,-79.3832,-5,"fr, en",...,6,"sql, biostatistics, react, podcasting, pytorch...","branding, figma, storyboarding","educational apps, sports analytics, open sourc...",I live in Toronto. I like calm schedules and m...,"As a scientist with 6 years, I worked across s...",I prefer weekly demos and short design docs. I...,Goals: prototype a privacy-first social app.,Expand network,"BigFive(O=0.16666666666666663, C=0.5, E=0.5, A..."
80,User081,1961-07-20,64,55+,Toronto,,43.6532,-79.3832,-5,fr,...,12,"nextjs, statistics, growth","tensorflow, storyboarding, product, grant writing","creator economy, music production, language le...",I live in Toronto. I like calm schedules and m...,"As a engineer with 12 years, I worked across s...",I prefer weekly demos and short design docs. I...,Goals: prototype a privacy-first social app.,Expand network,"BigFive(O=0.4166666666666667, C=0.249999999999..."
91,User092,2003-07-27,22,18-24,Toronto,,43.6532,-79.3832,-5,"fr, en",...,9,"oceanography, video editing, figma","causal inference, product, python, seo, pytorch","educational apps, ocean conservation, fashion ...",I live in Toronto. I like calm schedules and m...,"As a scientist with 9 years, I worked across s...",I prefer weekly demos and short design docs. I...,Goals: start a cruelty-free lipstick brand.,Find projects,"BigFive(O=0.5833333333333334, C=0.583333333333..."
98,User099,1991-08-23,33,25-34,Toronto,,43.6532,-79.3832,-5,"fr, en",...,1,"statistics, tensorflow, marketing, design systems","go, react, grant writing","language learning, newsletter growth, coral re...",I live in Toronto. I like calm schedules and m...,"As a creator with 1 years, I worked across sta...",I prefer weekly demos and short design docs. I...,Goals: prototype a privacy-first social app.,Expand network,"BigFive(O=0.6666666666666667, C=0.5, E=0.5, A=..."
129,User130,1969-07-12,56,55+,Toronto,,43.6532,-79.3832,-5,"fr, en",...,3,"design systems, storyboarding, python, oceanog...","airflow, go, django, sql","healthcare AI, ocean conservation, publishing,...",I live in Toronto. I like calm schedules and m...,"As a creator with 3 years, I worked across sta...",I prefer weekly demos and short design docs. I...,Goals: launch a YouTube channel on ML.,Build a dream,"BigFive(O=0.4166666666666667, C=0.333333333333..."
143,User144,1961-01-08,64,55+,Toronto,,43.6532,-79.3832,-5,"en, fr",...,16,"dbt, gcp, branding, seo","biostatistics, product, strategy, grant writing","privacy-first messaging, social impact, ocean ...",I live in Toronto. I like calm schedules and m...,"As a researcher with 16 years, I worked across...",I prefer weekly demos and short design docs. I...,Goals: start a cruelty-free lipstick brand.,Find projects,"BigFive(O=0.6666666666666667, C=0.333333333333..."
