
# Rocket — **City‑Aware, Language‑Aware** Matching & Team Formation (with CF + Bandits)

This notebook is a production‑minded prototype for **networked team building**, optimized for **local teams** with a **global search** fallback.

**What it does**
- **Rich intake** with DOB→age, city/lat/lon & time zone, **languages (ISO codes)**, availability, energy, collab style, role/seniority/years, skills **have/want**, interests, 5 interview sections.
- **Multilingual embeddings** for content (Sentence‑Transformers if available; TF‑IDF fallback).
- **Geo + language constraints**: 
  - **Local mode**: restrict or sharply weight to same‑city / near‑city within a radius (km), and **require shared language**.
  - **Global mode**: ignore geo or down‑weight it; language is a soft constraint.
- **Skills**: similar vs **complementary** (Hungarian/coverage with fallback).
- **Collaborative filtering**: lightweight MF (ALS‑ish) over implicit likes.
- **Graph**: **Personalized PageRank** over the directed “likes” graph.
- **Bandits**: **LinUCB** (contextual bandit) learns which blend weights to use per user based on context.
- **Diversification**: MMR and DPP‑style greedy.
- **Team formation**: a greedy marginal‑gain objective balances match score, **skill coverage**, and **diversity**, with hard constraints on distance and shared language; time zone and availability enter as soft similarity signals in the content score.

> Final cell prints **ALL names** so you can verify cohort creation.


## Optional installs (run locally)

In [None]:

# !pip install numpy pandas scikit-learn networkx geopy scipy
# !pip install sentence-transformers fasttext-langdetect langdetect
# !python -m spacy download en_core_web_sm


In [None]:

import numpy as np, pandas as pd, random, math, importlib
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple
from datetime import date, datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from geopy.distance import geodesic
import networkx as nx

np.random.seed(2025); random.seed(2025)


## Embeddings — multilingual SBERT if available; TF‑IDF fallback

In [None]:

class Embedder:
    """Text embedder: Sentence-Transformers when available, TF-IDF otherwise.

    On construction the candidate models in ``model_names`` are tried in order
    and the first one that loads is kept (``sbert_ok`` becomes True).  If the
    ``sentence_transformers`` package is missing, fails to import, or no model
    loads, the instance falls back to a TF-IDF vectorizer that must be fitted
    with :meth:`fit` before :meth:`encode` is called.
    """

    def __init__(self, model_names: Optional[List[str]] = None):
        # ``import importlib`` alone does not guarantee that the ``util``
        # submodule is loaded, so import it explicitly where it is used.
        import importlib.util

        self.model = None
        self.sbert_ok = False
        self.tfidf = None
        self.model_names = model_names or [
            "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
            "sentence-transformers/distiluse-base-multilingual-cased-v2",
            "sentence-transformers/all-MiniLM-L6-v2"
        ]
        if importlib.util.find_spec("sentence_transformers") is None:
            return
        try:
            from sentence_transformers import SentenceTransformer
        except Exception:
            # Package is present but unusable (e.g. a broken backend):
            # stay on the TF-IDF fallback instead of crashing the notebook.
            return
        for name in self.model_names:
            try:
                self.model = SentenceTransformer(name)
            except Exception:
                # Best effort: model download/load failed, try the next one.
                continue
            self.sbert_ok = True
            break

    def fit(self, corpus: List[str]):
        """Fit the TF-IDF fallback on ``corpus``; a no-op when SBERT is active.

        Returns ``self`` so calls can be chained.
        """
        if self.sbert_ok:
            return self
        self.tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=1)
        self.tfidf.fit(corpus)
        return self

    def encode(self, items: List[str]) -> np.ndarray:
        """Return one row vector per item as a dense ``np.ndarray``.

        TF-IDF rows are L2-normalized here (all-zero rows are left as zeros);
        the SBERT path asks the model for normalized embeddings.

        Raises:
            RuntimeError: if the TF-IDF fallback is in use and ``fit`` has not
                been called yet.
        """
        if self.sbert_ok:
            return np.array(self.model.encode(items, show_progress_bar=False, normalize_embeddings=True))
        if self.tfidf is None:
            raise RuntimeError("Embedder.encode() called before fit(); call fit(corpus) first.")
        # Densify first: dividing a sparse matrix by a dense column returns
        # different types depending on the SciPy version.
        X = self.tfidf.transform(items).astype(np.float64).toarray()
        norms = np.linalg.norm(X, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # avoid 0/0 for documents with no known terms
        return X / norms

def cos_sim_mat(A: np.ndarray) -> np.ndarray:
    """Return the Gram matrix ``A @ A.T`` as floats.

    This equals pairwise cosine similarity only when the rows of ``A`` are
    already unit-length; no normalization is done here.
    """
    gram = np.matmul(A, A.T)
    return gram.astype(float)


## Utilities — DOB → Age / Bands

In [None]:

def parse_dob(dob_str: str) -> date:
    """Parse a ``YYYY-MM-DD`` date-of-birth string into a ``date``.

    Propagates whatever ``datetime.strptime`` raises for malformed input.
    """
    parsed = datetime.strptime(dob_str, "%Y-%m-%d")
    return parsed.date()

def compute_age(dob: date, today: Optional[date] = None) -> int:
    """Return the age in whole years on ``today`` (defaults to the current date).

    A birthday that has not yet occurred in the reference year is not counted,
    and the result is floored at 0 for dates of birth in the future.
    """
    reference = date.today() if today is None else today
    elapsed = reference.year - dob.year
    if (reference.month, reference.day) < (dob.month, dob.day):
        elapsed -= 1  # this year's birthday is still ahead
    return elapsed if elapsed > 0 else 0

def age_band(age: int) -> str:
    """Map an age in years to a coarse band label.

    Returns ``"<18"`` for minors, one of ``"18-24"``, ``"25-34"``, ``"35-44"``,
    ``"45-54"`` for the listed ranges, and ``"55+"`` for anything older.
    Previously every age below 18 fell through to ``"55+"``.
    """
    if age < 18:
        return "<18"
    for lo, hi in [(18,24),(25,34),(35,44),(45,54)]:
        if lo <= age <= hi:
            return f"{lo}-{hi}"
    return "55+"


## Personality — TIPI (fast)

In [None]:

@dataclass
class BigFive:
    """Container for the five personality trait scores (O, C, E, A, N).

    ``score_tipi`` fills these with values clipped to the range [0, 1].
    """

    O: float  # openness
    C: float  # conscientiousness
    E: float  # extraversion
    A: float  # agreeableness
    N: float  # neuroticism

def clip01(x):
    """Clamp ``x`` to the closed interval [0, 1] and return a Python float."""
    import numpy as np
    bounded = np.clip(x, 0.0, 1.0)
    return float(bounded)

# Scoring key for the ten questionnaire items consumed by ``score_tipi``.
# Maps the 1-based item number to (trait letter, reverse_scored).  When the
# flag is True the rescaled response is flipped (1 - value) before averaging.
TIPI_KEY = {
    1: ("E", False), 2: ("A", True), 3: ("C", False), 4: ("N", False), 5: ("O", False),
    6: ("E", True),  7: ("A", False),8: ("C", True),  9: ("N", True),  10:("O", True)
}

def score_tipi(responses_1to7):
    """Score ten 1-7 questionnaire responses into a ``BigFive``.

    Responses are rescaled to [0, 1], reverse-keyed items (per ``TIPI_KEY``)
    are flipped, and the items belonging to each trait are averaged and
    clipped to [0, 1].  Exactly ten responses are required (asserted).
    """
    import numpy as np
    assert len(responses_1to7)==10
    rescaled = (np.array(responses_1to7, dtype=float) - 1) / 6.0
    per_trait = {letter: [] for letter in "OCEAN"}
    for item_no, value in enumerate(rescaled, start=1):
        letter, is_reversed = TIPI_KEY[item_no]
        per_trait[letter].append((1.0 - value) if is_reversed else value)
    trait_means = [clip01(np.mean(per_trait[letter])) for letter in "OCEAN"]
    return BigFive(*trait_means)

def bigfive_cosine(u: BigFive, v: BigFive) -> float:
    """Cosine similarity between two trait profiles.

    A small epsilon in the denominator keeps the result finite when either
    profile is all zeros.
    """
    import numpy as np

    def as_vector(profile):
        return np.array([profile.O, profile.C, profile.E, profile.A, profile.N])

    left, right = as_vector(u), as_vector(v)
    scale = np.linalg.norm(left) * np.linalg.norm(right) + 1e-9
    return float(left @ right / scale)


## Intake schema + normalizer (city & languages)

In [None]:

# Names of the raw intake fields, grouped one topic per line.  Every name
# listed here is read by ``normalize_intake`` via ``row.get``.
INTAKE_FIELDS = [
    "name","dob","location_city","location_country","lat","lon","tz_offset",
    "languages",
    "availability_hours","energy_1to5","collab_style",
    "role","seniority","years_exp",
    "skills_have","skills_want","interests",
    "human","professional","contributor","interests_long","reason"
]

def parse_comma_list(s: str) -> List[str]:
    """Split a comma-separated string into trimmed, non-empty tokens.

    ``None`` and the empty string both yield an empty list; case is kept.
    """
    tokens = []
    for piece in (s or "").split(","):
        cleaned = piece.strip()
        if cleaned:
            tokens.append(cleaned)
    return tokens

def normalize_intake(row: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize one raw intake record into the flat profile dict used downstream.

    - ``dob`` is parsed as ``YYYY-MM-DD``; a missing or unparseable value falls
      back to 1989-01-01.  ``age`` and ``age_band`` are derived from it.
    - Missing numeric and location fields get the defaults visible below.
    - ``languages`` is re-joined with ", " and capped at 64 characters, dropping
      whole trailing codes rather than cutting one in the middle.
    - Skill and interest lists are capped at 24 entries; free-text sections
      are capped at 250 whitespace-separated words.
    - ``skills_want`` falls back to ``interests`` when the key is absent.
    """
    def wclip(t):
        # Keep at most the first 250 whitespace-separated words.
        ws = (t or "").split()
        return " ".join(ws[:250])

    def capped_languages(raw, max_chars=64):
        # Same character budget as a plain ``[:64]`` slice, but only whole
        # codes are kept so the last one is never truncated.
        kept, used = [], 0
        for code in parse_comma_list(raw):
            extra = len(code) + (2 if kept else 0)  # 2 == len(", ")
            if used + extra > max_chars:
                break
            kept.append(code)
            used += extra
        return ", ".join(kept)

    dob_str = row.get("dob","1989-01-01")
    try:
        dob = parse_dob(dob_str)
    except (TypeError, ValueError):
        # TypeError: non-string value (e.g. None); ValueError: wrong format.
        dob = date(1989,1,1); dob_str="1989-01-01"
    age_val = compute_age(dob)
    return {
        "name": row.get("name","Unnamed"),
        "dob": dob_str, "age": age_val, "age_band": age_band(age_val),
        "location_city": row.get("location_city",""), "location_country": row.get("location_country",""),
        "lat": float(row.get("lat", 43.6532)), "lon": float(row.get("lon", -79.3832)),
        "tz_offset": int(row.get("tz_offset", -5)),
        "languages": capped_languages(row.get("languages","en")),
        "availability_hours": row.get("availability_hours","5-10"),
        "energy_1to5": int(row.get("energy_1to5",3)),
        "collab_style": row.get("collab_style","hybrid"),
        "role": row.get("role","Undecided"),
        "seniority": row.get("seniority","Mid"),
        "years_exp": int(row.get("years_exp",3)),
        "skills_have": ", ".join(parse_comma_list(row.get("skills_have",""))[:24]),
        "skills_want": ", ".join(parse_comma_list(row.get("skills_want",row.get("interests","")))[:24]),
        "interests": ", ".join(parse_comma_list(row.get("interests",""))[:24]),
        "human": wclip(row.get("human","")),
        "professional": wclip(row.get("professional","")),
        "contributor": wclip(row.get("contributor","")),
        "interests_long": wclip(row.get("interests_long","")),
        "reason": wclip(row.get("reason","")),
    }


## Feature builders — content, geo, experience, role, social‑fit, language

In [None]:

def build_text_similarity(df: pd.DataFrame, embedder):
    """Pairwise text similarity over interests + skills_have + professional.

    The three columns are joined with " ; " per user, the embedder is fitted
    on and then encodes that corpus, and the Gram matrix of the embeddings is
    min-max scaled to [0, 1].  The diagonal is left as computed.
    """
    interests, skills, professional = (
        df[column].fillna('') for column in ('interests', 'skills_have', 'professional')
    )
    corpus = (interests + " ; " + skills + " ; " + professional).tolist()
    embedder.fit(corpus)
    vectors = embedder.encode(corpus)
    gram = vectors @ vectors.T
    lowest, highest = gram.min(), gram.max()
    return (gram - lowest) / (highest - lowest + 1e-9)

def language_overlap(df: pd.DataFrame) -> np.ndarray:
    """Jaccard overlap of the users' language sets, with a zero diagonal.

    Codes are read from the comma-separated ``languages`` column and compared
    case-insensitively.  Two users with no languages at all score 0.
    """
    language_sets = []
    for raw in df['languages'].fillna(''):
        codes = {token.strip().lower() for token in (raw or "").split(",") if token.strip()}
        language_sets.append(codes)

    count = len(df)
    overlap = np.zeros((count, count))
    for i, mine in enumerate(language_sets):
        for j, theirs in enumerate(language_sets):
            if i != j:
                # The epsilon keeps 0/0 finite when both sets are empty.
                overlap[i, j] = len(mine & theirs) / float(len(mine | theirs) + 1e-9)
    return overlap

def geo_similarity(df: pd.DataFrame, decay_km: float = 1200.0) -> np.ndarray:
    """Distance-decayed proximity between every ordered pair of users.

    Each off-diagonal entry is ``exp(-distance_km / decay_km)`` using the
    geodesic distance between the users' (lat, lon) points; the matrix is then
    divided by its maximum when that maximum is positive.  Diagonal stays 0.
    """
    points = list(zip(df['lat'], df['lon']))
    count = len(df)
    proximity = np.zeros((count, count), dtype=float)
    for i, origin in enumerate(points):
        for j, target in enumerate(points):
            if i == j:
                continue
            km = geodesic(origin, target).km
            proximity[i, j] = np.exp(-km/decay_km)
    peak = proximity.max()
    if peak > 0:
        proximity = proximity/peak
    return proximity

def distance_matrix(df: pd.DataFrame) -> np.ndarray:
    """Pairwise geodesic distance in kilometres between users' (lat, lon) points.

    NOTE: the same function is defined again in the graph/fusion cell; when the
    cells run top to bottom, that later definition rebinds this name.
    """
    points = list(zip(df['lat'], df['lon']))
    total = len(df)
    dist = np.zeros((total, total))
    for i, here in enumerate(points):
        for j, there in enumerate(points):
            if i != j:
                dist[i, j] = geodesic(here, there).km
    return dist

def experience_compatibility(years: List[int], sweet_spot: float = 3.0) -> np.ndarray:
    """Score pairs by how close their experience gap is to ``sweet_spot`` years.

    Uses a Gaussian bump centred on ``sweet_spot`` (width = ``sweet_spot``)
    over the absolute difference in years, a zero diagonal, and a final
    division by the maximum when that maximum is positive.
    """
    yrs = np.array(years)
    gap = np.abs(yrs[:, None] - yrs[None, :])
    S = np.exp(-((gap - sweet_spot) ** 2) / (2 * (sweet_spot ** 2)))
    np.fill_diagonal(S, 0.0)  # self-pairs are never scored
    peak = S.max()
    if peak > 0:
        S = S / peak
    return S

# Directed role-complementarity weights: ROLE_COMP[role_a][role_b] is the score
# ``role_complementarity`` assigns to the ordered pair (a, b).  The table is not
# symmetric (e.g. Founder->Writer 0.6 vs Writer->Founder 0.8), and only the
# "Undecided" row has an "Undecided" column; any pair missing from the table
# falls back to 0.2 in ``role_complementarity``.
ROLE_COMP = {
    "Founder": {"Engineer": 1.0, "Designer": 1.0, "Researcher": 0.8, "Founder": 0.2, "Writer":0.6, "Scientist":0.7, "Creator":0.8},
    "Engineer": {"Founder": 1.0, "Designer": 0.7, "Engineer": 0.2, "Researcher": 0.6, "Writer":0.6, "Scientist":0.8, "Creator":0.7},
    "Designer": {"Founder": 1.0, "Engineer": 0.7, "Designer": 0.2, "Researcher": 0.5, "Writer":0.6, "Scientist":0.5, "Creator":0.9},
    "Researcher": {"Founder": 0.8, "Engineer": 0.7, "Designer": 0.5, "Researcher": 0.3, "Writer":0.5, "Scientist":0.9, "Creator":0.6},
    "Writer": {"Founder":0.8, "Engineer":0.6, "Designer":0.7, "Researcher":0.5, "Writer":0.2, "Scientist":0.5, "Creator":0.9},
    "Scientist":{"Founder":0.9, "Engineer":0.9, "Designer":0.5, "Researcher":0.8, "Writer":0.5, "Scientist":0.2, "Creator":0.6},
    "Creator":{"Founder":0.9, "Engineer":0.7, "Designer":0.9, "Researcher":0.6, "Writer":0.9, "Scientist":0.6, "Creator":0.3},
    "Undecided": {"Founder":0.6,"Engineer":0.6,"Designer":0.6,"Researcher":0.6,"Writer":0.6,"Scientist":0.6,"Creator":0.6,"Undecided":0.2}
}

def role_complementarity(df: pd.DataFrame) -> np.ndarray:
    """Look up ``ROLE_COMP`` for every ordered pair of users' roles.

    Pairs not present in the table score 0.2; the diagonal stays 0.
    """
    role_names = df['role'].tolist()
    count = len(role_names)
    scores = np.zeros((count, count), dtype=float)
    for i, source_role in enumerate(role_names):
        row_weights = ROLE_COMP.get(source_role, {})
        for j, target_role in enumerate(role_names):
            if i != j:
                scores[i, j] = row_weights.get(target_role, 0.2)
    return scores

def energy_compatibility(energies: List[int], target_gap=0):
    """Score pairs by how close their energy-level gap is to ``target_gap``.

    Gaussian kernel with a fixed width of 1.25 over the absolute difference,
    zero diagonal, divided by the maximum when that maximum is positive.
    """
    levels = np.array(energies)
    gap = np.abs(levels[:, None] - levels[None, :])
    S = np.exp(-((gap - target_gap) ** 2) / (2 * (1.25 ** 2)))
    np.fill_diagonal(S, 0.0)  # self-pairs are never scored
    peak = S.max()
    if peak > 0:
        S = S / peak
    return S

# Compatibility between collaboration styles, looked up as
# COLLAB_COMP[style_a][style_b] by ``collab_style_compatibility``.
# Identical styles score 1.0 and the async/sync pairing scores lowest (0.3).
COLLAB_COMP = {
    "async": {"async":1.0, "hybrid":0.7, "sync":0.3},
    "hybrid":{"async":0.7, "hybrid":1.0, "sync":0.7},
    "sync":  {"async":0.3, "hybrid":0.7, "sync":1.0},
}
def collab_style_compatibility(styles: List[str]) -> np.ndarray:
    """Look up ``COLLAB_COMP`` for every ordered pair of collaboration styles.

    Unknown styles (on either side) score 0.5; the diagonal stays 0.
    """
    count = len(styles)
    scores = np.zeros((count, count))
    for i, own_style in enumerate(styles):
        row_weights = COLLAB_COMP.get(own_style, {})
        for j, other_style in enumerate(styles):
            if i != j:
                scores[i, j] = row_weights.get(other_style, 0.5)
    return scores

def availability_overlap(avails: List[str]) -> np.ndarray:
    """Score pairs by how similar their weekly availability buckets are.

    Each bucket label is replaced by a representative number of hours
    (unknown labels count as 7.5), and the score decays exponentially with
    the absolute difference (scale 15).  Zero diagonal; divided by the
    maximum when that maximum is positive.
    """
    bucket_hours = {"2-5":3.5,"5-10":7.5,"10-20":15.0,"20+":25.0}
    hours = np.array([bucket_hours.get(label, 7.5) for label in avails])
    gap = np.abs(hours[:, None] - hours[None, :])
    S = np.exp(-gap / 15.0)
    np.fill_diagonal(S, 0.0)  # self-pairs are never scored
    peak = S.max()
    if peak > 0:
        S = S / peak
    return S

def time_zone_overlap(tz_list: List[int]) -> np.ndarray:
    """Score pairs by how close their UTC offsets are on the 24-hour clock.

    The offset difference wraps around: -11 and +12 are one hour apart, not
    23.  The score is ``exp(-hours_apart / 6)`` with a zero diagonal, divided
    by the maximum when that maximum is positive.  An empty list yields an
    empty (0, 0) matrix.
    """
    tz = np.asarray(tz_list, dtype=float)
    n = len(tz)
    if n == 0:
        return np.zeros((0, 0))
    raw = np.abs(tz[:, None] - tz[None, :]) % 24.0
    # Shortest way around the clock face, so the result is always in [0, 12].
    hours_apart = np.minimum(raw, 24.0 - raw)
    S = np.exp(-hours_apart / 6.0)
    np.fill_diagonal(S, 0.0)  # self-pairs are never scored
    peak = S.max()
    if peak > 0:
        S = S / peak
    return S


## Skills — similar vs complementary

In [None]:

def parse_skill_list(sk: str) -> List[str]:
    """Split a comma-separated skills string into trimmed, lower-cased tokens.

    ``None`` and the empty string both yield an empty list.
    """
    skills = []
    for piece in (sk or "").split(","):
        trimmed = piece.strip()
        if trimmed:
            skills.append(trimmed.lower())
    return skills

def tfidf_cosine(a_list: List[str], b_list: List[str]) -> float:
    """Cosine similarity between two term lists under a two-document TF-IDF.

    Each list is joined with "; " into one document and a unigram+bigram
    TF-IDF is fitted on just those two documents.  Returns 0.0 when the
    vectorizer finds no usable tokens (for example both lists empty, or only
    single-character terms), instead of raising.
    """
    docs = ["; ".join(a_list), "; ".join(b_list)]
    vec = TfidfVectorizer(ngram_range=(1,2), min_df=1)
    try:
        X = vec.fit_transform(docs)
    except ValueError:
        # Raised by the vectorizer for an empty vocabulary: nothing to
        # compare, so treat the pair as dissimilar.
        return 0.0
    return float(cosine_similarity(X[0], X[1])[0,0])

def similar_skills_matrix(df: pd.DataFrame, embedder=None) -> np.ndarray:
    """Pairwise similarity of users' ``skills_have`` lists.

    With an embedder whose ``sbert_ok`` flag is truthy, the skills strings are
    encoded and the Gram matrix is min-max scaled.  Otherwise every pair is
    compared with ``tfidf_cosine`` and the matrix is divided by its maximum
    when that maximum is positive.  The diagonal is 0 in both cases.
    """
    count = len(df)
    if embedder is not None and getattr(embedder, "sbert_ok", False):
        vectors = embedder.encode(df['skills_have'].fillna('').tolist())
        sim = vectors @ vectors.T
        sim = (sim - sim.min())/(sim.max()-sim.min()+1e-9)
        np.fill_diagonal(sim, 0.0)
        return sim

    sim = np.zeros((count, count))
    skill_lists = [parse_skill_list(raw) for raw in df['skills_have'].fillna('')]
    for i, left in enumerate(skill_lists):
        for j, right in enumerate(skill_lists):
            if i != j:
                sim[i, j] = tfidf_cosine(left, right)
    peak = sim.max()
    if peak > 0:
        sim = sim/peak
    return sim

def complementary_skills_matrix(df: pd.DataFrame) -> np.ndarray:
    """Score how well user j's skills cover what user i wants to learn.

    ``S[i, j]`` compares i's wanted skills (``skills_want``, or ``interests``
    when that column is absent) with j's ``skills_have``:

    - With SciPy available, each wanted skill is matched one-to-one to a held
      skill by minimum-cost assignment on ``1 - cosine`` of per-term TF-IDF
      vectors, and the matched similarities are averaged.
    - Without SciPy, each wanted skill takes its best ``tfidf_cosine`` against
      any held skill and these are averaged.

    Pairs where either list is empty score 0, the diagonal is 0, and the
    matrix is divided by its maximum when that maximum is positive.  Only the
    SciPy import is guarded; other errors are no longer swallowed, and a pair
    whose terms give the vectorizer an empty vocabulary scores 0.
    """
    def _parsed_column(candidates):
        # First column that exists wins; missing cells count as empty lists.
        for column in candidates:
            if column in df.columns:
                return [parse_skill_list(cell) for cell in df[column].fillna('')]
        return [[] for _ in range(len(df))]

    wants = _parsed_column(('skills_want', 'interests'))
    haves = _parsed_column(('skills_have',))
    n = len(df)
    S = np.zeros((n, n))

    try:
        from scipy.optimize import linear_sum_assignment
    except ImportError:
        linear_sum_assignment = None

    def _assignment_score(need, have):
        # One-to-one matching of wanted terms to held terms.
        vec = TfidfVectorizer(ngram_range=(1,2), min_df=1)
        try:
            X = vec.fit_transform(need + have)
        except ValueError:
            return 0.0  # empty vocabulary: nothing comparable
        m, k = len(need), len(have)
        Csim = cosine_similarity(X[:m], X[m:])  # (m, k) in a single call
        size = max(m, k)
        padded = np.ones((size, size))  # dummy rows/cols cost 1.0
        padded[:m, :k] = 1.0 - Csim  # cost = 1 - sim
        r_ind, c_ind = linear_sum_assignment(padded)
        total_sim = 0.0; count = 0
        for r, c in zip(r_ind, c_ind):
            if r < m and c < k:  # ignore matches against padding
                total_sim += 1.0 - padded[r, c]; count += 1
        return total_sim / (count + 1e-9)

    def _coverage_score(need, have):
        # Fallback: best held-skill match for each wanted term, averaged.
        sims = []
        for nterm in need:
            best = 0.0
            for h in have:
                try:
                    best = max(best, tfidf_cosine([nterm], [h]))
                except ValueError:
                    continue  # empty vocabulary for this term pair
            sims.append(best)
        return float(np.mean(sims)) if sims else 0.0

    pair_score = _assignment_score if linear_sum_assignment is not None else _coverage_score
    for i, need in enumerate(wants):
        if not need:
            continue
        for j, have in enumerate(haves):
            if i == j or not have:
                continue
            S[i, j] = pair_score(need, have)
    if S.size and S.max() > 0:
        S = S/S.max()
    return S


## Collaborative filtering — lightweight MF (implicit feedback)

In [None]:

def mf_als(R: np.ndarray, k: int = 16, alpha: float = 40.0, reg: float = 0.1, iters: int = 6):
    """Confidence-weighted alternating least squares on an interaction matrix.

    ``R`` (rows = users, columns = items) is factorized into ``k``-dimensional
    user and item factors with confidence ``1 + alpha * R`` and ridge penalty
    ``reg``, alternating for ``iters`` sweeps.  Initial factors are drawn from
    NumPy's global random state.

    Returns the user-user cosine similarity of the learned user factors,
    min-max scaled to [0, 1] with a zero diagonal (shape n_users x n_users).
    An input with no rows yields an empty (0, 0) matrix.

    The confidence weights are applied by broadcasting rather than by
    materializing a dense diagonal matrix per row and column.
    """
    R = np.asarray(R, dtype=float)
    n_users, n_items = R.shape
    if n_users == 0:
        return np.zeros((0, 0))
    C = 1 + alpha * R  # confidence
    X = np.random.normal(scale=0.1, size=(n_users, k))
    Y = np.random.normal(scale=0.1, size=(n_items, k))
    ridge = reg * np.eye(k)
    for _ in range(iters):
        for u in range(n_users):
            Yw = Y.T * C[u]  # same as Y.T @ diag(C[u]), shape (k, n_items)
            X[u] = np.linalg.solve(Yw @ Y + ridge, Yw @ R[u])
        for i in range(n_items):
            Xw = X.T * C[:, i]  # same as X.T @ diag(C[:, i]), shape (k, n_users)
            Y[i] = np.linalg.solve(Xw @ X + ridge, Xw @ R[:, i])
    Xn = X / (np.linalg.norm(X, axis=1, keepdims=True) + 1e-9)
    S = Xn @ Xn.T
    S = (S - S.min())/(S.max()-S.min()+1e-9)
    np.fill_diagonal(S, 0.0)
    return S


## Graph (PPR), language/geo gating, and fusion (local vs global)

In [None]:

def reciprocalize(S: np.ndarray) -> np.ndarray:
    """Symmetrize a directed score matrix via the geometric mean of S and S.T.

    A pair scores high only when both directions do.  The 1e-12 inside the
    square root means exact zeros come out as 1e-6 rather than 0.
    """
    mutual = np.multiply(S, S.T)
    return np.sqrt(mutual + 1e-12)

def build_graph_ppr(R: np.ndarray, alpha=0.85) -> np.ndarray:
    """Personalized PageRank scores over the directed graph implied by ``R``.

    An edge u -> v exists wherever ``R[u, v] > 0``.  For every node u,
    PageRank is run with all restart mass on u, and the resulting scores fill
    row u.  The matrix is min-max scaled and its diagonal is zeroed.
    """
    size = R.shape[0]
    graph = nx.DiGraph()
    graph.add_nodes_from(range(size))
    for u in range(size):
        for v in range(size):
            if R[u, v] > 0:
                graph.add_edge(u, v)

    scores = np.zeros((size, size))
    for source in range(size):
        restart = {node: 0.0 for node in range(size)}
        restart[source] = 1.0
        ranks = nx.pagerank(graph, alpha=alpha, personalization=restart)
        for target, value in ranks.items():
            scores[source, target] = value
    scores = (scores - scores.min())/(scores.max()-scores.min()+1e-12)
    np.fill_diagonal(scores, 0.0)
    return scores

def combine_content(S_text, S_geo, S_exp, S_role, S_energy, S_collab, S_avail, S_tz, S_lang, w):
    """Weighted sum of the nine content signals, divided by its maximum.

    ``w`` must hold exactly nine weights, in the same order as the matrix
    arguments (text, geo, experience, role, energy, collab, availability,
    time zone, language).
    """
    w_text, w_geo, w_exp, w_role, w_energy, w_collab, w_avail, w_tz, w_lang = w
    weighted = (
        (w_text, S_text), (w_geo, S_geo), (w_exp, S_exp), (w_role, S_role),
        (w_energy, S_energy), (w_collab, S_collab), (w_avail, S_avail),
        (w_tz, S_tz), (w_lang, S_lang),
    )
    first_weight, first_matrix = weighted[0]
    total = first_weight * first_matrix
    for weight, matrix in weighted[1:]:
        total = total + weight * matrix
    return total / (total.max() + 1e-9)

def fuse_scores(S_content, S_cf, S_graph, S_person, S_skills, weights=(0.30,0.18,0.16,0.10,0.26)):
    """Blend the five signal matrices into one score matrix.

    Each input is first made reciprocal (``reciprocalize``), then combined
    with ``weights`` in the order content, collaborative filtering, graph,
    personality, skills.  The blend is divided by its maximum.
    """
    w_content, w_cf, w_graph, w_person, w_skills = weights
    blended = (
        w_content * reciprocalize(S_content)
        + w_cf * reciprocalize(S_cf)
        + w_graph * reciprocalize(S_graph)
        + w_person * reciprocalize(S_person)
        + w_skills * reciprocalize(S_skills)
    )
    return blended / (blended.max() + 1e-12)

def distance_matrix(df: pd.DataFrame) -> np.ndarray:
    """Pairwise geodesic distance in kilometres between users' (lat, lon) points.

    NOTE: this repeats the ``distance_matrix`` defined in the feature-builders
    cell; being later in the file, this definition is the one left bound when
    the cells run top to bottom.
    """
    coords = [(lat, lon) for lat, lon in zip(df['lat'], df['lon'])]
    n = len(df)
    D = np.zeros((n, n))
    for i, j in np.ndindex(n, n):
        if i == j:
            continue
        D[i, j] = geodesic(coords[i], coords[j]).km
    return D

def apply_language_requirement(S: np.ndarray, S_lang: np.ndarray, require_shared: bool) -> np.ndarray:
    """Zero out scores for pairs with no shared language.

    When ``require_shared`` is false, ``S`` is returned untouched; otherwise
    every entry whose ``S_lang`` value is not strictly positive becomes 0.
    """
    if require_shared:
        shares_language = (S_lang > 0.0).astype(float)
        return S * shares_language
    return S


## Diversification — MMR and DPP‑style greedy

In [None]:

def mmr_rank(query_idx: int, S: np.ndarray, K: int = 5, lambda_rel: float = 0.7):
    """Pick up to ``K`` candidates for ``query_idx`` by maximal marginal relevance.

    The first pick is the candidate most similar to the query.  Each later
    pick maximizes ``lambda_rel * relevance - (1 - lambda_rel) * redundancy``,
    where redundancy is the candidate's highest similarity to anything already
    picked.  Ties go to the lowest index.  Returns indices in pick order.
    """
    pool = [c for c in range(S.shape[0]) if c != query_idx]
    chosen = []
    while pool and len(chosen) < K:
        best, best_value = None, None
        for cand in pool:
            if chosen:
                redundancy = max(S[cand, s] for s in chosen)
                value = lambda_rel * S[query_idx, cand] - (1-lambda_rel) * redundancy
            else:
                value = S[query_idx, cand]
            # Strict ">" keeps the earliest candidate on ties.
            if best is None or value > best_value:
                best, best_value = cand, value
        chosen.append(best)
        pool.remove(best)
    return chosen

def dpp_greedy(query_idx: int, S: np.ndarray, K: int = 5):
    """Greedy quality-minus-redundancy selection of up to ``K`` candidates.

    Quality is the query's row of ``S`` divided by that row's maximum.  The
    first pick has the highest quality; each later pick maximizes quality
    minus the candidate's highest similarity to anything already picked.
    Ties go to the lowest index.  Returns user indices in pick order.

    Quality is looked up by user index.  Looking it up by position in the
    candidate list (which omits ``query_idx``) would give every candidate
    after the query its neighbour's quality.
    """
    n = S.shape[0]
    remaining = [i for i in range(n) if i != query_idx]
    quality = S[query_idx]
    q = quality / (quality.max() + 1e-9)
    selected = []

    def gain(r):
        penalty = max(S[r, s] for s in selected) if selected else 0.0
        return q[r] - penalty

    while remaining and len(selected) < K:
        chosen = max(remaining, key=gain)  # max() keeps the first on ties
        selected.append(chosen)
        remaining.remove(chosen)
    return selected


## Contextual bandit — LinUCB (choose blend weights per‑user context)

In [None]:

class LinUCB:
    """Disjoint LinUCB contextual bandit with one linear model per arm.

    Each arm keeps a ridge-regression design matrix ``A`` (identity at start),
    a response vector ``b`` and the current coefficient estimate ``theta``.
    ``arms`` is stored as float arrays but is not used in scoring; callers
    index it with the arm number returned by :meth:`select`.

    The context dimension ``d`` is taken from the first context seen unless
    it is given explicitly.
    """

    def __init__(self, arms: List[np.ndarray], alpha: float = 0.3, d: int = None):
        self.arms = [np.array(a, dtype=float) for a in arms]
        self.alpha = alpha  # width of the exploration bonus
        self.A = []
        self.b = []
        self.d = d
        self.theta = []

    def _init_if_needed(self, x: np.ndarray):
        """Lazily create the per-arm state on the first context."""
        if self.d is None:
            self.d = len(x)
        if not self.A:
            for _ in self.arms:
                self.A.append(np.eye(self.d))
                self.b.append(np.zeros(self.d))
                self.theta.append(np.zeros(self.d))

    def select(self, x: np.ndarray) -> int:
        """Return the index of the arm with the highest upper confidence bound.

        The bound is ``theta . x + alpha * sqrt(x^T A^-1 x)``; ties go to the
        lowest arm index.
        """
        x = np.asarray(x, dtype=float)
        self._init_if_needed(x)
        bounds = []
        for i in range(len(self.arms)):
            A_inv = np.linalg.inv(self.A[i])
            theta = A_inv @ self.b[i]
            self.theta[i] = theta
            bounds.append(theta @ x + self.alpha * np.sqrt(x @ A_inv @ x))
        return int(np.argmax(bounds))

    def update(self, arm_idx: int, x: np.ndarray, reward: float):
        """Fold one (context, reward) observation into arm ``arm_idx``.

        ``theta`` for that arm is refreshed immediately, so it reflects the
        latest observation without waiting for the next ``select`` call.
        """
        x = np.asarray(x, dtype=float)
        self._init_if_needed(x)
        self.A[arm_idx] += np.outer(x, x)
        self.b[arm_idx] += reward * x
        self.theta[arm_idx] = np.linalg.solve(self.A[arm_idx], self.b[arm_idx])


## Generate 120 users across cities & languages

In [None]:

# Vocabularies used by ``mk_user`` to synthesize the demo cohort.
roles = ["Founder","Engineer","Designer","Researcher","Writer","Scientist","Creator"]
seniorities = ["Junior","Mid","Senior","Lead/Principal","Executive/Founder"]
# Each city entry is (name, latitude, longitude, tz offset, language codes);
# ``mk_user`` unpacks the tuple in exactly this order.
cities = [
    ("Toronto",43.6532,-79.3832,-5, ["en","fr"]),
    ("New York",40.7128,-74.0060,-5, ["en","es"]),
    ("San Francisco",37.7749,-122.4194,-8, ["en","zh"]),
    ("London",51.5072,-0.1276,0, ["en","fr","de"]),
    ("Berlin",52.52,13.405,1, ["de","en"]),
    ("Nairobi",-1.286389,36.817223,3, ["en","sw"]),
    ("Sydney",-33.8688,151.2093,10, ["en"]),
    ("Bangalore",12.9716,77.5946,5, ["en","hi"]),
    ("Paris",48.8566,2.3522,1, ["fr","en"]),
    ("Mexico City",19.4326,-99.1332,-6, ["es","en"])
]

# Pool sampled for both ``skills_have`` and ``skills_want``.
skill_bank = [
    "python","pytorch","tensorflow","django","react","nextjs","go","kubernetes","aws","gcp",
    "video editing","storyboarding","scriptwriting","podcasting","seo","branding","figma","design systems",
    "statistics","causal inference","nlp","cv","prompt engineering","sql","dbt","airflow",
    "grant writing","field research","lab techniques","oceanography","genomics","biostatistics",
    "supply chain","marketing","growth","product","fundraising","strategy"
]
# Pool sampled for the ``interests`` field.
interest_bank = [
    "ocean conservation","coral reef restoration","climate tech","educational apps","healthcare AI",
    "creator economy","open source tools","social impact","rural connectivity","financial inclusion",
    "short-form video","long-form YouTube","beauty brand","lipstick R&D","fashion sustainability",
    "music production","publishing","newsletter growth","sports analytics","mental health",
    "language learning","VR social spaces","next social network","privacy-first messaging"
]

def rand_words(pool, kmin, kmax):
    """Return a ", "-joined random sample of between kmin and kmax pool items.

    Draws the sample size first, then the sample, both from the ``random``
    module's global generator.
    """
    how_many = random.randint(kmin, kmax)
    picked = random.sample(pool, how_many)
    return ", ".join(picked)

def random_dob():
    """Return a random ``YYYY-MM-DD`` date of birth between 1961 and 2004.

    The day is capped at 28 so every month/day combination is valid.
    """
    year = random.randint(1961, 2004)
    month = random.randint(1,12)
    day = random.randint(1,28)
    return f"{year:04d}-{month:02d}-{day:02d}"

def mk_user(i):
    """Build one synthetic, normalized user profile named ``User{i:03d}``.

    City, role, seniority, skills, interests, experience, goal, reason and
    spoken languages are drawn from the module-level pools with the global
    ``random`` generator.  The order of the draws is part of the behaviour:
    it determines the cohort produced for a given seed.
    """
    home_city, latitude, longitude, tz_hours, city_langs = random.choice(cities)
    picked_role = random.choice(roles)
    picked_seniority = random.choice(seniorities)
    have = rand_words(skill_bank, 3, 7)
    want = rand_words(skill_bank, 2, 5)
    topics = rand_words(interest_bank, 3, 7)
    n_years = random.randint(1, 18)
    goal = random.choice(['launch a YouTube channel on ML','build ocean microplastics sensors','start a cruelty-free lipstick brand','prototype a privacy-first social app'])
    why = random.choice(["Find projects","Expand network","Find collaborators","Build a dream"])
    # Speak one language two times out of three, otherwise two (capped by
    # how many the city offers).
    n_langs = min(len(city_langs), random.choice([1,1,2]))
    spoken = random.sample(city_langs, n_langs)

    raw_profile = {
        "name": f"User{i:03d}",
        "dob": random_dob(),
        "location_city": home_city,
        "location_country": "",
        "lat": latitude,
        "lon": longitude,
        "tz_offset": tz_hours,
        "languages": ", ".join(spoken),
        "availability_hours": random.choice(["2-5","5-10","10-20","20+"]),
        "energy_1to5": random.randint(1,5),
        "collab_style": random.choice(["async","hybrid","sync"]),
        "role": picked_role,
        "seniority": picked_seniority,
        "years_exp": n_years,
        "skills_have": have,
        "skills_want": want,
        "interests": topics,
        "human": f"I live in {home_city}. I like calm schedules and meetups; I enjoy running and cooking.",
        "professional": f"As a {picked_role.lower()} with {n_years} years, I worked across startups and labs. I can produce prototypes, brand systems, docs, and production code.",
        "contributor": "I prefer weekly demos and short design docs. I bring reliability, curiosity, and momentum to small teams with clear ownership.",
        "interests_long": f"Goals: {goal}.",
        "reason": why,
    }
    return normalize_intake(raw_profile)

# Build the 120-user demo cohort.  All profiles are generated first and the
# questionnaire responses afterwards, so the order of draws from the global
# ``random`` generator (and therefore the cohort) is fixed for a given seed.
records = [mk_user(i) for i in range(1,121)]
# Ten responses per user, each drawn from 2..6 inclusive.
tipi_all = [[random.randint(2,6) for _ in range(10)] for __ in range(120)]
bfs = [score_tipi(t) for t in tipi_all]
users = pd.DataFrame(records)
# One ``BigFive`` object per row, stored in an object column.
users['bf'] = bfs
# Notebook display of the first rows.
users.head(3)


## Build similarity signals

In [None]:

# --- Content signals: one n x n matrix per feature over the ``users`` frame ---
embedder = Embedder()
S_text = build_text_similarity(users, embedder)
S_lang = language_overlap(users)
S_geo  = geo_similarity(users, decay_km=1200.0)
S_exp  = experience_compatibility(users['years_exp'].tolist())
S_role = role_complementarity(users)
S_energy = energy_compatibility(users['energy_1to5'].tolist())
S_collab = collab_style_compatibility(users['collab_style'].tolist())
S_avail  = availability_overlap(users['availability_hours'].tolist())
S_tz     = time_zone_overlap(users['tz_offset'].tolist())
# Raw pairwise distances in km; used for the local-radius gating further down.
D_km     = distance_matrix(users)

# --- Skill signals: "similar" vs "complementary" (wants vs haves) ---
S_skills_sim = similar_skills_matrix(users, embedder if getattr(embedder,"sbert_ok",False) else None)
S_skills_comp = complementary_skills_matrix(users)

# --- Synthetic implicit feedback: R[u, v] = 1.0 means "u liked v" ---
# 900 random ordered pairs are drawn.  A pair becomes a like when it matches a
# role rule, or with probability 0.25 for same-city pairs, or with probability
# 0.04 otherwise.  The elif chain short-circuits, so how many random.random()
# draws are consumed depends on which branch is taken.
n = len(users); R = np.zeros((n,n), dtype=float)
for _ in range(900):
    u = random.randrange(n); v = random.randrange(n)
    if u==v: continue
    if users.iloc[u].role=="Founder" and users.iloc[v].role in ["Engineer","Designer"]: R[u,v]=1.0
    elif users.iloc[u].role=="Creator" and users.iloc[v].role in ["Writer","Designer","Engineer"]: R[u,v]=1.0
    elif users.iloc[u].location_city==users.iloc[v].location_city and random.random()<0.25: R[u,v]=1.0
    elif random.random() < 0.04: R[u,v]=1.0

# --- Behavioural signals derived from the likes matrix ---
S_cf = mf_als(R, k=16)
S_graph = build_graph_ppr(R, alpha=0.82)

# --- Personality signal: cosine similarity of the stored trait profiles ---
# NOTE(review): the row lookups via ``users.iloc`` run n*n times; extracting
# the ``bf`` column to a list first would likely be faster.
S_person = np.zeros((n,n))
for i in range(n):
    for j in range(n):
        if i==j: continue
        S_person[i,j] = bigfive_cosine(users.iloc[i].bf, users.iloc[j].bf)
# Min-max scale; the zero diagonal takes part in the min.
S_person = (S_person - S_person.min())/(S_person.max()-S_person.min()+1e-9)


## Matching functions — local/global toggle + language requirement

In [None]:

def match_candidates(query_idx: int, skills_mode="similar", k=5, search_mode="local", 
                     local_radius_km: float = 50.0, require_shared_language: bool = True,
                     diversifier="mmr", content_weights=(0.24,0.16,0.10,0.10,0.10,0.10,0.10,0.05,0.05),
                     blend_weights=(0.34,0.18,0.14,0.12,0.22)):
    """Return up to ``k`` recommended matches for the user at ``query_idx``.

    Scores are the fusion of the content blend, collaborative filtering,
    graph, personality and a skills signal (``"similar"`` uses skill
    similarity; any other ``skills_mode`` uses complementarity).

    In ``search_mode="local"`` candidates beyond ``local_radius_km`` and, when
    ``require_shared_language`` is true, candidates sharing no language with
    the query are excluded from the pool.  Fewer than ``k`` rows (possibly
    none) are returned when too few candidates qualify, rather than padding
    the list with zero-score users.  Any other ``search_mode`` applies no
    gating, and the language flag is ignored.

    ``diversifier`` selects the picker: ``"mmr"``, ``"dpp"``, or anything
    else for a plain top-k by score.

    Returns a copy of the selected ``users`` rows plus a ``score`` column.
    Reads the module-level signal matrices, ``D_km`` and ``users``.
    """
    S_content = combine_content(S_text, S_geo, S_exp, S_role, S_energy, S_collab, S_avail, S_tz, S_lang, content_weights)
    S_sk = S_skills_sim if skills_mode=="similar" else S_skills_comp
    S_final = fuse_scores(S_content, S_cf, S_graph, S_person, S_skills=S_sk, weights=blend_weights)

    eligible = np.ones(S_final.shape[0], dtype=bool)
    if search_mode == "local":
        # Mask out users beyond radius
        within_radius = D_km[query_idx] <= local_radius_km
        S_final = S_final * within_radius.astype(float).reshape(1,-1)
        eligible &= within_radius
        if require_shared_language:
            S_final = apply_language_requirement(S_final, S_lang, True)
            eligible &= (S_lang[query_idx] > 0.0)
    # The query itself stays in the pool so the pickers can index its row.
    eligible[query_idx] = True

    # Ascending index order keeps the pickers' tie-breaking unchanged.  With
    # no gating the pool is every user and the submatrix equals S_final.
    pool = np.flatnonzero(eligible)
    query_pos = int(np.searchsorted(pool, query_idx))
    S_pool = S_final[np.ix_(pool, pool)]

    if diversifier == "mmr":
        local_picks = mmr_rank(query_pos, S_pool, K=k, lambda_rel=0.72)
    elif diversifier == "dpp":
        local_picks = dpp_greedy(query_pos, S_pool, K=k)
    else:
        others = [p for p in range(len(pool)) if p != query_pos]
        local_picks = sorted(others, key=lambda p: -S_pool[query_pos, p])[:k]
    picks = [int(pool[p]) for p in local_picks]

    cols = ['name','location_city','languages','role','seniority','interests','skills_have','skills_want','years_exp',
            'age','age_band','energy_1to5','collab_style','availability_hours','reason']
    out = users.iloc[picks][cols].copy()
    out['score'] = [S_final[query_idx,j] for j in picks]
    return out


## Team formation — local constraints (distance, language)

In [None]:

def team_score(set_ids: List[int], query_idx: int, S_final: np.ndarray, skills_need: List[str], users_df: pd.DataFrame):
    """Score a candidate team for the user at ``query_idx``.

    The score is ``0.55 * relevance + 0.30 * coverage + 0.15 * diversity``:

    - relevance: mean score between the query and each member;
    - coverage: share of ``skills_need`` found (case-insensitively) in the
      members' combined ``skills_have``;
    - diversity: 1 minus the mean pairwise score among members (1.0 for a
      single-member team).

    An empty team scores 0.0.
    """
    if not set_ids:
        return 0.0

    relevance = np.mean([S_final[query_idx, member] for member in set_ids])

    wanted = {skill.strip().lower() for skill in skills_need if skill.strip()}
    pooled = set()
    for member in set_ids:
        raw_skills = users_df.iloc[member].skills_have
        pooled.update(tok.strip().lower() for tok in raw_skills.split(",") if tok.strip())
    coverage = len(wanted & pooled) / (len(wanted) + 1e-9)

    size = len(set_ids)
    if size > 1:
        pairwise = [
            S_final[set_ids[a], set_ids[b]]
            for a in range(size)
            for b in range(a+1, size)
        ]
        diversity = 1.0 - float(np.mean(pairwise))
    else:
        diversity = 1.0
    return 0.55*relevance + 0.30*coverage + 0.15*diversity

def form_team(query_idx: int, skills_need_text: str, K: int = 4, search_mode="local",
              local_radius_km=50.0, require_shared_language=True, blend_weights=(0.34,0.18,0.14,0.12,0.22)):
    """Greedily assemble up to ``K`` teammates for the user at ``query_idx``.

    Candidates must pass the hard constraints: within ``local_radius_km``
    when ``search_mode`` is ``"local"``, and at least one language in common
    with the query when ``require_shared_language`` is true (this check
    applies in every search mode).  At each step the candidate with the
    largest gain in ``team_score`` is added.

    ``skills_need_text`` is a comma-separated list of skills the team should
    cover.  Returns the selected ``users`` rows plus a ``match_score`` column.
    Reads the module-level signal matrices, ``D_km`` and ``users``.
    """
    needed_skills = [piece.strip() for piece in skills_need_text.split(",") if piece.strip()]
    S_content = combine_content(S_text, S_geo, S_exp, S_role, S_energy, S_collab, S_avail, S_tz, S_lang,
                                (0.24,0.20,0.10,0.10,0.10,0.10,0.08,0.04,0.04))
    S_final = fuse_scores(S_content, S_cf, S_graph, S_person, S_skills=S_skills_comp, weights=blend_weights)

    def spoken(idx):
        # Lower-cased language codes of one user.
        return {code.strip().lower() for code in users.iloc[idx].languages.split(",") if code.strip()}

    # Parsed once rather than once per candidate.
    query_langs = spoken(query_idx) if require_shared_language else None

    feasible = []
    for j in range(len(users)):
        if j == query_idx:
            continue
        if search_mode=="local" and D_km[query_idx,j] > local_radius_km:
            continue
        if require_shared_language and len(query_langs & spoken(j))==0:
            continue
        feasible.append(j)

    selected = []
    while feasible and len(selected) < K:
        current = team_score(selected, query_idx, S_final, needed_skills, users)
        best_gain, best_j = -1, None
        for cand in feasible:
            gain = team_score(selected+[cand], query_idx, S_final, needed_skills, users) - current
            if gain > best_gain:
                best_gain, best_j = gain, cand
        if best_j is None:
            break
        selected.append(best_j)
        feasible.remove(best_j)

    cols = ['name','location_city','languages','role','seniority','skills_have','years_exp','tz_offset','availability_hours','collab_style']
    df = users.iloc[selected][cols].copy()
    df['match_score'] = [S_final[query_idx,j] for j in selected]
    return df


## LinUCB demo — learn which blend weights to use for a user

In [None]:

# Candidate blend-weight vectors ("arms") for the bandit.  Each vector is
# passed as ``blend_weights`` to ``match_candidates``, so its five components
# are the content, collaborative-filtering, graph, personality and skills
# weights, in that order.
arms = [
    np.array([0.40,0.18,0.10,0.10,0.22]),
    np.array([0.28,0.26,0.16,0.10,0.20]),
    np.array([0.32,0.18,0.14,0.16,0.20]),
    np.array([0.30,0.18,0.12,0.10,0.30]),
]
# alpha sets the size of the exploration bonus in the arm scores.
linucb = LinUCB(arms, alpha=0.35)

def user_context_vector(idx: int) -> np.ndarray:
    """Build the 10-dimensional bandit context for the user at ``idx``.

    Layout: [energy / 5, (tz_offset + 12) / 24, min(years_exp, 20) / 20]
    followed by a 7-slot one-hot of the role.  Roles outside the seven known
    ones (for example the intake default "Undecided") leave the one-hot all
    zeros instead of being encoded as "Founder".

    Reads the module-level ``users`` frame.
    """
    profile = users.iloc[idx]
    e = profile.energy_1to5 / 5.0
    # NOTE(review): offsets above +12 map slightly above 1.0 here.
    tz = (profile.tz_offset + 12) / 24.0
    exp = min(profile.years_exp, 20) / 20.0
    role_map = {"Founder":0,"Engineer":1,"Designer":2,"Researcher":3,"Writer":4,"Scientist":5,"Creator":6}
    role_onehot = np.zeros(len(role_map))
    slot = role_map.get(profile.role)
    if slot is not None:
        role_onehot[slot] = 1.0
    return np.concatenate([[e,tz,exp], role_onehot])

# Ten bandit rounds for a single user (row 0): choose an arm, rank matches
# with that arm's blend weights, and reward the arm when the top match shares
# the user's city or at least one language.
q = 0
for t in range(10):
    x = user_context_vector(q)
    arm_idx = linucb.select(x)
    res = match_candidates(q, skills_mode="similar", k=5, search_mode="global", blend_weights=arms[arm_idx])
    # ``res`` keeps the index labels of ``users``; they are used as row
    # positions below, which presumes the default 0..n-1 index.
    top_idx = res.index[0]
    city_ok = users.iloc[q].location_city == users.iloc[top_idx].location_city
    # Language strings are split on ", ", the separator written by normalize_intake.
    lang_ok = len(set(users.iloc[q].languages.split(", ")) & set(users.iloc[top_idx].languages.split(", ")))>0
    reward = 1.0 if (city_ok or lang_ok) else 0.0
    linucb.update(arm_idx, x, reward)

# Notebook display: the per-arm coefficient estimates held by the bandit.
linucb.theta


## Demo — local vs global matches & local team build

In [None]:

# First user found in each demo city.  Indexing with [0] raises IndexError if
# the random cohort happens to contain nobody from that city.
mx_idx = users.index[users.location_city=="Mexico City"][0]
ny_idx = users.index[users.location_city=="New York"][0]

# Local search: 60 km radius, shared language required, MMR diversification,
# complementary-skills signal.
print("=== LOCAL (Mexico City) — matches (require shared language) ===")
mx_local = match_candidates(mx_idx, skills_mode="complementary", k=5, search_mode="local",
                            local_radius_km=60.0, require_shared_language=True, diversifier="mmr")
print(mx_local[['name','location_city','languages','role','score']].to_string(index=False))

# Global search: no radius gating, similar-skills signal, "dpp" diversifier.
print("\n=== GLOBAL (Mexico City) — matches (no geo gating) ===")
mx_global = match_candidates(mx_idx, skills_mode="similar", k=5, search_mode="global",
                             diversifier="dpp")
print(mx_global[['name','location_city','languages','role','score']].to_string(index=False))

# Local team of up to four people around the New York user for the listed skills.
print("\n=== LOCAL TEAM (New York) — need: react, product, branding, growth ===")
ny_team = form_team(ny_idx, "react, product, branding, growth", K=4, search_mode="local",
                    local_radius_km=50.0, require_shared_language=True)
print(ny_team.to_string(index=False))


## Final — print ALL names

In [None]:

# Print every generated name (one per line, no index) so the whole cohort can
# be checked by eye; the trailing ``head()`` is the notebook cell's display value.
all_names_df = users[['name']].copy()
print(all_names_df.to_string(index=False))
all_names_df.head()
