##### Here's the problem: the prosports transaction data is nice, but it's essentially useless if you can't pair it with other data. To do this, you need to join the table on some column. The column to join on is the player, as we are predicting *PLAYER INJURIES*. With this in mind, from [players.sql](./sql/players.sql) we see that out of the 5024 players, 2795 are listed in our [injury list](../pro_sports_transactions/data/IL_movement.csv) data. About 2400 of those names were mapped automatically (a 1:1 match); about 80 of those names simply do not exist within the nba_api player list (older players from the ABA era), leaving around 300 players that can be mapped with some string manipulation / fuzzy matching. This notebook explores how to do that.

In [None]:
import pandas as pd
import re

In [None]:

# Input tables.
players_path = "./data/id_players.csv"
il_path = "./data/IL_movement.csv"
matchables_path = "./data/matchables.csv"

# Output files written further down in the notebook.
out_il_path = "./data/IL_movement_with_ids.csv"
out_map_path = "./data/matchable_name_to_player_id.csv"
out_unmatched_path = "./data/unmatched_matchables.csv"

# Load the three input CSVs in the same order as the paths above.
players, il, matchables = (
    pd.read_csv(csv_path)
    for csv_path in (players_path, il_path, matchables_path)
)


In [None]:

def normalize(s: str) -> str:
    """Return a canonical form of a name for comparison.

    The value is lowercased, dots are dropped, commas become spaces, curly
    apostrophes and backticks become a plain ``'``, runs of whitespace are
    collapsed to a single space, and surrounding whitespace is removed.

    Args:
        s: Raw name. Missing values (``None`` / NaN) are accepted.

    Returns:
        The normalized name, or ``""`` when ``s`` is missing.
    """
    if pd.isna(s):
        return ""
    s = str(s).lower()

    s = s.replace(".", "").replace(",", " ")

    s = s.replace("’", "'").replace("`", "'")

    # Strip last: removing a dot or turning a comma into a space at either
    # end of the string would otherwise leave stray whitespace behind, and
    # "smith " would then fail to match "smith" exactly.
    return re.sub(r"\s+", " ", s).strip()

def strip_parens(s: str) -> str:
    """Drop every parenthesised chunk, e.g. '(James) Mike Scott' -> 'Mike Scott'."""
    # Each "(...)" group is removed together with the whitespace following it.
    without_aliases = re.sub(r"\([^)]*\)\s*", "", s)
    return without_aliases.strip()

def alias_split(s: str):
    """Break a slash-separated alias list into its trimmed, non-empty parts."""
    aliases = []
    for piece in re.split(r"\s*\/\s*", s):
        trimmed = piece.strip()
        if trimmed:
            aliases.append(trimmed)
    return aliases

def partial_ratio_100(a: str, b: str) -> bool:
    """Approximate ``fuzz.partial_ratio == 100``.

    Both inputs are normalized first; the result is True when the shorter
    normalized string occurs verbatim inside the longer one. If either
    normalized string is empty the result is False.
    """
    left = normalize(a)
    right = normalize(b)
    if left == "" or right == "":
        return False
    shorter, longer = (left, right) if len(left) <= len(right) else (right, left)
    return shorter in longer

# Normalized spellings of every player name, with and without parenthetical
# aliases, stored alongside the original columns.
raw_player_names = players["name"]
players["norm"] = raw_player_names.map(normalize)
players["norm_noparen"] = raw_player_names.map(
    lambda full_name: normalize(strip_parens(full_name))
)

# Plain-Python views used by the matching functions: positional lists that
# line up with the rows of `players`, plus a set for fast exact lookups.
player_names = players["name"].tolist()
player_norms = players["norm"].tolist()
player_norms_set = set(player_norms)


In [None]:

def resolve_matchable(nm: str):
    """Resolve one raw name from the matchables table to a player id.

    Candidate spellings are tried in this order: the raw name, the raw name
    with parenthetical aliases removed, each slash-separated alias, and each
    alias with parentheticals removed. An exact match on the normalized
    spelling is preferred. Failing that, the first candidate that contains,
    or is contained in, a normalized player name wins, and among those
    players the one whose normalized name is closest in length is chosen.

    Args:
        nm: Raw name; may be missing or blank.

    Returns:
        A ``(player_id, how)`` tuple. ``player_id`` is ``None`` when nothing
        matched. ``how`` is ``"empty"``, ``"no_match"``,
        ``"direct_norm:<candidate>"`` or
        ``"partial_100:<candidate>-><player name>"``.
    """
    if pd.isna(nm) or not str(nm).strip():
        return None, "empty"
    raw = str(nm).strip()

    aliases = alias_split(raw)
    cands = [
        raw,
        strip_parens(raw),
        *aliases,
        *(strip_parens(alias) for alias in aliases),
    ]

    # Normalize each candidate once, keeping the first spelling seen for each
    # normalized form. Candidates that normalize to "" are dropped: an empty
    # string must never be looked up or substring-matched.
    normalized = {}
    for cand in cands:
        norm = normalize(cand)
        if norm and norm not in normalized:
            normalized[norm] = cand

    # Pass 1: exact match on the normalized spelling.
    for norm, cand in normalized.items():
        if norm in player_norms_set:
            pid = players.loc[players["norm"] == norm, "id"].iloc[0]
            return pid, f"direct_norm:{cand}"

    # Pass 2: substring containment in either direction.
    for norm, cand in normalized.items():
        # An empty player name is a substring of every string, so it is
        # excluded explicitly; otherwise it would match every candidate.
        matches = [
            i
            for i, pn in enumerate(player_norms)
            if pn and (norm in pn or pn in norm)
        ]
        if matches:
            # Closest length first; ties go to the earliest row.
            best_idx = min(
                matches, key=lambda i: abs(len(player_norms[i]) - len(norm))
            )
            best = players.iloc[best_idx]
            return best["id"], f"partial_100:{cand}->{best['name']}"

    return None, "no_match"

In [None]:

# Resolve every matchable name, then split the (player_id, how) pairs into
# two parallel tuples that become the new columns.
resolved_pairs = matchables["name"].map(resolve_matchable)
resolved_ids, resolved_how = zip(*resolved_pairs)
matchables["player_id"] = resolved_ids
matchables["how"] = resolved_how

# Persist the name -> id mapping together with how each match was made.
matchables.to_csv(out_map_path, index=False)


def choose_name(row):
    """Pick the player name for one row: "Acquired" first, else "Relinquished".

    Args:
        row: Any object with a dict-style ``.get`` (a DataFrame row or a
            plain dict) that may hold "Acquired" and "Relinquished" entries.

    Returns:
        The stripped text of the first column holding a usable name, or
        ``""`` when neither does. Missing values and the placeholder texts
        "nan" / "None" are not usable names.
    """
    for column in ("Acquired", "Relinquished"):
        value = row.get(column, "")
        if pd.isna(value):
            continue
        text = str(value).strip()
        # "nan" / "None" are what str() yields for missing cells; returning
        # them would let a row with no player be matched as if "nan" were a
        # real name (it is a substring of names such as "Fernando").
        if text not in ("", "nan", "None"):
            return text
    return ""

# One display name per injury-list row, taken from whichever column has it.
il["player_name"] = il.apply(choose_name, axis=1)

# Lookup tables: exact name -> id, normalized name -> id, and the resolved
# matchable name -> id. On duplicate keys the later row wins.
name_to_id_direct = {
    name: pid for name, pid in zip(players["name"], players["id"])
}
norm_to_id_direct = {
    norm: pid for norm, pid in zip(players["norm"], players["id"])
}
matchable_map = {
    name: pid for name, pid in zip(matchables["name"], matchables["player_id"])
}

In [None]:

def map_row_to_id(nm: str):
    """Map one injury-list player name to a player id.

    Lookups run from most to least certain and stop at the first hit:

    1. exact name, then each slash-separated alias, in ``name_to_id_direct``;
    2. normalized name, then each normalized alias, in ``norm_to_id_direct``;
    3. the ``matchable_map`` entry for the raw name, if it holds an id;
    4. substring containment (either direction) against every normalized
       player name, choosing the player whose normalized name is closest in
       length, which is the same rule ``resolve_matchable`` uses.

    Args:
        nm: Raw player name; may be missing or blank.

    Returns:
        The matched player id, or ``None`` when nothing matched.
    """
    if pd.isna(nm) or not str(nm).strip():
        return None
    raw = str(nm).strip()
    aliases = alias_split(raw)

    # 1. Exact spellings.
    for exact in (raw, *aliases):
        if exact in name_to_id_direct:
            return name_to_id_direct[exact]

    # 2. Normalized spellings. Empty keys are skipped so that a name made
    #    only of punctuation cannot hit a blank player entry.
    n = normalize(raw)
    for key in (n, *(normalize(alias) for alias in aliases)):
        if key and key in norm_to_id_direct:
            return norm_to_id_direct[key]

    # 3. Previously resolved matchables (a missing id means "unresolved").
    mapped = matchable_map.get(raw)
    if pd.notna(mapped):
        return mapped

    # 4. Fuzzy fallback. An empty string is a substring of everything, so
    #    empty names on either side are excluded from the comparison.
    if n:
        matches = [
            i
            for i, pn in enumerate(player_norms)
            if pn and (n in pn or pn in n)
        ]
        if matches:
            # Closest length rather than first row in file order; ties go to
            # the earliest row.
            best_idx = min(
                matches, key=lambda i: abs(len(player_norms[i]) - len(n))
            )
            return players.iloc[best_idx]["id"]

    return None

In [None]:

# Attach an id to every injury-list row and save the enriched table.
il["player_id"] = il["player_name"].map(map_row_to_id)
il.to_csv(out_il_path, index=False)

# Matchables that never received an id are written out for manual review.
unmatched = matchables.loc[matchables["player_id"].isna()]
unmatched.to_csv(out_unmatched_path, index=False)

In [None]:

# Match counts for both tables plus the paths of the files written above.
il_ids = il["player_id"]
matchable_ids = matchables["player_id"]

summary = {
    "IL_rows": il.shape[0],
    # Series.count() counts the non-missing entries.
    "IL_matched": int(il_ids.count()),
    "IL_unmatched": int(il_ids.isna().sum()),
    "matchables_total": matchables.shape[0],
    "matchables_matched": int(matchable_ids.count()),
    "matchables_unmatched": len(unmatched),
    "outputs": {
        "il_with_ids": out_il_path,
        "matchable_map": out_map_path,
        "unmatched_matchables": out_unmatched_path,
    },
}
summary