In [1]:
!pip install espn_api


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m


In [39]:
import pandas as pd
import numpy as np
import espn_api as espn
from espn_api.football import League
import matplotlib.pyplot as plt
import seaborn as sns

### Connect to league

In [58]:
import os
# from espn_api.football import League

LEAGUE_ID = 86952922
YEARS = [2021, 2022, 2024]

# Only the most recent season in YEARS is loaded.
y = YEARS[-1]

# The ESPN session cookies are secrets: read them from the environment instead of
# storing them in the notebook. A missing variable raises KeyError right here.
# NOTE(review): cookie values that were previously committed in this cell should
# be treated as leaked and invalidated.
espn_s2 = os.environ["ESPN_S2"]
swid = os.environ["ESPN_SWID"]

kwargs = {"league_id": LEAGUE_ID, "year": y, "swid": swid, "espn_s2": espn_s2}
league = League(**kwargs)

# Number of teams, or None when the league object exposes no (or an empty) `teams`.
teams_count = len(league.teams) if getattr(league, "teams", None) else None

# Read a small bit of data to validate the connection.
draft = league.draft

### Get historical fantasy stats from Pro Football Reference

In [153]:
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd
import re

def scrape_pfr_fantasy(year: int) -> pd.DataFrame:
    """Scrape the season fantasy table from Pro Football Reference.

    Downloads ``/years/{year}/fantasy.htm``, locates ``<table id="fantasy">``
    (either in the parsed document or inside an HTML comment) and builds one
    record per body row, keyed by each cell's ``data-stat`` attribute.

    Args:
        year: Season whose fantasy page is requested.

    Returns:
        DataFrame whose columns follow the last non-``over_header`` header row,
        followed by any keys that only appeared in body rows. Columns that
        parse cleanly as numbers are converted; all others are left as text.

    Raises:
        ValueError: If the page request does not succeed or the fantasy table
            cannot be found in the response.
    """
    url = f"https://www.pro-football-reference.com/years/{year}/fantasy.htm"
    headers = {"User-Agent": "Mozilla/5.0"}

    # A timeout keeps a stalled connection from blocking the kernel indefinitely.
    response = requests.get(url, headers=headers, timeout=30)
    if not response.ok:
        # Fail with the real cause instead of a misleading "table not found" later.
        raise ValueError(
            f"Fantasy page request for {year} failed with HTTP {response.status_code}"
        )
    soup = BeautifulSoup(response.text, "html.parser")

    def find_fantasy_table(doc: BeautifulSoup):
        """Return the fantasy <table> from the document or from a comment, else None."""
        # 1) table present directly in the parsed document
        live = doc.find("table", {"id": "fantasy"})
        if live is not None:
            return live
        # 2) table markup wrapped inside an HTML comment
        for comment in doc.find_all(string=lambda node: isinstance(node, Comment)):
            if 'id="fantasy"' in comment:
                hidden = BeautifulSoup(comment, "html.parser").find("table", {"id": "fantasy"})
                if hidden is not None:
                    return hidden
        return None

    table = find_fantasy_table(soup)
    if table is None:
        raise ValueError(f"Fantasy table not found for {year}")

    # Column order comes from the LAST header row in <thead>, ignoring the
    # grouping rows marked with class "over_header". A missing/empty <thead>
    # simply yields no preferred order.
    thead = table.find("thead")
    header_rows = [] if thead is None else [
        tr for tr in thead.find_all("tr") if "over_header" not in tr.get("class", [])
    ]
    cols = []
    if header_rows:
        for th in header_rows[-1].find_all("th"):
            key = th.get("data-stat", "").strip()
            if key:  # skip header cells without a data-stat key
                cols.append(key)

    # Parse body rows using data-stat keys; fall back to the whole table when
    # there is no <tbody> (header rows then produce empty records and are skipped).
    tbody = table.find("tbody")
    body_rows = (tbody if tbody is not None else table).find_all("tr")

    data = []
    for tr in body_rows:
        # Repeated header rows inside the body carry class="thead".
        if "thead" in tr.get("class", []):
            continue

        row = {}
        # Row header cell (<th scope="row">), stored under its data-stat or "rk".
        th = tr.find("th", {"scope": "row"})
        if th is not None:
            k = th.get("data-stat", "").strip() or "rk"
            row[k] = th.get_text(strip=True)

        for td in tr.find_all("td"):
            k = td.get("data-stat", "").strip()
            if not k:
                continue
            row[k] = td.get_text(strip=True)

        # skip completely empty rows
        if not row:
            continue
        data.append(row)

    df = pd.DataFrame(data)

    # Header-defined columns first (those actually present), then any extras.
    extras = [c for c in df.columns if c not in cols]
    df = df[[c for c in cols if c in df.columns] + extras]

    # Drop repeated header rows that slipped through; copy so the column
    # assignments below operate on an independent frame, not a filtered view.
    if "player" in df.columns:
        df = df[df["player"].str.lower() != "player"].copy()

    # Convert columns that are fully numeric; text columns are left untouched.
    # "fantasy_pos" is the position key seen in the scraped table.
    text_columns = ("player", "team", "pos", "tm", "fantasy_pos")
    for c in df.columns:
        if c in text_columns:
            continue
        try:
            df[c] = pd.to_numeric(df[c])
        except (ValueError, TypeError):
            # Same outcome as the deprecated errors="ignore": keep the column as-is.
            continue

    return df

# Example usage
# NOTE(review): the result is stored as `df_2021` but the season requested is
# 2020 — confirm which season is intended and align the name or the argument.
df_2021 = scrape_pfr_fantasy(2020)


In [154]:
# Lift the column cap so wide frames render in full, then preview the first rows.
pd.set_option(
    'display.max_columns',
    None,
)
df_2021.head(5)

Unnamed: 0,ranker,player,team,fantasy_pos,age,g,gs,pass_cmp,pass_att,pass_yds,pass_td,pass_int,rush_att,rush_yds,rush_yds_per_att,rush_td,targets,rec,rec_yds,rec_yds_per_rec,rec_td,fumbles,fumbles_lost,all_td,two_pt_md,two_pt_pass,fantasy_points,fantasy_points_ppr,draftkings_points,fanduel_points,vbd,fantasy_rank_pos,fantasy_rank_overall
0,1,Derrick Henry*+,TEN,RB,26,16,16,0,0,0,0,0,378,2027,5.36,17,31,19,114,6.0,0,3.0,2,17,1.0,,314.0,333.1,341.1,323.6,184.0,1,1.0
1,2,Alvin Kamara*,NOR,RB,25,15,10,0,0,0,0,0,187,932,4.98,16,107,83,756,9.11,5,1.0,0,21,,,295.0,377.8,383.8,336.3,165.0,2,2.0
2,3,Dalvin Cook*,MIN,RB,25,14,14,0,0,0,0,0,312,1557,4.99,16,54,44,361,8.2,1,5.0,3,17,3.0,,294.0,337.8,346.8,315.8,164.0,3,3.0
3,4,Davante Adams*+,GNB,WR,28,14,14,0,0,0,0,0,0,0,,0,149,115,1374,11.95,18,1.0,1,18,,,243.0,358.4,362.4,300.9,117.0,1,4.0
4,5,Travis Kelce*+,KAN,TE,31,15,15,1,2,4,0,0,0,0,,0,145,105,1416,13.49,11,1.0,1,11,1.0,,208.0,312.8,316.8,260.3,117.0,1,5.0


In [155]:
# Clean display name: drop the `*` / `+` markers appended to scraped player
# names (e.g. "Derrick Henry*+") and trim surrounding whitespace.
df_2021['player_name'] = df_2021['player'].str.replace(r"[*+]", "", regex=True).str.strip()

# Keep only the four skill positions and the columns needed downstream.
# `.copy()` makes `stats` an independent frame, so the assignments below modify
# it reliably instead of acting on a slice of `df_2021` (chained assignment).
stats = df_2021.loc[
    df_2021['fantasy_pos'].isin(['QB', 'RB', 'WR', 'TE']),
    ['player_name', 'team', 'fantasy_pos', 'g', 'gs', 'fantasy_points_ppr'],
].copy()

# Missing PPR totals count as 0 so every player can be ranked.
stats['fantasy_points_ppr'] = stats['fantasy_points_ppr'].fillna(0)

# Dense rank within each position (1 = most PPR points).
stats["pos_rank"] = (
    stats.groupby("fantasy_pos")["fantasy_points_ppr"]
      .rank(method="dense", ascending=False)
      .astype(int)
)

# Dense rank across all kept positions.
stats["full_rank"] = (
    stats["fantasy_points_ppr"]
      .rank(method="dense", ascending=False)
      .astype(int)
)

In [156]:
# Twenty highest PPR totals across the kept positions.
stats.sort_values(by='fantasy_points_ppr', ascending=False).iloc[:20]

Unnamed: 0,player_name,team,fantasy_pos,g,gs,fantasy_points_ppr,pos_rank,full_rank
6,Josh Allen,BUF,QB,16,16,396.1,1,1
7,Aaron Rodgers,GNB,QB,16,16,383.3,2,2
8,Kyler Murray,ARI,QB,16,16,378.7,3,3
1,Alvin Kamara,NOR,RB,15,10,377.8,1,4
9,Patrick Mahomes,KAN,QB,15,15,374.4,4,5
10,Deshaun Watson,HOU,QB,16,16,369.3,5,6
15,Russell Wilson,SEA,QB,16,16,359.8,6,7
3,Davante Adams,GNB,WR,14,14,358.4,1,8
22,Ryan Tannehill,TEN,QB,16,16,344.4,7,9
25,Tom Brady,TAM,QB,16,16,337.9,8,10


In [157]:
# How many players to keep per position when building the top-by-position view.
rates = {
    'QB': 12,
    'RB': 30,
    'WR': 30,
    'TE': 12,
}

def top_by_position_df(df: pd.DataFrame, rates: dict):
    """Select the best-ranked players for each position.

    Works on a copy of ``df`` with ``fantasy_pos`` upper-cased. For every
    ``position -> n`` entry in ``rates`` the matching rows are ordered by
    ``pos_rank`` then ``full_rank`` (missing values last) and the first ``n``
    are kept.

    Args:
        df: Frame with ``fantasy_pos``, ``pos_rank`` and ``full_rank`` columns.
        rates: Mapping of position code to the number of players to keep.

    Returns:
        Tuple ``(by_position, overall)``: the per-position selections stacked
        in ``rates`` order with a fresh 0..n-1 index, and the same rows
        re-ordered by ``full_rank``. With empty ``rates`` both are empty.
    """
    players = df.copy()
    players["fantasy_pos"] = players["fantasy_pos"].str.upper()

    top_slices = [
        players[players["fantasy_pos"] == position]
        .sort_values(by=["pos_rank", "full_rank"], na_position="last")
        .head(limit)
        for position, limit in rates.items()
    ]

    if top_slices:
        by_position = pd.concat(top_slices, ignore_index=True)
    else:
        # Nothing requested: return an empty frame with the same columns.
        by_position = players.iloc[0:0]

    overall = by_position.sort_values(by=["full_rank"], na_position="last")
    return by_position, overall


# Per-position selections and the same rows ordered by overall rank.
by_pos_df, flat_df = top_by_position_df(df=stats, rates=rates)

In [158]:
# First twenty rows of the overall-rank ordering.
flat_df.iloc[:20]

Unnamed: 0,player_name,team,fantasy_pos,g,gs,fantasy_points_ppr,pos_rank,full_rank
0,Josh Allen,BUF,QB,16,16,396.1,1,1
1,Aaron Rodgers,GNB,QB,16,16,383.3,2,2
2,Kyler Murray,ARI,QB,16,16,378.7,3,3
12,Alvin Kamara,NOR,RB,15,10,377.8,1,4
3,Patrick Mahomes,KAN,QB,15,15,374.4,4,5
4,Deshaun Watson,HOU,QB,16,16,369.3,5,6
5,Russell Wilson,SEA,QB,16,16,359.8,6,7
42,Davante Adams,GNB,WR,14,14,358.4,1,8
6,Ryan Tannehill,TEN,QB,16,16,344.4,7,9
7,Tom Brady,TAM,QB,16,16,337.9,8,10


### Create Draft Value Metric 

In [161]:
def compute_vorp_star(
    df: pd.DataFrame,
    teams: int = 12,
    starters_per_team: dict = None,
    use_ppg: bool = False,
    min_games_for_ppg: int = 8,
    pool_factor: float = 2.0,         # pool ~ 2× starters per position
    winsor_limits: tuple = (0.02, 0.98)  # clamp extremes for robust scale
):
    """
    Score players by value over a per-position replacement level.

    Args:
        df: Frame with ``fantasy_pos`` and ``fantasy_points_ppr`` columns, plus
            ``g`` (games) when ``use_ppg`` is True. Not modified.
        teams: Number of teams in the league.
        starters_per_team: Starters per team for each position. Defaults to
            ``{"QB": 1, "RB": 2.5, "WR": 2.5, "TE": 1}``. Positions missing
            from the mapping get no replacement level (baseline 0).
        use_ppg: Use points per game instead of the season total.
        min_games_for_ppg: Games below which a season is flagged when
            ``use_ppg`` is True.
        pool_factor: The spread is estimated on the top
            ``max(R * pool_factor, min(group size, R))`` players of a position,
            where ``R = int(teams * starters)``.
        winsor_limits: ``(low, high)`` quantiles used to clip the pool before
            measuring its spread.

    Returns:
        A copy of ``df`` (original index order) with added columns:
          - points_used         (PPG or season total, NaN replaced by 0)
          - injury_flag         (True if g < min_games_for_ppg or PPG is not
                                 finite when use_ppg; otherwise False)
          - rep_points          (points_used of the R-th best player at the
                                 position; 0.0 if R is 0 or exceeds the group)
          - vorp_raw            (points_used - rep_points)
          - pos_scale_robust    (population std dev of the winsorized pool of
                                 vorp_raw; 1.0 if that is 0 or not finite)
          - vorp_star           (vorp_raw / pos_scale_robust)
          - vorp_star_rank_overall, vorp_star_rank_pos (dense ranks, 1 = best)
        If there are no position groups (e.g. empty input) an empty frame with
        these columns is returned.
    """

    df = df.copy()

    # 1) Decide points metric (total or PPG)
    if use_ppg:
        # 0 games -> NaN instead of a division by zero
        df["points_used"] = df["fantasy_points_ppr"] / df["g"].replace(0, np.nan)
        # flag small-sample seasons (evaluated before NaNs are filled below)
        df["injury_flag"] = (df["g"] < min_games_for_ppg) | (~np.isfinite(df["points_used"]))
    else:
        df["points_used"] = df["fantasy_points_ppr"]
        df["injury_flag"] = False

    # Fill NaNs (e.g. 0 games) with 0 so the later arithmetic and ranking work;
    # with use_ppg those rows were already flagged above.
    df["points_used"] = df["points_used"].fillna(0.0)

    # 2) Replacement level per position
    if starters_per_team is None:
        starters_per_team = {"QB": 1, "RB": 2.5, "WR": 2.5, "TE": 1}

    # Replacement rank per position = int(teams * starters_per_team[pos])
    rep_index = {
        pos: int(teams * starters_per_team.get(pos, 0))
        for pos in df["fantasy_pos"].unique()
    }

    def winsorize(values, lo=0.02, hi=0.98):
        """Clip values to their [lo, hi] quantile range."""
        ql, qh = np.quantile(values, [lo, hi])
        return np.clip(values, ql, qh)

    out_frames = []
    for pos, grp in df.groupby("fantasy_pos", sort=False):
        # sort high→low by points_used
        gsort = grp.sort_values("points_used", ascending=False)

        # Replacement rank (e.g. RB30). Without a valid rank fall back to a
        # zero baseline.
        R = rep_index.get(pos, 0)
        if R <= 0 or R > len(gsort):
            rep_points = 0.0
        else:
            rep_points = gsort.iloc[R - 1]["points_used"]

        gsort = gsort.assign(rep_points=rep_points)
        gsort = gsort.assign(vorp_raw=gsort["points_used"] - rep_points)

        # Pool ~ top (pool_factor × starters), never fewer than the starters
        # available; 24 (or the whole group) when no replacement rank exists.
        pool_size = int(max(R * pool_factor, min(len(gsort), R))) if R > 0 else min(len(gsort), 24)
        pool = gsort.iloc[:pool_size]["vorp_raw"].values

        # Winsorize the pool, then take its population standard deviation.
        pool_w = winsorize(pool, *winsor_limits) if len(pool) else np.array([0.0])
        scale = np.std(pool_w, ddof=0)
        if scale == 0 or not np.isfinite(scale):
            scale = 1.0   # avoid divide by 0

        gsort = gsort.assign(pos_scale_robust=scale)
        gsort = gsort.assign(vorp_star=gsort["vorp_raw"] / scale)

        out_frames.append(gsort)

    if not out_frames:
        # No position groups (empty input, or every position missing):
        # pd.concat([]) would raise, so return an empty frame with the same
        # output columns instead.
        result = df.iloc[0:0].copy()
        for col in ("rep_points", "vorp_raw", "pos_scale_robust", "vorp_star"):
            result[col] = pd.Series(dtype=float)
        for col in ("vorp_star_rank_overall", "vorp_star_rank_pos"):
            result[col] = pd.Series(dtype=int)
        return result

    # Restore the caller's row order.
    result = pd.concat(out_frames, axis=0).sort_index()

    # Dense ranks, 1 = highest vorp_star
    result["vorp_star_rank_overall"] = result["vorp_star"].rank(method="dense", ascending=False).astype(int)
    result["vorp_star_rank_pos"] = (
        result.groupby("fantasy_pos")["vorp_star"].rank(method="dense", ascending=False).astype(int)
    )

    return result

# ----------------------------------------------------------------------
# Score the `stats` frame (player_name, team, fantasy_pos, g, gs,
# fantasy_points_ppr, pos_rank, full_rank).
# ----------------------------------------------------------------------

# 12 teams on season totals. The per-team starter counts put the replacement
# rank at int(12 * 1.25) = 15 for QB/TE and int(12 * 2.5) = 30 for RB/WR.
df_v = compute_vorp_star(
    df=stats,
    teams=12,
    starters_per_team={"QB": 1.25, "RB": 2.5, "WR": 2.5, "TE": 1.25},
    use_ppg=False,            # True switches to points per game
    min_games_for_ppg=8,
    pool_factor=1,
    winsor_limits=(0.02, 0.98),
)

# Added by compute_vorp_star: points_used, injury_flag, rep_points, vorp_raw,
# pos_scale_robust, vorp_star, vorp_star_rank_overall, vorp_star_rank_pos.
# Preview the thirty highest vorp_star values.
cols = [
    "player_name",
    "team",
    "fantasy_pos",
    "fantasy_points_ppr",
    "vorp_star",
    "vorp_star_rank_overall",
    "vorp_star_rank_pos",
]
df_v[cols].sort_values(by="vorp_star", ascending=False).head(30)


Unnamed: 0,player_name,team,fantasy_pos,fantasy_points_ppr,vorp_star,vorp_star_rank_overall,vorp_star_rank_pos
3,Davante Adams,GNB,WR,358.4,4.284542,1,1
1,Alvin Kamara,NOR,RB,377.8,4.193523,2,1
4,Travis Kelce,KAN,TE,312.8,3.616856,3,1
5,Tyreek Hill,KAN,WR,328.9,3.527692,4,2
16,Stefon Diggs,BUF,WR,328.6,3.519995,5,3
2,Dalvin Cook,MIN,RB,337.8,3.430023,6,2
0,Derrick Henry,TEN,RB,333.1,3.340312,7,3
6,Josh Allen,BUF,QB,396.1,3.109318,8,1
14,Darren Waller,LVR,TE,278.6,2.893485,9,2
7,Aaron Rodgers,GNB,QB,383.3,2.814727,10,2


In [160]:
# Inspect all computed columns for a single player.
df_v[df_v["player_name"] == 'Marquise Brown']

Unnamed: 0,player_name,team,fantasy_pos,g,gs,fantasy_points_ppr,pos_rank,full_rank,points_used,injury_flag,rep_points,vorp_raw,pos_scale_robust,vorp_star,vorp_star_rank_overall,vorp_star_rank_pos
102,Marquise Brown,BAL,WR,16,14,183.0,35,74,183.0,False,191.4,-8.4,38.977325,-0.21551,92,35
