Run this notebook only after you have completed the run_model module up through the calculation of relative player value.

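This notebook walks through how to estimate a value for the `vorp_scale` argument of the `calculate_relative_value` function by fitting a linear regression of player ADP (average draft position, negated so that earlier picks have higher value) on the model's `z_score_value` and `vorp` columns.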

In [14]:
import sys
from pathlib import Path

# Add parent directory to path
sys.path.insert(0, str(Path.cwd().parent))

import pandas as pd
import requests
from bs4 import BeautifulSoup
from helper import split_name
import statsmodels.api as sm

# Player value table, read from value_df.csv relative to the working directory
value_df = pd.read_csv(filepath_or_buffer="value_df.csv")

# FantasyPros page that lists overall MLB ADP
adp_url = "https://www.fantasypros.com/mlb/adp/overall.php"

In [19]:
def pull_adp(url: str) -> pd.DataFrame:
    """
    Download the FantasyPros MLB overall ADP table and return a tidy frame.

    Parameters
    ----------
    url : str
        Address of the FantasyPros ADP page to download.

    Returns
    -------
    pd.DataFrame
        Columns ``Player``, ``Team``, ``Positions`` and ``ADP`` (taken from the
        page's ``AVG`` column), passed through ``split_name(df, "Player")``,
        which the rest of this notebook relies on for ``first_name`` and
        ``last_name``. Rows without a numeric ADP or without a player name
        are dropped. The index of the parsed table is kept as-is.

    Raises
    ------
    requests.HTTPError
        If the page responds with an error status.
    ValueError
        If no table is found, or the first table has no player or ``AVG``
        column.
    """
    from io import StringIO  # local import so this cell works on its own

    headers = {"User-Agent": "Mozilla/5.0"}
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()

    # Let pandas parse the HTML table. The markup is wrapped in StringIO
    # because passing literal HTML to read_html is deprecated in recent pandas.
    tables = pd.read_html(StringIO(r.text))
    if not tables:
        raise ValueError("No tables found on page.")

    adp_raw = tables[0].copy()

    # Expected columns on the page: Rank, Player (Team), RTS, NFBC, AVG.
    # Column labels can vary slightly, so locate the two we need by name.
    avg_col = next((c for c in adp_raw.columns if str(c).strip().upper() == "AVG"), None)
    player_col = next((c for c in adp_raw.columns if "player" in str(c).lower()), None)

    if avg_col is None or player_col is None:
        raise ValueError(f"Unexpected table columns: {adp_raw.columns.tolist()}")

    # Keep missing player cells as NaN instead of the literal string "nan",
    # so the dropna() below can actually remove them.
    names = adp_raw[player_col]
    player_raw = names.astype(str).where(names.notna())

    # Example raw string: "Aaron Judge (NYY - LF,CF,RF,DH)".
    # The pattern starts at the opening parenthesis so that a hyphen inside
    # the player's name (e.g. "Kiner-Falefa") is not mistaken for the
    # team/position separator.
    team_pos = player_raw.str.extract(
        r"\(\s*(?P<Team>[^()-]+?)\s*-\s*(?P<Positions>[^()]+?)\s*\)"
    )

    df = pd.DataFrame(
        {
            "Player": player_raw.str.replace(r"\s*\(.*\)$", "", regex=True).str.strip(),
            "Team": team_pos["Team"].str.strip(),
            "Positions": team_pos["Positions"].str.strip(),
            # Non-numeric AVG cells become NaN and are dropped below.
            "ADP": pd.to_numeric(adp_raw[avg_col], errors="coerce"),
        }
    ).dropna(subset=["ADP", "Player"])

    # Match the existing pipeline convention for name columns.
    df = split_name(df, "Player")
    return df

def estimate_vorp_scale_from_adp(value_df: pd.DataFrame, adp_url: str) -> float:
    """
    Estimate a ``vorp_scale`` value by regressing draft value on model outputs.

    Joins ``value_df`` to the ADP table returned by ``pull_adp(adp_url)`` on
    ``first_name`` / ``last_name``, defines ``Value = -ADP`` (so earlier picks
    have higher value), fits OLS ``Value ~ const + z_score_value + vorp``,
    prints the model summary, and returns the fitted ``vorp`` coefficient.

    Parameters
    ----------
    value_df : pd.DataFrame
        Must contain ``first_name``, ``last_name``, ``z_score_value`` and
        ``vorp``.
    adp_url : str
        Page passed to ``pull_adp``.

    Returns
    -------
    float
        Coefficient on ``vorp`` from the fitted model.

    Raises
    ------
    KeyError
        If ``value_df`` lacks a required column.
    ValueError
        If no player has both an ADP and non-null model values after the join.
    """
    name_keys = ["first_name", "last_name"]
    features = ["z_score_value", "vorp"]

    # Fail fast, before the network request, with a message naming the gap.
    missing = [c for c in name_keys + features if c not in value_df.columns]
    if missing:
        raise KeyError(f"value_df is missing required columns: {missing}")

    # Only the join keys and ADP are used below; dropping the rest avoids
    # _x/_y suffix collisions with same-named columns in value_df.
    adp_df = pull_adp(adp_url)[name_keys + ["ADP"]]

    reg_df = (
        value_df
        # Suffix only the left side so "ADP" always means the FantasyPros
        # column, even if value_df already carries an ADP column.
        .merge(adp_df, on=name_keys, how="left", suffixes=("_value_df", ""))
        .assign(Value=lambda d: -d["ADP"])
        # Players with no ADP match have NaN Value and are removed here.
        .dropna(subset=["Value", *features])
    )

    if reg_df.empty:
        raise ValueError(
            "No players matched between value_df and the ADP table; "
            "check the first_name/last_name columns on both sides."
        )

    X = sm.add_constant(reg_df[features])
    y = reg_df["Value"].astype(float)

    model = sm.OLS(y, X).fit()
    print(model.summary())
    # NOTE(review): this is the raw OLS coefficient (ADP picks per unit of
    # vorp). Confirm calculate_relative_value expects it on that scale rather
    # than relative to the z_score_value coefficient.
    return float(model.params["vorp"])

In [17]:
# Ensure value_df has first_name/last_name. Split "Name" only when those
# columns are missing, so re-running this cell leaves value_df unchanged.
if not {"first_name", "last_name"}.issubset(value_df.columns):
    value_df = split_name(value_df, "Name")

In [20]:
# Fit Value ~ z_score_value + vorp, where Value is the negated ADP, and keep
# the fitted vorp coefficient as the VORP scale estimate.
vorp_scale = estimate_vorp_scale_from_adp(value_df=value_df, adp_url=adp_url)

                            OLS Regression Results                            
Dep. Variable:                  Value   R-squared:                       0.444
Model:                            OLS   Adj. R-squared:                  0.438
Method:                 Least Squares   F-statistic:                     69.17
Date:                Thu, 08 Jan 2026   Prob (F-statistic):           8.46e-23
Time:                        09:42:13   Log-Likelihood:                -933.06
No. Observations:                 176   AIC:                             1872.
Df Residuals:                     173   BIC:                             1882.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const          -121.7172      6.552    -18.576



In [24]:
# Pull ADP again at notebook scope and rebuild the regression frame, so the
# cells below can reuse `reg_df` for their own fits.
adp_df = pull_adp(adp_url)
reg_df = (
    pd.merge(value_df, adp_df, how="left", on=["first_name", "last_name"])
    .assign(Value=lambda frame: -frame["ADP"])
    .dropna(subset=["Value", "vorp", "z_score_value"])
)

# Single-predictor fit: Value ~ const + z_score_value
y = reg_df["Value"].astype(float)
X = sm.add_constant(reg_df.loc[:, ["z_score_value"]])

model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  Value   R-squared:                       0.443
Model:                            OLS   Adj. R-squared:                  0.439
Method:                 Least Squares   F-statistic:                     138.1
Date:                Thu, 08 Jan 2026   Prob (F-statistic):           7.53e-24
Time:                        09:49:06   Log-Likelihood:                -933.34
No. Observations:                 176   AIC:                             1871.
Df Residuals:                     174   BIC:                             1877.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const          -125.5401      4.102    -30.602



In [25]:
# Single-predictor fit: Value ~ const + vorp
y = reg_df["Value"].astype(float)
X = sm.add_constant(reg_df.loc[:, ["vorp"]])

model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  Value   R-squared:                       0.347
Model:                            OLS   Adj. R-squared:                  0.343
Method:                 Least Squares   F-statistic:                     92.56
Date:                Thu, 08 Jan 2026   Prob (F-statistic):           7.75e-18
Time:                        09:49:21   Log-Likelihood:                -947.23
No. Observations:                 176   AIC:                             1898.
Df Residuals:                     174   BIC:                             1905.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -143.2485      5.677    -25.233      0.0

In [26]:
from sklearn.linear_model import Ridge

# Ridge regression (alpha=1.0) of Value on both predictors; the coefficients
# come back in column order: z_score_value, then vorp.
y = reg_df["Value"].values
X = reg_df.loc[:, ["z_score_value", "vorp"]].values

ridge = Ridge(alpha=1.0)
ridge.fit(X, y)
ridge.coef_

array([ 0.04389139, -0.03815363])