<a href="https://colab.research.google.com/github/atharva-ketkar1/PrizePicks_Predictor/blob/main/nba_ml_predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
%%capture
!pip install nba_api

# Get all Players on the PrizePicks slate

In [47]:
import requests
import pandas as pd

STAT_ABBREVIATIONS = {
    "Points": "PTS",
    "Rebounds": "REB",
    "Assists": "AST"
}

def fetch_prizepicks_players_and_projections():
    url = "https://api.prizepicks.com/projections"
    params = {"league_id": "7", "per_page": "250"}
    headers = {"User-Agent": "Mozilla/5.0"}

    resp = requests.get(url, params=params, headers=headers)
    data = resp.json()

    projections = data["data"]
    included = {item["id"]: item for item in data["included"]}

    players = set()
    all_records = []

    for proj in projections:
        attr = proj["attributes"]
        stat_type = attr["stat_type"]
        line_score = attr["line_score"]
        odds_type = attr.get("odds_type", "none")

        if odds_type != "standard":
            continue
        if stat_type not in STAT_ABBREVIATIONS:
            continue

        player_id = str(proj["relationships"]["new_player"]["data"]["id"])
        player_info = included.get(player_id, {}).get("attributes", {})
        name = player_info.get("name", "Unknown")
        team = player_info.get("team", "Unknown")

        if "+" in name:
            continue

        players.add(name)

        all_records.append({
            "player": name,
            "team": team,
            "stat_type": STAT_ABBREVIATIONS[stat_type],
            "line_score": line_score
        })

    return players, pd.DataFrame(all_records)


In [48]:
names,live_props = fetch_prizepicks_players_and_projections()
names

{'Aaron Nesmith',
 'Aaron Wiggins',
 'Alex Caruso',
 'Andrew Nembhard',
 'Ben Sheppard',
 'Bennedict Mathurin',
 'Cason Wallace',
 'Chet Holmgren',
 'Isaiah Hartenstein',
 'Isaiah Joe',
 'Jalen Williams',
 'Luguentz Dort',
 'Myles Turner',
 'Obi Toppin',
 'Pascal Siakam',
 'Shai Gilgeous-Alexander',
 'T.J. McConnell',
 'Tyrese Haliburton'}

In [49]:
live_props.head(10)

Unnamed: 0,player,team,stat_type,line_score
0,Isaiah Joe,OKC,PTS,2.5
1,Myles Turner,IND,REB,5.0
2,Myles Turner,IND,PTS,13.5
3,T.J. McConnell,IND,PTS,7.5
4,Luguentz Dort,OKC,REB,3.5
5,Luguentz Dort,OKC,PTS,8.5
6,Aaron Wiggins,OKC,PTS,5.5
7,Jalen Williams,OKC,AST,5.0
8,Jalen Williams,OKC,PTS,21.5
9,Jalen Williams,OKC,REB,5.5


# Get historical stats for each player on the slate

In [50]:
from nba_api.stats.static import players
from nba_api.stats.endpoints import playergamelog
import pandas as pd
import time
import pandas as pd
import numpy as np

### Get active players to search by ID

In [51]:
all_players = players.get_active_players()
name_to_id = {p['full_name']: p['id'] for p in all_players}

In [52]:
def get_player_game_logs(player_name, season='2024-25'):
    if player_name not in name_to_id:
        return None

    player_id = name_to_id[player_name]

    try:
        log = playergamelog.PlayerGameLog(player_id=player_id, season=season)
        df = log.get_data_frames()[0]
        df['player'] = player_name
        return df
    except Exception as e:
        print(f"Failed to fetch {player_name}: {e}")
        return None


In [53]:
all_logs = []

for name in names:
    df = get_player_game_logs(name)
    if df is not None:
        all_logs.append(df)
    time.sleep(1)

game_logs_df = pd.concat(all_logs, ignore_index=True)

In [54]:
game_logs_df

Unnamed: 0,SEASON_ID,Player_ID,Game_ID,GAME_DATE,MATCHUP,WL,MIN,FGM,FGA,FG_PCT,...,REB,AST,STL,BLK,TOV,PF,PTS,PLUS_MINUS,VIDEO_AVAILABLE,player
0,22024,1628983,0022401153,"Apr 08, 2025",OKC vs. LAL,W,37,14,26,0.538,...,6,6,0,1,1,5,42,7,1,Shai Gilgeous-Alexander
1,22024,1628983,0022401135,"Apr 06, 2025",OKC vs. LAL,L,30,12,23,0.522,...,3,9,1,1,1,1,26,-20,1,Shai Gilgeous-Alexander
2,22024,1628983,0022401123,"Apr 04, 2025",OKC @ HOU,L,35,10,22,0.455,...,4,8,3,0,3,2,22,-6,1,Shai Gilgeous-Alexander
3,22024,1628983,0022401108,"Apr 02, 2025",OKC vs. DET,W,35,10,26,0.385,...,6,3,3,2,1,1,33,30,1,Shai Gilgeous-Alexander
4,22024,1628983,0022401094,"Mar 31, 2025",OKC vs. CHI,W,27,9,14,0.643,...,0,12,0,2,2,0,27,29,1,Shai Gilgeous-Alexander
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1198,22024,1641717,0022400140,"Nov 01, 2024",OKC @ POR,W,23,5,7,0.714,...,1,3,1,1,1,0,13,13,1,Cason Wallace
1199,22024,1641717,0022400125,"Oct 30, 2024",OKC vs. SAS,W,29,4,9,0.444,...,1,2,2,1,1,3,9,8,1,Cason Wallace
1200,22024,1641717,0022400100,"Oct 27, 2024",OKC vs. ATL,W,23,2,6,0.333,...,4,1,3,0,2,2,5,13,1,Cason Wallace
1201,22024,1641717,0022400091,"Oct 26, 2024",OKC @ CHI,W,29,4,11,0.364,...,6,2,3,0,0,1,9,15,1,Cason Wallace


## I don't have historical PrizePicks lines, so I will have to simulate random predictions

In [55]:
def simulate_line(stat_values):
    return round(stat_values.mean() + np.random.normal(0, 0.5), 1)

In [56]:
dataset = []

for _, row in game_logs_df.iterrows():
    for stat_type in ["PTS", "REB", "AST"]:
        stat_val = row[stat_type]
        line_score = simulate_line(game_logs_df[(game_logs_df['player'] == row['player'])][stat_type].tail(5))

        dataset.append({
            "player": row["player"],
            "team": row["MATCHUP"].split(" ")[0],
            "stat_type": stat_type,
            "line_score": line_score,
            "actual_outcome": stat_val,
            "opponent": row["MATCHUP"].split(" ")[-1],
            "home": "vs." in row["MATCHUP"],
            "minutes": row["MIN"]
        })

train_df = pd.DataFrame(dataset)

In [57]:
train_df.head(10)

Unnamed: 0,player,team,stat_type,line_score,actual_outcome,opponent,home,minutes
0,Shai Gilgeous-Alexander,OKC,PTS,26.0,42,LAL,True,37
1,Shai Gilgeous-Alexander,OKC,REB,6.6,6,LAL,True,37
2,Shai Gilgeous-Alexander,OKC,AST,5.9,6,LAL,True,37
3,Shai Gilgeous-Alexander,OKC,PTS,25.9,26,LAL,True,30
4,Shai Gilgeous-Alexander,OKC,REB,7.5,3,LAL,True,30
5,Shai Gilgeous-Alexander,OKC,AST,6.5,9,LAL,True,30
6,Shai Gilgeous-Alexander,OKC,PTS,26.6,22,HOU,False,35
7,Shai Gilgeous-Alexander,OKC,REB,7.1,4,HOU,False,35
8,Shai Gilgeous-Alexander,OKC,AST,6.4,8,HOU,False,35
9,Shai Gilgeous-Alexander,OKC,PTS,27.2,33,DET,True,35


In [58]:
train_df[(train_df["team"] == "IND") & (train_df["opponent"] == "OKC")].head(10)

Unnamed: 0,player,team,stat_type,line_score,actual_outcome,opponent,home,minutes
246,Tyrese Haliburton,IND,PTS,14.7,18,OKC,False,29
247,Tyrese Haliburton,IND,REB,3.9,4,OKC,False,29
248,Tyrese Haliburton,IND,AST,6.7,3,OKC,False,29
354,Tyrese Haliburton,IND,PTS,13.8,4,OKC,True,35
355,Tyrese Haliburton,IND,REB,5.1,2,OKC,True,35
356,Tyrese Haliburton,IND,AST,6.7,8,OKC,True,35
456,Ben Sheppard,IND,PTS,5.1,10,OKC,False,22
457,Ben Sheppard,IND,REB,2.3,4,OKC,False,22
458,Ben Sheppard,IND,AST,0.6,2,OKC,False,22
585,Ben Sheppard,IND,PTS,4.5,5,OKC,True,22


# Start Machine Learning with a Random Forest Regressor

In [59]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error

In [60]:
features = ["line_score", "home", "minutes", "team", "opponent"]
target = "actual_outcome"
categorical = ["team", "opponent"]
numerical = ["line_score", "home", "minutes"]

### Let ChatGPT make a basic model because I'm not tryna do that rn

In [61]:
models = {}
for stat in ["PTS", "REB", "AST"]:
    df = train_df[train_df["stat_type"] == stat].copy()

    X = df[features]
    y = df[target]

    # Preprocessing
    preprocessor = ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", "passthrough", numerical)
    ])

    # Model pipeline
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(n_estimators=100, random_state=42))
    ])

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    pipeline.fit(X_train, y_train)

    # Evaluate
    preds = pipeline.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    print(f"{stat} MAE: {mae:.2f}")

    # Save model
    models[stat] = pipeline


PTS MAE: 4.93
REB MAE: 1.90
AST MAE: 1.56


In [62]:
import joblib

save_path = '/content/drive/MyDrive/NBA_prediction_models'

joblib.dump(models["PTS"], f"{save_path}/model_nbafinalsg4_pts.pkl")
joblib.dump(models["REB"], f"{save_path}/model_nbafinalsg4_reb.pkl")
joblib.dump(models["AST"], f"{save_path}/model_nbafinalsg4_ast.pkl")

['/content/drive/MyDrive/NBA_prediction_models/model_nbafinalsg4_ast.pkl']

## Find out current PrizePicks Projections for PTS,AST,REB

In [63]:
from nba_api.stats.endpoints import ScoreboardV2
from datetime import datetime, timedelta

def get_matchups_for_date(date):
    sb = ScoreboardV2(game_date=date)
    game_header = sb.game_header.get_data_frame()
    line_scores = sb.line_score.get_data_frame()

    matchups = {}

    for _, game in game_header.iterrows():
        game_id = game["GAME_ID"]
        home_team_id = game["HOME_TEAM_ID"]
        visitor_team_id = game["VISITOR_TEAM_ID"]

        teams = line_scores[line_scores["GAME_ID"] == game_id]
        team_abbr_map = teams.set_index("TEAM_ID")["TEAM_ABBREVIATION"].to_dict()

        home_abbr = team_abbr_map.get(home_team_id)
        away_abbr = team_abbr_map.get(visitor_team_id)

        if not home_abbr or not away_abbr:
            continue

        matchups[home_abbr] = {"opponent": away_abbr, "home": True}
        matchups[away_abbr] = {"opponent": home_abbr, "home": False}

    return matchups


In [64]:
matchups = get_matchups_for_date("06/13/2025")
matchups

{'IND': {'opponent': 'OKC', 'home': True},
 'OKC': {'opponent': 'IND', 'home': False}}

In [65]:
live_props["opponent"] = live_props["team"].map(lambda t: matchups.get(t, {}).get("opponent", "UNK"))
live_props["home"] = live_props["team"].map(lambda t: matchups.get(t, {}).get("home", True))

In [66]:
game_logs_df.sort_values(by=["player", "GAME_DATE"], inplace=True)

season_avg_minutes = game_logs_df.groupby("player")["MIN"].mean()
last_5_avg_minutes = game_logs_df.groupby("player").tail(5).groupby("player")["MIN"].mean()

estimated_minutes = (
    0.7 * last_5_avg_minutes.fillna(0) + 0.3 * season_avg_minutes.fillna(0)
)

live_props["minutes"] = live_props["player"].map(estimated_minutes)

live_props["minutes"] = live_props["minutes"].fillna(20)


In [68]:
live_props.head(10)

Unnamed: 0,player,team,stat_type,line_score,opponent,home,minutes
0,Isaiah Joe,OKC,PTS,2.5,IND,False,19.250811
1,Myles Turner,IND,REB,5.0,OKC,True,30.058333
2,Myles Turner,IND,PTS,13.5,OKC,True,30.058333
3,T.J. McConnell,IND,PTS,7.5,OKC,True,16.721013
4,Luguentz Dort,OKC,REB,3.5,IND,False,27.256056
5,Luguentz Dort,OKC,PTS,8.5,IND,False,27.256056
6,Aaron Wiggins,OKC,PTS,5.5,IND,False,19.496053
7,Jalen Williams,OKC,AST,5.0,IND,False,32.130435
8,Jalen Williams,OKC,PTS,21.5,IND,False,32.130435
9,Jalen Williams,OKC,REB,5.5,IND,False,32.130435


In [72]:
model_paths = {
    "PTS": "/content/drive/MyDrive/NBA_prediction_models/model_nbafinalsg4_pts.pkl",
    "REB": "/content/drive/MyDrive/NBA_prediction_models/model_nbafinalsg4_reb.pkl",
    "AST": "/content/drive/MyDrive/NBA_prediction_models/model_nbafinalsg4_ast.pkl"
}

results = []

for _, row in live_props.iterrows():
    stat = row["stat_type"]
    if stat not in model_paths:
        continue

    model = joblib.load(model_paths[stat])

    input_df = pd.DataFrame([{
        "line_score": row["line_score"],
        "home": int(row["home"]),
        "minutes": row["minutes"],
        "team": row["team"],
        "opponent": row["opponent"]
    }])

    prediction = model.predict(input_df)[0]
    delta = prediction - row["line_score"]

    results.append({
        "player": row["player"],
        "team": row["team"],
        "stat_type": stat,
        "line_score": row["line_score"],
        "predicted": round(prediction, 2),
        "delta": round(delta, 2),
        "recommendation": (
            "OVER" if delta > 0.75 else
            "UNDER" if delta < -0.75 else
            "NO BET"
        )
    })


In [75]:
final_df = pd.DataFrame(results)
final_df = final_df.sort_values(by="delta", ascending=False)

print("Predicted PrizePicks Edges:")
display(final_df)

Predicted PrizePicks Edges:


Unnamed: 0,player,team,stat_type,line_score,predicted,delta,recommendation
21,Cason Wallace,OKC,PTS,5.5,12.71,7.21,OVER
0,Isaiah Joe,OKC,PTS,2.5,6.46,3.96,OVER
32,Isaiah Hartenstein,OKC,PTS,6.0,9.48,3.48,OVER
13,Aaron Nesmith,IND,PTS,11.5,14.66,3.16,OVER
24,Tyrese Haliburton,IND,PTS,18.5,21.35,2.85,OVER
15,Bennedict Mathurin,IND,PTS,10.5,13.18,2.68,OVER
8,Jalen Williams,OKC,PTS,21.5,23.84,2.34,OVER
11,Andrew Nembhard,IND,PTS,10.5,12.81,2.31,OVER
2,Myles Turner,IND,PTS,13.5,15.77,2.27,OVER
12,Andrew Nembhard,IND,REB,3.5,5.7,2.2,OVER


In [80]:
final_df["abs_delta"] = final_df["delta"].abs()
top_10 = final_df.sort_values(by="abs_delta", ascending=False).head(10)
final_df.drop(columns=["abs_delta"], inplace=True)
top_10 = top_10.drop(columns=["abs_delta"])
top_10

Unnamed: 0,player,team,stat_type,line_score,predicted,delta,recommendation
21,Cason Wallace,OKC,PTS,5.5,12.71,7.21,OVER
0,Isaiah Joe,OKC,PTS,2.5,6.46,3.96,OVER
16,Chet Holmgren,OKC,PTS,15.5,11.75,-3.75,UNDER
32,Isaiah Hartenstein,OKC,PTS,6.0,9.48,3.48,OVER
13,Aaron Nesmith,IND,PTS,11.5,14.66,3.16,OVER
26,Shai Gilgeous-Alexander,OKC,PTS,34.0,30.86,-3.14,UNDER
24,Tyrese Haliburton,IND,PTS,18.5,21.35,2.85,OVER
15,Bennedict Mathurin,IND,PTS,10.5,13.18,2.68,OVER
31,Isaiah Hartenstein,OKC,REB,7.0,4.38,-2.62,UNDER
8,Jalen Williams,OKC,PTS,21.5,23.84,2.34,OVER
