In [84]:
# CONSTANTS
TEAMS = ["PHI", "BRK", "NYK", "TOR", "BOS",
        "CHI", "CLE", "MIL", "IND", "DET",
        "MIA", "WAS", "CHO", "ATL", "ORL",
        "UTA", "DEN", "POR", "MIN", "OKC",
        "GSW", "PHO", "LAL", "SAC", "LAC",
        "DAL", "MEM", "SAS", "HOU", "NOP"]

BASE_URL = "https://www.basketball-reference.com"
BOOK_URL = "https://www.scoresandodds.com/nba/props"
YEAR = 2022
POSITIONS = ["PG", "SG", "SF", "PF", "C"]
# RELEVANT_STATS = ["player_name", "player_position", "fg3", "orb", "drb", "trb", "ast", "pts", "plus_minus"]
RELEVANT_STATS = ["player_position", "fg3", "orb", "drb", "trb", "ast", "pts"]

In [5]:
%env PIP_INDEX_URL https://artifactory.service.bo1.csnzoo.com/artifactory/api/pypi/python/simple

env: PIP_INDEX_URL=https://artifactory.service.bo1.csnzoo.com/artifactory/api/pypi/python/simple


In [6]:
%pip install beautifulsoup4

Looking in indexes: https://artifactory.service.bo1.csnzoo.com/artifactory/api/pypi/python/simple
You should consider upgrading via the '/usr/local/opt/python@3.9/bin/python3.9 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
# imports
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [8]:
def get_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text)
    return soup

In [17]:
# - Connect to basketballreference.com
# - Retrieve schedule
# - Using schedule, find the box score for each of the games

def get_schedules():
    team_schedules = {}
    for team in TEAMS:
        game_schedule_url = f"{BASE_URL}/teams/{team}/{YEAR}_games.html"

        soup = get_url(game_schedule_url)

        # find the schedule
        past_games = []
        for game in soup.find_all("td", attrs={"data-stat":"box_score_text"}):
            if len(game["class"]) > 1:
                break

            past_games = past_games + [f"{BASE_URL}{game.a['href']}"]
        team_schedules[team] = past_games

    return team_schedules

get_schedules()

{'PHI': ['https://www.basketball-reference.com/boxscores/202110200NOP.html',
  'https://www.basketball-reference.com/boxscores/202110220PHI.html',
  'https://www.basketball-reference.com/boxscores/202110240OKC.html',
  'https://www.basketball-reference.com/boxscores/202110260NYK.html',
  'https://www.basketball-reference.com/boxscores/202110280PHI.html',
  'https://www.basketball-reference.com/boxscores/202110300PHI.html',
  'https://www.basketball-reference.com/boxscores/202111010PHI.html',
  'https://www.basketball-reference.com/boxscores/202111030PHI.html',
  'https://www.basketball-reference.com/boxscores/202111040DET.html'],
 'BRK': ['https://www.basketball-reference.com/boxscores/202110190MIL.html',
  'https://www.basketball-reference.com/boxscores/202110220PHI.html',
  'https://www.basketball-reference.com/boxscores/202110240BRK.html',
  'https://www.basketball-reference.com/boxscores/202110250BRK.html',
  'https://www.basketball-reference.com/boxscores/202110270BRK.html',
  'ht

In [37]:
def get_player_position(player_page):
    soup = get_url(player_page)

    player_stats = {}
    for el in soup.find("tr", attrs={"id":"per_game.2022"}):
        if el["data-stat"] == "pos":
            return el.text

get_player_position("https://www.basketball-reference.com/players/h/hardeja01.html")

'PG'

In [38]:
def get_player_stats(player_page):
    soup = get_url(player_page)

    player_stats = {}
    for el in soup.find("tr", attrs={"id":"per_game.2022"}):
        if el["class"][0] == "right":
            player_stats[el["data-stat"]] = el.text
        
    return player_stats

get_player_stats("https://www.basketball-reference.com/players/h/hardeja01.html")

{'g': '9',
 'gs': '9',
 'mp_per_g': '33.9',
 'fg_per_g': '5.2',
 'fga_per_g': '13.6',
 'fg_pct': '.385',
 'fg3_per_g': '3.1',
 'fg3a_per_g': '7.6',
 'fg3_pct': '.412',
 'fg2_per_g': '2.1',
 'fg2a_per_g': '6.0',
 'fg2_pct': '.352',
 'efg_pct': '.500',
 'ft_per_g': '4.1',
 'fta_per_g': '4.7',
 'ft_pct': '.881',
 'orb_per_g': '0.9',
 'drb_per_g': '6.6',
 'trb_per_g': '7.4',
 'ast_per_g': '9.0',
 'stl_per_g': '1.1',
 'blk_per_g': '1.0',
 'tov_per_g': '4.9',
 'pf_per_g': '2.3',
 'pts_per_g': '17.7'}

# Version 1 - Average Statistic For Each Position Scored Against Each Team

In [60]:
# - Collect statistics for starters based on the position
#     - Might need to classify positions of players first / lookup
#     - These should include current season performance, and performance in the last X games
# - Collect average statistics of starters for each team
#     - These should include last seasons performance, current season performance, and performance in the last X games

def get_game_stats(gaame):
    soup = get_url(game)

    home_stats = []
    away_stats = []
    flag = True
    for div in soup.find_all("div", class_="table_container"):
        try:
            # Each team (HOME first)
            if "game-basic" in div["id"]:
#                 print(div["id"])

                # Each starter
                for player in div.tbody.find_all("tr")[:5]:
                    player_stats = {}

                    player_stats["player_name"] = player.th.a.text

                    player_page = BASE_URL + player.th.a["href"]

                    # Calculations using the players primary position
                    player_stats["player_position"] = get_player_position(player_page)

                    for stat in player.find_all("td"):
                        player_stats[stat["data-stat"]] = stat.text

                    if flag:
                        home_stats = home_stats + [player_stats]
                    else:
                        away_stats = away_stats + [player_stats]

                flag = False
        except KeyError as e:
            continue
    
    return home_stats, away_stats

In [103]:
schedules = get_schedules()
all_stats_against = {team: {pos: {stat: 0 for stat in RELEVANT_STATS[1:]} for pos in POSITIONS} for team in TEAMS}

for team in schedules.keys():
    print(f"working on team: {team}")

    for game in schedules[team]:
        stats_against_team = {}
        home_stats, away_stats = get_game_stats(game)
        
        for player_stats in away_stats:
            player_stats = {stat:player_stats[stat] for stat in RELEVANT_STATS}
            
            for stat in [*player_stats][1:]:
                all_stats_against[team][player_stats["player_position"]][stat] += int(player_stats[stat])
                

In [104]:
all_stats_against

{'PHI': {'PG': {'fg3': 17,
   'orb': 6,
   'drb': 28,
   'trb': 34,
   'ast': 48,
   'pts': 165},
  'SG': {'fg3': 28, 'orb': 4, 'drb': 42, 'trb': 46, 'ast': 28, 'pts': 189},
  'SF': {'fg3': 12, 'orb': 5, 'drb': 29, 'trb': 34, 'ast': 20, 'pts': 83},
  'PF': {'fg3': 10, 'orb': 5, 'drb': 43, 'trb': 48, 'ast': 22, 'pts': 113},
  'C': {'fg3': 6, 'orb': 22, 'drb': 63, 'trb': 85, 'ast': 30, 'pts': 128}},
 'BRK': {'PG': {'fg3': 25,
   'orb': 5,
   'drb': 46,
   'trb': 51,
   'ast': 61,
   'pts': 153},
  'SG': {'fg3': 7, 'orb': 0, 'drb': 8, 'trb': 8, 'ast': 6, 'pts': 33},
  'SF': {'fg3': 25, 'orb': 7, 'drb': 54, 'trb': 61, 'ast': 15, 'pts': 163},
  'PF': {'fg3': 17, 'orb': 13, 'drb': 83, 'trb': 96, 'ast': 50, 'pts': 251},
  'C': {'fg3': 4, 'orb': 6, 'drb': 20, 'trb': 26, 'ast': 5, 'pts': 40}},
 'NYK': {'PG': {'fg3': 18,
   'orb': 6,
   'drb': 34,
   'trb': 40,
   'ast': 47,
   'pts': 118},
  'SG': {'fg3': 31, 'orb': 10, 'drb': 31, 'trb': 41, 'ast': 32, 'pts': 172},
  'SF': {'fg3': 18, 'orb': 4,

In [110]:
all_stats_df = pd.DataFrame.from_dict(all_stats_against, orient="index")
# all_stats_df['pra'] = all_stats_df['trb'] + all_stats_df['ast'] + all_stats_df['pts']
all_stats_df

Unnamed: 0,PG,SG,SF,PF,C
PHI,"{'fg3': 17, 'orb': 6, 'drb': 28, 'trb': 34, 'a...","{'fg3': 28, 'orb': 4, 'drb': 42, 'trb': 46, 'a...","{'fg3': 12, 'orb': 5, 'drb': 29, 'trb': 34, 'a...","{'fg3': 10, 'orb': 5, 'drb': 43, 'trb': 48, 'a...","{'fg3': 6, 'orb': 22, 'drb': 63, 'trb': 85, 'a..."
BRK,"{'fg3': 25, 'orb': 5, 'drb': 46, 'trb': 51, 'a...","{'fg3': 7, 'orb': 0, 'drb': 8, 'trb': 8, 'ast'...","{'fg3': 25, 'orb': 7, 'drb': 54, 'trb': 61, 'a...","{'fg3': 17, 'orb': 13, 'drb': 83, 'trb': 96, '...","{'fg3': 4, 'orb': 6, 'drb': 20, 'trb': 26, 'as..."
NYK,"{'fg3': 18, 'orb': 6, 'drb': 34, 'trb': 40, 'a...","{'fg3': 31, 'orb': 10, 'drb': 31, 'trb': 41, '...","{'fg3': 18, 'orb': 4, 'drb': 36, 'trb': 40, 'a...","{'fg3': 13, 'orb': 10, 'drb': 43, 'trb': 53, '...","{'fg3': 15, 'orb': 28, 'drb': 83, 'trb': 111, ..."
TOR,"{'fg3': 23, 'orb': 8, 'drb': 34, 'trb': 42, 'a...","{'fg3': 19, 'orb': 11, 'drb': 33, 'trb': 44, '...","{'fg3': 27, 'orb': 10, 'drb': 42, 'trb': 52, '...","{'fg3': 5, 'orb': 19, 'drb': 38, 'trb': 57, 'a...","{'fg3': 4, 'orb': 31, 'drb': 77, 'trb': 108, '..."
BOS,"{'fg3': 23, 'orb': 3, 'drb': 40, 'trb': 43, 'a...","{'fg3': 22, 'orb': 2, 'drb': 21, 'trb': 23, 'a...","{'fg3': 21, 'orb': 12, 'drb': 55, 'trb': 67, '...","{'fg3': 8, 'orb': 10, 'drb': 32, 'trb': 42, 'a...","{'fg3': 11, 'orb': 38, 'drb': 102, 'trb': 140,..."
CHI,"{'fg3': 13, 'orb': 6, 'drb': 34, 'trb': 40, 'a...","{'fg3': 20, 'orb': 7, 'drb': 34, 'trb': 41, 'a...","{'fg3': 11, 'orb': 10, 'drb': 39, 'trb': 49, '...","{'fg3': 6, 'orb': 5, 'drb': 15, 'trb': 20, 'as...","{'fg3': 12, 'orb': 14, 'drb': 72, 'trb': 86, '..."
CLE,"{'fg3': 24, 'orb': 8, 'drb': 34, 'trb': 42, 'a...","{'fg3': 19, 'orb': 11, 'drb': 19, 'trb': 30, '...","{'fg3': 11, 'orb': 6, 'drb': 31, 'trb': 37, 'a...","{'fg3': 14, 'orb': 21, 'drb': 66, 'trb': 87, '...","{'fg3': 2, 'orb': 28, 'drb': 87, 'trb': 115, '..."
MIL,"{'fg3': 7, 'orb': 1, 'drb': 19, 'trb': 20, 'as...","{'fg3': 34, 'orb': 11, 'drb': 50, 'trb': 61, '...","{'fg3': 8, 'orb': 15, 'drb': 40, 'trb': 55, 'a...","{'fg3': 22, 'orb': 13, 'drb': 56, 'trb': 69, '...","{'fg3': 3, 'orb': 20, 'drb': 31, 'trb': 51, 'a..."
IND,"{'fg3': 25, 'orb': 16, 'drb': 55, 'trb': 71, '...","{'fg3': 25, 'orb': 5, 'drb': 35, 'trb': 40, 'a...","{'fg3': 17, 'orb': 3, 'drb': 26, 'trb': 29, 'a...","{'fg3': 12, 'orb': 7, 'drb': 38, 'trb': 45, 'a...","{'fg3': 12, 'orb': 33, 'drb': 93, 'trb': 126, ..."
DET,"{'fg3': 21, 'orb': 10, 'drb': 52, 'trb': 62, '...","{'fg3': 8, 'orb': 1, 'drb': 12, 'trb': 13, 'as...","{'fg3': 8, 'orb': 16, 'drb': 31, 'trb': 47, 'a...","{'fg3': 15, 'orb': 7, 'drb': 41, 'trb': 48, 'a...","{'fg3': 2, 'orb': 18, 'drb': 67, 'trb': 85, 'a..."


In [None]:
all_stats_df.sort_values(by=['3p'])

In [None]:
all_stats_df.sort_values(by=['trb'])

In [None]:
all_stats_df.sort_values(by=['ast'])

In [None]:
all_stats_df.sort_values(by=['pts'])

In [None]:
all_stats_df.sort_values(by=['pra'])

In [222]:
book = get_url(BOOK_URL)

# Each stat
for tbody in book.find_all("tbody", attrs={"data-group":"odds-table--props-0"}):
    # Each player
    for tr in tbody.find_all("tr")[]:
        # 
        for td in tr.find_all("td"):
        

odds-table--3-pointers-0
odds-table--assists-0
odds-table--points-0
odds-table--points-&-assists-0
odds-table--points-&-rebounds-0
odds-table--points,-rebounds,-&-assists-0
odds-table--rebounds-0


'aa'