# Scraping Diddy Sheets
- First column = date
- Second column = home team

In [12]:
# %pip install pandas scikit-learn bs4 requests

import pandas as pd 
from datetime import datetime, date
from bs4 import BeautifulSoup
import requests

In [33]:
def get_opp_team(soup: BeautifulSoup) -> "str | None":
    """Return the team code taken from the first team link in the scorebox.

    The function finds ``<div class="scorebox">``, takes the first
    ``<strong>`` inside it and reads the ``href`` of the ``<a>`` nested in
    that ``<strong>``. The returned value is the third ``/``-separated piece
    of the href; for an href shaped like ``/teams/BOS/2023.html`` that is
    ``"BOS"``.

    NOTE(review): ``scrape_game`` treats this value as the opposing (non-home)
    team, which presumes the first scorebox link is the visiting team --
    confirm against the page layout.

    Args:
        soup: Parsed box-score page.

    Returns:
        The team code, or ``None`` (after printing a short message) when any
        expected element is missing or the href has an unexpected shape.
    """
    scorebox_div = soup.find('div', class_="scorebox")
    if scorebox_div is None:
        print("Not Found")
        return None

    # Guard the intermediate <strong> lookup: chaining .find() directly would
    # raise AttributeError when the <strong> is absent.
    strong_tag = scorebox_div.find('strong')
    link_tag = strong_tag.find('a') if strong_tag is not None else None
    if link_tag is None:
        print("<a> tag not found")
        return None

    href = link_tag.get("href")
    if href is None:
        print("Link not found in href")
        return None

    # A leading "/" produces an empty first piece, so the team code sits at
    # index 2; bail out instead of raising IndexError on a shorter href.
    parts = href.split("/")
    if len(parts) < 3:
        print(f"Unexpected team link format: {href}")
        return None

    return parts[2]

In [16]:
def get_table_headers(soup: BeautifulSoup, table_id: str) -> "list[str] | None":
    """Return the ``data-stat`` names of a table's header cells.

    Args:
        soup: Parsed page containing the table.
        table_id: Value of the ``id`` attribute of the ``<table>`` to inspect.

    Returns:
        The ``data-stat`` attribute of every ``<th>`` inside the table's
        ``<thead>`` that has one, with the first entry dropped; ``None``
        (after printing a message) when the table or its ``<thead>`` is
        missing.
    """
    # Match on the id attribute explicitly: a bare string passed as the second
    # positional argument is interpreted by BeautifulSoup as a CSS class.
    table = soup.find('table', id=table_id)

    if table is None:
        print("Couldn't find the table")
        return None

    table_header = table.find('thead')

    if table_header is None:
        print("Couldn't find the table header")
        return None

    # Only look at header cells; searching the whole table would also pick up
    # the <th> row labels in the body.
    data_stats = [
        th['data-stat']
        for th in table_header.find_all('th')
        if 'data-stat' in th.attrs
    ]

    # The first header entry is skipped, as in the original implementation.
    return data_stats[1:]

In [65]:
def get_stat_table(soup: BeautifulSoup, table_id: str, team: str) -> "pd.DataFrame | None":
    """Build a DataFrame of per-player rows from one stats table.

    Each kept ``<tr>`` of the table body becomes one row: the text of the
    row's ``<th>`` (the player), the supplied ``team`` value, then the text of
    every direct ``<td>`` child that carries a ``data-stat`` attribute. Column
    names are ``"player"``, ``"team"`` and the ``data-stat`` values of the
    first kept row.

    Rows are skipped when they carry the CSS class ``thead``, when their
    markup contains the text ``"Did Not Play"`` or ``"Reserves"``, or when
    they have no ``<th>`` cell.

    Args:
        soup: Parsed page containing the table.
        table_id: Value of the ``id`` attribute of the ``<table>`` to read.
        team: Value written into the ``team`` column of every row.

    Returns:
        The assembled DataFrame (all values are stripped strings), or ``None``
        (after printing a message) when the table, its body, or any usable
        row is missing.
    """
    table = soup.find('table', id=table_id)

    if table is None:
        print("Table not found")
        return None

    table_body = table.find('tbody')

    if table_body is None:
        print("Table body not found")
        return None

    # Test the class list, not the attribute names: a repeated header row is
    # marked with class="thead", never with an attribute called "thead".
    table_rows = [
        tr for tr in table_body.find_all('tr')
        if 'thead' not in (tr.get('class') or [])
    ]

    if not table_rows:
        print("Table rows not found")
        return None

    data = []
    col_names = None

    for row in table_rows:
        row_html = str(row)
        if "Did Not Play" in row_html or "Reserves" in row_html:
            continue

        name_cell = row.find('th')
        if name_cell is None:
            # Without a row label there is no player to attribute stats to.
            continue

        # Use one filtered cell list for both names and values so the two can
        # never get out of step.
        stat_cells = [
            td for td in row.find_all('td', recursive=False)
            if 'data-stat' in td.attrs
        ]

        if col_names is None:
            col_names = ["player", "team"] + [td['data-stat'] for td in stat_cells]

        data.append(
            [name_cell.text.strip(), team] + [td.text.strip() for td in stat_cells]
        )

    if col_names and data:
        return pd.DataFrame(data, columns=col_names)

    print("No data found")
    return None

In [31]:
# input the home team and the day the game was played
def scrape_game(home_team: str, game_date: date) -> "pd.DataFrame | None":
    """Download one box-score page and return both teams' basic stat tables.

    The page URL is built from the game date (``YYYYMMDD``), a literal ``0``
    and the home team code. The opposing team code is read from the page via
    ``get_opp_team``; each team's table is looked up by the id
    ``box-<TEAM>-game-basic`` and parsed with ``get_stat_table``.

    Args:
        home_team: Team code of the home team as it appears in the URL and in
            the table id (e.g. ``"MIA"``).
        game_date: Day the game was played.

    Returns:
        The home rows followed by the opponent rows in a single DataFrame with
        a fresh index. If only one table could be parsed, just that table is
        returned. ``None`` when the page does not answer with HTTP 200 or when
        neither table could be parsed.

    Raises:
        requests.RequestException: If the HTTP request itself fails, including
            when the server does not respond within the timeout.
    """
    date_str = game_date.strftime("%Y%m%d")
    link = f"https://www.basketball-reference.com/boxscores/{date_str}0{home_team}.html"

    # Without a timeout an unresponsive server would block this call forever.
    response = requests.get(link, timeout=30)
    if response.status_code != 200:
        print("Oops" + link)
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    opp_team = get_opp_team(soup)

    home_df = get_stat_table(soup, f"box-{home_team}-game-basic", home_team)

    # Do not look up a table named after a missing team code.
    opp_df = None
    if opp_team is not None:
        opp_df = get_stat_table(soup, f"box-{opp_team}-game-basic", opp_team)

    # pd.concat raises ValueError when every input is None, so drop the
    # missing tables first and report total failure as None like the other
    # error paths do.
    frames = [df for df in (home_df, opp_df) if df is not None]
    if not frames:
        return None

    return pd.concat(frames, ignore_index=True)


In [66]:
# Example run: box score of the game hosted by MIA on 2023-05-27.
scrape_game(home_team="MIA", game_date=date(2023, 5, 27))

Unnamed: 0,player,team,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,...,drb,trb,ast,stl,blk,tov,pf,pts,game_score,plus_minus
0,Jimmy Butler,MIA,46:40,5,21,0.238,2,4,0.5,12,...,4,11,8,1,0,2,3,24,20.0,0
1,Bam Adebayo,MIA,45:39,4,16,0.25,0,0,,3,...,6,13,5,1,1,0,0,11,12.9,6
2,Gabe Vincent,MIA,41:17,6,18,0.333,3,6,0.5,0,...,3,4,0,0,1,1,5,15,4.1,-2
3,Caleb Martin,MIA,40:33,7,13,0.538,4,8,0.5,3,...,14,15,1,1,1,1,5,21,18.6,1
4,Max Strus,MIA,25:17,3,8,0.375,2,5,0.4,2,...,1,1,3,0,0,0,2,10,6.8,-12
5,Duncan Robinson,MIA,20:08,5,11,0.455,3,6,0.5,0,...,1,1,2,0,0,0,5,13,7.0,8
6,Kyle Lowry,MIA,18:05,3,6,0.5,0,1,0.0,2,...,1,2,3,2,1,1,3,8,8.6,1
7,Cody Zeller,MIA,2:21,0,0,,0,0,,1,...,0,0,0,0,0,0,1,1,0.2,-7
8,Jayson Tatum,BOS,43:56,8,22,0.364,0,8,0.0,15,...,9,12,5,1,2,3,2,31,25.7,3
9,Marcus Smart,BOS,42:15,7,15,0.467,4,11,0.364,3,...,3,4,1,0,0,4,4,21,9.6,-10
