# Scraping Diddy Sheets
First column = date
Second column = home team

In [67]:
# %pip install pandas scikit-learn bs4 requests

import pandas as pd 
from datetime import datetime, date
from bs4 import BeautifulSoup
import requests
import time

In [33]:
def get_opp_team(soup: BeautifulSoup) -> str:
    """Return the team code linked from the first bold entry of the scorebox.

    Looks up the ``div`` with class ``scorebox``, takes the first ``<a>``
    inside its first ``<strong>`` element, and returns the third
    ``/``-separated piece of that link's ``href``.
    NOTE(review): callers use the result as the opponent of the home team,
    which presumes the first scorebox entry is the visiting team - confirm.

    Args:
        soup: Parsed box-score page.

    Returns:
        The team code, or None (after printing a message) when any of the
        expected elements is missing or the href has an unexpected shape.
    """
    scorebox_div = soup.find('div', class_="scorebox")
    if scorebox_div is None:
        print("Not Found")
        return None

    # Guard the intermediate <strong>: chaining .find() on None would raise.
    strong_tag = scorebox_div.find('strong')
    link_tag = strong_tag.find('a') if strong_tag is not None else None
    if link_tag is None:
        print("<a> tag not found")
        return None

    href = link_tag.get("href")
    if href is None:
        print("Link not found in href")
        return None

    parts = href.split("/")
    if len(parts) < 3:
        # The team code is expected in the third piece of the path.
        print("Unexpected href format: " + href)
        return None

    return parts[2]

In [16]:
def get_table_headers(soup: BeautifulSoup, table_id: str) -> list:
    """Return the ``data-stat`` names of a table's header cells.

    Args:
        soup: Parsed page containing the table.
        table_id: The ``id`` attribute of the ``<table>`` to inspect.

    Returns:
        A list with the ``data-stat`` value of every ``<th>`` in the table's
        ``<thead>`` that carries one, minus the first entry; or None (after
        printing a message) when the table or its ``<thead>`` is missing.
    """
    # Match on the id attribute; a bare positional string would be treated
    # as a CSS class filter instead.
    table = soup.find('table', id=table_id)

    if table is None:
        print("Couldn't find the table")
        return None

    table_header = table.find('thead')

    if table_header is None:
        print("Couldn't find the table header")
        return None

    data_stats = [
        th['data-stat']
        for th in table_header.find_all('th')
        if 'data-stat' in th.attrs
    ]

    # The first header is dropped, as in the original implementation
    # (presumably the row-label column - TODO confirm).
    return data_stats[1:]

In [65]:
def get_stat_table(soup: BeautifulSoup, table_id: str, team: str) -> pd.DataFrame:
    """Build a DataFrame of per-player stats from one box-score table.

    Each kept ``<tr>`` of the table body becomes one row: the text of the
    row's ``<th>`` is the player, ``team`` is repeated on every row, and the
    remaining columns are the row's ``<td>`` cells. Column names come from
    the ``data-stat`` attributes of the first kept row.

    Args:
        soup: Parsed box-score page.
        table_id: The ``id`` attribute of the ``<table>`` to read.
        team: Team code stored in the ``team`` column.

    Returns:
        The stats DataFrame, or None (after printing a message) when the
        table, its body, or any usable row is missing.
    """
    table = soup.find('table', id=table_id)

    if table is None:
        print("Table not found")
        return None

    table_body = table.find('tbody')

    if table_body is None:
        print("Table body not found")
        return None

    # "thead" is a CSS class value, so it has to be looked up in the row's
    # class list rather than among its attribute names.
    table_rows = [
        tr for tr in table_body.find_all('tr')
        if 'thead' not in (tr.get('class') or [])
    ]

    if not table_rows:
        print("Table rows not found")
        return None

    data = []
    col_names = None

    for row in table_rows:
        row_html = str(row)
        if "Did Not Play" in row_html or "Reserves" in row_html:
            continue

        name_cell = row.find('th')
        if name_cell is None:
            # Without a row header there is no player name to record.
            continue
        player_name = name_cell.text.strip()

        table_data = [cell for cell in row if cell.name == "td"]

        if col_names is None:
            col_names = ["player", "team"] + [
                td['data-stat'] for td in table_data if 'data-stat' in td.attrs
            ]

        data.append([player_name, team] + [td.text.strip() for td in table_data])

    if col_names and data:
        return pd.DataFrame(data, columns=col_names)

    print("No data found")
    return None

In [90]:
def add_date(df: pd.DataFrame, date: date):
    """Stamp every row of ``df`` with ``date`` and return the same frame.

    The ``date`` column is created (or overwritten) in place, so the
    returned object is ``df`` itself, not a copy.
    """
    df["date"] = date
    return df

def correct_date(df: pd.DataFrame, date_format: str = '%m/%d/%Y'):
    """Convert a string ``date`` column to pandas datetimes, in place.

    Args:
        df: Frame that must contain a column named ``date``.
        date_format: ``strftime``-style layout of the date strings. Defaults
            to month/day/four-digit-year.

    Returns:
        ``df`` itself, with ``date`` parsed when it held strings and left
        untouched otherwise; or None (after printing a message) when the
        column is missing.
    """
    if "date" not in df.columns:
        print("Dates not found. Dataframe must contain a column named 'date'")
        return None

    # Only parse when the column still holds strings, so calling this on an
    # already-converted frame leaves the column as it is.
    if pd.api.types.is_string_dtype(df['date']):
        df['date'] = pd.to_datetime(df['date'], format=date_format)

    return df

In [95]:
# input the home team and the day the game was played
def scrape_game(home_team: str, game_date: date, sec_delay: int):
    """Scrape the basic box score of one game for both teams.

    Sleeps ``sec_delay`` seconds, downloads the box-score page built from
    the game date and home-team code, and stacks the two teams' basic stat
    tables (each stamped with ``game_date``) into one DataFrame.

    Args:
        home_team: Team code of the home team, as used in the page URL.
        game_date: Day the game was played.
        sec_delay: Seconds to wait before issuing the request.

    Returns:
        A DataFrame with the rows of every table that could be read, or
        None (after printing a message) when the page could not be fetched
        or neither table was found.
    """
    print(f"Scraping: {home_team}")

    time.sleep(sec_delay)

    date_str = game_date.strftime("%Y%m%d")
    link = f"https://www.basketball-reference.com/boxscores/{date_str}0{home_team}.html"

    # A timeout keeps one stalled connection from hanging the whole run.
    try:
        response = requests.get(link, timeout=30)
    except requests.RequestException as err:
        print(f"Request failed: {link} ({err})")
        return None

    if response.status_code != 200:
        print("Not Found: " + link)
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    opp_team = get_opp_team(soup)

    frames = []
    for team in (home_team, opp_team):
        if team is None:
            # The opponent could not be identified; keep what is available.
            continue
        team_df = get_stat_table(soup, f"box-{team}-game-basic", team)
        if team_df is not None:
            frames.append(add_date(team_df, game_date))

    if not frames:
        print("No box score tables found: " + link)
        return None

    return pd.concat(frames, ignore_index=True)


In [70]:
def scrape_sheets(df: pd.DataFrame, sec_delay: int) -> pd.DataFrame:
    """Scrape every game listed in ``df`` and stack the results.

    Args:
        df: Frame with a ``home_team`` column and a ``date`` column; each
            row is passed to ``scrape_game``.
        sec_delay: Seconds ``scrape_game`` waits before each request.

    Returns:
        The concatenation of all successfully scraped games (each game's
        own row index is kept), or an empty DataFrame when there were no
        rows or no game could be scraped.
    """
    # A plain loop calls scrape_game exactly once per row, in order.
    games = [
        scrape_game(home_team, game_date, sec_delay=sec_delay)
        for home_team, game_date in zip(df['home_team'], df['date'])
    ]
    games = [game for game in games if game is not None]

    if not games:
        print("No games scraped")
        return pd.DataFrame()

    return pd.concat(games)

In [96]:
# Load the Diddy attendance sheet from disk into a DataFrame.
diddy_path = "./Celeb-Attendence/diddy.csv"

diddy_csv = pd.read_csv(diddy_path)

In [97]:
# Preview the first rows, then check the Python type of the first 'date' value.
diddy_csv.head()
type(diddy_csv['date'][0])

str

In [98]:
# Parse the string 'date' column into datetimes (correct_date returns None if the column is missing).
diddy_csv = correct_date(diddy_csv)

In [99]:
# Scrape the box score of every listed game, waiting 5 seconds before each request.
diddy_games_df = scrape_sheets(diddy_csv, sec_delay=5)
diddy_games_df.head()

Scraping: LAL
Scraping: MIA
Scraping: MIA
Scraping: GSW
Scraping: BRK
Scraping: LAL
Scraping: LAL
Scraping: LAL
Scraping: LAL
Scraping: LAL
Scraping: NYK
Scraping: LAL
Scraping: NYK
Scraping: LAL
Scraping: MIA
Scraping: NYK
Scraping: NJN
Scraping: LAL
Scraping: LAL
Scraping: HOU
Scraping: NJN
Scraping: MIA
Scraping: NYK
Scraping: HOU
Scraping: MIA
Scraping: LAC
Scraping: NYK
Scraping: MIA
Scraping: MIA
Scraping: MIA
Scraping: MIA
Scraping: LAL


Unnamed: 0,player,team,mp,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,...,trb,ast,stl,blk,tov,pf,pts,game_score,plus_minus,date
0,Brandon Ingram,LAL,44:01,12,21,0.571,2,2,1.0,6,...,5,3,3,2,7,5,32,20.7,-8,2017-11-29
1,Lonzo Ball,LAL,43:20,5,12,0.417,3,7,0.429,2,...,2,10,1,0,2,2,15,14.8,-8,2017-11-29
2,Kentavious Caldwell-Pope,LAL,41:32,7,18,0.389,2,8,0.25,5,...,7,0,1,0,2,3,21,11.5,1,2017-11-29
3,Larry Nance Jr.,LAL,22:27,0,2,0.0,0,0,,0,...,4,5,4,1,0,0,0,8.0,-11,2017-11-29
4,Brook Lopez,LAL,16:52,2,5,0.4,0,2,0.0,2,...,1,1,1,1,2,1,6,3.2,-15,2017-11-29


In [101]:
# Persist the scraped box scores; the DataFrame index is written as the first CSV column.
diddy_save_path = "./Diddy-Stats/diddygames.csv"

diddy_games_df.to_csv(diddy_save_path)

## Getting the Career Stats of the Top 20 Diddy players


In [109]:
# Maps a player's display name to that player's Basketball Reference page.
# The key is passed on as the `player_name` of each scraped row, so it ends
# up verbatim in the `player` column of the career-totals data.
# NOTE(review): "Dwayne Wade" and "Lebron James" appear to differ from the
# spellings the site itself uses ("Dwyane Wade", "LeBron James"); joining
# this data with the box-score data by player name may miss - confirm.
top_20_player_links = {
    "Dwayne Wade": "https://www.basketball-reference.com/players/w/wadedw01.html",
    "Kobe Bryant": "https://www.basketball-reference.com/players/b/bryanko01.html",
    "Lamar Odom": "https://www.basketball-reference.com/players/o/odomla01.html",
    "Pau Gasol": "https://www.basketball-reference.com/players/g/gasolpa01.html",
    "Derek Fisher": "https://www.basketball-reference.com/players/f/fishede01.html",
    "Lebron James": "https://www.basketball-reference.com/players/j/jamesle01.html",
    "Jordan Farmar": "https://www.basketball-reference.com/players/f/farmajo01.html",
    "Sasha Vujačić": "https://www.basketball-reference.com/players/v/vujacsa01.html",
    "Trevor Ariza": "https://www.basketball-reference.com/players/a/arizatr01.html",
    "Udonis Haslem": "https://www.basketball-reference.com/players/h/hasleud01.html",
    "Andrew Bynum": "https://www.basketball-reference.com/players/b/bynuman01.html",
    "Jason Kidd": "https://www.basketball-reference.com/players/k/kiddja01.html",
    "Kevin Garnett": "https://www.basketball-reference.com/players/g/garneke01.html",
    "Luke Walton": "https://www.basketball-reference.com/players/w/waltolu01.html",
    "Paul Pierce": "https://www.basketball-reference.com/players/p/piercpa01.html",
    "Rajon Rondo": "https://www.basketball-reference.com/players/r/rondora01.html",
    "Ray Allen": "https://www.basketball-reference.com/players/a/allenra02.html",
    "Shannon Brown": "https://www.basketball-reference.com/players/b/brownsh01.html",
    "Carmelo Anthony": "https://www.basketball-reference.com/players/a/anthoca01.html",
    "Chris Bosh": "https://www.basketball-reference.com/players/b/boshch01.html"
}

New setup for the tables in basketball reference
* regular season is in the div with id="div_per_game_stats"
* post season is in the div with id="div_per_game_stats_post"

For total stats
* regular = id="div_totals"
* playoffs = id="div_playoffs_totals"


In [132]:
def scrape_totals(link: str, player_name: str, is_playoff: bool = False) -> pd.DataFrame:
    """Scrape one player's season-by-season totals table.

    Downloads ``link`` and reads the first ``<table>`` inside the div with
    id ``div_playoffs_totals`` (playoffs) or ``div_totals`` (regular
    season). Each body row becomes one output row: the text of the row's
    ``<th>`` is the season and the ``<td>`` cells are the stats, named by
    the ``data-stat`` attributes of the first usable row.

    Args:
        link: URL of the player's page.
        player_name: Value stored in the ``player`` column of every row.
        is_playoff: Read the playoff totals instead of the regular-season
            totals.

    Returns:
        The totals DataFrame with leading ``player``, ``season`` and
        ``season_type`` columns, or None (after printing a message) when
        the page could not be fetched or the table is missing or empty.
    """
    # A timeout keeps one stalled connection from hanging the whole run.
    try:
        response = requests.get(link, timeout=30)
    except requests.RequestException as err:
        print(f"Request failed: {link} ({err})")
        return None

    if response.status_code != 200:
        print("Not Found: " + link)
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    div_id = "div_playoffs_totals" if is_playoff else "div_totals"
    season_type = "playoff" if is_playoff else "regular"

    # Any of these lookups can come back empty (for instance when the page
    # has no such section), so each one is checked before it is used.
    div = soup.find('div', id=div_id)
    if div is None:
        print("Div not found: " + div_id)
        return None

    table = div.find('table')
    if table is None:
        print("Table not found")
        return None

    table_body = table.find('tbody')
    if table_body is None:
        print("Table body not found")
        return None

    data = []
    col_names = None

    for row in table_body.find_all('tr'):
        season_cell = row.find('th')
        if season_cell is None:
            # Without a row header there is no season label to record.
            continue
        season = season_cell.text.strip()

        table_data = [cell for cell in row if cell.name == "td"]

        if col_names is None:
            col_names = ["player", "season", "season_type"] + [
                td['data-stat'] for td in table_data if 'data-stat' in td.attrs
            ]

        data.append(
            [player_name, season, season_type]
            + [td.text.strip() for td in table_data]
        )

    if col_names and data:
        return pd.DataFrame(data, columns=col_names)

    print("No data found")
    return None


In [133]:
def scrape_top_diddy(link_dict: dict, sec_delay: int) -> pd.DataFrame:
    """Scrape playoff and regular-season totals for every listed player.

    Args:
        link_dict: Maps a player's name to the URL of that player's page.
        sec_delay: Seconds to wait before each request (two requests are
            made per player: playoffs first, then regular season).

    Returns:
        All scraped tables stacked with a fresh row index, or an empty
        DataFrame when ``link_dict`` is empty or nothing could be scraped.
    """
    frames = []

    for name, link in link_dict.items():
        print(f"Scraping: {name}")

        for is_playoff in (True, False):
            time.sleep(sec_delay)
            totals = scrape_totals(link=link, player_name=name, is_playoff=is_playoff)
            # scrape_totals returns None when a table is unavailable.
            if totals is not None:
                frames.append(totals)

    if not frames:
        print("No totals scraped")
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)

In [135]:
# Scrape career totals for every player in the dict, waiting 3 seconds before each request.
top_df = scrape_top_diddy(link_dict=top_20_player_links, sec_delay=3)

Scraping: Dwayne Wade
Scraping: Kobe Bryant
Scraping: Lamar Odom
Scraping: Pau Gasol
Scraping: Derek Fisher
Scraping: Lebron James
Scraping: Jordan Farmar
Scraping: Sasha Vujačić
Scraping: Trevor Ariza
Scraping: Udonis Haslem
Scraping: Andrew Bynum
Scraping: Jason Kidd
Scraping: Kevin Garnett
Scraping: Luke Walton
Scraping: Paul Pierce
Scraping: Rajon Rondo
Scraping: Ray Allen
Scraping: Shannon Brown
Scraping: Carmelo Anthony
Scraping: Chris Bosh


In [138]:
# Persist the career totals; the DataFrame index is written as the first CSV column.
top_df_path = "./Diddy-Stats/top20.csv"
top_df.to_csv(top_df_path)