# Scraping Diddy Sheets
Input CSV columns:
- First column = date (`date`)
- Second column = home team (`home_team`)

In [67]:
# %pip install pandas scikit-learn bs4 requests

import pandas as pd 
from datetime import datetime, date
from bs4 import BeautifulSoup
import requests
import time

In [33]:
def get_opp_team(soup: BeautifulSoup) -> "str | None":
    """Return the team code found in the first team link of the scorebox.

    Looks up the ``<div class="scorebox">`` element, takes the first
    ``<strong>`` inside it, then the first ``<a>`` inside that, and returns
    the third ``/``-separated piece of the link's ``href``.

    Args:
        soup: Parsed box-score page.

    Returns:
        The team code taken from the link, or ``None`` (after printing a
        short message) when any of the expected elements is missing.
    """
    scorebox_div = soup.find('div', class_="scorebox")
    if scorebox_div is None:
        print("Not Found")
        return None

    # Guard the intermediate <strong>: chaining .find('a') directly on a
    # missing tag would raise AttributeError instead of reporting the problem.
    strong_tag = scorebox_div.find('strong')
    link_tag = strong_tag.find('a') if strong_tag is not None else None
    if link_tag is None:
        print("<a> tag not found")
        return None

    href = link_tag.get("href")
    if href is None:
        print("Link not found in href")
        return None

    # The team code is the third piece of the href; make sure that piece
    # exists before indexing so a short link cannot raise IndexError.
    parts = href.split("/")
    if len(parts) < 3 or not parts[2]:
        print("Link not found in href")
        return None

    return parts[2]

In [16]:
def get_table_headers(soup: BeautifulSoup, table_id: str) -> "list[str] | None":
    """Return the ``data-stat`` names of a table's header cells.

    Args:
        soup: Parsed page containing the table.
        table_id: Value of the ``id`` attribute of the ``<table>`` to read.

    Returns:
        The ``data-stat`` values of the ``<th>`` cells in the table's
        ``<thead>``, with the first entry dropped, or ``None`` (after
        printing a message) when the table or its header is missing.
    """
    # Match on the id attribute explicitly; a bare positional string is
    # interpreted by BeautifulSoup as a CSS class filter, not an id.
    table = soup.find('table', id=table_id)

    if table is None:
        print("Couldn't find the table")
        return None

    table_header = table.find('thead')

    if table_header is None:
        print("Couldn't find the table header")
        return None

    # The attribute is spelled 'data-stat' (hyphen); testing for 'data_stat'
    # never matched, so the list was always empty.
    data_stats = [
        th['data-stat']
        for th in table_header.find_all('th')
        if 'data-stat' in th.attrs
    ]

    return data_stats[1:]

In [65]:
def get_stat_table(soup: BeautifulSoup, table_id: str, team: str) -> "pd.DataFrame | None":
    """Build a DataFrame of per-player rows from one stats table.

    Args:
        soup: Parsed box-score page.
        table_id: Value of the ``id`` attribute of the ``<table>`` to read.
        team: Team code written into the ``team`` column of every row.

    Returns:
        A DataFrame whose columns are ``player``, ``team`` and then the
        ``data-stat`` names of the row's ``<td>`` cells (taken from the first
        kept row), holding the stripped cell text. Returns ``None`` (after
        printing a message) when the table, its body, or any usable row is
        missing.
    """
    table = soup.find('table', id=table_id)

    if table is None:
        print("Table not found")
        return None

    table_body = table.find('tbody')

    if table_body is None:
        print("Table body not found")
        return None

    data = []
    col_names = None

    for row in table_body.find_all('tr'):
        # 'thead' is a CSS class value, not an attribute name, so it has to
        # be looked up in the row's class list rather than in row.attrs.
        if 'thead' in (row.get('class') or []):
            continue

        row_html = str(row)
        if "Did Not Play" in row_html or "Reserves" in row_html:
            continue

        name_cell = row.find('th')
        if name_cell is None:
            # No row-header cell means there is no player name to record.
            continue
        player_name = name_cell.text.strip()

        # Keep only cells that carry a data-stat name so the values always
        # line up one-to-one with the column names derived from them.
        table_data = [
            td for td in row
            if td.name == "td" and 'data-stat' in td.attrs
        ]

        if col_names is None:
            col_names = ["player", "team"] + [td['data-stat'] for td in table_data]

        data.append([player_name, team] + [td.text.strip() for td in table_data])

    if col_names and data:
        return pd.DataFrame(data, columns=col_names)

    print("No data found")
    return None

In [90]:
def add_date(df: pd.DataFrame, date: date):
    """Set the ``date`` column of *df* to a single value, in place.

    Args:
        df: Frame to modify; may be ``None`` when an upstream lookup failed.
        date: Value written to every row of the ``date`` column.

    Returns:
        The same *df* object with the column set, or ``None`` when *df* is
        ``None``.
    """
    # get_stat_table signals failure by returning None; pass that through
    # instead of raising TypeError on item assignment.
    if df is None:
        print("No dataframe to add a date to")
        return None

    df['date'] = date
    return df

def correct_date(df: pd.DataFrame):
    """Convert a string ``date`` column to datetimes, in place.

    String dates are parsed with the ``%m/%d/%Y`` format; a ``date`` column
    that is not string-typed is left untouched.

    Returns:
        The same *df* object, or ``None`` (after printing a message) when
        there is no ``date`` column.
    """
    if "date" in df.columns:
        date_col = df['date']
        needs_parsing = pd.api.types.is_string_dtype(date_col)
        if needs_parsing:
            df['date'] = pd.to_datetime(date_col, format='%m/%d/%Y')
        return df

    print("Dates not found. Dataframe must contain a column named 'date'")
    return None

In [95]:
# input the home team and the day the game was played
def scrape_game(home_team: str, game_date: date, sec_delay: int):
    """Download one box-score page and return both teams' basic stat rows.

    Args:
        home_team: Home team code used in the box-score URL and table id.
        game_date: Day the game was played; formatted as ``YYYYMMDD`` for
            the URL and written to the ``date`` column of the result.
        sec_delay: Seconds to sleep before sending the request.

    Returns:
        A DataFrame with the home team's rows followed by the opponent's
        rows, or ``None`` (after printing a message) when the page cannot be
        fetched or the expected elements are not found in it.
    """
    print(f"Scraping: {home_team}")

    # Pause before every request so consecutive calls are spaced out.
    time.sleep(sec_delay)

    date_str = game_date.strftime("%Y%m%d")
    link = f"https://www.basketball-reference.com/boxscores/{date_str}0{home_team}.html"

    # A timeout keeps a stalled connection from hanging the whole run, and a
    # network error is reported the same way as a non-200 response.
    try:
        response = requests.get(link, timeout=30)
    except requests.RequestException as err:
        print(f"Request failed: {link} ({err})")
        return None

    if response.status_code != 200:
        print("Not Found: " + link)
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    opp_team = get_opp_team(soup)
    if opp_team is None:
        # Without the opponent code the second table id cannot be built.
        print("Opponent team not found: " + link)
        return None

    home_table_id = f"box-{home_team}-game-basic"
    opponent_table_id = f"box-{opp_team}-game-basic"

    home_df = get_stat_table(soup, home_table_id, home_team)
    opp_df = get_stat_table(soup, opponent_table_id, opp_team)
    if home_df is None or opp_df is None:
        # Skip the game rather than return only half of its rows.
        print("Box score tables not found: " + link)
        return None

    home_df = add_date(home_df, game_date)
    opp_df = add_date(opp_df, game_date)

    return pd.concat([home_df, opp_df], ignore_index=True)


In [70]:
def scrape_sheets(df: pd.DataFrame, sec_delay: int) -> pd.DataFrame:
    """Scrape every game listed in *df* and stack the results.

    Args:
        df: Frame with a ``home_team`` column and a ``date`` column; each
            row is passed to ``scrape_game``.
        sec_delay: Seconds ``scrape_game`` waits before each request.

    Returns:
        The concatenation of all successfully scraped games, or an empty
        DataFrame when no game produced any data.
    """
    game_frames = []

    # Iterate explicitly instead of DataFrame.apply: apply is not meant for
    # functions that return whole DataFrames (or None) per row.
    for home_team, game_date in zip(df['home_team'], df['date']):
        game_df = scrape_game(home_team, game_date, sec_delay=sec_delay)
        if game_df is not None:
            game_frames.append(game_df)

    if not game_frames:
        # pd.concat raises on an empty list; report and return an empty frame.
        print("No games scraped")
        return pd.DataFrame()

    return pd.concat(game_frames)

In [96]:
# Load the CSV that lists the games to scrape.
diddy_path = "./Celeb-Attendence/diddy.csv"
diddy_csv = pd.read_csv(filepath_or_buffer=diddy_path)

In [97]:
# Preview the first rows, then check the Python type of the first 'date'
# value to see whether the column still needs converting.
diddy_csv.head()
type(diddy_csv['date'][0])

str

In [98]:
# Convert the string 'date' column to datetimes (scrape_game calls
# strftime on each date).
diddy_csv = correct_date(diddy_csv)

In [None]:
# Scrape every game listed in diddy_csv, sleeping 5 seconds before each
# request, then preview the combined result.
diddy_games_df = scrape_sheets(diddy_csv, sec_delay=5)
diddy_games_df.head()

Scraping: LAL


In [None]:
diddy_save_path = "./Diddy-Stats/diddygames.csv"

# Pass the path so the file is actually written: to_csv() with no argument
# only returns the CSV text and leaves diddy_save_path unused.
# index=False omits the row index, which repeats per game after the concat
# in scrape_sheets and carries no information.
diddy_games_df.to_csv(diddy_save_path, index=False)