# Scraping Diddy Sheets
First column = date
Second column = home team

In [9]:
# %pip install pandas scikit-learn bs4 requests

import pandas as pd 
from datetime import datetime, date
from bs4 import BeautifulSoup
import requests
import time

In [23]:
def get_opp_team(soup: BeautifulSoup) -> str: 
    scorebox_div = soup.find('div', class_="scorebox")
    if scorebox_div is None:
        print("Not Found")
        return None
    
    link_tag = scorebox_div.find('strong').find('a')
    if link_tag is None:
        print("<a> tag not found")
        return None

    if 'href' not in link_tag.attrs:
        print("Link not found in href")
        return None
    
    href = link_tag["href"]

    return href.split("/")[2]

In [22]:
def get_table_headers(soup: BeautifulSoup, table_id: str) -> str:
    table = soup.find('table', table_id)

    if table is None:
        print("Couldn't find the table")
        return None
    
    table_header = table.find('thead')

    if table_header is None:
        print("Couldn't find the table header")
    
    data_stats = [th['data-stat'] for th in table.find_all('th') if 'data_stat' in th.attrs]

    return data_stats[1:]

In [21]:
def get_stat_table(soup: BeautifulSoup, table_id: str, team: str) -> pd.DataFrame:
    table = soup.find('table', id = table_id)

    if table is None:
        print("Table not found")
        return None
    
    table_body = table.find('tbody')

    if table_body is None:
        print("Table body not found")
        return None

    table_rows = [tr for tr in table_body.find_all('tr') if 'thead' not in tr.attrs]

    if table_rows is None:
        print("Table rows not found")
        return None

    data = []
    col_names = None

    for row in table_rows:
        table_data = [td for td in row if td.name == "td"]

        if "Did Not Play" in str(row) or "Reserves" in str(row): 
            continue 
    
        player_name = row.find('th').text.strip()

        if col_names is None:
            col_names = ["player", "team"] + [td['data-stat'] for td in table_data if 'data-stat' in td.attrs]
        
        data_values = [td.text.strip() for td in table_data]
        data_values = [player_name, team] + data_values
        data.append(data_values)

    if col_names and data:
        return pd.DataFrame(data, columns=col_names)
    else:
        print("No data found")
        return None

In [20]:
def add_date(df: pd.DataFrame, date: date):
    df['date'] = date
    return df

def correct_date(df: pd.DataFrame):
    if "date" not in df.columns:
        print("Dates not found. Dataframe must contain a column named 'date'")
        return None
    
    if pd.api.types.is_string_dtype(df['date']):
        df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y')

    return df

In [19]:
# input the home team and the day the game was played
def scrape_game(home_team: str, game_date: date, sec_delay: int):
    print(f"Scraping: {home_team}")

    time.sleep(sec_delay)

    date_str = game_date.strftime("%Y%m%d")
    link = f"https://www.basketball-reference.com/boxscores/{date_str}0{home_team}.html"

    response = requests.get(link)
    if response.status_code != 200:
        print("Not Found: " + link)
        return None
    
    soup = BeautifulSoup(response.content, 'html.parser')
    opp_team = get_opp_team(soup)

    home_table_id = f"box-{home_team}-game-basic"
    opponent_table_id = f"box-{opp_team}-game-basic"

    home_df = get_stat_table(soup, home_table_id, home_team)
    home_df = add_date(home_df, game_date)
    opp_df = get_stat_table(soup, opponent_table_id, opp_team)
    opp_df = add_date(opp_df, game_date)

    return pd.concat([home_df, opp_df], ignore_index=True)


In [18]:
def scrape_sheets(df: pd.DataFrame, sec_delay: int) -> pd.DataFrame:
    all_games_df = df.apply(lambda row: scrape_game(row['home_team'], row['date'], sec_delay=sec_delay), axis = 1).tolist()
    
    return pd.concat(all_games_df)

In [18]:
# importing the diddy csv
diddy_path = "./Celeb-Attendence/diddy.csv"

diddy_csv = pd.read_csv(diddy_path)

In [19]:
diddy_csv.head()
type(diddy_csv['date'][0])

str

In [20]:
diddy_csv = correct_date(diddy_csv)

In [None]:
diddy_games_df = scrape_sheets(diddy_csv, sec_delay=5)
diddy_games_df.head()

In [101]:
diddy_save_path = "./Diddy-Stats/diddygames.csv"

diddy_games_df.to_csv(diddy_save_path)

## Getting the Career Stats of the Top 20 Diddy players


In [1]:
top_20_player_links = {
    "Dwyane Wade": "https://www.basketball-reference.com/players/w/wadedw01.html",
    "Kobe Bryant": "https://www.basketball-reference.com/players/b/bryanko01.html",
    "Lamar Odom": "https://www.basketball-reference.com/players/o/odomla01.html",
    "Pau Gasol": "https://www.basketball-reference.com/players/g/gasolpa01.html",
    "Derek Fisher": "https://www.basketball-reference.com/players/f/fishede01.html",
    "Lebron James": "https://www.basketball-reference.com/players/j/jamesle01.html",
    "Jordan Farmar": "https://www.basketball-reference.com/players/f/farmajo01.html",
    "Sasha Vujačić": "https://www.basketball-reference.com/players/v/vujacsa01.html",
    "Trevor Ariza": "https://www.basketball-reference.com/players/a/arizatr01.html",
    "Udonis Haslem": "https://www.basketball-reference.com/players/h/hasleud01.html",
    "Andrew Bynum": "https://www.basketball-reference.com/players/b/bynuman01.html",
    "Jason Kidd": "https://www.basketball-reference.com/players/k/kiddja01.html",
    "Kevin Garnett": "https://www.basketball-reference.com/players/g/garneke01.html",
    "Luke Walton": "https://www.basketball-reference.com/players/w/waltolu01.html",
    "Paul Pierce": "https://www.basketball-reference.com/players/p/piercpa01.html",
    "Rajon Rondo": "https://www.basketball-reference.com/players/r/rondora01.html",
    "Ray Allen": "https://www.basketball-reference.com/players/a/allenra02.html",
    "Shannon Brown": "https://www.basketball-reference.com/players/b/brownsh01.html",
    "J.R. Smith": "https://www.basketball-reference.com/players/s/smithjr01.html",
    "Chris Bosh": "https://www.basketball-reference.com/players/b/boshch01.html"
}

New setup for the tables in basketball reference
* regular season is in the div with id="div_per_game_stats"
* post season is in the div with id="div_per_game_stats_post"

For total stats
* regular = id="div_totals"
* playoffs = "div_playoffs_totals"


In [10]:
def scrape_totals(link: str, player_name: str, is_playoff: bool = False) -> pd.DataFrame:
    response = requests.get(link)

    if response.status_code != 200:
        print("Not Found: " + link)
        return None
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    if is_playoff:
        div_id = "div_playoffs_totals"
    else:
        div_id = "div_totals"

    div = soup.find('div', id=div_id)
    table = div.find('table')

    table_body = table.find('tbody')
    table_rows = table_body.find_all('tr')
    
    data = []
    col_names = None

    for row in table_rows:
        table_data = [td for td in row if td.name == "td"]

        season = row.find('th').text.strip()

        if col_names is None:
            col_names = ["player", "season", "season_type"] + [td['data-stat'] for td in table_data if 'data-stat' in td.attrs]
        
        data_values = [td.text.strip() for td in table_data]
        data_values = [player_name, season, ("playoff" if is_playoff else "regular")] + data_values
        data.append(data_values)

    if col_names and data:
        return pd.DataFrame(data, columns=col_names)
    else:
        print("No data found")
        return None


In [11]:
def scrape_top_diddy(link_dict: dict, sec_delay: int) -> pd.DataFrame:
    result_df = []

    for name,link in link_dict.items():
        print(f"Scraping: {name}")
        time.sleep(sec_delay)

        playoff_df = scrape_totals(link=link, player_name=name, is_playoff=True)

        time.sleep(sec_delay)
        regular_df = scrape_totals(link=link, player_name=name, is_playoff=False)

        result_df.append(playoff_df)
        result_df.append(regular_df)

    return pd.concat(result_df, ignore_index=True)

In [140]:
top_df = scrape_top_diddy(link_dict=top_20_player_links, sec_delay=3)

Scraping: Dwayne Wade
Scraping: Kobe Bryant
Scraping: Lamar Odom
Scraping: Pau Gasol
Scraping: Derek Fisher
Scraping: Lebron James
Scraping: Jordan Farmar
Scraping: Sasha Vujačić
Scraping: Trevor Ariza
Scraping: Udonis Haslem
Scraping: Andrew Bynum
Scraping: Jason Kidd
Scraping: Kevin Garnett
Scraping: Luke Walton
Scraping: Paul Pierce
Scraping: Rajon Rondo
Scraping: Ray Allen
Scraping: Shannon Brown
Scraping: J.R. Smith
Scraping: Chris Bosh


In [141]:
top_df_path = "./Diddy-Stats/top20.csv"
top_df.to_csv(top_df_path)

## Scrape a season 
* Input player id, year, and season_type -> get the data for that season
    * year is the later one, so 2011-12 would be the integer 2012


In [2]:
player_ids = {
    "Dwyane Wade": "p/piercpa01",
    "Paul Pierce": "w/wadedw01"
}

In [7]:
def scrape_all_playoffs(url):
    # pgl_basic_playoffs
    response = requests.get(url)

    if response.status_code != 200:
        raise Exception("Error: " + str(response.status_code))
    
    soup = BeautifulSoup(response.content, 'html.parser')

    table_id = "pgl_basic_playoffs"
    table = soup.find('table', id = table_id)

    if table is None:
        raise Exception("Table not found")
    
    table_body = table.find('tbody')

    if table_body is None:
        raise Exception("Table Body not found")
    
    # Selecting all rows that don't have the 'thead' class attribute
    table_rows = table_body.find_all('tr', class_=lambda x: x != 'thead')

    if table_rows is None:
        raise Exception("No rows found")
    
    data = []
    col_names = None
    
    for row in table_rows:
        table_data = [td for td in row if "right" in td['class'] or ("left" in td['class'] and td["data-stat"] == "date_game")]

        if col_names is None:
            col_names = [td['data-stat'] for td in table_data if 'data-stat' in td.attrs]
        
        data_values = [td.text.strip() for td in table_data]
        data.append(data_values)
    
    if col_names and data:
        df = pd.DataFrame(data, columns=col_names)

        # dropping columns 
        # Don't need ranker, game_season, gs
        df = df.drop(['ranker', 'game_season', 'gs'], axis=1)
        return df
    else:
        raise Exception("No data found")

In [6]:
def scrape_playoffs(url, year):
    df = scrape_all_playoffs(url)

    # changing to date time format
    df['date_game'] = pd.to_datetime(df['date_game'])

    # filtering year 
    df = df.loc[df['date_game'].dt.year == year]

    df = df.reset_index(drop=True)

    return df

In [5]:
def scrape_regular(url):
    # pgl_basic
    response = requests.get(url)

    if response.status_code != 200:
        raise Exception(f"Error: {response.status_code}")
    
    soup = BeautifulSoup(response.content, 'html.parser')

    table_id = "pgl_basic"
    table = soup.find('table', id = table_id)

    if table is None:
        raise Exception("Table not found")
    
    table_body = table.find('tbody')

    if table_body is None:
        raise Exception("Table Body not found")
    
    # Selecting all rows that don't have the 'thead' class attribute
    table_rows = table_body.find_all('tr', class_=lambda x: x != 'thead')

    if table_rows is None:
        raise Exception("No rows found")
    
    data = []
    col_names = None
    
    for row in table_rows:
        if "Did Not Play" in str(row) or "Not With Team" in str(row) or "Inactive" in str(row) or "Did Not Dress" in str(row): 
            continue 

        if col_names is None:
            col_names = [td['data-stat'] for td in row if 'data-stat' in td.attrs]

        data_values = [td.text.strip() for td in row]
        data.append(data_values)
    
    if col_names and data:
        df = pd.DataFrame(data, columns=col_names)

        # dropping columns 
        df = df.drop(['ranker','game_season', 'gs', 'game_location', 'age'], axis=1)
        return df
    else:
        raise Exception("No data found")


In [4]:
def get_all_regular_years(player_id):
    url = f"https://www.basketball-reference.com/players/{player_id}.html"

    response = requests.get(url) 

    if response.status_code != 200:
        raise Exception(f"Error: {response.status_code}")
    
    soup = BeautifulSoup(response.content, 'html.parser') 

    inner_nav_div = soup.find('div', id='inner_nav')
    if inner_nav_div is None:
        raise Exception("inner_nav div not found")
    
    a_tags = inner_nav_div.find_all('a')
    
    if a_tags is None:
        raise Exception("a tags not found")
    
    game_log_hrefs = [a["href"] for a in a_tags 
                      if 'href' in a.attrs 
                      and 'gamelog' in a['href']
                      and 'playoff' not in a['href']]

    return list(set(game_log_hrefs))


In [3]:
def scrape_all_regular(player_name, player_id, sec_delay):
    url_suffixes = get_all_regular_years(player_id)

    df_list = []

    for suffix in url_suffixes:
        url = f"https://www.basketball-reference.com/{suffix}"

        df = scrape_regular(url)
        prev_year = str(int(suffix[-4:])-1)
        season = prev_year + "-" + suffix[-4:]
        df["season"] = season

        df_list.append(df)
        time.sleep(sec_delay)

    result_df = pd.concat(df_list, ignore_index=True)
    result_df['player'] = player_name
    return result_df

In [2]:
def scrape_season(player_name, player_id, year, season_type): 
    url = f"https://www.basketball-reference.com/players/{player_id}"

    df = None
    if season_type == "playoffs":
        url += "/gamelog-playoffs/"
        df = scrape_playoffs(url, year)

    elif season_type == "regular":
        url += f"/gamelog/{year}" 
        df = scrape_regular(url)

    df['player'] = player_name
    df['season_type'] = season_type

    return df 


## Getting Players Playoff and Regular season stats

In [19]:
pp_playoffs_df = scrape_all_playoffs("https://www.basketball-reference.com/players/p/piercpa01/gamelog-playoffs/")

In [133]:
pp_playoffs_df['player'] = "Paul Pierce"

In [135]:
pp_playoffs_path = "./career_data/paul_pierce_playoffs.csv"
pp_playoffs_df.to_csv(pp_playoffs_path)

In [28]:
wade_regular_df = scrape_all_regular("Dwyane Wade", "w/wadedw01", 5)

In [29]:
wade_regular_path = "./career_data/dwyane_wade_regular.csv"
wade_regular_df.to_csv(wade_regular_path, index=False)

In [30]:
sim_players = {
    "Dwyane Wade": "p/piercpa01",
    "Paul Pierce": "w/wadedw01",
    "Kevin Garnett": "g/garneke01",
    "Rajon Rondo": "r/rondora01",
    "Pau Gasol": "g/gasolpa01",
    "Lebron James": "j/jamesle01"
}

In [39]:
import os 

In [40]:
for key, value in sim_players.items(): 
    print(key)
    save_path = "./career_data/" + key.split(" ")[0].lower() + "_" + key.split(" ")[1].lower()
    
    if not os.path.isfile(save_path + "_regular.csv"):
        reg_df = scrape_all_regular(key, value, sec_delay=10)

        reg_df.to_csv(save_path + "_regular.csv")

    if not os.path.isfile(save_path + "_playoffs.csv"):
        playoff_url = f"https://www.basketball-reference.com/players/{value}/gamelog-playoffs/"
        playoff_df = scrape_all_playoffs(playoff_url)

        playoff_df['player'] = key

        playoff_df.to_csv(save_path + "_playoffs.csv")

Dwyane Wade
Paul Pierce
Kevin Garnett
Rajon Rondo
Pau Gasol
Lebron James


In [10]:
test_df = scrape_all_playoffs("https://www.basketball-reference.com/players/w/wadedw01/gamelog-playoffs/")

In [20]:
test_df['player'] = "Dwyane Wade"
test_df.to_csv('./career_data/dwyane_wade_playoffs.csv')

# Scraping Drizzy Sheets

In [4]:
import pandas as pd 
from datetime import datetime, date
from bs4 import BeautifulSoup
import requests
import time

In [5]:
drizzy_df = pd.read_csv("Celeb-Attendence/drizzy.csv")

In [None]:
len(drizzy_df["date"].unique())

123

In [None]:
# Fixing duplicates in drizzy_df
# drizzy_df = drizzy_df.drop_duplicates()

In [None]:
# drizzy_df.shape

(123, 3)

In [None]:
# drizzy_df.to_csv("Celeb-Attendence/drizzy.csv")

In [26]:
drizzy_df = correct_date(drizzy_df)

drizzy_sheets_df = scrape_sheets(drizzy_df, sec_delay=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y')


Scraping: HOU
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: LAC
Scraping: TOR
Scraping: TOR
Scraping: GSW
Scraping: TOR
Scraping: LAC
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: LAL
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: LAL
Scraping: GSW
Scraping: TOR
Scraping: LAL
Scraping: MIA
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Not Found: https://www.basketball-reference.com/boxscores/202110110TOR.html
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: LAL
Scraping: GSW
Scraping: MIA
Scraping: TOR
Scraping: TOR
Scraping: TOR
Not Found: https://www.basketball-reference.com/boxscores/202203230TOR.html
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: TOR
Scraping: LAL
Scraping: TOR
Scraping

In [28]:
drizzy_sheets_df.to_csv("Drizzy-Stats/drizzygames.csv")