# BSA Data Journalism Spring 2024

## Data scraping

In [170]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import time
import os

### Finding draft picks with basketball-reference
Defining start/end year and finding viable urls

In [171]:
# First and last NBA draft years to scrape (both inclusive).
start_year = 2008
end_year = 2023
# Base URL of the draft pages; scrape_draft() appends "NBA_<year>.html" to it.
url_reference = "https://www.basketball-reference.com/draft/"
# Presumably the data-stat column names of the draft table, kept for reference — TODO confirm.
# cols = ['pick_overall', 'team_id', 'player', 'college_name', 'seasons', 'g', 'mp', 'pts', 'trb', 'ast', 'fg_pct', 'fg3_pct', 'ft_pct', 'mp_per_g',
#        'pts_per_g', 'trb_per_g', 'ast_per_g', 'ws', 'ws_per_48', 'bpm', 'vorp']
# Holder for the combined draft table of all scraped years (None until a scrape runs).
dfDraftInfo : pd.DataFrame = None
# CSV file the draft data is saved to by scrape_draft() and loaded from afterwards.
path = "./data/draftdata.csv"

Table found on page html with id = "stats"

In [172]:
def get_table(soup: BeautifulSoup, year : int):
    """Extract the draft table (``id="stats"``) from a parsed draft page.

    Each ``<tr>`` of the table becomes one row of the result. Only ``<td>``
    cells carrying a ``data-stat`` attribute are read; the attribute value is
    the column name and the stripped cell text is the value (kept as ``str``).
    Rows without any such cell (e.g. header rows) are skipped.

    Args:
        soup: Parsed HTML of one draft page.
        year: Draft year, written to the ``year`` column of every row.

    Returns:
        A DataFrame with one row per non-empty table row and a fresh 0..n-1
        index. An empty DataFrame is returned when the page has no ``stats``
        table or the table has no data rows (previously this raised
        ``AttributeError`` on ``None``).
    """
    table = soup.find('table', {'id': 'stats'})
    if table is None:
        return pd.DataFrame()

    # Collect plain dicts and build the frame once instead of concatenating a
    # one-row DataFrame per player.
    records = []
    for row in table.find_all('tr'):
        record = {}
        for cell in row.find_all('td'):
            datatype = cell.get('data-stat')
            if datatype is not None:
                record[str(datatype)] = str(cell.text.strip())

        # An empty dict means the row had no data cells.
        if record:
            record["year"] = year
            records.append(record)

    return pd.DataFrame(records)

Go through every year between start_year and end_year and save the result in a CSV file so we don't need to run this more than once

In [173]:
def scrape_draft():
    """Scrape the draft table of every year in ``start_year..end_year``.

    For each year the page ``url_reference + "NBA_<year>.html"`` is fetched
    and parsed with ``get_table``. The per-year tables are combined into the
    module-level ``dfDraftInfo``; when every request succeeded the combined
    table is also written to ``path`` as CSV. Scraping stops at the first
    non-200 response, in which case nothing is written to disk.

    NOTE: ``url_reference`` is rebound to other sites further down in this
    notebook, so this must run while it still points at the draft pages.
    """
    # Without this declaration the assignment below makes dfDraftInfo a local
    # name and the function fails with UnboundLocalError.
    global dfDraftInfo

    success = True
    frames = []

    for year in range(start_year, end_year + 1):
        url = url_reference + "NBA_" + str(year) + ".html"
        response = requests.get(url)
        print(year)

        if response.status_code != 200:
            print("Unsuccessful Get request")
            success = False
            break

        soup = BeautifulSoup(response.text, "html.parser")
        df = get_table(soup, year)
        # Skip years that produced no rows so they cannot break the concat.
        if df is not None and not df.empty:
            frames.append(df)

        # delay scraping
        time.sleep(2)

    if frames:
        dfDraftInfo = pd.concat(frames).reset_index(drop=True)
        # save data from scrape only when the whole range was fetched
        if success:
            dfDraftInfo.to_csv(path, index=False)

### Filtering

Filter the data, get rid of the NA rows (forfeited)

In [174]:
# Load the saved draft table and discard rows that have no overall pick number.
df_draft = pd.read_csv(path).dropna(subset=['pick_overall'])

Filter so that only the lottery picks are in the dataframe

In [175]:
# Keep only the picks numbered below 15 (the lottery picks).
is_lottery_pick = df_draft['pick_overall'].lt(15)
df_draft = df_draft[is_lottery_pick]

### Get pre-NBA stats for CBB players
Using sports-reference.com

In [176]:
# Example sports-reference player page; the trailing number distinguishes
# players who share a name.
# NOTE: this rebinds url_reference, which scrape_draft() reads as the draft base URL.
url_reference = "https://www.sports-reference.com/cbb/players/anthony-davis-5.html"

Get the list of names of players in the dataframe that went to college with their corresponding colleges

In [265]:
def get_cbb_players():
    """Return the rows of ``df_draft`` whose ``college_name`` is not missing."""
    return df_draft.dropna(subset=['college_name'])

df_cbb_players = get_cbb_players()

In [178]:
import unicodedata
import re

def split_name(name : str):
    """Split a full name into a cleaned first name and the remaining parts.

    The name is split on single spaces. Every part is lowercased and stripped
    of all characters other than word characters and hyphens (so periods and
    apostrophes disappear).

    Args:
        name: Full player name, e.g. ``"Otto Porter Jr."``.

    Returns:
        A tuple ``(first_name, last_names)`` where ``first_name`` is a string
        and ``last_names`` is the list of the remaining cleaned parts.
    """
    unwanted = re.compile(r'[^\w-]+')
    first, *rest = (unwanted.sub('', part).lower() for part in name.split(" "))
    return first, rest


Players whose last names have more than one word:
Otto ['Porter', 'Jr.']
Dennis ['Smith', 'Jr.']
Marvin ['Bagley', 'III']
Jaren ['Jackson', 'Jr.']
Wendell ['Carter', 'Jr.']
Michael ['Porter', 'Jr.']
Kira ['Lewis', 'Jr.']
Jabari ['Smith', 'Jr.']
Dereck ['Lively', 'II']

In [179]:
# Hand-picked sports-reference pages for players whose name splits into more
# than one last-name part (suffixes such as "Jr.", "II", "III"); for these the
# generated "<first>-<last>-<n>.html" URL is not used.
# Keyed by the lowercased first name returned by split_name(), so two such
# players sharing a first name cannot both be listed.
urls = {
    "otto" : "https://www.sports-reference.com/cbb/players/otto-porter-1.html",
    "dennis" : "https://www.sports-reference.com/cbb/players/dennis-smithjr-1.html",
    "marvin" : "https://www.sports-reference.com/cbb/players/marvin-bagleyiii-1.html",
    "jaren" : "https://www.sports-reference.com/cbb/players/jaren-jacksonjr-1.html",
    "wendell" : "https://www.sports-reference.com/cbb/players/wendell-carterjr-1.html",
    "michael" : "https://www.sports-reference.com/cbb/players/michael-porterjr-1.html",
    "kira" : "https://www.sports-reference.com/cbb/players/kira-lewisjr-1.html",
    "jabari" : "https://www.sports-reference.com/cbb/players/jabari-smith-2.html",
    "dereck" :  "https://www.sports-reference.com/cbb/players/dereck-lively-ii-1.html",
}

In [180]:
# Placeholder for the college per-game stats; assigned later from the cached CSV
# (or from scrape_cbb_stats() when the scrape is re-enabled).
df_cbb_stats : pd.DataFrame = None

In [181]:
def scrape_helper(url : str, name : str):
    """Fetch one sports-reference player page and return a one-row stats frame.

    The ``players_per_game`` table is read row by row and the cells of the
    last row that contains data win (presumably the career line — TODO
    confirm against the page layout). Cell values are kept as stripped strings
    keyed by their ``data-stat`` attribute.

    Args:
        url: Player page to download.
        name: Player name stored in the ``player`` column of the result.

    Returns:
        A single-row DataFrame, or ``None`` when the response status is not
        200. If the page has no ``players_per_game`` table the row contains
        only the ``player`` column (previously this raised ``NameError``).
    """
    response = requests.get(url)
    if response.status_code != 200:
        print("Bad Response Status Code", response.status_code)
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find('table', {'id': 'players_per_game'})

    # Start with an empty record so a missing or empty table cannot leave the
    # name unbound below.
    player_data_dict = {}
    if table is None:
        print("No players_per_game table found at", url)
    else:
        for row in table.find_all('tr'):
            row_data = {}
            for cell in row.find_all('td'):
                datatype = cell.get('data-stat')
                if datatype is not None:
                    row_data[str(datatype)] = str(cell.text.strip())

            # Later rows replace earlier ones, but a row without data cells
            # (e.g. a header row) must not wipe what was already collected.
            if row_data:
                player_data_dict = row_data

    player_data_dict['player'] = name
    return pd.DataFrame([player_data_dict])

In [182]:
# First names under which sports-reference files players who are drafted
# under a nickname (bam adebayo -> edrice, mo bamba -> mohamed,
# ja morant -> temetrius, obi toppin -> obadiah, johnny davis -> jonathan).
_CBB_FIRST_NAME_ALIASES = {
    "bam": "edrice",
    "mo": "mohamed",
    "ja": "temetrius",
    "obi": "obadiah",
    "johnny": "jonathan",
}

# How many times a player's stats page is requested before giving up on it.
_MAX_STATS_ATTEMPTS = 5


def _find_cbb_player_url(first_name, last_name, team):
    """Return the page of the ``first_name last_name`` who played for ``team``.

    Player pages are numbered ``<first>-<last>-<n>.html``; candidates 1..14 are
    tried in order and the first page with a ``school_name`` cell containing
    ``team`` is returned (for example anthony-davis-5, not anthony-davis-4).
    Returns ``""`` when no candidate matches.
    """
    for i in range(1, 15):
        url = f"https://www.sports-reference.com/cbb/players/{first_name}-{last_name}-{i}.html"

        for _ in range(3):
            response = requests.get(url)

            if response.status_code == 200:
                print(team)
                soup = BeautifulSoup(response.text, "html.parser")
                school_name_elements = soup.find_all(attrs={"data-stat": "school_name"})
                if any(team in element.get_text() for element in school_name_elements):
                    return url
                # The page loaded but belongs to another player, so asking
                # for it again cannot change the outcome: try the next number.
                time.sleep(2)
                break

            print("Bad Response Status Code", response.status_code)
            time.sleep(4)

    return ""


def scrape_cbb_stats():
    """Scrape the college per-game stats of every player in ``df_cbb_players``.

    Players whose name has exactly one last-name part are located with the
    numbered URL pattern and matched on the first word of their college name;
    all others are looked up in the hand-made ``urls`` table. Each located
    page is parsed with ``scrape_helper``.

    Returns:
        A DataFrame with one row per scraped player and a fresh 0..n-1 index,
        or ``None`` when no player could be scraped.
    """
    frames = []

    for _, row in df_cbb_players.iterrows():
        name = row['player']
        first_name, last_name = split_name(name)
        # Only the first word of the college is used for matching.
        team = row['college_name'].split(" ")[0]

        if len(last_name) != 1:
            # Unknown names fall through to "URL not found" instead of
            # raising KeyError.
            url = urls.get(first_name, "")
        else:
            # shaedon sharpe doesn't qualify
            if first_name == "shaedon":
                continue
            first_name = _CBB_FIRST_NAME_ALIASES.get(first_name, first_name)
            # An empty string means no candidate page matched the college;
            # the last URL tried must not be scraped as if it were a match.
            url = _find_cbb_player_url(first_name, last_name[0], team)

        if url:
            df = None
            # Bounded retry: a page that keeps failing must not hang the run.
            for _ in range(_MAX_STATS_ATTEMPTS):
                df = scrape_helper(url, name)
                if df is not None:
                    break
                time.sleep(2)

            if df is None:
                print("Giving up on", url)
            else:
                frames.append(df)
        else:
            print("URL not found")

        time.sleep(5)

    if not frames:
        return None
    return pd.concat(frames).reset_index(drop=True)

In [183]:
# One-off scrape and save, left commented out so the cached CSV is reused.
# df_cbb_stats = scrape_cbb_stats()
# Location of the cached college per-game stats.
path2 = './data/cbbdata.csv'
# df_cbb_stats.to_csv(path2, index=False)


Delete unnecessary columns

In [187]:
# Columns of the cached stats that are not needed.
drop_cols = ['conf_abbr', 'class', 'sos-dum', 'sos']

# Load the cached college stats without those columns.
df_cbb_stats = pd.read_csv(path2).drop(columns=drop_cols)

### Scraping shot selection from barttorvik.com

Shot selection data only exists from 2010 onwards, must filter out 2008-2009

There is also no data on players with fewer than 10 games:
- Darius Garland
- Michael Porter Jr
- James Wiseman

In [185]:
# Example barttorvik player page (season year plus URL-encoded player name).
# NOTE: this rebinds url_reference, which scrape_draft() reads as the draft base URL.
url_reference = "https://barttorvik.com/playerstat.php?year=2016&p=Jaylen%20Brown"

In [281]:
# Renumber the rows first so the filtered frame keeps these fresh labels.
df_cbb_players = df_cbb_players.reset_index(drop = True)

# Players left out of the shot-selection scrape.
not_qualified = ["Michael Porter Jr.", "Darius Garland", "James Wiseman"]

# Keep draft years from 2010 on and remove the players listed above.
df_cbb_players = df_cbb_players[
    (df_cbb_players['year'] >= 2010)
    & ~df_cbb_players['player'].isin(not_qualified)
]

# Placeholder for the shot-selection data.
df_shot_selection : pd.DataFrame = None

Correct table has headers "DUNKS", "At the rim", "Other 2-PT", "3-Pt Jumpers"

In [282]:
def has_desired_headers(headers):
    """Tell whether ``headers`` mentions all four shot-selection column titles.

    The comparison ignores case and only checks that each title occurs
    somewhere in the text.

    Args:
        headers: Text of a table header.

    Returns:
        ``True`` when every desired title is present, otherwise ``False``.
    """
    desired_headers = ["DUNKS", "AT THE RIM", "OTHER 2-PT", "3-PT JUMPERS"]
    haystack = headers.lower()
    return all(title.lower() in haystack for title in desired_headers)

In [283]:
# Column names given, in order, to the cells of the shot-selection table by
# scrape_shot_selection().
shot_stat_headers = ['dunk_tot', 'dunk_pct', 
                     'rim_tot', 'rim_pct', 'rim_asted', 
                     'other2pt_tot', 'other2pt_pct', 'other2pt_asted', 
                     '3pt_tot', '3pt_pct', '3pt_asted']

# Hand-picked barttorvik pages for players whose name splits into more than
# one last-name part; keyed by the lowercased first name returned by
# split_name2(). There is no "michael" entry: Michael Porter Jr. is removed
# from df_cbb_players via not_qualified before the scrape.
shot_urls = {
    "otto" : "https://barttorvik.com/playerstat.php?year=2013&p=Otto%20Porter&t=Georgetown",
    "dennis" : "https://barttorvik.com/playerstat.php?year=2017&p=Dennis%20Smith%2C%20Jr.&t=North%20Carolina%20St.",
    "marvin" : "https://barttorvik.com/playerstat.php?year=2018&p=Marvin%20Bagley%20III&t=Duke",
    "jaren" : "https://barttorvik.com/playerstat.php?year=2018&p=Jaren%20Jackson%20Jr.&t=Michigan%20St.",
    "wendell" : "https://barttorvik.com/playerstat.php?year=2018&p=Wendell%20Carter%20Jr.&t=Duke",
    "kira" : "https://barttorvik.com/playerstat.php?year=2020&p=Kira%20Lewis%20Jr.&t=Alabama",
    "jabari" : "https://barttorvik.com/playerstat.php?year=2022&p=Jabari%20Smith&t=Auburn",
    "dereck" :  "https://barttorvik.com/playerstat.php?year=2023&p=Dereck%20Lively%20II&t=Duke",
}

In [284]:
# doesn't remove the period at the end of the names
def split_name2(name : str):
    """Split a full name like ``split_name`` does, but keep periods.

    The name is split on single spaces. Every part is lowercased and stripped
    of all characters other than word characters, hyphens and periods, so
    ``"P.J."`` stays ``"p.j."`` and ``"Jr."`` stays ``"jr."``.

    Args:
        name: Full player name.

    Returns:
        A tuple ``(first_name, last_names)`` where ``first_name`` is a string
        and ``last_names`` is the list of the remaining cleaned parts.
    """
    unwanted = re.compile(r'[^.\w-]+')
    first, *rest = (unwanted.sub('', part).lower() for part in name.split(" "))
    return first, rest

In [308]:
def scrape_shot_selection():
    """Scrape the barttorvik shot-selection numbers of every player in ``df_cbb_players``.

    The player page is built from the draft year and the player's name (with
    a few spelling fixes), or taken from ``shot_urls`` when the name has more
    than one last-name part. On each page the table whose header passes
    ``has_desired_headers`` is read and its cells are stored, in order, under
    the names in ``shot_stat_headers``.

    A player whose page lacks that table, or whose URL is unknown, is reported
    and skipped. The first non-200 response stops the scrape and whatever was
    collected so far is returned.

    Returns:
        A DataFrame with one row per scraped player and a fresh 0..n-1 index,
        or ``None`` when nothing was scraped.
    """
    # Spellings used in the barttorvik URLs for these first names.
    first_name_fixes = {
        'cj': "c.j.",
        'dangelo': "D%27Angelo",
        'deaaron': 'De%27Aaron',
        'bam': 'edrice',
        'mo': "mohamed",
        'rj': "r.j.",
    }

    records = []

    for _, row in df_cbb_players.iterrows():
        name = row['player']
        year = row['year']
        first_name, last_name = split_name2(name)

        if first_name == "shaedon":
            continue

        first_name = first_name_fixes.get(first_name, first_name)
        # Slices instead of last_name[0] so a one-word name cannot raise.
        if first_name == 'deandre' and last_name[:1] != ["ayton"]:
            first_name = "De%27andre"
        if first_name == "p.j." and last_name[:1] == ["washington"]:
            first_name = "pj"

        if len(last_name) == 1:
            url = f"https://barttorvik.com/playerstat.php?year={year}&p={first_name}%20{last_name[0]}"
        else:
            url = shot_urls.get(first_name)
            if url is None:
                print("URL not found for", name)
                continue

        print(url)
        response = requests.get(url)

        if response.status_code != 200:
            print("Bad Request: ", response.status_code)
            break

        soup = BeautifulSoup(response.text, "html.parser")

        # Tables without a <thead> are skipped instead of raising on None.
        shot_table = None
        for table in soup.find_all('table'):
            thead = table.find('thead')
            if thead is not None and has_desired_headers(thead.text):
                shot_table = table
                break

        if shot_table is None:
            print("Shot selection table not found for", name)
        else:
            cells = shot_table.find_all('td')
            if len(cells) != len(shot_stat_headers):
                print("Unexpected number of shot selection cells for", name, len(cells))

            shot_stat_dict = {'player': name}
            # zip stops at the shorter sequence, so extra cells cannot index
            # past the end of shot_stat_headers.
            for header, td in zip(shot_stat_headers, cells):
                shot_stat_dict[header] = td.text
            records.append(shot_stat_dict)

        time.sleep(5)

    if not records:
        return None
    return pd.DataFrame(records)


In [None]:
# df_shot_selection = scrape_shot_selection()

In [312]:
# NOTE: this rebinds `path`, which earlier held the draft CSV location;
# re-running the earlier pd.read_csv(path) cell after this one would load the
# shot data instead.
path = "./data/shotdata.csv"
# One-off save of the scrape result, left commented out.
# df_shot_selection.to_csv(path, index = False)

In [313]:
# Load the shot-selection data from the CSV at `path`.
df_shot_selection = pd.read_csv(path)