# BSA Data Journalism Spring 2024

## Data scraping

In [29]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import time
import os

Defining start/end year and finding viable urls

In [30]:
# Draft classes to scrape; the scraping loop treats both endpoints as inclusive.
start_year, end_year = 2008, 2023

# Base address of the per-year draft pages ("NBA_<year>.html" is appended to it).
url_reference = "https://www.basketball-reference.com/draft/"

# CSV file the combined draft table is written to and later read back from.
path = "./data/draftdata.csv"

# Placeholder for the combined draft table across all scraped years.
dfDraftInfo : pd.DataFrame = None

# Commented-out list of column names, kept for reference only:
# cols = ['pick_overall', 'team_id', 'player', 'college_name', 'seasons', 'g', 'mp', 'pts', 'trb', 'ast', 'fg_pct', 'fg3_pct', 'ft_pct', 'mp_per_g',
#        'pts_per_g', 'trb_per_g', 'ast_per_g', 'ws', 'ws_per_48', 'bpm', 'vorp']

Table found on page html with id = "stats"

In [31]:
def get_table(soup: "BeautifulSoup", year : int):
    """Extract the draft table (``id="stats"``) from a parsed page into a DataFrame.

    Every ``<td>`` cell that carries a ``data-stat`` attribute contributes one
    column: the column is named after the attribute value and holds the cell's
    stripped text as a string. Table rows that yield no such cells are skipped.
    A ``year`` column holding ``year`` is added to every kept row.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of one draft page. (The annotation is quoted so it is not
        evaluated when the function is defined.)
    year : int
        Draft year stored in the ``year`` column of every returned row.

    Returns
    -------
    pd.DataFrame
        One row per kept table row, indexed ``0..n-1``. The frame is empty when
        the page has no ``stats`` table or the table has no data rows, instead
        of failing on ``None.reset_index``.
    """
    table = soup.find('table', {'id': 'stats'})
    if table is None:
        return pd.DataFrame()

    records = []
    for row in table.find_all('tr'):
        # Map data-stat name -> stripped cell text for this row.
        record = {
            str(cell.get('data-stat')): str(cell.text.strip())
            for cell in row.find_all('td')
            if cell.get('data-stat') is not None
        }

        # Rows without any data-stat cells produce an empty dict and are dropped.
        if record:
            record["year"] = year
            records.append(record)

    # Build the frame once from all records rather than concatenating per row.
    return pd.DataFrame(records)

Go through every year from start_year to end_year (inclusive) and save the result to a CSV file so the scrape does not need to be run more than once.

In [32]:
def scrape_draft():
    """Scrape every draft year from ``start_year`` to ``end_year`` and save one CSV.

    For each year the page ``url_reference + "NBA_<year>.html"`` is requested,
    its table is parsed with ``get_table`` and the per-year frames are combined.
    The combined frame is stored in the module-level ``dfDraftInfo`` and written
    to ``path``.

    If any request returns a status other than 200 the scrape stops
    immediately: nothing is written and ``dfDraftInfo`` is left untouched.

    Notes
    -----
    * ``dfDraftInfo`` is declared ``global``; assigning it without that
      declaration makes it a local name and the first read raises
      ``UnboundLocalError``.
    * ``url_reference`` is read at call time. A later cell in this notebook
      rebinds that name to a sports-reference URL, so this function must be
      called while it still holds the draft base URL.
    """
    global dfDraftInfo

    yearly_frames = []
    for year in range(start_year, end_year + 1):
        url = url_reference + "NBA_" + str(year) + ".html"
        response = requests.get(url)
        print(year)

        if response.status_code != 200:
            print("Unsuccessful Get request")
            return

        soup = BeautifulSoup(response.text, "html.parser")
        yearly_frames.append(get_table(soup, year))

        # delay scraping so consecutive requests are spaced out
        time.sleep(2)

    # Nothing to combine when the year range is empty.
    if not yearly_frames:
        return

    # Combine once instead of re-concatenating the growing frame every year.
    dfDraftInfo = pd.concat(yearly_frames).reset_index(drop = True)

    # Make sure the target directory exists before writing the CSV.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    dfDraftInfo.to_csv(path, index = False)

### Filtering

Filter the data, get rid of the NA rows (forfeited)

In [33]:
# Load the saved draft table and keep only the rows that have a value in
# 'pick_overall' (rows where it is missing are dropped).
df_draft = pd.read_csv(path).dropna(subset=['pick_overall'])

Filter so that only the lottery picks are in the dataframe

In [34]:
# Keep only rows whose overall pick number is strictly below 15.
df_draft = df_draft.loc[df_draft['pick_overall'].lt(15)]

### Get pre-NBA stats for CBB players

In [35]:
# Example address of a single college player page on sports-reference.com.
# NOTE(review): this rebinds `url_reference`, which scrape_draft() reads as the
# base URL of the basketball-reference draft pages. Calling scrape_draft() after
# this cell has run would build wrong addresses; consider a separate name here.
url_reference = "https://www.sports-reference.com/cbb/players/anthony-davis-5.html"

Get the list of names of players in the dataframe that went to college with their corresponding colleges

In [36]:
def get_cbb_players():
    """Return the rows of ``df_draft`` whose ``college_name`` is not missing."""
    has_college = df_draft['college_name'].notna()
    return df_draft.loc[has_college]

df_cbb_players = get_cbb_players()

In [37]:
import unicodedata
import re

def split_name(name : str):
    """Split a full name into a cleaned first name and a list of cleaned remaining parts.

    The name is split on single spaces. Every part has all non-word characters
    and underscores removed and is lower-cased. The first part is returned as a
    string; all remaining parts are returned as a list (empty when the name
    has only one part).
    """
    junk = re.compile(r'[\W_]+')
    cleaned_parts = [junk.sub('', part).lower() for part in name.split(" ")]
    return cleaned_parts[0], cleaned_parts[1:]


Players whose last name has more than one word (first name, followed by the remaining name parts):
Otto ['Porter', 'Jr.']
Dennis ['Smith', 'Jr.']
Marvin ['Bagley', 'III']
Jaren ['Jackson', 'Jr.']
Wendell ['Carter', 'Jr.']
Michael ['Porter', 'Jr.']
Kira ['Lewis', 'Jr.']
Jabari ['Smith', 'Jr.']
Dereck ['Lively', 'II']

In [38]:
# Hand-picked player pages for players whose name splits into more than one
# last-name part. Entries are looked up by the player's lower-cased first name.
urls = dict(
    otto="https://www.sports-reference.com/cbb/players/otto-porter-1.html",
    dennis="https://www.sports-reference.com/cbb/players/dennis-smithjr-1.html",
    marvin="https://www.sports-reference.com/cbb/players/marvin-bagleyiii-1.html",
    jaren="https://www.sports-reference.com/cbb/players/jaren-jacksonjr-1.html",
    wendell="https://www.sports-reference.com/cbb/players/wendell-carterjr-1.html",
    michael="https://www.sports-reference.com/cbb/players/michael-porterjr-1.html",
    kira="https://www.sports-reference.com/cbb/players/kira-lewisjr-1.html",
    jabari="https://www.sports-reference.com/cbb/players/jabari-smith-2.html",
    dereck="https://www.sports-reference.com/cbb/players/dereck-lively-ii-1.html",
)

In [39]:
# Placeholder for the scraped college statistics; a later cell assigns it the
# result of scrape_cbb_stats().
df_cbb_stats : pd.DataFrame = None

In [40]:
def scrape_helper(url : str):
    """Fetch one player page and return the last row of its per-game table.

    The table with ``id="players_per_game"`` is walked row by row. For each
    row, every ``<td>`` with a ``data-stat`` attribute is stored as
    ``{data-stat: stripped text}``. Each row replaces the previous one, so only
    the values of the final ``<tr>`` survive.
    NOTE(review): presumably the final row is the career summary; confirm
    against the page layout.

    Parameters
    ----------
    url : str
        Address of the player page to request.

    Returns
    -------
    pd.DataFrame or None
        A one-row frame built from the final row's values. When the table is
        missing or has no rows, the frame has one row and no columns, rather
        than raising ``NameError`` on an unbound variable; this keeps one row
        per scraped player when the caller concatenates results.
        ``None`` when the response status is not 200, so the caller can retry.
    """
    response = requests.get(url)

    if response.status_code != 200:
        print("Bad Response Status Code", response.status_code)
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find('table', {'id': 'players_per_game'})

    # Defined up front so a missing table or an empty table cannot leave it unbound.
    player_data_dict = {}

    if table is not None:
        for row in table.find_all('tr'):
            # Rebuilt for every row: only the last row's values are kept.
            player_data_dict = {
                str(cell.get('data-stat')): str(cell.text.strip())
                for cell in row.find_all('td')
                if cell.get('data-stat') is not None
            }

    return pd.DataFrame([player_data_dict])

In [46]:
def _find_player_url(first_name, last_name, team, max_suffix = 9, max_attempts = 3):
    """Return the player-page URL whose school list mentions ``team``, or "" if none does.

    Candidate pages are ``{first_name}-{last_name}-{i}.html`` for ``i`` from 1
    to ``max_suffix`` (e.g. anthony-davis-5 rather than anthony-davis-4). A page
    matches when any element with ``data-stat="school_name"`` contains ``team``.
    """
    for i in range(1, max_suffix + 1):
        url = f"https://www.sports-reference.com/cbb/players/{first_name}-{last_name}-{i}.html"

        for attempt in range(max_attempts):
            print(url)
            response = requests.get(url)

            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")
                school_name_elements = soup.find_all(attrs={"data-stat": "school_name"})

                if any(team in element.get_text() for element in school_name_elements):
                    # Stop searching entirely; later suffixes must not overwrite the match.
                    return url

                # Page loaded but belongs to another player: re-requesting the
                # same URL cannot help, so move on to the next suffix.
                break

            print("Bad Response Status Code", response.status_code)
            if response.status_code == 404:
                # No page under this suffix; retrying will not change that.
                break

            # Other failures (e.g. rate limiting): wait a little longer each time.
            time.sleep(2 * (attempt + 1))

        # Pause before requesting the next candidate page.
        time.sleep(2)

    return ""


def scrape_cbb_stats():
    """Scrape one row of college statistics for every player in ``df_cbb_players``.

    For each player the name is split with ``split_name``. Players whose name
    has exactly one last-name part are located by trying numbered candidate
    pages and checking that the page lists the player's ``college_name``.
    All other players are looked up in the hand-maintained ``urls`` mapping,
    keyed by lower-cased first name. The located page is then parsed with
    ``scrape_helper``.

    Players whose page cannot be located, or whose page still fails after a
    bounded number of retries, are reported with a message and skipped.
    NOTE(review): returned rows carry no player identifier, so skipped players
    shift the alignment with ``df_cbb_players``.

    Returns
    -------
    pd.DataFrame or None
        All scraped rows combined and re-indexed ``0..n-1``, or ``None`` when
        nothing was scraped.
    """
    frames = []

    for _, row in df_cbb_players.iterrows():
        name = row['player']
        team = row['college_name']

        first_name, last_names = split_name(name)

        if len(last_names) != 1:
            # .get avoids a KeyError for players missing from the mapping.
            url = urls.get(first_name, "")
        else:
            url = _find_player_url(first_name, last_names[0], team)

        if len(url) != 0:
            df = scrape_helper(url)

            # Bounded retry: an unbounded loop never ends while the site keeps
            # answering with an error status.
            retries = 0
            while df is None and retries < 3:
                retries += 1
                time.sleep(2 * retries)
                df = scrape_helper(url)

            if df is None:
                print("Could not scrape", url)
            else:
                frames.append(df)
        else:
            print("URL not found")
        time.sleep(5)

    if not frames:
        return None

    # Combine once instead of re-concatenating the growing frame per player.
    return pd.concat(frames).reset_index(drop = True)

In [45]:
# Run the college-stats scrape and persist the result.
df_cbb_stats = scrape_cbb_stats()
path2 = './data/cbbdata.csv'

if df_cbb_stats is None:
    # scrape_cbb_stats() returns None when nothing was scraped; calling
    # .to_csv on None would raise AttributeError.
    print("No college stats were scraped; nothing written to", path2)
else:
    # Make sure the target directory exists before writing the CSV.
    os.makedirs(os.path.dirname(path2), exist_ok=True)
    df_cbb_stats.to_csv(path2, index=False)


https://www.sports-reference.com/cbb/players/derrick-rose-1.html
Bad Response Status Code 429
Bad Response Status Code 429
Bad Response Status Code 429


KeyboardInterrupt: 

AttributeError: 'NoneType' object has no attribute 'head'