# BSA Data Journalism Spring 2024

## Data scraping

In [216]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import time
import os

Defining start/end year and finding viable urls

In [217]:
start_year = 2008
end_year = 2023
url_reference = "https://www.basketball-reference.com/draft/"
# cols = ['pick_overall', 'team_id', 'player', 'college_name', 'seasons', 'g', 'mp', 'pts', 'trb', 'ast', 'fg_pct', 'fg3_pct', 'ft_pct', 'mp_per_g',
#        'pts_per_g', 'trb_per_g', 'ast_per_g', 'ws', 'ws_per_48', 'bpm', 'vorp']
dfDraftInfo : pd.DataFrame = None
path = "./data/draftdata.csv"

Table found on page html with id = "stats"

In [218]:
def get_table(soup: BeautifulSoup, year : int):
    df_cur_yr = None
    
    table = soup.find('table', {'id': 'stats'})
    
    if table is not None:
        rows = table.find_all('tr')

        for row in rows:
            data = row.find_all('td')

            player_data_dict = {}
            for d in data:
                datatype = d.get('data-stat')

                if datatype is not None:
                    # get contents
                    content = d.text.strip()
                    player_data_dict[str(datatype)] = str(content)

            # check for non empty row
            if len(player_data_dict.keys()) != 0:
                player_data_dict["year"] = year
                if df_cur_yr is not None:    
                    # keep adding
                    temp_df = pd.DataFrame([player_data_dict])
                    df_cur_yr = pd.concat([df_cur_yr, temp_df], axis = 0)
                    pass
                else:
                    # not empty
                    df_cur_yr = pd.DataFrame([player_data_dict])
                    pass

    return df_cur_yr.reset_index(drop=True)

Go through every year between the start_year and the end_year and save in a csv file so we don't need to run this more once

In [219]:
def scrape_draft(): 
    success = True

    for year in range(start_year, end_year+1):
        url = url_reference + "NBA_" + str(year) + ".html"
        response = requests.get(url)
        print(year)

        # successful get
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")

            df = get_table(soup, year)
            if dfDraftInfo is None:
                dfDraftInfo = df
            else:
                dfDraftInfo = pd.concat([dfDraftInfo, df]).reset_index(drop = True)
        else: 
            print("Unsuccessful Get request")
            success = False
            break

        # delay scraping
        time.sleep(2)

    # save data from scrape
    if success:
        dfDraftInfo.to_csv(path, index = False)

### Filtering

Filter the data, get rid of the NA rows (forfeited)

In [220]:
df_draft = pd.read_csv(path)
df_draft = df_draft.dropna(subset=['pick_overall'])

Filter so that only the lottery picks are in the dataframe

In [221]:
df_draft = df_draft[df_draft['pick_overall'] < 15]

### Get pre-NBA stats for CBB players

In [222]:
url_reference = "https://www.sports-reference.com/cbb/players/anthony-davis-5.html"

Get the list of names of players in the dataframe that went to college with their corresponding colleges

In [223]:
def get_cbb_players():
    df_cbb_players = df_draft.dropna(subset=['college_name'])
    return df_cbb_players

df_cbb_players = get_cbb_players()

In [224]:
import unicodedata
import re

def split_name(name : str):
    def strip_non_alphanumeric(s):
        pattern = re.compile(r'[\W_]+')
        return pattern.sub('', s)
    
    names = name.split(" ")
    first_name = names[0]
    last_names = names[1:]

    first_name = strip_non_alphanumeric(first_name)
    for l in last_names:
        strip_non_alphanumeric(l)

    return first_name, last_names


In [225]:
def scrape_cbb_stats():
    for index, row in df_cbb_players.iterrows():
        name = row['player']
        team = row['college_name']

        first_name, last_name = split_name(name)
        
        if len(last_name) != 1:
            print(first_name, last_name)

scrape_cbb_stats()

Otto ['Porter', 'Jr.']
Dennis ['Smith', 'Jr.']
Marvin ['Bagley', 'III']
Jaren ['Jackson', 'Jr.']
Wendell ['Carter', 'Jr.']
Michael ['Porter', 'Jr.']
Kira ['Lewis', 'Jr.']
Jabari ['Smith', 'Jr.']
Dereck ['Lively', 'II']
