# BSA Data Journalism Spring 2024

## Data scraping

In [181]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import time
import os

Defining start/end year and finding viable urls

In [187]:
# Draft classes to scrape; scrape_draft() treats both ends as inclusive.
start_year, end_year = 2008, 2023

# Base URL; scrape_draft() appends "NBA_<year>.html" to it.
url_reference = "https://www.basketball-reference.com/draft/"

# Where the combined scrape result is written and later read back from.
path = "./data/draftdata.csv"

# Accumulated draft table; stays None until a scrape has filled it.
dfDraftInfo : pd.DataFrame = None

# Column names kept for reference only (not used by the code in this section):
# cols = ['pick_overall', 'team_id', 'player', 'college_name', 'seasons', 'g', 'mp', 'pts', 'trb', 'ast', 'fg_pct', 'fg3_pct', 'ft_pct', 'mp_per_g',
#        'pts_per_g', 'trb_per_g', 'ast_per_g', 'ws', 'ws_per_48', 'bpm', 'vorp']

Table found on page html with id = "stats"

In [183]:
def get_table(soup: BeautifulSoup, year : int):
    """Extract the draft table (HTML id ``stats``) from a parsed page.

    Every ``<tr>`` that contains at least one ``<td>`` with a ``data-stat``
    attribute becomes one row. Column names are the ``data-stat`` values and
    cell values are the stripped cell text, kept as strings. A ``year``
    column holding *year* is added to each row.

    Args:
        soup: Parsed HTML of one draft page.
        year: Draft year to stamp on every extracted row.

    Returns:
        A DataFrame with a fresh 0..n-1 index. It is empty when the page has
        no ``stats`` table or the table has no data rows (previously this
        case raised ``AttributeError`` on ``None``).
    """
    records = []

    table = soup.find('table', {'id': 'stats'})
    if table is not None:
        for row in table.find_all('tr'):
            record = {}
            for cell in row.find_all('td'):
                stat_name = cell.get('data-stat')
                if stat_name is not None:
                    record[str(stat_name)] = str(cell.text.strip())

            # Rows without any <td data-stat=...> cell (e.g. header rows)
            # produce an empty dict and are skipped.
            if record:
                record["year"] = year
                records.append(record)

    # Build the frame once instead of concatenating one row at a time.
    return pd.DataFrame(records)

Go through every year between the start_year and the end_year (inclusive) and save the result in a csv file so we don't need to run this more than once

In [185]:
def scrape_draft(): 
    """Scrape every draft page from start_year to end_year and save a CSV.

    For each year (both ends inclusive) the page
    ``url_reference + "NBA_<year>.html"`` is fetched and parsed with
    ``get_table``. The per-year tables are combined into the module-level
    ``dfDraftInfo``, which is then written to ``path``.

    If any request returns a non-200 status, the scrape stops, nothing is
    written, and ``dfDraftInfo`` is left unchanged. Each call rebuilds the
    table from scratch rather than appending to a previous run.

    Returns:
        None.
    """
    # Without this declaration the assignment below makes the name local and
    # any read of it raises UnboundLocalError.
    global dfDraftInfo

    yearly_frames = []

    for year in range(start_year, end_year+1):
        url = url_reference + "NBA_" + str(year) + ".html"
        # A timeout keeps a stalled connection from hanging the scrape.
        response = requests.get(url, timeout=30)
        print(year)

        if response.status_code != 200:
            print("Unsuccessful Get request")
            return

        soup = BeautifulSoup(response.text, "html.parser")
        df = get_table(soup, year)
        # Skip years for which no table rows could be extracted.
        if df is not None and not df.empty:
            yearly_frames.append(df)

        # delay scraping
        time.sleep(2)

    if not yearly_frames:
        print("No draft data scraped")
        return

    dfDraftInfo = pd.concat(yearly_frames).reset_index(drop = True)

    # save data from scrape; make sure the target directory exists first
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    dfDraftInfo.to_csv(path, index = False)

### Filtering

Filter the data, get rid of the NA rows (forfeited)

In [198]:
# Load the saved scrape and keep only rows that have an overall pick number
# (per the note above, the NA rows are presumably forfeited picks).
df_draft = pd.read_csv(path).dropna(subset=['pick_overall'])

Filter so that only the top 14 overall picks of each draft are in the dataframe (note: `pick_overall < 15` is narrower than the full first round)

In [202]:
# Keep overall picks 1 through 14 only.
df_draft = df_draft.loc[df_draft['pick_overall'].lt(15)]

### Get pre-NBA stats for CBB players

In [203]:
# Example college-basketball player page on sports-reference.
# NOTE(review): this rebinds the same `url_reference` name that scrape_draft()
# reads when it is called, so running scrape_draft() after this cell would
# build its draft URLs from this player page instead of the draft base URL.
url_reference = "https://www.sports-reference.com/cbb/players/anthony-davis-5.html"

Get the list of names of players in the dataframe that went to college with their corresponding colleges

In [206]:
def get_cbb_players():
    """Return the rows of ``df_draft`` whose ``college_name`` is not missing."""
    has_college = df_draft['college_name'].notna()
    return df_draft[has_college]

df_cbb_players = get_cbb_players()

In [209]:
def scrape_cbb_stats():
    """Print the college team and name of every player in ``df_cbb_players``.

    Currently this only lists the players; no pages are fetched.
    """
    players = df_cbb_players['player']
    colleges = df_cbb_players['college_name']
    for name, team in zip(players, colleges):
        print(f"Team: {team}, Name: {name}")

scrape_cbb_stats()

Team: Memphis, Name: Derrick Rose
Team: Kansas State, Name: Michael Beasley
Team: USC, Name: O.J. Mayo
Team: UCLA, Name: Russell Westbrook
Team: UCLA, Name: Kevin Love
Team: Indiana, Name: Eric Gordon
Team: West Virginia, Name: Joe Alexander
Team: Texas, Name: D.J. Augustin
Team: Stanford, Name: Brook Lopez
Team: Arizona, Name: Jerryd Bayless
Team: Rider University, Name: Jason Thompson
Team: Kansas, Name: Brandon Rush
Team: LSU, Name: Anthony Randolph
Team: Oklahoma, Name: Blake Griffin
Team: UConn, Name: Hasheem Thabeet
Team: Arizona State, Name: James Harden
Team: Memphis, Name: Tyreke Evans
Team: Syracuse, Name: Jonny Flynn
Team: Davidson, Name: Stephen Curry
Team: Arizona, Name: Jordan Hill
Team: USC, Name: DeMar DeRozan
Team: Louisville, Name: Terrence Williams
Team: Duke, Name: Gerald Henderson
Team: UNC, Name: Tyler Hansbrough
Team: Louisville, Name: Earl Clark
Team: Kentucky, Name: John Wall
Team: Ohio State, Name: Evan Turner
Team: Georgia Tech, Name: Derrick Favors
Team: Syr