# BSA Data Journalism Spring 2024

## Data scraping

In [984]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import time
import os

### Finding draft picks with basketball-reference
Defining start/end year and finding viable urls

In [985]:
# Inclusive range of draft years requested by scrape_draft().
start_year = 2008
end_year = 2023
# Prefix of every draft page URL; scrape_draft() appends "NBA_<year>.html".
# NOTE: url_reference is rebound to other URLs in later cells.
url_reference = "https://www.basketball-reference.com/draft/"
# cols = ['pick_overall', 'team_id', 'player', 'college_name', 'seasons', 'g', 'mp', 'pts', 'trb', 'ast', 'fg_pct', 'fg3_pct', 'ft_pct', 'mp_per_g',
#        'pts_per_g', 'trb_per_g', 'ast_per_g', 'ws', 'ws_per_48', 'bpm', 'vorp']
# Accumulator for the scraped draft tables; stays None until a scrape runs.
dfDraftInfo : pd.DataFrame = None
# CSV cache for the draft scrape.
# NOTE: `path` is rebound to "./data/shotdata.csv" in a later cell.
path = "./data/draftdata.csv"

Table found on page html with id = "stats"

In [986]:
def get_table(soup: BeautifulSoup, year : int):
    """Extract the draft table (``id="stats"``) from a parsed draft page.

    Every ``<tr>`` holding at least one ``<td>`` with a ``data-stat``
    attribute becomes one row. Column names are the ``data-stat`` values and
    cell values are the stripped cell text, kept as strings. A ``year``
    column holding ``year`` is added to every row.

    Args:
        soup: Parsed HTML of one draft page.
        year: Draft year written to the ``year`` column.

    Returns:
        A DataFrame with a fresh 0..n-1 index. It is empty when the page has
        no ``stats`` table or the table has no data rows (previously this
        case raised ``AttributeError`` on ``None.reset_index``).
    """
    table = soup.find('table', {'id': 'stats'})
    if table is None:
        return pd.DataFrame()

    # Collect plain dicts and build the frame once; concatenating one
    # single-row frame per player is quadratic in the number of rows.
    records = []
    for row in table.find_all('tr'):
        record = {}
        for cell in row.find_all('td'):
            stat_name = cell.get('data-stat')
            if stat_name is not None:
                record[str(stat_name)] = str(cell.text.strip())

        # Rows without any data-stat <td> cells produce no record and are skipped.
        if record:
            record["year"] = year
            records.append(record)

    return pd.DataFrame(records)

Go through every year from start_year to end_year and save the result in a CSV file so we don't need to run the scrape more than once

In [987]:
def scrape_draft():
    """Scrape the draft pages for ``start_year``..``end_year`` and cache them as CSV.

    For each year the page ``url_reference + "NBA_<year>.html"`` is fetched and
    parsed with ``get_table``. If every request succeeds, the combined table is
    stored in the module-level ``dfDraftInfo`` and written to ``path``. On the
    first non-200 response the scrape stops and nothing is stored or written.

    Note:
        ``url_reference`` and ``path`` are module-level names that later cells
        rebind to other values, so run this before those cells.

    Returns:
        None. Results are delivered through ``dfDraftInfo`` and the CSV file.
    """
    # Without this declaration the assignment below makes dfDraftInfo local,
    # and reading it first raised UnboundLocalError.
    global dfDraftInfo

    yearly_tables = []
    for year in range(start_year, end_year + 1):
        url = url_reference + "NBA_" + str(year) + ".html"
        response = requests.get(url)
        print(year)

        if response.status_code != 200:
            print("Unsuccessful Get request")
            return

        soup = BeautifulSoup(response.text, "html.parser")
        yearly_tables.append(get_table(soup, year))

        # delay scraping
        time.sleep(2)

    # An empty year range leaves nothing to save.
    if not yearly_tables:
        return

    # Replace (rather than extend) the global so re-running does not duplicate rows.
    dfDraftInfo = pd.concat(yearly_tables).reset_index(drop = True)
    dfDraftInfo.to_csv(path, index = False)

### Filtering

Filter the data, get rid of the NA rows (forfeited)

In [988]:
# Load the cached draft scrape from `path` instead of scraping again.
df_draft = pd.read_csv(path)
# Drop rows that have no overall pick number.
df_draft = df_draft.dropna(subset=['pick_overall'])

Filter so that only the lottery picks are in the dataframe

In [989]:
# Keep only overall picks below 15, i.e. the lottery picks this analysis targets.
df_draft = df_draft[df_draft['pick_overall'] < 15]

### Get pre-NBA stats for CBB players
Using sports-reference.com

In [990]:
# Example of a sports-reference player-page URL.
# NOTE: this rebinds url_reference, which scrape_draft() reads to build the
# basketball-reference draft URLs; calling scrape_draft() after this cell
# would request the wrong pages.
url_reference = "https://www.sports-reference.com/cbb/players/anthony-davis-5.html"

Get the list of names of players in the dataframe that went to college with their corresponding colleges

In [991]:
def get_cbb_players():
    """Return the rows of ``df_draft`` that have a non-missing ``college_name``."""
    return df_draft.dropna(subset=['college_name'])

df_cbb_players = get_cbb_players()

In [992]:
import unicodedata
import re

# Matches every run of characters that is neither a word character nor a hyphen.
_NAME_JUNK = re.compile(r'[^\w-]+')


def split_name(name : str):
    """Split a player name into a lowercased first name and last-name tokens.

    The name is split on whitespace. Every token has all characters other
    than word characters and hyphens removed (so "Jr." becomes "jr" and
    "Ja'Kobe" becomes "jakobe") and is lowercased.

    Splitting on any run of whitespace (instead of a single space) means
    doubled, leading or trailing spaces no longer create empty tokens that
    made a two-word name look like it had a multi-word last name.

    Args:
        name: Full player name, e.g. "Otto Porter Jr.".

    Returns:
        ``(first_name, last_names)`` where ``last_names`` is a list of the
        remaining tokens. An empty or blank name yields ``("", [])``.
    """
    tokens = [_NAME_JUNK.sub('', token).lower() for token in name.split()]
    if not tokens:
        return "", []

    return tokens[0], tokens[1:]


Players whose last names have more than one word:
Otto ['Porter', 'Jr.']
Dennis ['Smith', 'Jr.']
Marvin ['Bagley', 'III']
Jaren ['Jackson', 'Jr.']
Wendell ['Carter', 'Jr.']
Michael ['Porter', 'Jr.']
Kira ['Lewis', 'Jr.']
Jabari ['Smith', 'Jr.']
Dereck ['Lively', 'II']

In [993]:
# links of players with last names that don't follow the pattern
# Keys are the lowercased first name returned by split_name(); scrape_cbb_stats()
# looks a player up here whenever the name does not have exactly one last-name
# token. Because only the first name is used as the key, two such players
# sharing a first name cannot both be listed.
urls = {
    "otto" : "https://www.sports-reference.com/cbb/players/otto-porter-1.html",
    "dennis" : "https://www.sports-reference.com/cbb/players/dennis-smithjr-1.html",
    "marvin" : "https://www.sports-reference.com/cbb/players/marvin-bagleyiii-1.html",
    "jaren" : "https://www.sports-reference.com/cbb/players/jaren-jacksonjr-1.html",
    "wendell" : "https://www.sports-reference.com/cbb/players/wendell-carterjr-1.html",
    "michael" : "https://www.sports-reference.com/cbb/players/michael-porterjr-1.html",
    "kira" : "https://www.sports-reference.com/cbb/players/kira-lewisjr-1.html",
    "jabari" : "https://www.sports-reference.com/cbb/players/jabari-smith-2.html",
    "dereck" :  "https://www.sports-reference.com/cbb/players/dereck-lively-ii-1.html",

    # prospects
    "daron": "https://www.sports-reference.com/cbb/players/daron-holmesii-1.html",
    "tristan": "https://www.sports-reference.com/cbb/players/tristan-dasilva-1.html",
    "kevin": "https://www.sports-reference.com/cbb/players/kevin-mccullar-1.html",
    "terrence": "https://www.sports-reference.com/cbb/players/terrence-shannonjr-1.html"
}

In [994]:
df_cbb_stats : pd.DataFrame = None

In [995]:
def scrape_helper(url : str, name : str):
    """Download one player page and return its per-game stats as a one-row frame.

    Only the cells of the LAST ``<tr>`` of the ``players_per_game`` table are
    kept (presumably the career summary row - verify against the page
    layout). Column names are the cells' ``data-stat`` attributes and values
    are the stripped cell text as strings. A ``player`` column holding
    ``name`` is always added.

    Args:
        url: Address of the player page.
        name: Player name stored in the ``player`` column.

    Returns:
        A one-row DataFrame, or ``None`` when the response status is not 200.
        If the page has no ``players_per_game`` table (or the table has no
        rows) the frame contains only the ``player`` column; previously this
        case raised ``UnboundLocalError``.
    """
    response = requests.get(url)

    if response.status_code != 200:
        print("Bad Response Status Code", response.status_code)
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find('table', {'id': 'players_per_game'})

    player_data_dict = {}
    if table is None:
        print("No players_per_game table found", url)
    else:
        rows = table.find_all('tr')
        if rows:
            # Same result as rebuilding the dict for every row and keeping
            # whatever the final row produced.
            for d in rows[-1].find_all('td'):
                datatype = d.get('data-stat')
                if datatype is not None:
                    player_data_dict[str(datatype)] = str(d.text.strip())

    player_data_dict['player'] = name
    return pd.DataFrame([player_data_dict])

In [996]:
def _find_cbb_player_url(name, first_name, last_name, team):
    """Probe ``<first>-<last>-<i>.html`` pages (i = 1..14) for one that lists ``team``.

    Returns the first URL whose page has a ``data-stat="school_name"`` element
    containing ``team``, or ``""`` when none of the probed pages matches.
    """
    for i in range(1, 15):
        # The numeric suffix tells apart players who share a name, for example
        # anthony-davis-5, not anthony-davis-4.
        url = f"https://www.sports-reference.com/cbb/players/{first_name}-{last_name}-{i}.html"

        for _attempt in range(2):
            response = requests.get(url)

            if response.status_code == 200:
                print(name)
                print(url)
                soup = BeautifulSoup(response.text, "html.parser")
                school_name_elements = soup.find_all(attrs={"data-stat": "school_name"})

                if any(team in element.get_text() for element in school_name_elements):
                    return url

                # The page loaded but belongs to another player; requesting
                # the same URL again cannot change that, so move on.
                time.sleep(2)
                break

            print("Bad Response Status Code", response.status_code)
            print(url)
            time.sleep(4)

    return ""


def scrape_cbb_stats(df_input):
    """Scrape college per-game stats for every player in ``df_input``.

    Args:
        df_input: DataFrame with a ``player`` column (full name) and a
            ``college_name`` column. Only the first word of the college name
            is used to recognise the right player page.

    Returns:
        A DataFrame with one row per successfully scraped player (fresh
        0..n-1 index), or ``None`` when no player was scraped.

    Players whose name does not have exactly one last-name token are looked
    up in the module-level ``urls`` dict. Everyone else is located by probing
    numbered player pages. A player whose page cannot be located, or whose
    page keeps failing to download, is reported and skipped.
    """
    # First names that sports-reference lists under a different given name.
    given_names = {
        'bam': 'edrice',
        'mo': 'mohamed',
        'ja': 'temetrius',
        'obi': 'obadiah',
    }
    max_download_attempts = 5

    frames = []
    for _, row in df_input.iterrows():
        name = row['player']
        first_name, last_names = split_name(name)
        team = row['college_name'].split(" ")[0]

        if len(last_names) != 1:
            # .get keeps one unlisted player from aborting the whole scrape.
            url = urls.get(first_name, "")
        else:
            last_name = last_names[0]

            # shaedon sharpe doesn't qualify
            if first_name == "shaedon":
                continue

            # johnny davis is listed as jonathan. The original compared the
            # list of last names with the string "davis", which is never
            # equal, so this mapping never applied.
            if first_name == "johnny" and last_name == "davis":
                first_name = "jonathan"
            else:
                first_name = given_names.get(first_name, first_name)

            # Empty when no probed page matched. Previously the last probed
            # URL leaked through and was scraped as if it were correct.
            url = _find_cbb_player_url(name, first_name, last_name, team)

        if url:
            df = None
            # Bounded retries: a permanently failing URL used to loop forever.
            for _attempt in range(max_download_attempts):
                df = scrape_helper(url, name)
                if df is not None:
                    break
                time.sleep(2)

            if df is None:
                print("Giving up on", url)
            else:
                frames.append(df)
        else:
            print("URL not found")

        time.sleep(5)

    if not frames:
        return None

    return pd.concat(frames).reset_index(drop = True)

In [997]:
# NOTE: scrape_cbb_stats requires a df_input argument, so the call below fails
# as written if it is uncommented (presumably df_cbb_players was intended - confirm).
# df_cbb_stats = scrape_cbb_stats()
path2 = './data/cbbdata.csv'
# df_cbb_stats.to_csv(path2, index=False)


Delete unnecessary columns

In [998]:
df_cbb_stats = pd.read_csv(path2)

drop_cols = [
    'conf_abbr', 
    'class', 
    'sos-dum',
    'sos'
]

df_cbb_stats = df_cbb_stats.drop(drop_cols, axis = 1)

### Scraping shot selection from barttorvik.com

Shot selection data only exists from 2010 onwards, must filter out 2008-2009

There is also no data for players with fewer than 10 games:
- Darius Garland
- Michael Porter Jr
- James Wiseman

In [999]:
url_reference = "https://barttorvik.com/playerstat.php?year=2016&p=Jaylen%20Brown"

In [1000]:
df_cbb_players = df_cbb_players.reset_index(drop = True)
# Only draft years from 2010 onwards are kept.
df_cbb_players = df_cbb_players[df_cbb_players['year'] >= 2010]

# Players excluded by exact name match before scraping shot selection.
not_qualified = ["Michael Porter Jr.", "Darius Garland", "James Wiseman"]
df_cbb_players = df_cbb_players[~df_cbb_players['player'].isin(not_qualified)]

# Placeholder for the scraped shot-selection table; stays None until a scrape runs.
df_shot_selection : pd.DataFrame = None

Correct table has headers "DUNKS", "At the rim", "Other 2-PT", "3-Pt Jumpers"

In [1001]:
def has_desired_headers(headers):
    """Tell whether ``headers`` mentions all four shot-selection column titles.

    The comparison is case-insensitive substring matching against the given
    header text.
    """
    desired_headers = ["DUNKS", "AT THE RIM", "OTHER 2-PT", "3-PT JUMPERS"]
    header_text = headers.lower()
    return all(title.lower() in header_text for title in desired_headers)

In [1002]:
# Column names given, in order, to the <td> cells of the shot-selection table
# by scrape_shot_selection().
shot_stat_headers = ['dunk_tot', 'dunk_pct', 
                     'rim_tot', 'rim_pct', 'rim_asted', 
                     'other2pt_tot', 'other2pt_pct', 'other2pt_asted', 
                     '3pt_tot', '3pt_pct', '3pt_asted']

# Hand-written barttorvik URLs, keyed by the lowercased first name returned by
# split_name2(); scrape_shot_selection() uses them for players whose name does
# not have exactly one last-name token.
shot_urls = {
    "otto" : "https://barttorvik.com/playerstat.php?year=2013&p=Otto%20Porter&t=Georgetown",
    "dennis" : "https://barttorvik.com/playerstat.php?year=2017&p=Dennis%20Smith%2C%20Jr.&t=North%20Carolina%20St.",
    "marvin" : "https://barttorvik.com/playerstat.php?year=2018&p=Marvin%20Bagley%20III&t=Duke",
    "jaren" : "https://barttorvik.com/playerstat.php?year=2018&p=Jaren%20Jackson%20Jr.&t=Michigan%20St.",
    "wendell" : "https://barttorvik.com/playerstat.php?year=2018&p=Wendell%20Carter%20Jr.&t=Duke",
    "kira" : "https://barttorvik.com/playerstat.php?year=2020&p=Kira%20Lewis%20Jr.&t=Alabama",
    "jabari" : "https://barttorvik.com/playerstat.php?year=2022&p=Jabari%20Smith&t=Auburn",
    "dereck" :  "https://barttorvik.com/playerstat.php?year=2023&p=Dereck%20Lively%20II&t=Duke",
    
    # prospects
    # NOTE(review): "daron" (year=2023) and "terrence" (year=2021, Texas Tech)
    # point at different seasons than the year=2024 used for the other
    # prospects - confirm these are the intended seasons.
    "daron": "https://barttorvik.com/playerstat.php?year=2023&p=DaRon%20Holmes%20II&t=Dayton",
    "tristan": "https://barttorvik.com/playerstat.php?year=2024&p=Tristan%20da%20Silva&t=Colorado",
    "kevin": "https://barttorvik.com/playerstat.php?year=2024&p=Kevin%20McCullar%20Jr.&t=Kansas",
    "terrence": "https://barttorvik.com/playerstat.php?year=2021&p=Terrence%20Shannon%20Jr.&t=Texas%20Tech"
}

In [1003]:
# Matches every run of characters that is not a period, word character or hyphen.
_NAME_JUNK_KEEP_PERIODS = re.compile(r'[^.\w-]+')


# doesn't remove the period at the end of the names
def split_name2(name : str):
    """Split a player name into a lowercased first name and last-name tokens, keeping periods.

    The name is split on whitespace. Every token has all characters other
    than periods, word characters and hyphens removed (so "Jr." stays "jr."
    and "Ja'Kobe" becomes "jakobe") and is lowercased.

    Splitting on any run of whitespace (instead of a single space) means
    doubled, leading or trailing spaces no longer create empty tokens that
    made a two-word name look like it had a multi-word last name.

    Args:
        name: Full player name, e.g. "Kira Lewis Jr.".

    Returns:
        ``(first_name, last_names)`` where ``last_names`` is a list of the
        remaining tokens. An empty or blank name yields ``("", [])``.
    """
    tokens = [_NAME_JUNK_KEEP_PERIODS.sub('', token).lower() for token in name.split()]
    if not tokens:
        return "", []

    return tokens[0], tokens[1:]

In [1004]:
def scrape_shot_selection(df_input):
    """Scrape barttorvik shot-selection numbers for every player in ``df_input``.

    Args:
        df_input: DataFrame with a ``player`` column (full name) and a
            ``year`` column (season used in the request URL).

    Returns:
        A DataFrame with one row per scraped player (fresh 0..n-1 index),
        holding ``player`` plus the ``shot_stat_headers`` columns, or ``None``
        when nothing was scraped. The scrape stops at the first non-200
        response and returns what was collected up to that point.

    A player with no hand-written URL or no recognisable shot-selection table
    is reported and skipped. Previously those cases raised ``KeyError``,
    ``IndexError`` or ``AttributeError`` and discarded everything scraped so far.
    """
    # Spellings the site expects in the "p=" query parameter, keyed by the
    # first name as returned by split_name2().
    first_name_overrides = {
        'cj': "c.j.",
        'dangelo': "D%27Angelo",
        'deaaron': 'De%27Aaron',
        'bam': 'edrice',
        'mo': "mohamed",
        'rj': "r.j.",
        'jakobe': "Ja%27Kobe",
        'kelel': "kel%27el",
    }

    frames = []
    for _, row in df_input.iterrows():
        name = row['player']
        year = row['year']
        first_name, last_names = split_name2(name)

        if first_name == "shaedon":
            continue

        first_last = last_names[0] if last_names else ""
        if first_name == 'deandre' and first_last != "ayton":
            first_name = "De%27andre"
        elif first_name == "p.j." and first_last == "washington":
            first_name = "pj"
        else:
            first_name = first_name_overrides.get(first_name, first_name)

        if len(last_names) == 1:
            url = f"https://barttorvik.com/playerstat.php?year={year}&p={first_name}%20{last_names[0]}"
        else:
            url = shot_urls.get(first_name, "")
            if not url:
                print("URL not found", name)
                continue

        print(url)
        response = requests.get(url)

        if response.status_code != 200:
            print("Bad Request: ", response.status_code)
            break

        soup = BeautifulSoup(response.text, "html.parser")

        # First table whose <thead> carries all four shot-type headers.
        shot_table = None
        for table in soup.find_all('table'):
            thead = table.find('thead')
            if thead is not None and has_desired_headers(thead.text):
                shot_table = table
                break

        if shot_table is None:
            print("Shot selection table not found", name)
        else:
            shot_stat_dict = {'player': name}
            # zip stops at the shorter sequence, so extra cells can no longer
            # run past the end of shot_stat_headers.
            for header, td in zip(shot_stat_headers, shot_table.find_all('td')):
                shot_stat_dict[header] = td.text
            frames.append(pd.DataFrame([shot_stat_dict]))

        time.sleep(5)

    if not frames:
        return None

    return pd.concat(frames).reset_index(drop = True)


In [1005]:
# df_shot_selection = scrape_shot_selection(df_cbb_players)

In [1006]:
# NOTE: this rebinds `path`, which earlier pointed at "./data/draftdata.csv".
# Any cell or function that reads `path` after this point (pd.read_csv(path),
# scrape_draft()) will use the shot-data file instead.
path = "./data/shotdata.csv"
# df_shot_selection.to_csv(path, index = False)

## Defining Success through several metrics
### Comparing players to players of the same draft position across draft years

In [1007]:
df_career_stats = pd.read_csv("./data/draftdata.csv")

df_career_stats.columns

Index(['pick_overall', 'team_id', 'player', 'college_name', 'seasons', 'g',
       'mp', 'pts', 'trb', 'ast', 'fg_pct', 'fg3_pct', 'ft_pct', 'mp_per_g',
       'pts_per_g', 'trb_per_g', 'ast_per_g', 'ws', 'ws_per_48', 'bpm', 'vorp',
       'year', 'skip'],
      dtype='object')

### Filtering and Cleaning data
Dropping the column skip, using only the per game stats

In [1008]:
drop_cols = [
    'team_id', 'skip', 'mp', 'pts', 'trb', 'ast'
]

In [1009]:
df_career_stats = df_career_stats.drop(columns=drop_cols)

In [1010]:
df_career_stats2 = df_career_stats.copy()
df_notcbb_players = df_career_stats2[df_career_stats2['college_name'].isna()]
df_notcbb_players = df_notcbb_players.reset_index(drop = True)
df_other_lot = df_notcbb_players[df_notcbb_players['pick_overall'] < 15]
df_other_lot = df_other_lot.reset_index(drop=True)
df_other_lot

Unnamed: 0,pick_overall,player,college_name,seasons,g,fg_pct,fg3_pct,ft_pct,mp_per_g,pts_per_g,trb_per_g,ast_per_g,ws,ws_per_48,bpm,vorp,year
0,6.0,Danilo Gallinari,,14.0,777.0,0.428,0.381,0.876,28.8,14.9,4.7,1.9,65.4,0.14,1.7,20.8,2008
1,5.0,Ricky Rubio,,12.0,698.0,0.388,0.324,0.843,29.6,10.8,4.1,7.4,42.3,0.098,0.5,13.3,2009
2,10.0,Brandon Jennings,,9.0,555.0,0.387,0.345,0.796,30.3,14.1,3.0,5.7,29.9,0.085,0.8,11.8,2009
3,3.0,Enes Freedom,,11.0,748.0,0.548,0.289,0.777,21.5,11.2,7.8,0.9,52.8,0.157,-0.1,7.6,2011
4,5.0,Jonas Valančiūnas,,12.0,856.0,0.561,0.348,0.789,25.7,13.4,9.5,1.3,80.3,0.175,1.0,16.7,2011
5,6.0,Jan Veselý,,3.0,162.0,0.521,0.0,0.408,15.2,3.6,3.5,0.6,4.0,0.078,-2.6,-0.4,2011
6,7.0,Bismack Biyombo,,13.0,839.0,0.535,0.0,0.553,19.5,5.1,5.9,0.7,32.4,0.095,-2.5,-2.2,2011
7,5.0,Dante Exum,,7.0,300.0,0.431,0.336,0.767,18.8,6.1,2.0,2.2,5.9,0.051,-2.8,-1.2,2014
8,12.0,Dario Šarić,,7.0,477.0,0.445,0.362,0.839,22.8,10.6,5.4,1.9,20.9,0.092,-0.7,3.6,2014
9,4.0,Kristaps Porziņģis,,8.0,459.0,0.459,0.361,0.831,30.9,19.7,7.9,1.8,42.7,0.144,2.5,16.1,2015


In [1011]:
# Drops every row that has a missing value in ANY column, which removes the
# players without a college_name.
# NOTE(review): it also removes college players missing any single stat -
# confirm that is intended.
df_career_stats = df_career_stats.dropna(axis = 0)

In [1012]:
df_career_stats = df_career_stats.reset_index(drop = True)

Filtering only the lottery picks, starting from 2010

In [1013]:
# Names of the lottery picks (overall pick below 15) drafted before 2010.
# The list is used further down to exclude these players from the college stats.
pre2010list = df_career_stats[df_career_stats['year'] < 2010]
pre2010list = pre2010list[pre2010list['pick_overall'] < 15]['player'].tolist()
pre2010list

['Derrick Rose',
 'Michael Beasley',
 'O.J. Mayo',
 'Russell Westbrook',
 'Kevin Love',
 'Eric Gordon',
 'Joe Alexander',
 'D.J. Augustin',
 'Brook Lopez',
 'Jerryd Bayless',
 'Jason Thompson',
 'Brandon Rush',
 'Anthony Randolph',
 'Blake Griffin',
 'James Harden',
 'Tyreke Evans',
 'Jonny Flynn',
 'Stephen Curry',
 'Jordan Hill',
 'DeMar DeRozan',
 'Terrence Williams',
 'Gerald Henderson',
 'Tyler Hansbrough',
 'Earl Clark']

In [1014]:
# Career stats restricted to draft years 2010+ and overall picks below 15.
df_career_lot = df_career_stats[df_career_stats['year'] >= 2010]
df_career_lot = df_career_lot[df_career_lot['pick_overall'] < 15].reset_index(drop = True)
df_career_lot

Unnamed: 0,pick_overall,player,college_name,seasons,g,fg_pct,fg3_pct,ft_pct,mp_per_g,pts_per_g,trb_per_g,ast_per_g,ws,ws_per_48,bpm,vorp,year
0,1.0,John Wall,Kentucky,11.0,647.0,0.430,0.322,0.776,34.9,18.7,4.2,8.9,44.5,0.094,2.2,24.1,2010
1,2.0,Evan Turner,Ohio State,10.0,705.0,0.434,0.294,0.782,26.9,9.7,4.6,3.5,20.2,0.051,-1.9,0.5,2010
2,3.0,Derrick Favors,Georgia Tech,12.0,790.0,0.534,0.198,0.663,24.3,10.6,7.1,1.1,60.2,0.150,0.9,14.0,2010
3,4.0,Wes Johnson,Syracuse,9.0,609.0,0.404,0.337,0.741,22.1,7.0,3.2,1.1,10.7,0.038,-1.6,1.2,2010
4,5.0,DeMarcus Cousins,Kentucky,11.0,654.0,0.460,0.331,0.737,29.8,19.6,10.2,3.0,46.8,0.115,2.4,21.6,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,10.0,Cason Wallace,Kentucky,1.0,82.0,0.491,0.419,0.784,20.6,6.8,2.3,1.5,4.0,0.115,0.1,0.9,2023
164,11.0,Jett Howard,Michigan,1.0,18.0,0.333,0.280,0.500,3.7,1.6,0.4,0.3,0.0,0.005,-1.8,0.0,2023
165,12.0,Dereck Lively II,Duke,1.0,55.0,0.747,0.000,0.506,23.5,8.8,6.9,1.1,4.9,0.183,0.5,0.8,2023
166,13.0,Gradey Dick,Kansas,1.0,60.0,0.425,0.365,0.863,21.1,8.5,2.2,1.1,0.3,0.010,-5.4,-1.1,2023


Made some graphics in R

### Combining data frames

In [1015]:
# Prefix every column with "nba_" (presumably to keep the NBA career columns
# apart from same-named college columns such as fg_pct - confirm).
df_comb = df_career_lot.rename(columns = {col : "nba_" + col for col in df_career_lot})
# Restore the plain names of the player and year columns.
df_comb = df_comb.rename(columns = {'nba_player': 'player', 'nba_year': 'year'})
df_comb.columns

Index(['nba_pick_overall', 'player', 'nba_college_name', 'nba_seasons',
       'nba_g', 'nba_fg_pct', 'nba_fg3_pct', 'nba_ft_pct', 'nba_mp_per_g',
       'nba_pts_per_g', 'nba_trb_per_g', 'nba_ast_per_g', 'nba_ws',
       'nba_ws_per_48', 'nba_bpm', 'nba_vorp', 'year'],
      dtype='object')

In [1016]:
# Columns removed from the college stats; compared with the earlier drop list
# this one additionally removes 'school_name'.
drop_cols = [
    'conf_abbr', 
    'class', 
    'sos-dum',
    'sos',
    'school_name',
]

# Re-read the cached college stats and rebuild df_cbb_stats from scratch.
df = pd.read_csv("./data/cbbdata.csv")
df_cbb_stats = df.drop(drop_cols, axis = 1)


In [1017]:
# Exclude players drafted before 2010; matching is by exact player name.
df_cbb_stats = df_cbb_stats[~df_cbb_stats['player'].isin(pre2010list)]
df_cbb_stats 

Unnamed: 0,games,games_started,mp_per_g,fg_per_g,fga_per_g,fg_pct,fg2_per_g,fg2a_per_g,fg2_pct,fg3_per_g,...,orb_per_g,drb_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,pf_per_g,pts_per_g,player
14,100,99,29.3,3.6,5.9,0.611,3.6,5.9,0.611,0.0,...,3.0,5.4,8.5,0.4,0.4,4.2,1.8,2.6,10.3,Hasheem Thabeet
25,37,37,34.8,5.5,11.8,0.461,4.5,8.8,0.509,1.0,...,0.8,3.5,4.3,6.5,1.8,0.5,4.0,1.9,16.6,John Wall
26,101,94,32.8,5.3,10.6,0.502,4.8,9.1,0.525,0.5,...,1.5,5.3,6.8,4.1,1.6,0.7,3.5,2.7,15.0,Evan Turner
27,36,35,27.5,5.0,8.1,0.611,5.0,8.1,0.613,0.0,...,3.0,5.4,8.4,1.0,0.9,2.1,2.5,2.6,12.4,Derrick Favors
28,35,35,35.0,5.9,11.8,0.502,4.5,8.3,0.540,1.5,...,2.1,6.4,8.5,2.2,1.7,1.8,2.3,2.1,16.5,Wes Johnson
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,32,32,32.2,4.3,9.8,0.446,3.0,5.8,0.514,1.4,...,0.9,2.8,3.7,4.3,2.0,0.5,2.1,2.4,11.7,Cason Wallace
189,29,29,31.8,4.7,11.4,0.414,2.0,4.1,0.496,2.7,...,0.3,2.6,2.8,2.0,0.4,0.7,1.3,2.2,14.2,Jett Howard
190,34,27,20.6,2.3,3.4,0.658,2.2,3.1,0.721,0.1,...,2.1,3.3,5.4,1.1,0.5,2.4,0.7,2.7,5.2,Dereck Lively II
191,36,36,32.7,4.8,10.9,0.442,2.5,5.2,0.484,2.3,...,1.1,4.0,5.1,1.7,1.4,0.3,1.3,2.1,14.1,Gradey Dick


In [1018]:
df_shots = pd.read_csv("./data/shotdata.csv")

In [1019]:
names1 = df_cbb_stats['player'].tolist()
names2 = df_shots['player'].tolist()

In [1020]:
in_names1_not_2 = [name for name in names1 if name not in names2]

In [1021]:
print(in_names1_not_2)

['Hasheem Thabeet', 'Michael Porter Jr.', 'Darius Garland', 'James Wiseman']


Removing Hasheem Thabeet, Michael Porter Jr., Darius Garland, and James Wiseman so that the two dataframes can be merged

In [1022]:
df_cbb_stats = df_cbb_stats[~df_cbb_stats['player'].isin(in_names1_not_2)]
df_cbb_stats

Unnamed: 0,games,games_started,mp_per_g,fg_per_g,fga_per_g,fg_pct,fg2_per_g,fg2a_per_g,fg2_pct,fg3_per_g,...,orb_per_g,drb_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,pf_per_g,pts_per_g,player
25,37,37,34.8,5.5,11.8,0.461,4.5,8.8,0.509,1.0,...,0.8,3.5,4.3,6.5,1.8,0.5,4.0,1.9,16.6,John Wall
26,101,94,32.8,5.3,10.6,0.502,4.8,9.1,0.525,0.5,...,1.5,5.3,6.8,4.1,1.6,0.7,3.5,2.7,15.0,Evan Turner
27,36,35,27.5,5.0,8.1,0.611,5.0,8.1,0.613,0.0,...,3.0,5.4,8.4,1.0,0.9,2.1,2.5,2.6,12.4,Derrick Favors
28,35,35,35.0,5.9,11.8,0.502,4.5,8.3,0.540,1.5,...,2.1,6.4,8.5,2.2,1.7,1.8,2.3,2.1,16.5,Wes Johnson
29,38,37,23.5,5.4,9.7,0.558,5.4,9.6,0.565,0.0,...,4.1,5.8,9.8,1.0,1.0,1.8,2.1,3.2,15.1,DeMarcus Cousins
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,32,32,32.2,4.3,9.8,0.446,3.0,5.8,0.514,1.4,...,0.9,2.8,3.7,4.3,2.0,0.5,2.1,2.4,11.7,Cason Wallace
189,29,29,31.8,4.7,11.4,0.414,2.0,4.1,0.496,2.7,...,0.3,2.6,2.8,2.0,0.4,0.7,1.3,2.2,14.2,Jett Howard
190,34,27,20.6,2.3,3.4,0.658,2.2,3.1,0.721,0.1,...,2.1,3.3,5.4,1.1,0.5,2.4,0.7,2.7,5.2,Dereck Lively II
191,36,36,32.7,4.8,10.9,0.442,2.5,5.2,0.484,2.3,...,1.1,4.0,5.1,1.7,1.4,0.3,1.3,2.1,14.1,Gradey Dick


In [1023]:
df_shots.columns

Index(['player', 'dunk_tot', 'dunk_pct', 'rim_tot', 'rim_pct', 'rim_asted',
       'other2pt_tot', 'other2pt_pct', 'other2pt_asted', '3pt_tot', '3pt_pct',
       '3pt_asted'],
      dtype='object')

In [1024]:
# Left merge: every row of df_shots is kept and college stats are attached
# where the 'player' string matches exactly.
# NOTE(review): if a name occurs more than once in either frame the merge
# duplicates rows - confirm names are unique.
df_merge_cbb = pd.merge(df_shots, df_cbb_stats, on = 'player', how = 'left')
df_merge_cbb

Unnamed: 0,player,dunk_tot,dunk_pct,rim_tot,rim_pct,rim_asted,other2pt_tot,other2pt_pct,other2pt_asted,3pt_tot,...,ft_pct,orb_per_g,drb_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,pf_per_g,pts_per_g
0,John Wall,33-36,91.7%,116-182,63.7%,27.6%,40-127,31.5%,7.5%,36-113,...,0.754,0.8,3.5,4.3,6.5,1.8,0.5,4.0,1.9,16.6
1,Evan Turner,18-19,94.7%,102-130,78.5%,18.6%,106-258,41.1%,17.0%,20-55,...,0.758,1.5,5.3,6.8,4.1,1.6,0.7,3.5,2.7,15.0
2,Derrick Favors,53-56,94.6%,137-175,78.3%,54.7%,40-112,35.7%,65.0%,0-1,...,0.629,3.0,5.4,8.4,1.0,0.9,2.1,2.5,2.6,12.4
3,Wes Johnson,37-41,90.2%,93-129,72.1%,55.9%,48-135,35.6%,64.6%,41-106,...,0.772,2.1,6.4,8.5,2.2,1.7,1.8,2.3,2.1,16.5
4,DeMarcus Cousins,53-57,93.0%,144-189,76.2%,54.2%,57-163,35.0%,31.6%,1-6,...,0.604,4.1,5.8,9.8,1.0,1.0,1.8,2.1,3.2,15.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160,Cason Wallace,11-11,100.0%,52-73,71.2%,9.6%,43-112,38.4%,9.3%,44-127,...,0.757,0.9,2.8,3.7,4.3,2.0,0.5,2.1,2.4,11.7
161,Jett Howard,6-6,100.0%,29-47,61.7%,31.0%,30-72,41.7%,16.7%,78-212,...,0.800,0.3,2.6,2.8,2.0,0.4,0.7,1.3,2.2,14.2
162,Dereck Lively II,54-55,98.2%,74-96,77.1%,74.3%,1-8,12.5%,100.0%,2-13,...,0.600,2.1,3.3,5.4,1.1,0.5,2.4,0.7,2.7,5.2
163,Gradey Dick,15-16,93.8%,62-101,61.4%,59.7%,29-87,33.3%,44.8%,83-206,...,0.854,1.1,4.0,5.1,1.7,1.4,0.3,1.3,2.1,14.1


In [1025]:
path3 = "./data/combinedstatshot.csv"
df_merge_cbb.to_csv(path3, index = False)

# Getting NBA draft prospects from the NCAA

https://www.espn.com/nba/draft/bestavailable
Getting top 50 college players from the website

In [1026]:
# Draft prospects mapped to their college (player name -> college name).
# NOTE(review): scrape_cbb_stats() recognises a player page by checking whether
# the first word of the college name occurs in the page's school names, so
# short forms such as "UConn", "UNC" or "Pitt" only match if the page uses the
# same spelling - confirm.
prospect_dict = {
    "Donovan Clingan": "UConn",
    "Reed Sheppard": "Kentucky",
    "Stephon Castle": "UConn",
    "Rob Dillingham": "Kentucky",
    "Dalton Knecht": "Tennessee",
    "Cody Williams": "Colorado",
    "Devin Carter": "Providence",
    "Ja'Kobe Walter": "Baylor",
    "Jared McCain": "Duke",
    "Zach Edey": "Purdue",
    "Tristan Da Silva": "Colorado",
    "Johnny Furphy": "Kansas",
    "Kyshawn George": "Miami",
    "Kyle Filipowski": "Duke",
    "Isaiah Collier": "USC",
    "Carlton Carrington": "Pitt",
    "Yves Missi": "Baylor",
    "Kel'el Ware": "Indiana",
    "Baylor Scheierman": "Creighton",
    "Justin Edwards": "Kentucky",
    "Tyler Kolek": "Marquette",
    "Kevin McCullar Jr.": "Kansas",
    "Terrence Shannon Jr.": "Illinois",
    "Ryan Dunn": "Virginia",
    "Jaylon Tyson": "California",
    "Cam Christie": "Minnesota",
    "Alex Karaban": "UConn",
    "Adem Bona": "UCLA",
    "Jonathan Mogbo": "San Francisco",
    "Harrison Ingram": "UNC",
    "Ajay Mitchell": "UCSB",
    "Pelle Larsson": "Arizona",
    "Payton Sandfort": "Iowa",
    "Dillon Jones": "Weber State",
    "Keshad Johnson": "Arizona",
    "KJ Simpson": "Colorado",
    "DaRon Holmes II": "Dayton",
    "Jamal Shead": "Houston",
    "Bronny James": "USC",
    "Hunter Sallis": "Wake Forest",
    "Jalen Bridges": "Baylor",
    "Oso Ighodaro": "Marquette",
    "Isaac Jones": "Washington State",
    "Jaylen Wells": "Washington State",
    "Enrique Freeman": "Akron",
    "Cam Spencer": "UConn",
    "Antonio Reeves": "Kentucky",
    "Jaxson Robinson": "BYU",
    "Trevon Brazile": "Arkansas",
    "Ugonna Onyenso": "Kentucky"
}

In [1027]:
len(prospect_dict)

50

In [1028]:
df_prospects = pd.DataFrame(list(prospect_dict.items()), columns = ["player", "college_name"])

In [1029]:
df_prospects["year"] = 2024
df_prospects

Unnamed: 0,player,college_name,year
0,Donovan Clingan,UConn,2024
1,Reed Sheppard,Kentucky,2024
2,Stephon Castle,UConn,2024
3,Rob Dillingham,Kentucky,2024
4,Dalton Knecht,Tennessee,2024
5,Cody Williams,Colorado,2024
6,Devin Carter,Providence,2024
7,Ja'Kobe Walter,Baylor,2024
8,Jared McCain,Duke,2024
9,Zach Edey,Purdue,2024


### Scraping prospect data

In [1030]:
# df_prospect_stats = scrape_cbb_stats(df_input = df_prospects)

In [1031]:
drop_cols = [
    'conf_abbr', 
    'class', 
    'sos-dum',
    'sos'
]

In [1032]:
# df_prospect_stats = df_prospect_stats.drop(drop_cols, axis = 1).reset_index(drop = True)

In [1037]:
path4 = "./data/prospectstat.csv"
# NOTE: write to path4, not `path`. In top-to-bottom order `path` was last
# bound to "./data/shotdata.csv", so to_csv(path) would overwrite the shot data.
# df_prospect_stats.to_csv(path4)

In [1034]:
# df_prospect_shots = scrape_shot_selection(df_prospects)

https://barttorvik.com/playerstat.php?year=2024&p=donovan%20clingan
https://barttorvik.com/playerstat.php?year=2024&p=reed%20sheppard
https://barttorvik.com/playerstat.php?year=2024&p=stephon%20castle
https://barttorvik.com/playerstat.php?year=2024&p=rob%20dillingham
https://barttorvik.com/playerstat.php?year=2024&p=dalton%20knecht
https://barttorvik.com/playerstat.php?year=2024&p=cody%20williams
https://barttorvik.com/playerstat.php?year=2024&p=devin%20carter
https://barttorvik.com/playerstat.php?year=2024&p=Ja%27Kobe%20walter
https://barttorvik.com/playerstat.php?year=2024&p=jared%20mccain
https://barttorvik.com/playerstat.php?year=2024&p=zach%20edey
https://barttorvik.com/playerstat.php?year=2024&p=Tristan%20da%20Silva&t=Colorado
https://barttorvik.com/playerstat.php?year=2024&p=johnny%20furphy
https://barttorvik.com/playerstat.php?year=2024&p=kyshawn%20george
https://barttorvik.com/playerstat.php?year=2024&p=kyle%20filipowski
https://barttorvik.com/playerstat.php?year=2024&p=isaiah

In [1036]:
path5 = "./data/prospectshot.csv"
# NOTE: write to path5, not `path`. In top-to-bottom order `path` was last
# bound to "./data/shotdata.csv", so to_csv(path) would overwrite the shot data.
# df_prospect_shots.to_csv(path5)

In [1038]:
df_prospect_shots = pd.read_csv(path5)
df_prospect_stats = pd.read_csv(path4)

In [1039]:
df_merge_prospects = pd.merge(df_prospect_shots, df_prospect_stats, on = 'player', how = 'left')

In [1041]:
path6 = "./data/prospectcombined.csv"
# No index=False here, so the DataFrame index is written as an extra unnamed
# first column (the combinedstatshot export above passes index = False).
df_merge_prospects.to_csv(path6)