In [3]:
# BeautifulSoup documentation
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/

# Requests documentation
# https://requests.readthedocs.io/en/latest/

from bs4 import BeautifulSoup # for pulling html data
import requests # send http requests to url

In [5]:
# Webscraping tutorial
# https://www.geeksforgeeks.org/implementing-web-scraping-python-beautiful-soup/

# NBA.com article that lists every pick of the 2022 draft.
url = "https://www.nba.com/news/2022-nba-draft-order"

# requests applies no timeout by default, so a stalled connection would hang
# this cell forever; give up after 30 seconds instead.
r = requests.get(url, timeout=30)

# Fail here on a 4xx/5xx reply rather than parsing an error page further down.
r.raise_for_status()

# print(r.content) # raw HTML bytes of the page, useful when debugging

In [8]:
# Parse the downloaded bytes into a navigable tree of tags, using the
# 'html.parser' backend.
soup = BeautifulSoup(markup=r.content, features='html.parser')

# print(soup.prettify()) # uncomment to inspect the parsed HTML, indented

In [9]:
# The picks are embedded in the page's JSON-LD metadata: take the first
# <script> element whose type attribute is application/ld+json.
ld_json_filter = {'type': 'application/ld+json'}
table = soup.find('script', attrs=ld_json_filter)

In [10]:
# Show the matched <script> element, indented for readability.
pretty_script = table.prettify()
print(pretty_script)

<script type="application/ld+json">
 {"@context":"https://schema.org/","@type":"Article","@id":"https://www.nba.com/news/2022-nba-draft-order","headline":"2022 NBA Draft results: Picks 1-58","url":"https://www.nba.com/news/2022-nba-draft-order","articleBody":"See below for every selection and reported trade from the 2022 NBA Draft.\r\n\r\n\r\n\r\n1st Round:\r\n1. Magic draft Paolo Banchero (Duke)\r\n\r\n2. Thunder draft Chet Holmgren (Gonzaga)\r\n\r\n3. Rockets draft Jabari Smith (Auburn)\r\n\r\n4. Kings draft Keegan Murray (Iowa)\r\n\r\n5. Pistons draft Jaden Ivey (Purdue)\r\n\r\n6. Pacers draft Bennedict Mathurin (Arizona)\r\n\r\n7. Blazers draft Shaedon Sharpe (Kentucky)\r\n\r\n8. Pelicans draft Dyson Daniels (G League Ignite)\r\n\r\n9. Spurs draft Jeremy Sochan (Baylor)\r\n\r\n10. Wizards draft Johnny Davis (Wisconsin)\r\n\r\n11. Knicks draft Ousmane Dieng (New Zealand Breakers) -- Traded to OKC\r\n\r\n12. Thunder draft Jalen Williams (Santa Clara)\r\n\r\n13. Hornets draft Jalen Du

In [30]:
import json # extracting json from script tag
# https://morioh.com/p/4ed4ba1fe438

In [35]:
# The first child of the <script> element is its raw JSON text; decode it
# into Python objects.
script_text = table.contents[0]
json_object = json.loads(script_text)

'See below for every selection and reported trade from the 2022 NBA Draft.\r\n\r\n\r\n\r\n1st Round:\r\n1. Magic draft Paolo Banchero (Duke)\r\n\r\n2. Thunder draft Chet Holmgren (Gonzaga)\r\n\r\n3.\xa0Rockets draft Jabari Smith (Auburn)\r\n\r\n4. Kings draft Keegan Murray (Iowa)\r\n\r\n5. Pistons draft Jaden Ivey (Purdue)\r\n\r\n6. Pacers draft Bennedict Mathurin (Arizona)\r\n\r\n7. Blazers\xa0draft Shaedon Sharpe (Kentucky)\r\n\r\n8. Pelicans draft Dyson Daniels (G League Ignite)\r\n\r\n9. Spurs draft Jeremy Sochan (Baylor)\r\n\r\n10. Wizards draft Johnny Davis (Wisconsin)\r\n\r\n11. Knicks\xa0draft Ousmane Dieng (New Zealand Breakers) -- Traded to OKC\r\n\r\n12. Thunder\xa0draft Jalen Williams (Santa Clara)\r\n\r\n13. Hornets draft Jalen Duren (Memphis) --\xa0Traded to DET in 3-team deal\r\n\r\n14. Cavaliers draft Ochai Agbaji (Kansas)\r\n\r\n15. Hornets draft Mark Williams (Duke)\r\n\r\n16. Hawks draft A.J. Griffin (Duke)\r\n\r\n17. Rockets draft Tari Eason (LSU)\r\n\r\n18. Bulls d

In [40]:
# The article text, with one "N. <Team> draft <Player> (<School>)" line per
# pick, is stored under the "articleBody" key.
article_key = 'articleBody'
test = json_object[article_key]

In [42]:
import re # regular expressions library

In [92]:
# https://stackoverflow.com/questions/32680030/match-text-between-two-strings-with-regular-expression
# matching text between two strings

# Every match is a 2-tuple. Both groups are lazy, so the first one always
# captures '' and the second one captures everything between "draft" and the
# next "(" -- the player name, still padded with spaces.
pick_pattern = re.compile(r'draft(.*?)(.*?)\(')
m = pick_pattern.findall(test)
# m[0][1]

' Paolo Banchero '

In [94]:
import pandas as pd # pandas for dataframe

In [103]:
# https://www.journaldev.com/23763/python-remove-spaces-from-string#:~:text=strip(),remove%20leading%20and%20trailing%20whitespaces.&text=If%20you%20want%20to%20remove,or%20rstrip()%20function%20instead.
# remove trailing and leading space

# Field 1 of each regex match holds the player name; trim the whitespace
# around it and collect the names in match order.
testls = [match[1].strip() for match in m]

In [134]:
# Single-column DataFrame of player names, in the order they were scraped.
player_column = {'Player': testls}
df = pd.DataFrame(data=player_column)

In [135]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,Player
0,Paolo Banchero
1,Chet Holmgren
2,Jabari Smith
3,Keegan Murray
4,Jaden Ivey


In [136]:
# Add the draft round and the overall pick number.

# Rows are 0-indexed in scrape order, so the pick number is index + 1.
overall_pick = df.index + 1

df['Round'] = 1  # provisional value; later picks are moved to round 2 afterwards
df['DraftNumber'] = overall_pick

In [137]:
# Picks after the 30th belong to round 2.
is_second_round = df['DraftNumber'].gt(30)
df.loc[is_second_round, 'Round'] = 2

In [140]:
# Check the last five rows.
df.tail(5)

Unnamed: 0,Player,Round,DraftNumber
53,Yannick Nzosa,2,54
54,Gui Santos,2,55
55,Luke Travers,2,56
56,Jabari Walker,2,57
57,Hugo Besson,2,58


In [141]:
# repeat for years 2021 and 2020

# NBA.com article that lists the 2021 draft picks.
url = "https://www.nba.com/news/2021-nba-draft-results-picks-1-60"

# requests applies no timeout by default; without one a stalled connection
# would block this cell indefinitely.
r = requests.get(url, timeout=30)

# Fail here on a 4xx/5xx reply rather than parsing an error page.
r.raise_for_status()

# Parse the page into a navigable tree of tags.
soup = BeautifulSoup(r.content, 'html.parser')

# The picks are looked up inside the <div> with class "w-full lg:flex".
table = soup.find('div', attrs = {'class':'w-full lg:flex'})

In [245]:
# https://scottlarsen.com/2020/08/19/Scraping-Web-Content-Between-Two-Tags-With-BeautifulSoup-And-Python.html
# for the get_text() function

# The first two <p> elements of the container are not picks, so they are
# sliced off instead of being skipped with a running counter.
HEADER_PARAGRAPHS = 2

# Collect the link text of every remaining paragraph. Paragraphs without an
# <a> child (the last few on the page) carry no player link and are skipped;
# calling get_text() on their missing link is what raised an error before.
list2021 = [
    row.a.get_text()
    for row in table.find_all('p')[HEADER_PARAGRAPHS:]  # find_all: current name of the legacy findAll alias
    if row.a is not None
]
    

In [248]:
# Build the 2021 draft table: one row per scraped player name.
df2021 = pd.DataFrame({'Player': list2021})

# Rows are 0-indexed in scrape order, so the overall pick number is index + 1.
overall_pick_2021 = df2021.index + 1

# Every pick starts in round 1; picks after the 30th are then moved to round 2.
df2021['Round'] = 1
df2021['DraftNumber'] = overall_pick_2021
df2021.loc[overall_pick_2021 > 30, 'Round'] = 2

In [256]:
# NBA.com article that lists the 2020 draft picks.
url = "https://www.nba.com/news/2020-nba-draft-results-picks-1-60"

# requests applies no timeout by default; without one a stalled connection
# would block this cell indefinitely.
r = requests.get(url, timeout=30)

# Fail here on a 4xx/5xx reply rather than parsing an error page.
r.raise_for_status()

# Parse the page into a navigable tree of tags.
soup = BeautifulSoup(r.content, 'html.parser')

# The picks are looked up inside the <div> with class "lg:pr-3 lg:w-3/4"
# (a different container from the one used for the 2021 page).
table = soup.find('div', attrs = {'class':'lg:pr-3 lg:w-3/4'})

In [268]:
# First attempt at collecting the 2020 names, reusing the counter-based loop
# from the 2021 cell but iterating over <a> tags instead of <p> tags.
# list2020 is re-created from scratch further down, so whatever this cell
# produces is overwritten.
# NOTE(review): each `row` here is itself an <a> tag, so `row.a` looks for an
# <a> nested inside it. Presumably that is None for every link, in which case
# the final branch never runs and list2020 stays empty -- confirm.
list2020 = [] # empty list to store names

n = 0 # counts skipped rows; the first 2 <a> tags are treated as non-names

for row in table.findAll('a'): # iterate over every <a> tag in the container
    
    if n!=2: # still skipping the leading rows: count this one and move on
        n = n + 1
        
    elif row.a == None: # no nested <a> found: skip this row (added after an error was hit here)
        n = n # no-op placeholder
        
    else: # otherwise store the text of the link
        list2020.append(row.get_text())

In [276]:
# Collect the 2020 player names from every <a> tag in the container.
list2020 = []

for row in table.find_all('a'):  # find_all: current name of the legacy findAll alias
    link_text = row.get_text()

    # The page also has links such as "dealt" or "traded" between the player
    # links; a link counts as a name only when its first character is an
    # upper-case letter. Slicing with [:1] (rather than indexing [0]) means an
    # empty link is skipped instead of raising IndexError.
    if link_text[:1].isupper():
        # Drop stray whitespace (e.g. a trailing "\n") so names are stored clean.
        list2020.append(link_text.strip())



In [278]:
# Build the 2020 draft table: one row per scraped player name.
df2020 = pd.DataFrame({'Player': list2020})

# Rows are 0-indexed in scrape order, so the overall pick number is index + 1.
overall_pick_2020 = df2020.index + 1

# Every pick starts in round 1; picks after the 30th are then moved to round 2.
df2020['Round'] = 1
df2020['DraftNumber'] = overall_pick_2020
df2020.loc[overall_pick_2020 > 30, 'Round'] = 2

In [284]:
# The name in row 26 was scraped with a "\n" after it; overwrite that single
# cell with the clean spelling.
df2020.at[26, 'Player'] = 'Udoka Azubuike'

In [285]:
# Display the finished 2020 draft table (Player, Round, DraftNumber).
df2020

Unnamed: 0,Player,Round,DraftNumber
0,Anthony Edwards,1,1
1,James Wiseman,1,2
2,LaMelo Ball,1,3
3,Patrick Williams,1,4
4,Isaac Okoro,1,5
5,Onyeka Okongwu,1,6
6,Killian Hayes,1,7
7,Obi Toppin,1,8
8,Deni Avdija,1,9
9,Jalen Smith,1,10
