In [1]:
# BeautifulSoup documentation
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/

# Requests documentation
# https://requests.readthedocs.io/en/latest/

from pathlib import Path # create the output folder before writing csv files

from bs4 import BeautifulSoup # for pulling html data
import requests # send http requests to url

In [2]:
# Webscraping tutorial
# https://www.geeksforgeeks.org/implementing-web-scraping-python-beautiful-soup/

# NBA.com article with the 2022 draft order; player names are parsed from it in the cells below
url = "https://www.nba.com/news/2022-nba-draft-order"
r = requests.get(url) # response from the server; the raw HTML is available as r.content

# stop here on an HTTP error status instead of failing later, when the parser
# cannot find the expected tags in an error page
r.raise_for_status()

In [3]:
# parse the response body with the 'html.parser' backend into a searchable tag tree
soup = BeautifulSoup(r.content, 'html.parser')

# print(soup.prettify()) # prints nested structure of html content

In [4]:
# first <script type="application/ld+json"> tag on the page; its JSON payload holds the draft text
# read in the cells below (find() returns None when no tag matches)
table = soup.find('script', attrs = {'type':'application/ld+json'})

In [5]:
# print(table.prettify()) # prints out the specified html content 

In [6]:
import json # extracting json from script tag
# https://morioh.com/p/4ed4ba1fe438

In [7]:
# decode the first child of the script tag as JSON
json_object = json.loads(table.contents[0])

In [8]:
# take the 'articleBody' entry of the decoded JSON; the regular expression below is run on it

test = json_object['articleBody']

In [9]:
import re # regular expressions library

In [10]:
# https://stackoverflow.com/questions/32680030/match-text-between-two-strings-with-regular-expression
# matching text between two strings
#
# captures what sits between the word "draft" and the next "(" on the same line
# ('.' does not cross newlines here). findall returns one (group 1, group 2) tuple
# per match; both groups are lazy, so group 1 is always the empty string and the
# captured text is in group 2, i.e. index 1 of each tuple

m = re.findall(r'draft(.*?)(.*?)\(',test)
# m[0][1]

In [11]:
import pandas as pd # pandas for dataframe

In [12]:
# https://www.journaldev.com/23763/python-remove-spaces-from-string#:~:text=strip(),remove%20leading%20and%20trailing%20whitespaces.&text=If%20you%20want%20to%20remove,or%20rstrip()%20function%20instead.
# remove trailing and leading space

# the name is the second element of every match tuple; keep them in match order
testls = [match[1].strip() for match in m]

In [13]:
# one-column dataframe of player names, in the order the names were matched in the article text
df = pd.DataFrame({'Player':testls})

In [14]:
df.head()

Unnamed: 0,Player
0,Paolo Banchero
1,Chet Holmgren
2,Jabari Smith
3,Keegan Murray
4,Jaden Ivey


In [15]:
# making columns of draft round and number picked
# NOTE(review): assumes the rows are in pick order, so position + 1 is the pick number

df['Round'] = 1 # default value; second-round rows are overwritten in the next cell
df['DraftNumber'] = df.index + 1 # the index starts at 0, picks start at 1

In [16]:
# picks numbered above 30 are marked as second round
df.loc[df['DraftNumber']> 30, 'Round'] = 2

In [17]:
df.tail()

Unnamed: 0,Player,Round,DraftNumber
53,Yannick Nzosa,2,54
54,Gui Santos,2,55
55,Luke Travers,2,56
56,Jabari Walker,2,57
57,Hugo Besson,2,58


In [18]:
# repeat for years 2021 and 2020

url = "https://www.nba.com/news/2021-nba-draft-results-picks-1-60" # NBA.com article with the 2021 draft results
r = requests.get(url)
r.raise_for_status() # stop on an HTTP error instead of failing on a missing div below
soup = BeautifulSoup(r.content, 'html.parser')
# div with class "w-full lg:flex"; its <p> rows are walked in the next cell to collect names
table = soup.find('div', attrs = {'class':'w-full lg:flex'})

In [19]:
# https://scottlarsen.com/2020/08/19/Scraping-Web-Content-Between-Two-Tags-With-BeautifulSoup-And-Python.html
# for the get_text() function

list2021 = [] # empty list to store names

# the first 2 <p> rows aren't names, so start from the third one; rows without
# a link (the last few gave an error) are passed over
for row in table.find_all('p')[2:]:
    link = row.a
    if link is not None:
        list2021.append(link.get_text())
    

In [20]:
# making dataframe with the player names list
df2021 = pd.DataFrame({'Player':list2021})

# pick number of every row: position in the frame counted from 1
picknumbers = df2021.index + 1

df2021['Round'] = 1 # default; rows past pick 30 are overwritten below
df2021['DraftNumber'] = picknumbers
df2021.loc[picknumbers > 30, 'Round'] = 2 # draft number greater than 30 means round 2

In [21]:
url = "https://www.nba.com/news/2020-nba-draft-results-picks-1-60" # NBA.com article with the 2020 draft results
r = requests.get(url)
r.raise_for_status() # stop on an HTTP error instead of failing on a missing div below
soup = BeautifulSoup(r.content, 'html.parser')
# div with class "lg:pr-3 lg:w-3/4" (not the class used for the 2021 page); its <a> tags are walked below
table = soup.find('div', attrs = {'class':'lg:pr-3 lg:w-3/4'})

In [22]:
# first attempt at collecting the 2020 names; the next cell rebuilds list2020
# from scratch with a different filter
list2020 = [] # empty list to store names

n = 0 # how many of the 2 leading non-name anchors have been passed

for row in table.find_all('a'):
    if n != 2:
        n += 1 # still within the first 2 anchors, nothing is stored
    elif row.a is not None:
        # only anchors that contain another <a> tag are stored
        list2020.append(row.get_text())

In [23]:
list2020 = []

for row in table.find_all('a'):
    text = row.get_text()
    # fixed bug where the html content had strings dealt or traded in between the a tag:
    # a name starts with an upper case letter, those words do not.
    # text[:1] is used instead of text[0] so a link without any text is skipped
    # rather than raising IndexError
    if text[:1].isupper():
        list2020.append(text)



In [24]:
# making dataframe with the player names list
df2020 = pd.DataFrame({'Player':list2020})

# pick number of every row: position in the frame counted from 1
picknumbers = df2020.index + 1

df2020['Round'] = 1 # default; rows past pick 30 are overwritten below
df2020['DraftNumber'] = picknumbers
df2020.loc[picknumbers > 30, 'Round'] = 2 # draft number greater than 30 means round 2

In [25]:
# cleaned name that had \n after
# NOTE(review): row label 26 is hard-coded; this relies on the scraped order staying the same
df2020.loc[26,'Player'] = 'Udoka Azubuike'

In [26]:
df2020.head()

Unnamed: 0,Player,Round,DraftNumber
0,Anthony Edwards,1,1
1,James Wiseman,1,2
2,LaMelo Ball,1,3
3,Patrick Williams,1,4
4,Isaac Okoro,1,5


In [27]:
# RealGM draft page for 2022. The year is spelled out, like the 2021 and 2020
# pages fetched below, so the cell does not depend on which draft a year-less
# address happens to show.
url = "https://basketball.realgm.com/nba/draft/past_drafts/2022"
r = requests.get(url)
r.raise_for_status() # stop on an HTTP error instead of failing on a missing table below
soup = BeautifulSoup(r.content, 'html.parser')
table = soup.find('tbody') # first <tbody> on the page; searched for pre-draft teams in the next cells

In [28]:
from re import search

In [29]:

# list to store predraft team
predraftteam = []

pick = 0 # row of df whose player is currently being looked for
sincename = 0 # anchors counted since that player's name was last seen

# iterate through all rows containing the tag a
for anchor in table.find_all('a'):
    label = anchor.get_text()

    # only the first name is searched for, since some names were written
    # differently between the two sources; a hit restarts the count at 1
    if search(df.Player[pick].split()[0], label):
        sincename = 1

    # when the count reaches 3 the anchor text is stored as the predraft team
    # and the search moves on to the next player
    if sincename == 3:
        predraftteam.append(label)
        pick += 1

    sincename += 1

In [30]:
# every <tbody> on the page; the element at index 1 is used in the next cell as the second-round table
table1 = soup.find_all('tbody')

In [31]:
i = 30 # index 30 is the start of the second round
j = 0 # cells counted since the current player's name was last seen

# iterate through each cell (tag td) of the second table
for row in table1[1].find_all('td'):
    text = row.get_text()
    nameparts = df.Player[i].split()

    # last name or first name may appear in the cell, since some names were
    # written differently between the two sources; a hit restarts the count at 1
    if search(nameparts[1], text) or search(nameparts[0], text):
        j = 1

    if j == 9: # the eighth cell after the name holds the predraft team
        predraftteam.append(text)
        i = i + 1 # move on to the next player of the dataframe

        # every player has a team now: stop here, so the index cannot run past
        # the dataframe and the cells left over cannot add a second team for
        # the last player
        if i == len(df):
            break

    j = j + 1 # iterate counter


In [32]:
predraftteam

['Duke',
 'Gonzaga',
 'Auburn',
 'Iowa',
 'Purdue',
 'Arizona',
 'Kentucky',
 'NBA G League Ignite',
 'Baylor',
 'Wisconsin',
 'New Zealand (New Zealand)',
 'Santa Clara',
 'Memphis',
 'Kansas',
 'Duke',
 'Duke',
 'LSU',
 'Arizona',
 'Wake Forest',
 'Ohio State',
 'Kansas',
 'Auburn',
 'Colorado State',
 'NBA G League Ignite',
 'Notre Dame',
 'Duke',
 'KK Mega Bemax (Serbia)',
 'Milwaukee',
 'Kentucky',
 'UCLA',
 'Gonzaga',
 'Michigan',
 'Arizona',
 'Arkansas',
 'Michigan State',
 'Fortituto Kontatto Bologna (Italy)',
 'NBA G League Ignite',
 'Tennessee',
 'Gran Canaria (Spain)',
 'Nebraska',
 'Ohio State',
 'Duke',
 'Michigan',
 'Toledo',
 'Memphis',
 'Paris Basketball (France)',
 'VCU',
 'Baylor',
 'USC',
 'Vanoli Cremona (Italy)',
 'Connecticut',
 'KK Mega Bemax (Serbia)',
 'Alabama',
 'Unicaja (Spain)',
 'Minas (Brazil)',
 'Perth (Australia)',
 'Colorado',
 'New Zealand (New Zealand)']

In [33]:
# attach the collected teams; the list has to hold exactly one entry per row of df
df['PreTeam'] = predraftteam

In [34]:
df2021.head()

Unnamed: 0,Player,Round,DraftNumber
0,Cade Cunningham,1,1
1,Jalen Green,1,2
2,Evan Mobley,1,3
3,Scottie Barnes,1,4
4,Jalen Suggs,1,5


In [35]:
url = "https://basketball.realgm.com/nba/draft/past_drafts/2021" # RealGM draft page for 2021
r = requests.get(url)
r.raise_for_status() # stop on an HTTP error instead of failing on a missing table below
soup = BeautifulSoup(r.content, 'html.parser')
table = soup.find('tbody') # first <tbody> on the page; searched for pre-draft teams in the next cell

In [36]:
# list to store predraft team
predraftteam2021 = []

pick = 0 # row of df2021 whose player is currently being looked for
sincename = 0 # anchors counted since that player's name was last seen

# iterate through all rows containing the tag a
for anchor in table.find_all('a'):
    label = anchor.get_text()

    # only the last name is searched for, since some names were written
    # differently between the two sources; a hit restarts the count at 1
    if search(df2021.Player[pick].split()[1], label):
        sincename = 1

    # when the count reaches 3 the anchor text is stored as the predraft team
    # and the search moves on to the next player
    if sincename == 3:
        predraftteam2021.append(label)
        pick += 1

    sincename += 1

In [37]:
# every <tbody> on the page; the element at index 1 is used in the next cell as the second-round table
table1 = soup.find_all('tbody')

In [38]:
i = 30 # index 30 is the start of the second round
j = 0 # cells counted since the current player's name was last seen

# iterate through each cell (tag td) of the second table
for row in table1[1].find_all('td'):
    text = row.get_text()
    nameparts = df2021.Player[i].split()

    # last name or first name may appear in the cell, since some names were
    # written differently between the two sources; a hit restarts the count at 1
    if search(nameparts[1], text) or search(nameparts[0], text):
        j = 1

    if j == 9: # the eighth cell after the name holds the predraft team
        predraftteam2021.append(text)
        i = i + 1 # move on to the next player of the dataframe

        # every player has a team now: stop here, so the index cannot run past
        # the dataframe and the cells left over cannot add a second team for
        # the last player
        if i == len(df2021):
            break

    j = j + 1 # iterate counter

In [39]:
# attach the collected teams; the list has to hold exactly one entry per row of df2021
df2021['PreTeam'] = predraftteam2021

In [40]:
url = "https://basketball.realgm.com/nba/draft/past_drafts/2020" # RealGM draft page for 2020
r = requests.get(url)
r.raise_for_status() # stop on an HTTP error instead of failing on a missing table below
soup = BeautifulSoup(r.content, 'html.parser')
table = soup.find('tbody') # first <tbody> on the page

# list to store predraft team
predraftteam2020 = []

pick = 0 # row of df2020 whose player is currently being looked for
sincename = 0 # anchors counted since that player's name was last seen

# iterate through all rows containing the tag a
for anchor in table.find_all('a'):
    label = anchor.get_text()
    nameparts = df2020.Player[pick].split()

    # last name or first name may appear in the anchor, since some names were
    # written differently between the two sources; a hit restarts the count at 1
    if search(nameparts[1], label) or search(nameparts[0], label):
        sincename = 1

    # when the count reaches 3 the anchor text is stored as the predraft team
    # and the search moves on to the next player
    if sincename == 3:
        predraftteam2020.append(label)
        pick += 1

    sincename += 1


In [41]:
table1 = soup.find_all('tbody') # every <tbody> on the page; index 1 is the second table

i = 30 # index 30 is the start of the second round
j = 0 # cells counted since the current player's name was last seen

# iterate through each cell (tag td) of the second table
for row in table1[1].find_all('td'):
    text = row.get_text()
    nameparts = df2020.Player[i].split()

    # last name or first name may appear in the cell, since some names were
    # written differently between the two sources; a hit restarts the count at 1.
    # School name DePaul was matched as the name Paul Reed Jr., so a cell that
    # is exactly "DePaul" never counts as a name
    if (search(nameparts[1], text) or search(nameparts[0], text)) and text != "DePaul":
        j = 1

    if j == 9: # the eighth cell after the name holds the predraft team
        predraftteam2020.append(text)
        i = i + 1 # move on to the next player of the dataframe

        # every player has a team now: stop here, so the index cannot run past
        # the dataframe and the cells left over cannot add a second team for
        # the last player
        if i == len(df2020):
            break

    j = j + 1 # iterate counter

In [42]:
# attach the collected teams; the list has to hold exactly one entry per row of df2020
df2020['PreTeam'] = predraftteam2020

In [43]:
df2020.head()

Unnamed: 0,Player,Round,DraftNumber,PreTeam
0,Anthony Edwards,1,1,Georgia
1,James Wiseman,1,2,Memphis
2,LaMelo Ball,1,3,Illawarra (Australia)
3,Patrick Williams,1,4,Florida State
4,Isaac Okoro,1,5,Auburn


In [44]:
url = "https://basketball.realgm.com/nba/draft/past_drafts/2020" # RealGM draft page for 2020
r = requests.get(url)
r.raise_for_status() # stop on an HTTP error instead of failing on missing tables below
soup = BeautifulSoup(r.content, 'html.parser')

In [45]:
# inspiration from https://stackoverflow.com/questions/5815747/beautifulsoup-getting-href

table1 = soup.find_all('tbody')

# site root that is put in front of every href taken from the page
url = "https://basketball.realgm.com"

# url links to the player summary pages
urllist2020 = []

# first table, then second table: the first and second round players
for roundtable in (table1[0], table1[1]):

    # only a tags that carry an href are looked at
    for link in roundtable.find_all('a', href = True):

        # links that contain /player lead to a player page
        if "/player" in link['href']:
            urllist2020.append(url + link['href'])

In [46]:
# one player link per row is expected; the list length has to equal the number of rows of df2020
df2020['urlsummary'] = urllist2020

In [47]:
url = "https://basketball.realgm.com/nba/draft/past_drafts/2021" # RealGM draft page for 2021
r = requests.get(url)
r.raise_for_status() # stop on an HTTP error instead of failing on missing tables below
soup = BeautifulSoup(r.content, 'html.parser')
table1 = soup.find_all('tbody')

# site root that is put in front of every href taken from the page
url = "https://basketball.realgm.com"

# url links to the player summary pages
urllist2021 = []

# first table, then second table: the first and second round players
for roundtable in (table1[0], table1[1]):

    # only a tags that carry an href are looked at
    for link in roundtable.find_all('a', href = True):

        # links that contain /player lead to a player page
        if "/player" in link['href']:
            urllist2021.append(url + link['href'])

In [48]:
# one player link per row is expected; the list length has to equal the number of rows of df2021
df2021['urlsummary'] = urllist2021

In [49]:
url = "https://basketball.realgm.com/nba/draft/past_drafts/2022" # RealGM draft page for 2022
r = requests.get(url)
r.raise_for_status() # stop on an HTTP error instead of failing on missing tables below
soup = BeautifulSoup(r.content, 'html.parser')
table1 = soup.find_all('tbody')

# site root that is put in front of every href taken from the page
url = "https://basketball.realgm.com"

# url links to the player summary pages
urllist2022 = []

# first table, then second table: the first and second round players
for roundtable in (table1[0], table1[1]):

    # only a tags that carry an href are looked at
    for link in roundtable.find_all('a', href = True):

        # links that contain /player lead to a player page
        if "/player" in link['href']:
            urllist2022.append(url + link['href'])

In [50]:
# one player link per row is expected; the list length has to equal the number of rows of df
df['urlsummary'] = urllist2022

In [51]:
# Setting preteam type to college as majority of draft players played in NCAA
df['PreType'] = "College"

# Overseas team has brackets after the international team indicating country.
# regex=False treats "(" as a plain character, so no escaping is needed
# (the previous "\(" in a normal string is an invalid escape sequence).
isoverseas = df['PreTeam'].str.contains("(", regex=False)
df.loc[isoverseas, 'PreType'] = "Overseas"

# G League Players that got drafted came from NBA G League Ignite;
# a team already marked Overseas keeps that label
isgleague = df['PreTeam'].str.contains("NBA", regex=False) & ~isoverseas
df.loc[isgleague, 'PreType'] = "G League"

In [52]:
# Setting preteam type to college as majority of draft players played in NCAA
df2021['PreType'] = "College"

# Overseas team has brackets after the international team indicating country.
# regex=False treats "(" as a plain character, so no escaping is needed
# (the previous "\(" in a normal string is an invalid escape sequence).
isoverseas = df2021['PreTeam'].str.contains("(", regex=False)
df2021.loc[isoverseas, 'PreType'] = "Overseas"

# G League Players that got drafted came from NBA G League Ignite;
# a team already marked Overseas keeps that label
isgleague = df2021['PreTeam'].str.contains("NBA", regex=False) & ~isoverseas
df2021.loc[isgleague, 'PreType'] = "G League"

In [53]:
# Setting preteam type to college as majority of draft players played in NCAA
df2020['PreType'] = "College"

# Overseas team has brackets after the international team indicating country.
# regex=False treats "(" as a plain character, so no escaping is needed
# (the previous "\(" in a normal string is an invalid escape sequence).
isoverseas = df2020['PreTeam'].str.contains("(", regex=False)
df2020.loc[isoverseas, 'PreType'] = "Overseas"

# G League Players that got drafted came from NBA G League Ignite;
# a team already marked Overseas keeps that label.
# The label is written to df2020 (the earlier version of this cell wrote it to df2021).
isgleague = df2020['PreTeam'].str.contains("NBA", regex=False) & ~isoverseas
df2020.loc[isgleague, 'PreType'] = "G League"

In [54]:
# for url in df['urlsummary']:
#     r = requests.get(url) # send http request to url and saves response from server
#     soup = BeautifulSoup(r.content, 'html.parser') # represents document as nested data structure
#     table1 = soup.find_all('tbody') # find pre league team



In [85]:
########### Find common pattern to get stats

# summary page of a single player, used to work out how the stats tables are laid out
url = 'https://basketball.realgm.com/player/Ousmane-Dieng/Summary/148828'
r = requests.get(url)
r.raise_for_status() # stop on an HTTP error instead of parsing an error page
soup = BeautifulSoup(r.content, 'html.parser')
table1 = soup.find_all('tbody')

In [92]:
# inspired by https://stackoverflow.com/questions/67938081/use-beautifulsoup-to-find-a-table-after-a-header

# checking before running loop through all players

# table that follows the international per-game heading on the page parsed in
# the previous cell. It is looked up here so the cell runs on a fresh kernel.
# NOTE(review): the output recorded under this cell was produced with a
# `tableh2tag` left over from another run, so it may not belong to this page.
tableh2tag = soup.find(lambda tag: tag.name == "h2" and "International Regular Season Stats - Per Game" in tag.text).find_next("table")

checker = 0 # becomes 1 once the 2021-22 cell has been seen
numberofstats = 1 # printed values so far, plus one

for row in tableh2tag.find_all('td'):
    text = row.get_text()
    if text == '2021-22':
        checker = 1
    elif checker == 1:
        # cells with letters (team, league) are passed over; at most 21 values are printed
        if not any(c.isalpha() for c in text) and numberofstats <= 21:
            print(float(text))
            numberofstats = numberofstats + 1

25.0
19.0
27.4
13.92
4.68
12.12
0.386
1.92
6.24
0.308
2.64
3.32
0.795
0.52
3.44
3.96
2.28
0.6
0.04
1.8
2.72


In [112]:

# list to store all draft players stats
totalls = []

# h2 heading in front of the per game table, by pre-draft team type;
# every other type uses the G League heading
statheadings = {
    'College': "NCAA Season Stats - Per Game",
    'Overseas': "International Regular Season Stats - Per Game",
}
gleagueheading = "G League Full Season Stats - Per Game"

# looping through all draft players summary url together with their type
# (pairing the two columns replaces the per-url lookup that relied on the
# deprecated Series.bool())
for url, pretype in zip(df['urlsummary'], df['PreType']):

    # send http request to url and saves response from server
    r = requests.get(url)
    r.raise_for_status()

    # represents document as nested data structure
    soup = BeautifulSoup(r.content, 'html.parser')

    # finding the table containing stats per game under the respective h2 tag
    heading = statheadings.get(pretype, gleagueheading)
    headingtag = soup.find(lambda tag: tag.name == "h2" and heading in tag.text)
    if headingtag is None:
        raise ValueError(f"no '{heading}' section found at {url}")
    tableh2tag = headingtag.find_next("table")

    # list to store individual player stats in
    playerls = []

    # checker for finding the right year which is 2021-22, the year previous of the draft
    checker = 0

    # iterate through the table containing the tag td
    for row in tableh2tag.find_all('td'):
        text = row.get_text()

        # find the right year and change the checker to 1
        if text == '2021-22' or text == '2021-22 *':
            checker = 1

        # after the year cell, cells without letters are the stats; cells with
        # letters hold the team name and league
        elif checker == 1 and not any(c.isalpha() for c in text):

            # there was an issue where a player didn't play in college but still got drafted (Shaedon Sharpe)
            # thus his stats were a dash, so a dash is stored as 0
            playerls.append(0.0 if text == '-' else float(text))

            # the table has 21 stats per season, nothing after them is needed
            if len(playerls) == 21:
                break

    # add list of stats of one player to the total list
    totalls.append(playerls)
    

In [82]:
# Shaedon Sharpe didn't play a single game at Kentucky but still declared for the draft
# Still was considered as high lottery player

# Could study highschool stats to see if highschool translate to NBA

# totalls[6] = ['0']*21

In [97]:
import numpy as np

In [124]:
# 1. gather header names for stats

url = 'https://basketball.realgm.com/player/Paolo-Banchero/Summary/134150'
r = requests.get(url)
r.raise_for_status() # stop on an HTTP error instead of failing on a missing thead below
soup = BeautifulSoup(r.content, 'html.parser')
table1 = soup.find_all('thead')

# header cells of the first thead whose text is shorter than 4 characters
statheaderls = [
    header.get_text()
    for header in table1[0].find_all('th')
    if len(header.get_text()) < 4
]

statheaderls





['GP',
 'GS',
 'MIN',
 'PTS',
 'FGM',
 'FGA',
 'FG%',
 '3PM',
 '3PA',
 '3P%',
 'FTM',
 'FTA',
 'FT%',
 'OFF',
 'DEF',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF']

In [128]:
# 2. add column names to df
# 3. add array of stats to df

# every player needs exactly one value per header; name the offending
# positions here instead of failing on a ragged array
wronglength = [idx for idx, stats in enumerate(totalls) if len(stats) != len(statheaderls)]
assert not wronglength, f"players at positions {wronglength} do not have {len(statheaderls)} stats"

df[statheaderls] = np.array(totalls)

In [184]:
# to check for empty lists 

# ran into an issue with some overseas players that played in different leagues 
# issue was the year was 2021-22 * and not 2021-22 so added the condition to the for loop above

# positions in totalls of the players for whom no stat was collected
checkls = [position for position, stats in enumerate(totalls) if len(stats) == 0]
checkls

[51]

In [None]:
# issue with Loyola(MD): it is a college, but the brackets made the earlier
# PreType code label it Overseas, because overseas teams on this site are
# usually written as team(country). The type is set back to College by hand.
# NOTE(review): nothing is changed if no row has exactly this player name

df2021.loc[df2021['Player']=="Santi Aldama",'PreType'] = "College"

In [177]:
# list to store all draft players stats
totalls = []

# h2 heading in front of the per game table, by pre-draft team type;
# every other type uses the G League heading (worded differently from the 2022 loop)
statheadings = {
    'College': "NCAA Season Stats - Per Game",
    'Overseas': "International Regular Season Stats - Per Game",
}
gleagueheading = "G League Regular Season Stats - Per Game"

# looping through all draft players summary url together with their type
# (pairing the two columns replaces the per-url lookup that relied on the
# deprecated Series.bool())
for url, pretype in zip(df2021['urlsummary'], df2021['PreType']):

    # send http request to url and saves response from server
    r = requests.get(url)
    r.raise_for_status()

    # represents document as nested data structure
    soup = BeautifulSoup(r.content, 'html.parser')

    # finding the table containing stats per game under the respective h2 tag
    heading = statheadings.get(pretype, gleagueheading)
    headingtag = soup.find(lambda tag: tag.name == "h2" and heading in tag.text)
    if headingtag is None:
        raise ValueError(f"no '{heading}' section found at {url}")
    tableh2tag = headingtag.find_next("table")

    # list to store individual player stats in
    playerls = []

    # checker for finding the right year which is 2020-21, the year previous of the draft
    checker = 0

    # iterate through the table containing the tag td
    for row in tableh2tag.find_all('td'):
        text = row.get_text()

        # find the right year and change the checker to 1
        if text == '2020-21' or text == '2020-21 *':
            checker = 1

        # after the year cell, cells without letters are the stats; cells with
        # letters hold the team name and league
        elif checker == 1 and not any(c.isalpha() for c in text):

            # a dash stands for a missing stat and is stored as 0
            playerls.append(0.0 if text == '-' else float(text))

            # the table has 21 stats per season, nothing after them is needed
            if len(playerls) == 21:
                break

    # add list of stats of one player to the total list
    totalls.append(playerls)

# adding array of stats to the dataframe with the respective stat header names
df2021[statheaderls] = np.array(totalls)

In [182]:
# issue with IMG Academy(Florida): it is a prep school, but the brackets made
# the earlier PreType code label it Overseas, because overseas teams on this
# site are usually written as team(country). The type is set by hand.
# The stats loop below only knows College and Overseas, so this player goes
# through its G League branch.
# NOTE(review): nothing is changed if no row has exactly this player name

df2020.loc[df2020['Player']=="Kenyon Martin Jr",'PreType'] = "Prep School"

In [183]:
# list to store all draft players stats
totalls = []

# h2 heading in front of the per game table, by pre-draft team type;
# every other type uses the G League heading
statheadings = {
    'College': "NCAA Season Stats - Per Game",
    'Overseas': "International Regular Season Stats - Per Game",
}
gleagueheading = "G League Regular Season Stats - Per Game"

# looping through all draft players summary url together with their type
# (pairing the two columns replaces the per-url lookup that relied on the
# deprecated Series.bool())
for url, pretype in zip(df2020['urlsummary'], df2020['PreType']):

    # send http request to url and saves response from server
    r = requests.get(url)
    r.raise_for_status()

    # represents document as nested data structure
    soup = BeautifulSoup(r.content, 'html.parser')

    # finding the table containing stats per game under the respective h2 tag
    heading = statheadings.get(pretype, gleagueheading)
    headingtag = soup.find(lambda tag: tag.name == "h2" and heading in tag.text)
    if headingtag is None:
        raise ValueError(f"no '{heading}' section found at {url}")
    tableh2tag = headingtag.find_next("table")

    # list to store individual player stats in
    playerls = []

    # checker for finding the right year which is 2019-20, the year previous of the draft
    checker = 0

    # iterate through the table containing the tag td
    for row in tableh2tag.find_all('td'):
        text = row.get_text()

        # find the right year and change the checker to 1
        if text == '2019-20' or text == '2019-20 *':
            checker = 1

        # after the year cell, cells without letters are the stats; cells with
        # letters hold the team name and league
        elif checker == 1 and not any(c.isalpha() for c in text):

            # a dash stands for a missing stat and is stored as 0
            playerls.append(0.0 if text == '-' else float(text))

            # the table has 21 stats per season, nothing after them is needed
            if len(playerls) == 21:
                break

    # add list of stats of one player to the total list
    totalls.append(playerls)

# the stats are attached to df2020 further down, after the entry of the player
# without stats has been filled in

In [186]:
# KJ Martin played in IMG academy (prep school) so no stats were available
# The zeros are floats like every other entry: with the strings '0' here,
# np.array(totalls) would turn the whole table into text

totalls[51] = [0.0]*21

In [187]:
# adding array of stats to the dataframe with the respective stat header names
df2020[statheaderls] = np.array(totalls)


In [188]:
df2020

Unnamed: 0,Player,Round,DraftNumber,PreTeam,urlsummary,PreType,GP,GS,MIN,PTS,...,FTA,FT%,OFF,DEF,TRB,AST,STL,BLK,TOV,PF
0,Anthony Edwards,1,1,Georgia,https://basketball.realgm.com/player/Anthony-E...,College,32.0,31.0,33.0,19.06,...,5.34,0.772,0.75,4.47,5.22,2.84,1.34,0.56,2.72,2.19
1,James Wiseman,1,2,Memphis,https://basketball.realgm.com/player/James-Wis...,College,3.0,3.0,23.0,19.67,...,9.0,0.704,4.33,6.33,10.67,0.33,0.33,3.0,1.0,1.67
2,LaMelo Ball,1,3,Illawarra (Australia),https://basketball.realgm.com/player/LaMelo-Ba...,Overseas,13.0,13.0,31.3,17.15,...,3.85,0.7,2.0,5.85,7.85,6.77,1.54,0.15,2.54,2.62
3,Patrick Williams,1,4,Florida State,https://basketball.realgm.com/player/Patrick-W...,College,29.0,0.0,22.5,9.24,...,2.55,0.838,1.31,2.69,4.0,1.0,1.0,1.03,1.72,1.62
4,Isaac Okoro,1,5,Auburn,https://basketball.realgm.com/player/Isaac-Oko...,College,28.0,28.0,31.5,12.82,...,4.71,0.674,1.89,2.54,4.43,2.04,0.93,0.89,1.96,2.71
5,Onyeka Okongwu,1,6,USC,https://basketball.realgm.com/player/Onyeka-Ok...,College,28.0,28.0,30.6,16.21,...,5.11,0.72,3.29,5.36,8.64,1.07,1.21,2.71,2.0,2.68
6,Killian Hayes,1,7,Ratiopharm Ulm (Germany),https://basketball.realgm.com/player/Killian-H...,Overseas,33.0,33.0,24.8,11.58,...,2.7,0.876,0.36,2.45,2.82,5.39,1.45,0.27,3.21,2.97
7,Obi Toppin,1,8,Dayton,https://basketball.realgm.com/player/Obi-Toppi...,College,31.0,31.0,31.6,20.03,...,4.55,0.702,1.19,6.35,7.55,2.16,0.97,1.23,2.19,1.65
8,Deni Avdija,1,9,Maccabi Tel Aviv U18 (Israel),https://basketball.realgm.com/player/Deni-Avdi...,Overseas,59.0,26.0,21.7,9.02,...,1.93,0.588,0.56,4.1,4.66,2.0,0.68,0.66,1.58,2.07
9,Jalen Smith,1,10,Maryland,https://basketball.realgm.com/player/Jalen-Smi...,College,31.0,31.0,31.2,15.45,...,4.77,0.75,3.19,7.32,10.52,0.81,0.71,2.35,1.71,2.35


In [191]:
# to_csv does not create missing folders, so make sure data/ exists before writing
Path('data').mkdir(parents=True, exist_ok=True)
df.to_csv('data/df2022.csv')

In [192]:
# write the 2021 table (index included) to the data folder
df2021.to_csv('data/df2021.csv')

In [193]:
# write the 2020 table (index included) to the data folder
df2020.to_csv('data/df2020.csv')