In [1]:
import os
import re
import time
import warnings
import zipfile
from io import StringIO
from urllib.request import urlopen

import html5lib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from selenium import webdriver

%matplotlib inline
warnings.filterwarnings("ignore")
# Notebook-wide seaborn appearance, applied once before any figure is drawn.
seaborn_theme = {
    "style": "whitegrid",
    "color_codes": True,
    "font_scale": 1.5,
}
sns.set(**seaborn_theme)

In [2]:
# Season-ending years whose league pages are scraped (inclusive on both ends).
first_season_end, last_season_end = 1980, 2024
years = range(first_season_end, last_season_end + 1)

## Per Game Stat Scraping

In [3]:
# Map "url<year>" -> per-game league page URL for every year in `years`.
pg_frames = {
    f"url{year}": f"https://www.basketball-reference.com/leagues/NBA_{year}_per_game.html"
    for year in years
}

In [18]:
# Download every per-game league page listed in `pg_frames` and store one
# DataFrame per season in `pl_pg_frames`, keyed "pl<season-ending year>".
# Each frame gains two columns:
#   "Player Link" - the href of every anchor in the table containing "players"
#   "Season"      - a label such as "1979-80" or "1999-00"
requests_per_batch = 20   # a pause is inserted before the last request of each batch
pause_seconds = 60        # presumably keeps the scraper under the site's rate limit - confirm
pl_pg_frames = {}

for request_index, url in enumerate(pg_frames.values()):
    if request_index % requests_per_batch == requests_per_batch - 1:
        time.sleep(pause_seconds)

    # Take the season from the URL being fetched rather than from separate
    # counters, so the label cannot drift from the page and no season is
    # silently skipped when `years` is extended.
    year_match = re.search(r"NBA_(\d{4})_", url)
    if year_match is None:
        raise ValueError(f"cannot determine the season year from URL: {url}")
    season_end = int(year_match.group(1))
    season_label = f"{season_end - 1}-{season_end % 100:02d}"

    # The context manager closes the HTTP response once the page is parsed.
    with urlopen(url) as page_html:
        soup = BeautifulSoup(page_html, "html.parser")
    stats_table = soup.find("table")  # first table on the page

    # read_html expects a file-like object; literal HTML strings are deprecated.
    df = pd.read_html(StringIO(str(stats_table)))[0]
    df = df.drop(len(df) - 1, axis=0)  # discard the final row of the table

    # Keep only the anchors that point at a player page; anchors without an
    # href are treated as non-matching instead of breaking the boolean mask.
    all_links = pd.Series([a.get("href") for a in stats_table.find_all("a")], dtype="object")
    player_links = all_links[all_links.str.contains(r"players", na=False)].to_numpy()

    # Raises ValueError if the number of player links differs from the row count.
    df["Player Link"] = player_links
    df["Season"] = season_label
    pl_pg_frames[f"pl{season_end}"] = df

In [19]:
# Stack the per-season frames, remove exact duplicate rows and renumber rows 0..n-1.
PG_DF = (
    pd.concat(pl_pg_frames)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [20]:
# Write the per-game table to disk (index column included, UTF-8).
# The path is handed to `to_csv` directly: pandas then opens the file itself
# and controls line endings, whereas writing the string returned by
# `to_csv()` through a text-mode handle can double the carriage returns on
# Windows and builds the whole CSV in memory first.
PG_DF.to_csv("pergamestats.csv", encoding="utf-8")

## Advanced Stat Scraping

In [21]:
# Map "url<year>" -> advanced-stats league page URL for every year in `years`.
adv_frames = {
    f"url{year}": f"https://www.basketball-reference.com/leagues/NBA_{year}_advanced.html"
    for year in years
}

In [22]:
# Download every advanced-stats league page listed in `adv_frames` and store
# one DataFrame per season in `adv_pl_frames`, keyed "pl<season-ending year>".
# Each frame gains two columns:
#   "Player Link" - the href of every anchor in the table containing "players"
#   "Season"      - a label such as "1979-80" or "1999-00"
requests_per_batch = 20   # a pause is inserted before the last request of each batch
pause_seconds = 60        # presumably keeps the scraper under the site's rate limit - confirm
adv_pl_frames = {}

for request_index, url in enumerate(adv_frames.values()):
    if request_index % requests_per_batch == requests_per_batch - 1:
        time.sleep(pause_seconds)

    # Take the season from the URL being fetched rather than from separate
    # counters, so the label cannot drift from the page and no season is
    # silently skipped when `years` is extended.
    year_match = re.search(r"NBA_(\d{4})_", url)
    if year_match is None:
        raise ValueError(f"cannot determine the season year from URL: {url}")
    season_end = int(year_match.group(1))
    season_label = f"{season_end - 1}-{season_end % 100:02d}"

    # The context manager closes the HTTP response once the page is parsed.
    with urlopen(url) as page_html:
        soup = BeautifulSoup(page_html, "html.parser")
    stats_table = soup.find("table")  # first table on the page

    # read_html expects a file-like object; literal HTML strings are deprecated.
    df = pd.read_html(StringIO(str(stats_table)))[0]
    # Remove rows that merely repeat the header; .copy() makes the result an
    # independent frame so the column assignments below are unambiguous.
    df = df[df["Player"] != "Player"].copy()

    # Keep only the anchors that point at a player page; anchors without an
    # href are treated as non-matching instead of breaking the boolean mask.
    all_links = pd.Series([a.get("href") for a in stats_table.find_all("a")], dtype="object")
    player_links = all_links[all_links.str.contains(r"players", na=False)].to_numpy()

    # Raises ValueError if the number of player links differs from the row count.
    df["Player Link"] = player_links
    df["Season"] = season_label
    adv_pl_frames[f"pl{season_end}"] = df

In [23]:
# Stack the per-season frames, remove exact duplicate rows and renumber rows 0..n-1.
Advanced_DF = (
    pd.concat(adv_pl_frames)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [24]:
# Write the advanced-stats table to disk (index column included, UTF-8).
# The path is handed to `to_csv` directly: pandas then opens the file itself
# and controls line endings, whereas writing the string returned by
# `to_csv()` through a text-mode handle can double the carriage returns on
# Windows and builds the whole CSV in memory first.
Advanced_DF.to_csv("advancedstats.csv", encoding="utf-8")

## Salary Tables Scraping
### Split into parts in order to save progress as this is a long process

In [66]:
# Sorted, de-duplicated player-page links; these drive the salary scraping below.
# NOTE(review): `NBA_Player_DF` is not created in any cell visible in this
# notebook, so this line fails on a fresh kernel unless it is loaded elsewhere - confirm its origin.
# NOTE(review): the column read here is "Player Links", while the scraping cells
# write a column named "Player Link" - verify which name the frame really has.
all_unique_players = np.unique(np.array(NBA_Player_DF["Player Links"]))

In [67]:
# Cut the player list into work batches: six slices of 500 links each, plus a
# seventh slice holding everything from index 3000 onward.
chunk_size = 500
part1, part2, part3, part4, part5, part6 = (
    all_unique_players[lo:lo + chunk_size]
    for lo in range(0, 6 * chunk_size, chunk_size)
)
part7 = all_unique_players[6 * chunk_size:]

In [80]:
# Start a Selenium-controlled Chrome session. The salary-scraping cells below
# all reuse this one `driver`; the last of them calls `driver.quit()`.
driver = webdriver.Chrome()

In [18]:
# Visit the page of every player in `part1` and collect the salary table
# (element id "all_salaries") into `player_salaries1`, keyed by player link.
# Players whose page has no such table get a single all-None placeholder row,
# so every link is still represented in the result.
player_salaries1 = {}
start = time.time()

for player_link in part1:
    full_player_link = "https://www.basketball-reference.com" + player_link
    driver.get(full_player_link)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    salary_table = soup.find("table", {"id": "all_salaries"})

    if salary_table is None:
        salary_df = pd.DataFrame({"Season": [None],
                                  "Team": [None],
                                  "Lg": [None],
                                  "Salary": [None],
                                  "Player Link": [player_link]})
    else:
        # read_html expects a file-like object; literal HTML strings are deprecated.
        salary_df = pd.read_html(StringIO(str(salary_table)))[0]
        salary_df = salary_df.drop(len(salary_df) - 1, axis=0)  # removing career total salary row
        salary_df["Player Link"] = player_link

    player_salaries1[player_link] = salary_df

end = time.time()

# Elapsed wall-clock seconds for this batch.
print(end - start)

2529.4333233833313


In [20]:
# Stack the per-player frames, remove exact duplicate rows and renumber rows 0..n-1.
Salary_DF1 = (
    pd.concat(player_salaries1)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [25]:
# Checkpoint batch 1 to disk (index column included).
# The path is handed to `to_csv` directly: pandas opens the file itself, writes
# UTF-8 (the encoding `pd.read_csv` assumes by default) and controls line
# endings. Writing the string from `to_csv()` through a text-mode handle used
# the platform's default encoding and can double carriage returns on Windows.
Salary_DF1.to_csv("salaries1.csv")

In [6]:
# Visit the page of every player in `part2` and collect the salary table
# (element id "all_salaries") into `player_salaries2`, keyed by player link.
# Players whose page has no such table get a single all-None placeholder row,
# so every link is still represented in the result.
player_salaries2 = {}
start = time.time()

for x, player_link in enumerate(part2):
    if x % 10 == 0:
        # Heartbeat every tenth player so a long run is visibly alive.
        print("still going... " + str(x // 10))

    full_player_link = "https://www.basketball-reference.com" + player_link
    driver.get(full_player_link)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    salary_table = soup.find("table", {"id": "all_salaries"})

    if salary_table is None:
        salary_df = pd.DataFrame({"Season": [None],
                                  "Team": [None],
                                  "Lg": [None],
                                  "Salary": [None],
                                  "Player Link": [player_link]})
    else:
        # read_html expects a file-like object; literal HTML strings are deprecated.
        salary_df = pd.read_html(StringIO(str(salary_table)))[0]
        salary_df = salary_df.drop(len(salary_df) - 1, axis=0)  # removing career total salary row
        salary_df["Player Link"] = player_link

    player_salaries2[player_link] = salary_df

end = time.time()

# Elapsed wall-clock seconds for this batch.
print(end - start)

still going... 0
still going... 1
still going... 2
still going... 3
still going... 4
still going... 5
still going... 6
still going... 7
still going... 8
still going... 9
still going... 10
still going... 11
still going... 12
still going... 13
still going... 14
still going... 15
still going... 16
still going... 17
still going... 18
still going... 19
still going... 20
still going... 21
still going... 22
still going... 23
still going... 24
still going... 25
still going... 26
still going... 27
still going... 28
still going... 29
still going... 30
still going... 31
still going... 32
still going... 33
still going... 34
still going... 35
still going... 36
still going... 37
still going... 38
still going... 39
still going... 40
still going... 41
still going... 42
still going... 43
still going... 44
still going... 45
still going... 46
still going... 47
still going... 48
still going... 49
1438.8448770046234


In [7]:
# Stack the per-player frames, remove exact duplicate rows and renumber rows 0..n-1.
Salary_DF2 = pd.concat(player_salaries2).drop_duplicates().reset_index(drop=True)

# Checkpoint batch 2 to disk (index column included).
# The path is handed to `to_csv` directly: pandas opens the file itself, writes
# UTF-8 (the encoding `pd.read_csv` assumes by default) and controls line
# endings. Writing the string from `to_csv()` through a text-mode handle used
# the platform's default encoding and can double carriage returns on Windows.
Salary_DF2.to_csv("salaries2.csv")

In [9]:
# Visit the page of every player in `part3` and collect the salary table
# (element id "all_salaries") into `player_salaries3`, keyed by player link.
# Players whose page has no such table get a single all-None placeholder row,
# so every link is still represented in the result.
player_salaries3 = {}
start = time.time()

for x, player_link in enumerate(part3):
    if x % 10 == 0:
        # Heartbeat every tenth player so a long run is visibly alive.
        print("still going... " + str(x // 10))

    full_player_link = "https://www.basketball-reference.com" + player_link
    driver.get(full_player_link)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    salary_table = soup.find("table", {"id": "all_salaries"})

    if salary_table is None:
        salary_df = pd.DataFrame({"Season": [None],
                                  "Team": [None],
                                  "Lg": [None],
                                  "Salary": [None],
                                  "Player Link": [player_link]})
    else:
        # read_html expects a file-like object; literal HTML strings are deprecated.
        salary_df = pd.read_html(StringIO(str(salary_table)))[0]
        salary_df = salary_df.drop(len(salary_df) - 1, axis=0)  # removing career total salary row
        salary_df["Player Link"] = player_link

    player_salaries3[player_link] = salary_df

end = time.time()

# Elapsed wall-clock seconds for this batch.
print(end - start)

still going... 0
still going... 1
still going... 2
still going... 3
still going... 4
still going... 5
still going... 6
still going... 7
still going... 8
still going... 9
still going... 10
still going... 11
still going... 12
still going... 13
still going... 14
still going... 15
still going... 16
still going... 17
still going... 18
still going... 19
still going... 20
still going... 21
still going... 22
still going... 23
still going... 24
still going... 25
still going... 26
still going... 27
still going... 28
still going... 29
still going... 30
still going... 31
still going... 32
still going... 33
still going... 34
still going... 35
still going... 36
still going... 37
still going... 38
still going... 39
still going... 40
still going... 41
still going... 42
still going... 43
still going... 44
still going... 45
still going... 46
still going... 47
still going... 48
still going... 49
1426.0486125946045


In [10]:
# Stack the per-player frames, remove exact duplicate rows and renumber rows 0..n-1.
Salary_DF3 = pd.concat(player_salaries3).drop_duplicates().reset_index(drop=True)

# Checkpoint batch 3 to disk (index column included).
# The path is handed to `to_csv` directly: pandas opens the file itself, writes
# UTF-8 (the encoding `pd.read_csv` assumes by default) and controls line
# endings. Writing the string from `to_csv()` through a text-mode handle used
# the platform's default encoding and can double carriage returns on Windows.
Salary_DF3.to_csv("salaries3.csv")

In [21]:
# Show the batch-3 salary frame as a quick visual check.
# NOTE(review): the saved output includes an "Unnamed: 0" column, which suggests
# the frame displayed was one reloaded from CSV rather than the freshly built one - confirm.
Salary_DF3

Unnamed: 0,Season,Team,Lg,Salary,Player Link
0,1985-86,Milwaukee Bucks,NBA,"$145,000",/players/f/fieldke01.html
1,2010-11,New York Knicks,NBA,"$473,604",/players/f/fieldla01.html
2,2011-12,New York Knicks,NBA,"$762,195",/players/f/fieldla01.html
3,2012-13,Toronto Raptors,NBA,"$6,250,000",/players/f/fieldla01.html
4,2013-14,Toronto Raptors,NBA,"$6,250,000",/players/f/fieldla01.html
...,...,...,...,...,...
2588,2005-06,Detroit Pistons,NBA,"$1,760,000",/players/h/hunteli01.html
2589,2006-07,Detroit Pistons,NBA,"$2,250,000",/players/h/hunteli01.html
2590,2007-08,Detroit Pistons,NBA,"$2,250,000",/players/h/hunteli01.html
2591,2008-09,Chicago Bulls,NBA,"$1,143,473",/players/h/hunteli01.html


In [47]:
# Visit the page of every player in `part4` and collect the salary table
# (element id "all_salaries") into `player_salaries4`, keyed by player link.
# Players whose page has no such table get a single all-None placeholder row,
# so every link is still represented in the result.
player_salaries4 = {}
start = time.time()

# `x` counts players starting from 1 in this batch.
for x, player_link in enumerate(part4, start=1):
    if x % 10 == 0:
        # Heartbeat every tenth player; the first message is numbered 0.
        print("still going... " + str(x // 10 - 1))
    if x % 15 == 0:
        # Pause before every fifteenth request (presumably to avoid being throttled - confirm).
        time.sleep(30)

    full_player_link = "https://www.basketball-reference.com" + player_link
    driver.get(full_player_link)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    salary_table = soup.find("table", {"id": "all_salaries"})

    if salary_table is None:
        salary_df = pd.DataFrame({"Season": [None],
                                  "Team": [None],
                                  "Lg": [None],
                                  "Salary": [None],
                                  "Player Link": [player_link]})
    else:
        # read_html expects a file-like object; literal HTML strings are deprecated.
        salary_df = pd.read_html(StringIO(str(salary_table)))[0]
        salary_df = salary_df.drop(len(salary_df) - 1, axis=0)  # removing career total salary row
        salary_df["Player Link"] = player_link

    player_salaries4[player_link] = salary_df

end = time.time()

# Elapsed wall-clock seconds for this batch (pauses included).
print(end - start)

still going... 0
still going... 1
still going... 2
still going... 3
still going... 4
still going... 5
still going... 6
still going... 7
still going... 8
still going... 9
still going... 10
still going... 11
still going... 12
still going... 13
still going... 14
still going... 15
still going... 16
still going... 17
still going... 18
still going... 19
still going... 20
still going... 21
still going... 22
still going... 23
still going... 24
still going... 25
still going... 26
still going... 27
still going... 28
still going... 29
still going... 30
still going... 31
still going... 32
still going... 33
still going... 34
still going... 35
still going... 36
still going... 37
still going... 38
still going... 39
still going... 40
still going... 41
still going... 42
still going... 43
still going... 44
still going... 45
still going... 46
still going... 47
still going... 48
still going... 49
2405.063406944275


In [48]:
# Stack the per-player frames, remove exact duplicate rows and renumber rows 0..n-1.
Salary_DF4 = pd.concat(player_salaries4).drop_duplicates().reset_index(drop=True)

# Checkpoint batch 4 to disk (index column included).
# The path is handed to `to_csv` directly: pandas opens the file itself, writes
# UTF-8 (the encoding `pd.read_csv` assumes by default) and controls line
# endings. Writing the string from `to_csv()` through a text-mode handle used
# the platform's default encoding and can double carriage returns on Windows.
Salary_DF4.to_csv("salaries4.csv")

In [58]:
# Visit the page of every player in `part5` and collect the salary table
# (element id "all_salaries") into `player_salaries5`, keyed by player link.
# Players whose page has no such table get a single all-None placeholder row,
# so every link is still represented in the result.
player_salaries5 = {}
start = time.time()

for x, player_link in enumerate(part5):
    if x % 10 == 0:
        # Heartbeat every tenth player so a long run is visibly alive.
        print("still going... " + str(x // 10))
    if x % 12 == 0:
        # Pause before every twelfth request, including the very first one
        # (presumably to avoid being throttled - confirm).
        time.sleep(30)

    full_player_link = "https://www.basketball-reference.com" + player_link
    driver.get(full_player_link)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    salary_table = soup.find("table", {"id": "all_salaries"})

    if salary_table is None:
        salary_df = pd.DataFrame({"Season": [None],
                                  "Team": [None],
                                  "Lg": [None],
                                  "Salary": [None],
                                  "Player Link": [player_link]})
    else:
        # read_html expects a file-like object; literal HTML strings are deprecated.
        salary_df = pd.read_html(StringIO(str(salary_table)))[0]
        salary_df = salary_df.drop(len(salary_df) - 1, axis=0)  # removing career total salary row
        salary_df["Player Link"] = player_link

    player_salaries5[player_link] = salary_df

end = time.time()

# Elapsed wall-clock seconds for this batch (pauses included).
print(end - start)

still going... 0
still going... 1
still going... 2
still going... 3
still going... 4
still going... 5
still going... 6
still going... 7
still going... 8
still going... 9
still going... 10
still going... 11
still going... 12
still going... 13
still going... 14
still going... 15
still going... 16
still going... 17
still going... 18
still going... 19
still going... 20
still going... 21
still going... 22
still going... 23
still going... 24
still going... 25
still going... 26
still going... 27
still going... 28
still going... 29
still going... 30
still going... 31
still going... 32
still going... 33
still going... 34
still going... 35
still going... 36
still going... 37
still going... 38
still going... 39
still going... 40
still going... 41
still going... 42
still going... 43
still going... 44
still going... 45
still going... 46
still going... 47
still going... 48
still going... 49
3560.4429562091827


In [59]:
# Stack the per-player frames, remove exact duplicate rows and renumber rows 0..n-1.
Salary_DF5 = pd.concat(player_salaries5).drop_duplicates().reset_index(drop=True)

# Checkpoint batch 5 to disk (index column included).
# The path is handed to `to_csv` directly: pandas opens the file itself, writes
# UTF-8 (the encoding `pd.read_csv` assumes by default) and controls line
# endings. Writing the string from `to_csv()` through a text-mode handle used
# the platform's default encoding and can double carriage returns on Windows.
Salary_DF5.to_csv("salaries5.csv")

In [74]:
# Visit the page of every player in `part6` and collect the salary table
# (element id "all_salaries") into `player_salaries6`, keyed by player link.
# Players whose page has no such table get a single all-None placeholder row,
# so every link is still represented in the result.
player_salaries6 = {}
start = time.time()

for x, player_link in enumerate(part6):
    if x % 10 == 0:
        # Heartbeat every tenth player so a long run is visibly alive.
        print("still going... " + str(x // 10))
    if x % 13 == 0:
        # Pause before every thirteenth request, including the very first one
        # (presumably to avoid being throttled - confirm).
        time.sleep(30)

    full_player_link = "https://www.basketball-reference.com" + player_link
    driver.get(full_player_link)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    salary_table = soup.find("table", {"id": "all_salaries"})

    if salary_table is None:
        salary_df = pd.DataFrame({"Season": [None],
                                  "Team": [None],
                                  "Lg": [None],
                                  "Salary": [None],
                                  "Player Link": [player_link]})
    else:
        # read_html expects a file-like object; literal HTML strings are deprecated.
        salary_df = pd.read_html(StringIO(str(salary_table)))[0]
        salary_df = salary_df.drop(len(salary_df) - 1, axis=0)  # removing career total salary row
        salary_df["Player Link"] = player_link

    player_salaries6[player_link] = salary_df

end = time.time()

# Elapsed wall-clock seconds for this batch (pauses included).
print(end - start)

still going... 0
still going... 1
still going... 2
still going... 3
still going... 4
still going... 5
still going... 6
still going... 7
still going... 8
still going... 9
still going... 10
still going... 11
still going... 12
still going... 13
still going... 14
still going... 15
still going... 16
still going... 17
still going... 18
still going... 19
still going... 20
still going... 21
still going... 22
still going... 23
still going... 24
still going... 25
still going... 26
still going... 27
still going... 28
still going... 29
still going... 30
still going... 31
still going... 32
still going... 33
still going... 34
still going... 35
still going... 36
still going... 37
still going... 38
still going... 39
still going... 40
still going... 41
still going... 42
still going... 43
still going... 44
still going... 45
still going... 46
still going... 47
still going... 48
still going... 49
2558.6829097270966


In [75]:
# Stack the per-player frames, remove exact duplicate rows and renumber rows 0..n-1.
Salary_DF6 = pd.concat(player_salaries6).drop_duplicates().reset_index(drop=True)

# Checkpoint batch 6 to disk (index column included).
# The path is handed to `to_csv` directly: pandas opens the file itself, writes
# UTF-8 (the encoding `pd.read_csv` assumes by default) and controls line
# endings. Writing the string from `to_csv()` through a text-mode handle used
# the platform's default encoding and can double carriage returns on Windows.
Salary_DF6.to_csv("salaries6.csv")

In [83]:
# Visit the page of every player in `part7` (the final batch) and collect the
# salary table (element id "all_salaries") into `player_salaries7`, keyed by
# player link. Players whose page has no such table get a single all-None
# placeholder row, so every link is still represented in the result.
player_salaries7 = {}
start = time.time()

for x, player_link in enumerate(part7):
    if x % 10 == 0:
        # Heartbeat every tenth player so a long run is visibly alive.
        print("still going... " + str(x // 10))
    if x % 12 == 0:
        # Pause before every twelfth request, including the very first one
        # (presumably to avoid being throttled - confirm).
        time.sleep(30)

    full_player_link = "https://www.basketball-reference.com" + player_link
    driver.get(full_player_link)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    salary_table = soup.find("table", {"id": "all_salaries"})

    if salary_table is None:
        salary_df = pd.DataFrame({"Season": [None],
                                  "Team": [None],
                                  "Lg": [None],
                                  "Salary": [None],
                                  "Player Link": [player_link]})
    else:
        # read_html expects a file-like object; literal HTML strings are deprecated.
        salary_df = pd.read_html(StringIO(str(salary_table)))[0]
        salary_df = salary_df.drop(len(salary_df) - 1, axis=0)  # removing career total salary row
        salary_df["Player Link"] = player_link

    player_salaries7[player_link] = salary_df

end = time.time()

# Last batch: shut the browser session down once scraping has finished.
driver.quit()
# Elapsed wall-clock seconds for this batch (pauses included).
print(end - start)

still going... 0
still going... 1
still going... 2
still going... 3
still going... 4
still going... 5
still going... 6
still going... 7
still going... 8
still going... 9
still going... 10
still going... 11
still going... 12
still going... 13
still going... 14
still going... 15
still going... 16
still going... 17
still going... 18
still going... 19
still going... 20
still going... 21
still going... 22
still going... 23
still going... 24
still going... 25
still going... 26
still going... 27
still going... 28
still going... 29
still going... 30
still going... 31
still going... 32
still going... 33
still going... 34
still going... 35
still going... 36
still going... 37
still going... 38
still going... 39
still going... 40
still going... 41
still going... 42
still going... 43
still going... 44
still going... 45
still going... 46
still going... 47
still going... 48
still going... 49
still going... 50
still going... 51
still going... 52
still going... 53
still going... 54
still going... 55
st

In [84]:
# Stack the per-player frames, remove exact duplicate rows and renumber rows 0..n-1.
Salary_DF7 = pd.concat(player_salaries7).drop_duplicates().reset_index(drop=True)

# Checkpoint batch 7 to disk (index column included).
# The path is handed to `to_csv` directly: pandas opens the file itself, writes
# UTF-8 (the encoding `pd.read_csv` assumes by default) and controls line
# endings. Writing the string from `to_csv()` through a text-mode handle used
# the platform's default encoding and can double carriage returns on Windows.
Salary_DF7.to_csv("salaries7.csv")

In [10]:
# Reload the seven salary checkpoints from disk, in batch order.
salary_csv_paths = (
    "salaries1.csv",
    "salaries2.csv",
    "salaries3.csv",
    "salaries4.csv",
    "salaries5.csv",
    "salaries6.csv",
    "salaries7.csv",
)
(Salary_DF1, Salary_DF2, Salary_DF3, Salary_DF4,
 Salary_DF5, Salary_DF6, Salary_DF7) = [pd.read_csv(path) for path in salary_csv_paths]

In [11]:
# Combine the seven salary batches into one frame; each batch keeps its own row labels.
salary_batches = [
    Salary_DF1,
    Salary_DF2,
    Salary_DF3,
    Salary_DF4,
    Salary_DF5,
    Salary_DF6,
    Salary_DF7,
]
Salary_DF = pd.concat(salary_batches)

In [13]:
# Write the combined salary table to disk (index column included).
# The path is handed to `to_csv` directly: pandas opens the file itself, writes
# UTF-8 and controls line endings. Writing the string from `to_csv()` through a
# text-mode handle used the platform's default encoding and can double
# carriage returns on Windows.
Salary_DF.to_csv("allsalaries.csv")