In [1]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup

In [2]:
from time import sleep
from random import randint

## Scraping the data with BeautifulSoup library

### Collecting all the stats of each player in each season from 1990 to 2021

In [3]:
array_of_df = []

#Collecting data of each year from 1990 to 2021.

for Year in range(1990, 2022): 
    sleep (randint(2,4))
    
    #Get the url of the specific year.
    URL = f'https://www.basketball-reference.com/leagues/NBA_{Year}_totals.html'
    response = requests.get(URL)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    #Get table from page
    tbl = soup.find_all("table",id = "totals_stats")[0]
    #Get headers from table
    headers = [th.getText() for th in tbl.thead.findAll('th')]
    #Remove Rank column its not needed
    headers = headers[1:]
    #Get rows without first header row
    rows = soup.findAll('tr')[1:]
    #Get all cells
    player_stats = [[td.getText() for td in rows[i].findAll('td')] for i in range(len(rows))]
    
    #Create dataframe from data
    df = pd.DataFrame(player_stats, columns = headers)
    
    #Insert the Year columm to the DF.
    df.insert(0,'Year',Year)
    
    #Drop duplicated players in the same season, keep the one's with the most games played.
    df.drop_duplicates(subset=['Player'], keep = 'first', inplace = True)
    
    #insert this years dataframe to the the array of dataframes.
    array_of_df.insert(0,df)

In [4]:
#Combine all data frames of each year to 1 big dataframe.
stats_df = pd.concat(array_of_df)
stats_df

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,2021,Precious Achiuwa,PF,21,MIA,61,4,737,124,228,...,.509,73,135,208,29,20,28,43,91,304
1,2021,Jaylen Adams,PG,24,MIL,7,0,18,1,8,...,,0,3,3,2,0,0,0,1,2
2,2021,Steven Adams,C,27,NOP,58,58,1605,189,308,...,.444,213,301,514,111,54,38,78,113,438
3,2021,Bam Adebayo,C,23,MIA,64,64,2143,456,800,...,.799,142,431,573,346,75,66,169,145,1197
4,2021,LaMarcus Aldridge,C,35,TOT,26,23,674,140,296,...,.872,19,99,118,49,11,29,27,47,352
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
472,1990,Orlando Woolridge,SF,30,LAL,62,2,1421,306,550,...,.733,49,136,185,96,39,46,73,160,788
473,1990,Haywoode Workman,PG,24,ATL,6,0,16,2,3,...,1.000,0,3,3,2,3,0,0,3,6
474,1990,James Worthy*,SF,28,LAL,80,80,2960,711,1298,...,.782,160,318,478,288,99,49,160,190,1685
475,1990,Danny Young,PG,27,POR,82,8,1393,138,328,...,.813,29,93,122,231,82,4,80,84,383


### Collecting the MVP's of each season from 1990 to 2021

In [5]:
    #Get table of MVP'S in the last 30 years
    
    URL2 = f'https://www.basketball-reference.com/awards/mvp.html'
    response2 = requests.get(URL2)
    soup2 = BeautifulSoup(response2.content, 'html.parser')
    
    #Get table from page
    tbl2 = soup2.find_all("table",id = "mvp_NBA")[0]
    
    #Get headers from table
    headers2 = [th.getText() for th in tbl2.thead.findAll('th')][6:]
    
    #Get rows without first header row
    rows2 = soup2.findAll('tr')[2:]
    
    #Get all cells
    player_stats2 = [[td.getText() for td in rows2[i].findAll('td')] for i in range(len(rows2))]
    
    #Create dataframe from data
    mvp_df = pd.DataFrame(player_stats2, columns = headers2)
    
    mvp_df
    

Unnamed: 0,Lg,Player,Voting,Age,Tm,G,MP,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48
0,NBA,Nikola Jokić,(V),25,DEN,72,34.6,26.4,10.8,8.3,1.3,0.7,.566,.388,.868,15.6,.301
1,NBA,Giannis Antetokounmpo,(V),25,MIL,63,30.4,29.5,13.6,5.6,1.0,1.0,.553,.304,.633,11.1,.279
2,NBA,Giannis Antetokounmpo,(V),24,MIL,72,32.8,27.7,12.5,5.9,1.3,1.5,.578,.256,.729,14.4,.292
3,NBA,James Harden,(V),28,HOU,72,35.4,30.4,5.4,8.8,1.8,0.7,.449,.367,.858,15.4,.289
4,NBA,Russell Westbrook,(V),28,OKC,81,34.6,31.6,10.7,10.4,1.6,0.4,.425,.343,.845,13.1,.224
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,NBA,1,,,,,,,,,,,,,,,
117,NBA,1,,,,,,,,,,,,,,,
118,NBA,1,,,,,,,,,,,,,,,
119,NBA,1,,,,,,,,,,,,,,,


### Collecting the Hollinger theory on the meaning of the PER value

In [6]:
    #Get table of MVP'S in the last 30 years
    
    URL3 = f'https://en.wikipedia.org/wiki/Player_efficiency_rating'
    response3 = requests.get(URL3)
    soup3 = BeautifulSoup(response3.content, 'html.parser')
    
    #Get table from page
    tbl3 = soup3.find_all("table",class_ = "wikitable")[0]
    
    #Get rows 
    rows3 = tbl3.findAll('tr')
    
    #Get all cells
    player_stats3 = [[td.getText() for td in rows3[i].findAll('td')] for i in range(len(rows3))]
    
    #Create dataframe from data
    hollinger_theory = pd.DataFrame(player_stats3, columns = ['Reference guide','PER'])
    
    #Clean table from char "\n"
    hollinger_theory = hollinger_theory.replace('\n','', regex=True)
    
    hollinger_theory

Unnamed: 0,Reference guide,PER
0,All-time great season,35.0+
1,Runaway MVP candidate,30.0–35.0
2,Strong MVP candidate,27.5–30.0
3,Weak MVP candidate,25.0–27.5
4,Definite All-Star,22.5–25.0
5,Borderline All-Star,20.0–22.5
6,Second offensive option,18.0–20.0
7,Third offensive option,16.5–18.0
8,Slightly above-average player,15.0–16.5
9,Rotation player,13.0–15.0


### Create CSV files

In [7]:
#Create CSV file of our large stats dataframe
stats_df.to_csv('NBA_raw_data.csv', index=False)

In [8]:
#Create CSV file of our MVP dataframe
mvp_df.to_csv('NBA_MVP_raw_data.csv', index=False)

In [9]:
#Create CSV file of our Hollinger theory dataframe
hollinger_theory.to_csv('hollinger_theory.csv', index=False)