In [1]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup

In [2]:
from time import sleep
from random import randint

## Scraping the data with the BeautifulSoup library

### Collecting all the stats of each player in each season from 1990 to 2021

In [None]:
# Scrape the per-season player "totals" table from basketball-reference for
# every season from 1990 to 2021 and collect one DataFrame per season.
array_of_df = []

for Year in range(1990, 2022):
    # Wait 2-4 seconds between requests so the site is not hit in a tight loop.
    sleep(randint(2, 4))

    # Get the url of the specific year.
    URL = f'https://www.basketball-reference.com/leagues/NBA_{Year}_totals.html'
    response = requests.get(URL, timeout=30)
    # Stop on HTTP errors (e.g. rate limiting) here, with the real status code,
    # instead of failing later because the expected table is missing.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Get table from page
    tbl = soup.find("table", id="totals_stats")
    if tbl is None:
        raise ValueError(f'No "totals_stats" table found at {URL}')

    # Get headers from table and drop the first one (the Rank column), which
    # has no matching <td> cell in the data rows.
    headers = [th.getText() for th in tbl.thead.find_all('th')][1:]

    # Only look at rows that belong to the stats table itself, not every <tr>
    # on the page.
    rows = tbl.find_all('tr')

    # Get all cells. Rows without any <td> (e.g. header rows) are skipped;
    # otherwise they would turn into all-None records in the DataFrame.
    player_stats = []
    for row in rows:
        cells = [td.getText() for td in row.find_all('td')]
        if cells:
            player_stats.append(cells)

    # Create dataframe from data
    df = pd.DataFrame(player_stats, columns=headers)

    # Insert the Year column to the DF.
    df.insert(0, 'Year', Year)

    # Keep a single row per player per season. keep='first' keeps whichever row
    # appears first on the page -- presumably the combined-season row for
    # players who changed teams; TODO confirm against the page layout.
    df = df.drop_duplicates(subset=['Player'], keep='first')

    # Insert this year's dataframe at the front, so the list ends up ordered
    # from the most recent season to the oldest.
    array_of_df.insert(0, df)

In [None]:
# Combine the per-season data frames into one big dataframe.
# ignore_index=True gives the result a fresh 0..n-1 index; without it every
# season's own row labels are repeated, so label-based lookups match several rows.
stats_df = pd.concat(array_of_df, ignore_index=True)
stats_df

### Collecting the MVPs of each season from 1990 to 2021

In [None]:
    # Scrape the NBA MVP award table from basketball-reference.
    # Every row of the table is collected; no filtering by season happens here.

    URL2 = 'https://www.basketball-reference.com/awards/mvp.html'
    response2 = requests.get(URL2, timeout=30)
    # Surface HTTP errors (e.g. rate limiting) with the real status code.
    response2.raise_for_status()
    soup2 = BeautifulSoup(response2.content, 'html.parser')

    # Get table from page
    tbl2 = soup2.find("table", id="mvp_NBA")
    if tbl2 is None:
        raise ValueError(f'No "mvp_NBA" table found at {URL2}')

    # Get headers from table, skipping the first six header cells.
    # NOTE(review): presumably these are the grouped over-header cells plus the
    # Season column (which has no matching <td> in the data rows) -- confirm
    # against the live page if the column count ever stops matching.
    headers2 = [th.getText() for th in tbl2.thead.find_all('th')][6:]

    # Only look at rows that belong to the MVP table itself, not every <tr>
    # on the page.
    rows2 = tbl2.find_all('tr')

    # Get all cells. Rows without any <td> (e.g. header rows) are skipped;
    # otherwise they would turn into all-None records in the DataFrame.
    player_stats2 = []
    for row in rows2:
        cells = [td.getText() for td in row.find_all('td')]
        if cells:
            player_stats2.append(cells)

    # Create dataframe from data
    mvp_df = pd.DataFrame(player_stats2, columns=headers2)

    mvp_df
    

### Collecting Hollinger's reference guide for interpreting PER values

In [None]:
    # Scrape the first "wikitable" from the Wikipedia article on Player
    # efficiency rating into a two-column frame ('Reference guide', 'PER').

    URL3 = 'https://en.wikipedia.org/wiki/Player_efficiency_rating'
    response3 = requests.get(URL3, timeout=30)
    # Surface HTTP errors with the real status code instead of failing later
    # because the expected table is missing.
    response3.raise_for_status()
    soup3 = BeautifulSoup(response3.content, 'html.parser')

    # Get table from page
    tbl3 = soup3.find("table", class_="wikitable")
    if tbl3 is None:
        raise ValueError(f'No "wikitable" table found at {URL3}')

    # Get rows
    rows3 = tbl3.find_all('tr')

    # Get all cells. Rows without any <td> (e.g. a header row) are skipped;
    # otherwise they would turn into an all-None record in the DataFrame.
    player_stats3 = []
    for row in rows3:
        cells = [td.getText() for td in row.find_all('td')]
        if cells:
            player_stats3.append(cells)

    # Create dataframe from data
    hollinger_theory = pd.DataFrame(player_stats3, columns=['Reference guide', 'PER'])

    # Clean table from char "\n"
    hollinger_theory = hollinger_theory.replace('\n', '', regex=True)

    hollinger_theory

### Create CSV files

In [None]:
# Save the combined player-stats dataframe to disk; the index is left out of the file.
stats_df.to_csv(path_or_buf='NBA_raw_data.csv', index=False)

In [None]:
# Save the MVP dataframe to disk; the index is left out of the file.
mvp_df.to_csv(path_or_buf='NBA_MVP_raw_data.csv', index=False)

In [None]:
# Save the Hollinger PER reference-guide dataframe to disk; the index is left out of the file.
hollinger_theory.to_csv(path_or_buf='hollinger_theory.csv', index=False)