# Web Scraping NBA Data

- This notebook is a quick reference on how to use Python, requests, BeautifulSoup, and pandas to scrape NBA per-game player statistics from basketball-reference.com.

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [7]:
# Per-game player stats page for the 2019-20 season on Basketball-Reference.
url = 'https://www.basketball-reference.com/leagues/NBA_2020_per_game.html'

# A timeout keeps this cell from hanging indefinitely if the site stalls, and
# raise_for_status() stops the notebook here on a 4xx/5xx reply instead of
# letting an error page flow into the parsing cells.
page = requests.get(url, timeout=30)
page.raise_for_status()
page

<Response [200]>

In [8]:
# Preview only the first bytes of the raw response body; displaying the
# complete HTML document would flood the cell output.
page.content[:500]



In [9]:
# Build the parse tree from the raw response bytes using the 'html.parser' backend.
soup = BeautifulSoup(markup=page.content, features='html.parser')

In [10]:
# Print just the beginning of the prettified document: enough to inspect the
# structure without dumping the entire page into the notebook.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html class="no-js" data-root="/home/bbr/build" data-version="klecko-" itemscope="" itemtype="https://schema.org/WebSite" lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="ie=edge" http-equiv="x-ua-compatible"/>
  <meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport">
   <link href="https://d2p3bygnnzw9w3.cloudfront.net/req/202010221" rel="dns-prefetch"/>
   <title>
    2019-20 NBA Player Stats: Per Game | Basketball-Reference.com
   </title>
   <meta content="Player stat per game averages for the 2019-20 NBA season" name="Description"/>
   <link href="https://www.basketball-reference.com/leagues/NBA_2020_per_game.html" rel="canonical"/>
   <!-- include:start ="/inc/klecko_header_bbr.html_f" -->
   <!-- no:cookie fast load the css.           -->
   <script>
    function gup(n) {n = n.replace(/[\[]/, '\\[').replace(/[\]]/, '\\]'); var r = new RegExp('[\\?&]'+n+'=([^&#]*)'); var re = r.exec(location.search);   return re === n

In [11]:
# Gather every element tagged with the "full_table" CSS class
# (in the recorded output these are the <tr> rows, one per player).
table = soup.find_all(attrs={"class": "full_table"})

In [12]:
# Show only the first few matched rows; displaying the full result list
# buries the rest of the notebook under raw HTML.
table[:3]

[<tr class="full_table"><th class="right" csk="1" data-stat="ranker" scope="row">1</th><td class="left" csk="Adams,Steven" data-append-csv="adamsst01" data-stat="player"><a href="/players/a/adamsst01.html">Steven Adams</a></td><td class="center" data-stat="pos">C</td><td class="right" data-stat="age">26</td><td class="left" data-stat="team_id"><a href="/teams/OKC/2020.html">OKC</a></td><td class="right" data-stat="g">63</td><td class="right" data-stat="gs">63</td><td class="right" data-stat="mp_per_g">26.7</td><td class="right" data-stat="fg_per_g">4.5</td><td class="right" data-stat="fga_per_g">7.6</td><td class="right" data-stat="fg_pct">.592</td><td class="right" data-stat="fg3_per_g">0.0</td><td class="right" data-stat="fg3a_per_g">0.0</td><td class="right non_qual" data-stat="fg3_pct">.333</td><td class="right" data-stat="fg2_per_g">4.5</td><td class="right" data-stat="fg2a_per_g">7.5</td><td class="right" data-stat="fg2_pct">.594</td><td class="right" data-stat="efg_pct">.593</td

In [13]:
# Locate the first element with the CSS class "thead" (the table's header row).
head = soup.find(class_="thead")
if head is None:
    # Fail with a clear message rather than an obscure error further down.
    raise ValueError('no element with class "thead" found; the page layout may have changed')

# .text already concatenates the text of all descendants, so it can be read
# directly; there is no need to iterate over the children.
column_names_raw = head.text

column_names_raw

'\nRk\nPlayer\nPos\nAge\nTm\nG\nGS\nMP\nFG\nFGA\nFG%\n3P\n3PA\n3P%\n2P\n2PA\n2P%\neFG%\nFT\nFTA\nFT%\nORB\nDRB\nTRB\nAST\nSTL\nBLK\nTOV\nPF\nPTS\n'

In [19]:
# The raw header text is newline-delimited with a leading and a trailing
# newline. Trim those, split on newlines, and drop the first label ("Rk"):
# the rank sits in a <th> cell, which the td-only row extraction skips.
column_names_clean = column_names_raw.strip("\n").split("\n")[1:]

column_names_clean

['Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'GS',
 'MP',
 'FG',
 'FGA',
 'FG%',
 '3P',
 '3PA',
 '3P%',
 '2P',
 '2PA',
 '2P%',
 'eFG%',
 'FT',
 'FTA',
 'FT%',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS']

In [24]:
"""Extracting full list of player_data"""

# One record per matched row: the text of each <td> cell, in document order.
# Only <td> cells are read, so each record lines up with column_names_clean.
players = [
    [td.text for td in row.find_all("td")]
    for row in table
]

df = pd.DataFrame(players, columns=column_names_clean).set_index("Player")

# Remove the '*' marker that occasionally appears in a player's name.
# regex=False treats '*' as a literal character: on its own it is not a valid
# regular expression, and the default of `regex` differs between pandas
# versions, so being explicit keeps the result stable.
df.index = df.index.str.replace('*', '', regex=False)

In [25]:
# Display the assembled frame, indexed by player name
# (pandas abbreviates long/wide frames in the rendered output).
df

Unnamed: 0_level_0,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Steven Adams,C,26,OKC,63,63,26.7,4.5,7.6,.592,0.0,...,.582,3.3,6.0,9.3,2.3,0.8,1.1,1.5,1.9,10.9
Bam Adebayo,PF,22,MIA,72,72,33.6,6.1,11.0,.557,0.0,...,.691,2.4,7.8,10.2,5.1,1.1,1.3,2.8,2.5,15.9
LaMarcus Aldridge,C,34,SAS,53,53,33.1,7.4,15.0,.493,1.2,...,.827,1.9,5.5,7.4,2.4,0.7,1.6,1.4,2.4,18.9
Kyle Alexander,C,23,MIA,2,0,6.5,0.5,1.0,.500,0.0,...,,1.0,0.5,1.5,0.0,0.0,0.0,0.5,0.5,1.0
Nickeil Alexander-Walker,SG,21,NOP,47,1,12.6,2.1,5.7,.368,1.0,...,.676,0.2,1.6,1.8,1.9,0.4,0.2,1.1,1.2,5.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Trae Young,PG,21,ATL,60,60,35.3,9.1,20.8,.437,3.4,...,.860,0.5,3.7,4.3,9.3,1.1,0.1,4.8,1.7,29.6
Cody Zeller,C,27,CHO,58,39,23.1,4.3,8.3,.524,0.3,...,.682,2.8,4.3,7.1,1.5,0.7,0.4,1.3,2.4,11.1
Tyler Zeller,C,30,SAS,2,0,2.0,0.5,2.0,.250,0.0,...,,1.5,0.5,2.0,0.0,0.0,0.0,0.0,0.0,1.0
Ante Žižić,C,23,CLE,22,0,10.0,1.9,3.3,.569,0.0,...,.737,0.8,2.2,3.0,0.3,0.3,0.2,0.5,1.2,4.4


In [29]:
# Write the frame, header row included, to a CSV file at a path relative to
# the current working directory.
df.to_csv(path_or_buf='2020_nba_data_per_game.csv', header=True)