In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.basketball-reference.com/players/i/iguodan01.html"

html = urlopen(url)
soup = BeautifulSoup(html, "html5lib")

In [4]:
# Finds all 'tr' in the soup, but only returns the 1st two via limit=2
# Note soup.findAll() returns a Tag object
# Note returns the 'tr' tag as well as the contents within
print(soup.findAll('tr', limit=1))

[<tr>
         <th aria-label="If listed as single number, the year the season ended.★ - Indicates All-Star for league.Only on regular season tables." class=" poptip sort_default_asc center" data-stat="season" data-tip="If listed as single number, the year the season ended.&lt;br&gt;★ - Indicates All-Star for league.&lt;br&gt;Only on regular season tables." scope="col">Season</th>
         <th aria-label="Age of Player at the start of February 1st of that season." class=" poptip sort_default_asc center" data-stat="age" data-tip="Age of Player at the start of February 1st of that season." scope="col">Age</th>
         <th aria-label="Team" class=" poptip sort_default_asc center" data-stat="team_id" data-tip="Team" scope="col">Tm</th>
         <th aria-label="League" class=" poptip sort_default_asc center" data-stat="lg_id" data-tip="League" scope="col">Lg</th>
         <th aria-label="Position" class=" poptip sort_default_asc center" data-stat="pos" data-tip="Position" scope="col">Pos</

In [7]:
# soup is a soup object with findAll function
# soup.findAll() is a Tag object with findAll function
# soup.findAll().findAll() results in a ResultSet
# within the previous slice this finds all of the 'th' tagged text
print(soup.findAll('tr', limit=1)[0].findAll('th'))

[<th aria-label="If listed as single number, the year the season ended.★ - Indicates All-Star for league.Only on regular season tables." class=" poptip sort_default_asc center" data-stat="season" data-tip="If listed as single number, the year the season ended.&lt;br&gt;★ - Indicates All-Star for league.&lt;br&gt;Only on regular season tables." scope="col">Season</th>, <th aria-label="Age of Player at the start of February 1st of that season." class=" poptip sort_default_asc center" data-stat="age" data-tip="Age of Player at the start of February 1st of that season." scope="col">Age</th>, <th aria-label="Team" class=" poptip sort_default_asc center" data-stat="team_id" data-tip="Team" scope="col">Tm</th>, <th aria-label="League" class=" poptip sort_default_asc center" data-stat="lg_id" data-tip="League" scope="col">Lg</th>, <th aria-label="Position" class=" poptip sort_default_asc center" data-stat="pos" data-tip="Position" scope="col">Pos</th>, <th aria-label="Games" class=" poptip cen

In [8]:
# soup.findAll().findAll() results in a ResultSet object
# ResultSet object is iterable array 
# ResultSet object array values have the getText method

column_headers = [th.getText() for th in
                  soup.findAll('tr', limit=1)[0].findAll('th')]

print(column_headers)

['Season', 'Age', 'Tm', 'Lg', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']


In [9]:
# from soup return all except the first 2 elements of the 'tr' tagged data 
# Note setting x = soup.findAll('tr')[2:] makes x of type list
data_rows = soup.findAll('tr')[1:]
print(data_rows)

[<tr class="full_table" id="per_game.2005"><th class="left " data-stat="season" scope="row"><a href="/players/i/iguodan01/gamelog/2005/">2004-05</a></th><td class="center " data-stat="age">21</td><td class="left " data-stat="team_id"><a href="/teams/PHI/2005.html">PHI</a></td><td class="left " data-stat="lg_id"><a href="/leagues/NBA_2005.html">NBA</a></td><td class="center " data-stat="pos">SG</td><td class="right " data-stat="g">82</td><td class="right " data-stat="gs">82</td><td class="right " data-stat="mp_per_g">32.8</td><td class="right " data-stat="fg_per_g">3.3</td><td class="right " data-stat="fga_per_g">6.7</td><td class="right " data-stat="fg_pct">.493</td><td class="right " data-stat="fg3_per_g">0.6</td><td class="right " data-stat="fg3a_per_g">1.7</td><td class="right " data-stat="fg3_pct">.331</td><td class="right " data-stat="fg2_per_g">2.7</td><td class="right " data-stat="fg2a_per_g">4.9</td><td class="right " data-stat="fg2_pct">.550</td><td class="right " data-stat="e

In [10]:
# player data is in 2D list (table) while column headers are a 1D list 
# need to make player data a 2d list
player_data = [[td.getText() for td in data_rows[i].findAll('td')]
               for i in range(len(data_rows))]
print(player_data)
print(type(player_data))
# above same as
# player_data_02 = []
# for i in range(len(data_rows)):
    # player_row = []
    # for td in data_rows[i].findAll('td'):
        # player_row.append(td.getText())
    #player_data_02.append(player_row) 

[['21', 'PHI', 'NBA', 'SG', '82', '82', '32.8', '3.3', '6.7', '.493', '0.6', '1.7', '.331', '2.7', '4.9', '.550', '.536', '1.9', '2.6', '.743', '1.1', '4.6', '5.7', '3.0', '1.7', '0.6', '1.7', '2.5', '9.0'], ['22', 'PHI', 'NBA', 'SF', '82', '82', '37.6', '4.2', '8.4', '.500', '0.7', '1.9', '.354', '3.5', '6.5', '.543', '.541', '3.2', '4.3', '.754', '1.4', '4.4', '5.9', '3.1', '1.6', '0.3', '1.9', '2.4', '12.3'], ['23', 'PHI', 'NBA', 'SG', '76', '76', '40.3', '5.8', '13.0', '.447', '0.6', '2.0', '.310', '5.2', '11.0', '.472', '.471', '5.9', '7.3', '.820', '1.0', '4.7', '5.7', '5.7', '2.0', '0.4', '3.4', '2.6', '18.2'], ['24', 'PHI', 'NBA', 'SF', '82', '82', '39.5', '7.1', '15.6', '.456', '1.2', '3.7', '.329', '5.9', '11.8', '.496', '.495', '4.5', '6.2', '.721', '1.0', '4.4', '5.4', '4.8', '2.1', '0.6', '2.6', '2.3', '19.9'], ['25', 'PHI', 'NBA', 'SF', '82', '82', '39.9', '6.6', '14.0', '.473', '1.0', '3.2', '.307', '5.6', '10.8', '.521', '.507', '4.6', '6.4', '.724', '1.1', '4.6', '5.7'

In [11]:
df = pd.DataFrame(player_data, columns=column_headers[1:])
df.head()
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 29 columns):
Age     19 non-null object
Tm      19 non-null object
Lg      19 non-null object
Pos     19 non-null object
G       19 non-null object
GS      19 non-null object
MP      19 non-null object
FG      19 non-null object
FGA     19 non-null object
FG%     19 non-null object
3P      19 non-null object
3PA     19 non-null object
3P%     19 non-null object
2P      19 non-null object
2PA     19 non-null object
2P%     19 non-null object
eFG%    19 non-null object
FT      19 non-null object
FTA     19 non-null object
FT%     19 non-null object
ORB     19 non-null object
DRB     19 non-null object
TRB     19 non-null object
AST     19 non-null object
STL     19 non-null object
BLK     19 non-null object
TOV     19 non-null object
PF      19 non-null object
PTS     19 non-null object
dtypes: object(29)
memory usage: 4.4+ KB


Unnamed: 0,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
count,19.0,19,19,19,19,19,19.0,19.0,19.0,19.0,...,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0
unique,15.0,4,2,3,13,12,18.0,15.0,17.0,18.0,...,18.0,8.0,14.0,11.0,15.0,10.0,6.0,14.0,13.0,17.0
top,,PHI,NBA,SF,82,82,34.7,5.0,11.0,0.451,...,0.574,1.0,4.3,5.7,3.0,1.7,0.3,2.6,1.6,13.0
freq,5.0,9,18,11,5,5,2.0,3.0,2.0,2.0,...,2.0,7.0,3.0,3.0,3.0,6.0,5.0,3.0,5.0,3.0
