In [1]:
import pandas as pd 

# Parsing raw html strings

In [2]:
html_string = """
<table>
    <thead>
      <tr>
        <th>Order date</th>
        <th>Region</th> 
        <th>Item</th>
        <th>Units</th>
        <th>Unit cost</th>
      </tr>
    </thead>
    <tbody>
      <tr>
        <td>1/6/2018</td>
        <td>East</td> 
        <td>Pencil</td>
        <td>95</td>
        <td>1.99</td>
      </tr>
      <tr>
        <td>1/23/2018</td>
        <td>Central</td> 
        <td>Binder</td>
        <td>50</td>
        <td>19.99</td>
      </tr>
      <tr>
        <td>2/9/2018</td>
        <td>Central</td> 
        <td>Pencil</td>
        <td>36</td>
        <td>4.99</td>
      </tr>
      <tr>
        <td>3/15/2018</td>
        <td>West</td> 
        <td>Pen</td>
        <td>27</td>
        <td>19.99</td>
      </tr>
    </tbody>
</table>
"""

In [3]:
from IPython.core.display import display, HTML
display(HTML(html_string))

Order date,Region,Item,Units,Unit cost
1/6/2018,East,Pencil,95,1.99
1/23/2018,Central,Binder,50,19.99
2/9/2018,Central,Pencil,36,4.99
3/15/2018,West,Pen,27,19.99


In [4]:
dfs = pd.read_html(html_string)

In [5]:
dfs 

[  Order date   Region    Item  Units  Unit cost
 0   1/6/2018     East  Pencil     95       1.99
 1  1/23/2018  Central  Binder     50      19.99
 2   2/9/2018  Central  Pencil     36       4.99
 3  3/15/2018     West     Pen     27      19.99]

In [6]:
len(dfs)

1

In [7]:
df = dfs[0]

In [8]:
df 

Unnamed: 0,Order date,Region,Item,Units,Unit cost
0,1/6/2018,East,Pencil,95,1.99
1,1/23/2018,Central,Binder,50,19.99
2,2/9/2018,Central,Pencil,36,4.99
3,3/15/2018,West,Pen,27,19.99


In [9]:
df.loc[df['Region'] == 'Central']

Unnamed: 0,Order date,Region,Item,Units,Unit cost
1,1/23/2018,Central,Binder,50,19.99
2,2/9/2018,Central,Pencil,36,4.99


In [10]:
df.loc[df['Units'] >= 50]

Unnamed: 0,Order date,Region,Item,Units,Unit cost
0,1/6/2018,East,Pencil,95,1.99
1,1/23/2018,Central,Binder,50,19.99


## Defining the header

In [11]:
html_string = """
<table>
  <tr>
    <td>Order date</td>
    <td>Region</td> 
    <td>Item</td>
    <td>Units</td>
    <td>Unit cost</td>
  </tr>
  <tr>
    <td>1/6/2018</td>
    <td>East</td> 
    <td>Pencil</td>
    <td>95</td>
    <td>1.99</td>
  </tr>
  <tr>
    <td>1/23/2018</td>
    <td>Central</td> 
    <td>Binder</td>
    <td>50</td>
    <td>19.99</td>
  </tr>
  <tr>
    <td>2/9/2018</td>
    <td>Central</td> 
    <td>Pencil</td>
    <td>36</td>
    <td>4.99</td>
  </tr>
  <tr>
    <td>3/15/2018</td>
    <td>West</td> 
    <td>Pen</td>
    <td>27</td>
    <td>19.99</td>
  </tr>
</table>
"""

In [12]:
display(HTML(html_string))

0,1,2,3,4
Order date,Region,Item,Units,Unit cost
1/6/2018,East,Pencil,95,1.99
1/23/2018,Central,Binder,50,19.99
2/9/2018,Central,Pencil,36,4.99
3/15/2018,West,Pen,27,19.99


In [13]:
dfs = pd.read_html(html_string, header=[0])

In [14]:
dfs 

[  Order date   Region    Item  Units  Unit cost
 0   1/6/2018     East  Pencil     95       1.99
 1  1/23/2018  Central  Binder     50      19.99
 2   2/9/2018  Central  Pencil     36       4.99
 3  3/15/2018     West     Pen     27      19.99]

In [15]:
df = dfs[0]

In [16]:
df 

Unnamed: 0,Order date,Region,Item,Units,Unit cost
0,1/6/2018,East,Pencil,95,1.99
1,1/23/2018,Central,Binder,50,19.99
2,2/9/2018,Central,Pencil,36,4.99
3,3/15/2018,West,Pen,27,19.99


## Parsing HTML tables from the web

### Simple example

In [17]:
html_url = "https://www.basketball-reference.com/leagues/NBA_2019_per_game.html"

In [18]:
nba_tables = pd.read_html(html_url)

In [19]:
len(nba_tables)

1

In [20]:
nba = nba_tables[0]

In [21]:
nba

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Álex Abrines,SG,25,OKC,31,2,19.0,1.8,5.1,...,.923,0.2,1.4,1.5,0.6,0.5,0.2,0.5,1.7,5.3
1,2,Quincy Acy,PF,28,PHO,10,0,12.3,0.4,1.8,...,.700,0.3,2.2,2.5,0.8,0.1,0.4,0.4,2.4,1.7
2,3,Jaylen Adams,PG,22,ATL,34,1,12.6,1.1,3.2,...,.778,0.3,1.4,1.8,1.9,0.4,0.1,0.8,1.3,3.2
3,4,Steven Adams,C,25,OKC,80,80,33.4,6.0,10.1,...,.500,4.9,4.6,9.5,1.6,1.5,1.0,1.7,2.6,13.9
4,5,Bam Adebayo,C,21,MIA,82,28,23.3,3.4,5.9,...,.735,2.0,5.3,7.3,2.2,0.9,0.8,1.5,2.5,8.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,528,Tyler Zeller,C,29,MEM,4,1,20.5,4.0,7.0,...,.778,2.3,2.3,4.5,0.8,0.3,0.8,1.0,4.0,11.5
730,529,Ante Žižić,C,22,CLE,59,25,18.3,3.1,5.6,...,.705,1.8,3.6,5.4,0.9,0.2,0.4,1.0,1.9,7.8
731,530,Ivica Zubac,C,21,TOT,59,37,17.6,3.6,6.4,...,.802,1.9,4.2,6.1,1.1,0.2,0.9,1.2,2.3,8.9
732,530,Ivica Zubac,C,21,LAL,33,12,15.6,3.4,5.8,...,.864,1.6,3.3,4.9,0.8,0.1,0.8,1.0,2.2,8.5


In [22]:
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734 entries, 0 to 733
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Rk      734 non-null    object
 1   Player  734 non-null    object
 2   Pos     734 non-null    object
 3   Age     734 non-null    object
 4   Tm      734 non-null    object
 5   G       734 non-null    object
 6   GS      734 non-null    object
 7   MP      734 non-null    object
 8   FG      734 non-null    object
 9   FGA     734 non-null    object
 10  FG%     728 non-null    object
 11  3P      734 non-null    object
 12  3PA     734 non-null    object
 13  3P%     687 non-null    object
 14  2P      734 non-null    object
 15  2PA     734 non-null    object
 16  2P%     719 non-null    object
 17  eFG%    728 non-null    object
 18  FT      734 non-null    object
 19  FTA     734 non-null    object
 20  FT%     691 non-null    object
 21  ORB     734 non-null    object
 22  DRB     734 non-null    ob

In [23]:
nba.describe()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
count,734,734,734,734,734,734,734,734,734,734,...,691,734.0,734.0,734,734.0,734.0,734.0,734.0,734.0,734
unique,531,531,11,23,32,83,81,291,96,172,...,273,44.0,90.0,111,75.0,26.0,26.0,40.0,41.0,200
top,Rk,Player,SG,23,TOT,G,0,MP,FG,FGA,...,FT%,0.3,1.8,TRB,0.8,0.3,0.1,0.8,1.7,PTS
freq,26,26,174,75,86,26,199,26,26,26,...,26,81.0,26.0,26,40.0,93.0,121.0,59.0,46.0,26


In [24]:
nba.isnull().count()

Rk        734
Player    734
Pos       734
Age       734
Tm        734
G         734
GS        734
MP        734
FG        734
FGA       734
FG%       734
3P        734
3PA       734
3P%       734
2P        734
2PA       734
2P%       734
eFG%      734
FT        734
FTA       734
FT%       734
ORB       734
DRB       734
TRB       734
AST       734
STL       734
BLK       734
TOV       734
PF        734
PTS       734
dtype: int64

In [25]:
nba.head(30)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Álex Abrines,SG,25,OKC,31,2,19.0,1.8,5.1,...,.923,0.2,1.4,1.5,0.6,0.5,0.2,0.5,1.7,5.3
1,2,Quincy Acy,PF,28,PHO,10,0,12.3,0.4,1.8,...,.700,0.3,2.2,2.5,0.8,0.1,0.4,0.4,2.4,1.7
2,3,Jaylen Adams,PG,22,ATL,34,1,12.6,1.1,3.2,...,.778,0.3,1.4,1.8,1.9,0.4,0.1,0.8,1.3,3.2
3,4,Steven Adams,C,25,OKC,80,80,33.4,6.0,10.1,...,.500,4.9,4.6,9.5,1.6,1.5,1.0,1.7,2.6,13.9
4,5,Bam Adebayo,C,21,MIA,82,28,23.3,3.4,5.9,...,.735,2.0,5.3,7.3,2.2,0.9,0.8,1.5,2.5,8.9
5,6,Deng Adel,SF,21,CLE,19,3,10.2,0.6,1.9,...,1.000,0.2,0.8,1.0,0.3,0.1,0.2,0.3,0.7,1.7
6,7,DeVaughn Akoon-Purcell,SG,25,DEN,7,0,3.1,0.4,1.4,...,.500,0.1,0.4,0.6,0.9,0.3,0.0,0.3,0.6,1.0
7,8,LaMarcus Aldridge,C,33,SAS,81,81,33.2,8.4,16.3,...,.847,3.1,6.1,9.2,2.4,0.5,1.3,1.8,2.2,21.3
8,9,Rawle Alkins,SG,21,CHI,10,1,12.0,1.3,3.9,...,.667,1.1,1.5,2.6,1.3,0.1,0.0,0.8,0.7,3.7
9,10,Grayson Allen,SG,23,UTA,38,2,10.9,1.8,4.7,...,.750,0.1,0.5,0.6,0.7,0.2,0.2,0.9,1.2,5.6


In [26]:
nba.drop(range(22, df.shape[0] + 1, 20))

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Álex Abrines,SG,25,OKC,31,2,19.0,1.8,5.1,...,.923,0.2,1.4,1.5,0.6,0.5,0.2,0.5,1.7,5.3
1,2,Quincy Acy,PF,28,PHO,10,0,12.3,0.4,1.8,...,.700,0.3,2.2,2.5,0.8,0.1,0.4,0.4,2.4,1.7
2,3,Jaylen Adams,PG,22,ATL,34,1,12.6,1.1,3.2,...,.778,0.3,1.4,1.8,1.9,0.4,0.1,0.8,1.3,3.2
3,4,Steven Adams,C,25,OKC,80,80,33.4,6.0,10.1,...,.500,4.9,4.6,9.5,1.6,1.5,1.0,1.7,2.6,13.9
4,5,Bam Adebayo,C,21,MIA,82,28,23.3,3.4,5.9,...,.735,2.0,5.3,7.3,2.2,0.9,0.8,1.5,2.5,8.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,528,Tyler Zeller,C,29,MEM,4,1,20.5,4.0,7.0,...,.778,2.3,2.3,4.5,0.8,0.3,0.8,1.0,4.0,11.5
730,529,Ante Žižić,C,22,CLE,59,25,18.3,3.1,5.6,...,.705,1.8,3.6,5.4,0.9,0.2,0.4,1.0,1.9,7.8
731,530,Ivica Zubac,C,21,TOT,59,37,17.6,3.6,6.4,...,.802,1.9,4.2,6.1,1.1,0.2,0.9,1.2,2.3,8.9
732,530,Ivica Zubac,C,21,LAL,33,12,15.6,3.4,5.8,...,.864,1.6,3.3,4.9,0.8,0.1,0.8,1.0,2.2,8.5


In [27]:
nba.iloc[22]

Rk            Rk
Player    Player
Pos          Pos
Age          Age
Tm            Tm
G              G
GS            GS
MP            MP
FG            FG
FGA          FGA
FG%          FG%
3P            3P
3PA          3PA
3P%          3P%
2P            2P
2PA          2PA
2P%          2P%
eFG%        eFG%
FT            FT
FTA          FTA
FT%          FT%
ORB          ORB
DRB          DRB
TRB          TRB
AST          AST
STL          STL
BLK          BLK
TOV          TOV
PF            PF
PTS          PTS
Name: 22, dtype: object

In [37]:
nba.drop_duplicates(subset=['Rk'], inplace=True)

In [40]:
nba.head(50)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Álex Abrines,SG,25,OKC,31,2,19.0,1.8,5.1,...,.923,0.2,1.4,1.5,0.6,0.5,0.2,0.5,1.7,5.3
1,2,Quincy Acy,PF,28,PHO,10,0,12.3,0.4,1.8,...,.700,0.3,2.2,2.5,0.8,0.1,0.4,0.4,2.4,1.7
2,3,Jaylen Adams,PG,22,ATL,34,1,12.6,1.1,3.2,...,.778,0.3,1.4,1.8,1.9,0.4,0.1,0.8,1.3,3.2
3,4,Steven Adams,C,25,OKC,80,80,33.4,6.0,10.1,...,.500,4.9,4.6,9.5,1.6,1.5,1.0,1.7,2.6,13.9
4,5,Bam Adebayo,C,21,MIA,82,28,23.3,3.4,5.9,...,.735,2.0,5.3,7.3,2.2,0.9,0.8,1.5,2.5,8.9
5,6,Deng Adel,SF,21,CLE,19,3,10.2,0.6,1.9,...,1.000,0.2,0.8,1.0,0.3,0.1,0.2,0.3,0.7,1.7
6,7,DeVaughn Akoon-Purcell,SG,25,DEN,7,0,3.1,0.4,1.4,...,.500,0.1,0.4,0.6,0.9,0.3,0.0,0.3,0.6,1.0
7,8,LaMarcus Aldridge,C,33,SAS,81,81,33.2,8.4,16.3,...,.847,3.1,6.1,9.2,2.4,0.5,1.3,1.8,2.2,21.3
8,9,Rawle Alkins,SG,21,CHI,10,1,12.0,1.3,3.9,...,.667,1.1,1.5,2.6,1.3,0.1,0.0,0.8,0.7,3.7
9,10,Grayson Allen,SG,23,UTA,38,2,10.9,1.8,4.7,...,.750,0.1,0.5,0.6,0.7,0.2,0.2,0.9,1.2,5.6


In [43]:
nba.drop([22], inplace=True)

In [44]:
nba.head(25)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Álex Abrines,SG,25,OKC,31,2,19.0,1.8,5.1,...,0.923,0.2,1.4,1.5,0.6,0.5,0.2,0.5,1.7,5.3
1,2,Quincy Acy,PF,28,PHO,10,0,12.3,0.4,1.8,...,0.7,0.3,2.2,2.5,0.8,0.1,0.4,0.4,2.4,1.7
2,3,Jaylen Adams,PG,22,ATL,34,1,12.6,1.1,3.2,...,0.778,0.3,1.4,1.8,1.9,0.4,0.1,0.8,1.3,3.2
3,4,Steven Adams,C,25,OKC,80,80,33.4,6.0,10.1,...,0.5,4.9,4.6,9.5,1.6,1.5,1.0,1.7,2.6,13.9
4,5,Bam Adebayo,C,21,MIA,82,28,23.3,3.4,5.9,...,0.735,2.0,5.3,7.3,2.2,0.9,0.8,1.5,2.5,8.9
5,6,Deng Adel,SF,21,CLE,19,3,10.2,0.6,1.9,...,1.0,0.2,0.8,1.0,0.3,0.1,0.2,0.3,0.7,1.7
6,7,DeVaughn Akoon-Purcell,SG,25,DEN,7,0,3.1,0.4,1.4,...,0.5,0.1,0.4,0.6,0.9,0.3,0.0,0.3,0.6,1.0
7,8,LaMarcus Aldridge,C,33,SAS,81,81,33.2,8.4,16.3,...,0.847,3.1,6.1,9.2,2.4,0.5,1.3,1.8,2.2,21.3
8,9,Rawle Alkins,SG,21,CHI,10,1,12.0,1.3,3.9,...,0.667,1.1,1.5,2.6,1.3,0.1,0.0,0.8,0.7,3.7
9,10,Grayson Allen,SG,23,UTA,38,2,10.9,1.8,4.7,...,0.75,0.1,0.5,0.6,0.7,0.2,0.2,0.9,1.2,5.6


### Complex sample

In [28]:
import requests

html_url = "https://en.wikipedia.org/wiki/The_Simpsons"

In [29]:
r = requests.get(html_url)

wiki_tables = pd.read_html(r.text, header=0)

In [30]:
len(wiki_tables)

31

In [31]:
simpsons = wiki_tables[1]

In [32]:
simpsons.head()

Unnamed: 0,Season,Season.1,No. ofepisodes,Originally aired,Originally aired.1,Originally aired.2,Viewership,Viewership.1,Viewership.2
0,Season,Season,No. ofepisodes,Season premiere,Season finale,Time slot (ET),Avg. viewers(in millions),Most watched episode,Most watched episode
1,Season,Season,No. ofepisodes,Season premiere,Season finale,Time slot (ET),Avg. viewers(in millions),Viewers(millions),Episode title
2,1,1989–90,13,"December 17, 1989","May 13, 1990",Sunday 8:30 pm,27.8,33.5,"""Life on the Fast Lane"""
3,2,1990–91,22,"October 11, 1990","July 11, 1991",Thursday 8:00 pm,24.4,33.6,"""Bart Gets an 'F'"""
4,3,1991–92,24,"September 19, 1991","August 27, 1992",Thursday 8:00 pm,21.8,25.5,"""Colonel Homer"""


In [33]:
simpsons.drop([0,1], inplace=True)
simpsons.head()

Unnamed: 0,Season,Season.1,No. ofepisodes,Originally aired,Originally aired.1,Originally aired.2,Viewership,Viewership.1,Viewership.2
2,1,1989–90,13,"December 17, 1989","May 13, 1990",Sunday 8:30 pm,27.8,33.5,"""Life on the Fast Lane"""
3,2,1990–91,22,"October 11, 1990","July 11, 1991",Thursday 8:00 pm,24.4,33.6,"""Bart Gets an 'F'"""
4,3,1991–92,24,"September 19, 1991","August 27, 1992",Thursday 8:00 pm,21.8,25.5,"""Colonel Homer"""
5,4,1992–93,22,"September 24, 1992","May 13, 1993",Thursday 8:00 pm,22.4,28.6,"""Lisa's First Word"""
6,5,1993–94,22,"September 30, 1993","May 19, 1994",Thursday 8:00 pm,18.9,24.0,"""Treehouse of Horror IV"""


In [34]:
simpsons.set_index('Season', inplace=True)

In [35]:
simpsons['No. ofepisodes'].unique()

array(['13', '22', '24', '25', '23', '21', '20'], dtype=object)

In [36]:
simpsons['No. ofepisodes']

Season
1     13
2     22
3     24
4     22
5     22
6     25
7     25
8     25
9     25
10    23
11    22
12    21
13    22
14    22
15    22
16    21
17    22
18    22
19    20
20    21
21    23
22    22
23    22
24    22
25    22
26    22
27    22
28    22
29    21
30    23
31    22
32    22
33    22
Name: No. ofepisodes, dtype: object

In [45]:
simpsons = simpsons.loc(simpsons['No. ofepisodes'] != 'TBA')

TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [46]:
min_episodes = simpsons['No. ofepisodes'].min()

min_episodes

'13'

In [48]:
si = simpsons['No. ofepisodes'] == min_episodes

simpsons[si] 

Unnamed: 0_level_0,Season.1,No. ofepisodes,Originally aired,Originally aired.1,Originally aired.2,Viewership,Viewership.1,Viewership.2
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1989–90,13,"December 17, 1989","May 13, 1990",Sunday 8:30 pm,27.8,33.5,"""Life on the Fast Lane"""
