# Scraping with Pandas

In [1]:
import webbrowser
from pathlib import Path

import pandas as pd

We can use the `read_html` function in Pandas to automatically scrape any tabular data from a page.

In [2]:
# Tokyo 2020 medal-standings page (literal split across lines for readability;
# adjacent string literals are joined into the same single address).
url = (
    "https://olympics.com/tokyo-2020/olympic-games/en/results/"
    "all-sports/medal-standings.htm"
)

In [3]:
# Fetch the page at `url` and parse its tabular data using the html5lib flavor.
# The result is a list of DataFrames; the bare name below displays it.
tables = pd.read_html(url, flavor='html5lib')
tables

[    Rank                    Team/NOC  Unnamed: 2  Unnamed: 3  Unnamed: 4  \
 0      1  People's Republic of China          19          10          11   
 1      2                       Japan          17           4           7   
 2      3    United States of America          14          17          11   
 3      4                         ROC          10          14          10   
 4      5                   Australia           9           2          11   
 ..   ...                         ...         ...         ...         ...   
 65    63                     Finland           0           0           1   
 66    63                      Israel           0           0           1   
 67    63                      Kuwait           0           0           1   
 68    63                    Portugal           0           0           1   
 69    63                  San Marino           0           0           1   
 
     Total  RankbyTotal NOCCode  
 0      40            2     CHN  
 1    

In [4]:
# How many tables read_html returned for this page.
len(tables)

9

#### What we get in return is a list of dataframes for any tabular data that Pandas found.

In [4]:
# Show the container type that read_html returned.
type(tables)

list

#### We can slice off any of those dataframes that we want using normal indexing.

In [5]:
# Take the first DataFrame from the list and preview its first rows.
df = tables[0]
df.head()

Unnamed: 0,Rank,Team/NOC,Unnamed: 2,Unnamed: 3,Unnamed: 4,Total,RankbyTotal,NOCCode
0,1,People's Republic of China,19,10,11,40,2,CHN
1,2,Japan,17,4,7,28,4,JPN
2,3,United States of America,14,17,11,42,1,USA
3,4,ROC,10,14,10,34,3,ROC
4,5,Australia,9,2,11,22,6,AUS


#### Flatten the column header and drop the in-table header rows (rows whose `Ref` is not a citation such as `[8]`)

In [14]:
# NOTE(review): this cell expects `df` to have a `Ref` column, which the medal
# table taken from `tables[0]` earlier does not show. The cell that loads this
# table appears to be missing -- confirm before running top to bottom.

# Keep only the first level of the column labels.
df.columns = df.columns.get_level_values(0)

# Keep the rows whose `Ref` value starts with "[" (a bracketed citation such
# as "[8]"); every other row is dropped. na=False makes a missing `Ref` count
# as "no match", so the mask stays strictly boolean instead of raising on NaN.
df = df.loc[df["Ref"].str.startswith("[", na=False)]
df.head()

Unnamed: 0,City,Building,Start date,End date,Duration,Ref
0,"Philadelphia, Pennsylvania",Independence Hall,"July 4, 1776 (convened May 10, 1775, prior to ...","December 12, 1776",5 months and 8 days,[8]
1,"Baltimore, Maryland",Henry Fite House,"December 20, 1776","February 27, 1777",2 months and 7 days,[9]
2,"Philadelphia, Pennsylvania",Independence Hall,"March 5, 1777","September 18, 1777",6 months and 13 days,[10]
3,"Lancaster, Pennsylvania",Court House,"September 27, 1777","September 27, 1777",1 day,[10]
4,"York, Pennsylvania",Court House (now Colonial Court House),"September 30, 1777","June 27, 1778",8 months and 28 days,[10]


#### Split column values into two separate columns

In [15]:
# Split each City value on ", "; expand=True returns the pieces as separate
# columns (0 and 1 in the output below) instead of a single column of lists.
columnsplit = df['City'].str.split(", ", expand=True)
columnsplit

Unnamed: 0,0,1
0,Philadelphia,Pennsylvania
1,Baltimore,Maryland
2,Philadelphia,Pennsylvania
3,Lancaster,Pennsylvania
4,York,Pennsylvania
5,Philadelphia,Pennsylvania
7,Philadelphia,Pennsylvania
8,Princeton,New Jersey
9,Annapolis,Maryland
10,Trenton,New Jersey


In [16]:
# Replace City with the first piece of the split and add the second piece as a
# new State column, then rebind `df` to the result.
df = df.assign(City=columnsplit[0],
               State=columnsplit[1])
df.head()

Unnamed: 0,City,Building,Start date,End date,Duration,Ref,State
0,Philadelphia,Independence Hall,"July 4, 1776 (convened May 10, 1775, prior to ...","December 12, 1776",5 months and 8 days,[8],Pennsylvania
1,Baltimore,Henry Fite House,"December 20, 1776","February 27, 1777",2 months and 7 days,[9],Maryland
2,Philadelphia,Independence Hall,"March 5, 1777","September 18, 1777",6 months and 13 days,[10],Pennsylvania
3,Lancaster,Court House,"September 27, 1777","September 27, 1777",1 day,[10],Pennsylvania
4,York,Court House (now Colonial Court House),"September 30, 1777","June 27, 1778",8 months and 28 days,[10],Pennsylvania


#### Drop a column

In [17]:
# Remove the `Ref` column by name and preview the result.
df = df.drop(columns=["Ref"])
df.head()

Unnamed: 0,City,Building,Start date,End date,Duration,State
0,Philadelphia,Independence Hall,"July 4, 1776 (convened May 10, 1775, prior to ...","December 12, 1776",5 months and 8 days,Pennsylvania
1,Baltimore,Henry Fite House,"December 20, 1776","February 27, 1777",2 months and 7 days,Maryland
2,Philadelphia,Independence Hall,"March 5, 1777","September 18, 1777",6 months and 13 days,Pennsylvania
3,Lancaster,Court House,"September 27, 1777","September 27, 1777",1 day,Pennsylvania
4,York,Court House (now Colonial Court House),"September 30, 1777","June 27, 1778",8 months and 28 days,Pennsylvania


#### Reset an index

In [18]:
# Renumber the rows from 0; drop=True discards the old index instead of
# keeping it as a new column.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,City,Building,Start date,End date,Duration,State
0,Philadelphia,Independence Hall,"July 4, 1776 (convened May 10, 1775, prior to ...","December 12, 1776",5 months and 8 days,Pennsylvania
1,Baltimore,Henry Fite House,"December 20, 1776","February 27, 1777",2 months and 7 days,Maryland
2,Philadelphia,Independence Hall,"March 5, 1777","September 18, 1777",6 months and 13 days,Pennsylvania
3,Lancaster,Court House,"September 27, 1777","September 27, 1777",1 day,Pennsylvania
4,York,Court House (now Colonial Court House),"September 30, 1777","June 27, 1778",8 months and 28 days,Pennsylvania


In [19]:
# Show only the rows whose State is exactly "New York".
df.loc[df["State"].eq("New York")]

Unnamed: 0,City,Building,Start date,End date,Duration,State
10,New York,City Hall,"January 11, 1785","October 6, 1788","3 years, 11 months and 5 days",New York
11,New York,Federal Hall,"March 4, 1789","December 5, 1790","1 year, 9 months and 1 day",New York


## DataFrames as HTML

#### Pandas also has a `to_html` method that we can use to generate HTML tables from DataFrames.

In [11]:
# Render the DataFrame as an HTML <table> string and display it.
html_table = df.to_html()
html_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>City</th>\n      <th>Building</th>\n      <th>Start Date</th>\n      <th>End Date</th>\n      <th>Duration</th>\n      <th>State</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Albany</td>\n      <td>Stadt Huys</td>\n      <td>June 19, 1754</td>\n      <td>July 11, 1754</td>\n      <td>22\xa0days</td>\n      <td>New York</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>New York</td>\n      <td>City Hall</td>\n      <td>October 7, 1765</td>\n      <td>October 25, 1765</td>\n      <td>23\xa0days</td>\n      <td>New York</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Philadelphia</td>\n      <td>Carpenters\' Hall</td>\n      <td>September 5, 1774</td>\n      <td>October 26, 1774</td>\n      <td>1\xa0month and 21\xa0days</td>\n      <td>Pennsylvania</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Philadelphia</td>\n      <td>In

#### You may have to strip unwanted newlines to clean up the table.

In [12]:
# Display the markup with newline characters removed. The result is not
# assigned, so `html_table` itself keeps its newlines.
html_table.replace('\n', '')

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>City</th>      <th>Building</th>      <th>Start Date</th>      <th>End Date</th>      <th>Duration</th>      <th>State</th>    </tr>  </thead>  <tbody>    <tr>      <th>0</th>      <td>Albany</td>      <td>Stadt Huys</td>      <td>June 19, 1754</td>      <td>July 11, 1754</td>      <td>22\xa0days</td>      <td>New York</td>    </tr>    <tr>      <th>1</th>      <td>New York</td>      <td>City Hall</td>      <td>October 7, 1765</td>      <td>October 25, 1765</td>      <td>23\xa0days</td>      <td>New York</td>    </tr>    <tr>      <th>2</th>      <td>Philadelphia</td>      <td>Carpenters\' Hall</td>      <td>September 5, 1774</td>      <td>October 26, 1774</td>      <td>1\xa0month and 21\xa0days</td>      <td>Pennsylvania</td>    </tr>    <tr>      <th>3</th>      <td>Philadelphia</td>      <td>Independence Hall</td>      <td>May 10, 1775</td>      <td>December 12, 1776</td>      <

You can also save the table directly to a file.

In [13]:
# Write the table's HTML to the file table.html (relative path).
df.to_html('table.html')

In [20]:
# Open the saved file in the default browser. This uses the standard library
# rather than the macOS-only `open` shell command, so it works on any platform;
# you can also find table.html manually and open it in a browser.
# The trailing semicolon suppresses the call's return value in the cell output.
webbrowser.open(Path("table.html").resolve().as_uri());