# Scraping from a real website + Pandas (Part 1)

In [61]:
from bs4 import BeautifulSoup
import requests

In [62]:
# Page listing the individual results of IPhO 2022.
url = 'https://ipho-unofficial.org/timeline/2022/individual'

# A timeout keeps the cell from hanging indefinitely if the site does not answer.
page = requests.get(url, timeout=30)
# Stop on HTTP errors (404, 500, ...) instead of silently parsing an error page.
page.raise_for_status()

# Name the parser explicitly: the generic 'html' feature lets BeautifulSoup pick
# whichever HTML parser happens to be installed, so results can vary by machine.
soup = BeautifulSoup(page.text, 'html.parser')

In [63]:
# Preview only the beginning of the document; printing the entire page floods
# the notebook output with markup.
print(str(soup)[:1500])

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta content="application/xhtml+xml; charset=utf-8" http-equiv="content-type"/>
<link href="../../img/fav-logo.ico" rel="shortcut icon" type="image/x-icon"/>
<link href="../../css/design.css" rel="stylesheet" type="text/css"/>
<link href="../../css/print.css" media="print" rel="stylesheet" type="text/css"/>
<title>IPhO 2022 - Individual Results</title>
</head>
<body>
<div id="header">
<div id="h1">
<h1><a href="../../.">International Physics Olympiad</a></h1>
</div>
<div id="sub">
<span class="previous"><a href="https://ipho2023.jp/en/" target="_blank">IPhO 2023</a></span>
<span class="img"><a href="../../."><img alt="IPhO" height="22" src="../../img/logo.png" width="30"/></a></span>
<span class="next"><a href="" target="_blank">IPhO 2024</a></span>
</div>
</div>
<div id="sidebar">
<ul>
<li><a class="highlight" href="../../tim

In [64]:
# Grab the first <table> element of the page (the results table).
table = soup.find('table')
if table is None:
    # find() returns None when nothing matches; stop here with a clear message
    # rather than failing later with an AttributeError on None.
    raise ValueError('No <table> element found in the downloaded page')

In [65]:
# Preview only the start of the table markup; the full table is far too long
# to be useful as cell output.
print(str(table)[:1500])

<table>
<thead>
<tr>
<th>Contestant</th>
<th>Country</th>
<th>Rank</th>
<th>Award</th>
<th style="">Theoretical</th>
<th style="">Experimental</th>
<th style="">Total</th>
</tr>
</thead>
<tbody>
<tr>
<td>Guowei Xu</td>
<td><a href="../../countries/CHN/individual">People's Republic of China</a></td>
<td align="right">1</td>
<td><img height="9" src="../../img/gold.png" width="9"/> Gold Medal</td>
<td align="right" style="">23.70</td>
<td align="right" style="">19.50</td>
<td align="right" style="">43.20</td>
</tr>
<tr>
<td>Mingxuan Yang</td>
<td><a href="../../countries/CHN/individual">People's Republic of China</a></td>
<td align="right">2</td>
<td><img height="9" src="../../img/gold.png" width="9"/> Gold Medal</td>
<td align="right" style="">23.50</td>
<td align="right" style="">19.60</td>
<td align="right" style="">43.10</td>
</tr>
<tr>
<td>Zirui Liu</td>
<td><a href="../../countries/CHN/individual">People's Republic of China</a></td>
<td align="right">3</td>
<td><img height="9" src="

In [66]:
# Gather every header cell (<th>) of the table; these carry the column names.
world_titles = table.find_all(name='th')

In [67]:
# Display the collected <th> tags to check them before extracting their text.
world_titles

[<th>Contestant</th>,
 <th>Country</th>,
 <th>Rank</th>,
 <th>Award</th>,
 <th style="">Theoretical</th>,
 <th style="">Experimental</th>,
 <th style="">Total</th>]

In [68]:
# Reduce each <th> tag to its visible text, trimmed of surrounding whitespace.
world_table_titles = []
for header_cell in world_titles:
    world_table_titles.append(header_cell.text.strip())

print(world_table_titles)

['Contestant', 'Country', 'Rank', 'Award', 'Theoretical', 'Experimental', 'Total']


In [69]:
import pandas as pd

In [70]:
# Create data frame column
df = pd.DataFrame(columns = world_table_titles)
df

Unnamed: 0,Contestant,Country,Rank,Award,Theoretical,Experimental,Total


In [71]:
# Collect every table row (<tr>): the header row first, then the result rows.
column_data = table.find_all(name='tr')

In [72]:
# Show only the first few rows (header + two results); displaying every <tr>
# would dump the whole table into the output.
column_data[:3]

[<tr>
 <th>Contestant</th>
 <th>Country</th>
 <th>Rank</th>
 <th>Award</th>
 <th style="">Theoretical</th>
 <th style="">Experimental</th>
 <th style="">Total</th>
 </tr>,
 <tr>
 <td>Guowei Xu</td>
 <td><a href="../../countries/CHN/individual">People's Republic of China</a></td>
 <td align="right">1</td>
 <td><img height="9" src="../../img/gold.png" width="9"/> Gold Medal</td>
 <td align="right" style="">23.70</td>
 <td align="right" style="">19.50</td>
 <td align="right" style="">43.20</td>
 </tr>,
 <tr>
 <td>Mingxuan Yang</td>
 <td><a href="../../countries/CHN/individual">People's Republic of China</a></td>
 <td align="right">2</td>
 <td><img height="9" src="../../img/gold.png" width="9"/> Gold Medal</td>
 <td align="right" style="">23.50</td>
 <td align="right" style="">19.60</td>
 <td align="right" style="">43.10</td>
 </tr>,
 <tr>
 <td>Zirui Liu</td>
 <td><a href="../../countries/CHN/individual">People's Republic of China</a></td>
 <td align="right">3</td>
 <td><img height="9" src

In [73]:
# Turn every result row into a list of cell strings. Index 0 of column_data is
# the header row (it only has <th> cells), so it is skipped.
table_rows = []
for row in column_data[1:]:
    row_data = row.find_all('td')
    individual_row_data = [data.text.strip() for data in row_data]
    if individual_row_data:  # ignore rows that carry no <td> cells
        table_rows.append(individual_row_data)

# Build the frame in one step instead of appending with df.loc[len(df)] inside
# the loop: row-by-row growth is slow, and re-running the cell would append
# every row a second time. Rebuilding from scratch keeps the cell idempotent.
df = pd.DataFrame(table_rows, columns=world_table_titles)

In [79]:
# Display the populated frame; pandas abbreviates long frames in the rendering.
df

Unnamed: 0,Contestant,Country,Rank,Award,Theoretical,Experimental,Total
0,Guowei Xu,People's Republic of China,1,Gold Medal,23.70,19.50,43.20
1,Mingxuan Yang,People's Republic of China,2,Gold Medal,23.50,19.60,43.10
2,Zirui Liu,People's Republic of China,3,Gold Medal,24.10,18.85,42.95
3,Qiancheng Li,People's Republic of China,4,Gold Medal,24.40,18.15,42.55
4,Yuqi Ren,People's Republic of China,5,Gold Medal,23.20,17.30,40.50
...,...,...,...,...,...,...,...
297,Giorgos Zevedeos,Cyprus,298,Honourable Mention,2.60,4.65,7.25
298,Sergio Soto Laso,Spain,298,Honourable Mention,3.00,4.25,7.25
299,Kristín Ingibjörg Magnúsdóttir,Iceland,300,Honourable Mention,1.70,5.50,7.20
300,Madinabonu Khasanjonova,Uzbekistan,300,Honourable Mention,2.80,4.40,7.20


In [75]:
# Write the results to a path relative to the working directory rather than a
# user-specific absolute Windows path, so the cell runs on any machine.
# index=False leaves the integer row index out of the file.
df.to_csv('ipho2022.csv', index=False)