## 1. Imports

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## 2. Create a Soup Object

In [2]:
# Open in binary mode so Beautiful Soup detects the document's encoding itself
# instead of depending on the platform's default text encoding.
with open("html-doc.html", "rb") as file:
    soup = BeautifulSoup(file, "html.parser")

In [3]:
type(soup)

bs4.BeautifulSoup

## 3. Basics of the `soup` object

- `prettify()`
- individual tags:
  - `title`
  - `a`
  - `p`
- `text`
- `name`
- `parent`
- `children`
- `descendants`
- `get_text()`
- `find()`
- `find_all()`
- `get()` / square bracket notation


In [4]:
# prettify() returns the parsed document as an indented string,
# with each tag and each piece of text on its own line
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>



In [5]:
soup.title

<title>The Dormouse's story</title>

In [6]:
# Dotted access by tag name returns only the first <a> tag in the document
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [7]:
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [8]:
# text
soup.title

<title>The Dormouse's story</title>

In [9]:
type(soup.title)

bs4.element.Tag

In [10]:
soup.title.text

"The Dormouse's story"

In [11]:
type(soup.title.text)

str

In [12]:
soup.title.text.upper()

"THE DORMOUSE'S STORY"

In [13]:
soup.title.text.lower()

"the dormouse's story"

In [14]:
soup.title.text.split()

['The', "Dormouse's", 'story']

In [15]:
# name
soup.title

<title>The Dormouse's story</title>

In [16]:
soup.title.name

'title'

In [17]:
# parent
soup.title.parent

<head><title>The Dormouse's story</title></head>

In [18]:
soup.title.parent.parent

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p></body></html>

In [19]:
# children

soup.body

<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p></body>

In [20]:
# .children is an iterator rather than a list, so loop over it to see the items
soup.body.children

<list_iterator at 0x21f2af60fa0>

In [21]:
# Print each direct child of <body>; the blank lines in the output appear to be
# the newline strings that sit between the <p> tags
for child in soup.body.children:
    print(child)



<p class="title"><b>The Dormouse's story</b></p>


<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>


<p class="story">...</p>


In [22]:
for child in soup.head.children:
    print(child)

<title>The Dormouse's story</title>


In [23]:
# descendants
# Accessing .descendants gives a generator; nothing is listed until it is iterated

soup.body.descendants

<generator object Tag.descendants at 0x0000021F0BED1630>

In [24]:
# Unlike .children, this also visits nested content: the output lists each <p>
# followed by the tags and strings found inside it
for descendant in soup.body.descendants:
    print(descendant)



<p class="title"><b>The Dormouse's story</b></p>
<b>The Dormouse's story</b>
The Dormouse's story


<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
Once upon a time there were three little sisters; and their names were

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
Elsie
,

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
Lacie
 and

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
Tillie
;
and they lived at the bottom of a well.


<p class="story">...</p>
...


In [25]:
# get_text()

soup.body

<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p></body>

In [26]:
soup.body.get_text()

"\nThe Dormouse's story\nOnce upon a time there were three little sisters; and their names were\nElsie,\nLacie and\nTillie;\nand they lived at the bottom of a well.\n..."

In [27]:
print(soup.body.get_text().strip())

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...


In [28]:
type(soup.body.get_text())

str

In [29]:
soup.title.get_text()

"The Dormouse's story"

In [30]:
# find

soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [31]:
soup.find('a')

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [32]:
# find all
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [33]:
type(soup.find_all('a'))

bs4.element.ResultSet

In [34]:
# get
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [35]:
soup.a['id']

'link1'

In [36]:
# 'class' comes back as a list, unlike 'href' and 'id', which come back as plain strings
soup.a.get('class')

['sister']

In [37]:
soup.a.get('href')

'http://example.com/elsie'

In [38]:
soup.a.get('id')

'link1'

## 4. Fetching Webpage with Requests
- URL: https://www.bbc.com/sport/football/premier-league/top-scorers

In [39]:
# GET request
url = "https://www.bbc.com/sport/football/premier-league/top-scorers"

# Without a timeout, requests waits indefinitely if the server never answers.
# The value is in seconds and applies to connecting and to each wait for data.
response = requests.get(url, timeout=30)

In [40]:
# check for errors
response.raise_for_status()

In [41]:
# raise_for_status() returns None when it does not raise, so this prints "None"
print(response.raise_for_status())

None


In [42]:
# status code
response.status_code

200

In [43]:
# text (string format)
response.text[:200]

'<!DOCTYPE html><html lang="en-GB" class="no-js"><head><meta charSet="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" /><title data-rh="true">Premier League Top Scorers - B'

In [44]:
type(response.text)

str

In [45]:
# content (binary format)

response.content[:200]

b'<!DOCTYPE html><html lang="en-GB" class="no-js"><head><meta charSet="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" /><title data-rh="true">Premier League Top Scorers - B'

In [46]:
type(response.content)

bytes

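- Either the `content` or the `text` attribute can be used to create the `soup` object  
- With `text`, the bytes have already been decoded to a string, so we must be certain the right encoding was used  
- With `content`, we pass the raw bytes and leave most of the encoding handling to Beautiful Soup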


In [47]:
# soup object
# Built from the raw bytes in response.content. Note that this rebinds `soup`,
# which until now held the parsed local html-doc.html.

soup = BeautifulSoup(response.content, "html.parser")

In [48]:
# Preview only the start of the page: the full document is far too long to
# read as cell output.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html class="no-js" lang="en-GB">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title data-rh="true">
   Premier League Top Scorers - BBC Sport
  </title>
  <meta content="Premier League top scorers. Showing assists, time on pitch and the shots on and off target." data-rh="true" name="description"/>
  <meta content="#FFFFFF" data-rh="true" name="theme-color"/>
  <meta content="Premier League top scorers. Showing assists, time on pitch and the shots on and off target." data-rh="true" property="og:description"/>
  <meta content="https://static.files.bbci.co.uk/core/website/assets/static/sport/bbc-sport-logo.0da9386782.png" data-rh="true" property="og:image"/>
  <meta content="BBC Sport" data-rh="true" property="og:site_name"/>
  <meta content="Premier League Top Scorers - BBC Sport" data-rh="true" property="og:title"/>
  <meta content="article" data-rh="true" property="og:type"/>
  <meta content="https://www.b

## 5. HTML parsing with Beautiful Soup: A Mini Project

- scrape data from a public website  
- organize the data as a dataframe  
- export the data as an excel sheet


In [49]:
# The live site was outdated and the data could not be found there,
# so a saved copy of the page's HTML is parsed for the rest of the task.
# Binary mode lets Beautiful Soup detect the encoding itself instead of
# depending on the platform's default text encoding.
with open("bbc sports top scorer.html", "rb") as file:
    soup = BeautifulSoup(file, "html.parser")

# Preview only the start of the document; the full page is too long to read as cell output.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html class="no-js" lang="en-GB">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title data-rh="true">
   Premier League Top Scorers - BBC Sport
  </title>
  <meta content="Premier League top scorers. Showing assists, time on pitch and the shots on and off target." data-rh="true" name="description"/>
  <meta content="#FFFFFF" data-rh="true" name="theme-color"/>
  <meta content="Premier League top scorers. Showing assists, time on pitch and the shots on and off target." data-rh="true" property="og:description"/>
  <meta content="https://static.files.bbci.co.uk/core/website/assets/static/sport/bbc-sport-logo.0da9386782224b380455.png" data-rh="true" property="og:image"/>
  <meta content="BBC Sport" data-rh="true" property="og:site_name"/>
  <meta content="Premier League Top Scorers - BBC Sport" data-rh="true" property="og:title"/>
  <meta content="article" data-rh="true" property="og:type"/>
  <meta content="htt

In [50]:
# One accumulator list per output column: the six names are bound to six
# separate empty lists, which are filled while the table rows are parsed.
player_names, team_names, goals, assists, num_matches, shots = (
    [] for _ in range(6)
)

In [51]:
# This is the original code for scraping the site directly with requests. The live page was not available,
# so an exact copy of its HTML was saved and used for the demo task instead.

# try:
#     response = requests.get(url)
#     response.raise_for_status()
# except Exception as e:
#     print(e)
# else:
#     soup = BeautifulSoup(response.content, 'html.parser')
#     players = soup.find('tbody').find_all('tr', class_ = 'ssrcss-dhlz6k-TableRowBody e1icz100')
#     for player in players:
#         player_name = player.find('div', class_ = 'ssrcss-m6ah29-PlayerName e1n8xy5b1').get_text(strip = True)
#         team_name = player.find('div', class_ = 'ssrcss-qvpga1-TeamsSummary e1n8xy5b0').get_text(strip = True)
#         goals_scored = int(player.find('div', class_ = 'ssrcss-8k20kk-CellWrapper ef9ipf0').get_text(strip = True))

#         stats = player.find_all('div', class_ = 'ssrcss-150z8d-CellWrapper ef9ipf0')
#         assists_made = int(stats[0].get_text(strip = True))
#         matches_played = int(stats[2].get_text(strip = True))
#         shots_taken = int(stats[-3].get_text(strip = True))

#         player_names.append(player_name)
#         team_names.append(team_name)
#         goals.append(goals_scored)
#         assists.append(assists_made)
#         num_matches.append(matches_played)
#         shots.append(shots_taken)

#     data = {
#         'player': player_names,
#         'team': team_names,
#         'matches': num_matches,
#         'goals': goals,
#         'assists': assists,
#         'shots': shots
#     }
    
#     df_players = pd.DataFrame(data)

In [52]:
# Parse the player rows of the saved top-scorers table into a DataFrame.
# The column lists are rebound to fresh empty lists first, so re-running this
# cell rebuilds the data instead of appending a second copy of every row.
player_names, team_names, goals = [], [], []
assists, num_matches, shots = [], [], []

tbody = soup.find('tbody')
if tbody is None:
    # Stop with a clear message: carrying on would leave df_players undefined,
    # or still holding the result of an earlier run.
    raise ValueError("no <tbody> element found in the parsed HTML")

players = tbody.find_all('tr', class_='ssrcss-dhlz6k-TableRowBody e1icz100')

for player in players:
    player_name = player.find('div', class_='ssrcss-m6ah29-PlayerName e1n8xy5b1').get_text(strip=True)
    team_name = player.find('div', class_='ssrcss-qvpga1-TeamsSummary e1n8xy5b0').get_text(strip=True)
    goals_scored = int(player.find('div', class_='ssrcss-8k20kk-CellWrapper ef9ipf0').get_text(strip=True))

    # The remaining numeric cells share one CSS class and are picked by position.
    # NOTE(review): positions 0, 2 and -3 are taken to be the assists,
    # matches-played and shots cells; confirm against the table's column order.
    stats = player.find_all('div', class_='ssrcss-150z8d-CellWrapper ef9ipf0')
    assists_made = int(stats[0].get_text(strip=True))
    matches_played = int(stats[2].get_text(strip=True))
    shots_taken = int(stats[-3].get_text(strip=True))

    player_names.append(player_name)
    team_names.append(team_name)
    goals.append(goals_scored)
    assists.append(assists_made)
    num_matches.append(matches_played)
    shots.append(shots_taken)

# Create DataFrame; the order of the dict keys sets the column order
data = {
    'player': player_names,
    'team': team_names,
    'matches': num_matches,
    'goals': goals,
    'assists': assists,
    'shots': shots
}

df_players = pd.DataFrame(data)


In [53]:
df_players

Unnamed: 0,player,team,matches,goals,assists,shots
0,E. Haaland,Man City,12,12,0,62
1,Mohamed Salah,Liverpool,12,10,6,40
2,B. Mbeumo,Brentford,12,8,1,22
3,C. Wood,Nottm Forest,12,8,0,21
4,C. Palmer,Chelsea,12,7,5,38
5,N. Jackson,Chelsea,12,7,3,28
6,Matheus Cunha,Wolves,12,7,3,35
7,Y. Wissa,Brentford,9,7,1,17
8,O. Watkins,Aston Villa,12,6,2,31
9,D. Welbeck,Brighton,12,6,2,31


In [54]:
df_players.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   player   27 non-null     object
 1   team     27 non-null     object
 2   matches  27 non-null     int64 
 3   goals    27 non-null     int64 
 4   assists  27 non-null     int64 
 5   shots    27 non-null     int64 
dtypes: int64(4), object(2)
memory usage: 1.4+ KB


In [55]:
df_players.describe()

Unnamed: 0,matches,goals,assists,shots
count,27.0,27.0,27.0,27.0
mean,11.333333,5.592593,1.888889,27.851852
std,1.330124,2.098704,1.908147,11.870143
min,6.0,4.0,0.0,9.0
25%,11.0,4.0,1.0,20.0
50%,12.0,5.0,2.0,28.0
75%,12.0,7.0,2.0,34.5
max,12.0,12.0,8.0,62.0


In [57]:
pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl

   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   ---------------------------------------- 2/2 [openpyxl]

Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.


In [58]:
# Write the scraped table to an Excel workbook, leaving out the row index.
df_players.to_excel("EPL Top Scorers.xlsx", index=False)