# By Alex Dance

https://www.linkedin.com/in/alex-dance/

# Web Scraping in Python (using BeautifulSoup)


In [1]:
## Import Libraries
import regex as re
from urllib.parse import unquote
import urllib3
from bs4 import BeautifulSoup

# Research
*  https://stackoverflow.com/questions/46490626/getting-all-links-from-a-page-beautiful-soup/46490657

In [2]:
# Landing page of the wiki that the rest of the notebook scrapes.
page = "https://billieeilish.fandom.com/wiki/Billie_Eilish_Wiki"

In [3]:
# Download the document addressed by `page`. On a 200 response the name
# `page` is rebound from the URL string to the raw response body (bytes).
# NOTE: because of that rebinding, re-running this cell after a successful
# fetch would hand the bytes body to http.request() as the URL.
http = urllib3.PoolManager()
r = http.request('GET', page)
if r.status != 200:
    # Any non-200 status is only reported; `page` keeps its previous value.
    print('Some problem occurred. Request Status: %s' % r.status)
else:
    page = r.data
    print('Type of the variable \'page\':', type(page).__name__)
    print('Page Retrieved. Request Status: %d, Page Size: %d' % (r.status, len(page)))

Type of the variable 'page': bytes
Page Retrieved. Request Status: 200, Page Size: 225087




In [4]:
# Turn the downloaded markup into a navigable tree, using the 'html.parser'
# backend, and keep the tree in `soup` for every later cell.
soup = BeautifulSoup(markup=page, features='html.parser')
print('Type of the variable \'soup\':', type(soup).__name__)

Type of the variable 'soup': BeautifulSoup


In [5]:
# Preview only the first 3000 characters of the pretty-printed markup.
print(soup.prettify()[:3000])

<!DOCTYPE html>
<html class="" dir="ltr" lang="en">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, user-scalable=yes" name="viewport"/>
  <meta content="MediaWiki 1.19.24" name="generator">
   <meta content="Billie Eilish Wiki,wherearetheavocados,Billie Eilish Wiki,When We All Fall Asleep, Where Do We Go?,Don't Smile at Me,She's Broken,Fingers Crossed,Six Feet Under,Ocean Eyes,Bellyache,Bored,Watch,Copycat" name="keywords">
    <meta content="Welcome The Billie Eilish Wiki is the free encyclopedia and a collaborative community website that provides details of the American alt pop singer Billie Eilish, including you, can edit! We have topics about Billie Eilish, her music, and many more! Before editing, please read our policies below..." name="description"/>
    <meta content="summary" name="twitter:card"/>
    <meta content="@getfandom" name="twitter:site"/>
    <meta content="https://billieeilish.fandom.com/wiki/Bill

In [6]:
# Show the <title> element twice: first the whole tag, then only its text.
# The colons around %s make leading/trailing whitespace visible.
print('Title tag :%s:' % (soup.title,))
print('Title text:%s:' % (soup.title.string,))

Title tag :<title>Billie Eilish Wiki | Fandom</title>:
Title text:Billie Eilish Wiki | Fandom:


In [7]:
# Every tag (of any name) that carries both an `id` and an `href` attribute;
# True as a filter value means "attribute present, whatever its value".
tags = soup.find_all(attrs={'id': True, 'href': True})

In [8]:
# Dump the complete result list held in `tags` (the output is long).
print(tags)

[<a class="wds-button" data-tracking="ca-viewsource" href="/wiki/Billie_Eilish_Wiki?action=edit" id="ca-viewsource">
<svg class="wds-icon wds-icon-small" id="wds-icons-lock-small" viewbox="0 0 18 18" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><defs><path d="M14 16H4V8h10v8zM7 4c0-1.104.897-2 2-2s2 .896 2 2v2H7V4zm8 2h-2V4c0-2.205-1.794-4-4-4S5 1.795 5 4v2H3a1 1 0 0 0-1 1v10a1 1 0 0 0 1 1h12a1 1 0 0 0 1-1V7a1 1 0 0 0-1-1zm-5 4H8a1 1 0 1 0 0 2v1a1 1 0 1 0 2 0v-1a1 1 0 1 0 0-2" id="lock-small"></path></defs><use fill-rule="evenodd" xlink:href="#lock-small"></use></svg> <span>View source</span>
</a>, <a data-tracking="ca-history-dropdown" href="/wiki/Billie_Eilish_Wiki?action=history" id="ca-history">
							History						</a>, <a class="new" data-tracking="ca-talk-dropdown" href="/wiki/Talk:Billie_Eilish_Wiki?action=edit&amp;redlink=1" id="ca-talk">
							Talk (0)						</a>, <a class="wds-button wds-is-secondary" href="#" id="ShareEntryPoint">
<svg class

In [10]:
# Collect every <a> tag that has an href attribute.
links = soup.find_all('a', href=True)

In [11]:
# Dump the full list of anchor tags, markup included (the output is long).
print(links)

[<a class="wds-global-navigation__logo" data-tracking-label="logo" href="https://www.fandom.com">
<svg class="wds-global-navigation__logo-image" id="wds-company-logo-fandom-white" viewbox="0 0 164 35" xmlns="http://www.w3.org/2000/svg"><g fill="none" fill-rule="evenodd"><path d="M32.003 16.524c0 .288-.115.564-.32.768L18.3 30.712c-.226.224-.454.324-.738.324-.292 0-.55-.11-.77-.325l-.943-.886a.41.41 0 0 1-.01-.59l15.45-15.46c.262-.263.716-.078.716.29v2.46zm-17.167 10.12l-.766.685a.642.642 0 0 1-.872-.02L3.01 17.362c-.257-.25-.4-.593-.4-.95v-1.858c0-.67.816-1.007 1.298-.536l10.814 10.56c.188.187.505.57.505 1.033 0 .296-.068.715-.39 1.035zM5.73 7.395L9.236 3.93a.421.421 0 0 1 .592 0l11.736 11.603a3.158 3.158 0 0 1 0 4.5l-3.503 3.462a.423.423 0 0 1-.59 0L5.732 11.89a3.132 3.132 0 0 1-.937-2.25c0-.85.332-1.65.935-2.246zm13.89 1.982l3.662-3.62a3.232 3.232 0 0 1 2.737-.897c.722.098 1.378.47 1.893.978l3.708 3.667a.41.41 0 0 1 0 .585l-5.64 5.576a.419.419 0 0 1-.59 0l-5.77-5.704a.411.411 0 0 1 0-

In [12]:
# Emit only the target URL of each anchor that has an href, one per line.
# The loop variable stays bound to the final anchor once the loop ends, and
# later cells read `link` directly, so its name must not change.
for link in soup.find_all('a', href=True):
    print(link.get('href'))

https://www.fandom.com
https://www.fandom.com/topics/games
https://www.fandom.com/topics/movies
https://www.fandom.com/topics/tv
https://www.fandom.com/video
https://www.fandom.com/explore
//community.fandom.com/wiki/Community_Central
https://ucp.fandom.com/wiki/Special:CreateNewWiki
https://www.fandom.com/signin?redirect=https%3A%2F%2Fbillieeilish.fandom.com%2Fwiki%2FBillie_Eilish_Wiki
https://www.fandom.com/register?redirect=https%3A%2F%2Fbillieeilish.fandom.com%2Fwiki%2FBillie_Eilish_Wiki
https://ucp.fandom.com/wiki/Special:CreateNewWiki
//billieeilish.fandom.com
//billieeilish.fandom.com
/wiki/Special:CreatePage
/wiki/People
/wiki/Category:Family_Members
/wiki/Billie_Eilish
/wiki/Finneas_O%27Connell
/wiki/Maggie_Baird
/wiki/Patrick_O%27Connell
/wiki/Billie_Eilish
/wiki/Covers
/wiki/Hotline_Bling
/wiki/Can%27t_Help_Falling_in_Love
/wiki/Dollhouse
/wiki/If_You_Want_Me
/wiki/Jealous
/wiki/Extended_Plays
/wiki/Don%27t_Smile_at_Me
/wiki/Albums
/wiki/When_We_All_Fall_Asleep,_Where_Do_We_

In [13]:
print(links[2]) # the 3rd link (index 2; list indexing is zero-based)

<a class="wds-global-navigation__link" data-tracking-label="link.movies" href="https://www.fandom.com/topics/movies">
	Movies</a>


In [14]:
# `link` is not assigned in this cell: it is still bound to the final element
# left behind by the most recent `for link in ...` loop.
print(link['href'])

#


In [15]:
# Same attribute lookup through .get(), which yields None rather than raising
# when the attribute is absent.
print(link.get('href'))

#


In [16]:
link = links[2]          # third link in the list (index 2, zero-based)
url1  = link['href']      # get value of the href attribute
url2  = link.get('href')  # or like this

In [17]:
# href obtained with subscript access (link['href']).
print(url1)

https://www.fandom.com/topics/movies


In [18]:
# href obtained with link.get('href'); expected to equal url1.
print(url2)

https://www.fandom.com/topics/movies


In [19]:
# Dump the whole `links` list once more (the output is long).
print(links)

[<a class="wds-global-navigation__logo" data-tracking-label="logo" href="https://www.fandom.com">
<svg class="wds-global-navigation__logo-image" id="wds-company-logo-fandom-white" viewbox="0 0 164 35" xmlns="http://www.w3.org/2000/svg"><g fill="none" fill-rule="evenodd"><path d="M32.003 16.524c0 .288-.115.564-.32.768L18.3 30.712c-.226.224-.454.324-.738.324-.292 0-.55-.11-.77-.325l-.943-.886a.41.41 0 0 1-.01-.59l15.45-15.46c.262-.263.716-.078.716.29v2.46zm-17.167 10.12l-.766.685a.642.642 0 0 1-.872-.02L3.01 17.362c-.257-.25-.4-.593-.4-.95v-1.858c0-.67.816-1.007 1.298-.536l10.814 10.56c.188.187.505.57.505 1.033 0 .296-.068.715-.39 1.035zM5.73 7.395L9.236 3.93a.421.421 0 0 1 .592 0l11.736 11.603a3.158 3.158 0 0 1 0 4.5l-3.503 3.462a.423.423 0 0 1-.59 0L5.732 11.89a3.132 3.132 0 0 1-.937-2.25c0-.85.332-1.65.935-2.246zm13.89 1.982l3.662-3.62a3.232 3.232 0 0 1 2.737-.897c.722.098 1.378.47 1.893.978l3.708 3.667a.41.41 0 0 1 0 .585l-5.64 5.576a.419.419 0 0 1-.59 0l-5.77-5.704a.411.411 0 0 1 0-

### Create a filter for unwanted types of articles

In [45]:
# Keep only tags whose href matches the pattern "sleep". The pattern is
# lowercase and unanchored, so it matches anywhere inside the href and
# therefore also hits URLs containing "Asleep".
soup.find_all(href=re.compile("sleep"))

[<a data-tracking="custom-level-3" href="/wiki/When_We_All_Fall_Asleep,_Where_Do_We_Go%3F">WHEN WE ALL FALL ASLEEP, WHERE DO WE GO?</a>,
 <a class="image link-internal" href="/wiki/When_We_All_Fall_Asleep,_Where_Do_We_Go%3F" style="height:125px; width:125px;" title="When_We_All_Fall_Asleep,_Where_Do_We_Go? (513 KB)"><noscript><img alt="When_We_All_Fall_Asleep,_Where_Do_We_Go?" class="thumbimage" data-image-key="Whenwe.jpg" data-image-name="Whenwe.jpg" src="https://vignette.wikia.nocookie.net/wherearetheavocados/images/9/93/Whenwe.jpg/revision/latest/scale-to-width-down/125?cb=20190130213736" style="" title="When_We_All_Fall_Asleep,_Where_Do_We_Go? (513 KB)"/></noscript><img alt="When_We_All_Fall_Asleep,_Where_Do_We_Go?" class="thumbimage lzy lzyPlcHld" data-image-key="Whenwe.jpg" data-image-name="Whenwe.jpg" data-src="https://vignette.wikia.nocookie.net/wherearetheavocados/images/9/93/Whenwe.jpg/revision/latest/scale-to-width-down/125?cb=20190130213736" onload='if(typeof ImgLzy=="objec

# Extra dabbling

In [20]:
# HTTP status code of the response object `r` returned by http.request().
print(r.status)

200


In [21]:
# Response headers of `r` (printed as an HTTPHeaderDict).
print(r.headers)

HTTPHeaderDict({'Connection': 'keep-alive', 'Content-Length': '225087', 'Content-Language': 'en', 'Content-Security-Policy': 'upgrade-insecure-requests', 'Content-Security-Policy-Report-Only': "default-src https: 'self' data: blob:; script-src https: 'self' data: 'unsafe-inline' 'unsafe-eval' blob:; style-src https: 'self' 'unsafe-inline' blob:; report-uri https://services.fandom.com/csp-logger/csp/app", 'Content-Type': 'text/html; charset=utf-8', 'Etag': '"20200702114552-1593801086141"', 'Last-Modified': 'Thu, 02 Jul 2020 11:45:52 GMT', 'X-Backend-Response-Time': '0.088', 'X-Content-Type-Options': 'nosniff', 'X-Span-Id': 'a19b02eb-9197-48d1-b131-06a6baacbe96', 'X-Trace-Id': '3a281d45-b27f-43f8-8c43-f58312bf6c63', 'X-Datacenter': 'SJC', 'X-Cacheable': 'YES', 'Accept-Ranges': 'bytes', 'Date': 'Wed, 08 Jul 2020 10:55:31 GMT', 'Age': '12654', 'X-Served-By': 'mediawiki-prod-ucp-b45c7bfff-s5q5h, cache-wk-sjc3163-WIKIA, cache-syd10144-SYD', 'X-Cache': 'ORIGIN, HIT, HIT', 'X-Cache-Hits': 'ORI

In [22]:
# Concrete class of the response object returned by urllib3.
print(type(r))

<class 'urllib3.response.HTTPResponse'>


In [23]:
# Rebind `links` to every <a> tag. There is no href filter here, so the list
# can contain anchors that have no href attribute.
links = soup.find_all("a")

In [24]:
# Re-collect every <a> tag (with or without href) and dump the list,
# followed by the same blank gap that a separate print("\n") would leave.
links = soup.find_all(name="a")
print(links, end="\n\n\n")

[<a class="wds-global-navigation__logo" data-tracking-label="logo" href="https://www.fandom.com">
<svg class="wds-global-navigation__logo-image" id="wds-company-logo-fandom-white" viewbox="0 0 164 35" xmlns="http://www.w3.org/2000/svg"><g fill="none" fill-rule="evenodd"><path d="M32.003 16.524c0 .288-.115.564-.32.768L18.3 30.712c-.226.224-.454.324-.738.324-.292 0-.55-.11-.77-.325l-.943-.886a.41.41 0 0 1-.01-.59l15.45-15.46c.262-.263.716-.078.716.29v2.46zm-17.167 10.12l-.766.685a.642.642 0 0 1-.872-.02L3.01 17.362c-.257-.25-.4-.593-.4-.95v-1.858c0-.67.816-1.007 1.298-.536l10.814 10.56c.188.187.505.57.505 1.033 0 .296-.068.715-.39 1.035zM5.73 7.395L9.236 3.93a.421.421 0 0 1 .592 0l11.736 11.603a3.158 3.158 0 0 1 0 4.5l-3.503 3.462a.423.423 0 0 1-.59 0L5.732 11.89a3.132 3.132 0 0 1-.937-2.25c0-.85.332-1.65.935-2.246zm13.89 1.982l3.662-3.62a3.232 3.232 0 0 1 2.737-.897c.722.098 1.378.47 1.893.978l3.708 3.667a.41.41 0 0 1 0 .585l-5.64 5.576a.419.419 0 0 1-.59 0l-5.77-5.704a.411.411 0 0 1 0-

In [25]:
# Show every anchor whose visible text contains "About", then its target URL.
# `links` comes from find_all("a") with no href filter, so a matching anchor
# may have no href at all: .get() prints None for it, whereas indexing
# attrs['href'] would raise KeyError and abort the loop.
for link in links:
    if "About" in link.text:
        print(link)
        print(link.get('href'))

<a class="wds-global-footer__link" data-tracking-label="company-overview.about" href="https://www.fandom.com/about">
	About</a>
https://www.fandom.com/about


In [None]:
# by Alex Dance