# Web Scraping (using BeautifulSoup) by Alex Dance


## Find a Page
Using [Fandom](http://fandom.wikia.com) as the source site.

In [1]:
## Import Libraries
import regex as re
from urllib.parse import unquote
import urllib3
from bs4 import BeautifulSoup

# Research
*  https://stackoverflow.com/questions/46490626/getting-all-links-from-a-page-beautiful-soup/46490657

In [2]:
# Address of the wiki landing page that the rest of the notebook downloads and parses.
page = 'https://billieeilish.fandom.com/wiki/Billie_Eilish_Wiki'

In [3]:
# Download the URL currently held in `page` and report the outcome.
# NOTE: on success `page` is rebound from the URL string to the raw response
# body, so re-running this cell on its own would pass that body to the request
# instead of a URL; re-run the cell that sets the URL first.
http = urllib3.PoolManager()
r = http.request('GET', page)
if r.status != 200:
    print('Some problem occurred. Request Status: %s' % r.status)
else:
    page = r.data
    print('Type of the variable \'page\':', type(page).__name__)
    print('Page Retrieved. Request Status: %d, Page Size: %d' % (r.status, len(page)))

Type of the variable 'page': bytes
Page Retrieved. Request Status: 200, Page Size: 194858


In [4]:
# Turn the downloaded markup into a navigable BeautifulSoup tree (kept in `soup`),
# using the 'html.parser' backend, then confirm what kind of object we got back.
soup = BeautifulSoup(page, 'html.parser')
print('Type of the variable \'soup\':', type(soup).__name__)

Type of the variable 'soup': BeautifulSoup


In [5]:
# Peek at a 300-character slice (offsets 500-799) of the pretty-printed markup
# instead of dumping the whole document.
print(soup.prettify()[500:800])

ee3002f200f50","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Billie_Eilish_Wiki","wgTitle":"Billie Eilish Wiki","wgCurRevisionId":17195,"wgRevisionId":17195,"wgArticleId":2,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view"


In [6]:
# Show the <title> element itself, then only the text it contains.
title_tag = soup.title
print('Title tag :%s:' % title_tag)
print('Title text:%s:' % title_tag.string)

Title tag :<title>Billie Eilish Wiki | Fandom</title>:
Title text:Billie Eilish Wiki | Fandom:


In [7]:
# Collect every tag that carries both an `id` and an `href` attribute
# (passing True matches the attribute regardless of its value).
tags = soup.find_all(id = True, href = True) 

In [8]:
# Dump the matched tags; this prints the entire result list, so the output is long.
print(tags)

[<a class="wds-button wds-is-full-width global-navigation__register-link" data-tracking-label="account.register" href="https://auth.fandom.com/register?source=mw&amp;redirect=https%3A%2F%2Fbillieeilish.fandom.com%2Fwiki%2FBillie_Eilish_Wiki" id="global-navigation-register-link" rel="nofollow">
	Register</a>, <a aria-label="Sign In" class="wds-button wds-is-full-width wds-is-secondary global-navigation__signin-link" data-tracking-label="account.sign-in" href="https://auth.fandom.com/signin?source=mw&amp;redirect=https%3A%2F%2Fbillieeilish.fandom.com%2Fwiki%2FBillie_Eilish_Wiki" id="global-navigation-sign-in-link" rel="nofollow">
	Sign In</a>, <a aria-label="Sign In" class="wds-button wds-is-secondary global__signin-link" data-tracking-label="account.sign-in" href="https://auth.fandom.com/signin?source=mw&amp;redirect=https%3A%2F%2Fbillieeilish.fandom.com%2Fwiki%2FBillie_Eilish_Wiki" id="global-sign-in-link" rel="nofollow">
			Sign In		</a>, <a class="wds-button global__register-link" da

### Links in the text

In [9]:
# Every <a> tag that has an href attribute.
links = soup.find_all('a', href=True)

In [10]:
# Inspect the container type that find_all() handed back.
type(links)

bs4.element.ResultSet

In [11]:
#print(links)

In [12]:
# Sample the page's links: print the href of roughly every tenth <a href=...> tag.
# The counter is 2 for the first anchor (equivalent to starting at 1 and
# incrementing before the test), so anchors at 0-based positions 8, 18, 28, ...
# are the ones shown.
for dum, link in enumerate(soup.find_all('a', href=True), start=2):
    if dum % 10 == 0:
        print(link['href'])

https://billieeilish.fandom.com/wiki/About
https://billieeilish.fandom.com/wiki/WHEN_WE_ALL_FALL_ASLEEP,_WHERE_DO_WE_GO%3F
https://billieeilish.fandom.com/wiki/Lovely
https://billieeilish.fandom.com/wiki/If_You_Want_Me
https://billieeilish.fandom.com/wiki/List_of_Unreleased_Songs
https://billieeilish.fandom.com/wiki/Bureaucrats
https://www.fandom.com/
//createnewwiki.fandom.com/Special:CreateNewWiki
#
https://billieeilish.fandom.com/wiki/Billie_Eilish/Hair_Colors
https://billieeilish.fandom.com/wiki/Don%27t_Smile_at_Me
https://billieeilish.fandom.com/wiki/Your_Power
https://billieeilish.fandom.com/wiki/6.18.18
https://billieeilish.fandom.com/wiki/Where%27s_My_Mind_Tour
https://billieeilish.fandom.com/wiki/User:URRRO
/wiki/Special:ListAdmins
/wiki/Bellyache
/wiki/When_the_Party%27s_Over
/wiki/Therefore_I_Am
https://harrystyles.fandom.com/wiki/Harry_Styles_Wikia
/wiki/Special:Categories
https://www.muthead.com/
https://www.fandom.com/careers
https://www.fandom.com/do-not-sell-my-info


In [13]:
print(links[2]) # index 2 is the third link in `links` (indexing starts at 0)

<a data-tracking="explore-main-page" href="https://billieeilish.fandom.com/wiki/Billie_Eilish_Wiki">
<svg class="wds-icon-tiny wds-icon navigation-item-icon"><use xlink:href="#wds-icons-home-tiny"></use></svg> <span>Main Page</span>
</a>


In [14]:
# `link` is still bound to whatever the preceding for-loop assigned on its
# final iteration (assuming the cells ran in the order shown).
print(link['href'])

#


In [15]:
# Same attribute, read through .get() instead of subscript access.
print(link.get('href'))

#


In [16]:
link = links[2]          # third entry of `links` (index 2), not the first
url1  = link['href']      # read the href attribute via subscript access
url2  = link.get('href')  # same attribute read through .get()

In [17]:
# href obtained with subscript access (link['href'])
print(url1)

https://billieeilish.fandom.com/wiki/Billie_Eilish_Wiki


In [18]:
# href obtained with link.get('href'); expected to match url1
print(url2)

https://billieeilish.fandom.com/wiki/Billie_Eilish_Wiki


In [19]:
#print(links)

### Create a filter for unwanted types of articles

In [20]:
# Find every tag whose href contains the substring "sleep" (so "Asleep" matches too).
# NOTE(review): the pattern is case-sensitive, so hrefs spelled in upper case
# (e.g. ".../WHEN_WE_ALL_FALL_ASLEEP,...") are not returned -- confirm that is intended.
sleep_pattern = re.compile("sleep")
soup.find_all(href=sleep_pattern)

[<a data-tracking="custom-level-2" href="https://billieeilish.fandom.com/wiki/When_We_All_Fall_Asleep,_World_Tour">
 <span>When We All Fall Asleep, World Tour</span>
 </a>,
 <a data-tracking="custom-level-2" href="https://billieeilish.fandom.com/wiki/When_We_All_Fall_Asleep,_World_Tour">
 <span>When We All Fall Asleep, World Tour</span>
 </a>]

# Extra dabbling

In [21]:
# HTTP status code of the response stored in `r`
print(r.status)

200


In [22]:
# Response headers of the response stored in `r`
print(r.headers)

HTTPHeaderDict({'Connection': 'keep-alive', 'Content-Length': '194858', 'content-type': 'text/html; charset=UTF-8', 'x-content-type-options': 'nosniff', 'content-language': 'en', 'report-to': '{"group":"nel","max_age":604800,"include_subdomains":true,"endpoints":[{"url":"https://services.fandom.com/browser-errors/report"}]}', 'nel': '{"report_to":"nel","max_age":604800,"include_subdomains":true,"failure_fraction":0.01}', 'content-security-policy': 'upgrade-insecure-requests', 'content-security-policy-report-only': "script-src 'unsafe-eval' blob: 'self' https: 'self' data: 'unsafe-inline' 'unsafe-eval' blob: 'unsafe-inline' internal-soap.wikia.com internal-soap.fandom.com internal-soap.wikia.org internal-soap.gamepedia.com www.fandom.com www.wikia.com www.wikia.org www.gamepedia.com; default-src 'self' data: blob: https://images.wikia.com https://static.wikia.nocookie.net https: 'self' data: blob: internal-soap.wikia.com internal-soap.fandom.com internal-soap.wikia.org internal-soap.gam

In [23]:
# Class of the response object returned by http.request()
print(type(r))

<class 'urllib3.response.HTTPResponse'>


In [24]:
# Rebinds `links` to every <a> tag on the page, this time without requiring
# an href attribute, so some entries may have no href.
links = soup.find_all("a")
#print(links)
#print("\n")

In [25]:
# Show every anchor whose visible text contains "About", followed by its target URL.
# `links` was collected with find_all("a"), which does not require an href, so the
# attribute is read with .get(): it yields None for an anchor without href instead
# of raising KeyError and aborting the loop.
for link in links:
    if "About" in link.text:
        print(link)
        print(link.get('href'))

<a data-tracking="custom-level-1" href="https://billieeilish.fandom.com/wiki/About">
<span>About</span>
</a>
https://billieeilish.fandom.com/wiki/About
<a data-tracking="custom-level-1" href="https://billieeilish.fandom.com/wiki/About_Us">
<span>About Us</span>
</a>
https://billieeilish.fandom.com/wiki/About_Us
<a data-tracking="custom-level-1" href="https://billieeilish.fandom.com/wiki/About">
<span>About</span>
</a>
https://billieeilish.fandom.com/wiki/About
<a data-tracking="custom-level-1" href="https://billieeilish.fandom.com/wiki/About_Us">
<span>About Us</span>
</a>
https://billieeilish.fandom.com/wiki/About_Us
<a aria-label="" class="global-footer__link" data-tracking-label="company-overview.about" href="https://www.fandom.com/about">
													About											</a>
https://www.fandom.com/about
