## Importing packages

In [1]:
from urllib.parse import urlparse

import lxml
import requests
from bs4 import BeautifulSoup

## Making a site request

In [2]:
# Wikipedia article whose hyperlinks are scraped in the rest of the notebook.
site = "https://en.wikipedia.org/wiki/Film"
# A timeout keeps this cell from hanging forever on a stalled connection.
response = requests.get(site, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of silently parsing an error page later.
response.raise_for_status()
response

<Response [200]>

In [3]:
# Raw bytes of the HTTP response body.
html = response.content
# Build the parse tree from those bytes using the lxml parser.
soup = BeautifulSoup(markup=html, features='lxml')
soup

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Film - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"a5ad039a-6e42-4822-a349-0ae2494f1333","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Film","wgTitle":"Film","wgCurRevisionId":1093274684,"wgRevisionId":1093274684,"wgArticleId":21555729,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive template wayback links","CS1: Julian–Gregorian uncertainty","Articles with short description","Short description is different from Wikidata","Wikipedia indefinitely sem

In [4]:
# Collect every <a> element found in the parsed page.
link = soup.find_all(name='a')
link

[<a id="top"></a>,
 <a href="/wiki/Wikipedia:Protection_policy#semi" title="This article is semi-protected."><img alt="Page semi-protected" data-file-height="512" data-file-width="512" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/30px-Semi-protection-shackle.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/40px-Semi-protection-shackle.svg.png 2x" width="20"/></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#searchInput">Jump to search</a>,
 <a class="mw-disambig" href="/wiki/Film_(disambiguation)" title="Film (disambiguation)">Film (disambiguation)</a>,
 <a class="mw-disambig" href="/wiki/Movie_(disambiguation)" title="Movie (disambiguation)">Movie (disambiguation)</a>,
 <a class="mw-redirect mw-disambig" href="/

In [5]:
# Pick a single anchor out of the list to inspect on its own.
# NOTE(review): index 26 is position-dependent; it selected the "Film finance"
# anchor for the page as fetched above -- confirm it still does before re-running.
lnk = link[26]
lnk 

<a href="/wiki/Film_finance" title="Film finance">Film finance</a>

In [6]:
# Text enclosed by the selected anchor tag.
lnk.string

'Film finance'

In [7]:
# Read the anchor's href attribute; for this anchor it is a site-relative path.
lnk['href']

'/wiki/Film_finance'

In [8]:
# obtain the absolute URL address
from urllib.parse import urljoin

In [9]:
# Show the page URL; it serves as the base when relative links are resolved below.
site

'https://en.wikipedia.org/wiki/Film'

In [12]:
# Keep the selected anchor's href (a site-relative path) for joining with the base URL.
relative_url = lnk['href']
relative_url

'/wiki/Film_finance'

In [14]:
# Resolve the relative path against the page URL to get an absolute address.
full_url = urljoin(base=site, url=relative_url)
full_url

'https://en.wikipedia.org/wiki/Film_finance'

## Processing multiple links

In [16]:
# href of every anchor on the page; anchors without one yield None.
[anchor.get('href') for anchor in link]

[None,
 '/wiki/Wikipedia:Protection_policy#semi',
 '#mw-head',
 '#searchInput',
 '/wiki/Film_(disambiguation)',
 '/wiki/Movie_(disambiguation)',
 '/wiki/Moving_picture_(disambiguation)',
 '/wiki/File:Question_book-new.svg',
 '/wiki/Wikipedia:Verifiability',
 'https://en.wikipedia.org/w/index.php?title=Film&action=edit',
 '/wiki/Help:Referencing_for_beginners',
 '//www.google.com/search?as_eq=wikipedia&q=%22Film%22',
 '//www.google.com/search?tbm=nws&q=%22Film%22+-wikipedia&tbs=ar:1',
 '//www.google.com/search?&q=%22Film%22&tbs=bkt:s&tbm=bks',
 '//www.google.com/search?tbs=bks:1&q=%22Film%22+-wikipedia',
 '//scholar.google.com/scholar?q=%22Film%22',
 'https://www.jstor.org/action/doBasicSearch?Query=%22Film%22&acc=on&wc=on',
 '/wiki/Help:Maintenance_template_removal',
 '/wiki/Category:Filmmaking',
 '/wiki/Filmmaking',
 '/wiki/File:Video-x-generic.svg',
 '/wiki/Filmmaking#Development',
 '/wiki/Step_outline',
 '/wiki/Film_treatment',
 '/wiki/Scriptment',
 '/wiki/Screenplay',
 '/wiki/Film_

In [17]:
# Drop the anchors that carry no href attribute (Tag.get returns None for them).
# `is not None` is the idiomatic identity test for the None singleton.
clean_link = [anchor for anchor in link if anchor.get('href') is not None]

In [18]:
# Gather the href value of each remaining anchor. Despite the variable name,
# the values are a mix of relative paths, fragments and absolute URLs.
relative_url = [anchor.get('href') for anchor in clean_link]
relative_url

['/wiki/Wikipedia:Protection_policy#semi',
 '#mw-head',
 '#searchInput',
 '/wiki/Film_(disambiguation)',
 '/wiki/Movie_(disambiguation)',
 '/wiki/Moving_picture_(disambiguation)',
 '/wiki/File:Question_book-new.svg',
 '/wiki/Wikipedia:Verifiability',
 'https://en.wikipedia.org/w/index.php?title=Film&action=edit',
 '/wiki/Help:Referencing_for_beginners',
 '//www.google.com/search?as_eq=wikipedia&q=%22Film%22',
 '//www.google.com/search?tbm=nws&q=%22Film%22+-wikipedia&tbs=ar:1',
 '//www.google.com/search?&q=%22Film%22&tbs=bkt:s&tbm=bks',
 '//www.google.com/search?tbs=bks:1&q=%22Film%22+-wikipedia',
 '//scholar.google.com/scholar?q=%22Film%22',
 'https://www.jstor.org/action/doBasicSearch?Query=%22Film%22&acc=on&wc=on',
 '/wiki/Help:Maintenance_template_removal',
 '/wiki/Category:Filmmaking',
 '/wiki/Filmmaking',
 '/wiki/File:Video-x-generic.svg',
 '/wiki/Filmmaking#Development',
 '/wiki/Step_outline',
 '/wiki/Film_treatment',
 '/wiki/Scriptment',
 '/wiki/Screenplay',
 '/wiki/Film_finance

In [19]:
# Resolve every href against the page URL so that each entry becomes absolute;
# hrefs that are already absolute come through unchanged.
full_url = [urljoin(site, href) for href in relative_url]
full_url

['https://en.wikipedia.org/wiki/Wikipedia:Protection_policy#semi',
 'https://en.wikipedia.org/wiki/Film#mw-head',
 'https://en.wikipedia.org/wiki/Film#searchInput',
 'https://en.wikipedia.org/wiki/Film_(disambiguation)',
 'https://en.wikipedia.org/wiki/Movie_(disambiguation)',
 'https://en.wikipedia.org/wiki/Moving_picture_(disambiguation)',
 'https://en.wikipedia.org/wiki/File:Question_book-new.svg',
 'https://en.wikipedia.org/wiki/Wikipedia:Verifiability',
 'https://en.wikipedia.org/w/index.php?title=Film&action=edit',
 'https://en.wikipedia.org/wiki/Help:Referencing_for_beginners',
 'https://www.google.com/search?as_eq=wikipedia&q=%22Film%22',
 'https://www.google.com/search?tbm=nws&q=%22Film%22+-wikipedia&tbs=ar:1',
 'https://www.google.com/search?&q=%22Film%22&tbs=bkt:s&tbm=bks',
 'https://www.google.com/search?tbs=bks:1&q=%22Film%22+-wikipedia',
 'https://scholar.google.com/scholar?q=%22Film%22',
 'https://www.jstor.org/action/doBasicSearch?Query=%22Film%22&acc=on&wc=on',
 'https

In [20]:
# Keep only the URLs hosted on Wikipedia (internal URLs).
def _is_wikipedia_url(url):
    """Return True when the hostname of *url* is wikipedia.org or a subdomain of it.

    The parsed hostname is compared instead of searching the whole string, so
    an external URL that merely mentions 'wikipedia.org' in its path or query
    (for example an archive snapshot of a Wikipedia page) is not kept.
    """
    # hostname is None for URLs without a network location; treat that as no match.
    host = urlparse(url).hostname or ''
    return host == 'wikipedia.org' or host.endswith('.wikipedia.org')


intern_link = [url for url in full_url if _is_wikipedia_url(url)]
intern_link

['https://en.wikipedia.org/wiki/Wikipedia:Protection_policy#semi',
 'https://en.wikipedia.org/wiki/Film#mw-head',
 'https://en.wikipedia.org/wiki/Film#searchInput',
 'https://en.wikipedia.org/wiki/Film_(disambiguation)',
 'https://en.wikipedia.org/wiki/Movie_(disambiguation)',
 'https://en.wikipedia.org/wiki/Moving_picture_(disambiguation)',
 'https://en.wikipedia.org/wiki/File:Question_book-new.svg',
 'https://en.wikipedia.org/wiki/Wikipedia:Verifiability',
 'https://en.wikipedia.org/w/index.php?title=Film&action=edit',
 'https://en.wikipedia.org/wiki/Help:Referencing_for_beginners',
 'https://en.wikipedia.org/wiki/Help:Maintenance_template_removal',
 'https://en.wikipedia.org/wiki/Category:Filmmaking',
 'https://en.wikipedia.org/wiki/Filmmaking',
 'https://en.wikipedia.org/wiki/File:Video-x-generic.svg',
 'https://en.wikipedia.org/wiki/Filmmaking#Development',
 'https://en.wikipedia.org/wiki/Step_outline',
 'https://en.wikipedia.org/wiki/Film_treatment',
 'https://en.wikipedia.org/wi