### Importing the packages

In [1]:
# Load the packages
import requests
from bs4 import BeautifulSoup

### Making a get request

In [2]:
# Defining the url of the site
base_site = "https://en.wikipedia.org/wiki/Music"

# Making a get request
response = requests.get(base_site)
response

<Response [200]>

In [29]:
# Extracting the HTML
html = response.content
html

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Music - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"2e8c0ca0-7875-4321-a807-2a6e64994483","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Music","wgTitle":"Music","wgCurRevisionId":1030121973,"wgRevisionId":1030121973,"wgArticleId":18839,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["All articles with incomplete citations","Articles with incomplete citations from July 2019","CS1 maint: archived copy as title","CS1: Julian\xe2\x80\x93Gregorian uncertainty","CS1 maint:

### Making the soup

In [42]:
# Convert HTML to a BeautifulSoup object. This will allow us to parse out content from the HTML more easily.
# Using the default parser as it is included in Python
soup = BeautifulSoup(html, "html.parser")

### 1. Extract all existing titles of links

In [5]:
# Find all links on the page 
links = soup.find_all('a')
links

[<a id="top"></a>,
 <a href="/wiki/Wikipedia:Protection_policy#semi" title="This article is semi-protected."><img alt="Page semi-protected" data-file-height="512" data-file-width="512" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/30px-Semi-protection-shackle.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/40px-Semi-protection-shackle.svg.png 2x" width="20"/></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#searchInput">Jump to search</a>,
 <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>,
 <a class="image" href="/wiki/File:Fran%C3%A7ois_Boucher,_Allegory_of_Music,_1764,_NGA_32680.jpg"><img alt="François Boucher, Allegory of Music, 1764, NGA 3268

In [34]:
# Dropping the links without 'href' attribute
hlinks = [l for l in links if l.get('href') != None]

print(hlinks)

[<a href="/wiki/Wikipedia:Protection_policy#semi" title="This article is semi-protected."><img alt="Page semi-protected" data-file-height="512" data-file-width="512" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/20px-Semi-protection-shackle.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/30px-Semi-protection-shackle.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/1/1b/Semi-protection-shackle.svg/40px-Semi-protection-shackle.svg.png 2x" width="20"/></a>, <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>, <a class="mw-jump-link" href="#searchInput">Jump to search</a>, <a class="mw-disambig" href="/wiki/Music_(disambiguation)" title="Music (disambiguation)">Music (disambiguation)</a>, <a class="image" href="/wiki/File:Fran%C3%A7ois_Boucher,_Allegory_of_Music,_1764,_NGA_32680.jpg"><img alt="François Boucher, Allegory of Music, 1764, NGA 32680.jpg" data-file-height

In [20]:
# Getting all titles
title = [l.get('title') for l in hlinks]
title

['This article is semi-protected.',
 None,
 None,
 'Music (disambiguation)',
 None,
 'François Boucher',
 'Sound',
 'Music genre',
 'Paleolithic',
 'Performing arts',
 'Acrobatics',
 'Ballet',
 'List of circus skills',
 'Clown',
 'Dance',
 'Gymnastics',
 'Magic (illusion)',
 'Mime artist',
 'Opera',
 'Professional wrestling',
 'Puppetry',
 'Public speaking',
 'Stand-up comedy',
 'Theatre',
 'Ventriloquism',
 'Template:Performing arts',
 'Template talk:Performing arts',
 None,
 'The arts',
 'Sound',
 'Musical composition',
 'Elements of music',
 None,
 'Cultural universal',
 'Definitions of music',
 'Pitch (music)',
 'Melody',
 'Harmony',
 'Rhythm',
 'Tempo',
 'Meter (music)',
 'Articulation (music)',
 'Dynamics (music)',
 'Timbre',
 'Texture (music)',
 'Music genre',
 'Musical instrument',
 'Singing',
 'Rapping',
 'Instrumental music',
 'A capella',
 'Accompaniment',
 'Greek language',
 'wikt:μουσική',
 'Muses',
 None,
 None,
 'Glossary of musical terminology',
 'Culture',
 'Song',
 'S

In [23]:
# Removing the 'None' titles
cleantitle = [t for t in title if t != None]
cleantitle

['This article is semi-protected.',
 'Music (disambiguation)',
 'François Boucher',
 'Sound',
 'Music genre',
 'Paleolithic',
 'Performing arts',
 'Acrobatics',
 'Ballet',
 'List of circus skills',
 'Clown',
 'Dance',
 'Gymnastics',
 'Magic (illusion)',
 'Mime artist',
 'Opera',
 'Professional wrestling',
 'Puppetry',
 'Public speaking',
 'Stand-up comedy',
 'Theatre',
 'Ventriloquism',
 'Template:Performing arts',
 'Template talk:Performing arts',
 'The arts',
 'Sound',
 'Musical composition',
 'Elements of music',
 'Cultural universal',
 'Definitions of music',
 'Pitch (music)',
 'Melody',
 'Harmony',
 'Rhythm',
 'Tempo',
 'Meter (music)',
 'Articulation (music)',
 'Dynamics (music)',
 'Timbre',
 'Texture (music)',
 'Music genre',
 'Musical instrument',
 'Singing',
 'Rapping',
 'Instrumental music',
 'A capella',
 'Accompaniment',
 'Greek language',
 'wikt:μουσική',
 'Muses',
 'Glossary of musical terminology',
 'Culture',
 'Song',
 'Symphony',
 'Music criticism',
 'Musicology',
 'Ae

### 2. Extract all heading 2 strings.

In [24]:
# Inspect all h2 tags
h2 = soup.find_all("h2")
h2

[<h2 id="mw-toc-heading">Contents</h2>,
 <h2><span class="mw-headline" id="Etymology">Etymology</span></h2>,
 <h2><span class="mw-headline" id="Art_and_entertainment">Art and entertainment</span></h2>,
 <h2><span class="mw-headline" id="Elements">Elements</span></h2>,
 <h2><span class="mw-headline" id="History">History</span></h2>,
 <h2><span class="mw-headline" id="Performance">Performance</span></h2>,
 <h2><span class="mw-headline" id="Philosophy_and_aesthetics">Philosophy and aesthetics</span></h2>,
 <h2><span class="mw-headline" id="Psychology">Psychology</span></h2>,
 <h2><span class="mw-headline" id="Sociological_aspects">Sociological aspects</span></h2>,
 <h2><span class="mw-headline" id="Media_and_technology">Media and technology</span></h2>,
 <h2><span class="mw-headline" id="Business">Business</span></h2>,
 <h2><span class="mw-headline" id="Education">Education</span></h2>,
 <h2><span class="mw-headline" id="Therapy">Therapy</span></h2>,
 <h2><span class="mw-headline" id="See

In [28]:
# Get the text
text =[]
for link in h2 :
    text.append(link.string)
text

['Contents',
 'Etymology',
 'Art and entertainment',
 'Elements',
 'History',
 'Performance',
 'Philosophy and aesthetics',
 'Psychology',
 'Sociological aspects',
 'Media and technology',
 'Business',
 'Education',
 'Therapy',
 'See also',
 'References',
 'Further reading',
 'External links',
 'Navigation menu']

### 3. Print the whole footer text.

In [43]:
# By inspection: we see that the footer is contained inside a ...
print(soup.find('div', id = 'footer').text)

AttributeError: 'NoneType' object has no attribute 'text'