# Webpage data extraction using Beautiful Soup 4

For a list of references see:

https://blog.hartleybrody.com/web-scraping-cheat-sheet/#using-beautifulsoup


In [10]:
# conda install beautifulsoup4
# pip install requests
import requests
from bs4 import BeautifulSoup

In [11]:
# Request the webpage
url = "https://www.spiegel.de/international"
req = requests.get(url)

In [12]:
# Inspect the structure of the article using Chrome / Devtools
req.text

'<!doctype html><html lang="de">\n<head>\n<title>International - DER SPIEGEL</title>\n<meta charset="utf-8">\n<meta name="viewport" content="width=device-width,initial-scale=1,user-scalable=no">\n<meta name="MSSmartTagsPreventParsing" content="true">\n<meta http-equiv="imagetoolbar" content="no">\n<meta http-equiv="x-ua-compatible" content="IE=Edge">\n<meta name="apple-itunes-app" content="app-id=424881832">\n<link rel="manifest" href="https://www.spiegel.de/public/spon/manifest/manifest.json">\n<meta name="theme-color" content="#E64415">\n<meta name="robots" content="index, follow, noarchive, noodp">\n<meta name="copyright" content="DER SPIEGEL, Hamburg, Germany">\n<meta name="email" content="spiegel_online@spiegel.de">\n<meta name="author" content="DER SPIEGEL, Hamburg, Germany">\n<meta name="description" content="Deutschlands führende Nachrichtenseite. Alles Wichtige aus Politik, Wirtschaft, Sport, Kultur, Wissenschaft, Technik und mehr.">\n<meta name="locale" content="de_DE">\n<met

In [13]:
# Save the website
websiteFileName="website.htm"
with open(websiteFileName, "wb") as file:
    file.write(req.text.encode())

In [14]:
# Create the BS4 Object
soup = BeautifulSoup(req.text, 'html')

In [16]:
# Use HTML Selector
events = soup.findAll('article')
events

[<article aria-label='"Saturday Is a Crucial Day"' class="lg:p-24 md:py-24 sm:py-16">
 <header class="lg:flex lg:justify-between md:flex md:justify-between md:mx-24 sm:mx-16">
 <h2 class="lg:flex-grow md:flex-grow">
 <a class="text-black block" href="https://www.spiegel.de/international/germany/saturday-is-a-crucial-day-interview-with-chancellor-merkel-s-chief-of-staff-a-983c1ede-076f-4fa5-9c75-db5854c43da3" title='"Saturday Is a Crucial Day"'>
 <span class="block lg:mb-24 md:mb-16 sm:mb-16">
 <span class="font-slabcdUI font-extrabold lg:text-5xl md:text-5xl sm:text-3xl leading-tight"><span class="align-middle hover:opacity-moderate focus:opacity-moderate">"Saturday Is a Crucial Day"</span>
 </span>
 </span>
 </a>
 </h2>
 </header>
 <figure class="lg:mb-16 relative">
 <a class="block" href="https://www.spiegel.de/international/germany/saturday-is-a-crucial-day-interview-with-chancellor-merkel-s-chief-of-staff-a-983c1ede-076f-4fa5-9c75-db5854c43da3" target="_self" title='"Saturday Is a 

In [17]:
# View the first article
event=events[1]
event

<article aria-label="Germany Weighing Strict Curfews If Rules Violated over Weekend" class="lg:py-24 md:py-24 sm:py-16 lg:px-24 md:px-24 sm:px-16">
<h2>
<a class="text-black block" href="https://www.spiegel.de/international/germany/germany-weighing-strict-curfews-if-rules-violated-over-weekend-a-b7a39586-2557-4e6a-b712-7e24e1dea800" target="_self" title="Germany Weighing Strict Curfews If Rules Violated over Weekend">
<span class="flex mb-4">
<span class="block text-primary-base focus:text-primary-darker hover:text-primary-dark font-sansUI font-bold text-base">
“Saturday Will Be Decisive”
</span>
</span>
<span class="block font-sansUI font-bold text-base"><span class="hover:opacity-moderate focus:opacity-moderate"><span class="align-middle mr-6">Germany Weighing Strict Curfews If Rules Violated over Weekend</span>
</span>
</span>
</a>
</h2>
</article>

In [27]:
# Get the headline
x=event.find('h2')
x

<h2>
<a class="text-black block" href="https://www.spiegel.de/international/germany/germany-weighing-strict-curfews-if-rules-violated-over-weekend-a-b7a39586-2557-4e6a-b712-7e24e1dea800" target="_self" title="Germany Weighing Strict Curfews If Rules Violated over Weekend">
<span class="flex mb-4">
<span class="block text-primary-base focus:text-primary-darker hover:text-primary-dark font-sansUI font-bold text-base">
“Saturday Will Be Decisive”
</span>
</span>
<span class="block font-sansUI font-bold text-base"><span class="hover:opacity-moderate focus:opacity-moderate"><span class="align-middle mr-6">Germany Weighing Strict Curfews If Rules Violated over Weekend</span>
</span>
</span>
</a>
</h2>

In [28]:
# Get the title from the Anchor tag within the headline
title=x.find("a")["title"]
title

'Germany Weighing Strict Curfews If Rules Violated over Weekend'

In [29]:
# Get the article link from the Anchor tag within the headline
href=x.find("a")["href"]
href

'https://www.spiegel.de/international/germany/germany-weighing-strict-curfews-if-rules-violated-over-weekend-a-b7a39586-2557-4e6a-b712-7e24e1dea800'

In [30]:
# Get the figure HTML element
x=event.find('figure')

In [32]:
# Get the image srce using HTML and CSS selection
image= x.find("img",{'data-image-el':'img'})["src"]
image

AttributeError: 'NoneType' object has no attribute 'find'

In [23]:
event=events[2]
x=event.find('span',{'class':'font-sansUI font-normal text-s text-shade-dark'})

In [24]:
if (x!=None):
    print(x.text)

A DER SPIEGEL Editorial by Lothar Gorris


In [1]:
my_events = []
for event in events:
    event_details = dict()
    x=event.find('h2')
    if (x!=None):
        event_details['title'] = x.find("a")["title"]
    x=event.find('h2')
    if (x!=None):
        event_details['href'] = x.find("a")["href"]
    x=event.find('figure')
    if (x!=None):
        event_details['image'] = x.find("img",{'data-image-el':'img'})["src"]
    x=event.find('span',{'class':'font-sansUI font-normal text-s text-shade-dark'})
    if (x!=None):
        event_details['author']=x.text
    my_events.append(event_details) 


NameError: name 'events' is not defined

In [26]:
print(my_events)

[{'title': "Inside Germany's Piecemeal Response to Corona", 'href': 'https://www.spiegel.de/international/germany/inside-germany-s-piecemeal-response-to-corona-a-f376b3f9-625f-4a6a-8e7d-04bd48be20b2', 'image': 'https://cdn.prod.www.spiegel.de/images/23d7cbd2-0e20-4815-b3fd-40818accb64b_w948_r2.11_fpx32.98_fpy44.99.jpg'}, {'title': 'The Urgent Search for a Cure for COVID-19', 'href': 'https://www.spiegel.de/international/world/coronavirus-the-urgent-search-for-a-cure-for-covid-19-a-fd4c9a3a-ab4e-4590-b95b-a1c01d8b9d61', 'image': 'https://cdn.prod.www.spiegel.de/images/309d25a8-da78-4936-9e78-9a400e4d34ce_w948_r2.11_fpx46.33_fpy52.98.jpg'}, {'title': 'The Brutal Logic of Coronavirus', 'href': 'https://www.spiegel.de/international/world/the-brutal-logic-of-coronavirus-a-e7be4add-4695-4d48-b1d6-0c593a668e92', 'image': 'https://cdn.prod.www.spiegel.de/images/a4948b2d-fe99-49d7-92fd-6ef0da840169_w872_r1.77_fpx36.22_fpy50.jpg', 'author': 'A DER SPIEGEL Editorial by Lothar Gorris'}, {'title': 

In [27]:
# Get one link
details_url=my_events[1]["href"]
details_url

'https://www.spiegel.de/international/world/coronavirus-the-urgent-search-for-a-cure-for-covid-19-a-fd4c9a3a-ab4e-4590-b95b-a1c01d8b9d61'

Check the website manually in Chrome with Xpath

//div[contains(@class,"RichText")]/p/text()

In [28]:
# Request the website
details_req = requests.get(details_url)

In [29]:
# Create a Soup Object
details_soup = BeautifulSoup(details_req.text, 'html')

In [30]:
import re
# AND expression with look aheads
regex = re.compile('(?=.*RichText.*)(?=.*word-wrap.*)')
# OR expression with look aheads
# regex = re.compile('(?=.*RichText.*|.*word-wrap.*)')

In [31]:
# Use HTML Selector
details_events = details_soup.findAll('div',{'class':regex})
details_events

[<div class="RichText RichText--iconLinks lg:w-8/12 md:w-10/12 lg:mx-auto md:mx-auto lg:px-24 md:px-24 sm:px-16 clearfix break-words word-wrap">
 <p>Clemens Wendtner treated some of Germany’s very first COVID-19 cases, back before the disease even carried that name. The head physician in the Department of Infectiology and Tropical Medicine at the Munich-Schwabing Clinic knows what is coming his way if the number of infected people in Germany rises as steeply as it did in Italy. "I think, in that case, we could really use an effective medication,” he says.</p>
 </div>,
 <div class="RichText RichText--iconLinks lg:w-8/12 md:w-10/12 lg:mx-auto md:mx-auto lg:px-24 md:px-24 sm:px-16 clearfix break-words word-wrap">
 <p>Because of his experience with the disease, his clinic will be the first in Germany to test remdesivir, a drug initially developed for the treatment of Ebola, on coronavirus patients. The university hospitals in Hamburg and Düsseldorf are also eager to participate in the clin

In [32]:
# Iterate over all p tags
for devents in details_events:
    x=devents.findAll("p")
    for p in x:
        print(p.text)

Clemens Wendtner treated some of Germany’s very first COVID-19 cases, back before the disease even carried that name. The head physician in the Department of Infectiology and Tropical Medicine at the Munich-Schwabing Clinic knows what is coming his way if the number of infected people in Germany rises as steeply as it did in Italy. "I think, in that case, we could really use an effective medication,” he says.
Because of his experience with the disease, his clinic will be the first in Germany to test remdesivir, a drug initially developed for the treatment of Ebola, on coronavirus patients. The university hospitals in Hamburg and Düsseldorf are also eager to participate in the clinical study by the American pharmaceutical company Gilead Sciences. Remdesivir inhibits the replication of the genetic material of so-called RNA viruses, which include the pathogen behind Ebola and the new coronavirus.
Ever since an American coronavirus patient recovered overnight after being given remdesivir, 

In [33]:
# Create a function for the code above
# Define a function to automatically extract the text
def downloadText(url):
    details_req = requests.get(url)
    details_soup = BeautifulSoup(details_req.text, 'html')
    regex = re.compile('.*RichText.*')
    details_events = details_soup.findAll('div',{'class':regex})
    text="";
    for devents in details_events:
        paragraphs=devents.findAll("p")
        for paragraph in paragraphs:
            text = text + " " + paragraph.text
    return text

In [34]:
# Test the function
print(downloadText(details_url))

 Clemens Wendtner treated some of Germany’s very first COVID-19 cases, back before the disease even carried that name. The head physician in the Department of Infectiology and Tropical Medicine at the Munich-Schwabing Clinic knows what is coming his way if the number of infected people in Germany rises as steeply as it did in Italy. "I think, in that case, we could really use an effective medication,” he says. Because of his experience with the disease, his clinic will be the first in Germany to test remdesivir, a drug initially developed for the treatment of Ebola, on coronavirus patients. The university hospitals in Hamburg and Düsseldorf are also eager to participate in the clinical study by the American pharmaceutical company Gilead Sciences. Remdesivir inhibits the replication of the genetic material of so-called RNA viruses, which include the pathogen behind Ebola and the new coronavirus. Ever since an American coronavirus patient recovered overnight after being given remdesivir,

In [35]:
# Download all images - Helper method
def downloadImage(url, file_name):
    # open in binary mode
    with open(file_name, "wb") as file:
        # get request
        response = requests.get(url)
        # write to file
        file.write(response.content)

In [36]:
# Download all images
i=0;
for e in my_events:
    if (e.get("image")!=None):
        downloadImage(e["image"],str(i)+".jpg")
        i = i + 1