# Short notebook to scrape data from the German Yellow Pages (gelbeseiten.de)

In [1]:
#standard library
import json
#random will be used to simulate more 'human' behavior when interacting with the page
import random
import re
#time will be used to stop the script for short times
import time

#import pandas for data wrangling
import pandas as pd

#import requests and beautiful soup for fetching and scraping the html content
import requests
from bs4 import BeautifulSoup

#import webdriver to open interactive browser with python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [2]:
#path to your chrome driver for selenium --> here you must insert the path from your individual device!
#the path is a raw string (r"...") so that the backslashes of a Windows path are never read as escape sequences
chromedriver_path = r"E:\python_projects\chromedriver.exe"

#set the industry that you want to scrape the results for
searchterm = "Nagelstudio"

#set the city or zip code that you want to search in
region = "Hamburg"

#OPTIONAL: set how far from your region you want to search in kilometers (leave the empty string to search without a radius)
search_distance = ""

## Start pulling html data from the page

In [3]:
#build the search url once; the radius is only appended when one was configured
search_url = f"https://www.gelbeseiten.de/Suche/{searchterm}/{region}"
if search_distance != "":
    #search_distance is configured in kilometers and may be a number or a numeric string;
    #convert it to a number before scaling, otherwise "5"*1000 would repeat the text 1000 times
    search_url += f"?umkreis={int(float(search_distance) * 1000)}"

#the timeout keeps the cell from hanging forever when the server does not answer
page = requests.get(search_url, timeout=30)
if page.status_code != 200:
    #stop here with a clear message instead of continuing with an undefined (or stale) `content`
    raise ConnectionError(f"Connection error (HTTP status {page.status_code}). Please check the url or try again later")
content = page.content

#store the raw html and take a look at it --> we are especially interested in the <article> tags
DOMdocument = BeautifulSoup(content, 'html.parser')
DOMdocument.find('article')

<!DOCTYPE html>

<html class="no-js gs_anwendung gs_desktop" lang="de" prefix="og: http://ogp.me/ns#" xmlns="http://www.w3.org/1999/xhtml">
<head>
<base href="/"/>
<link as="style" href="/web/css/global_above.css?1631547009193" rel="preload"/>
<link href="/web/css/global_above.css?1631547009193" rel="stylesheet"/>
<link href="https://wwa.wipe.de/wwa.js" rel="preconnect"/>
<script src="https://wwa.wipe.de/wwa.js"></script>
<link href="https://consentmanager.mgr.consensu.org" rel="preconnect"/>
<link as="style" href="/web/css/trefferliste_above.css?1631547009193" rel="preload"/>
<link href="/web/css/trefferliste_above.css?1631547009193" rel="stylesheet"/>
<title>ᐅ Top 10 Friseur  Hamburg | ✉ Adresse | ☎ Telefonnummer | 📝 Kontakt | ✅ Bewertungen ➤ Jetzt auf GelbeSeiten.de ansehen.</title>
<meta content="ᐅ Top 10 Friseur  Hamburg | ✉ Adresse | ☎ Telefonnummer | 📝 Kontakt | ✅ Bewertungen ➤ Jetzt auf GelbeSeiten.de ansehen." property="og:title"/>
<meta content="Friseur Hamburg ✉ Adresse ☎ Te

In [4]:
# the page only shows us few results
initial_article_total = len(DOMdocument.find_all('article'))
print(f"{initial_article_total} entries are in current html body")

# find out, how many search results are totally available and how many are displayed
displayed_counter = DOMdocument.find(id='loadMoreGezeigteAnzahl')
available_counter = DOMdocument.find(id='loadMoreGesamtzahl')

if displayed_counter is not None and available_counter is not None:
    displayed_article_count = int(displayed_counter.text)
    available_article_count = int(available_counter.text)
else:
    # find() returns None when a counter element is not in the html; fall back to the entries
    # that are already present instead of failing with an AttributeError on `.text`.
    # NOTE(review): assumes the counters are only missing when nothing more can be loaded - confirm against the live page
    print("No 'load more' counters found in the html - using the entries that are already present")
    displayed_article_count = available_article_count = initial_article_total

print (f'{displayed_article_count}/{available_article_count} entries are initially shown')

50 entries are in current html body
50/908 entries are initially shown


In [5]:
#functions to scroll in the browser using selenium -> some content is only loaded into the html when you scroll past it
def fast_scroll_down(browser):
    """Jump to the bottom of the page in a single step.

    browser: object with an `execute_script` method (the selenium webdriver in this notebook).
    """
    jump_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    browser.execute_script(jump_to_bottom)

def slow_scroll_down(browser, step=6):
    """Scroll from the top of the page towards the bottom in small increments.

    browser: object with an `execute_script` method (the selenium webdriver in this notebook).
    step: distance between two scroll positions; the default of 6 keeps the previous behavior.

    The page height is read once before scrolling starts, so content that makes the page
    longer while scrolling is not taken into account.
    """
    total_height = int(browser.execute_script("return document.body.scrollHeight"))
    for offset in range(1, total_height, step):
        browser.execute_script(f"window.scrollTo(0, {offset});")

def fast_scroll_up(browser):
    """Jump back to the top of the page by sending CTRL+HOME to the <body> element.

    browser: the selenium webdriver that shows the page.

    Uses `find_element(By.TAG_NAME, ...)` because the older `find_element_by_tag_name`
    helper is no longer available in current Selenium releases.
    """
    browser.find_element(By.TAG_NAME, 'body').send_keys(Keys.CONTROL + Keys.HOME)

Use Selenium to click the "load more" button until the html contains all available entries

In [6]:
# start the browser
# NOTE(review): the driver path is passed positionally, as in the original notebook; newer
# Selenium 4 releases expect a Service object instead - confirm against the installed version
browser = webdriver.Chrome(chromedriver_path)

#open the page - use the same search (including the optional radius) that the entry counts were read from
browser_url = f"https://www.gelbeseiten.de/Suche/{searchterm}/{region}"
if search_distance != "":
    #search_distance is given in kilometers (number or numeric string); the url takes the value multiplied by 1000
    browser_url += f"?umkreis={int(float(search_distance) * 1000)}"
browser.get(browser_url)

#give page time to load
time.sleep(5)

#accept cookie policy - only when the consent button is actually on the page
consent_buttons = browser.find_elements(By.ID, "cmpbntyestxt")
if consent_buttons:
    consent_buttons[0].click()

In [7]:
#keep loading new entries until all possible entries are loaded into the html
MAX_STALLED_ROUNDS = 10  #stop when this many rounds in a row did not load any new entry
stalled_rounds = 0

while displayed_article_count < available_article_count:
    try:
        browser.find_element(By.ID, "mod-LoadMore--button").click()
    except Exception:
        #the button can be missing or not clickable while entries are still loading -> retry in the next round
        #(`Exception` instead of a bare except, so that interrupting the kernel still stops the loop)
        pass

    previous_count = displayed_article_count
    displayed_article_count = int(browser.find_element(By.ID, 'loadMoreGezeigteAnzahl').text) # reset count of displayed entries
    print (f'{displayed_article_count}/{available_article_count} entries are loaded')

    #guard against an endless loop when the page stops delivering new entries
    if displayed_article_count > previous_count:
        stalled_rounds = 0
    else:
        stalled_rounds += 1
    if stalled_rounds >= MAX_STALLED_ROUNDS:
        print(f"No new entries after {MAX_STALLED_ROUNDS} attempts - continuing with {displayed_article_count}/{available_article_count} entries")
        break

    time.sleep(round(random.randint(5,17)**.5,1)) #wait between 2-4 seconds for page to load to simulate human click behavior

# overwrite old DOMdocument variable with new html that contains all loaded article entries (parsed only once)
DOMdocument = BeautifulSoup(browser.page_source, 'html.parser')

# the page now contains all results
print(f"{len(DOMdocument.find_all('article'))} entries are in current html body")

#quit the browser: unlike close(), quit() also ends the driver session instead of only closing the window
browser.quit()

50/908 entries are loaded
50/908 entries are loaded
50/908 entries are loaded
50/908 entries are loaded
60/908 entries are loaded
60/908 entries are loaded
70/908 entries are loaded
80/908 entries are loaded
90/908 entries are loaded
90/908 entries are loaded
110/908 entries are loaded
120/908 entries are loaded
120/908 entries are loaded
140/908 entries are loaded
150/908 entries are loaded
160/908 entries are loaded
170/908 entries are loaded
170/908 entries are loaded
170/908 entries are loaded
190/908 entries are loaded
200/908 entries are loaded
210/908 entries are loaded
220/908 entries are loaded
220/908 entries are loaded
220/908 entries are loaded
240/908 entries are loaded
250/908 entries are loaded
260/908 entries are loaded
270/908 entries are loaded
280/908 entries are loaded
290/908 entries are loaded
300/908 entries are loaded
310/908 entries are loaded
320/908 entries are loaded
330/908 entries are loaded
340/908 entries are loaded
350/908 entries are loaded
360/908 ent

Unfortunately, not every entry in the loaded overview contains all the information about a given business. We therefore have to open the detail page of each business to get its full contact details and any additional information.

In [351]:
#seconds to wait for a single detail page before that business is skipped
DETAIL_REQUEST_TIMEOUT = 30

#a street line ends with the house number, which may carry a letter or a range ("12", "12a", "12-14")
_HOUSE_NUMBER_RE = re.compile(
    r'^(?P<street>.*?)[\s,]*(?P<number>\d+\s*[A-Za-z]?(?:\s*[-/+]\s*\d+\s*[A-Za-z]?)*)$'
)
#German zip codes consist of exactly five digits
_ZIP_CODE_RE = re.compile(r'\b\d{5}\b')


def _strip_layout(text):
    """Remove the line breaks and tabs of the html layout and trim the result."""
    return text.replace('\n','').replace('\t','').strip()


def _split_street(full_street):
    """Split a street line into (street_name, street_no).

    Only the trailing house number is split off, so digits inside the street name
    ("Straße des 17. Juni 135") and suffixes ("12a", "12-14") stay intact.
    street_no is '' when the line does not end with a house number.
    """
    text = ' '.join(full_street.split()).strip(' ,')
    match = _HOUSE_NUMBER_RE.match(text)
    if match is None:
        return text, ''
    return match.group('street').strip(' ,'), match.group('number')


def _split_zip_city(full_zip):
    """Split a "zip city" line into (zip_code, city); zip_code is '' when no five digit code is found."""
    text = ' '.join(full_zip.split())
    match = _ZIP_CODE_RE.search(text)
    if match is None:
        return '', text.strip(' ,')
    city = text[:match.start()] + text[match.end():]
    return match.group(), ' '.join(city.split()).strip(' ,')


def _contact_link_text(container, css_class):
    """Return the cleaned link text of the contact list item with the given class, or '' when it is missing."""
    item = container.find('li', class_=css_class)
    link = item.find('a') if item is not None else None
    return _strip_layout(link.text) if link is not None else ''


articles = [] #placeholder list for the articles
skipped_count = 0 #entries without a usable detail page

#iterate through all articles and pull the available data - not every business has every data type associated!
for article in DOMdocument.find_all('article'):
    #get the link to the company detail page
    link_tag = article.find('a')
    if link_tag is None or not link_tag.get('href'):
        skipped_count += 1
        continue
    detail_link = link_tag['href']

    try:
        detail_page = requests.get(detail_link, timeout=DETAIL_REQUEST_TIMEOUT)
    except requests.RequestException:
        #unreachable or malformed link -> skip this business, keep scraping the others
        skipped_count += 1
        continue

    soup = BeautifulSoup(detail_page.content,'html.parser')
    container = soup.find('div', class_='mod-Kontaktdaten__container')
    heading = container.find('h3') if container is not None else None
    if heading is None:
        #without the contact container and its name heading there is nothing useful to store
        skipped_count += 1
        continue
    name = heading.text

    #the first paragraph holds the street line, the second one the zip code and city
    street_name = street_no = zip_code = city = ''
    paragraphs = container.find_all('p')
    if len(paragraphs) > 0:
        street_name, street_no = _split_street(paragraphs[0].text)
    if len(paragraphs) > 1:
        zip_code, city = _split_zip_city(paragraphs[1].text)

    phone = _contact_link_text(container, 'contains-icon-telefon')
    email = _contact_link_text(container, 'contains-icon-email')
    homepage = _contact_link_text(container, 'contains-icon-homepage')

    description = ''
    description_wrapper = soup.find('div',class_='mod-Beschreibung__wrapper')
    description_box = description_wrapper.find('div') if description_wrapper is not None else None
    if description_box is not None:
        description = description_box.text

    tags = ''
    try:
        #the keywords are read from the third element that follows the "Stichworte" heading
        tags_heading = [i for i in soup.find_all('h2',class_='gc-text--h1') if i.text =="Stichworte"][0]
        tags = _strip_layout(tags_heading.next_element.next_element.next_element.text)
    except (AttributeError, IndexError):
        #no "Stichworte" section on this detail page
        pass

    articles.append([name,street_name,street_no,zip_code,city,homepage,phone,email,description,tags,detail_link]) #save the data to the list of articles

print(f"{len(articles)} businesses scraped, {skipped_count} entries skipped")

In [352]:
#collect the scraped rows in a dataframe; the column order matches the order of the values appended for every business
column_names = ['name','street_name','street_no','zip_code','city','homepage','phone','email','description','tags','detail_link']
df = pd.DataFrame(articles, columns=column_names)

#build the file name from the search settings; the radius only becomes part of it when one was set
distance_suffix = f"_{search_distance}km" if search_distance != "" else ""
filename = f"{searchterm}_in_{region}" + distance_suffix

#write the dataframe to an excel file (without the index column)
excel_path = filename + '.xlsx'
df.to_excel(excel_path, index=False)

print(f"Data saved to '{filename}.xlsx'")

Data saved to 'Friseur_in_Hamburg.xlsx'
