# Short notebook to scrape business data from the German Yellow Pages (gelbeseiten.de)

In [1]:
#import beautiful soup for scraping the html content
from bs4 import BeautifulSoup
import requests
import json

#import webdriver to open interactive browser with python
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

#time will be used to stop the script for short times
import time

#random will be used to simulate more 'human' behavior when interacting with the page
import random

#import pandas for data wrangling
import pandas as pd

In [2]:
# Path to the ChromeDriver executable used by Selenium --> insert the path from your own machine here!
# Written as a raw string so the Windows backslashes stay literal ("\p" and "\c" are not valid escape sequences).
chromedriver_path = r"E:\python_projects\chromedriver.exe"

# Industry / search term that the results should be scraped for
searchterm = "Kosmetikstudio"

# City or zip code to search in
region = "Hamburg"

# OPTIONAL: how far around the region to search, in kilometers (keep "" to search without a radius)
search_distance = ""

## Start pulling html data from the page

In [3]:
# Build the search URL from the configured search term and region.
base_url = f"https://www.gelbeseiten.de/Suche/{searchterm}/{region}"

if search_distance is None or str(search_distance).strip() == "":
    # No radius configured -> plain search for the region
    url = base_url
else:
    # The radius is sent as "umkreis" in meters (kilometers * 1000).
    # Convert to a number first: multiplying a string such as "5" by 1000 would
    # repeat the text 1000 times instead of scaling the value.
    # A decimal comma ("2,5") is accepted as well.
    radius_in_km = float(str(search_distance).strip().replace(",", "."))
    radius_in_meters = int(round(radius_in_km * 1000))
    url = f"{base_url}?umkreis={radius_in_meters}"

In [4]:
# Selenium scroll helpers -> some content is only added to the html once it has been scrolled past
def fast_scroll_down(browser):
    """Jump to the bottom of the page in a single step.

    browser: object exposing ``execute_script`` (e.g. a Selenium WebDriver).
    """
    jump_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    browser.execute_script(jump_to_bottom)

def slow_scroll_down(browser):
    """Scroll from the top of the page towards the bottom in 6-pixel steps.

    The page height is read once up front; the scroll offsets then run
    1, 7, 13, ... for as long as they stay below that height.

    browser: object exposing ``execute_script`` (e.g. a Selenium WebDriver).
    """
    page_height = int(browser.execute_script("return document.body.scrollHeight"))
    offset = 1
    while offset < page_height:
        browser.execute_script(f"window.scrollTo(0, {offset});")
        offset += 6

def fast_scroll_up(browser):
    """Jump back to the top of the page by sending CTRL+HOME to the <body> element.

    browser: Selenium WebDriver whose current page should be scrolled.
    """
    # find_element(by, value) is used instead of find_element_by_tag_name:
    # the find_element_by_* helpers were removed in Selenium 4.3, while the
    # generic form works on old and new releases ("tag name" == By.TAG_NAME).
    page_body = browser.find_element("tag name", "body")
    page_body.send_keys(Keys.CONTROL + Keys.HOME)

Unfortunately, the page does not show all matching businesses at once. It initially lists only 50 businesses; a button must then be pressed to load 10 more entries at a time into the list of visible entries.

Thus, we will use Selenium to click the "load more" button until the html contains all available entries.

In [5]:
# Start the browser (opens a Chrome window that is controlled by Selenium)
# NOTE(review): passing the driver path positionally matches older Selenium releases;
# newer ones appear to expect a Service object instead - confirm against the installed version.
browser = webdriver.Chrome(chromedriver_path)

# Open the search result page
browser.get(url)

# Give the page time to load
time.sleep(5)

# Accept the cookie policy, but only if the consent button is actually present.
# find_elements returns an empty list instead of raising when nothing matches,
# so the cell no longer crashes when no banner is shown. The generic
# find_elements(by, value) form is used because the find_element_by_* helpers
# were removed in Selenium 4.3 ("id" == By.ID).
cookie_buttons = browser.find_elements("id", "cmpbntyestxt")
if cookie_buttons:
    cookie_buttons[0].click()

In [6]:
# Give up after this many consecutive rounds in which no new entries appeared,
# so the loop cannot spin forever if the "load more" button stops working.
MAX_STALLED_ROUNDS = 15

try:
    # Counters shown on the page: entries currently displayed / entries available in total.
    # find_element(by, value) replaces find_element_by_id, which was removed in Selenium 4.3.
    displayed_article_count = int(browser.find_element("id", 'loadMoreGezeigteAnzahl').text)
    available_article_count = int(browser.find_element("id", 'loadMoreGesamtzahl').text)

    stalled_rounds = 0

    # Keep loading new entries until all available entries are loaded into the html
    while displayed_article_count < available_article_count:
        count_before_click = displayed_article_count

        try:
            browser.find_element("id", "mod-LoadMore--button").click()
        except Exception:
            # Best effort: the click can fail while the page is still updating; retry next round.
            # `Exception` (not a bare except) keeps the cell interruptible from the keyboard/kernel.
            pass

        try:
            # Refresh the count of displayed entries
            displayed_article_count = int(browser.find_element("id", 'loadMoreGezeigteAnzahl').text)
            print(f'{displayed_article_count}/{available_article_count} entries are loaded')
        except Exception:
            # Counter temporarily unreadable; keep the previous value and retry next round
            pass

        if displayed_article_count > count_before_click:
            stalled_rounds = 0
        else:
            stalled_rounds += 1
            if stalled_rounds >= MAX_STALLED_ROUNDS:
                print(f'No new entries after {MAX_STALLED_ROUNDS} attempts - continuing with '
                      f'{displayed_article_count}/{available_article_count} entries')
                break

        # Wait roughly 2.2-4.1 seconds (square root of a random integer in 5..17, rounded to
        # one decimal) so the page can load and the clicks look more like human behavior
        time.sleep(round(random.randint(5,17)**.5,1))

    # Set DOMdocument variable with the html that contains all loaded article entries
    DOMdocument = BeautifulSoup(browser.page_source, 'html.parser')
finally:
    # Always end the browser session, even if something above failed.
    # quit() also shuts down the driver, whereas close() only closes the current window.
    browser.quit()

50/551 entries are loaded
50/551 entries are loaded
50/551 entries are loaded
60/551 entries are loaded
60/551 entries are loaded
80/551 entries are loaded
90/551 entries are loaded
100/551 entries are loaded
110/551 entries are loaded
110/551 entries are loaded
120/551 entries are loaded
130/551 entries are loaded
140/551 entries are loaded
150/551 entries are loaded
160/551 entries are loaded
170/551 entries are loaded
180/551 entries are loaded
190/551 entries are loaded
200/551 entries are loaded
210/551 entries are loaded
220/551 entries are loaded
230/551 entries are loaded
240/551 entries are loaded
250/551 entries are loaded
260/551 entries are loaded
270/551 entries are loaded
280/551 entries are loaded
290/551 entries are loaded
300/551 entries are loaded
310/551 entries are loaded
320/551 entries are loaded
330/551 entries are loaded
340/551 entries are loaded
350/551 entries are loaded
360/551 entries are loaded
370/551 entries are loaded
380/551 entries are loaded
390/551 

Unfortunately, not every entry in the loaded overview contains all information of a given business. Thus, we will have to open the respective detail page of each business to see all contact and additional information.

In [7]:
# Maximum time in seconds to wait for a single detail page, so one hanging
# request cannot block the whole run.
REQUEST_TIMEOUT_SECONDS = 15


def _strip_layout_whitespace(text):
    """Remove newlines and tabs from `text` and trim surrounding whitespace."""
    return text.replace('\n','').replace('\t','').strip()


def _only_digits(text):
    """Return the characters 0-9 of `text`, concatenated in their original order."""
    return ''.join(char for char in text if char in '0123456789')


def _contact_entry(container, icon_class):
    """Return the cleaned link text of the <li class=icon_class> inside `container`, or '' if absent."""
    try:
        return _strip_layout_whitespace(container.find('li',class_=icon_class).find('a').text)
    except Exception:
        return ''


articles = [] #placeholder list for the articles

# Iterate through all articles and pull the available data - not every business has every data type associated!
# Failures are caught with `except Exception` rather than a bare `except`, so a keyboard/kernel
# interrupt still stops this long-running loop instead of being swallowed.
for article in DOMdocument.find_all('article'):
    street_name,street_no,zip_code,city,description,tags = '','','','','',''

    # The detail page and the business name are mandatory: skip the entry if either is unavailable
    try:
        detail_link = article.find('a')['href'] #get the link to the company detail page
        response = requests.get(detail_link, timeout=REQUEST_TIMEOUT_SECONDS)
        soup = BeautifulSoup(response.content,'html.parser')
        container = soup.find('div', class_='mod-Kontaktdaten__container')
        name = container.find('h3').text
    except Exception:
        continue

    # First paragraph of the contact container: street and house number.
    # NOTE(review): only the digits are taken as house number, so letter suffixes stay in
    # street_name, and digits spread over several places are joined - confirm this is acceptable.
    try:
        full_street = container.find_all('p')[0].text
        street_no = _only_digits(full_street)
        street_name = full_street.replace(street_no,'').strip()
    except Exception:
        pass

    # Second paragraph of the contact container: zip code and city
    try:
        full_zip = container.find_all('p')[1].text
        zip_code = _only_digits(full_zip)
        city = full_zip.replace(zip_code,'').strip()
    except Exception:
        pass

    # Optional contact entries; each falls back to '' when the business does not list it
    phone = _contact_entry(container, 'contains-icon-telefon')
    email = _contact_entry(container, 'contains-icon-email')
    homepage = _contact_entry(container, 'contains-icon-homepage')

    try:
        description = soup.find('div',class_='mod-Beschreibung__wrapper').find('div').text
    except Exception:
        pass

    # Keywords: the text found three elements after the "Stichworte" heading in parse order
    try:
        keyword_heading = [i for i in soup.find_all('h2',class_='gc-text--h1') if i.text =="Stichworte"][0]
        tags = _strip_layout_whitespace(keyword_heading.next_element.next_element.next_element.text)
    except Exception:
        pass

    articles.append([name,street_name,street_no,zip_code,city,homepage,phone,email,description,tags,detail_link]) #save the data to the list of articles

In [8]:
# Collect the scraped rows in a dataframe (column order matches the order used when the rows were appended)
column_names = ['name','street_name','street_no','zip_code','city','homepage','phone','email','description','tags','detail_link']
df = pd.DataFrame(articles, columns=column_names)

# Derive the file name from the search settings; the radius is only mentioned when one was set
radius_suffix = f"_{search_distance}km" if search_distance != "" else ""
filename = f"{searchterm}_in_{region}" + radius_suffix

# Write the dataframe to an Excel sheet without the index column
excel_file = filename + '.xlsx'
df.to_excel(excel_file, index=False)

print("Data saved to '" + excel_file + "'")

Data saved to 'Kosmetikstudio_in_Hamburg.xlsx'
