**Import libraries**

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
from requests import get
from itertools import chain
from google_trans_new import google_translator 
from itertools import chain
from urllib.parse import urlparse, urljoin, unquote

**Get country URLs - Table**

In [2]:
def scrape_country_urls(url_dict):
    """
    Collect the links listed per country on the given overview pages.

    Every table row (``tr``) after the first one on each page is treated as one
    country: the first child of the row's ``strong`` element is translated to
    English and used as key, and the ``href`` of every link in that row is
    appended to the key's list. Rows without a ``strong`` element or without any
    usable link are skipped.

    - url_dict: Dictionary containing the urls to scrape as key and language codes as value (dict)

    Returns a dictionary with the translated country name as key and the list of
    URLs found for that country as value (dict)
    """

    # Empty dictionary to store country pages
    country_pages = {}

    # Translator to convert country name to English; one instance serves every page
    translator = google_translator()

    for page_url, language in url_dict.items():

        # Retrieve HTML from URL
        response = get(page_url)
        html_soup = BeautifulSoup(response.text, 'html.parser')

        # Search for table rows in HTML, skipping the first item with no country info
        for row in html_soup.find_all('tr')[1:]:

            # Links without an href would end up as None in the result and break
            # later string checks on the URLs, so they are dropped here
            hrefs = [link.get('href') for link in row.find_all('a')]
            hrefs = [href for href in hrefs if href]

            # Nothing to store for rows without links or without a country name
            name_tag = row.strong
            if not hrefs or name_tag is None or not name_tag.contents:
                continue

            # Translate once per row instead of once per link: the name is the
            # same for every link in the row
            country = translator.translate(name_tag.contents[0], lang_src=language, lang_tgt='en').rstrip()

            # Append the URLs per country name, add country key if not yet in dictionary
            country_pages.setdefault(country, []).extend(hrefs)

    return country_pages

In [3]:
# Overview pages to scrape, mapped to the language code used as translation source
language_overview_pages = {
    'https://www.nederlandwereldwijd.nl/help/in-welke-taal-communiceert-welk-land': 'nl',
    'https://www.netherlandsandyou.nl/help/in-welke-taal-communiceert-welk-land---which-language-is-used-for-which-country': 'en',
}

country_pages = scrape_country_urls(language_overview_pages)

In [4]:
# Spot-check: show the URLs collected under the 'Germany' key
country_pages['Germany']

['https://www.nederlandwereldwijd.nl/landen/duitsland',
 'https://www.niederlandeweltweit.nl/laender/deutschland',
 'https://www.sieunddieniederlande.nl/ihr-land-und-die-niederlande/deutschland']

In [5]:
# Generate usable calculated field for Tableau: one ELSEIF clause per (country, URL) pair.
# NOTE(review): every clause starts with ELSEIF, so the joined text presumably has to be
# pasted after an existing IF branch and closed with END by hand - confirm.
else_if_list = [
    f"ELSEIF CONTAINS([Pw Event Url], \"{url}\") THEN \"{country}\""
    for country, urls in country_pages.items()
    for url in urls
]
else_if_string = ' '.join(else_if_list)

**Get travel URLs - List**

In [6]:
def scrape_travel_urls(country_urls, excluded_fragments=None):

    """
    Find the travel URL of every distinct site linked from the country pages.

    For each URL that is not excluded, the homepage (scheme + host) of its site is
    fetched once, and the link inside the first ``li`` element with class
    ``navItem`` is taken as that site's travel URL.

    - country_urls: Dictionary containing the country as key and country pages as value (dict)
    - excluded_fragments: Optional iterable of substrings; a URL containing any of
      them is ignored. Defaults to the fragments that used to be hard-coded here.

    Returns the absolute, percent-decoded travel URLs in order of first appearance (list)
    """

    if excluded_fragments is None:
        excluded_fragments = (
            '/landen/', 'nederlandwereldwijd', 'netherlandsandyou',
            'your-country-and-the-netherlands', 'uw-land-en-nederland',
            'paysbasetvous', 'holandaevoce', 'paisesbajosytu',
            'sieunddieniederlande',
        )
    else:
        excluded_fragments = tuple(excluded_fragments)

    # Homepages already handled (set: constant-time lookup) and the resulting travel URLs
    seen_homepages = set()
    travel_urls = []

    # Flatten country URLs in dictionary
    for url in chain.from_iterable(country_urls.values()):

        # Exclude missing and irrelevant urls
        if not url or any(fragment in url for fragment in excluded_fragments):
            continue

        # Relative links have no scheme/host, so no homepage can be derived from them
        parsed_uri = urlparse(url)
        if not (parsed_uri.scheme and parsed_uri.netloc):
            continue

        homepage = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

        # Only scrape unique homepages; remember the homepage before fetching so a
        # site without a usable nav link is not requested again for its next URL
        if homepage in seen_homepages:
            continue
        seen_homepages.add(homepage)

        # Get homepage HTML
        response = get(homepage)
        html_soup = BeautifulSoup(response.text, 'html.parser')

        # Get travel url; skip the site instead of raising when the expected
        # navigation markup is missing
        first_nav_item = html_soup.find('li', {'class': 'navItem'})
        nav_link = first_nav_item.find('a') if first_nav_item is not None else None
        travel_path = nav_link.get('href') if nav_link is not None else None
        if not travel_path:
            continue

        travel_urls.append(unquote(urljoin(homepage, travel_path)))

    return travel_urls

In [7]:
# Derive the travel URLs from the country pages scraped above
travel_urls = scrape_travel_urls(country_pages)

In [8]:
# Display the scraped travel URLs for inspection
travel_urls

['https://www.netherlandsworldwide.nl/travelling-outside-the-netherlands',
 'https://www.paysbasmondial.nl/voyager-a-l’etranger',
 'https://www.holandanomundo.nl/viajar',
 'https://www.paisesbajosmundial.nl/viajar-fuera-de-los-paises-bajos',
 'https://www.niyuhelan.nl/旅游与居住',
 'https://www.niederlandeweltweit.nl/reisen',
 'https://www.holandawaanta.nl/السفر-والإقامة',
 'https://www.belandadananda.nl/perjalanan-menetap',
 'https://www.orandatowatashi.nl/ryokou-kyojuu',
 'https://www.niderlandy-i-vy.nl/жить-работать',
 'https://www.hollandavesen.nl/seyahat-ve-yasam']

In [9]:
# Generate usable regex: an alternation ('|') of all travel URLs.
# NOTE(review): the URLs are joined unescaped, so characters such as '.' keep their
# regex meaning (any character) - confirm the consumer tolerates this or escape them first.
'|'.join(travel_urls)

'https://www.netherlandsworldwide.nl/travelling-outside-the-netherlands|https://www.paysbasmondial.nl/voyager-a-l’etranger|https://www.holandanomundo.nl/viajar|https://www.paisesbajosmundial.nl/viajar-fuera-de-los-paises-bajos|https://www.niyuhelan.nl/旅游与居住|https://www.niederlandeweltweit.nl/reisen|https://www.holandawaanta.nl/السفر-والإقامة|https://www.belandadananda.nl/perjalanan-menetap|https://www.orandatowatashi.nl/ryokou-kyojuu|https://www.niderlandy-i-vy.nl/жить-работать|https://www.hollandavesen.nl/seyahat-ve-yasam'