## Scrape weblog links and dates by CSS class

In [2]:
from bs4 import BeautifulSoup
import pandas as pd
from requests import get
from pathlib import Path

In [3]:
# Overview pages to scrape: page 1 has no query string, page 2 adds `?pagina=2`
url_list = [
    'https://www.rijksoverheid.nl/ministeries/ministerie-van-buitenlandse-zaken/het-werk-van-bz-in-de-praktijk/weblogs' + page_query
    for page_query in ('', '?pagina=2')
]

In [4]:
def get_information(url_list, start_date='2020-10-01', end_date='2020-11-30', timeout=30):
    """Scrape weblog links and their dates from one or more overview pages.

    For every page in ``url_list`` this collects the ``href`` of each
    ``<a class="weblog">`` element and the ``datetime`` attribute of each
    ``<time>`` element nested in a ``<p class="meta">`` element. Links and
    dates are paired purely by their order of appearance, so every page must
    yield the same number of both.

    Parameters
    ----------
    url_list : iterable of str, or str
        URLs of the overview pages to download. A single URL string is
        treated as a one-element list.
    start_date, end_date : str or datetime-like, optional
        Inclusive bounds compared against the parsed ``date`` column. The
        defaults keep the original window (1 October - 30 November 2020).
        NOTE: a date-only ``end_date`` is compared as midnight of that day.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns ``url`` (the href exactly as it appears in the page) and
        ``date`` (parsed with ``pd.to_datetime``), limited to rows inside the
        date window. The positional index of the unfiltered frame is kept.

    Raises
    ------
    requests.HTTPError
        If a page answers with an HTTP error status.
    ValueError
        If a page yields a different number of links and dates, because
        positional pairing would then attach dates to the wrong links.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(url_list, str):
        url_list = [url_list]

    weblog_urls = []
    dates = []

    for url in url_list:

        # Bound the wait and fail loudly instead of parsing an error page
        response = get(url, timeout=timeout)
        response.raise_for_status()

        # Capture HTML from URL
        html_soup = BeautifulSoup(response.text, 'html.parser')

        # Capture all href's with weblog class
        page_urls = [
            a['href']
            for a in html_soup.find_all('a', {'class': 'weblog'}, href=True)
        ]

        # Capture the datetime values per weblog
        page_dates = [
            time_tag['datetime']
            for meta in html_soup.find_all('p', {'class': 'meta'})
            for time_tag in meta.find_all('time')
            if time_tag.has_attr('datetime')
        ]

        # Links and dates are matched by position only; a count mismatch means
        # the pairing (on this page and every later one) cannot be trusted.
        if len(page_urls) != len(page_dates):
            raise ValueError(
                'Found {} weblog links but {} dates on {}; cannot pair them by position'
                .format(len(page_urls), len(page_dates), url)
            )

        weblog_urls.extend(page_urls)
        dates.extend(page_dates)

    # Create dataframe
    weblog_df = pd.DataFrame({'url': weblog_urls,
                              'date': pd.to_datetime(dates)
                             }
                            )

    # Return blogs inside the requested window (october and november 2020 by default)
    in_window = (weblog_df['date'] >= start_date) & (weblog_df['date'] <= end_date)
    return weblog_df[in_window]

In [5]:
# Scrape every overview page in url_list and keep only the weblogs inside
# get_information's date window
weblogs_df = get_information(url_list)

In [6]:
# Show the filtered weblogs (relative url and parsed date per row)
weblogs_df

Unnamed: 0,url,date
2,/ministeries/ministerie-van-buitenlandse-zaken...,2020-11-25
3,/ministeries/ministerie-van-buitenlandse-zaken...,2020-11-23
4,/ministeries/ministerie-van-buitenlandse-zaken...,2020-11-22
5,/ministeries/ministerie-van-buitenlandse-zaken...,2020-11-17
6,/ministeries/ministerie-van-buitenlandse-zaken...,2020-11-12
7,/ministeries/ministerie-van-buitenlandse-zaken...,2020-11-05
8,/ministeries/ministerie-van-buitenlandse-zaken...,2020-11-03
9,/ministeries/ministerie-van-buitenlandse-zaken...,2020-10-27
10,/ministeries/ministerie-van-buitenlandse-zaken...,2020-10-21
11,/ministeries/ministerie-van-buitenlandse-zaken...,2020-10-19


In [7]:
# Join the filtered weblog paths into one '|'-separated string so it can be
# used as a regex alternation in a filter. The paths are joined verbatim
# (no re.escape), so any regex metacharacter in a path would be read as syntax.
'|'.join(weblogs_df['url'])

'/ministeries/ministerie-van-buitenlandse-zaken/het-werk-van-bz-in-de-praktijk/weblogs/2020/restless-development|/ministeries/ministerie-van-buitenlandse-zaken/het-werk-van-bz-in-de-praktijk/weblogs/2020/susana-puerto-gonzalez|/ministeries/ministerie-van-buitenlandse-zaken/het-werk-van-bz-in-de-praktijk/weblogs/2020/tijmen-rooseboom-youth-at-heart|/ministeries/ministerie-van-buitenlandse-zaken/het-werk-van-bz-in-de-praktijk/weblogs/2020/nationale-raad-van-kinderen|/ministeries/ministerie-van-buitenlandse-zaken/het-werk-van-bz-in-de-praktijk/weblogs/2020/brenda-odallo|/ministeries/ministerie-van-buitenlandse-zaken/het-werk-van-bz-in-de-praktijk/weblogs/2020/romana-osman|/ministeries/ministerie-van-buitenlandse-zaken/het-werk-van-bz-in-de-praktijk/weblogs/2020/ramiro-gomes-monteiro|/ministeries/ministerie-van-buitenlandse-zaken/het-werk-van-bz-in-de-praktijk/weblogs/2020/tommie-van-marula-proteen-limited-uganda|/ministeries/ministerie-van-buitenlandse-zaken/het-werk-van-bz-in-de-praktijk