In [1]:
# Load the pycodestyle_magic extension and switch on style checking,
# so the cells that follow are checked against pycodestyle when run.
%load_ext pycodestyle_magic
%pycodestyle_on

In [2]:
import os

# OZ stands for Omroep Zeeland
# The key is read from the OZ_DISQUS_API_KEY environment variable when
# it is set, so it can be replaced without editing the notebook. The
# literal below is only the fallback that keeps existing runs working.
# NOTE(review): consider dropping the fallback once the variable is
# configured everywhere, so the key no longer lives in the notebook.
OZ_DISQUS_API_KEY = os.environ.get(
    'OZ_DISQUS_API_KEY',
    'E8Uh5l5fHZ6gD8U3KycjAIAk46f68Zw7C6eW8WSjZvCLXebZ7p0r1yrYDrLilk2F'
)

2:80: E501 line too long (86 > 79 characters)


In [3]:
from requests import get
from json import loads


def get_disqus_data(url, identifier, timeout=10):
    '''
    Retrieves the Disqus thread details for a web page.

    url: address of the page the thread belongs to. It is not
        used to build the request; it is kept so that existing
        callers keep working.
    identifier: thread identifier of the page, sent to the API
        as 'ident:<identifier>'.
    timeout: seconds to wait for the Disqus API. Without it an
        unresponsive server would block the scraping run forever.

    Returns the JSON body of the response decoded into
    Python objects.
    '''
    response = get(
        'https://disqus.com/api/3.0/threads/details',
        params={
            'thread': 'ident:' + identifier,
            'forum': 'omroepzeeland',
            'api_key': OZ_DISQUS_API_KEY
        },
        timeout=timeout
    )
    return loads(response.text)

In [4]:
from bs4 import BeautifulSoup
from re import findall


def find_element(url, method, element, class_='', parser='html.parser',
                 timeout=10):
    '''
    Performs web scraping given the url, elements
    and class to look for.

    url: address of the page to download.
    method: name of the BeautifulSoup search method to call
        on the parsed page, e.g. 'find' or 'find_all'.
    element: tag name handed to that search method.
    class_: value handed to the search method as its
        class_ filter.
    parser: name of the parser BeautifulSoup should use.
    timeout: seconds to wait for the page. Without it an
        unresponsive server would block the scraping run forever.

    Returns whatever the chosen search method returns.
    '''
    # Get the HTML of the page
    text = get(url, timeout=timeout).text
    soup = BeautifulSoup(text, parser)
    return getattr(soup, method)(
        element,
        class_=class_
    )

In [5]:
def web_scrape_omroepzeeland(url):
    '''
    Returns a list of all the comments on the web page.
    Each comment is a dictionary with the keys
    'creator', 'text', 'link' and 'timestamp'.

    Returns None when the page has no div with class
    news-comments, when that div holds no script with a
    usable identifier, or when the comment feed has no items.
    '''
    def get_identifier(script_string):
        '''
        Find all parts between single quotes
        and take the second one (the identifier).
        Returns None when there is no second part.
        '''
        # An empty script tag has no string to search in
        if not script_string:
            return None
        quoted_parts = findall("'(.*?)'", script_string)
        if len(quoted_parts) < 2:
            return None
        return quoted_parts[1]

    def get_comment_data(feed_url, page_url):
        '''
        Downloads the comment feed and turns every item in it
        into a comment dictionary. Returns None when the feed
        has no items.
        '''
        def remove_tags(string, tags):
            '''
            Returns the string with every occurrence
            of each of the tags removed.
            '''
            for tag in tags:
                string = string.replace(tag, '')
            return string

        items = find_element(
            feed_url, 'find_all', 'item'
        )

        # Check if there are any comments
        if len(items) == 0:
            return

        return [
            {
                'creator': item.find('dc:creator').text,
                'text': remove_tags(
                    item.find('description').text,
                    [
                        '<br>', '<br/>', '<br />',
                        '<p>', '</p>'
                    ]
                ),
                'link': page_url,
                'timestamp': item.find('pubdate').text
            }
            for item in items
        ]

    container = find_element(url, 'find', 'div', class_='news-comments')
    if not container:
        # Page does not have a div with class news-comments
        return

    script_tag = container.find('script')
    if script_tag is None:
        # The div is there, but without the script that
        # carries the identifier
        return

    identifier = get_identifier(script_tag.string)
    if identifier is None:
        # Without an identifier the thread cannot be looked up
        return

    disqus_data = get_disqus_data(url, identifier)

    return get_comment_data(
        disqus_data['response']['feed'],
        url
    )

In [6]:
def web_scrape(sources_df):
    '''
    Returns a DataFrame with web scraped data
    from the input DataFrame.

    sources_df: DataFrame whose index holds the URLs (links)
        of the pages to scrape; its columns are not read.

    The result has the columns 'creator', 'text', 'link' and
    'timestamp', with one row per comment numbered from 0.
    Pages without comments add no rows.
    '''
    comment_records = []
    for link in sources_df.index:
        comments = web_scrape_omroepzeeland(link)
        # Check if there are any comments
        if comments:
            comment_records.extend(comments)

    # Build the frame once from all records. DataFrame.append no
    # longer exists in pandas 2.0 and copied the frame on every call.
    return pd.DataFrame(
        comment_records,
        columns=['creator', 'text', 'link', 'timestamp']
    )

In [7]:
import pandas as pd
from os import getcwd

# Both files live in the data folder of the current working directory
sources_path = getcwd() + '/data/sources.csv'
comments_path = getcwd() + '/data/comments.csv'

# The links to scrape form the index of the sources table
sources_df = pd.read_csv(sources_path, index_col='link')
comments_df = web_scrape(sources_df)

# Show the first rows, then store all comments without the row numbers
print(comments_df.head())
comments_df.to_csv(comments_path, index=False)

                   creator                                               text  \
0  Bastiaan van Stapelberg  Dit is zulke verschrikkelijke onzin. Ik heb he...   
1        disqus_qR1IbSLz73  Waarop gebaseerd? Niet op de mening van de stu...   
2               HZ Student  Ik kan niet spreken voor alle opleidingen binn...   
3          Kuchende meneer  Hoe zit het nu met HBO-ICT?, of mogen wij daar...   
4               T'is roak!  Hier nog een anonieme insider. Deze sluit zich...   

                                                link  \
0  https://www.omroepzeeland.nl/nieuws/122990/HZ-...   
1  https://www.omroepzeeland.nl/nieuws/122990/HZ-...   
2  https://www.omroepzeeland.nl/nieuws/122990/HZ-...   
3  https://www.omroepzeeland.nl/nieuws/108573/HZ-...   
4  https://www.omroepzeeland.nl/nieuws/101070/HZ-...   

                         timestamp  
0  Thu, 08 Oct 2020 06:23:20 -0000  
1  Thu, 08 Oct 2020 05:48:38 -0000  
2  Thu, 08 Oct 2020 05:31:13 -0000  
3  Thu, 04 Oct 2018 03:19:33