In [1]:
from pprint import pprint
from hashlib import sha256
from time import sleep
from io import BytesIO
from PIL import Image
import csv

import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from func_timeout import func_set_timeout, FunctionTimedOut
from tqdm import tqdm

# HACK: Add the conda env to the PATH; this is needed for the Firefox driver.
import os

_CONDA_ENV_BIN = "/home/matjazibb/miniconda3/envs/nlp/bin"
_current_path = os.environ.get("PATH", "")
# Only extend PATH when the directory is not already listed, so re-running this
# cell does not keep appending duplicate entries. os.pathsep keeps the
# separator correct on every platform.
if _CONDA_ENV_BIN not in _current_path.split(os.pathsep):
    os.environ["PATH"] = (
        f"{_current_path}{os.pathsep}{_CONDA_ENV_BIN}" if _current_path else _CONDA_ENV_BIN
    )

## 24h News - Scraping 

In [2]:
def get_screenshot(driver):
    """Take a PNG screenshot through ``driver`` and return it opened as a PIL image."""
    png_buffer = BytesIO(driver.get_screenshot_as_png())
    return Image.open(png_buffer)

In [3]:
def news24h_get_total_comments(driver):
    """Return the comment count displayed in the article details header.

    Args:
        driver: Selenium WebDriver with the article page loaded.

    Returns:
        The counter text parsed as an ``int``.

    Raises:
        ValueError: if the element's text is not a plain integer.
        Whatever ``driver.find_element`` raises when the element is absent.
    """
    selector = "div.article__details .article__details-item.c-pointer.link.link--plain div.article__details-content > div.article__details-main"
    # find_element(By.CSS_SELECTOR, ...) exists in Selenium 3 and 4, whereas the
    # find_element_by_* helpers were removed in Selenium 4.3.
    return int(driver.find_element(By.CSS_SELECTOR, selector).text.strip())

def news24h_get_loaded_comments(driver):
    """Return how many comment elements are currently present in the page DOM.

    Args:
        driver: Selenium WebDriver with the article page loaded.

    Returns:
        Number of elements matching ``div.article__comments div.comment[id]``
        (0 when none are found).
    """
    # NOTE: an earlier variant read the number from the comments-section title
    # (`#onl-article-comments ... > h3 > span`); the rendered comment nodes are
    # counted directly instead.
    selector = "div.article__comments div.comment[id]"
    # find_elements(By.CSS_SELECTOR, ...) exists in Selenium 3 and 4, whereas the
    # find_elements_by_* helpers were removed in Selenium 4.3.
    return len(driver.find_elements(By.CSS_SELECTOR, selector))

def news24h_accept_cookies(driver):
    """Best-effort click on the cookie-consent button.

    Waits up to 5 seconds for the button to become clickable, clicks it and
    pauses for one second. Ordinary failures (button never appears, click is
    rejected, ...) are reported and ignored so scraping can continue.

    Args:
        driver: Selenium WebDriver with the article page loaded.
    """
    selector = "onl-cookie > div > div > div > div.cookies__right > a.button.button--large.button--primary.button--expanded.button--noborder"
    cookies_present = EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
    try:
        # until() hands back the truthy value produced by the condition, which
        # for element_to_be_clickable is the element itself, so there is no
        # need to look it up a second time.
        accept_button = WebDriverWait(driver, 5).until(cookies_present)
        accept_button.click()
        sleep(1)
    except Exception as exc:
        # `except Exception` (not a bare `except`) keeps KeyboardInterrupt and
        # SystemExit working, e.g. when the kernel is interrupted.
        print(f"Cookie banner was not accepted ({type(exc).__name__}); continuing.")

@func_set_timeout(60)
def news24h_expend_comments(driver, max_comments=200):
    """Expand the comments section by repeatedly clicking the "load more" button.

    Clicking stops as soon as one of these holds:
      * at least ``min(max_comments, total comments)`` comments are loaded,
      * the "load more" button is not clickable,
      * an ordinary error occurs (reported, then ignored - best effort).

    The ``func_set_timeout(60)`` decorator bounds the whole call to 60 seconds;
    the caller is expected to handle ``FunctionTimedOut``.

    Args:
        driver: Selenium WebDriver with the article page loaded.
        max_comments: Upper bound on how many comments to load (default 200).
    """
    load_more_clickable = EC.element_to_be_clickable((By.CSS_SELECTOR, "div.comments__more > button"))
    try:
        total_comments = news24h_get_total_comments(driver)
        loaded_comments = news24h_get_loaded_comments(driver)
        target = min(max_comments, total_comments)
        while loaded_comments < target:
            # The condition returns the element when it is clickable and a
            # falsy value otherwise, so one evaluation both checks and locates
            # the button.
            load_more_button = load_more_clickable(driver)
            if not load_more_button:
                break
            print(f"Loading more comments ({loaded_comments}/{total_comments})...")
            load_more_button.click()
            # Give the page time to fetch and render the next batch.
            sleep(5)
            loaded_comments = news24h_get_loaded_comments(driver)
    except Exception as exc:
        # `except Exception` (not a bare `except`) keeps KeyboardInterrupt and
        # SystemExit working; whatever was loaded so far is kept.
        print(f"Stopped loading more comments ({type(exc).__name__}).")

In [4]:
def _news24h_node_text(node):
    """Return the stripped text of a parsed HTML node, or None when the node is missing."""
    return node.text.strip() if node is not None else None


def _news24h_parse_comments(html, url):
    """Extract ``(url, id, timestamp, author, body)`` tuples from a parsed article page."""
    comments = []
    for comment in html.select("div.article__comments div.comment[id]"):
        author_name = _news24h_node_text(comment.select_one("div.comment__header a.comment__author"))
        # Only the SHA-256 hex digest of the author name is kept, never the name itself.
        author = sha256(author_name.encode()).hexdigest() if author_name is not None else None
        timestamp = _news24h_node_text(comment.select_one("div.comment__header div.comment__timestamp"))
        body = _news24h_node_text(comment.select_one("div.comment__body"))

        comments.append((url, comment.attrs["id"], timestamp, author, body))
    return comments


def news_24h_load_comments(url):
    """Scrape the comments of one 24ur.com article with headless Firefox.

    Opens the page, accepts the cookie banner, waits up to 10 seconds for the
    comments section, expands it (bounded by the expander's own limits) and
    parses the resulting HTML.

    Args:
        url: Article URL; must start with ``https://www.24ur.com/``.

    Returns:
        A list of ``(url, id, timestamp, author, body)`` tuples, where
        ``author`` is the SHA-256 hex digest of the author name. A field whose
        HTML node is missing from a comment is ``None``.

    Raises:
        AssertionError: if ``url`` does not point at www.24ur.com.
        Whatever Selenium raises when the comments section never appears.
    """
    assert url.startswith("https://www.24ur.com/"), f"not a 24ur.com URL: {url!r}"

    driver_options = Options()
    # Pass the flag directly: the `Options.headless` setter is deprecated since
    # Selenium 4.8 and removed in later releases, while add_argument works on
    # both old and new versions.
    driver_options.add_argument("-headless")
    with webdriver.Firefox(options=driver_options) as driver:
        driver.get(url)

        # Accept cookies
        news24h_accept_cookies(driver)

        # Wait for comments section to be loaded
        comments_present = EC.presence_of_element_located((By.CSS_SELECTOR, "div.article__comments"))
        WebDriverWait(driver, 10).until(comments_present)

        # Expand comments section by clicking on "load more" button, max 200 comments
        try:
            news24h_expend_comments(driver)
        except FunctionTimedOut:
            # Timed out while expanding: continue with the comments loaded so far.
            pass

        # Snapshot the DOM before the browser is closed by the `with` block.
        html = BeautifulSoup(driver.page_source, 'html.parser')

    return _news24h_parse_comments(html, url)

## Scrape selected pages

In [5]:
# Articles on www.24ur.com whose comment sections are scraped below.
urls = [
    "https://www.24ur.com/novice/fokus/novinarstvo-je-kisik-demokracije.html",
    "https://www.24ur.com/novice/tujina/severna-koreja-nad-bidna-in-lazno-diplomacijo-vodi-sovrazno-politiko.html",
    "https://www.24ur.com/popin/glasba/billie-eilish-napovedala-prihod-novega-albuma.html",
    "https://www.24ur.com/novice/znanost-in-tehnologija/zeleni-bitcoin.html",
    "https://www.24ur.com/tv-oddaje/sanjski-moski/sanjskemu-moskemu-je-ob-postavni-kaji-postalo-vroce.html",
    "https://www.24ur.com/novice/crna-kronika/islamska-skupnost-o-skrunitvi-grobov-v-domzalah-gre-za-javno-spodbujanje-sovrastva-nasilja-in-nestrpnosti.html",
]

In [6]:
# Scrape every selected article in turn and pool the results into one flat list.
comments = []
for url in tqdm(urls):
    comments.extend(news_24h_load_comments(url))

  0%|          | 0/6 [00:00<?, ?it/s]Loading more comments (29/490)...
Loading more comments (63/490)...
Loading more comments (119/490)...
Loading more comments (170/490)...
 17%|█▋        | 1/6 [00:28<02:22, 28.55s/it]Loading more comments (31/49)...
 50%|█████     | 3/6 [00:47<00:55, 18.53s/it]Loading more comments (75/171)...
Loading more comments (132/171)...
 67%|██████▋   | 4/6 [01:04<00:36, 18.26s/it]Loading more comments (24/48)...
 83%|████████▎ | 5/6 [01:16<00:16, 16.33s/it]Loading more comments (24/428)...
Loading more comments (56/428)...
Loading more comments (96/428)...
Loading more comments (132/428)...
Loading more comments (172/428)...
100%|██████████| 6/6 [01:50<00:00, 18.39s/it]


In [7]:
# Tabulate the scraped tuples; the column order mirrors the tuple layout
# produced by news_24h_load_comments.
comments_df = pd.DataFrame(
    data=comments,
    columns=["url", "id", "timestamp", "author", "body"],
)

In [8]:
# Create the output directory from Python instead of shelling out to `mkdir -p`,
# so the cell also works on systems without that command and outside IPython.
os.makedirs("data/ours/news_24h", exist_ok=True)
# QUOTE_ALL wraps every field in quotes when writing the CSV.
comments_df.to_csv("data/ours/news_24h/raw.csv", index=False, quoting=csv.QUOTE_ALL)

In [9]:
# Preview the first rows of the scraped comments.
comments_df.head()

Unnamed: 0,url,id,timestamp,author,body
0,https://www.24ur.com/novice/fokus/novinarstvo-...,c_01F4P7BHMZVJB9K0TXZS9TMBPF,"02.05.2021, 11:43:06",24471d8cf01a0482316844ce2117e2b10937833b483bfe...,"Desni nam bodo povedali kaj je demokracija, ti..."
1,https://www.24ur.com/novice/fokus/novinarstvo-...,c_01F4P7BHMZVJB9K0TXZS9TMBPF_01F4P7F72V2TBPFG7...,"02.05.2021, 11:45:06",0c64410e8ecc5ead90e484048afd6873f370d9d3c918bd...,"jp, oni vedo, jim je diktator pacient povedal ..."
2,https://www.24ur.com/novice/fokus/novinarstvo-...,c_01F4P79AV0WMMXF48J2T6M117V,"02.05.2021, 11:41:54",fee0df776e4d4db012e0478065c908dc72529dc0a3d83c...,ker ni novica tega portala je ne smeš oment
3,https://www.24ur.com/novice/fokus/novinarstvo-...,c_01F4P78MRBPHV4DWCZGZEZAZK7,"02.05.2021, 11:41:31",deba298c79866ae7e693835b3aee82542331be9a723381...,"Če bi obstajal pritisk na medije, ta članek sp..."
4,https://www.24ur.com/novice/fokus/novinarstvo-...,c_01F4P765PYDN54J3XGBTJ9KED6,"02.05.2021, 11:40:10",6051da199c1341f6783b7d0324c73af056d0542158040e...,Je pa Veselko pošten-je zavrnu 17 tisoč ponuje...
