In [None]:
import helpers
import pathlib
import json
import time

from datetime import datetime, timezone
from selenium.webdriver import Remote, ChromeOptions
from bs4 import BeautifulSoup

# Timezone-aware timestamp (UTC) taken when this cell runs.
now = datetime.now(tz=timezone.utc)

# Day to scrape, formatted as YYYY-MM-DD. It is pinned to a fixed date here;
# enable the commented line instead to follow the current UTC day.
# today = now.strftime("%Y-%m-%d")
today = "2024-02-13"

In [None]:
# Directory layout, anchored at the parent of the current working directory:
#   <BASE_DIR>/dataset/<today>/posts
NBS_DIR = pathlib.Path().resolve().parent
BASE_DIR = NBS_DIR
DATASET_DIR = BASE_DIR.joinpath("dataset")
TODAYS_DIR = DATASET_DIR.joinpath(today)
POSTS_DIR = TODAYS_DIR.joinpath("posts")

# Show the resolved locations and whether the posts directory is already there.
print(BASE_DIR, POSTS_DIR, POSTS_DIR.exists())

In [None]:
# Create the posts directory together with any missing parents; an already
# existing directory is not an error, so this cell can be re-run safely.
POSTS_DIR.mkdir(exist_ok=True, parents=True)

In [None]:
# --- Scrape configuration ---------------------------------------------------
# NOTE: `today` (the day to scrape) and `now` are intentionally NOT redefined
# here. They are set once in the first cell, and the output directories are
# derived from that value; a second definition in this cell could drift from it
# and make the URLs point at a different day than the folder the posts land in.

# Number of listing pages to fetch for the day.
MAX_PAGES = 1

# Listing page for a given day/page number, and the discussion page of one item.
url_pattern = "https://news.ycombinator.com/front?day={day}&p={page}"
detail_pattern = "https://news.ycombinator.com/item?id={item_id}"

# Browser options: disable downloading images.
options = ChromeOptions()
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)

# Value handed to selenium's `Remote(...)` as the remote endpoint in later
# cells; how it is built is up to the project-local `helpers` module.
sbr_connection = helpers.get_sbr_connection()

In [None]:
# Raw HTML of every listing page fetched for `today`, in page order.
html_datas = []
page_numbers = range(1, MAX_PAGES + 1)

# A single remote browser session is shared by all listing pages.
with Remote(sbr_connection, options=options) as driver:
    for page in page_numbers:
        url = url_pattern.format(page=page, day=today)
        print(page, url)
        # Navigate to the page (HTTP GET), then pause for two seconds before
        # reading the page source.
        driver.get(url)
        time.sleep(2)
        html_source = driver.page_source
        html_datas.append(html_source)

In [None]:
def extract_post_data(tr, thread_url_pattern=None):
    """Extract the metadata of one post from its listing row.

    Args:
        tr: The post's row element. It must offer the BeautifulSoup tag
            interface used here: ``attrs``, ``find``, ``find_all`` and
            ``find_next``. The score is looked up in the ``<tr>`` that
            follows this row.
        thread_url_pattern: Format string containing an ``{item_id}``
            placeholder, used to build the thread link. When omitted, the
            notebook-level ``detail_pattern`` is used.

    Returns:
        A dict with the keys ``"id"``, ``"text"``, ``"target_link"``,
        ``"score"`` and ``"thread_link"``. ``"text"``, ``"target_link"`` and
        ``"score"`` are ``None`` when the corresponding element is missing.
        ``"score"`` is the string of digits found in the score text, not an int.
    """
    if thread_url_pattern is None:
        # Fall back to the notebook global so existing one-argument calls keep working.
        thread_url_pattern = detail_pattern

    post_id = tr.attrs.get('id')

    # The score lives in the row after the post row; tolerate a missing row.
    next_tr = tr.find_next('tr')
    score_span = None
    if next_tr is not None:
        score_span = next_tr.find("span", class_="score")
    score = None
    if score_span is not None:
        score = "".join(ch for ch in score_span.get_text() if ch.isdigit())

    # A row without a title span yields text=None instead of raising.
    title_element = tr.find("span", class_="titleline")
    text = title_element.get_text() if title_element is not None else None

    # First absolute (http/https) link in the row. Anchors without an href are
    # skipped; calling .startswith on their missing href used to raise.
    hrefs = (anchor.get('href') for anchor in tr.find_all('a'))
    target_link = next((href for href in hrefs if href and href.startswith("http")), None)

    return {
        "id": post_id,
        "text": text,
        'target_link': target_link,
        "score": score,
        "thread_link": thread_url_pattern.format(item_id=post_id),
    }

In [None]:
def scrape_link(url=None, driver=None):
    """Load ``url`` in ``driver`` and return the resulting page source.

    An empty string is returned, and the driver is left untouched, when
    ``url`` is ``None``, when its string form does not start with ``"http"``,
    or when no driver is supplied.
    """
    is_http_url = url is not None and f"{url}".startswith("http")
    if not is_http_url or not driver:
        return ""
    driver.get(url)
    return driver.page_source

In [None]:
def save_json_data(data, path=None):
    """Write ``data`` to ``path`` as indented JSON.

    Args:
        data: Any JSON-serialisable value (the notebook passes a post dict).
        path: A ``pathlib.Path``-like object providing ``write_text``. When it
            is ``None`` (or otherwise falsy) nothing is serialised or written.

    Returns:
        None.
    """
    if not path:
        # Nothing to write to, so skip the serialisation work as well.
        return
    json_data = json.dumps(data, indent=4)
    path.write_text(json_data, encoding="utf-8")

In [None]:
def scrape_and_save(data, key, driver=None, path=None):
    """Scrape the URL stored under ``data[key]`` and save its HTML to ``path``.

    This is best-effort: if looking up or loading the URL raises, the failure
    is reported with ``print`` and no file is written, so the caller's
    "file does not exist yet" check will retry it on a later run.

    Args:
        data: Mapping holding the post's links (looked up with ``.get(key)``).
        key: Key of the link to scrape, e.g. ``'thread_link'``.
        driver: Browser driver forwarded to ``scrape_link``.
        path: ``pathlib.Path``-like destination; when ``None`` the function
            returns immediately without scraping.

    Returns:
        None.
    """
    if path is None:
        return

    url = None
    try:
        url = data.get(key)
        page_source = scrape_link(url, driver=driver)
    except Exception as exc:
        # Catch Exception rather than everything, so that Ctrl-C
        # (KeyboardInterrupt) can still stop a long scraping run.
        print(f"Failed to scrape {url!r}: {exc}")
        return

    if page_source is not None:
        # Explicit encoding: scraped pages routinely contain non-ASCII text and
        # the platform default encoding may not be able to represent it.
        path.write_text(page_source, encoding="utf-8")

In [None]:
# Per-artifact switches read by the scraping loop.
save_thread_data = True        # write detail.json for each post
scrape_thread_detail = False   # fetch the discussion page into thread.html
scrape_target = False          # fetch the linked article into target.html

# Master switch: the loop tests each flag as `<flag> or scrape_all`, so while
# this is True every artifact is produced regardless of the flags above.
scrape_all = True

In [None]:
# Metadata dict of every post processed in this run, in page order.
dataset = []
for html_source in html_datas:
    soup = BeautifulSoup(html_source, 'html.parser')
    # Each post on a listing page is a <tr class="athing"> row.
    rows = soup.find_all('tr', class_="athing")
    if not rows:
        # No posts on this page: do not open a remote browser session for nothing.
        continue
    with Remote(sbr_connection, options=options) as driver:
        for tr in rows:
            # Named `post_id` so the builtin `id` is not overwritten for later cells.
            post_id = tr.attrs.get('id')
            if not post_id:
                # Without an id there is no stable folder name (it would be "None").
                continue

            post_dir = POSTS_DIR / f"{post_id}"
            post_dir.mkdir(parents=True, exist_ok=True)
            json_output_path = post_dir / 'detail.json'
            thread_output_path = post_dir / "thread.html"
            target_output_path = post_dir / "target.html"

            data = extract_post_data(tr)
            dataset.append(data)

            # Each artifact is produced only when enabled and not already on
            # disk, so re-running the cell resumes where it stopped.
            if (save_thread_data or scrape_all) and not json_output_path.exists():
                save_json_data(data, path=json_output_path)
            if (scrape_thread_detail or scrape_all) and not thread_output_path.exists():
                scrape_and_save(data, 'thread_link', driver=driver, path=thread_output_path)
            if (scrape_target or scrape_all) and not target_output_path.exists():
                scrape_and_save(data, 'target_link', driver=driver, path=target_output_path)