In [65]:
import datetime
from time import sleep, time
import re
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import requests
from bs4 import BeautifulSoup
import csv
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, wait
from webdriver_manager.chrome import ChromeDriverManager

In [66]:
# Declare some constants:

# Name of the CSV file the scraped rows are appended to (joined onto base_dir).
filename = 'articles_info.csv' 
# Path to the chromedriver binary passed to webdriver.Chrome in run_process.
# NOTE(review): absolute, machine-specific path; it must be edited before the
# notebook can run anywhere else.
driver_path = '/Users/anna.zemit/chromedriver'
# Directory that receives the CSV file (same machine-specific caveat).
base_dir= '/Users/anna.zemit/Desktop' 
# Value of the User-Agent header sent by get_load_time.
user_agent = 'Mozilla/5.0' 
# Timestamp taken when this cell runs. The last cell subtracts it from its own
# end time, so the reported figure also covers any time spent between the two.
start_time = time() 

In [67]:
# Measure how long each article URL takes to respond (3-second timeout)

def get_load_time(article_url, user_agent):
    """Return how long ``article_url`` took to respond, in seconds.

    The request is sent with ``stream=True``, so the body is not downloaded
    here; the value reported is ``response.elapsed`` converted to seconds.

    Args:
        article_url: URL to request.
        user_agent: Value sent in the ``User-Agent`` header.

    Returns:
        The elapsed time in seconds, or the string ``">3"`` when the request
        fails for any reason (including exceeding the 3-second timeout).
    """
    headers = {"User-Agent": user_agent}
    response = None
    try:
        response = requests.get(
            article_url, headers=headers, stream=True, timeout=3.000
        )
        load_time = response.elapsed.total_seconds()
    except Exception as e:
        # Best effort: every failure is reported with the same ">3" marker.
        print(e)
        load_time = ">3"
    finally:
        # A streamed response keeps its connection checked out until it is
        # read to the end or closed, so release it explicitly.
        if response is not None:
            response.close()
    return load_time

In [68]:
# Save the results to a file

def write_to_file(output_list, filename, base_dir):
    """Append article records to a CSV file.

    Args:
        output_list: List of dicts keyed by ``id``, ``load_time``, ``rank``,
            ``points``, ``title``, ``url`` and ``comments_number``.
        filename: Name of the CSV file.
        base_dir: Directory that holds (or will hold) the file.

    No header row is written; rows are appended after whatever the file
    already contains. Nothing is created when ``output_list`` is empty.
    """
    if not output_list:
        return

    fieldnames = ['id', 'load_time', 'rank', 'points', 'title', 'url', 'comments_number']
    # Open once for the whole batch. newline="" is what the csv module expects
    # (otherwise "\r\n" can be translated again on some platforms), and an
    # explicit encoding keeps non-ASCII titles from depending on the locale.
    with open(
        Path(base_dir).joinpath(filename), "a", newline="", encoding="utf-8"
    ) as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerows(output_list)

In [69]:
# Open a Hacker News listing page and wait for the element with id "hnmain" to appear. Returns True on success, False after three failed attempts.

def connect_to_base(browser, page_number):
    """Open a Hacker News listing page and wait until it is ready.

    The page counts as ready once an element with id ``hnmain`` is present
    (waiting up to 5 seconds per try). Up to three tries are made.

    Args:
        browser: WebDriver instance used to load the page.
        page_number: Value substituted into the ``p`` query parameter.

    Returns:
        True as soon as one try succeeds, False when all three fail.
    """
    base_url = "https://news.ycombinator.com/news?p={}".format(page_number)
    attempt = 1
    while attempt <= 3:
        try:
            browser.get(base_url)
            waiter = WebDriverWait(browser, 5)
            waiter.until(EC.presence_of_element_located((By.ID, "hnmain")))
        except Exception as error:
            # Report the failure and move on to the next try.
            print(error)
            print("Error connecting to {}.".format(base_url))
            print("Attempt #{}.".format(attempt))
            attempt += 1
        else:
            return True
    return False

In [70]:
# Parse the page, extract the attributes of each article and return them as a list of dicts

def _absolute_url(href):
    """Resolve an href taken from a listing page against the Hacker News root."""
    # Function-scope import so this cell does not depend on the import cell.
    from urllib.parse import urljoin, urlsplit

    try:
        if urlsplit(href).scheme:
            # Already absolute: keep it untouched, even if its path happens to
            # contain "item?id=" or "from?site=".
            return href
        return urljoin("https://news.ycombinator.com/", href)
    except ValueError:
        # Malformed URL: hand it on unchanged and let the request step report it.
        return href


def parse_html(html, user_agent):
    """Extract one record per article row from a Hacker News listing page.

    For every ``tr.athing`` row the article link is resolved to an absolute
    URL and timed with ``get_load_time`` (one HTTP request per row).

    Args:
        html: Page source of a listing page.
        user_agent: ``User-Agent`` value forwarded to ``get_load_time``.

    Returns:
        A list of dicts with the keys ``id``, ``load_time``, ``rank``,
        ``points``, ``title``, ``url`` and ``comments_number``. Rows without a
        usable article link are skipped (a message is printed) instead of
        aborting the whole page.
    """
    soup = BeautifulSoup(html, "html.parser")
    output_list = []

    tr_blocks = soup.find_all("tr", class_="athing")
    # Rows and subtext cells are paired by position.
    # NOTE(review): assumes one td.subtext per tr.athing, in the same order.
    subtext = soup.find_all("td", class_="subtext")

    for article, tr in enumerate(tr_blocks):
        article_id = tr.get("id")

        # Prefer the anchor marked as the title link; fall back to the second
        # anchor of the row, which is where the original lookup found it.
        title_tag = tr.find("a", class_="titlelink")
        if title_tag is None:
            anchors = tr.find_all("a")
            title_tag = anchors[1] if len(anchors) > 1 else None
        href = title_tag.get("href") if title_tag is not None else None
        if href is None:
            print("Skipping row {}: no article link found.".format(article_id))
            continue

        # Links to pages hosted on Hacker News itself are relative.
        article_url = _absolute_url(href)
        load_time = get_load_time(article_url, user_agent)

        # Some rows have no score element.
        score_tag = soup.find(id=f"score_{article_id}")
        score = score_tag.string if score_tag is not None else "0 points"

        # The comment count is absent when there are no comments yet.
        comments_number = 0
        if article < len(subtext):
            found = re.findall('([0-9]+)\xa0comment[s]*', subtext[article].text)
            if found:
                comments_number = int(found[0])

        rank_tag = tr.span
        output_list.append({
            "id": article_id,
            "load_time": load_time,
            "rank": rank_tag.string if rank_tag is not None else None,
            "points": score,
            "title": title_tag.string,
            "url": article_url,
            'comments_number': comments_number
        })
    return output_list

In [None]:
def run_process(page_number, filename):
    """Scrape one listing page and append its articles to the CSV file.

    Starts a Chrome instance, loads the page through ``connect_to_base``,
    parses it with ``parse_html`` and stores the rows with ``write_to_file``.
    Reads the module-level ``driver_path``, ``user_agent`` and ``base_dir``.

    Args:
        page_number: Listing page to load.
        filename: Name of the CSV file to append to.

    Any exception raised while parsing or writing propagates to the caller,
    but the browser is always shut down first.
    """
    # NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of
    # a Service object; confirm against the installed Selenium version.
    browser = webdriver.Chrome(executable_path=driver_path)
    try:
        if connect_to_base(browser, page_number):
            # Fixed pause kept from the original, on top of the explicit wait
            # already done in connect_to_base.
            sleep(5)
            output_list = parse_html(browser.page_source, user_agent)
            write_to_file(output_list, filename, base_dir)
        else:
            print("Error connecting to hacker news")
    finally:
        # Without this, a failure above would leave the Chrome process running.
        browser.quit()

futures = []

# One task per listing page; each task drives its own Chrome instance.
# NOTE(review): range(2) requests p=0 and p=1; confirm that these are two
# different listing pages and not the same front page twice.
with ThreadPoolExecutor() as executor:
    for number in range(2):
        futures.append(
            executor.submit(run_process, number, filename)
        )

# Leaving the with-block has already waited for the workers; the explicit
# wait() is kept as a harmless barrier.
wait(futures)
end_time = time()

# An exception raised inside a worker is stored on its future and stays
# invisible unless it is asked for, so report failures explicitly.
for page_index, future in enumerate(futures):
    error = future.exception()
    if error is not None:
        print("Page {} failed: {!r}".format(page_index, error))

elapsed_time = end_time - start_time
print("Elapsed run time: {} seconds".format(elapsed_time))