In [None]:
# Install the Colab Selenium helper into the kernel's environment (quiet mode).
%pip install -q google-colab-selenium
import google_colab_selenium as gs
# Start a Chrome session with default options.
# NOTE(review): a later cell rebinds `driver` with gs.Chrome(options=...) without
# calling quit() on this instance -- confirm this first session is actually needed.
driver = gs.Chrome()

<IPython.core.display.Javascript object>

In [None]:
!pip install selenium



In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time

# Chrome command-line switches, applied in the order listed.
chrome_options = Options()
for _flag in (
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-gpu",
    "--window-size=1920,1080",
    "--disable-infobars",
    "--disable-popup-blocking",
    "--ignore-certificate-errors",
    "--incognito",
):
    chrome_options.add_argument(_flag)

# Not passed to gs.Chrome() below; kept as a module-level name.
chrome_path = "/usr/bin/chromedriver"

# Rebinds the module-level `driver` to a session created with the options above.
driver = gs.Chrome(options=chrome_options)

#----------------------------------------------------------------------------------------------------------------------------

# Open the topic listing, then pause for a fixed five seconds before scraping.
url = "https://discuss.huggingface.co/c/tokenizers/11/l/top"
driver.get(url)

time.sleep(5)

def _parse_count(text):
    """Convert a counter string such as "12", "1,234", "1.2k" or "3M" to an int.

    Plain integers behave exactly like ``int(text.strip())``. Thousands
    separators and a trailing ``k``/``m`` multiplier are additionally accepted,
    since forum UIs may abbreviate large counters. Raises ``ValueError`` when
    the text is not a number in any of those forms.
    """
    cleaned = text.strip().lower().replace(",", "")
    multiplier = 1
    if cleaned.endswith("k"):
        multiplier, cleaned = 1_000, cleaned[:-1]
    elif cleaned.endswith("m"):
        multiplier, cleaned = 1_000_000, cleaned[:-1]
    if multiplier == 1:
        return int(cleaned)
    return int(round(float(cleaned) * multiplier))


def scrape_titles_links_and_stats():
    """Collect title, link, reply count, views and activity for every listed topic.

    Reads the page currently loaded in the module-level ``driver``, scrolling to
    the end of the page repeatedly until a scroll no longer increases the number
    of topic rows.

    Returns:
        list[dict]: one dict per unique (title, link) pair with the keys
        ``topic_title``, ``topic_link``, ``replies`` (int), ``views`` (str, as
        displayed) and ``activity`` (str, the cell's ``title`` attribute).

    Rows that fail to parse are reported with ``print`` and skipped.
    """
    titles_links_stats = []
    seen_titles_links = set()

    while True:
        # all topic rows currently present in the DOM
        topic_rows = driver.find_elements(By.CSS_SELECTOR, "tr.topic-list-item")

        for row in topic_rows:
            try:
                # title and link
                title_elem = row.find_element(By.CSS_SELECTOR, "span.link-top-line a.title.raw-link.raw-topic-link")
                topic_title = title_elem.text.strip()
                topic_link = title_elem.get_attribute('href')

                # replies -- parsed tolerantly so abbreviated counters such as
                # "1.2k" do not raise and cause the whole topic to be dropped
                replies_elem = row.find_element(By.CSS_SELECTOR, "td.num.posts span")
                replies = _parse_count(replies_elem.text)

                # views (kept as displayed text)
                views_elem = row.find_element(By.CSS_SELECTOR, "td.num.views span")
                views = views_elem.text.strip()

                # activity
                activity_elem = row.find_element(By.CSS_SELECTOR, "td.activity")
                activity = activity_elem.get_attribute("title").strip()

                # add to list if not already seen (rows are re-read after every scroll)
                if (topic_title, topic_link) not in seen_titles_links:
                    titles_links_stats.append({
                        "topic_title": topic_title,
                        "topic_link": topic_link,
                        "replies": replies,
                        "views": views,
                        "activity": activity
                    })
                    seen_titles_links.add((topic_title, topic_link))

            except Exception as e:
                print(f"Error scraping a topic row: {e}")

        # scroll down to load more content
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
        time.sleep(3)

        # stop if no new topics are loaded
        if len(topic_rows) == len(driver.find_elements(By.CSS_SELECTOR, "tr.topic-list-item")):
            break

    return titles_links_stats

def get_topic_details(topic_link):
    """Open ``topic_link`` in the shared ``driver`` and read the opening post.

    Returns a dict with ``topic_owner``, ``topic_content``, ``timestamp`` and
    ``replies`` (the list produced by ``get_replies_details``). If any lookup
    fails, the error is printed and a dict with ``None`` for the three text
    fields and an empty ``replies`` list is returned instead.
    """
    driver.get(topic_link)
    try:
        # Each lookup waits up to 10 seconds for the element to be present.
        wait = WebDriverWait(driver, 10)

        def first_present(css):
            return wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, css)))

        # Lookups run in the same order as the fields are listed: owner, content, timestamp.
        topic_owner = first_present("span.first.username a").text.strip()
        topic_content = first_present(
            "div.topic-post.clearfix div.topic-body.clearfix div.cooked"
        ).text.strip()
        timestamp = first_present(
            "div.post-info.post-date a span[title]"
        ).get_attribute("title").strip()

        details = {
            "topic_owner": topic_owner,
            "topic_content": topic_content,
            "timestamp": timestamp,
            "replies": get_replies_details(),
        }
    except Exception as e:
        print(f"Error scraping topic details for {topic_link}: {e}")
        details = {
            "topic_owner": None,
            "topic_content": None,
            "timestamp": None,
            "replies": [],
        }
    return details

def get_replies_details():
    """Read every post after the first one on the page loaded in ``driver``.

    Returns a list of dicts with the keys ``username_topic_answer``,
    ``topic_answer`` and ``timestamp_topic_answer``. A post that cannot be
    parsed is reported with ``print`` and left out; a failure to list the
    posts at all is reported and yields whatever was collected so far.
    """
    collected = []
    try:
        post_bodies = driver.find_elements(By.CSS_SELECTOR, "div.topic-body.clearfix")

        # The first match is skipped -- presumably the opening post, which the
        # caller reads separately.
        for post in post_bodies[1:]:
            try:
                author = post.find_element(
                    By.CSS_SELECTOR, "div.names.trigger-user-card span.first.username"
                ).text.strip()
                answer_text = post.find_element(
                    By.CSS_SELECTOR, "div.regular.contents div.cooked"
                ).text.strip()
                posted_at = post.find_element(
                    By.CSS_SELECTOR, "div.post-info.post-date a.widget-link.post-date span[title]"
                ).get_attribute("title").strip()
            except Exception as e:
                print(f"Error scraping a reply: {e}")
            else:
                collected.append({
                    "username_topic_answer": author,
                    "topic_answer": answer_text,
                    "timestamp_topic_answer": posted_at,
                })
    except Exception as e:
        print(f"Error finding replies: {e}")

    return collected

def save_to_csv(data, filename="scraped_data.csv"):
    """Write the scraped topics to ``filename``, one fully quoted CSV row per topic.

    Args:
        data: list of topic dicts carrying ``topic_title``, ``topic_link``,
            ``replies`` (int), ``views``, ``activity`` and a ``details`` dict
            with ``topic_owner``, ``topic_content``, ``timestamp`` and
            ``replies`` (list of reply dicts).
        filename: destination path; the file is overwritten.

    Each reply occupies three columns (``username_topic_answer_i``,
    ``topic_answer_i``, ``timestamp_topic_answer_i``). The number of reply
    column groups is the largest of the listed reply counts and the numbers of
    replies actually scraped, so no row is ever wider than the header. Rows
    with fewer replies are padded with empty cells. Newlines inside text
    fields are replaced by spaces, and missing (``None``) values are written
    as empty cells. An empty ``data`` produces a header-only file.
    """
    def _one_line(value):
        # Topics whose detail scrape failed carry None here; write an empty cell
        # instead of crashing on None.replace(...).
        if value is None:
            return ""
        return str(value).replace("\n", " ")

    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, quoting=csv.QUOTE_ALL)

        # Width is driven by both the listed reply count and the replies that were
        # actually scraped; default=0 keeps an empty data set from raising.
        max_replies = max(
            (
                max(topic["replies"], len(topic["details"]["replies"]))
                for topic in data
            ),
            default=0,
        )

        headers = [
            "topic_title", "topic_link", "replies", "views",
            "activity", "topic_owner", "topic_content", "timestamp"
        ]
        for i in range(1, max_replies + 1):
            headers.extend([
                f"username_topic_answer_{i}",
                f"topic_answer_{i}",
                f"timestamp_topic_answer_{i}"
            ])
        writer.writerow(headers)

        for topic in data:
            details = topic["details"]
            scraped_replies = details["replies"]

            row = [
                topic["topic_title"],
                topic["topic_link"],
                topic["replies"],
                topic["views"],
                _one_line(topic["activity"]),
                details["topic_owner"],
                _one_line(details["topic_content"]),
                _one_line(details["timestamp"]),
            ]

            for reply in scraped_replies:
                row.extend([
                    reply["username_topic_answer"],
                    _one_line(reply["topic_answer"]),
                    reply["timestamp_topic_answer"]
                ])

            # pad so every row has exactly as many cells as the header
            row.extend([""] * ((max_replies - len(scraped_replies)) * 3))

            writer.writerow(row)

#----------------------------------------------------------------------------------------------------------------------------

# Run the full scrape. The browser session is closed in `finally`, so it is
# released even when scraping, printing or the CSV export raises.
try:
    titles_links_stats = scrape_titles_links_and_stats()

    # visit every topic page and attach its details to the listing entry
    for topic in titles_links_stats:
        topic["details"] = get_topic_details(topic["topic_link"])

    # echo everything that was collected
    for topic in titles_links_stats:
        print(f"Title: {topic['topic_title']}")
        print(f"Link: {topic['topic_link']}")
        print(f"Replies: {topic['replies']}")
        print(f"Views: {topic['views']}")
        print(f"Activity: {topic['activity']}")
        print(f"Topic Owner: {topic['details']['topic_owner']}")
        print(f"Topic Content: {topic['details']['topic_content']}")
        print(f"Topic Timestamp: {topic['details']['timestamp']}")
        print("Replies:")
        for reply in topic["details"]["replies"]:
            print(f"  - Username: {reply['username_topic_answer']}")
            print(f"    Answer: {reply['topic_answer']}")
            print(f"    Timestamp: {reply['timestamp_topic_answer']}")
        print("-" * 40)

    # save the data to CSV
    save_to_csv(titles_links_stats)
finally:
    driver.quit()

<IPython.core.display.Javascript object>

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Link: https://discuss.huggingface.co/t/how-to-perform-tokenization-on-an-onnx-model-in-js/17581
Replies: 0
Views: 811
Activity: Created: May 6, 2022 1:06 am
Topic Owner: msamogh
Topic Content: I’ve exported a custom PyTorch-based Transformer model into ONNX to run it on NodeJS. However, the exported model seems to expect input_ids directly (and not raw text).
Is there any way I can perform tokenization in JS?
Topic Timestamp: May 6, 2022 1:06 am
Replies:
----------------------------------------
Title: Make correct padding for text generation with GPT-NEO
Link: https://discuss.huggingface.co/t/make-correct-padding-for-text-generation-with-gpt-neo/45800
Replies: 0
Views: 795
Activity: Created: Jul 5, 2023 9:57 pm
Latest: Jul 5, 2023 10:16 pm
Topic Owner: junoriosity
Topic Content: In order to make generate text sequences with GPT-NEO, I first load all the relevant components for sequence generation for GPTNeoForCausalLM.
fr

In [None]:
# Colab-specific helper module; this cell is only expected to work inside Google Colab.
from google.colab import files

# Hand the CSV to the browser; the name matches save_to_csv's default filename.
files.download("scraped_data.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>