In [2]:
import requests
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import warnings

In [None]:

# Suppress XMLParsedAsHTML warnings
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

def fetch_links(url, start_url, visited, lock):
    new_links = set()
    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.text, 'html.parser')
        for a_tag in soup.find_all('a', href=True):
            full_url = urljoin(url, a_tag['href'])
            if full_url.startswith(start_url):
                with lock:
                    if full_url not in visited:
                        new_links.add(full_url)
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
    return url, new_links

def get_all_links(start_url, max_workers=128):
    visited = set()
    to_visit = [start_url]
    lock = threading.Lock()

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        while to_visit:
            futures = []
            with lock:
                current_batch = to_visit[:]
                to_visit = []

            for url in current_batch:
                with lock:
                    visited.add(url)
                    print(f"[✓] Visited: {url}")
                futures.append(executor.submit(fetch_links, url, start_url, visited, lock))

            for future in as_completed(futures):
                _, new_links = future.result()
                with lock:
                    for link in new_links:
                        if link not in visited and link not in to_visit:
                            to_visit.append(link)

    print(f"\n🎉 Done! Collected {len(visited)} links.")
    # Save visited links to a txt file
    with open("visited_links.txt", "w") as f:
        for link in visited:
            f.write(link + "\n")
    return visited

# Example usage
get_all_links("https://forum.beagleboard.org/")

In [1]:
# filter_threads.py
# Filters and sorts discussion thread links from visited_links.txt
import re
from collections import defaultdict

input_file = 'visited_links.txt'
output_file = 'filtered_threads.txt'

thread_links = defaultdict(list)
thread_pattern = re.compile(r'(https?://forum\.beagleboard\.org/t/[^/]+/\d+)(?:/(\d+))?')

with open(input_file, 'r') as f:
    for line in f:
        line = line.strip()
        # Only keep /t/ links, skip /u/ links
        match = thread_pattern.match(line)
        if match:
            thread_base = match.group(1)
            reply_num = int(match.group(2)) if match.group(2) else 0
            thread_links[thread_base].append((reply_num, line))

# Sort threads and replies
sorted_links = []
for thread_base in sorted(thread_links.keys()):
    replies = sorted(thread_links[thread_base], key=lambda x: x[0])
    sorted_links.extend([link for _, link in replies])

with open(output_file, 'w') as f:
    for link in sorted_links:
        f.write(link + '\n')

print(f"Filtered and sorted links written to {output_file}")


Filtered and sorted links written to filtered_threads.txt


In [14]:
import json
import time
from bs4 import BeautifulSoup
import requests

def fetch_thread_content(thread_urls, delay=0.2):

    threads_content = {}
    for thread_url in thread_urls:
        # Get thread base URL (remove /<reply_num> if present)
        base_url = thread_url.rsplit('/', 1)[0] if thread_url.rsplit('/', 1)[-1].isdigit() else thread_url
        if base_url not in threads_content:
            threads_content[base_url] = []

        try:
            response = requests.get(thread_url, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            # Extract main post content (Discourse: <div class="cooked">)
            cooked = soup.find('div', class_='cooked')
            text = cooked.get_text(separator='\n', strip=True) if cooked else ''
            print(f"Fetched {thread_url} ({len(text)} characters)")
            threads_content[base_url].append(text)
        except Exception as e:
            print(f"Failed to fetch {thread_url}: {e}")
        time.sleep(delay)  # Be polite to the server

    # Join replies for each thread
    threads_content_joined = {url: '\n'.join(filter(None, replies)) for url, replies in threads_content.items()}

    # Save to JSON file
    with open('threads_content.json', 'w', encoding='utf-8') as jf:
        json.dump(threads_content_joined, jf, ensure_ascii=False, indent=2)

    print("Thread contents saved to threads_content.json")

# Example usage:
fetch_thread_content(sorted_links)

Fetched https://forum.beagleboard.org/t/0x400-arch1/40344 (0 characters)
Fetched https://forum.beagleboard.org/t/0x400-arch1/40344#post_3 (0 characters)
Fetched https://forum.beagleboard.org/t/0x400-arch1/40344/3 (0 characters)
Fetched https://forum.beagleboard.org/t/192-168-7-2-is-not-opening-or-reachable/2318/49 (0 characters)
Fetched https://forum.beagleboard.org/t/2020-04-06-how-to-add-hdmi-back-into-bbb-image-after-init-emmc-flasher-v3-sh/33134#post_3 (0 characters)
Fetched https://forum.beagleboard.org/t/2020-04-06-how-to-add-hdmi-back-into-bbb-image-after-init-emmc-flasher-v3-sh/33134 (0 characters)
Fetched https://forum.beagleboard.org/t/2020-04-06-how-to-add-hdmi-back-into-bbb-image-after-init-emmc-flasher-v3-sh/33134/3 (0 characters)
Fetched https://forum.beagleboard.org/t/2025-gsoc-repositories/42207 (0 characters)
Fetched https://forum.beagleboard.org/t/3-19-versus-4-0-dtb-rebuilder/22506 (0 characters)
Fetched https://forum.beagleboard.org/t/4-20-ma-measurement-with-beagle

KeyboardInterrupt: 

In [15]:
import random

# Example: Using a session, custom headers, and random delays
def fetch_thread_content_polite(thread_urls, min_delay=0.5, max_delay=2.0):
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    })
    threads_content = {}
    for thread_url in thread_urls:
        base_url = thread_url.rsplit('/', 1)[0] if thread_url.rsplit('/', 1)[-1].isdigit() else thread_url
        if base_url not in threads_content:
            threads_content[base_url] = []
        try:
            response = session.get(thread_url, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            cooked = soup.find('div', class_='cooked')
            text = cooked.get_text(separator='\n', strip=True) if cooked else ''
            print(f"Fetched {thread_url} ({len(text)} characters)")
            threads_content[base_url].append(text)
        except Exception as e:
            print(f"Failed to fetch {thread_url}: {e}")
        time.sleep(random.uniform(min_delay, max_delay))  # Random polite delay
    threads_content_joined = {url: '\n'.join(filter(None, replies)) for url, replies in threads_content.items()}
    with open('threads_content_polite.json', 'w', encoding='utf-8') as jf:
        json.dump(threads_content_joined, jf, ensure_ascii=False, indent=2)
    print("Thread contents saved to threads_content_polite.json")

# Example usage:
fetch_thread_content_polite(sorted_links)

Fetched https://forum.beagleboard.org/t/0x400-arch1/40344 (0 characters)
Fetched https://forum.beagleboard.org/t/0x400-arch1/40344#post_3 (0 characters)
Fetched https://forum.beagleboard.org/t/0x400-arch1/40344#post_3 (0 characters)
Fetched https://forum.beagleboard.org/t/0x400-arch1/40344/3 (0 characters)
Fetched https://forum.beagleboard.org/t/0x400-arch1/40344/3 (0 characters)
Fetched https://forum.beagleboard.org/t/192-168-7-2-is-not-opening-or-reachable/2318/49 (0 characters)
Fetched https://forum.beagleboard.org/t/192-168-7-2-is-not-opening-or-reachable/2318/49 (0 characters)
Fetched https://forum.beagleboard.org/t/2020-04-06-how-to-add-hdmi-back-into-bbb-image-after-init-emmc-flasher-v3-sh/33134#post_3 (0 characters)
Fetched https://forum.beagleboard.org/t/2020-04-06-how-to-add-hdmi-back-into-bbb-image-after-init-emmc-flasher-v3-sh/33134#post_3 (0 characters)
Fetched https://forum.beagleboard.org/t/2020-04-06-how-to-add-hdmi-back-into-bbb-image-after-init-emmc-flasher-v3-sh/3313

KeyboardInterrupt: 

In [16]:
# Debugging cell: Check what is being returned by the forum
if sorted_links:
    test_url = sorted_links[0]
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    })
    response = session.get(test_url, timeout=10)
    print(f"Status code: {response.status_code}")
    print("First 500 characters of response:")
    print(response.text[:500])
    soup = BeautifulSoup(response.text, 'html.parser')
    cooked = soup.find('div', class_='cooked')
    print(f"cooked: {cooked}")
else:
    print("sorted_links is empty!")

Status code: 200
First 500 characters of response:
<!DOCTYPE html>
<html lang="en" class="desktop-view not-mobile-device text-size-normal anon">
  <head>
    <meta charset="utf-8">
    <title>0x400(arch1) - General Discussion - BeagleBoard</title>
    <meta name="description" content="Hi all, 
my kernel version is : 
/opt/scripts/tools# ./version.sh 
git:/opt/scripts/:[674bb55e34e94e3837f4f55790c7d1a52c9e149f] 
eeprom:[A335BNLT00C02208SBB16124] 
model:[TI_AM335x_BeagleBone_Black] 
dogtag:[BeagleBoard.&amp;hellip;">
    <meta name="discourse_them
cooked: None


In [18]:
# Print all div classes and a larger HTML snippet to help identify the post content container
if sorted_links:
    test_url = sorted_links[0]
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    })
    response = session.get(test_url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    print("First 1000 characters of HTML:")
    print(response.text)
    print("\nAll div classes on the page:")
    divs = soup.find_all('div')
    classes = set()
    for div in divs:
        if 'class' in div.attrs:
            classes.update(div['class'])
    print(classes)
else:
    print("sorted_links is empty!")

First 1000 characters of HTML:
<!DOCTYPE html>
<html lang="en" class="desktop-view not-mobile-device text-size-normal anon">
  <head>
    <meta charset="utf-8">
    <title>0x400(arch1) - General Discussion - BeagleBoard</title>
    <meta name="description" content="Hi all, 
my kernel version is : 
/opt/scripts/tools# ./version.sh 
git:/opt/scripts/:[674bb55e34e94e3837f4f55790c7d1a52c9e149f] 
eeprom:[A335BNLT00C02208SBB16124] 
model:[TI_AM335x_BeagleBone_Black] 
dogtag:[BeagleBoard.&amp;hellip;">
    <meta name="discourse_theme_id" content="1">
    <meta name="discourse_current_homepage" content="categories">

    <meta name="generator" content="Discourse 3.4.1 - https://github.com/discourse/discourse version 909b93f40f92f44757493e67accf65e2181469bd">
<link rel="icon" type="image/png" href="https://forum.beagleboard.org/uploads/default/optimized/2X/f/f5301a304d5f6ecc82ed0f196c1eb3eec34108b6_2_32x32.png">
<link rel="apple-touch-icon" type="image/png" href="https://forum.beagleboard.org/u

In [13]:
import requests
from bs4 import BeautifulSoup, Comment
import re

if True:
    test_url = "https://forum.beagleboard.org/t/192-168-7-2-is-not-opening-or-reachable/2318"
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    })
    response = session.get(test_url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Remove script, style, and hidden elements
    for tag in soup(['script', 'style']):
        tag.decompose()
    for tag in soup.select('[style*="display:none"], [hidden], .hidden'):
        tag.decompose()

    # Detect usernames from typical Discourse attributes
    username_tags = soup.find_all(attrs={
        'data-user-card': True
    }) + soup.find_all(class_=['username', 'poster', 'creator'])

    usernames = set(tag.get_text(strip=True) for tag in username_tags if tag.get_text(strip=True))

    # Get visible text
    visible_text = soup.get_text(separator='\n', strip=True)

    # Highlight usernames robustly
    def highlight_usernames(line):
        for uname in sorted(usernames, key=len, reverse=True):
            line = re.sub(rf'(?<!\*)\b{re.escape(uname)}\b(?!\*)', f'**{uname}**', line)
        return line

    # Apply highlight to each line
    highlighted_lines = [highlight_usernames(line) for line in visible_text.split('\n') if line.strip()]
    print('\n'.join(highlighted_lines))

else:
    print("sorted_links is empty!")


192.168.7.2 is NOT opening or reachable - GoogleGroups - BeagleBoard
BeagleBoard
192.168.7.2 is NOT opening or reachable
GoogleGroups
**Pankaj_Rai**
June 26, 2020, 11:59am
1
HI,
I’m just getting started with the BBB and i’m trying to go to on board server 192.168.7.2 as mentioned in the Getting started doc.
But the problem is that 192.168.7.2 is not opening or not reachable. While I have interfaced the BBB with mini B cable to laptop(I’m using Ubuntu 18.0.04 in it) and also connected the BBB with ethernet cable to my laptop.
Please suggest the possible cause and how to debug and make it work.
thanks
Pankaj
**amf99**
June 26, 2020,  4:00pm
2
What Image are you using?
cat /etc/dogtag (provided it is a debian image)
or sudo /opt/scripts/tools/version.sh
or uname -a (might help)
How are you powering the board, 5v supply or usb? if usb, is it via a powered hub?
can you also provide the output of ifconfig from the beaglebone black?
Does Nautilus show BEAGLEBONE folder?
I’m running ‘
BeagleBo

In [14]:
import re

def structure_posts(lines, usernames):
    discussion = []
    post = None

    # Regex for date line like "September 6, 2023, 4:55pm"
    date_re = re.compile(r'^[A-Z][a-z]+ \d{1,2}, \d{4},\s+\d{1,2}:\d{2}(am|pm)$')

    for line in lines:
        # Detect new user line (highlighted: **username**)
        user_match = re.match(r'^\*\*([a-zA-Z0-9_-]+)\*\*$', line)
        if user_match:
            if post:
                discussion.append(post)
            post = {
                'user': user_match.group(1),
                'msg': ''
            }
        elif post:
            if date_re.match(line):
                post['time'] = line
            elif re.match(r'^\d+$', line):
                post['num'] = line
            else:
                post['msg'] += line + '\n'
        # If line before any user, skip it

    if post:
        discussion.append(post)

    return discussion

# Run on highlighted output from previous cell
discussion = structure_posts(highlighted_lines, usernames)

# Print it out cleanly
for p in discussion:
    user = p.get('user', '?')
    time = p.get('time', '')
    num = p.get('num', '')
    msg = p.get('msg', '').strip()
    print(f"**{user}** | **{time}** | **Post #{num}**")
    print(msg)
    print('-' * 60)


**Pankaj_Rai** | **June 26, 2020, 11:59am** | **Post #1**
HI,
I’m just getting started with the BBB and i’m trying to go to on board server 192.168.7.2 as mentioned in the Getting started doc.
But the problem is that 192.168.7.2 is not opening or not reachable. While I have interfaced the BBB with mini B cable to laptop(I’m using Ubuntu 18.0.04 in it) and also connected the BBB with ethernet cable to my laptop.
Please suggest the possible cause and how to debug and make it work.
thanks
Pankaj
------------------------------------------------------------
**amf99** | **June 26, 2020,  4:00pm** | **Post #2**
What Image are you using?
cat /etc/dogtag (provided it is a debian image)
or sudo /opt/scripts/tools/version.sh
or uname -a (might help)
How are you powering the board, 5v supply or usb? if usb, is it via a powered hub?
can you also provide the output of ifconfig from the beaglebone black?
Does Nautilus show BEAGLEBONE folder?
I’m running ‘
BeagleBoard.org
Debian Image 2019-08-03’ and 

In [4]:
import json
import re
import requests
from bs4 import BeautifulSoup
import time

def normalize_thread_url(url):
    url = url.split('#')[0]
    parts = url.rstrip('/').split('/')
    if parts[-1].isdigit() and int(parts[-1]) < 500:
        url = '/'.join(parts[:-1])
    return url

def get_thread_title(soup):
    title_tag = soup.find('title')
    if title_tag:
        return title_tag.text.split('-')[0].strip()
    return ""

def extract_posts(soup):
    posts = []
    for post_div in soup.find_all('div', class_='topic-body'):
        num = None
        pos_span = post_div.find('span', {'itemprop': 'position'})
        if pos_span and pos_span.text.strip().isdigit():
            num = int(pos_span.text.strip())
        user = None
        creator = post_div.find('span', class_='creator')
        if creator:
            user_span = creator.find('span', {'itemprop': 'name'})
            if user_span:
                user = user_span.text.strip()
        content_div = post_div.find('div', class_='post')
        content = content_div.get_text(separator='\n', strip=True) if content_div else ''
        posts.append({'num': num, 'user': user, 'content': content})
    return posts

def scrape_thread(thread_url, session=None, delay=0.5):
    if session is None:
        session = requests.Session()
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        })
    base_url = normalize_thread_url(thread_url)
    all_posts = []
    seen_nums = set()
    last_num = 0
    try:
        resp = session.get(base_url, timeout=10)
        resp.raise_for_status()
    except Exception as e:
        print(f"Failed to fetch {base_url}: {e}")
        return None
    soup = BeautifulSoup(resp.text, 'html.parser')
    thread_name = get_thread_title(soup)
    posts = extract_posts(soup)
    for post in posts:
        if post['num'] and post['num'] not in seen_nums:
            all_posts.append(post)
            seen_nums.add(post['num'])
            last_num = max(last_num, post['num'] or 0)
    time.sleep(delay)
    next_num = last_num + 1
    while True:
        next_url = f"{base_url}/{next_num}"
        try:
            resp = session.get(next_url, timeout=10)
            if resp.status_code != 200:
                break
        except Exception as e:
            print(f"Failed to fetch {next_url}: {e}")
            break
        soup = BeautifulSoup(resp.text, 'html.parser')
        posts = extract_posts(soup)
        found = False
        for post in posts:
            if post['num'] and post['num'] not in seen_nums:
                all_posts.append(post)
                seen_nums.add(post['num'])
                last_num = max(last_num, post['num'] or 0)
                found = True
        if not found:
            break
        next_num = last_num + 1
        time.sleep(delay)
    content = '\n\n'.join(
        f"Post #{p['num']} by {p['user']}:\n{p['content']}" for p in sorted(all_posts, key=lambda x: x['num'] or 0)
    )
    return {
        'url': base_url,
        'thread_name': thread_name,
        'content': content,
        'num_replies': len(all_posts)
    }

# Read links from filtered_threads.txt and scrape all threads
with open('filtered_threads.txt', 'r') as f:
    links = [line.strip() for line in f if line.strip()]
thread_urls = set(normalize_thread_url(url) for url in links)
results = []
for url in thread_urls:
    print(f"Scraping: {url}")
    data = scrape_thread(url)
    if data:
        results.append(data)
        print(f"Done: {data['thread_name']} ({data['num_replies']} replies)")
    else:
        print(f"Failed: {url}")
with open('scraped_threads_complete.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
print("All threads scraped and saved to scraped_threads_complete.json")

Scraping: https://forum.beagleboard.org/t/librobotcontrol-support-for-newer-boards/37187
Done: librobotcontrol support for newer boards (19 replies)
Scraping: https://forum.beagleboard.org/t/video-of-the-open-source-chip-design-panel/29256
Done: librobotcontrol support for newer boards (19 replies)
Scraping: https://forum.beagleboard.org/t/video-of-the-open-source-chip-design-panel/29256
Done: Video of the Open Source Chip Design panel (2 replies)
Scraping: https://forum.beagleboard.org/t/repo-access-and-re-build-scripts/42187
Done: Video of the Open Source Chip Design panel (2 replies)
Scraping: https://forum.beagleboard.org/t/repo-access-and-re-build-scripts/42187
Done: Repo access and re (2 replies)
Scraping: https://forum.beagleboard.org/t/recover-beagleplay/41533
Done: Repo access and re (2 replies)
Scraping: https://forum.beagleboard.org/t/recover-beagleplay/41533
Done: recover beagleplay (3 replies)
Scraping: https://forum.beagleboard.org/t/impossible-to-attach-plain-text/30431


In [5]:
import os

# Load previously scraped results
with open('scraped_threads_complete.json', 'r', encoding='utf-8') as f:
    scraped = json.load(f)
existing_urls = set(item['url'] for item in scraped if item and 'url' in item)

# Find all thread URLs from filtered_threads.txt
with open('filtered_threads.txt', 'r') as f:
    all_links = [line.strip() for line in f if line.strip()]
all_thread_urls = set(normalize_thread_url(url) for url in all_links)

# Find failed URLs (not in scraped or with 0 replies)
failed_urls = [url for url in all_thread_urls if url not in existing_urls]
# Also retry any with 0 replies
failed_urls += [item['url'] for item in scraped if item.get('num_replies', 1) == 0]
failed_urls = list(set(failed_urls))

print(f"Retrying {len(failed_urls)} failed threads...")
new_results = []
for url in failed_urls:
    print(f"Retrying: {url}")
    data = scrape_thread(url)
    if data and data['num_replies'] > 0:
        new_results.append(data)
        print(f"Success: {data['thread_name']} ({data['num_replies']} replies)")
    else:
        print(f"Still failed: {url}")

# Append new results and save
if new_results:
    all_results = scraped + new_results
    # Remove duplicates by url
    unique = {}
    for item in all_results:
        if item and 'url' in item:
            unique[item['url']] = item
    with open('scraped_threads_complete.json', 'w', encoding='utf-8') as f:
        json.dump(list(unique.values()), f, ensure_ascii=False, indent=2)
    print(f"Added {len(new_results)} new threads to scraped_threads_complete.json")
else:
    print("No new threads were successfully scraped.")


Retrying 141 failed threads...
Retrying: https://forum.beagleboard.org/t/debian-faq/30743
Success: Debian FAQ (1 replies)
Retrying: https://forum.beagleboard.org/t/problems-to-do-login-in-openbeagle-org/39651
Success: Debian FAQ (1 replies)
Retrying: https://forum.beagleboard.org/t/problems-to-do-login-in-openbeagle-org/39651
Success: Problems to do login in openbeagle.org (6 replies)
Retrying: https://forum.beagleboard.org/t/how-to-control-fine-pwm-on-ai64/35539
Success: Problems to do login in openbeagle.org (6 replies)
Retrying: https://forum.beagleboard.org/t/how-to-control-fine-pwm-on-ai64/35539
Success: How to control fine PWM on AI64? (44 replies)
Retrying: https://forum.beagleboard.org/t/i2c-bus-speed-beaglev-fire/41101
Success: How to control fine PWM on AI64? (44 replies)
Retrying: https://forum.beagleboard.org/t/i2c-bus-speed-beaglev-fire/41101
Success: I2C bus speed BeagleV fire (12 replies)
Retrying: https://forum.beagleboard.org/t/change-direction-of-beagle-y-ai-gpio-with