In [None]:
from youtube_transcript_api import YouTubeTranscriptApi

def get_youtube_transcript(video_url: str) -> list:
    """Fetch the transcript of a YouTube video identified by its URL.

    Supported URL shapes:
      * ``https://www.youtube.com/watch?v=<id>`` (additional query
        parameters such as ``&t=10s`` are ignored)
      * ``https://youtu.be/<id>`` short links
      * ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>`` and ``/v/<id>`` paths

    Args:
        video_url: URL of the video.

    Returns:
        The value returned by ``YouTubeTranscriptApi.get_transcript`` for the
        extracted video id.

    Raises:
        ValueError: if no video id can be extracted from ``video_url``.
    """
    # Imported locally so the function does not rely on another cell's imports.
    from urllib.parse import parse_qs, urlparse

    raw_url = video_url.strip()
    # urlparse only recognises the host when a scheme is present.
    parsed = urlparse(raw_url if "://" in raw_url else "https://" + raw_url)
    host = (parsed.hostname or "").lower()
    path_parts = [part for part in parsed.path.split("/") if part]

    video_id = None
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        # Regular watch URL: the id is the value of the "v" query parameter.
        video_id = query_ids[0]
    elif host == "youtu.be" or host.endswith(".youtu.be"):
        # Short link: the id is the first path segment.
        if path_parts:
            video_id = path_parts[0]
    elif len(path_parts) >= 2 and path_parts[0] in ("embed", "shorts", "live", "v"):
        video_id = path_parts[1]
    elif "v=" in raw_url:
        # Fallback for inputs that are not well-formed URLs but still carry
        # a "v=<id>" fragment, trimmed at the next parameter or anchor.
        tail = raw_url.split("v=", 1)[1]
        video_id = tail.split("&", 1)[0].split("#", 1)[0]

    if not video_id:
        raise ValueError(f"Could not extract a YouTube video id from: {video_url!r}")

    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    return transcript


def youtube_scraper():
    """Define the list of YouTube video URLs known to this notebook.

    NOTE(review): the list is only assigned to a local name; it is neither
    used nor returned, so calling this function has no observable effect
    and the result is ``None``. Confirm what the function is meant to do
    with these URLs.
    """
    # One short link (youtu.be) followed by regular watch URLs.
    transcript_sources = [
        "https://youtu.be/8QfI5a7lTKU?si=WA-Io93sb1dRh6ju",
        "https://www.youtube.com/watch?v=96XsJ7xfsS8",
        "https://www.youtube.com/watch?v=ERhXoIn7kr4",
        "https://www.youtube.com/watch?v=dxcU-_PGZdw",
        "https://www.youtube.com/watch?v=3HuV1M1NMB8",
        "https://www.youtube.com/watch?v=Lm1ediRG5JA",
        "https://www.youtube.com/watch?v=jkoGkAd0GYk",
        "https://www.youtube.com/watch?v=ntJkRO_Z41I",
        "https://www.youtube.com/watch?v=bp7MAZh4lJA",
        "https://www.youtube.com/watch?v=laWn7_cj434",
        "https://www.youtube.com/watch?v=_qQAfTmB5wc",
    ]
    return None

In [None]:
import requests
from bs4 import BeautifulSoup

# def get_top_answer(question_url):
#     response = requests.get(question_url)
#     if response.status_code != 200:
#         print(f"Failed to fetch question page: {question_url}")
#         return "No answer found."

#     soup = BeautifulSoup(response.text, 'html.parser')
#     answer_elem = soup.find('div', class_='answer')
    
#     if not answer_elem:
#         return "No answers available."
    
#     # Extract the answer with the most votes
#     votes = answer_elem.find('span', class_='js-vote-count')
#     answer_body = answer_elem.find('div', class_='s-prose')
    
#     if votes and answer_body:
#         return {
#             'votes': votes.text.strip(),
#             'content': answer_body.text.strip()
#         }
#     return "No valid answer found."

def scrape_robotics_stackexchange(tag='ros', sort_category='Votes', num_pages=5, timeout=10):
    """Scrape question summaries for one tag from robotics.stackexchange.com.

    Listing pages are requested from
    ``https://robotics.stackexchange.com/questions/tagged/<tag>?tab=<sort_category>&page=<n>``
    for ``n`` in ``1..num_pages``. Pages that cannot be fetched (network
    error or non-200 status) are reported with ``print`` and skipped.

    Args:
        tag: Tag whose question listing is scraped.
        sort_category: Value of the ``tab`` query parameter (e.g. ``'Votes'``).
        num_pages: Number of listing pages to request, starting at page 1.
        timeout: Timeout in seconds passed to ``requests.get`` so a stalled
            connection cannot hang the notebook.

    Returns:
        A list of dicts with the keys ``tag``, ``title``, ``link``, ``votes``
        (text of the first stats number, ``"0"`` when that element is
        missing) and ``summary`` (``"No summary available."`` when the
        excerpt element is missing).
    """
    # Imported locally so the function does not rely on another cell's imports.
    from urllib.parse import urljoin

    site_root = "https://robotics.stackexchange.com"
    # https://robotics.stackexchange.com/questions/tagged/ros?tab=Votes
    base_url = f"{site_root}/questions/tagged/{tag}"
    questions_data = []

    for page in range(1, num_pages + 1):
        url = f"{base_url}?tab={sort_category}&page={page}"
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # A connection error on one page should not abort the whole scrape.
            print(f"Failed to fetch page {page}: {exc}")
            continue
        if response.status_code != 200:
            print(f"Failed to fetch page {page}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        for question in soup.find_all('div', class_='s-post-summary'):
            title_elem = question.find('a', class_='s-link')
            if title_elem is None or not title_elem.get('href'):
                # Without a title link there is nothing useful to record.
                continue

            votes_elem = question.find('span', class_='s-post-summary--stats-item-number')
            summary_elem = question.find('div', class_='s-post-summary--content-excerpt')

            questions_data.append({
                'tag': tag,
                'title': title_elem.text.strip(),
                # urljoin handles both site-relative and absolute hrefs.
                'link': urljoin(site_root, title_elem['href']),
                'votes': votes_elem.text.strip() if votes_elem else "0",
                'summary': summary_elem.text.strip() if summary_elem else "No summary available.",
            })

    return questions_data

# Usage: gather the question summaries of every tag into one flat list.
tags = ['ros', 'gazebo', 'nav2', 'moveit']
robotics_stackexchange_questions = []
for tag in tags:
    # Each call returns a list of question dicts for that tag; extend()
    # appends them in place, in tag order.
    robotics_stackexchange_questions.extend(scrape_robotics_stackexchange(tag=tag))


{'title': 'Can ROS run on a Raspberry Pi?', 'link': 'https://robotics.stackexchange.com/questions/230/can-ros-run-on-a-raspberry-pi', 'votes': '32', 'summary': 'Can ROS run on a Raspberry Pi?\n\nROS is resigned to run on a network of machines, with different machines, even different cores on the same machine doing different jobs. Can one of those machines be a ...'}
{'title': 'ROS: catkin_make vs. catkin build', 'link': 'https://robotics.stackexchange.com/questions/16604/ros-catkin-make-vs-catkin-build', 'votes': '23', 'summary': "When using ROS:\n\nWhy should I use catkin build from catkin_tools instead of catkin_make? \n\nI've been told that ..."}
{'title': 'ROS: Best practices?', 'link': 'https://robotics.stackexchange.com/questions/3110/ros-best-practices', 'votes': '19', 'summary': "I'm going to build a small robot system, and it seems like that ROS serves a nice framework to control and program the system.\n\nHowever, I am wondering which is the best practice to manage the ..."}


In [19]:
# Number of scraped question summaries. The list built by the scraping cell
# is named `robotics_stackexchange_questions`; `robotics_questions` is not
# defined anywhere in this notebook and fails on a fresh kernel.
len(robotics_stackexchange_questions)

250

In [2]:
from github import Github

def fetch_github_issues_with_answers(repo_name, token, max_issues=10):
    """Fetch open issues of a GitHub repository together with a top comment.

    Args:
        repo_name: Repository in ``owner/name`` form.
        token: GitHub personal access token used to authenticate.
        max_issues: Maximum number of issues to return.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``created_at``,
        ``labels`` (list of label names), ``body`` (first 200 characters,
        ``""`` when the issue has no body) and ``top_answer`` (the value
        returned by ``get_top_answer`` for the issue).

    Note:
        The recorded output of this notebook shows ``/pull/`` URLs among the
        results, i.e. the listing is not restricted to plain issues.
    """
    g = Github(token)  # Authenticate using your personal access token
    repo = g.get_repo(repo_name)
    issues = repo.get_issues(state='open')

    issues_data = []

    for issue in issues[:max_issues]:  # Limit the number of issues to fetch
        top_comment = get_top_answer(issue)
        issues_data.append({
            'title': issue.title,
            'url': issue.html_url,
            'created_at': issue.created_at,
            'labels': [label.name for label in issue.labels],
            # An issue opened without a description has body None; slicing
            # None would raise TypeError, so fall back to an empty string.
            'body': (issue.body or "")[:200],  # First 200 characters of the body
            'top_answer': top_comment,
        })

    return issues_data


def get_top_answer(issue):
    """Pick the most relevant comment of an issue.

    Precedence:
      1. The first comment written by the issue's own author, if any.
      2. Otherwise the comment with the most '+1' reactions (the earliest
         one wins on ties).

    Args:
        issue: Object exposing ``user.login`` and ``get_comments()``; each
            comment is expected to expose ``user``, ``body`` and a
            ``reactions`` mapping.

    Returns:
        ``"No comments available."`` when the issue has no comments;
        otherwise a dict with ``author_response`` (bool), ``content`` (first
        200 characters of the comment body, ``""`` when the body is empty)
        and ``upvotes`` ('+1' reaction count). ``"No valid comments found."``
        is returned if iteration yields no comment at all.
    """
    comments = issue.get_comments()
    if not comments.totalCount:
        return "No comments available."

    issue_author = issue.user.login

    # Track the top-voted comment while scanning for an author response,
    # so the comments are traversed only once.
    top_comment = None
    max_upvotes = -1

    for comment in comments:
        # '+1' reaction is the equivalent of upvotes; treat a missing
        # reactions mapping or key as zero votes instead of crashing.
        upvotes = (comment.reactions or {}).get('+1', 0)

        # Author's comments take precedence over any vote count.
        if comment.user is not None and comment.user.login == issue_author:
            return {
                'author_response': True,
                'content': (comment.body or "")[:200],  # First 200 characters of the response
                'upvotes': upvotes,
            }

        if upvotes > max_upvotes:
            max_upvotes = upvotes
            top_comment = comment

    # Return the most-upvoted comment if no author response
    if top_comment is not None:
        return {
            'author_response': False,
            'content': (top_comment.body or "")[:200],  # First 200 characters of the comment
            'upvotes': max_upvotes,
        }

    return "No valid comments found."


# Usage
# The GitHub Personal Access Token is read from the GITHUB_TOKEN environment
# variable (with an interactive prompt as fallback) so that it is never stored
# in the notebook. A token was previously hard-coded in this cell: treat that
# token as leaked and revoke it in the GitHub settings.
# Replace 'ros-planning/navigation2' with the desired repository.
import getpass
import os

# GitHub redirects this repository name (see the redirection notice in the
# cell output; the resulting issue URLs live under ros-navigation/navigation2).
repo_name = 'ros-planning/navigation2'
token = os.environ.get('GITHUB_TOKEN') or getpass.getpass('GitHub personal access token: ')
github_issues_with_answers = fetch_github_issues_with_answers(repo_name, token)

for issue in github_issues_with_answers[:5]:  # Print first 5 issues and their top answers
    print(f"Issue Title: {issue['title']}")
    print(f"Issue URL: {issue['url']}")
    print(f"Created At: {issue['created_at']}")
    print(f"Labels: {issue['labels']}")
    print(f"Description: {issue['body']}")
    print(f"Top Answer: {issue['top_answer']}")
    print()

Following Github server redirection from /repos/ros-planning/navigation2 to /repositories/135363400


Issue Title: new param to disable collision checking in DriveOnHeading and BackUp actions
Issue URL: https://github.com/ros-navigation/navigation2/pull/4785
Created At: 2024-12-05 21:17:30+00:00
Labels: []
Description: <!-- Please fill out the following pull request template for non-trivial changes to help us process your PR faster and more efficiently.-->

---

## Basic Info

| Info | Please fill out this col
Top Answer: {'author_response': True, 'content': "> Otherwise generally looks good to me! The BT node config guide pages need updating with the new port, migration guide to highlight this change\r\n\r\nI'm not yet up to speed on the release process. Woul", 'upvotes': 0}

Issue Title: Fix goal updater for CI (backport #4558)
Issue URL: https://github.com/ros-navigation/navigation2/pull/4784
Created At: 2024-12-04 12:37:25+00:00
Labels: []
Description: <hr>This is an automatic backport of pull request #4558 done by [Mergify](https://mergify.com).
Top Answer: {'author_response': Tru

In [3]:
# Display the list of issue records assigned from
# fetch_github_issues_with_answers(...) in the usage cell.
github_issues_with_answers

[{'title': 'new param to disable collision checking in DriveOnHeading and BackUp actions',
  'url': 'https://github.com/ros-navigation/navigation2/pull/4785',
  'created_at': datetime.datetime(2024, 12, 5, 21, 17, 30, tzinfo=datetime.timezone.utc),
  'labels': [],
  'body': '<!-- Please fill out the following pull request template for non-trivial changes to help us process your PR faster and more efficiently.-->\r\n\r\n---\r\n\r\n## Basic Info\r\n\r\n| Info | Please fill out this col',
  'top_answer': {'author_response': True,
   'content': "> Otherwise generally looks good to me! The BT node config guide pages need updating with the new port, migration guide to highlight this change\r\n\r\nI'm not yet up to speed on the release process. Woul",
   'upvotes': 0}},
 {'title': 'Fix goal updater for CI (backport #4558)',
  'url': 'https://github.com/ros-navigation/navigation2/pull/4784',
  'created_at': datetime.datetime(2024, 12, 4, 12, 37, 25, tzinfo=datetime.timezone.utc),
  'labels': [

In [16]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def scrape_ros_docs(base_url, max_pages=10, timeout=10):
    """Scrape the pages linked from the navigation menu of a docs site.

    The navigation menu (``<div class="wy-menu wy-menu-vertical">``) of
    ``base_url`` is read once; every linked page is then fetched and its
    title and main content text are extracted.

    Args:
        base_url: Page whose navigation menu lists the pages to scrape.
        max_pages: Maximum number of linked pages to fetch. ``None`` or a
            negative value means "no limit"; ``0`` fetches nothing.
        timeout: Timeout in seconds passed to every ``requests.get`` call.

    Returns:
        A list of ``{"url", "title", "content"}`` dicts, one per page that
        was fetched successfully, in navigation order. ``title`` falls back
        to ``"No title"`` and ``content`` to ``"No content available."`` when
        the corresponding element is missing.
    """
    # Imported locally so the function does not rely on another cell's imports.
    from urllib.parse import urldefrag, urljoin

    def fetch_soup(url):
        """Download ``url`` and parse it; return None (after printing) on failure."""
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to fetch {url}: {exc}")
            return None
        if response.status_code != 200:
            print(f"Failed to fetch {url}")
            return None
        return BeautifulSoup(response.text, 'html.parser')

    def fetch_navigation_links(url):
        """Fetch links from the navigation menu."""
        print(f"Fetching navigation links from: {url}")
        soup = fetch_soup(url)
        if soup is None:
            return []

        # Find the navigation menu links
        navigation_div = soup.find('div', class_='wy-menu wy-menu-vertical')
        if not navigation_div:
            print("Navigation menu not found.")
            return []

        links = []
        seen = set()
        for a in navigation_div.find_all('a', href=True):
            href = a['href']
            if not href or href.startswith('#'):  # Exclude fragment identifiers
                continue
            # "page.html#section" points at the same document as "page.html";
            # drop the fragment so each page is fetched only once.
            absolute, _fragment = urldefrag(urljoin(url, href))
            if absolute not in seen:
                seen.add(absolute)
                links.append(absolute)
        return links

    def fetch_page_content(url):
        """Fetch and parse the content of a single page."""
        print(f"Fetching page content from: {url}")
        soup = fetch_soup(url)
        if soup is None:
            return None

        title_tag = soup.find('title')
        title = title_tag.text.strip() if title_tag is not None else "No title"

        # Extract main content from <div role="main">
        content_div = soup.find('div', {'role': 'main'})
        if content_div:
            # Extract all text while stripping HTML tags
            content_text = content_div.get_text(separator="\n", strip=True)
        else:
            content_text = "No content available."

        return {"url": url, "title": title, "content": content_text}

    # Start by fetching navigation links from the base URL
    to_visit = fetch_navigation_links(base_url)

    # A negative bound used directly in a slice would silently drop trailing
    # links (e.g. [:-1] skips the last page), so only slice for real limits.
    if max_pages is not None and max_pages >= 0:
        to_visit = to_visit[:max_pages]

    scraped_data = []
    for url in to_visit:
        page_data = fetch_page_content(url)
        if page_data:
            scraped_data.append(page_data)

    return scraped_data


# Usage
base_url = "https://docs.ros.org/en/galactic/index.html"
# Pass None to scrape every linked page. A negative value such as -1 is risky:
# used as a slice bound it excludes the last navigation link.
scraped_docs = scrape_ros_docs(base_url, max_pages=None)

# Print scraped data
for doc in scraped_docs:
    print(f"Title: {doc['title']}")
    print(f"URL: {doc['url']}")
    print(f"Content Snippet: {doc['content'][:1000]}...")  # Print first 1000 characters of content text
    print()


Fetching navigation links from: https://docs.ros.org/en/galactic/index.html
Fetching page content from: https://docs.ros.org/en/galactic/Installation.html
Fetching page content from: https://docs.ros.org/en/galactic/Installation/Ubuntu-Install-Debians.html
Fetching page content from: https://docs.ros.org/en/galactic/Installation/Windows-Install-Binary.html
Fetching page content from: https://docs.ros.org/en/galactic/Installation/RHEL-Install-RPMs.html
Fetching page content from: https://docs.ros.org/en/galactic/Installation/Alternatives.html
Fetching page content from: https://docs.ros.org/en/galactic/Installation/Alternatives/Ubuntu-Development-Setup.html
Fetching page content from: https://docs.ros.org/en/galactic/Installation/Alternatives/Ubuntu-Install-Binary.html
Fetching page content from: https://docs.ros.org/en/galactic/Installation/Alternatives/Windows-Development-Setup.html
Fetching page content from: https://docs.ros.org/en/galactic/Installation/Alternatives/RHEL-Development

In [17]:
# Display the list of page records returned by scrape_ros_docs(...) in the
# usage cell (each record has 'url', 'title' and 'content' keys).
scraped_docs

[{'url': 'https://docs.ros.org/en/galactic/Installation.html',
  'title': 'Installation — ROS 2 Documentation: Galactic  documentation',
 {'url': 'https://docs.ros.org/en/galactic/Installation/Ubuntu-Install-Debians.html',
  'title': 'Ubuntu (Debian) — ROS 2 Documentation: Galactic  documentation',
 {'url': 'https://docs.ros.org/en/galactic/Installation/Windows-Install-Binary.html',
  'title': 'Windows (binary) — ROS 2 Documentation: Galactic  documentation',
 {'url': 'https://docs.ros.org/en/galactic/Installation/RHEL-Install-RPMs.html',
  'title': 'RHEL (RPM) — ROS 2 Documentation: Galactic  documentation',
 {'url': 'https://docs.ros.org/en/galactic/Installation/Alternatives.html',
  'title': 'Alternatives — ROS 2 Documentation: Galactic  documentation',
 {'url': 'https://docs.ros.org/en/galactic/Installation/Alternatives/Ubuntu-Development-Setup.html',
  'title': 'Ubuntu (source) — ROS 2 Documentation: Galactic  documentation',
 {'url': 'https://docs.ros.org/en/galactic/Installation