In [1]:
# Importing libraries
from youtube_transcript_api import YouTubeTranscriptApi
from pymongo import MongoClient
import requests
from clearml import Task, PipelineController, PipelineDecorator
import re

In [None]:
# Connecting to MongoDB

# MongoDB Configuration
MONGO_URI = "mongodb://mongoadmin:secret@localhost:27017/"
DB_NAME = "media_data_final"
COLLECTION_NAME = "raw_data_final"

# Connect to MongoDB
mongo_client = MongoClient(MONGO_URI)
db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

In [None]:
# ClearML Task Setup
Task.init(project_name="ETL Pipeline Main", task_name="Media Ingestion Pipeline")


ClearML Task: created new task id=1d7a980f3b39412588cde1f89785e287
2024-12-07 19:39:04,918 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/56f82e0a1a2d4f18a9bd3d2f25c991b8/experiments/1d7a980f3b39412588cde1f89785e287/output/log


<clearml.task.Task at 0x7fef75bbab90>

ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring


In [12]:
def clean_text(text):
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    return text.strip().lower()

In [4]:
def fetch_youtube_transcripts(url):
    video_id = url.split("v=")[1]
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    return transcript

In [18]:
def fetch_youtube_data():
    with open('/home/zeudon/AI_Project/ros2-rag/app/youtube_sources.txt', 'r') as file:
        # Read all lines, strip newline characters, and save to a list
        youtube_links = list(set(line.strip() for line in file))
    print(len(youtube_links))
    
    for url in youtube_links:
        transcript = fetch_youtube_transcripts(url)
        document = {
            "source": "youtube",
            "url": url,
            "video_id": url.split("v=")[1],
            "data": transcript
        }
        collection.insert_one(document)
        print("Data inserted into db for Youtube URL:", url)
    
    print("Saved complete YouTube Data into the DB")
        

In [14]:
fetch_youtube_data()

11
Saved complete YouTube Data into the DB


In [6]:
def clean_rst_content(text):
    # Remove directives (e.g., .. redirect-from::, .. code-block::)
    text = re.sub(r'\.\. .*::.*', '', text)
    
    # Remove list-table and code-block markers
    text = re.sub(r'\.\. (list-table|code-block)::.*', '', text)
    
    # Remove table headers and structure
    text = re.sub(r'\n\s+\* - .*', '', text)
    
    # Convert headings (e.g., ===, ---) to plain text
    text = re.sub(r'\n[=~-]{2,}\n', '\n', text)
    
    # Remove HTML-like tags (e.g., `link <url>`__)
    text = re.sub(r'`[^`]*?`__', '', text)
    
    # Remove backticks used for inline code
    text = re.sub(r'`([^`]*?)`', r'\1', text)
    
    # Remove extra newlines and trim spaces
    text = re.sub(r'\n{2,}', '\n', text).strip()
    
    # Remove leading/trailing spaces around lines
    text = "\n".join(line.strip() for line in text.splitlines())
    
    return text

In [7]:
def extract_and_clean(data):
    # Extract the 'content' field
    raw_content = data.get('content', '')
    
    # Clean the raw content
    cleaned_content = clean_rst_content(raw_content)
    
    return {'path': data.get('path', ''), 'content': cleaned_content}

In [8]:
def scrape_github_repo_to_text(repo_url, access_token=None, allowed_extensions=None):
    # Extract owner and repository name from URL
    repo_name = repo_url.rstrip('/').split('/')[-1]
    owner_name = repo_url.rstrip('/').split('/')[-2]
    
    if allowed_extensions is None:
        allowed_extensions = [".txt", ".rst", ".md"]
        
    headers = {'Authorization': f'token {access_token}'} if access_token else {}
    
    # GitHub API URLs
    repo_api_url = f"https://api.github.com/repos/{owner_name}/{repo_name}"
    contents_api_url = f"{repo_api_url}/contents"
    
    # Get repository metadata
    repo_metadata = requests.get(repo_api_url, headers=headers).json()
    if 'message' in repo_metadata and 'Not Found' in repo_metadata['message']:
        print("Repository not found. Check the URL or your access token.")
        return
    
    print("Repository Metadata:")
    print("Repo metadata:", repo_metadata)
    
    # Recursive function to get all files and their content
    all_files = []
    file_names = []
    def fetch_contents(api_url, path=""):
        contents = requests.get(api_url, headers=headers).json()
        for item in contents:
                if isinstance(item, dict) and 'type' in item:
                    if item['type'] == 'file':
                        if any(item['path'].endswith(ext) for ext in allowed_extensions):
                            file_names.append(item['path'])
                            file_content = requests.get(item['download_url'], headers=headers).text
                            all_files.append({
                                "path": f"{path}/{item['name']}",
                                "content": file_content
                            })
                    elif item['type'] == 'dir':
                        fetch_contents(item['_links']['self'], path=f"{path}/{item['name']}")
                else:
                    print("Item not dict:", item)

    fetch_contents(contents_api_url)

    print(f"Fetched {len(all_files)} files from the repository.")
    return all_files

In [19]:
def fetch_github_data():
    access_token = "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"
    with open('/home/zeudon/AI_Project/ros2-rag/app/github_sources.txt', 'r') as file:
        # Read all lines, strip newline characters, and save to a list
        github_links = list(set(line.strip() for line in file))
    print(len(github_links))
    print(github_links)
    for link in github_links:
        repo_data = scrape_github_repo_to_text(link, access_token)
        for file in repo_data:
            cleaned_data = extract_and_clean(file)
            document = {
            "source": "github",
            "url": link,
            "path": cleaned_data.get('path', ''),
            "data": cleaned_data.get('content','')
            }
            collection.insert_one(document)
        print("Saved data into mongodb for repository:", link)
        
        
    print("Saved complete GitHub Data into the DB, number of repositories persisted:", len(github_links))

In [30]:
def query_ingested_urls():
    unique_urls = list(collection.distinct("url"))
    print("Number of ingested URLS:", len(unique_urls))
    print("Ingested URLs:")
    for entry in unique_urls:
        print(entry)

In [21]:
fetch_youtube_data()


11
Data inserted into db for Youtube URL: https://www.youtube.com/watch?v=3HuV1M1NMB8
Data inserted into db for Youtube URL: https://www.youtube.com/watch?v=jkoGkAd0GYk
Data inserted into db for Youtube URL: https://www.youtube.com/watch?v=laWn7_cj434
Data inserted into db for Youtube URL: https://www.youtube.com/watch?v=Lm1ediRG5JA
Data inserted into db for Youtube URL: https://www.youtube.com/watch?v=8QfI5a7lTKU
Data inserted into db for Youtube URL: https://www.youtube.com/watch?v=_qQAfTmB5wc
Data inserted into db for Youtube URL: https://www.youtube.com/watch?v=96XsJ7xfsS8
Data inserted into db for Youtube URL: https://www.youtube.com/watch?v=dxcU-_PGZdw
Data inserted into db for Youtube URL: https://www.youtube.com/watch?v=ERhXoIn7kr4
Data inserted into db for Youtube URL: https://www.youtube.com/watch?v=ntJkRO_Z41I
Data inserted into db for Youtube URL: https://www.youtube.com/watch?v=bp7MAZh4lJA
Saved complete YouTube Data into the DB


In [22]:
fetch_github_data()


7
['https://github.com/ros-navigation/docs.nav2.org', 'https://github.com/gazebosim/docs', 'https://github.com/gazebosim/ros_gz', 'https://github.com/ros-navigation/navigation2', 'https://github.com/ros2/ros2_documentation', 'https://github.com/moveit/moveit_tutorials', 'https://github.com/moveit/moveit2']
Repository Metadata:
Repo metadata: {'id': 240847415, 'node_id': 'MDEwOlJlcG9zaXRvcnkyNDA4NDc0MTU=', 'name': 'docs.nav2.org', 'full_name': 'ros-navigation/docs.nav2.org', 'private': False, 'owner': {'login': 'ros-navigation', 'id': 150733807, 'node_id': 'O_kgDOCPwD7w', 'avatar_url': 'https://avatars.githubusercontent.com/u/150733807?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/ros-navigation', 'html_url': 'https://github.com/ros-navigation', 'followers_url': 'https://api.github.com/users/ros-navigation/followers', 'following_url': 'https://api.github.com/users/ros-navigation/following{/other_user}', 'gists_url': 'https://api.github.com/users/ros-navigation/gists{/gis

In [32]:

query_ingested_urls()

Number of ingested URLS: 18
Ingested URLs:
https://github.com/gazebosim/docs
https://github.com/gazebosim/ros_gz
https://github.com/moveit/moveit2
https://github.com/moveit/moveit_tutorials
https://github.com/ros-navigation/docs.nav2.org
https://github.com/ros-navigation/navigation2
https://github.com/ros2/ros2_documentation
https://www.youtube.com/watch?v=3HuV1M1NMB8
https://www.youtube.com/watch?v=8QfI5a7lTKU
https://www.youtube.com/watch?v=96XsJ7xfsS8
https://www.youtube.com/watch?v=ERhXoIn7kr4
https://www.youtube.com/watch?v=Lm1ediRG5JA
https://www.youtube.com/watch?v=_qQAfTmB5wc
https://www.youtube.com/watch?v=bp7MAZh4lJA
https://www.youtube.com/watch?v=dxcU-_PGZdw
https://www.youtube.com/watch?v=jkoGkAd0GYk
https://www.youtube.com/watch?v=laWn7_cj434
https://www.youtube.com/watch?v=ntJkRO_Z41I


In [None]:
target_url = "https://github.com/gazebosim/docs"  # Replace with the specific URL

# Query the collection for documents with the given URL and content, limit to 2
matching_documents = collection.find({"url": target_url}).limit(2)
for doc in matching_documents:
    print(f"Data: {doc.get('data')}, URL: {doc.get('url')}")

Data: # Gazebo Documentation
This repository contains documentation about [Gazebo](https://gazebosim.org) that does not pertain to a specific
[Gazebo library](https://gazebosim.org/libs). An example would be
installation instructions for an Gazebo release. The documentation
contained in this repository can be view at
[https://gazebosim.org/docs](https://gazebosim.org/docs).
Each [Gazebo library](https://gazebosim.org/libs) maintains
documentation and tutorials that are scoped to the features and
capabilities of the library itself. The documentation for a library can be
found under the API Reference section of [https://gazebosim.org/docs](https://gazebosim.org/docs).
## Updating gazebosim.org
## Main docs
The documentation in this repository is built using [Sphinx](https://www.sphinx-doc.org/).
To build, you need to install the following:
* python virtualenv
Create the virtual env and activate it:
bash
python3 -m venv .venv
source .venv/bin/activate
Then install the necessary dependenci

In [20]:
# ClearML Pipeline Controller
pipeline = PipelineController(
    project="RAG System ETL Pipeline",
    name="ROS2 Media ETL",
    version="1.0",
)

# Add pipeline steps

pipeline.add_function_step(
    name="Fetch YouTube Data",
    function=fetch_youtube_data,
    execution_queue="default"
)

pipeline.add_function_step(
    name="Fetch GitHub Data",
    function=fetch_github_data,
    parents=["Fetch YouTube Data"],
    execution_queue="default"
)

pipeline.add_function_step(
    name="Query Ingested URLs",
    function=query_ingested_urls,
    parents=["Fetch GitHub Data"],
    execution_queue="default"
)
    
if __name__ == "__main__":
    pipeline.start_locally()


ClearML Task: created new task id=98d272ca84e643879d817cba382b7c35
ClearML results page: https://app.clear.ml/projects/f788cb6894e74f739e1ce1f74898c8e9/experiments/98d272ca84e643879d817cba382b7c35/output/log


Could not fetch function declared in __main__: <module '__main__'> is a built-in module
Could not fetch function imports: <module '__main__'> is a built-in module


ClearML pipeline page: https://app.clear.ml/pipelines/f788cb6894e74f739e1ce1f74898c8e9/experiments/98d272ca84e643879d817cba382b7c35


Could not fetch function declared in __main__: <module '__main__'> is a built-in module
Could not fetch function imports: <module '__main__'> is a built-in module
Could not fetch function declared in __main__: <module '__main__'> is a built-in module
Could not fetch function imports: <module '__main__'> is a built-in module


Launching the next 1 steps
Launching step [Fetch YouTube Data]
Launching step: Fetch YouTube Data
Parameters:
None
Configurations:
{}
Overrides:
{}
Launching the next 0 steps
Setting pipeline controller Task as failed (due to failed steps) !


In [None]:
# Part of featurization pipeline for youtube data

# sentences = [i['text'] for i in result['data']]
# print(sentences)
# sentences_in_chunk = 6
# chunks = []
# for i in range(0, len(sentences), sentences_in_chunk):
#     chunks.append(" ".join(sentences[i:i + sentences_in_chunk]))
# print(chunks)

