In [5]:
from zenml.steps import step

## Medium - ETL



In [6]:
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
import re
from clearml import Task
from clearml import Logger

# MongoDB connection parameters
MONGO_URI = "mongodb://localhost:27017/"
DATABASE_NAME = "final_data_project"
COLLECTION_NAME = "medium"

def extract(link: str) -> str:
    """Extract the HTML content from the Medium link."""
    try:
        response = requests.get(link)
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching {link}: {e}")
        return None

def clean_text(text):
    """Clean and normalize the text."""
    # Remove multiple spaces, tabs, and newlines
    text = re.sub(r'\s+', ' ', text)
    # Remove non-printable characters
    text = re.sub(r'[^\x20-\x7E]', '', text)
    # Trim leading and trailing spaces
    return text.strip()

def transform(html_content: str, link: str) -> dict:
    """Parse the HTML and extract metadata and cleaned content."""
    soup = BeautifulSoup(html_content, "html.parser")
    
    # Extract metadata
    title = soup.find("title").text if soup.find("title") else "No title"
    author = soup.find("meta", {"name": "author"})
    author_name = author["content"] if author else "Unknown author"
    publication_date = soup.find("time")["datetime"] if soup.find("time") else "Unknown date"
    
    # Extract and clean article content
    article_content = ""
    article_body = soup.find("article")
    if article_body:
        for script_or_style in article_body(["script", "style"]):
            script_or_style.extract()  # Remove script and style tags
        paragraphs = article_body.find_all("p")
        article_content = "\n".join(clean_text(p.text) for p in paragraphs if p.text)
    
    # Construct data
    data = {
        "metadata": {
            "type": "Medium",
            "url": link,
            "title": clean_text(title),
            "author": clean_text(author_name),
            "publication_date": clean_text(publication_date),
        },
        "content": article_content
    }
    return data

def load(data: dict, mongo_uri: str, db_name: str, collection_name: str):
    """Store the extracted data in MongoDB."""
    if data and data.get("content"):  # Ensure content is not empty
        client = MongoClient(mongo_uri)
        db = client[db_name]
        collection = db[collection_name]
        collection.insert_one(data)
        print(f"Inserted article: {data['metadata']['url']}")
    else:
        print("No content to insert.")

def medium_etl_pipeline(medium_links: list, mongo_uri: str, db_name: str, collection_name: str):
    """The full ETL pipeline."""
    for link in medium_links:
        # Start a new task for each link
        
        # Extract HTML content
        html_content = extract(link)
        
        # Transform HTML content
        data = transform(html_content, link)
        
        # Load data into MongoDB
        load(data, mongo_uri, db_name, collection_name)
        
        # Log any relevant output for later review

if __name__ == "__main__":
    # Define the Medium links
    medium_links = [
        "https://medium.com/schmiedeone/getting-started-with-ros2-part-1-d4c3b7335c71",
        "https://medium.com/@nullbyte.in/ros2-from-the-ground-up-part-1-an-introduction-to-the-robot-operating-system-4c2065c5e032",
        "https://medium.com/@tetraengnrng/a-beginners-guide-to-ros2-29721dcf49c8"
    ]
    
    # Run the pipeline
    medium_etl_pipeline(medium_links, mongo_uri=MONGO_URI, db_name=DATABASE_NAME, collection_name=COLLECTION_NAME)

Inserted article: https://medium.com/schmiedeone/getting-started-with-ros2-part-1-d4c3b7335c71
Inserted article: https://medium.com/@nullbyte.in/ros2-from-the-ground-up-part-1-an-introduction-to-the-robot-operating-system-4c2065c5e032
Inserted article: https://medium.com/@tetraengnrng/a-beginners-guide-to-ros2-29721dcf49c8


# Documentation ETL

In [7]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from pymongo import MongoClient
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin
import re

# MongoDB Configuration
MONGO_URI = "mongodb://localhost:27017"
DATABASE_NAME = "final_data_project"

client = MongoClient(MONGO_URI)
db = client[DATABASE_NAME]

# DOCUMENTATION_SITES = [
#     {"base_url": "https://gazebosim.org/docs/latest/getstarted/", "collection_name": "gazebo_documentation", "domain": "gazebosim.org/docs"},
#     {"base_url": "https://moveit.picknik.ai/main/index.html", "collection_name": "moveit_documentation", "domain": "moveit.picknik.ai"},
# ]

DOCUMENTATION_SITES = [
    {"base_url": "https://gazebosim.org/docs/latest/getstarted/", "collection_name": "gazebo_documentation", "domain": "gazebosim.org/docs"},
    {"base_url": "https://moveit.picknik.ai/main/index.html", "collection_name": "moveit_documentation", "domain": "moveit.picknik.ai"},
    {"base_url": "https://docs.ros.org/en/foxy/index.html", "collection_name": "ros2_documentation", "domain": "docs.ros.org"},
    
    ]

def get_selenium_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in headless mode
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    return driver

def fetch_page(driver, url):
    driver.get(url)
    time.sleep(2)  # Wait for the page to load fully
    return driver.page_source

# Clean text utility
def clean_text(text):
    """Clean and normalize text content."""
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces/newlines with a single space
    text = re.sub(r'[^\x20-\x7E]', '', text)  # Remove non-printable characters
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    return text.strip()

# Extract links and clean content from a page
def extract_links_and_content_from_page(soup, base_url, domain_filter):
    links = []
    content = ""

    # Remove unwanted tags
    for script_or_style in soup(["script", "style", "noscript"]):
        script_or_style.extract()

    # Extract the main page content and clean it
    raw_content = soup.get_text(strip=True)
    content = clean_text(raw_content)  # Clean the extracted text

    # Find and normalize all links
    for a_tag in soup.find_all('a', href=True):
        link = a_tag['href']
        normalized_link = urljoin(base_url, link)  # Handle absolute and relative links
        if domain_filter in normalized_link:  # Only keep links from the target domain
            links.append(normalized_link)

    return links, content

def scrape_documentation(base_url, collection_name, domain_filter):
    collection = db[collection_name]  # Use a different collection for each site
    driver = get_selenium_driver()
    
    try:
        print(f"Starting to scrape: {base_url}")

        # Fetch and process the base page
        page_source = fetch_page(driver, base_url)
        soup = BeautifulSoup(page_source, 'html.parser')

        # Extract links and content
        links_to_scrape, base_page_content = extract_links_and_content_from_page(soup, base_url, domain_filter)

        # Save the base page content
        data = {
            "metadata": {
                "url": base_url,
                "type": collection_name,
            },
            "content": base_page_content,
        }
        collection.insert_one(data)
        print(f"Saved: {base_url}")

        # Iterate through links to scrape additional pages
        for link in links_to_scrape:
            try:
                page_source = fetch_page(driver, link)
                soup = BeautifulSoup(page_source, 'html.parser')

                # Extract content for each linked page
                _, link_content = extract_links_and_content_from_page(soup, link, domain_filter)

                # Save the linked page content
                data = {
                    "metadata": {
                        "url": link,
                        "type": collection_name,
                    },
                    "content": link_content,
                }
                collection.insert_one(data)
                print(f"Saved: {link}")
            except Exception as e:
                print(f"Error processing {link}: {e}")

    finally:
        driver.quit()

# Scrape each site in DOCUMENTATION_SITES
for site in DOCUMENTATION_SITES:
    scrape_documentation(site["base_url"], site["collection_name"], site["domain"])

print("Scraping, cleaning, and data saving completed for all documentation sites.")



Python(89654) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(89655) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(89656) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(89657) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(89658) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(89659) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[1;35mGet LATEST chromedriver version for google-chrome[0m


Python(89660) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(89661) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(89662) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(89663) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(89664) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(89665) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[1;35mGet LATEST chromedriver version for google-chrome[0m
[1;35mGet LATEST chromedriver version for google-chrome[0m
[1;35mWebDriver version 131.0.6778.87 selected[0m
[1;35mModern chrome version [0m[34mhttps://storage.googleapis.com/chrome-for-testing-public/131.0.6778.87/mac-arm64/chromedriver-mac-arm64.zip[1;35m[0m
[1;35mAbout to download new driver from [0m[34mhttps://storage.googleapis.com/chrome-for-testing-public/131.0.6778.87/mac-arm64/chromedriver-mac-arm64.zip[1;35m[0m
[1;35mDriver downloading response is 200[0m
[1;35mGet LATEST chromedriver version for google-chrome[0m
[1;35mDriver has been saved in cache [/Users/debikad/.wdm/drivers/chromedriver/mac64/131.0.6778.87][0m


Python(89671) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Starting to scrape: https://gazebosim.org/docs/latest/getstarted/
Saved: https://gazebosim.org/docs/latest/getstarted/
Saved: https://gazebosim.org/docs/latest/getstarted/#main-content
Saved: https://gazebosim.org/docs/latest/
Saved: https://gazebosim.org/docs
Saved: https://gazebosim.org/docs/jetty/getstarted/
Saved: https://gazebosim.org/docs/ionic/getstarted/
Saved: https://gazebosim.org/docs/harmonic/getstarted/
Saved: https://gazebosim.org/docs/garden/getstarted/
Saved: https://gazebosim.org/docs/fortress/getstarted/
Saved: https://gazebosim.org/docs/edifice/getstarted/
Saved: https://gazebosim.org/docs/dome/getstarted/
Saved: https://gazebosim.org/docs/citadel/getstarted/
Saved: https://gazebosim.org/docs/blueprint/getstarted/
Saved: https://gazebosim.org/docs/acropolis/getstarted/
Saved: https://gazebosim.org/docs
Saved: https://gazebosim.org/docs/latest/getstarted/
Saved: https://gazebosim.org/docs/latest/install/
Saved: https://gazebosim.org/docs/latest/install_ubuntu/
Saved: 

Python(90606) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(90607) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(90608) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(90609) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(90610) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(90611) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[1;35mGet LATEST chromedriver version for google-chrome[0m


Python(90612) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(90613) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(90614) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(90615) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(90616) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(90617) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[1;35mGet LATEST chromedriver version for google-chrome[0m
[1;35mDriver [/Users/debikad/.wdm/drivers/chromedriver/mac64/131.0.6778.87/chromedriver-mac-arm64/chromedriver] found in cache[0m


Python(90619) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Starting to scrape: https://moveit.picknik.ai/main/index.html
Saved: https://moveit.picknik.ai/main/index.html
Saved: https://moveit.picknik.ai/main/index.html
Saved: https://moveit.picknik.ai/main/doc/tutorials/tutorials.html
Saved: https://moveit.picknik.ai/main/doc/examples/examples.html
Saved: https://moveit.picknik.ai/main/doc/concepts/concepts.html
Saved: https://moveit.picknik.ai/main/doc/how_to_guides/how_to_guides.html
Saved: https://moveit.picknik.ai/main/doc/api/api.html
Saved: https://moveit.picknik.ai/main/doc/how_to_contribute/how_to_contribute.html
Saved: https://moveit.picknik.ai/main/index.html
Saved: https://moveit.picknik.ai/main/index.html
Saved: https://moveit.picknik.ai/main/index.html#moveit-2-documentation
Saved: https://moveit.picknik.ai/main/index.html#how-to-use-this-website
Saved: https://moveit.picknik.ai/main/doc/tutorials/tutorials.html
Saved: https://moveit.picknik.ai/main/doc/how_to_guides/how_to_guides.html
Saved: https://moveit.picknik.ai/main/doc/con

Python(91181) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(91182) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(91183) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(91184) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(91185) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(91186) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(91191) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(91192) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(91193) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(91194) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(91195) Malloc

[1;35mGet LATEST chromedriver version for google-chrome[0m
[1;35mGet LATEST chromedriver version for google-chrome[0m
[1;35mDriver [/Users/debikad/.wdm/drivers/chromedriver/mac64/131.0.6778.87/chromedriver-mac-arm64/chromedriver] found in cache[0m


Python(91197) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Starting to scrape: https://docs.ros.org/en/foxy/index.html
Saved: https://docs.ros.org/en/foxy/index.html
Saved: https://docs.ros.org/en/foxy/index.html
Saved: https://docs.ros.org/en/foxy/Installation.html
Saved: https://docs.ros.org/en/foxy/Installation/Ubuntu-Install-Debians.html
Saved: https://docs.ros.org/en/foxy/Installation/Windows-Install-Binary.html
Saved: https://docs.ros.org/en/foxy/Installation/Alternatives.html
Saved: https://docs.ros.org/en/foxy/Installation/Alternatives/Ubuntu-Development-Setup.html
Saved: https://docs.ros.org/en/foxy/Installation/Alternatives/Ubuntu-Install-Binary.html
Saved: https://docs.ros.org/en/foxy/Installation/Alternatives/Windows-Development-Setup.html
Saved: https://docs.ros.org/en/foxy/Installation/Alternatives/macOS-Development-Setup.html
Saved: https://docs.ros.org/en/foxy/Installation/Alternatives/macOS-Install-Binary.html
Saved: https://docs.ros.org/en/foxy/Installation/Alternatives/Fedora-Development-Setup.html
Saved: https://docs.ros.or

In [8]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from pymongo import MongoClient
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin
import re

# MongoDB Configuration
MONGO_URI = "mongodb://localhost:27017"
DATABASE_NAME = "final_data_project"

client = MongoClient(MONGO_URI)
db = client[DATABASE_NAME]

# Nav2 Documentation Details
BASE_URL = "https://docs.nav2.org/"
COLLECTION_NAME = "nav2_documentation"
DOMAIN_FILTER = "docs.nav2.org"

# Define keywords for relevance filtering
NAV2_KEYWORDS = ["navigation", "path planning", "setup", "ROS", "map", "robot"]

# Maximum number of pages to save
MAX_PAGES = 300

def get_selenium_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in headless mode
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    return driver

def fetch_page(driver, url):
    driver.get(url)
    time.sleep(2)  # Wait for the page to load fully
    return driver.page_source

# Clean text utility
def clean_text(text):
    """Clean and normalize text content."""
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces/newlines with a single space
    text = re.sub(r'[^\x20-\x7E]', '', text)  # Remove non-printable characters
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    return text.strip()

# Extract links and clean content from a page
def extract_links_and_content_from_page(soup, base_url, domain_filter):
    links = []
    content = ""

    # Remove unwanted tags
    for script_or_style in soup(["script", "style", "noscript"]):
        script_or_style.extract()

    # Extract the main page content and clean it
    raw_content = soup.get_text(strip=True)
    content = clean_text(raw_content)  # Clean the extracted text

    # Find and normalize all links
    for a_tag in soup.find_all('a', href=True):
        link = a_tag['href']
        normalized_link = urljoin(base_url, link)  # Handle absolute and relative links
        if domain_filter in normalized_link:  # Only keep links from the target domain
            links.append(normalized_link)

    return links, content

# Relevance check function
def is_relevant_content(content, keywords=None, min_length=50):
    """
    Check if the content is relevant based on keywords and content length.
    """
    if not content or len(content) < min_length:
        return False  # Exclude empty or very short content
    
    if keywords:
        content_lower = content.lower()
        for keyword in keywords:
            if keyword.lower() in content_lower:
                return True
        return False  # No keywords matched
    return True  # If no keywords are provided, consider all content as relevant

# Scrape Nav2 documentation
def scrape_nav2_documentation():
    collection = db[COLLECTION_NAME]  # Use the Nav2 collection
    driver = get_selenium_driver()
    saved_pages = 0  # Counter for saved pages

    try:
        print(f"Starting to scrape: {BASE_URL}")

        # Fetch and process the base page
        page_source = fetch_page(driver, BASE_URL)
        soup = BeautifulSoup(page_source, 'html.parser')

        # Extract links and content
        links_to_scrape, base_page_content = extract_links_and_content_from_page(soup, BASE_URL, DOMAIN_FILTER)

        # Check relevance of the base page content
        if is_relevant_content(base_page_content, NAV2_KEYWORDS):
            data = {
                "metadata": {
                    "url": BASE_URL,
                    "type": COLLECTION_NAME,
                },
                "content": base_page_content,
            }
            collection.insert_one(data)
            saved_pages += 1
            print(f"Saved: {BASE_URL}")
        else:
            print(f"Skipped base page (irrelevant): {BASE_URL}")

        # Iterate through links to scrape additional pages
        for link in links_to_scrape:
            if saved_pages >= MAX_PAGES:  # Stop if the maximum number of pages is reached
                print("Reached maximum page limit. Stopping.")
                break

            try:
                page_source = fetch_page(driver, link)
                soup = BeautifulSoup(page_source, 'html.parser')

                # Extract content for each linked page
                _, link_content = extract_links_and_content_from_page(soup, link, DOMAIN_FILTER)

                # Check relevance of the linked page content
                if is_relevant_content(link_content, NAV2_KEYWORDS):
                    data = {
                        "metadata": {
                            "url": link,
                            "type": COLLECTION_NAME,
                        },
                        "content": link_content,
                    }
                    collection.insert_one(data)
                    saved_pages += 1
                    print(f"Saved: {link}")
                else:
                    print(f"Skipped link (irrelevant): {link}")
            except Exception as e:
                print(f"Error processing {link}: {e}")

    finally:
        driver.quit()

# Scrape Nav2 documentation only
scrape_nav2_documentation()
print("Scraping, cleaning, and data saving completed for Nav2 documentation.")


[1;35mGet LATEST chromedriver version for google-chrome[0m


Python(93089) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(93090) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(93091) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(93092) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(93093) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(93094) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(93095) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(93096) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(93097) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(93098) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(93099) Malloc

[1;35mGet LATEST chromedriver version for google-chrome[0m
[1;35mDriver [/Users/debikad/.wdm/drivers/chromedriver/mac64/131.0.6778.87/chromedriver-mac-arm64/chromedriver] found in cache[0m


Python(93101) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Starting to scrape: https://docs.nav2.org/
Saved: https://docs.nav2.org/
Saved: https://docs.nav2.org/
Saved: https://docs.nav2.org/getting_started/index.html
Saved: https://docs.nav2.org/getting_started/index.html#installation
Saved: https://docs.nav2.org/getting_started/index.html#running-the-example
Saved: https://docs.nav2.org/getting_started/index.html#navigating
Saved: https://docs.nav2.org/development_guides/index.html
Saved: https://docs.nav2.org/development_guides/build_docs/index.html
Saved: https://docs.nav2.org/development_guides/build_docs/index.html#install
Saved: https://docs.nav2.org/development_guides/build_docs/index.html#for-iron-and-older
Saved: https://docs.nav2.org/development_guides/build_docs/index.html#for-jazzy-and-newer
Saved: https://docs.nav2.org/development_guides/build_docs/index.html#build
Saved: https://docs.nav2.org/development_guides/build_docs/index.html#released-distribution-binaries
Saved: https://docs.nav2.org/development_guides/build_docs/index.h

# Reddit - ETL

In [9]:
import praw
from pymongo import MongoClient
import re

# MongoDB Configuration
MONGO_URI = "mongodb://localhost:27017"
DATABASE_NAME = "final_data_project"
COLLECTION_NAME = "reddit"

# Reddit API Configuration
REDDIT_CLIENT_ID = "9Hjzq9IuhgEzlfLwGnT9YA"
REDDIT_CLIENT_SECRET = "y63xI-407llXiRNVWwgpO97r1M7AuQ"
REDDIT_USER_AGENT = "ros2_rag"

# Clean text utility
def clean_text(text):
    """Clean and normalize Reddit post text."""
    text = re.sub(r"\s+", " ", text)  # Replace multiple spaces/newlines with a single space
    text = re.sub(r"[^\x20-\x7E]", "", text)  # Remove non-printable characters
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = text.strip()  # Trim leading/trailing whitespace
    return text

# Get Reddit posts
def get_reddit_posts(subreddit, keyword, limit=10):
    reddit = praw.Reddit(
        client_id=REDDIT_CLIENT_ID,
        client_secret=REDDIT_CLIENT_SECRET,
        user_agent=REDDIT_USER_AGENT
    )
    posts = reddit.subreddit(subreddit).search(keyword, limit=limit)
    return posts

# Transform and clean post data
def transform_and_store_post_data(post, subreddit, keyword):
    metadata = {
        "type": "reddit",
        "subreddit": subreddit,
        "keyword": keyword,
        "url": f"https://reddit.com{post.permalink}",
    }
    # Clean the post content
    content = clean_text(post.selftext)
    
    return {"metadata": metadata, "content": content}

# Load cleaned data into MongoDB
def load_data_to_mongodb(data):
    client = MongoClient(MONGO_URI)
    db = client[DATABASE_NAME]
    collection = db[COLLECTION_NAME]
    collection.insert_one(data)
    print(f"Ingested Reddit post data: {data['metadata']['url']}")

# ETL Process for Reddit posts
def etl_reddit_posts(configurations, limit=10):
    try:
        for config in configurations:
            subreddit = config["subreddit"]
            keyword = config["keyword"]
            posts = get_reddit_posts(subreddit, keyword, limit)
            for post in posts:
                transformed_data = transform_and_store_post_data(post, subreddit, keyword)
                if transformed_data["content"]:  # Ensure there's valid content
                    load_data_to_mongodb(transformed_data)
                else:
                    print(f"Skipped empty post: {post.permalink}")
    except Exception as e:
        print(f"Error fetching Reddit posts: {e}")

# Configuration for multiple keywords and subreddits
configurations = [
    {"subreddit": "ROS", "keyword": "ROS2"},
    {"subreddit": "ROS", "keyword": "nav2"},
    {"subreddit": "ROS", "keyword": "gazebo"},
    {"subreddit": "ROS", "keyword": "moveit"},
]

# Execute the ETL pipeline
etl_reddit_posts(configurations, limit=20)

Ingested Reddit post data: https://reddit.com/r/ROS/comments/1bt2ms0/ros2_beginner/
Skipped empty post: /r/ROS/comments/1g698bf/i_bounced_off_of_using_ros2_for_my_project_and_in/
Ingested Reddit post data: https://reddit.com/r/ROS/comments/1h47rtv/robot_is_not_visible_in_gazebo_ros2_humble/
Ingested Reddit post data: https://reddit.com/r/ROS/comments/1gsr2pp/simulated_robots_package_for_ros2_foxy_humble/
Ingested Reddit post data: https://reddit.com/r/ROS/comments/1fspb4u/should_i_go_for_ros2_on_windows_natively_or/
Ingested Reddit post data: https://reddit.com/r/ROS/comments/1gb2p51/rosmaster_r2_to_learn_ros2_is_it_good/
Ingested Reddit post data: https://reddit.com/r/ROS/comments/1goa2o1/drag_and_drop_ros2_development_with_zero_setup/
Ingested Reddit post data: https://reddit.com/r/ROS/comments/1h8vd8l/quik_a_ultrafast_and_robust_generalized_inverse/
Ingested Reddit post data: https://reddit.com/r/ROS/comments/1grp9fd/how_to_learn_ros2/
Ingested Reddit post data: https://reddit.com/r

# Stack overflow - ETL

In [10]:
from pymongo import MongoClient
import requests
import re

# MongoDB Configuration
MONGO_URI = "mongodb://localhost:27017"
DATABASE_NAME = "final_data_project"
COLLECTION_NAME = "stackoverflow"

# Stack Overflow API Configuration
STACKOVERFLOW_API_BASE = "https://api.stackexchange.com/2.3/questions"
STACKOVERFLOW_API_PARAMS = {
    "site": "stackoverflow",
    "filter": "withbody"  # Includes question body in response
}

# Function to fetch Stack Overflow question data
def fetch_stackoverflow_data(question_url):
    """
    Fetch question data from Stack Overflow API based on the question URL.
    """
    # Extract the question ID from the URL
    match = re.search(r"/questions/(\d+)", question_url)
    if not match:
        raise ValueError(f"Invalid Stack Overflow URL format: {question_url}")
    
    question_id = match.group(1)
    url = f"{STACKOVERFLOW_API_BASE}/{question_id}"
    
    # API request
    response = requests.get(url, params=STACKOVERFLOW_API_PARAMS)
    if response.status_code == 200:
        items = response.json().get("items", [])
        if items:
            return items[0]  # Return the first item (question data)
        else:
            raise ValueError(f"No data found for question ID {question_id}")
    else:
        raise RuntimeError(f"Failed to fetch data for question ID {question_id}: {response.status_code} {response.text}")

# Function to clean and transform data
def clean_content(content):
    """
    Clean and normalize the content by:
    - Removing HTML tags.
    - Collapsing multiple spaces/newlines.
    """
    content = re.sub(r"<[^>]*>", "", content)  # Remove HTML tags
    content = re.sub(r"\s+", " ", content)  # Collapse multiple spaces/newlines
    return content.strip()

def transform_data(question_data, url):
    """
    Transform the fetched question data into a MongoDB-friendly format.
    """
    metadata = {
        "type": "Stackoverflow",
        "url": url,
        "title": question_data.get("title", "Untitled")
    }
    content = clean_content(question_data.get("body", "No content available"))
    return {"metadata": metadata, "content": content}

# Function to load data into MongoDB
def load_data_to_mongodb(data):
    """
    Load the transformed data into MongoDB.
    """
    client = MongoClient(MONGO_URI)
    db = client[DATABASE_NAME]
    collection = db[COLLECTION_NAME]
    collection.insert_one(data)
    print(f"Ingested data for question: {data['metadata']['title']}")

# ETL Process for Stack Overflow URLs
def etl_stackoverflow_data(urls):
    """
    Perform the ETL process for a list of Stack Overflow question URLs.
    """
    for url in urls:
        try:
            print(f"Processing URL: {url}")
            # Fetch, transform, and load data
            question_data = fetch_stackoverflow_data(url)
            transformed_data = transform_data(question_data, url)
            load_data_to_mongodb(transformed_data)
        except Exception as e:
            print(f"Error processing {url}: {e}")

# List of Stack Overflow question URLs
urls = [
    "https://stackoverflow.com/questions/57426715/import-modules-in-package-in-ros2",
    "https://stackoverflow.com/questions/51187676/whats-the-difference-between-ros2-and-dds",
    "https://stackoverflow.com/questions/68771051/ros2-pub-sub-custom-message-through-ros2-web-bridge-to-client-app",
]

# Run the ETL process
etl_stackoverflow_data(urls)


Processing URL: https://stackoverflow.com/questions/57426715/import-modules-in-package-in-ros2
Ingested data for question: Import modules in package in ROS2
Processing URL: https://stackoverflow.com/questions/51187676/whats-the-difference-between-ros2-and-dds
Ingested data for question: What&#39;s the difference between ROS2 and DDS?
Processing URL: https://stackoverflow.com/questions/68771051/ros2-pub-sub-custom-message-through-ros2-web-bridge-to-client-app
Ingested data for question: ros2 pub/sub custom message through ros2-web-bridge to client app


In [11]:
from zenml.pipelines import pipeline
from zenml.steps import step

# Define a step
@step
def my_step() -> str:
    return "Hello, ZenML!"

# Define a pipeline
@pipeline
def my_pipeline(step1):
    step1()

In [12]:
pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib

Python(94642) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Collecting google-api-python-client
  Downloading google_api_python_client-2.154.0-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting google-auth-httplib2
  Downloading google_auth_httplib2-0.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting google-auth-oauthlib
  Downloading google_auth_oauthlib-1.2.1-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting httplib2<1.dev0,>=0.19.0 (from google-api-python-client)
  Downloading httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0 (from google-api-python-client)
  Downloading google_auth-2.36.0-py2.py3-none-any.whl.metadata (4.7 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5 (from google-api-python-client)
  Downloading google_api_core-2.23.0-py3-none-any.whl.metadata (3.0 kB)
Collecting uritemplate<5,>=3.0.1 (from google-api-python-client)
  Downloading uritemplate-4.1.1-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting requests-oauthlib>=0.7.0 (

# Entire youtube ETL

In [16]:
!pip install youtube_transcript_api

Python(96145) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Collecting youtube_transcript_api
  Downloading youtube_transcript_api-0.6.3-py3-none-any.whl.metadata (17 kB)
Collecting defusedxml<0.8.0,>=0.7.1 (from youtube_transcript_api)
  Downloading defusedxml-0.7.1-py2.py3-none-any.whl.metadata (32 kB)
Downloading youtube_transcript_api-0.6.3-py3-none-any.whl (622 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m622.3/622.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading defusedxml-0.7.1-py2.py3-none-any.whl (25 kB)
Installing collected packages: defusedxml, youtube_transcript_api
Successfully installed defusedxml-0.7.1 youtube_transcript_api-0.6.3


In [18]:
import os
import pickle
import re
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from google.auth.transport.requests import Request
from pymongo import MongoClient
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, VideoUnavailable

# MongoDB Configuration
MONGO_URI = "mongodb://localhost:27017"
DATABASE_NAME = "final_data_project"
COLLECTION_NAME = "youtube_captions"

# OAuth2 YouTube API Configuration
CLIENT_SECRETS_FILE = "/Users/debikad/Desktop/AI-project/client_secret.json"  # Path to your OAuth2 credentials JSON file
SCOPES = ["https://www.googleapis.com/auth/youtube.force-ssl"]

# Function to get authenticated YouTube service
def get_authenticated_service():
    creds = None
    if os.path.exists("token.pickle"):
        with open("token.pickle", "rb") as token:
            creds = pickle.load(token)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                CLIENT_SECRETS_FILE, SCOPES
            )
            creds = flow.run_local_server(port=54510)

        with open("token.pickle", "wb") as token:
            pickle.dump(creds, token)

    youtube = build("youtube", "v3", credentials=creds)
    return youtube


# Function to get video IDs from playlists
def get_video_ids_from_playlists(youtube, playlist_urls):
    def extract_playlist_id_from_url(url):
        match = re.search(r"list=([a-zA-Z0-9_-]+)", url)
        if match:
            return match.group(1)
        else:
            raise ValueError(f"Invalid YouTube playlist URL format: {url}")

    def get_video_ids_from_single_playlist(youtube, playlist_id):
        video_ids = []
        playlist_items = youtube.playlistItems().list(
            part="snippet",
            playlistId=playlist_id,
            maxResults=50
        ).execute()

        while playlist_items:
            for item in playlist_items["items"]:
                video_id = item["snippet"]["resourceId"]["videoId"]
                video_ids.append(video_id)

            if "nextPageToken" in playlist_items:
                playlist_items = youtube.playlistItems().list(
                    part="snippet",
                    playlistId=playlist_id,
                    maxResults=50,
                    pageToken=playlist_items["nextPageToken"]
                ).execute()
            else:
                break

        return video_ids

    all_video_ids = []
    for url in playlist_urls:
        try:
            playlist_id = extract_playlist_id_from_url(url)
            print(f"Fetching videos from playlist: {playlist_id}")
            video_ids = get_video_ids_from_single_playlist(youtube, playlist_id)
            all_video_ids.extend(video_ids)
        except Exception as e:
            print(f"Error processing playlist {url}: {e}")

    return all_video_ids


# Function to fetch video details
def get_video_details(youtube, video_id):
    try:
        video_details = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=video_id
        ).execute()

        if video_details["items"]:
            video = video_details["items"][0]
            title = video["snippet"].get("title", "Untitled Video")
            description = video["snippet"].get("description", "No description available")
            return {"title": title, "description": description}
        else:
            return {"title": "Unknown", "description": "No description available"}
    except Exception as e:
        print(f"Error fetching details for video {video_id}: {e}")
        return {"title": "Error", "description": str(e)}


# Function to fetch captions using youtube-transcript-api
def fetch_video_captions_fallback(video_id):
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return " ".join([item["text"] for item in transcript])
    except NoTranscriptFound:
        return "No captions or transcript available"
    except VideoUnavailable:
        return "Video is unavailable"
    except Exception as e:
        print(f"Error fetching transcript for video {video_id}: {e}")
        return "Error fetching captions"


# Function to clean content
def clean_content(content):
    content = re.sub(r"<[^>]*>", "", content)  # Remove HTML tags
    content = re.sub(r"\s+", " ", content)  # Collapse multiple spaces/newlines
    return content.strip()


# Function to transform data into MongoDB format
def transform_data(video_details, captions, url):
    metadata = {
        "type": "YouTube",
        "url": url,
        "title": video_details["title"],
        "description": video_details["description"],
    }
    content = clean_content(captions)
    return {"metadata": metadata, "content": content}


# Function to load data into MongoDB
def load_data_to_mongodb(data):
    client = MongoClient(MONGO_URI)
    db = client[DATABASE_NAME]
    collection = db[COLLECTION_NAME]
    collection.insert_one(data)
    print(f"Ingested data for video: {data['metadata']['title']}")


# ETL Process for a list of YouTube video IDs
def etl_youtube_videos(video_ids):
    for video_id in video_ids:
        try:
            print(f"Processing video: {video_id}")
            
            # Fetch video details
            video_details = get_video_details(youtube, video_id)
            
            # Fetch captions
            captions = fetch_video_captions_fallback(video_id)
            if captions in ["Captions are restricted or unavailable", "No captions or transcript available"]:
                print(f"Skipping video {video_id}: Captions not accessible")
                continue
            
            # Transform and load data
            url = f"https://www.youtube.com/watch?v={video_id}"
            transformed_data = transform_data(video_details, captions, url)
            load_data_to_mongodb(transformed_data)
        
        except Exception as e:
            print(f"Error processing video {video_id}: {e}")


if __name__ == "__main__":
    youtube = get_authenticated_service()

    playlist_urls = [
        "https://www.youtube.com/watch?v=C6eQ6VwTpxk&list=PLSK7NtBWwmpTS_YVfjeN3ZzIxItI1P_Sr"
    ]

    video_ids = get_video_ids_from_playlists(youtube, playlist_urls)

    print("Extracted Video IDs:", video_ids)
    etl_youtube_videos(video_ids)

Python(96490) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=120625730766-171678t2p3m7diridcm62t2rmio0sn56.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A54510%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fyoutube.force-ssl&state=aSsWfoyylz4Ito0SZ22brd34W6hlzX&access_type=offline
[1;35m"GET /?state=aSsWfoyylz4Ito0SZ22brd34W6hlzX&code=4/0AeanS0bJ9wm8VL0OPNqb37DiQjyPcfIrwX5gXe68n4cmNgFyl0DDIpBy5waXfwZ72fx3eg&scope=[0m[34mhttps://www.googleapis.com/auth/youtube.force-ssl[1;35m HTTP/1.1" 200 65[0m
[1;35mfile_cache is only supported with oauth2client<4.0.0[0m
Fetching videos from playlist: PLSK7NtBWwmpTS_YVfjeN3ZzIxItI1P_Sr
Extracted Video IDs: ['C6eQ6VwTpxk', '7FKi-waQuMM', 'dJLBLb0IXdw', '72a-wJ2k25A', 'sWw69pIiMz0', 'oVOR74D8A3U', 'nsbgIys0_oc', 'PcO-sTuP8zg', 'QQLOk8l2lEo', 'zNxCqBKKbGM', 'JNM2qIhseiU', '9Myw-9UQxPw', 'mFCundd5s-Q', 'KLvUMtYI_Ag', '4zGUDisw4UI', 'rGsyQHwWObA', 'lDSrqQM85zA', 'EO

# Chunking and creating embeddings and storing Qdrant

In [19]:
from pymongo import MongoClient

# MongoDB Configuration
MONGO_URI = "mongodb://localhost:27017"
DATABASE_NAME = "final_data_project"
COLLECTIONS = ["medium", "ros2_documentation", "reddit","stackoverflow", "youtube_captions"]

client = MongoClient(MONGO_URI)
db = client[DATABASE_NAME]


@step
def fetch_documents() -> list[dict[str, any]]:
    documents = []
    for collection_name in COLLECTIONS:
        collection = db[collection_name]
        for doc in collection.find({}, {"_id": 0, "content": 1, "metadata": 1}):
            # Restrict metadata to url and type
            metadata = {
                "type": doc["metadata"].get("type", collection_name),  # Default to collection name if type is missing
                "url": doc["metadata"].get("url", ""),  # Default to empty string if URL is missing
            }
            documents.append({"content": doc["content"], "metadata": metadata})
    return documents



# Chunking function


In [20]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


@step
def chunk_documents(documents: list[dict[str, any]], chunk_size: int = 500, chunk_overlap: int = 50) -> list[dict[str, any]]:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )
    
    chunked_data = []
    for doc in documents:
        content = doc["content"]
        metadata = doc["metadata"]  # Only includes `type` and `url`
        
        # Create chunks for each document
        chunks = text_splitter.split_text(content)
        for chunk in chunks:
            chunked_data.append({
                "content": chunk,
                "metadata": metadata  # Pass only `type` and `url`
            })
    
    return chunked_data


# embedding function

In [21]:
from sentence_transformers import SentenceTransformer
from zenml.steps import step


@step
def generate_embeddings(text_chunks: list[str], batch_size: int = 32) -> list[list[float]]:
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = []
    
    for i in range(0, len(text_chunks), batch_size):
        batch = text_chunks[i:i + batch_size]
        batch_embeddings = model.encode(batch)
        embeddings.extend(batch_embeddings.tolist())
    
    return embeddings

Python(97060) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


[1;35mPyTorch version 2.5.1 available.[0m
[1;35mTensorFlow version 2.18.0 available.[0m


# storing in single qdrant collection

In [22]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, PointStruct

# Initialize Qdrant Client
qdrant_client = QdrantClient("http://localhost:6333")



from pymongo import MongoClient
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, PointStruct

@step
def store_in_qdrant(chunked_data: list[dict], collection_name: str = "unified_collection_final", batch_size: int = 100):
    # Create or recreate the Qdrant collection
    vector_size = len(chunked_data[0]["embedding"])  # Dimension of embeddings
    qdrant_client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=vector_size,
            distance="Cosine"  # Use cosine similarity for matching
        )
    )
    
    # Insert data into Qdrant in batches
    for i in range(0, len(chunked_data), batch_size):
        batch = chunked_data[i:i + batch_size]
        points = [
            PointStruct(
                id=j + i,  # Ensure unique IDs across batches
                vector=chunk["embedding"],
                payload={
                    "content": chunk["content"],  # Chunked content
                    "type": chunk["metadata"]["type"],  # Metadata: type
                    "url": chunk["metadata"]["url"],  # Metadata: url
                }
            )
            for j, chunk in enumerate(batch)
        ]

        # Upsert batch into Qdrant
        qdrant_client.upsert(collection_name=collection_name, points=points)
        print(f"Inserted batch {i // batch_size + 1} with {len(points)} points.")
    
    print(f"Stored {len(chunked_data)} chunks in Qdrant collection: {collection_name}")


In [23]:
@step
def extract_texts_from_chunks(chunked_data: list[dict]) -> list[str]:
    return [chunk['content'] for chunk in chunked_data]

In [24]:
@step
def assign_embeddings_to_chunks(chunked_data: list[dict], all_embeddings: list) -> list[dict]:
    for i, chunk in enumerate(chunked_data):
        chunk["embedding"] = all_embeddings[i]
    return chunked_data

# Working Pipeline

In [32]:
!zenml init

Python(98870) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[?25l[2;36mFound existing ZenML repository at path [0m[2;32m'/Users/debikad/Desktop/AI-project'[0m[2;36m.[0m
[2;32m⠋[0m[2;36m Initializing ZenML repository at /Users/debikad/Desktop/AI-project.[0m
[2K[1A[2K[32m⠋[0m Initializing ZenML repository at /Users/debikad/Desktop/AI-project.

[1A[2K[1A[2K

In [None]:
@pipeline
def document_processing_pipeline():
    # Fetch documents
    documents = fetch_documents()
    # Chunk documents
    chunked_data = chunk_documents(documents)
    # Generate embeddings
    texts = extract_texts_from_chunks(chunked_data)
    all_embeddings = generate_embeddings(texts, batch_size=64)
    batch_size = 64  # Adjust based on available resources
    all_embeddings = generate_embeddings(texts, batch_size=batch_size)
    # Assign embeddings back to chunks
    chunked_data = assign_embeddings_to_chunks(chunked_data, all_embeddings)
    store_in_qdrant(chunked_data)


# Instantiate and run the pipeline
pipeline_instance = document_processing_pipeline()
pipeline_instance.run()

In [None]:
from zenml.pipelines import pipeline
from zenml.steps import step

@step
def example_step():
    print("Hello, ZenML!")

@pipeline
def example_pipeline(step):
    step()

example_pipeline(example_step()).run()


In [None]:

# Step 1: Fetch documents from MongoDB
print("Fetching documents from MongoDB...")
documents = fetch_documents()
print("doc length is:",len(documents))
# Step 2: Chunk documents
print("Chunking documents...")
chunked_data = chunk_documents(documents)


print(len(chunked_data))
print(chunked_data[0])
# Step 3: Generate embeddings for each chunk
print("Generating embeddings...")
texts = [chunk['content'] for chunk in chunked_data]
batch_size = 64  # Adjust based on available resources
all_embeddings = generate_embeddings(texts, batch_size=batch_size)
# Assign embeddings back to chunks
for i, chunk in enumerate(chunked_data):
    chunk["embedding"] = all_embeddings[i]
# Step 4: Store in Qdrant
print("Storing data in Qdrant...")
store_in_qdrant(chunked_data)

print("Process completed successfully!")



## Model Answering part

In [40]:
from qdrant_client import QdrantClient

# Initialize Qdrant Client
qdrant_client = QdrantClient("http://localhost:6333")

def retrieve_similar_documents(query_embedding, collection_name="unified_collection_final", top_k=5):
    """
    Retrieve the top-k most similar documents from Qdrant for the given query embedding.
    """
    results = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=top_k  # Retrieve the top-k matches
    )
    
    # Extract relevant information from results
    similar_docs = [
        {
            "content": result.payload.get("content"),  # The actual text content
            "metadata": {
                "type": result.payload.get("type"),
                "url": result.payload.get("url")
            },
            "score": result.score  # Similarity score
        }
        for result in results
    ]
    context_chunks = [result.payload for result in results]
    context = "\n".join([result.payload["content"] for result in results])

    return similar_docs,context_chunks,context


In [41]:
!pip install SentenceTransformer

Python(4286) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[31mERROR: Could not find a version that satisfies the requirement SentenceTransformer (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for SentenceTransformer[0m[31m
[0m

In [42]:

from sentence_transformers import SentenceTransformer

def generate_embedding_query(query, model_name='all-MiniLM-L6-v2'):
    """
    Generate an embedding for the query using a pre-trained embedding model.
    """
    # Load the model (you can cache it to avoid reloading multiple times)
    model = SentenceTransformer(model_name)
    
    # Generate the embedding
    embedding = model.encode(query).tolist()  # Convert numpy array to list for compatibility
    return embedding


In [25]:
# query = "Ros2 full form"
# query_embedding = generate_embedding_query(query)

[1;35mUse pytorch device_name: mps[0m
[1;35mLoad pretrained SentenceTransformer: all-MiniLM-L6-v2[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [43]:
# similar_docs,context_chunks,context = retrieve_similar_documents(query_embedding, collection_name="unified_collection", top_k=5)

# # Print results
# for doc in similar_docs:
#     print(f"Content: {doc['content']}")
#     print(f"Type: {doc['metadata']['type']}")
#     print(f"URL: {doc['metadata']['url']}")
#     print(f"Score: {doc['score']}")
#     print("-" * 50)
# print(context_chunks)

In [44]:
import subprocess

def generate_response_with_llama(query, context):
    """
    Generate a response using Llama 3 via Ollama CLI.
    """
    prompt = f"""
    You are a knowledgeable assistant. Use the following context to answer the query. If the context does not contain enough information, say, "I cannot answer this based on the provided context."

    Context:
    {context}

    Query:
    {query}

    Answer:
    """
    # Use subprocess to call Ollama CLI
    result = subprocess.run(
        ["ollama", "run", "hf.co/debika/model"],
        input=prompt,
        text=True,
        capture_output=True
    )
    return result.stdout.strip()


In [45]:
def full_pipeline(query):
    # Step 1: Generate query embedding
    # query_embedding = generate_query_embedding(query)
    query_embedding = generate_embedding_query(query)
    _,context_chunks,context = retrieve_similar_documents(query_embedding, collection_name="unified_collection_final", top_k=5)
    # Step 2: Retrieve relevant context from Qdrant
    # context = retrieve_context(query_embedding, collection_name="unified_collection", top_k=5)
    # Step 3: Handle case with no relevant context
    print("context is:",context)
    if not context.strip():
        return "I'm sorry, but I couldn't find any relevant data to answer your question."

    # Step 4: Generate response using Llama 3
    response = generate_response_with_llama(query, context)
    return response


In [46]:
import gradio as gr
def gradio_qa(query):
    return full_pipeline(query)
interface = gr.Interface(
    fn=gradio_qa,  # Connect Gradio function to the full pipeline
    inputs=gr.Textbox(lines=2, label="Enter your query"),  # Input: Query
    outputs=gr.Textbox(label="Generated Answer"),  # Output: Model response
    title="Q&A System with Llama 2",
    description="Ask a question, and the system will retrieve relevant context and generate an answer using Llama 2."
)

[1;35mHTTP Request: GET [0m[34mhttps://api.gradio.app/gradio-messaging/en[1;35m "HTTP/1.1 200 OK"[0m


[1;35mHTTP Request: GET [0m[34mhttps://api.gradio.app/pkg-version[1;35m "HTTP/1.1 200 OK"[0m


In [56]:
# query = "give ros2 full form"
# response = full_pipeline(query)
# print("Response:\n", response)


context is: goal of the ROS 2 project is to adapt to these changes, leveraging what is great about ROS 1 and improving what isnt.Here you will find the official documentation onROS 2, the newest version of ROS.If youre looking for documentation on ROS 1 (i.e., ROS as it has existed for several years, and what you might be using right now), check theROS wiki.Where to startNewcomers and experienced ROS users should consult this overview of our user-centric content to find what theyre looking
in all ROS 2 core packages since the previous release.Table of
ROS community. The goal of the ROS 2 project is to adapt to these changes, leveraging what is great about ROS 1 and improving what isnt.Here you will find the official documentation onROS 2, the newest version of ROS.If youre looking for documentation on ROS 1 (i.e., ROS as it has existed for several years, and what you might be using right now), check theROS wiki.Where to startNewcomers and experienced ROS users should consult this overv

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## gradio - installation

# gradio - deploy

In [47]:
if __name__ == "__main__":
    interface.launch()

* Running on local URL:  http://127.0.0.1:7860
[1;35mHTTP Request: GET [0m[34mhttp://127.0.0.1:7860/gradio_api/startup-events[1;35m "HTTP/1.1 200 OK"[0m
[1;35mHTTP Request: HEAD [0m[34mhttp://127.0.0.1:7860/[1;35m "HTTP/1.1 200 OK"[0m

To create a public link, set `share=True` in `launch()`.


[1;35mUse pytorch device_name: mps[0m
[1;35mLoad pretrained SentenceTransformer: all-MiniLM-L6-v2[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[1;35mHTTP Request: POST [0m[34mhttp://localhost:6333/collections/unified_collection_final/points/search[1;35m "HTTP/1.1 200 OK"[0m
context is: goal of the ROS 2 project is to adapt to these changes, leveraging what is great about ROS 1 and improving what isnt.Here you will find the official documentation onROS 2, the newest version of ROS.If youre looking for documentation on ROS 1 (i.e., ROS as it has existed for several years, and what you might be using right now), check theROS wiki.Where to startNewcomers and experienced ROS users should consult this overview of our user-centric content to find what theyre looking
are plenty of resources available online to learn more about ROS 2. Heres a few to start with:design.ros2.orgcontains various articles on the design decisions behind ROS 2, like:Why ROS 2?ROS on DDSChanges between ROS 1 and ROS 2The code for ROS 2 is open source and broken into various repositories. You can find the code for most of the repositories on theros2 GitHu

Python(4434) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[1;35mUse pytorch device_name: mps[0m
[1;35mLoad pretrained SentenceTransformer: all-MiniLM-L6-v2[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[1;35mHTTP Request: POST [0m[34mhttp://localhost:6333/collections/unified_collection_final/points/search[1;35m "HTTP/1.1 200 OK"[0m
context is: Same thing, i have to use moveit_servo to reach a specific pose, but I can't find anyway to do so
in a plane, or two dimensions. Furthermore, a floating joint is unconstrained, and can move around in any of the three dimensions. These joints cannot be specified by just one number, and therefore arent included in this tutorial.Specifying the PoseAs you move the sliders around in the GUI, the model moves in Rviz. How is this done? First theGUIparses the URDF and finds all the non-fixed joints and their limits. Then, it uses the values of the sliders to
this is our initial pose so you can see here we have um our xaxis right here so this x-axis is the initial pose of our camera and then later we're going to be modifying it so if we scroll down we see the first transform that we do is going to be this right here so here in our transform what we

Python(9973) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[1;35mUse pytorch device_name: mps[0m
[1;35mLoad pretrained SentenceTransformer: all-MiniLM-L6-v2[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[1;35mHTTP Request: POST [0m[34mhttp://localhost:6333/collections/unified_collection_final/points/search[1;35m "HTTP/1.1 200 OK"[0m
context is: Same thing, i have to use moveit_servo to reach a specific pose, but I can't find anyway to do so
go ahead and run this now so we come here and run this uh launch file and this will allow us to view our joint and you can see here we have a leg on the side and then this here will allow us to rotate the leg about the y- AIS which is the green axis that we see now finally we could go ahead and take a look at a full robot example so we'll be making a robot with 16 links and 15 joints So Below is a table that shows the 16 links so you can see here we have cylinders boxes and then um we split it
in a plane, or two dimensions. Furthermore, a floating joint is unconstrained, and can move around in any of the three dimensions. These joints cannot be specified by just one number, and therefore arent included in this tutorial.Specifying the PoseAs yo

Python(14955) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [63]:
!pip install "zenml==0.70.0"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting zenml==0.70.0
  Downloading zenml-0.70.0-py3-none-any.whl.metadata (21 kB)
Downloading zenml-0.70.0-py3-none-any.whl (4.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.0/4.0 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: zenml
  Attempting uninstall: zenml
    Found existing installation: zenml 0.71.0
    Uninstalling zenml-0.71.0:
      Successfully uninstalled zenml-0.71.0
Successfully installed zenml-0.70.0
