#Embedding-Based Retrieval with Activeloop and OpenAI

Copyright 2024 Denis Rothman

This first component of the RAG pipeline collects data and prepares it.

# Environment

In [1]:
#!pip install beautifulsoup4==4.12.3
#!pip install requests==2.31.0

# DATA COLLECTION

## Collecting and preparing the data

In [14]:
import requests
from bs4 import BeautifulSoup
import re

# Function to clean text
def clean_text(content):
    content = re.sub(r'\[\d+\]', '', content)  # Remove references like [1]
    return content

# Function to fetch and clean content from a URL
def fetch_and_clean(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    content = soup.find('div', {'class': 'mw-parser-output'})
    for section_title in ['References', 'Bibliography', 'External links', 'See also']:
        section = content.find('span', id=section_title)
        if section:
            for sib in section.parent.find_next_siblings():
                sib.decompose()
            section.parent.decompose()
    text = content.get_text(separator=' ', strip=True)
    return clean_text(text)

# Save fetched content to the specified text file
def save_to_txt(output_path, urls):
    with open(output_path, 'w', encoding='utf-8') as f:
        for url in urls:
            print(f"Fetching content from {url}...")
            try:
                cleaned_text = fetch_and_clean(url)
                # Write the URL and content to the file
                f.write(f"Source URL: {url}\n")
                f.write(cleaned_text + '\n\n')
            except Exception as e:
                print(f"Failed to fetch content from {url}: {e}")

# List of URLs
urls = [
    "https://en.wikipedia.org/wiki/Exploration_of_Mars",
    "https://en.wikipedia.org/wiki/Apollo_program",
    "https://en.wikipedia.org/wiki/Hubble_Space_Telescope",
    "https://en.wikipedia.org/wiki/Mars_rover",
    "https://en.wikipedia.org/wiki/International_Space_Station",
    "https://en.wikipedia.org/wiki/SpaceX",
    "https://en.wikipedia.org/wiki/Juno_(spacecraft)",
    "https://en.wikipedia.org/wiki/Voyager_program",
    "https://en.wikipedia.org/wiki/Galileo_(spacecraft)",
    "https://en.wikipedia.org/wiki/Kepler_Space_Telescope",
    "https://en.wikipedia.org/wiki/James_Webb_Space_Telescope",
    "https://en.wikipedia.org/wiki/Space_Shuttle",
    "https://en.wikipedia.org/wiki/Artemis_program",
    "https://en.wikipedia.org/wiki/Skylab",
    "https://en.wikipedia.org/wiki/NASA",
    "https://en.wikipedia.org/wiki/European_Space_Agency",
    "https://en.wikipedia.org/wiki/Ariane_(rocket_family)",
    "https://en.wikipedia.org/wiki/Spitzer_Space_Telescope",
    "https://en.wikipedia.org/wiki/New_Horizons",
    "https://en.wikipedia.org/wiki/Cassini%E2%80%93Huygens",
    "https://en.wikipedia.org/wiki/Curiosity_(rover)",
    "https://en.wikipedia.org/wiki/Perseverance_(rover)",
    "https://en.wikipedia.org/wiki/InSight",
    "https://en.wikipedia.org/wiki/OSIRIS-REx",
    "https://en.wikipedia.org/wiki/Parker_Solar_Probe",
    "https://en.wikipedia.org/wiki/BepiColombo",
    "https://en.wikipedia.org/wiki/Juice_(spacecraft)",
    "https://en.wikipedia.org/wiki/Solar_Orbiter",
    "https://en.wikipedia.org/wiki/CHEOPS_(satellite)",
    "https://en.wikipedia.org/wiki/Gaia_(spacecraft)"
]

# Save to the specified file
output_path = 'D:/RAG_Rothman/Chapter02/llm_with_metadata.txt'
save_to_txt(output_path, urls)

print("Content saved to llm_with_metadata.txt.")

Fetching content from https://en.wikipedia.org/wiki/Exploration_of_Mars...
Fetching content from https://en.wikipedia.org/wiki/Apollo_program...
Fetching content from https://en.wikipedia.org/wiki/Hubble_Space_Telescope...
Fetching content from https://en.wikipedia.org/wiki/Mars_rover...
Fetching content from https://en.wikipedia.org/wiki/International_Space_Station...
Fetching content from https://en.wikipedia.org/wiki/SpaceX...
Fetching content from https://en.wikipedia.org/wiki/Juno_(spacecraft)...
Fetching content from https://en.wikipedia.org/wiki/Voyager_program...
Fetching content from https://en.wikipedia.org/wiki/Galileo_(spacecraft)...
Fetching content from https://en.wikipedia.org/wiki/Kepler_Space_Telescope...
Fetching content from https://en.wikipedia.org/wiki/James_Webb_Space_Telescope...
Fetching content from https://en.wikipedia.org/wiki/Space_Shuttle...
Fetching content from https://en.wikipedia.org/wiki/Artemis_program...
Fetching content from https://en.wikipedia.org

In [15]:
# Open the file and read the first 20 lines
with open('D:/RAG_Rothman/Chapter02/llm_with_metadata.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()
    # Print the first 20 lines
    for line in lines[:20]:
        print(line.strip())

Source URL: https://en.wikipedia.org/wiki/Exploration_of_Mars
Not to be confused with Human mission to Mars or Colonization of Mars . Self-portrait of Perseverance rover and Ingenuity helicopter (to the left) located at Wright Brothers Field, the Ingenuity helicopter drop site (7 April 2021) Active Mars missions, 1997 to present α Year Number of missions 1997 2 1998 1 1999 1 2000 1 2001 2 2002 2 2003 3 2004 5 2005 5 2006 6 2007 5 2008 6 2009 5 2010 5 2011 4 2012 5 2013 5 2014 7 2015 7 2016 8 2017 8 2018 9 2019 8 2020 8 2021 11 2022 11 2023 10 The planet Mars has been explored remotely by spacecraft. Probes sent from Earth, beginning in the late 20th century, have yielded a large increase in knowledge about the Martian system, focused primarily on understanding its geology and habitability potential. [ 1 ] [ 2 ] Engineering interplanetary journeys is complicated and the exploration of Mars has experienced a high failure rate, especially the early attempts. Roughly sixty percent of all s