# Retrieval Augmented Generation (RAG)
Building the Journal page navigator.

In [None]:
%pip install langchain
%pip install python-dotenv
%pip install lark
%pip install chromadb
%pip install tiktoken

In [2]:
## Initialize OpenAI API
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']
print('ready!')

ready!


## Document Loading


In [3]:
## Document Loading
from langchain.document_loaders import WebBaseLoader
import requests
from bs4 import BeautifulSoup

# Define the base URL
base_url = "https://antfriend.github.io/journals"

# Function to get all HTML document URLs from the base URL
def get_document_urls(base_url):
    page = requests.get(base_url)
    soup = BeautifulSoup(page.content, 'html.parser')
    links = soup.find_all('a')

    # Check if href is not None and ends with '.html'
    # An anchor tag with an href of None is a link to the current page
    document_urls = [link.get('href') for link in links if link.get('href') and link.get('href').endswith('.html')]
    document_urls = [base_url + '/' + url for url in document_urls]
    return document_urls

# Get all document URLs
document_urls = get_document_urls(base_url)
print(len(document_urls), 'journal pages from:')
print(document_urls[0])
print('to:')
print(document_urls[-1])

# Load and process each document
documents = []
for url in document_urls:
    # Initialize the WebBaseLoader
    loader = WebBaseLoader(url)
    document = loader.load()
    documents.append(document)

# `documents` now contains the loaded HTML content from each URL
print(len(documents), 'documents loaded')
print(documents[0][:100], '...')


13 journal pages from:
https://antfriend.github.io/journals/waves_15.html
to:
https://antfriend.github.io/journals/vectors_32.html
13 documents loaded
[Document(page_content='\n\n\n\n\n\nTitle: "Entangled Minds: Quantum Observations in the Stardust" journal page\n\n\n\nJournals Home \n\n\n\n\n\n"Entangled Minds: Quantum Observations in the Stardust"\nScientist with a sense of humor summary:\nOur brains are like glow-in-the-dark toys, and by peeking at the stars, we might just be playing cosmic tag with light from eons ago.\nFourth grader summary:\nImagine your brain lighting up like a firefly when you look at stars — it\'s like making friends with light from way back in the past!\nDetailed scientific explanation:\nThe journal page appears to express a poetic and metaphorical connection between human observation and the universe through the lens of quantum physics. The writer suggests that by looking at a star, there is an instantaneous, albeit metaphysical, link created between the obs

## Splitting

In [11]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)
splits_collection = []
for doc in documents:
    try:
        splits = text_splitter.split_documents(doc)
        splits_collection.append(splits)
        #print(len(splits), 'documents split')
        #print(splits[0])
    except:
        print('Error splitting documents')
        print('Try reducing chunk_size and chunk_overlap')
        raise
print(len(splits_collection), 'documents split')
print('into a total of', sum([len(splits) for splits in splits_collection]), 'splits')
print('each split resembles:', splits_collection[0][0])

embeddings = OpenAIEmbeddings()
embeddings.embed_documents
embeddings_collection_of_collections = []
embeddings_collection = []

for splits in splits_collection:
    #print(' ')
    # Generating embeddings for each split
    for split in splits:
        # print(split.page_content[:100], '...')
        embed = embeddings.embed_documents(split.page_content)
        embeddings_collection.append(embed)

    # `embeddings_collection` now contains the embeddings for each split
    embeddings_collection_of_collections.append(embeddings_collection)
print(' ')
print(len(embeddings_collection_of_collections), 'documents embedded') # 1 per document

13 documents split
into a total of 41 splits
each split resembles: page_content='Title: "Entangled Minds: Quantum Observations in the Stardust" journal page\n\n\n\nJournals Home' metadata={'source': 'https://antfriend.github.io/journals/waves_15.html', 'title': 'Title: "Entangled Minds: Quantum Observations in the Stardust" journal page', 'language': 'No language found.'}
 
13 documents embedded


### Vector Embeddings on splits

In [14]:
print(' ')
embeddings_collection_of_collections[0][0][0][:10] # 1 per split
# embedding1 = embedding.embed_query(sentence1)

 


[-0.003934556732305011,
 -0.014521354298720337,
 0.013507292997201745,
 -0.005266356900147807,
 -0.028583000310713525,
 0.008213894679655383,
 -0.020389387974675634,
 -0.01650891314584513,
 -0.005790288727819512,
 -0.018077327462155166]

## Storage


In [21]:
## Storage
from langchain.vectorstores import Chroma
#from langchain.embeddings.openai import OpenAIEmbeddings
persist_directory = 'docs/chroma/'
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings_collection_of_collections)

## Retrieval

In [22]:
# embedding = OpenAIEmbeddings()
# vectordb = Chroma(
#     persist_directory=persist_directory,
#     embedding_function=embedding
# )
print(vectordb._collection.count())

0


## Output

In [None]:
## Output
