### Import dependencies

In [5]:
# import basics
import os
import glob
from dotenv import load_dotenv

# import langchain
from langchain_community.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import SupabaseVectorStore
from langchain_openai import OpenAIEmbeddings

# import supabase
from supabase.client import Client, create_client

### Load environmental variables

In [6]:
# load environment variables
load_dotenv()  

# initiate supabase db
supabase_url = os.environ.get("SUPABASE_URL")
supabase_key = os.environ.get("SUPABASE_SERVICE_KEY")
supabase: Client = create_client(supabase_url, supabase_key)

# check env variables
print(f'SUPABASE_URL: {supabase_url}')
print(f'SUPABASE_SERVICE_KEY: {supabase_key}')

SUPABASE_URL: https://tnjclyviduvzqjkifjch.supabase.co
SUPABASE_SERVICE_KEY: eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InRuamNseXZpZHV2enFqa2lmamNoIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NDE4MDkwNzUsImV4cCI6MjA1NzM4NTA3NX0.iZbBenrBARtcUX5qn_AP8iL23tDH-bZ6TJr21ml0OvA


### Create embeddings and loader objects

In [7]:
# initiate embeddings model
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")


### Load documents

In [8]:
# Define the directory to search
search_directory = "/home/cgorricho/apps/obenai/RAG-chatbot-OBEN/Manuales/BOPP_PER"

# Use glob to find all files that contain "GB" in the name
files = glob.glob(os.path.join(search_directory, '**'), recursive=True)

files = files[1:]

files

['/home/cgorricho/apps/obenai/RAG-chatbot-OBEN/Manuales/BOPP_PER/1597_USD_WIN_en-GB.pdf',
 '/home/cgorricho/apps/obenai/RAG-chatbot-OBEN/Manuales/BOPP_PER/1597_USD_ManufacturerList_en-GB.pdf',
 '/home/cgorricho/apps/obenai/RAG-chatbot-OBEN/Manuales/BOPP_PER/1597_USD_Plant_Overview_A3_en-GB.pdf',
 '/home/cgorricho/apps/obenai/RAG-chatbot-OBEN/Manuales/BOPP_PER/1597_USD_CAS_PP_en-GB.pdf',
 '/home/cgorricho/apps/obenai/RAG-chatbot-OBEN/Manuales/BOPP_PER/1597_USD_plant_operation_en-GB.pdf',
 '/home/cgorricho/apps/obenai/RAG-chatbot-OBEN/Manuales/BOPP_PER/1598_USE_Interval_en-GB.pdf',
 '/home/cgorricho/apps/obenai/RAG-chatbot-OBEN/Manuales/BOPP_PER/1597_USD_Auxiliary_f800_en-GB.pdf',
 '/home/cgorricho/apps/obenai/RAG-chatbot-OBEN/Manuales/BOPP_PER/1597_USD_MDO_en-GB.pdf',
 '/home/cgorricho/apps/obenai/RAG-chatbot-OBEN/Manuales/BOPP_PER/1597_USD_FOK446_en-GB.pdf',
 '/home/cgorricho/apps/obenai/RAG-chatbot-OBEN/Manuales/BOPP_PER/1598_USE_MDO_en-GB.pdf',
 '/home/cgorricho/apps/obenai/RAG-chatb

### Create vector store

In [None]:

for file in files:
    
    print(f'Loading file: {file.split('/')[-1]}...')
    
    # instatiate document loader
    loader = PyPDFLoader(file)

    # split the documents in multiple chunks
    documents = loader.load()

    print(f'File {file.split('/')[-1]} has {len(documents)} pages')

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = text_splitter.split_documents(documents)

    print(f'File {file.split('/')[-1]} has {len(docs)} chunks')

    print(f'Uploading {file.split('/')[-1]} to vector store...')
    
    # store chunks in vector store
    vector_store = SupabaseVectorStore.from_documents(
        docs,
        embeddings,
        client=supabase,
        table_name="oben_bopp",
        query_name="match_documents",
        chunk_size=1000,
    )

    print(f'File {file.split('/')[-1]} uploaded to vector store')

    print('********* NEXT FILE *********')

Loading file: 1597_USD_WIN_en-GB.pdf...
File 1597_USD_WIN_en-GB.pdf has 116 pages
File 1597_USD_WIN_en-GB.pdf has 165 chunks
Uploading /home/cgorricho/apps/obenai/RAG-chatbot-OBEN/Manuales/BOPP_PER/1597_USD_WIN_en-GB.pdf to vector store...
File 1597_USD_WIN_en-GB.pdf uploaded to vector store
********* NEXT FILE *********
Loading file: 1597_USD_ManufacturerList_en-GB.pdf...
File 1597_USD_ManufacturerList_en-GB.pdf has 22 pages
File 1597_USD_ManufacturerList_en-GB.pdf has 33 chunks
Uploading /home/cgorricho/apps/obenai/RAG-chatbot-OBEN/Manuales/BOPP_PER/1597_USD_ManufacturerList_en-GB.pdf to vector store...
File 1597_USD_ManufacturerList_en-GB.pdf uploaded to vector store
********* NEXT FILE *********
Loading file: 1597_USD_Plant_Overview_A3_en-GB.pdf...
File 1597_USD_Plant_Overview_A3_en-GB.pdf has 14 pages
File 1597_USD_Plant_Overview_A3_en-GB.pdf has 14 chunks
Uploading /home/cgorricho/apps/obenai/RAG-chatbot-OBEN/Manuales/BOPP_PER/1597_USD_Plant_Overview_A3_en-GB.pdf to vector store.

### Explore vector store

In [12]:
import os
from supabase import create_client, Client

# load environment variables
load_dotenv()

# Initialize Supabase client
url: str = os.getenv("SUPABASE_URL")
key: str = os.getenv("SUPABASE_SERVICE_KEY")
supabase: Client = create_client(url, key)

# Query the documents table
response = supabase.table('documents').select('*').execute()


In [59]:
for document in response.data:
    print(document)

{'id': 'aa8c3c36-ad0b-4909-815f-ec410dc3f6e7', 'content': 'AGENTIC RETRIEVAL -AUGMENTED GENERATION : A S URVEY ON\nAGENTIC RAG\nAditi Singh\nDepartment of Computer Science\nCleveland State University\nCleveland, OH, USA\na.singh22@csuohio.edu\nAbul Ehtesham\nThe Davey Tree Expert Company\nKent, OH, USA\nabul.ehtesham@davey.com\nSaket Kumar\nThe MathWorks Inc\nNatick, MA, USA\nsaketk@mathworks.com\nTala Talaei Khoei\nKhoury College of Computer Science\nRoux Institute at Northeastern University\nPortland, ME, USA\nt.talaeikhoei@northeastern.edu\nABSTRACT\nLarge Language Models (LLMs) have revolutionized artificial intelligence (AI) by enabling human-\nlike text generation and natural language understanding. However, their reliance on static training\ndata limits their ability to respond to dynamic, real-time queries, resulting in outdated or inaccurate\noutputs. Retrieval-Augmented Generation (RAG) has emerged as a solution, enhancing LLMs by\nintegrating real-time data retrieval to prov

In [60]:
response.data

[{'id': 'aa8c3c36-ad0b-4909-815f-ec410dc3f6e7',
  'content': 'AGENTIC RETRIEVAL -AUGMENTED GENERATION : A S URVEY ON\nAGENTIC RAG\nAditi Singh\nDepartment of Computer Science\nCleveland State University\nCleveland, OH, USA\na.singh22@csuohio.edu\nAbul Ehtesham\nThe Davey Tree Expert Company\nKent, OH, USA\nabul.ehtesham@davey.com\nSaket Kumar\nThe MathWorks Inc\nNatick, MA, USA\nsaketk@mathworks.com\nTala Talaei Khoei\nKhoury College of Computer Science\nRoux Institute at Northeastern University\nPortland, ME, USA\nt.talaeikhoei@northeastern.edu\nABSTRACT\nLarge Language Models (LLMs) have revolutionized artificial intelligence (AI) by enabling human-\nlike text generation and natural language understanding. However, their reliance on static training\ndata limits their ability to respond to dynamic, real-time queries, resulting in outdated or inaccurate\noutputs. Retrieval-Augmented Generation (RAG) has emerged as a solution, enhancing LLMs by\nintegrating real-time data retrieval to p

In [61]:
# Query the documents table for rows where metadata contains 'page': 1
response = supabase.table('documents').select('*').eq('metadata->page', 1).execute()


In [63]:
type(response)

postgrest.base_request_builder.APIResponse[TypeVar]

In [66]:
type(response.data[0])

dict

In [65]:
response.data[0]

{'id': '595e789b-ab0a-4197-8856-b90f5f873714',
 'content': '1 Introduction\nLarge Language Models (LLMs) [1, 2] [3], such as OpenAI’s GPT-4, Google’s PaLM, and Meta’s LLaMA, have signifi-\ncantly transformed artificial intelligence (AI) with their ability to generate human-like text and perform complex natural\nlanguage processing tasks. These models have driven innovation across diverse domains, including conversational\nagents [4], automated content creation, and real-time translation. Recent advancements have extended their capabilities\nto multimodal tasks, such as text-to-image and text-to-video generation [5], enabling the creation and editing of videos\nand images from detailed prompts [6], which broadens the potential applications of generative AI.\nDespite these advancements, LLMs face significant limitations due to their reliance on static pre-training data. This\nreliance often results in outdated information, hallucinated responses [7], and an inability to adapt to dynamic,

In [67]:
response.data[0]['metadata']

{'page': 1, 'source': 'documents/agentic rag paper.pdf'}

In [69]:
type(response.data[0]['metadata'])

dict

In [70]:
response.data[0]['metadata']['page']

1

In [68]:
response.data[0]['content']

'1 Introduction\nLarge Language Models (LLMs) [1, 2] [3], such as OpenAI’s GPT-4, Google’s PaLM, and Meta’s LLaMA, have signifi-\ncantly transformed artificial intelligence (AI) with their ability to generate human-like text and perform complex natural\nlanguage processing tasks. These models have driven innovation across diverse domains, including conversational\nagents [4], automated content creation, and real-time translation. Recent advancements have extended their capabilities\nto multimodal tasks, such as text-to-image and text-to-video generation [5], enabling the creation and editing of videos\nand images from detailed prompts [6], which broadens the potential applications of generative AI.\nDespite these advancements, LLMs face significant limitations due to their reliance on static pre-training data. This\nreliance often results in outdated information, hallucinated responses [7], and an inability to adapt to dynamic, real-world'

In [None]:
response.data[0]['embedding']

'[0.018955627,0.015946027,0.054172806,-0.009356459,-0.00067844946,-0.05276509,-0.008852837,0.003886389,-0.004405181,0.054221347,0.023809822,-0.04553234,-0.045459528,-0.027377654,-0.0153392535,-0.034173526,-0.01623728,-0.023397215,0.012499549,-0.004957346,0.015484879,0.005014989,-0.036066663,-0.014732479,0.0072934264,-0.0059767263,-0.019792976,0.047959436,-0.022402106,-0.00881643,0.0379598,-0.018664377,-0.03458613,0.008786092,-0.017669266,0.045216817,0.02389477,0.024222428,0.011832098,0.022474919,0.004295962,-0.06121139,0.0011794175,0.026916506,-0.041187838,0.046430364,0.008500908,0.015836809,-0.019853653,0.035872493,-0.03203768,0.022402106,0.007827388,-0.014611124,-0.037595734,-0.011025088,-0.0013295941,-0.008500908,-0.0083795525,-0.004802618,0.0044385535,-0.030629965,0.015739724,0.01836099,-0.062182225,-0.031600803,-0.05786199,0.007657491,-0.015230034,-0.0014054409,-0.0024574357,-0.013676692,-0.041600443,0.026940776,-0.04669735,0.014562582,-0.022305021,0.041891694,-0.013130595,0.00625

In [72]:
type(response.data[0]['embedding'])

str

In [75]:
response.data[0]['embedding'].strip('][').split(',')

['0.018955627',
 '0.015946027',
 '0.054172806',
 '-0.009356459',
 '-0.00067844946',
 '-0.05276509',
 '-0.008852837',
 '0.003886389',
 '-0.004405181',
 '0.054221347',
 '0.023809822',
 '-0.04553234',
 '-0.045459528',
 '-0.027377654',
 '-0.0153392535',
 '-0.034173526',
 '-0.01623728',
 '-0.023397215',
 '0.012499549',
 '-0.004957346',
 '0.015484879',
 '0.005014989',
 '-0.036066663',
 '-0.014732479',
 '0.0072934264',
 '-0.0059767263',
 '-0.019792976',
 '0.047959436',
 '-0.022402106',
 '-0.00881643',
 '0.0379598',
 '-0.018664377',
 '-0.03458613',
 '0.008786092',
 '-0.017669266',
 '0.045216817',
 '0.02389477',
 '0.024222428',
 '0.011832098',
 '0.022474919',
 '0.004295962',
 '-0.06121139',
 '0.0011794175',
 '0.026916506',
 '-0.041187838',
 '0.046430364',
 '0.008500908',
 '0.015836809',
 '-0.019853653',
 '0.035872493',
 '-0.03203768',
 '0.022402106',
 '0.007827388',
 '-0.014611124',
 '-0.037595734',
 '-0.011025088',
 '-0.0013295941',
 '-0.008500908',
 '-0.0083795525',
 '-0.004802618',
 '0.00443