# Browsing Agent
This notebook searches the internet for content such as PDFs and downloads it. An LLM then decides which files to keep. Finally, a vector index of the kept files is created for easy retrieval-augmented generation (RAG).

In [None]:
#imports
%load_ext autoreload
%autoreload 2
import os
from open_agent import OpenAgent
from config import Config
from IPython.display import display
import numpy as np
from transformers import CLIPProcessor, CLIPModel
import requests
from googlesearch import search
import PyPDF2
import faiss
import regex as re


In [None]:
# Search the web and keep only the hits that point straight at a PDF
def search_pdf_urls(query, num_results=10):
    """Return the search hits for ``query`` whose URL ends in ``.pdf``.

    Args:
        query: Search string handed to ``googlesearch.search``.
        num_results: Number of results requested from the search call.

    Returns:
        A list of URLs, in the order the search produced them, whose
        lower-cased form ends with ``.pdf``.
    """
    hits = search(query, num_results=num_results)
    # The extension test is case-insensitive, so ".PDF" links are kept too.
    return [hit for hit in hits if hit.lower().endswith(".pdf")]

In [None]:
# Function to download a PDF file from a given URL
def download_pdf(url, dest_folder="downloads", timeout=30):
    """Download ``url`` into ``dest_folder`` and return the local file path.

    Args:
        url: Address of the file to download.
        dest_folder: Directory that receives the file; created when missing.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The path of the downloaded file, or ``None`` when the download failed
        (the error is printed rather than raised, so a batch can continue).
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import unquote, urlparse

    os.makedirs(dest_folder, exist_ok=True)

    # Derive the file name from the URL *path* only, so query strings and
    # fragments ("file.pdf?download=1") do not end up in the name, and fall
    # back to a fixed name when the path ends with "/".
    file_name = os.path.basename(unquote(urlparse(url).path)) or "download.pdf"
    local_filename = os.path.join(dest_folder, file_name)

    started_writing = False
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                started_writing = True
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        # Do not leave a truncated file behind: later cells pick up every
        # PDF in the folder. Only remove it if this call created/truncated it.
        if started_writing and os.path.exists(local_filename):
            try:
                os.remove(local_filename)
            except OSError:
                pass
        return None

    print(f"Downloaded: {local_filename}")
    return local_filename

In [None]:
# Function to extract metadata (e.g., creation date) from the PDF
def extract_pdf_metadata(pdf_path):
    metadata = {}
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            metadata = reader.metadata
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return metadata

In [None]:
# Function to extract text from the first few pages of the PDF
def extract_pdf_text(pdf_path, max_pages=3):
    text = ""
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            num_pages = len(reader.pages)
            pages_to_read = min(num_pages, max_pages)
            for i in range(pages_to_read):
                page = reader.pages[i]
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
    return text

In [None]:
# Shared agent instance; later cells call agent.chat(...) and agent.get_embedding(...).
agent = OpenAgent(api_key=Config.api_key)

In [None]:
def analyze_pdf(pdf_text, context):
    """Ask the LLM whether a PDF's text is relevant to ``context``.

    Args:
        pdf_text: Text extracted from the PDF.
        context: Description of what a relevant document looks like.

    Returns:
        ``True`` or ``False`` when the model answers with that word, and
        ``None`` when the answer cannot be interpreted (a message is printed
        in that case). ``None`` is falsy, so such documents are not kept by
        callers that test the result with ``if``.
    """
    # Build our XML prompt
    prompt = f"""
        <prompt>
            <objective>Given the following document text as extracted from a PDF, decide if it is relevant to the context or not</objective>
            <instuctions>
                <instruction>
                    Analyze the document text in the section "document_text" and the context in the section "context" and decide if the document is relevant to the context or not.
                </instruction>
                <instruction>
                    It is important that the brand name is the same in the document and the context.
                </instruction>
                <instruction>
                    Answer with "True" if the document is relevant to the context, and "False" if it is not.
                </instruction>
            </instuctions>
            <document_text>
                {pdf_text}
            </document_text>
            <context>
                {context}   
            </context>

        </prompt>"""

    # Send the prompt to the LLM through the shared agent
    response = agent.chat(text=prompt)

    if not isinstance(response, str):
        print(f"Error extracting response: expected text, got {type(response).__name__}")
        return None

    # Models frequently wrap the verdict in whitespace, quotes or a trailing
    # period ("True.", '"False"\n'); normalise before comparing.
    verdict = response.strip().strip("\"'`.!").lower()
    if verdict == "true":
        return True
    if verdict == "false":
        return False

    print(f"Error extracting response: unexpected answer {response!r}")
    return None


 

In [None]:
# Product to search for. The manufacturer is spelled "Mölnlycke" (same
# spelling as the `company` value used when filtering); a misspelled brand
# weakens both the web search and the LLM's brand-name check.
brand = "mölnlycke mepilex border flex"
#brand = "mölnlycke Mepilex Border Heel"
#brand = "Essity Libero Touch"
# Kind of document we are looking for; appended to the search query.
product_type = "product sheet"


In [None]:
# Build the search query; the filetype operator narrows the results to PDFs.
query = f"{brand} {product_type} filetype:pdf"
print("Searching for PDFs...")
pdf_urls = search_pdf_urls(query, num_results=10)
print(f"Found {len(pdf_urls)} PDF URLs.")

relevant_pdfs = []
# Download every hit. download_pdf prints its own status and returns None
# on failure, so no extra handling is needed per URL.
for url in pdf_urls:
    print(f"\nProcessing URL: {url}")
    file_path = download_pdf(url)
    
   

In [None]:
# Collect the PDF files from the downloads folder
folder = "downloads"
# Prefix each file name with the folder. The extension check is
# case-insensitive (".PDF" counts), the result is sorted so reruns process
# files in the same order, and a missing folder yields an empty list
# instead of an error.
pdf_files = (
    sorted(
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.lower().endswith(".pdf")
    )
    if os.path.isdir(folder)
    else []
)
pdf_files

# Filter the PDFs
See if the data is valid for our use case.

In [None]:
# Manufacturer the product sheet must come from
company = "Mölnlycke"
#company = "Essity"
# Description handed to the LLM so it can judge each document
context = f"We want to match a product with the brand name: {brand} to the product sheet (pdf). The manufacturer should be {company}. The pdf should be a product sheet."
print(context)

# Keep only the PDFs the LLM judges relevant to the context
relevant_pdfs = []
for file_path in pdf_files:
    # Read up to the first 10 pages of the document
    pdf_text = extract_pdf_text(file_path, max_pages=10)

    # Ask the LLM for a verdict and report it
    is_relevant = analyze_pdf(pdf_text, context)
    print(f"{file_path} Is Relevant: {is_relevant}")
    if not is_relevant:
        continue
    relevant_pdfs.append(file_path)

# Query the data
Ask questions about the data using the PDFs and an LLM


In [None]:
# Put the text of all relevant PDFs into one prompt and ask the main question
main_query = 'Extract the available sizes from the product sheet, only answer with information that belongs to Mepilex Border Flex. Answer with a table.'
#main_query = 'Extract the available sizes from the product sheet together with their article number. Answer with a table.'

# Build one XML-like <file> element per relevant PDF.
# Read the same number of pages (10) as the relevance filter did; the
# extract_pdf_text default of 3 pages can cut off size tables that appear
# later in a product sheet.
file_blocks = []
for file_path in relevant_pdfs:
    file_name = os.path.basename(file_path)
    file_content = extract_pdf_text(file_path, max_pages=10)
    file_blocks.append(
        f"<file>\n  <name>{file_name}</name>\n  <file_content>{file_content}</file_content>\n</file>\n"
    )
files_xml = "".join(file_blocks)


query = f"""
    <prompt>
        <objective>Answer the main query by extracting the information from the product sheets. The main quary is in "main_query" and the content is in "content". Each file has its own "file" where "name" is the name of the pdf file, the "file_content" is the actual pdf text.</objective>
        <main_query>
            {main_query}
        </main_query>
        <instruction>
            For each piece of extracted information, provide the source file name.
        </instruction>
        <content>
            {files_xml}
        </content>
    </prompt>
"""

# Send the assembled prompt to the LLM and show its answer
response = agent.chat(text=query)
print(response)

# Create a vector index of the relevant documents

In [None]:
# Embed a throwaway string; the next cell uses the length of this vector
# as the dimensionality of the index.
text = 'hello world 42'
embedding = agent.get_embedding(text)
print(embedding)

In [None]:
# The index dimensionality is taken from the sample embedding computed above.
embedding_dim = len(embedding)
# Create a flat faiss index that ranks results by inner product.
# NOTE(review): inner product equals cosine similarity only for unit-length
# vectors — confirm whether agent.get_embedding returns normalised vectors.
index = faiss.IndexFlatIP(embedding_dim)

In [None]:
def chunk_text(text, max_length=500):
    """
    Split text into chunks of at most max_length characters.

    Sentences (split after '.', '!' or '?') are packed greedily into chunks;
    the single space used to join two sentences counts toward the limit.
    A sentence longer than max_length is cut into max_length-sized pieces so
    that no chunk ever exceeds the limit. Empty chunks are never produced.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of non-empty, whitespace-trimmed chunks in document order.

    Raises:
        ValueError: If max_length is not positive.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")

    chunks = []
    current_chunk = ""
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        sentence = sentence.strip()

        # Hard-split sentences that cannot fit into a single chunk.
        while len(sentence) > max_length:
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""
            piece = sentence[:max_length].strip()
            if piece:
                chunks.append(piece)
            sentence = sentence[max_length:].strip()

        if not sentence:
            continue

        # Measure the joined candidate so the separator is accounted for.
        candidate = f"{current_chunk} {sentence}" if current_chunk else sentence
        if len(candidate) <= max_length:
            current_chunk = candidate
        else:
            chunks.append(current_chunk)
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk)
    return chunks

In [None]:
def embed_text(text, file_name):
    """Embed each non-empty chunk of ``text`` and record where it came from.

    Returns a pair ``(embeddings, documents)`` of equal length: the embedding
    of every non-empty chunk, and a matching dict with the keys ``"file"``,
    ``"chunk_index"`` (position of the chunk in the chunk list) and
    ``"text"``.
    """
    vectors = []
    records = []  # one metadata record per embedded chunk
    for position, piece in enumerate(chunk_text(text)):
        if not piece:
            # Empty chunks are not embedded.
            continue
        vectors.append(agent.get_embedding(piece))
        records.append({"file": file_name, "chunk_index": position, "text": piece})
    return vectors, records

In [None]:
# Embed every relevant PDF, not just whatever `pdf_text` / `file_path`
# happened to hold after earlier loops (those two can even refer to
# different files). Row i of `embeddings` always corresponds to
# `documents[i]`.
embeddings, documents = [], []
for file_path in relevant_pdfs:
    pdf_text = extract_pdf_text(file_path, max_pages=10)
    file_embeddings, file_documents = embed_text(pdf_text, file_path)
    embeddings.extend(file_embeddings)
    documents.extend(file_documents)

In [None]:
# Display the chunk records (file, chunk_index, text) produced by embed_text.
documents

In [None]:
# Add the embeddings to the index
embeddings = np.array(embeddings)
index.add(embeddings)

In [None]:
query = "Available Sizes"
# Embed the query; faiss expects a float32 array of shape (n_queries, dim)
query_embedding = agent.get_embedding(query)
query_matrix = np.asarray([query_embedding], dtype=np.float32)
# Search the index
k = 3
D, I = index.search(query_matrix, k)
# Display the search results
for rank, (score, doc_idx) in enumerate(zip(D[0], I[0]), start=1):
    if doc_idx < 0:
        # faiss pads the result with -1 when the index holds fewer than k
        # vectors; documents[-1] would silently show the wrong chunk.
        continue
    hit = documents[doc_idx]
    print(f"Result {rank}")
    # IndexFlatIP returns inner products: higher means more similar.
    print(f"Similarity (inner product): {score}")
    print(f"Document: {hit['file']}")
    print(f"Chunk Index: {hit['chunk_index']}")
    print(f"Text: {hit['text']}")
    print()

In [None]:
# Display the list of PDFs currently treated as relevant.
relevant_pdfs

In [None]:
# Treat every downloaded PDF as relevant, replacing the LLM-filtered list.
# NOTE(review): looks like a debugging shortcut — confirm before relying on it.
relevant_pdfs = pdf_files