# Browsing Agent
This notebook will use the internet to download content like pdfs. Then an LLM will decide which files to keep. Next it will create a vector index of the files, for easy RAG.

In [11]:
#imports
%load_ext autoreload
%autoreload 2
import os
from open_agent import OpenAgent
from config import Config
from IPython.display import display
import numpy as np
from transformers import CLIPProcessor, CLIPModel
import requests
from googlesearch import search
import PyPDF2
import faiss
import regex as re


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Function to search for PDF URLs using a query
def search_pdf_urls(query, num_results=10):
    pdf_urls = []
    for url in search(query, num_results=num_results):
        # Optionally check if the URL really ends with '.pdf'
        if url.lower().endswith(".pdf"):
            pdf_urls.append(url)
    return pdf_urls

In [13]:
# Function to download a PDF file from a given URL
def download_pdf(url, dest_folder="downloads"):
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)
    local_filename = os.path.join(dest_folder, url.split("/")[-1])
    try:
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        print(f"Downloaded: {local_filename}")
        return local_filename
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        return None

In [14]:
# Function to extract metadata (e.g., creation date) from the PDF
def extract_pdf_metadata(pdf_path):
    metadata = {}
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            metadata = reader.metadata
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return metadata

In [15]:
# Function to extract text from the first few pages of the PDF
def extract_pdf_text(pdf_path, max_pages=3):
    text = ""
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            num_pages = len(reader.pages)
            pages_to_read = min(num_pages, max_pages)
            for i in range(pages_to_read):
                page = reader.pages[i]
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
    return text

In [16]:
agent = OpenAgent(api_key=Config.api_key)

In [17]:
def analyze_pdf(pdf_text, context):
    # Analyze the pdf text together with the context.
    # Next we try to see if it is relevant to the context or not
    # Build our XML prompt
    prompt = f"""
        <prompt>
            <objective>Given the following document text as extracted from a PDF, decide if it is relevant to the context or not</objective>
            <instuctions>
                <instruction>
                    Analyze the document text in the section "document_text" and the context in the section "context" and decide if the document is relevant to the context or not.
                </instruction>
                <instruction>
                    It is important that the brand name is the same in the document and the context.
                </instruction>
                <instruction>
                    Answer with "True" if the document is relevant to the context, and "False" if it is not.
                </instruction>
            </instuctions>
            <document_text>
                {pdf_text}
            </document_text>
            <context>
                {context}   
            </context>

        </prompt>"""
    
    # Send the prompt to the OpenAI API
    response = agent.chat(text=prompt)
    # Try to extract the response as True or False
    try:
        output = response.lower()
        if output == "true":
            return True
        elif output == "false":
            return False
    except Exception as e:
        print(f"Error extracting response: {e}")


 

In [18]:
brand = "mölnycke mepilex border flex"
#brand = "mölnycke Mepilex Border Heel"
product_type = "product sheet"


In [19]:
# Construct the search query. The filetype operator helps to target PDFs.
query = f"{brand} {product_type} filetype:pdf"
print("Searching for PDFs...")
pdf_urls = search_pdf_urls(query, num_results=10)
print(f"Found {len(pdf_urls)} PDF URLs.")

relevant_pdfs = []
# Process each found PDF
for url in pdf_urls:
    print(f"\nProcessing URL: {url}")
    file_path = download_pdf(url)
    if not file_path:
        continue
    
   

Searching for PDFs...
Found 7 PDF URLs.

Processing URL: https://www.molnlycke.com/globalassets/mepilex-border-flex-hqim005406.pdf
Downloaded: downloads\mepilex-border-flex-hqim005406.pdf

Processing URL: https://www.molnlycke.ca/contentassets/3a0ad2ec58d848169ae7391bb4370689/mepilexborderflex_productsheet_eng-1.pdf
Downloaded: downloads\mepilexborderflex_productsheet_eng-1.pdf

Processing URL: https://www.onemed.se/-/media/onemed/b2b/ligu/molnlycke/broschyr---mepilex-border-flex.pdf
Downloaded: downloads\broschyr---mepilex-border-flex.pdf

Processing URL: https://static.webareacontrol.com/CommonFile/mepilexborderflex-product-sheet-1647325121893.pdf
Downloaded: downloads\mepilexborderflex-product-sheet-1647325121893.pdf

Processing URL: https://www.molnlycke.lat/SysSiteAssets/master-and-local-markets/documents/master/wound-care-documents/ifu/pd-571466_01-ifu-mepilex-border-flex-lite.pdf
Downloaded: downloads\pd-571466_01-ifu-mepilex-border-flex-lite.pdf

Processing URL: https://www.mol

In [20]:
# Extract the pdf files from the downloads folder
folder = "downloads"
# append the folder to the file path
pdf_files = [os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(".pdf")]
pdf_files

['downloads\\860717d3e8b7e304716c715e0e22f787.pdf',
 'downloads\\broschyr---mepilex-border-flex.pdf',
 'downloads\\ifu-40900-mepilex-border-flex-em_01.pdf',
 'downloads\\mepilex-border-flex-hqim005406.pdf',
 'downloads\\mepilexborderflex-product-sheet-1647325121893.pdf',
 'downloads\\mepilexborderflex_productsheet_eng-1.pdf',
 'downloads\\pd-571466_01-ifu-mepilex-border-flex-lite.pdf']

# Filter the PDFs
See if the data is valid for our use case.

In [21]:
size = "10 x 10 cm"
company = "Mölnlycke"
context = f"We want to match a product with the brand name: {brand} to the product sheet (pdf). The manufacturer should be {company}. We are looking for a product with size: {size}."
print(context)
relevant_pdfs = []
for file_path in pdf_files:
    #print(f'Analyzing PDF: {file_path}')
    # Extract metadata such as creation date
    # metadata = extract_pdf_metadata(file_path)
    # creation_date = metadata.get("/CreationDate", "Unknown")
    # print(f"Creation Date (from metadata): {creation_date}")

    # Extract text from the PDF 
    pdf_text = extract_pdf_text(file_path, max_pages=10)

    # See if the PDF is relevant to the context
    is_relevant = analyze_pdf(pdf_text, context)
    print(f"{file_path} Is Relevant: {is_relevant}")
    if is_relevant:
        relevant_pdfs.append(file_path)

We want to match a product with the brand name: mölnycke mepilex border flex to the product sheet (pdf). The manufacturer should be Mölnlycke. We are looking for a product with size: 10 x 10 cm.
downloads\860717d3e8b7e304716c715e0e22f787.pdf Is Relevant: True
downloads\broschyr---mepilex-border-flex.pdf Is Relevant: True
downloads\ifu-40900-mepilex-border-flex-em_01.pdf Is Relevant: False
downloads\mepilex-border-flex-hqim005406.pdf Is Relevant: True
downloads\mepilexborderflex-product-sheet-1647325121893.pdf Is Relevant: True
downloads\mepilexborderflex_productsheet_eng-1.pdf Is Relevant: True
downloads\pd-571466_01-ifu-mepilex-border-flex-lite.pdf Is Relevant: False


# Query the data
Ask questions about the data using the PDFs and an LLM


In [None]:
# Take the full text of all relevant pdfs and add them in a RAG response
main_query = 'Extract the available sizes from the product sheet, only answer with information that belongs to Mepilex Border Flex. Answer with a table.'

# Iterate over the relevant PDFs and update the query


files_xml = ""
files_xml = ""
for file_path in relevant_pdfs:
    file_name = os.path.basename(file_path)
    file_content = extract_pdf_text(file_path)
    # Create an XML-like structure for each file.
    files_xml += f"<file>\n  <name>{file_name}</name>\n  <file_content>{file_content}</file_content>\n</file>\n"


query = f"""
    <prompt>
        <objective>Answer the main query by extracting the information from the product sheets. The main quary is in "main_query" and the content is in "content". Each file has its own "file" where "name" is the name of the pdf file, the "file_content" is the actual pdf text.</objective>
        <main_query>
            {main_query}
        </main_query>
        <instruction>
            For each piece of extracted information, provide the source file name.
        </instruction>
        <content>
            {files_xml}
        </content>
    </prompt>
"""

response = agent.chat(text=query)
print(response)

The available sizes for Mepilex Border Flex extracted from the product sheets are:

1. Size: 7.5 x 7.5 cm (3 x 3 inch) - Wound pad size: 4.5 x 4.5 cm (1.8 x 1.8 inch) - Dressings per pack: 5
   - Source: mepilex-border-flex-hqim005406.pdf

2. Size: 10 x 10 cm (4 x 4 inch) - Wound pad size: 6.5 x 6.5 cm (2.6 x 2.6 inch) - Dressings per pack: 5
   - Source: mepilex-border-flex-hqim005406.pdf

3. Size: 12.5 x 12.5 cm (5 x 5 inch) - Wound pad size: 8.5 x 8.5 cm (3.3 x 3.3 inch) - Dressings per pack: 5
   - Source: mepilex-border-flex-hqim005406.pdf

4. Size: 15 x 15 cm (6 x 6 inch) - Wound pad size: 11 x 11 cm (4.3 x 4.3 inch) - Dressings per pack: 5
   - Source: mepilex-border-flex-hqim005406.pdf

5. Size: 15 x 20 cm (6 x 8 inch) - Wound pad size: 11 x 16 cm (4.3 x 6.3 inch) - Dressings per pack: 5
   - Source: mepilex-border-flex-hqim005406.pdf

6. Size: 7.8 x 10 cm (3.1 x 4 inch) - Wound pad size: 15 sq cm (2.3 sq inch) - Dressings per pack: 5
   - Source: mepilex-border-flex-hqim005406

# Create a vector index of the relevant documents

In [20]:
text = 'hello world 42'
embedding = agent.get_embedding(text)
print(embedding)

[-0.003582298057153821, -0.03835277631878853, 0.019874263554811478, -0.004460331052541733, -0.01951015554368496, -0.03094923496246338, -0.011325296014547348, 0.04023400694131851, -0.015497374348342419, 0.002150516724213958, 0.0378369577229023, -0.04250968620181084, -0.009368211962282658, 0.004858574829995632, 0.0022718862164765596, 0.02243819646537304, -0.005215097684413195, 0.0036524648312479258, 0.0009026860352605581, 0.02817290648818016, 0.04341995716094971, -0.02227131277322769, -0.0010202628327533603, -0.003368005156517029, 0.029947936534881592, 0.011423909105360508, -0.03929339349269867, 0.0022737826220691204, 0.021497581154108047, -0.02268093451857567, 0.006136748008430004, -0.042995162308216095, 0.0100888442248106, -0.0134340925142169, 0.026701301336288452, -0.03459032252430916, 0.0013872160343453288, -0.0006480944575741887, -0.0030057928524911404, -0.003986231051385403, -0.037412162870168686, -0.0378369577229023, 0.032344985753297806, 0.019085362553596497, -0.00660326192155480

In [21]:
embedding_dim = len(embedding)
# Create a faiss index
index = faiss.IndexFlatIP(embedding_dim)

In [22]:
def chunk_text(text, max_length=500):
    """
    Splits text into chunks of up to max_length characters.
    This is a simple splitter that uses sentence boundaries.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_length:
            current_chunk += " " + sentence if current_chunk else sentence
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks

In [23]:
def embed_text(text, file_name):
    # Split the text into chunks
    chunks = chunk_text(text)
    # Embed each chunk
    embeddings = []
    documents = [] #The documents for each chunk
    for i, chunk in enumerate(chunks):
        if chunk:
            embedding = agent.get_embedding(chunk)
            embeddings.append(embedding)

            documents.append({
                            "file": file_name,
                            "chunk_index": i,
                            "text": chunk
                        })
    return embeddings, documents

In [24]:
embeddings, documents = embed_text(pdf_text, file_path)

In [25]:
documents

[{'file': 'downloads\\pd-571466_01-ifu-mepilex-border-flex-lite.pdf',
  'chunk_index': 1,
  'text': 'WITH SAFETAC® TECHNOLOGY\nManufacturer / Fabricant\nMölnlycke Health Care AB\nGamlestadsvägen 3C, Box 13080, SE-402 52 Göteborg, Sweden\n\nMepilex® Border Flex Lite ≥ 15x15cm / 6x6in\n1 2 3Mepilex® Border Flex Lite < 15x15cm / 6x6in\n1 2Medical Device\nMedizinprodukt\nDispositif médical\nProducto sanitario\nMedisch hulpmiddel\nMedicinteknisk produkt\nDispositivo medico\nLääkinnällinen laite\nDispositivo médico\nMedicinsk udstyr\nιατροτεχνολογικό προϊόν\nWyrób medyczny\nZdravotnický prostředek\nOrvostechnikai eszköz\nMedisinsk utstyr\nMedicinski pripomoček\nМедицинско изделие\nDispozitiv medical\nZdravotnícka pomôcka\nTıbbi cihaz \nMedicinos priemonė\nMedicīniska ierīce\nMeditsiiniseade\nМедицинское изделие\nMedicinski proizvod\nMedicinsko sredstvo\nLækningatæki\nجهاز طبي\nwww.molnlycke.com/symbolsToll free number :\nUSA 1-800-882-4582\nCanada 1-800-494-5134Australian sponsor address:\nM

In [26]:
# Add the embeddings to the index
embeddings = np.array(embeddings)
index.add(embeddings)

In [None]:
query = "Available Sizes"
# Embed the query
query_embedding = agent.get_embedding(query)
# Search the index
k = 3
D, I = index.search(np.array([query_embedding]), k)
# Display the search results
for i in range(k):
    print(f"Result {i+1}")
    print(f"Distance: {D[0][i]}")
    print(f"Document: {documents[I[0][i]]['file']}")
    print(f"Chunk Index: {documents[I[0][i]]['chunk_index']}")
    print(f"Text: {documents[I[0][i]]['text']}")
    print()

Result 1
Distance: 0.68940269947052
Document: downloads\pd-571466_01-ifu-mepilex-border-flex-lite.pdf
Chunk Index: 9
Text: Indications for use
Mepilex Border Flex Lite is designed for the management of a wide range of non- to 
moderately exuding wounds, such as leg and foot ulcers, pressure ulcers, surgical 
wounds and traumatic wounds e.g. abrasions, blisters and skin tears. Mepilex 
Border Flex Lite can also be used as protection of compromised and/or fragile skin.

Result 2
Distance: 0.656291663646698
Document: downloads\pd-571466_01-ifu-mepilex-border-flex-lite.pdf
Chunk Index: 13
Text: • Mepilex Border Flex Lite can be used under compression bandaging. • Mepilex Border Flex Lite can be used in combination with gels. Disposal should be handled according to local environmental procedures. Other information
The polyurethane foam used in the product may change colour to more yellow when 
it is exposed to light, air and/or heat. The colour change has no influence on product 
properties

In [28]:
relevant_pdfs

['downloads\\e615c1f571e618b61b39c2f946fa3123.pdf',
 'downloads\\Mepilex-Border-Product-Sheet.pdf',
 'downloads\\MHC-2022-79822%20SacrumHeel-DoDont-Flyer-Adult%20%281%29.pdf',
 'downloads\\Molnlycke%20Wound%20Care%20Products.pdf']

In [32]:
relevant_pdfs = pdf_files

The available sizes for Mepilex Border Flex are as follows:

1. 7.5 x 7.5 cm
2. 10 x 10 cm
3. 12.5 x 12.5 cm
4. 15 x 15 cm
5. 15 x 20 cm
6. 10 x 20 cm
7. 10 x 30 cm

**Source file name:** mepilexborderflex_productsheet_eng-1.pdf
