In [None]:
#  Copyright (c) 2023 Snowflake Computing Inc. All rights reserved.

# Import python packages
import streamlit as st
import pandas as pd

# Snowpark session for the notebook's active Snowflake connection.
# The later Python cells run their SQL and register the UDTF through `session`.
from snowflake.snowpark.context import get_active_session
session = get_active_session()


# Demo: Asking Questions to Your Own Documents Using Snowflake Cortex
This notebook demonstrates the new Cortex LLM functions. Using a UDTF (User-Defined Table Function), PDF documents are read and split into chunks. An embedding vector is created for each chunk, and those vectors are later compared with the question to find the most similar chunk. The user can decide whether to use RAG (Retrieval-Augmented Generation) to answer a question; if so, the most similar chunk is provided as context in the prompt.

Streams and Tasks are also used. Each time a new PDF is uploaded into the stage area, it is automatically processed and embeddings are created.

You can run this notebook and upload your own documents. A companion Streamlit app is provided as separate code; run this notebook first, because it creates some of the functions used by the app.

This notebook assumes you have created a stage called DOCS and have copied some PDFs into it.

In [None]:
# Show the names of all files currently uploaded to the @docs stage.
docs_available = session.sql("ls @docs").collect()
list_docs = [stage_file["name"] for stage_file in docs_available]
st.dataframe(list_docs)


We are going to create a function that uses libraries available in the Snowflake Anaconda channel to read each PDF and split it into chunks. It will be registered as a UDTF:

In [None]:
#A class for chunking text and returning a table via UDTF
from snowflake.snowpark.types import StringType, StructField, StructType
from langchain.text_splitter import RecursiveCharacterTextSplitter
from snowflake.snowpark.files import SnowflakeFile
import PyPDF2, io
import logging

class pdf_text_chunker:
    """UDTF handler that reads a staged PDF and emits its text as overlapping chunks.

    ``process`` is the table-function entry point: it receives a file URL and
    yields one single-column row per text chunk.
    """

    def read_pdf(self, file_url: str) -> str:
        """Return the text of the PDF at ``file_url`` as a single string.

        Newlines inside a page are replaced by spaces and pages are
        concatenated in order. A page whose extraction fails is logged and
        skipped, so one bad page does not discard the text collected from
        the other pages.

        Args:
            file_url: URL handed to ``SnowflakeFile.open``.

        Returns:
            The concatenated page texts; an empty string when no page
            produced any text.
        """
        logger = logging.getLogger("udf_logger")
        logger.info(f"Opening file {file_url}")

        # Read the whole file into memory so the PDF reader gets a seekable buffer.
        with SnowflakeFile.open(file_url, 'rb') as f:
            buffer = io.BytesIO(f.readall())

        reader = PyPDF2.PdfReader(buffer)
        page_texts = []
        for page_number, page in enumerate(reader.pages, start=1):
            try:
                # Guard against an extractor that hands back None for an empty page.
                page_text = page.extract_text() or ""
            except Exception:
                logger.warning(
                    f"Unable to extract from file {file_url}, page {page_number}",
                    exc_info=True,
                )
                continue
            page_texts.append(page_text.replace('\n', ' '))

        return "".join(page_texts)

    def process(self, file_url: str):
        """Yield one ``(chunk,)`` tuple per text chunk of the PDF at ``file_url``.

        Args:
            file_url: URL of the PDF to read and split.

        Yields:
            Single-element tuples, each holding one chunk of text. Nothing is
            yielded when the PDF produced no text.
        """
        text = self.read_pdf(file_url)

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=3500,     # Adjust this as you see fit
            chunk_overlap=200,   # Overlap keeps neighbouring chunks contextual
            length_function=len,
        )

        for chunk in text_splitter.split_text(text):
            yield (chunk,)

In [None]:
# Output schema of the table function: a single string column named "chunk".
schema = StructType([StructField("chunk", StringType())])

# Register the chunker class as a permanent UDTF, replacing any existing one
# of the same name.
session.udtf.register(
    pdf_text_chunker,
    name='pdf_text_chunker',
    input_types=[StringType()],
    output_schema=schema,
    packages=['snowflake-snowpark-python', 'pypdf2','pandas','langchain'],
    is_permanent=True,
    replace=True,
    stage_location='CC_DOCUMENTS.DATA2.UDF',
)

Process the PDFs currently in the DOCS stage. We are using an internal stage for simplicity, but this could equally be external storage such as an S3 bucket at your cloud service provider (CSP).

Here we call the UDTF defined before to process the documents and create the new table:

In [None]:
-- (Re)build docs_chunks_table: one row per text chunk of every file listed in
-- the directory table of the @docs stage, together with the chunk's embedding.
-- NOTE(review): snowflake.ml.embed_text looks like a preview-era function name;
-- confirm it still resolves in this account.
create or replace table docs_chunks_table as
    select relative_path, 
            size,
            file_url, 
            build_scoped_file_url(@docs, relative_path) as scoped_file_url,
            func.chunk as chunk,
            snowflake.ml.embed_text('e5-base-v2',chunk) as chunk_vec
    from 
        directory(@docs),
        -- The UDTF is invoked once per staged file and returns its chunks as rows.
        TABLE(pdf_text_chunker(build_scoped_file_url(@docs, relative_path))) as func;


In [None]:
-- Preview a few chunk rows. chunk_vec is deliberately not selected: at the time of
-- writing the notebook fails to display the vector data type, so use the traditional UI to inspect it.
select relative_path, size, file_url, chunk, len(chunk) from docs_chunks_table limit 4;

Let's run a quick test to check that it finds documents related to the question we are asking:

In [None]:
# Retrieval smoke test: embed a sample question, fetch the best-scoring chunk and
# build a temporary download link for the file that chunk came from.
# The question carries its own single quotes because it is pasted verbatim
# into the SQL text below.
myquestion = "'Write 3 best practices for devops'"

# NOTE(review): rows are sorted descending, i.e. the score is treated as
# "higher is closer" - confirm that matches VECTOR_COSINE_DISTANCE in this account.
cmd = f"""
    with results as
    (SELECT RELATIVE_PATH,
       VECTOR_COSINE_DISTANCE(chunk_vec, 
                snowflake.ml.embed_text('e5-base-v2',{myquestion})) as distance,
       chunk
    from docs_chunks_table
    order by distance desc
    limit 1)
    select chunk, relative_path from results
    
    """

df_context = session.sql(cmd).to_pandas()

if df_context.empty:
    # No chunks have been stored yet, so there is no row 0 to read.
    print("docs_chunks_table returned no chunks; upload and process PDFs first.")
else:
    prompt_context = df_context.at[0, 'CHUNK']
    relative_path = df_context.at[0, 'RELATIVE_PATH']
    print (relative_path)

    # Quote the path as a SQL string literal (backslashes and quotes escaped) and
    # call the function once, instead of once per file in the stage directory.
    path_literal = "'" + relative_path.replace("\\", "\\\\").replace("'", "''") + "'"
    cmd2 = f"select GET_PRESIGNED_URL(@docs, {path_literal}, 360) as URL_LINK"
    df_url_link = session.sql(cmd2).to_pandas()
    url_link = df_url_link.at[0, 'URL_LINK']

    print (url_link)

This next function creates the prompt with the question we are asking. If the client wants to use RAG then VECTOR_COSINE_DISTANCE() will be used to find similar chunks to the question being asked and that text will be added to the prompt as context. 

Once the prompt has been built, the Cortex function complete() is called to generate the answer to the question:

In [None]:

def _sql_string_literal(text):
    """Return ``text`` as a single-quoted SQL string literal with backslashes and quotes escaped."""
    escaped = str(text).replace("\\", "\\\\").replace("'", "''")
    return f"'{escaped}'"


def create_prompt (myquestion, rag):
    """Build the LLM prompt for ``myquestion``, optionally adding document context.

    Args:
        myquestion: The user's question as plain text (it may contain quotes).
        rag: When equal to 1, the best-scoring chunk from ``docs_chunks_table``
            is added to the prompt as context; any other value asks the
            question without context.

    Returns:
        A tuple ``(prompt, url_link, relative_path)``. ``prompt`` is a
        single-quoted, escaped SQL string literal ready to be embedded in a
        SQL statement. ``url_link`` and ``relative_path`` identify the file
        the context came from, or are the string ``"None"`` when no context
        was used (``rag != 1`` or the chunk table returned no rows).
    """
    if rag == 1:
        # NOTE(review): rows are sorted descending, i.e. the score is treated as
        # "higher is closer" - confirm that matches VECTOR_COSINE_DISTANCE here.
        cmd = f"""
        with results as
        (SELECT RELATIVE_PATH,
           VECTOR_COSINE_DISTANCE(chunk_vec, 
                    snowflake.ml.embed_text('e5-base-v2',{_sql_string_literal(myquestion)})) as distance,
           chunk
        from docs_chunks_table
        order by distance desc
        limit 1)
        select chunk, relative_path from results
        """
        df_context = session.sql(cmd).to_pandas()

        if not df_context.empty:
            prompt_context = df_context.at[0, 'CHUNK']
            relative_path = df_context.at[0, 'RELATIVE_PATH']

            # The context keeps its apostrophes: escaping happens once, when the
            # whole prompt is turned into a SQL literal below.
            prompt_text = (
                "Answer the question based on the context. Be concise.\n"
                f"Context: {prompt_context}\n"
                f"Question: {myquestion}\n"
                "Answer: "
            )

            # Call the function once rather than once per file in the stage directory.
            cmd2 = (
                "select GET_PRESIGNED_URL(@docs, "
                f"{_sql_string_literal(relative_path)}, 360) as URL_LINK"
            )
            df_url_link = session.sql(cmd2).to_pandas()
            url_link = df_url_link.at[0, 'URL_LINK']

            return _sql_string_literal(prompt_text), url_link, relative_path

    # No RAG requested, or no chunk available: ask the question without context.
    prompt_text = f"Question: {myquestion}\nAnswer: "
    return _sql_string_literal(prompt_text), "None", "None"

def complete(myquestion, rag = 1):
    """Ask the LLM ``myquestion`` and return its answer with the source reference.

    The prompt returned by ``create_prompt`` is placed verbatim into the SQL
    text, so it must already be a quoted SQL string expression.

    Args:
        myquestion: The question to answer.
        rag: Forwarded to ``create_prompt``; defaults to 1.

    Returns:
        A tuple ``(rows, url_link, relative_path)`` where ``rows`` is the
        collected result of the completion query and the other two values
        come unchanged from ``create_prompt``.
    """
    llm_prompt, url_link, relative_path = create_prompt(myquestion, rag)

    completion_sql = f"""
        select snowflake.ml.complete(
            'llama2-7b-chat',
            {llm_prompt})
            as response
            """

    rows = session.sql(completion_sql).collect()
    return rows, url_link, relative_path

This helper prints the response and the link to the document used if RAG has been used:

In [None]:
def display_response (question, rag=0):
    """Render the model's answer to ``question`` and, with RAG, a link to the source file.

    Args:
        question: The question passed on to ``complete``.
        rag: Passed on to ``complete``; the source link is shown only when
            this equals 1. Defaults to 0.
    """
    rows, url_link, relative_path = complete(question, rag)
    st.markdown(rows[0].RESPONSE)

    if rag != 1:
        return

    st.markdown(f"Link to [{relative_path}]({url_link}) that may be useful")

    

In [None]:
# Sample question used by the next two cells to compare the answers without and with RAG.
question = "Write 5 best practices to migrate magento"

Check the differences when using RAG or not:

In [None]:
# Answer from the model alone: no document context is added to the prompt.
display_response(question, rag=0)



In [None]:
# Same question with the best-scoring document chunk added as context, followed by a link to its source file.
display_response(question, rag=1)



Now we create a stream on the DOCS staging area. This stream will track all new documents uploaded into that staging area.

In [None]:
-- Stream on the docs stage; the task created further down polls it with system$stream_has_data.
create or replace stream docs_stream on stage docs;


Finally, a task is created so that whenever the stream has data, the new documents are read, split into chunks, and embedded.

In [None]:
-- Scheduled task (every minute, on warehouse XS_WH) that runs only when docs_stream
-- has data. It chunks and embeds the files reported by the stream and appends the
-- rows to docs_chunks_table, using the same select list as the initial table build.
-- NOTE(review): every row in the stream is processed regardless of change type -
-- confirm that removed or replaced files cannot show up here.
create or replace task task_extract_chunk_vec_from_pdf 
    warehouse = XS_WH
    schedule = '1 minute'
    when system$stream_has_data('docs_stream')
    as

    insert into docs_chunks_table (relative_path, size, file_url,
                            scoped_file_url, chunk, chunk_vec)
    select relative_path, 
            size,
            file_url, 
            build_scoped_file_url(@docs, relative_path) as scoped_file_url,
            func.chunk as chunk,
            snowflake.ml.embed_text('e5-base-v2',chunk) as chunk_vec
    from 
        docs_stream,
        TABLE(pdf_text_chunker(build_scoped_file_url(@docs, relative_path)))            as func;


In [None]:
-- Resume the task so that its schedule starts running.
alter task task_extract_chunk_vec_from_pdf resume;