##Generative AI Assignment-5: 

Create a retrieval augmented chain to accept search terms and return results from your vector db or in memory dictionary
 

In [0]:
%pip install langchain langchain_community sentence-transformers chromadb PyPDF2 transformers torch

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting langchain_community
  Downloading langchain_community-0.3.9-py3-none-any.whl (2.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.4/2.4 MB 7.1 MB/s eta 0:00:00
Collecting pydantic-settings<3.0.0,>=2.4.0
  Downloading pydantic_settings-2.6.1-py3-none-any.whl (28 kB)
Collecting httpx-sse<0.5.0,>=0.4.0
  Downloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.8-py3-none-any.whl (2.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.4/2.4 MB 56.1 MB/s eta 0:00:00
  Downloading langchain_community-0.3.7-py3-none-any.whl (2.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.4/2.4 MB 60.9 MB/s eta 0:00:00
Collecting langchain
  Downloading langchain-0.3.9-py3-none-any.whl (1.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.0/1.0 MB 68.9 MB/s eta 0:00:00
Collecting langchain_community
  Downl

In [0]:
dbutils.library.restartPython()

In [0]:
def mount_storage():
    storage_account_name = "genaicertificationsa"
    container_name = "gen-ai-container"
    mount_point = "/mnt/gen-ai-container"
    
    try:
        dbutils.fs.unmount(mount_point)
    except:
        print("Nothing to unmount")
    
    configs = {
        f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net": "j1wgCYCPWdinRWy/4OAXB1NrdROS3lLxoY/OPTJvgVi5tNwsM45Y8JkXZbBZEOe3ThfmI7F1XPDY+ASt31+r5w=="
    }
    
    dbutils.fs.mount(
        source = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/",
        mount_point = mount_point,
        extra_configs = configs
    )

In [0]:
from langchain_community.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import PyPDF2

def create_vectorstore(pdf_path):
    # Initialize embeddings
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    
    # Create text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )
    
    # Read PDF
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()
    
    # Split text
    chunks = text_splitter.split_text(text)
    
    # Create vector store
    vectordb = Chroma.from_texts(
        texts=chunks,
        embedding=embeddings,
        persist_directory="./chroma_db"
    )
    return vectordb

In [0]:
from langchain.chains import RetrievalQA
from transformers import pipeline
from langchain.llms import HuggingFacePipeline

def create_rag_chain(vectorstore):
    # Initialize local LLM
    local_llm = pipeline(
        "text2text-generation",
        model="google/flan-t5-small",
        max_length=512
    )
    
    llm = HuggingFacePipeline(pipeline=local_llm)
    
    # Create retriever
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 3}
    )
    
    # Create chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )
    return qa_chain

In [0]:
def query_document(question, qa_chain):
    try:
        result = qa_chain({"query": question})
        print("Answer:", result["result"])
        print("\nSource Documents:")
        for doc in result["source_documents"]:
            print("-", doc.page_content[:200], "...\n")
    except Exception as e:
        print(f"Error processing query: {str(e)}")

# Usage Example
pdf_path = "/dbfs/mnt/gen-ai-container/gen-ai-sample.pdf"
vectorstore = create_vectorstore(pdf_path)
rag_chain = create_rag_chain(vectorstore)
query_document("What is this document about?", rag_chain)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1003 > 512). Running this sequence through the model will result in indexing errors


Answer: Science/Tech

Source Documents:
- 35330 77
Lukyanenko, R., Maass, W., & Storey, V. C. (2022). Trust in arti -
ficial intelligence: From a Foundational Trust Framework to 
emerging research opportunities. Electronic Markets, 32(4), 
19 ...

- 35330 77
Lukyanenko, R., Maass, W., & Storey, V. C. (2022). Trust in arti -
ficial intelligence: From a Foundational Trust Framework to 
emerging research opportunities. Electronic Markets, 32(4), 
19 ...

- Taxonomy of risks posed by language models. In  2022 ACM 
Conference on Fairness, Accountability, and Transparency (pp. 
214–229). ACM. https:// doi. org/ 10. 1145/ 35311 46. 35330 88
Weisz, J., Mulle ...

