In [None]:
!pwd
!pip install --upgrade pip

# Install required libraries
!python3 -m pip -q install redis
!pip install -U langchain gradio
!pip install -U langchain-core
!pip install -U langchain-community
!pip install -qU pypdf
!pip install -U redisvl
!pip install openai
!pip install -qU langchain-openai

In [None]:
import os

## Redis connection settings.
## Update the 'host' field with the correct Redis host URL, or override any of
## these values through the REDIS_HOST / REDIS_PORT / REDIS_PASSWORD environment
## variables so real credentials do not have to be written into the notebook.
host = os.environ.get('REDIS_HOST', 'redis-12000.redis-poc.dlqueue.com')
port = int(os.environ.get('REDIS_PORT', '12000'))
password = os.environ.get('REDIS_PASSWORD', 'admin')
# Set to False when the Redis server does not require authentication.
requirePass = True


In [None]:
import redis

from urllib.parse import quote

# Connect to the Redis instance described by the configuration cell. The direct
# client and REDIS_URL are built from the same host/port (and password, when
# one is required) so they can never point at different servers.
if requirePass:
    client = redis.Redis(host=host, port=port, decode_responses=True, password=password)
    # Percent-encode the password so characters such as '@', ':' or '/' cannot
    # corrupt the connection URL.
    REDIS_URL = f"redis://:{quote(password, safe='')}@{host}:{port}"
else:
    client = redis.Redis(host=host, port=port, decode_responses=True)
    REDIS_URL = f"redis://{host}:{port}"

print(client.ping())
# Clear Redis database (optional)
# WARNING: this removes every key in the selected database; comment it out when
# pointing at a Redis instance that holds data you want to keep.
client.flushdb()

INDEX_NAME = "idx_qna"

In [None]:
!wget https://storage.googleapis.com/abhi-data-2024/how_india_shops_online.pdf -O report.pdf


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.chains import RetrievalQA

#from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFLoader


# Local path of the PDF to index.
file = "report.pdf"

# Splitter used to cut the loaded documents into chunks; add_start_index asks
# it to record where each chunk starts.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=50,
    chunk_size=2500,
)

# Load the PDF into LangChain documents.
loader = PyPDFLoader(file)
documents = loader.load()

# Split the loaded documents into the chunks that get embedded later on.
chunks = text_splitter.split_documents(documents)

In [None]:
# Plain-text content of every chunk, in the same order as `chunks`.
chunked_docs = [chunk.page_content for chunk in chunks]

# Show the chunk objects, a separator line, then the extracted text.
print(chunks, "*****", chunked_docs, sep="\n")

In [None]:
import getpass
import os

# setup the API Key
# Use OPENAI_API_KEY from the environment when it is set, so the notebook can
# run top to bottom unattended; otherwise prompt for the key without echoing it.
api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("Enter your OpenAI API key: ")

# Create text embeddings with the OpenAI embedding model

Use an OpenAI embedding model to create the text embeddings.

Text embeddings are a dense vector representation of a piece of content such that, if two pieces of content are semantically similar, their respective embeddings are located near each other in the embedding vector space. This representation can be used to solve common NLP tasks, such as:


*   Semantic search: Search text ranked by semantic similarity.
*   Recommendation: Return items with text attributes similar to the given text.
*   Classification: Return the class of items whose text attributes are similar to the given text.
*   Clustering: Cluster items whose text attributes are similar to the given text.
*   Outlier Detection: Return items where text attributes are least related to the given text.

The OpenAI embeddings API lets you create a text embedding from a piece of text. The text-embedding-3-large model accepts roughly 8K input tokens (tokens are sub-word units, not whole words) and produces 3072-dimensional vectors by default; this notebook requests 1024-dimensional embeddings through the `dimensions` parameter.

In [None]:
from langchain.vectorstores.redis import Redis
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredURLLoader
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredFileLoader
from langchain_openai import OpenAIEmbeddings


# OpenAI embedding client: text-embedding-3-large, requested at 1024 dimensions.
embeddings = OpenAIEmbeddings(
    api_key=api_key,
    dimensions=1024,
    model="text-embedding-3-large",
)

def get_vectordb() -> Redis:
    """Build the Redis vector store from the notebook's document chunks.

    Passes the module-level ``chunks``, ``embeddings``, ``INDEX_NAME`` and
    ``REDIS_URL`` to ``Redis.from_documents``.

    Returns:
        The vector store object returned by ``Redis.from_documents``.
    """
    store_options = {
        "documents": chunks,
        "embedding": embeddings,
        "index_name": INDEX_NAME,
        "redis_url": REDIS_URL,
    }
    return Redis.from_documents(**store_options)


# NOTE(review): from here on the name `redis` refers to the vector store
# returned by get_vectordb(), not the redis-py module imported earlier in the
# notebook; later cells call `redis.as_retriever(...)` on it.
redis = get_vectordb()


# Include RAG

We're going to build a complete RAG pipeline from scratch incorporating the following components:

*   Standard retrieval and chat completion
*   Dense content representation to improve accuracy
*   Query re-writing to improve accuracy
*   Semantic caching to improve performance
*   Conversational session history to improve personalization

### Define Prompt template
PromptTemplate defines the exact text of the prompt that is sent to the LLM. This step is optional: the defaults usually work well for OpenAI models but might fall short for other models.

In [None]:
#@title Function to define prompt template

def create_prompt():
    """Create the prompt template used by the question-answering chain.

    The template tells the model to answer only from the supplied context and
    to admit when it does not know the answer.

    Returns:
        A ``PromptTemplate`` with two input variables: ``context`` (the
        retrieved text) and ``question`` (the user's question).
    """
    from langchain.prompts import PromptTemplate

    # Define our prompt
    prompt_template = """Use only the following pieces of context to answer the question. If you don't know the answer, say that you don't know, don't try to make up an answer.

    This should be in the following format:

    Question: [question here]
    Answer: [answer here]

    Begin!

    Context:
    ---------
    {context}
    ---------
    Question: {question}
    Answer:"""

    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )
    return prompt


In [None]:
from langchain_openai import OpenAI

# Completion model handed to the RetrievalQA chain.
llm = OpenAI(
    api_key=api_key,
    model="gpt-3.5-turbo-instruct",
    temperature=0.5,
    max_retries=2,
    verbose=False,
)

# Question-answering chain: fetch chunks from the Redis vector store within the
# configured distance threshold, then pass them to the LLM with the custom
# prompt using the "stuff" chain type.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=redis.as_retriever(
        search_type="similarity_distance_threshold",
        search_kwargs={"distance_threshold": 0.5},
    ),
    chain_type_kwargs={"prompt": create_prompt()},
)

In [None]:
# Sample question: reasons people shop online.
qa.invoke("What are some motivations for shopping online?")["result"]

In [None]:
# Sample question: preferred payment methods.
qa.invoke("How do Indians like to pay for shopping online?")["result"]

In [None]:
# Sample question: challenges of online shopping.
qa.invoke("What are some known challenges in shopping online?")["result"]

In [None]:
# Sample question: growth of the home and kitchen segment.
qa.invoke("How home and kitchen segment is growing?")["result"]

In [None]:
# Sample question: influence of social media.
qa.invoke("What are the effects of social media on online shopping?")["result"]

In [None]:
# Sample question: items commonly bought online.
qa.invoke("What are some relevant items that are shopped online?")["result"]

In [None]:
import gradio as gr

def handle(query):
    """Answer a question typed into the Gradio text box.

    Args:
        query: The user's question as plain text.

    Returns:
        The answer text produced by the ``qa`` chain, or a short hint when the
        query is empty or only whitespace.
    """
    # Skip the retrieval/LLM round trip when there is nothing to ask.
    if not query or not query.strip():
        return "Please enter a question."
    # Use invoke(...)['result'], like the other cells in this notebook,
    # instead of the deprecated Chain.run().
    response = qa.invoke(query)['result']
    return response

# Minimal text-in / text-out UI around handle().
iface = gr.Interface(
    fn=handle,
    inputs="text",
    outputs="text",
)
# share=True asks Gradio for a shareable link to this app.
iface.launch(share=True)

In [None]:
# Close the Gradio interface launched in the previous cell.
iface.close()