In [1]:
# Ref : https://www.pinecone.io/learn/series/langchain/langchain-retrieval-augmentation/
# Load dataset
from datasets import load_dataset

data = load_dataset("wikipedia", "20220301.simple", split='train[:1000]')
for index in range(50,55):
    print(data[index]['title'],':',data[index]['text'][:100].replace("\n", ""))

Beard : A beard is the hair growing on the lower part of a man's face.The hair that grows on the upper lip
Black : In light, black is lack of all color.  In painting, however, the black pigment is the combination of
Bubonic plague : Bubonic plague is the best-known form of the disease plague, which is caused by the bacterium Yersin
Biology : Biology  is the science that studies life, and living things, and the evolution of life. Living thin
Botany : Botany is the study of plants. It is a science. It is a branch of biology, and is also called plant 


In [2]:
from tqdm.auto import tqdm
from uuid import uuid4
from datasets import load_dataset
import pinecone
import openai
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings

# Load dataset
data = load_dataset("wikipedia", "20220301.simple", split='train[:100]')

#Connect database
pinecone.init(api_key="{YOUR_PINECONE_APIKEY}", environment="gcp-starter")
#pinecone.create_index("terry-wiki",dimension=1536,metric="cosine")
index = pinecone.Index("terry-wiki")

#create embedding API
os.environ["OPENAI_API_KEY"] = "{YOUR_OPENAI_KEY}"
embedding = OpenAIEmbeddings()

# create text splitter
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=400,
    chunk_overlap=20,
    length_function=len,
    separators=["\n\n", "\n", " ", ""]
)

# Upsert records
batch_size = 100
texts = []
metadatas = []
count = 0

for i, record in enumerate(tqdm(data)):
    # first get metadata fields for this record
    metadata = {
        'wiki-id': str(record['id']),
        'source': record['url'],
        'title': record['title']
    }
    
    # create text chunk and metadata 
    full_text = record['text']
    text_chunks = text_splitter.split_text(full_text)
    for i,text in enumerate(text_chunks): # max medatada size is 40K
        record = {
            "chunk":i,
            "text":full_text,
            **metadata
        }
        metadatas.append(record)
        texts.append(text)
        count = count + 1
        if count > batch_size: # flush batch insert
            ids = [str(uuid4()) for _ in range(len(texts))]
            embeds = embedding.embed_documents(texts)
            try:
                index.upsert(vectors=zip(ids,embeds,metadatas))
                #flush buffers
                texts = []
                metadatas = []
                count = 0
            except Exception as e:
                print(e) # ignore exception
                print("retry")
                time.sleep(1) # wait 1 sec for retry

        

  0%|          | 0/100 [00:00<?, ?it/s]

# Search data without langchain

In [9]:
import pinecone
import openai
import os
from langchain.embeddings.openai import OpenAIEmbeddings

#Connect database
pinecone.init(api_key="{YOUR_PINECONE_APIKEY}", environment="gcp-starter")
#pinecone.create_index("terry-wiki",dimension=1536,metric="cosine")
vectordb = pinecone.Index("terry-wiki")

#create embedding API
os.environ["OPENAI_API_KEY"] = "{YOUR_OPENAI_KEY}"
embedding = OpenAIEmbeddings()
question = ["Where is the Cuba?"]

embedded_question = embedding.embed_documents(question)
      
query_result=vectordb.query(
  vector=embedded_question,
  top_k=3,
  include_values=False,
  include_metadata=True
)

#print(query_result.matches[0])
result_ids = [ result.id for result in query_result.matches]

for result in query_result.matches:
    id = result.id
    text = result.metadata['text'].replace('\n','')[:500]
    title = result.metadata['title']
    score = result.score
    print(id,score,title)
    print(text,"....")
    print('\n')

a2e665d1-813f-4a4a-bbbe-bfc7d6a04ea6 0.879255474 Cuba
Cuba is an island country in the Caribbean Sea. The country is made up of the big island of Cuba, the Isla de la Juventud island (Isle of Youth), and many smaller islands. Havana is the capital of Cuba. It is the largest city. The second largest city is Santiago de Cuba. In Spanish, the capital is called "La Habana". Cuba is near the United States, Mexico, Haiti, Jamaica and the Bahamas. People from Cuba are called Cubans (cubanos in Spanish). The official language is Spanish. Cuba is warm all y ....


5eb93f4d-b952-4afb-8425-99957f855269 0.860986769 Cuba
Cuba is an island country in the Caribbean Sea. The country is made up of the big island of Cuba, the Isla de la Juventud island (Isle of Youth), and many smaller islands. Havana is the capital of Cuba. It is the largest city. The second largest city is Santiago de Cuba. In Spanish, the capital is called "La Habana". Cuba is near the United States, Mexico, Haiti, Jamaica and the Ba

# Similarity search

In [7]:
import pinecone
import openai
import os
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone


#create embedding API
os.environ["OPENAI_API_KEY"] = "{YOUR_OPENAI_KEY}"
embedding = OpenAIEmbeddings()

#Connect database
pinecone.init(api_key="{YOUR_PINECONE_APIKEY}", environment="gcp-starter")
#pinecone.create_index("terry-wiki",dimension=1536,metric="cosine")
text_field = "text"
index = pinecone.Index("terry-wiki")
vectordb = Pinecone(
    index, embedding.embed_query, text_field
)

query = "Where is best place for the vacation"

results = vectordb.similarity_search(
    query,  # our search query
    k=10  # return 10 most relevant docs
)
for result in results:
    print(result.metadata)

{'chunk': 13.0, 'source': 'https://simple.wikipedia.org/wiki/Catharism', 'title': 'Catharism', 'wiki-id': '135'}
{'chunk': 4.0, 'source': 'https://simple.wikipedia.org/wiki/Continent', 'title': 'Continent', 'wiki-id': '117'}
{'chunk': 80.0, 'source': 'https://simple.wikipedia.org/wiki/Australia', 'title': 'Australia', 'wiki-id': '27'}
{'chunk': 14.0, 'source': 'https://simple.wikipedia.org/wiki/Continent', 'title': 'Continent', 'wiki-id': '117'}
{'chunk': 15.0, 'source': 'https://simple.wikipedia.org/wiki/City', 'title': 'City', 'wiki-id': '144'}
{'chunk': 33.0, 'source': 'https://simple.wikipedia.org/wiki/City', 'title': 'City', 'wiki-id': '144'}
{'chunk': 22.0, 'source': 'https://simple.wikipedia.org/wiki/Astronomy', 'title': 'Astronomy', 'wiki-id': '48'}
{'chunk': 27.0, 'source': 'https://simple.wikipedia.org/wiki/April', 'title': 'April', 'wiki-id': '1'}
{'chunk': 6.0, 'source': 'https://simple.wikipedia.org/wiki/Crime', 'title': 'Crime', 'wiki-id': '151'}
{'chunk': 16.0, 'source':

# Search with MetaData Filter

In [5]:
query = "Where is the cuba"

results = vectordb.similarity_search(
    query,  # our search query
    k=10,# return 3 most relevant docs
    filter={
    "$and":[
        {"title":"Cuba"},
    ]
})

for result in results:
    print(result.metadata)

{'chunk': 0.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 18.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 4.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 2.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 1.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 22.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 10.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 5.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 3.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 28.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'C

# Use RAG result in prompt

In [21]:
!pip install --upgrade openai
!pip install --upgrade langchain

Collecting openai
  Obtaining dependency information for openai from https://files.pythonhosted.org/packages/6a/54/e0af4b74ebb732bfa9bc83d3e49e577d4e332990742a9ecbe228c532a02d/openai-1.7.2-py3-none-any.whl.metadata
  Downloading openai-1.7.2-py3-none-any.whl.metadata (17 kB)
Downloading openai-1.7.2-py3-none-any.whl (212 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.1/212.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.6.0
    Uninstalling openai-1.6.0:
      Successfully uninstalled openai-1.6.0
Successfully installed openai-1.7.2
Collecting langchain
  Obtaining dependency information for langchain from https://files.pythonhosted.org/packages/23/98/c70fac0f1b3193ced86013b563119c27c68ac26b684815f407555224108d/langchain-0.1.0-py3-none-any.whl.metadata
  Downloading langchain-0.1.0-py3-none-any.whl.metadata (13 kB)
Collecting lan

In [25]:
# Use RAG result 

from langchain import PromptTemplate
from langchain.llms import OpenAI

llm = OpenAI(openai_api_key="{YOUR_OPENAI_KEY}")

country="Cuba"
chat_template = PromptTemplate.from_template("""
You are the tour guide. I'm planning to visit {country}.
Please advise me what I can do in {country}.
Answer using only information the context below and do not use any other information."
context:
{context}
"""
)
rag_template = "Where is the {country}"

results = vectordb.similarity_search(rag_template.format(country=country),k=1)
context = ""
for result in results:
    context+=(result.page_content)

prompt= chat_template.format(country="Cuba", context = context)
print(prompt)
print('-'*50)
print(llm(prompt))
print("\n")


You are the tour guide. I'm planning to visit Cuba.
Please advise me what I can do in Cuba.
Answer using only information the context below and do not use any other information."
context:
Cuba is an island country in the Caribbean Sea. The country is made up of the big island of Cuba, the Isla de la Juventud island (Isle of Youth), and many smaller islands. Havana is the capital of Cuba. It is the largest city. The second largest city is Santiago de Cuba. In Spanish, the capital is called "La Habana". Cuba is near the United States, Mexico, Haiti, Jamaica and the Bahamas. People from Cuba are called Cubans (cubanos in Spanish). The official language is Spanish. Cuba is warm all year.

In 1492, Christopher Columbus landed on the island of Cuba. He claimed it for the Kingdom of Spain. Cuba became a Spanish colony until the Spanish–American War of 1898. After the war, it was part of the United States. It gained independence in 1902.

In 1959, guerrilla fighters led by Fidel Castro and Ch

# MMR

In [None]:
query = "Where is the cuba? Where is other country near the cuba?"
found_docs = await vectordb.amax_marginal_relevance_search(query, k=10, fetch_k=10)
for i, doc in enumerate(found_docs):
    print(doc.metadata, "\n")
    
docs = vectordb.similarity_search(query,k=10)
for doc in docs:
    print(doc.metadata)


# Multi Query

In [None]:
import pinecone
import openai
import logging
import os
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Pinecone

#create embedding API and llm
os.environ["OPENAI_API_KEY"] = "{YOUR_OPENAI_KEY}"
embedding = OpenAIEmbeddings()
llm = ChatOpenAI()

#Connect database
pinecone.init(api_key="{YOUR_PINECONE_APIKEY}", environment="gcp-starter")
index = pinecone.Index("terry-wiki")
text_field = "text"
vectordb = Pinecone(
    index, embedding.embed_query, text_field
)

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

query = "Where is the cuba? Where is other country near the cuba?"
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectordb.as_retriever(), llm=llm
)
docs = retriever_from_llm.get_relevant_documents(query=query)
for doc in docs:
    print(doc.metadata)