In [3]:
%pip install datasets langchain-pinecone

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting langchain-pinecone
  Downloading langchain_pinecone-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.6.1,>=2023.1.0 (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets)
  Using cached fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.9.5-cp312-cp312-macosx_11_0_arm64.

In [2]:
# Ref : https://www.pinecone.io/learn/series/langchain/langchain-retrieval-augmentation/
# Load dataset
from datasets import load_dataset

data = load_dataset("wikipedia", "20220301.simple", split="train[:1000]")
for index in range(50, 55):
    print(data[index]["title"], ":", data[index]["text"][:100].replace("\n", ""))

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 205328/205328 [00:00<00:00, 974739.91 examples/s] 

Beard : A beard is the hair growing on the lower part of a man's face.The hair that grows on the upper lip
Black : In light, black is lack of all color.  In painting, however, the black pigment is the combination of
Bubonic plague : Bubonic plague is the best-known form of the disease plague, which is caused by the bacterium Yersin
Biology : Biology  is the science that studies life, and living things, and the evolution of life. Living thin
Botany : Botany is the study of plants. It is a science. It is a branch of biology, and is also called plant 





In [3]:
import time
from tqdm.auto import tqdm
from uuid import uuid4
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings

# Load dataset
data = load_dataset("wikipedia", "20220301.simple", split="train[:100]")

# Connect database
pc = Pinecone(api_key="{YOUR_PINECONE_API_KEY}")
# pc.create_index(
#     name="terry-wiki",
#     dimension=1536, # Replace with your model dimensions
#     metric="cosine", # Replace with your model metric
#     spec=ServerlessSpec(
#         cloud="aws",
#         region="us-east-1"
#     ) 
# )
index = pc.Index("terry-wiki")

# create embedding API
os.environ["OPENAI_API_KEY"] = "{YOUR_OPENAI_KEY}"
embedding = OpenAIEmbeddings()

# create text splitter
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=400,
    chunk_overlap=20,
    length_function=len,
    separators=["\n\n", "\n", " ", ""],
)

# Upsert records
batch_size = 100
texts = []
metadatas = []
count = 0

for i, record in enumerate(tqdm(data)):
    # first get metadata fields for this record
    metadata = {
        "wiki-id": str(record["id"]),
        "source": record["url"],
        "title": record["title"],
    }

    # create text chunk and metadata
    full_text = record["text"]
    text_chunks = text_splitter.split_text(full_text)
    for i, text in enumerate(text_chunks):  # max medatada size is 40K
        record = {"chunk": i, "text": full_text, **metadata}
        metadatas.append(record)
        texts.append(text)
        count = count + 1
        if count > batch_size:  # flush batch insert
            ids = [str(uuid4()) for _ in range(len(texts))]
            embeds = embedding.embed_documents(texts)
            try:
                index.upsert(vectors=zip(ids, embeds, metadatas))
                # flush buffers
                texts = []
                metadatas = []
                count = 0
            except Exception as e:
                print(e)  # ignore exception
                print("retry")
                time.sleep(1)  # wait 1 sec for retry

100%|██████████| 100/100 [00:41<00:00,  2.43it/s]


# Search data without langchain

In [15]:
import os
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings

# Connect database
pc = Pinecone(api_key="{YOUR_PINECONE_API_KEY}")
index = pc.Index("terry-wiki")

# create embedding API
os.environ["OPENAI_API_KEY"] = "{YOUR_OPENAI_KEY}"
embedding = OpenAIEmbeddings()

vectordb = PineconeVectorStore(index=index, embedding=embedding)

query_result = vectordb.similarity_search_with_score(
    query="Where is the Cuba?", 
    k=3
)

# print(query_result[0])

for result, score in query_result:
    text = result.page_content.replace("\n", "")[:500]
    title = result.metadata["title"]
    print(score, title)
    print(text, "....")
    print("\n")

0.877913952 Cuba
Cuba is an island country in the Caribbean Sea. The country is made up of the big island of Cuba, the Isla de la Juventud island (Isle of Youth), and many smaller islands. Havana is the capital of Cuba. It is the largest city. The second largest city is Santiago de Cuba. In Spanish, the capital is called "La Habana". Cuba is near the United States, Mexico, Haiti, Jamaica and the Bahamas. People from Cuba are called Cubans (cubanos in Spanish). The official language is Spanish. Cuba is warm all y ....


0.860044062 Cuba
Cuba is an island country in the Caribbean Sea. The country is made up of the big island of Cuba, the Isla de la Juventud island (Isle of Youth), and many smaller islands. Havana is the capital of Cuba. It is the largest city. The second largest city is Santiago de Cuba. In Spanish, the capital is called "La Habana". Cuba is near the United States, Mexico, Haiti, Jamaica and the Bahamas. People from Cuba are called Cubans (cubanos in Spanish). The offici

# Similarity search

In [16]:
import os
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore


# create embedding API
os.environ["OPENAI_API_KEY"] = "{YOUR_OPENAI_KEY}"
embedding = OpenAIEmbeddings()

# Connect database
pc = Pinecone(api_key="{YOUR_PINECONE_API_KEY}")
index = pc.Index("terry-wiki")
vectordb = PineconeVectorStore(index=index, embedding=embedding)

query = "Where is best place for the vacation"

results = vectordb.similarity_search(
    query, k=10  # our search query  # return 10 most relevant docs
)
for result in results:
    print(result.metadata)

{'chunk': 13.0, 'source': 'https://simple.wikipedia.org/wiki/Catharism', 'title': 'Catharism', 'wiki-id': '135'}
{'chunk': 80.0, 'source': 'https://simple.wikipedia.org/wiki/Australia', 'title': 'Australia', 'wiki-id': '27'}
{'chunk': 4.0, 'source': 'https://simple.wikipedia.org/wiki/Continent', 'title': 'Continent', 'wiki-id': '117'}
{'chunk': 14.0, 'source': 'https://simple.wikipedia.org/wiki/Continent', 'title': 'Continent', 'wiki-id': '117'}
{'chunk': 15.0, 'source': 'https://simple.wikipedia.org/wiki/City', 'title': 'City', 'wiki-id': '144'}
{'chunk': 22.0, 'source': 'https://simple.wikipedia.org/wiki/Astronomy', 'title': 'Astronomy', 'wiki-id': '48'}
{'chunk': 9.0, 'source': 'https://simple.wikipedia.org/wiki/Native%20American', 'title': 'Native American', 'wiki-id': '37'}
{'chunk': 33.0, 'source': 'https://simple.wikipedia.org/wiki/City', 'title': 'City', 'wiki-id': '144'}
{'chunk': 27.0, 'source': 'https://simple.wikipedia.org/wiki/April', 'title': 'April', 'wiki-id': '1'}
{'ch

# Search with MetaData Filter

In [17]:
query = "Where is the cuba"

results = vectordb.similarity_search(
    query,  # our search query
    k=10,  # return 3 most relevant docs
    filter={
        "$and": [
            {"title": "Cuba"},
        ]
    },
)

for result in results:
    print(result.metadata)

{'chunk': 0.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 18.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 4.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 2.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 1.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 22.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 10.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 5.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 3.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 11.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'C

# Use RAG result in prompt

In [19]:
# Use RAG result
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini", api_key="{YOUR_OPENAI_KEY}")

country = "Cuba"
chat_template = PromptTemplate.from_template(
    """
You are the tour guide. I'm planning to visit {country}.
Please advise me what I can do in {country}.
Answer using only information the context below and do not use any other information."
context:
{context}
"""
)
rag_template = "Where is the {country}"

results = vectordb.similarity_search(rag_template.format(country=country), k=1)
context = ""
for result in results:
    context += result.page_content

prompt = chat_template.format(country="Cuba", context=context)
print(prompt)
print("-" * 50)
print(llm.invoke(prompt))
print("\n")


You are the tour guide. I'm planning to visit Cuba.
Please advise me what I can do in Cuba.
Answer using only information the context below and do not use any other information."
context:
Cuba is an island country in the Caribbean Sea. The country is made up of the big island of Cuba, the Isla de la Juventud island (Isle of Youth), and many smaller islands. Havana is the capital of Cuba. It is the largest city. The second largest city is Santiago de Cuba. In Spanish, the capital is called "La Habana". Cuba is near the United States, Mexico, Haiti, Jamaica and the Bahamas. People from Cuba are called Cubans (cubanos in Spanish). The official language is Spanish. Cuba is warm all year.

In 1492, Christopher Columbus landed on the island of Cuba. He claimed it for the Kingdom of Spain. Cuba became a Spanish colony until the Spanish–American War of 1898. After the war, it was part of the United States. It gained independence in 1902.

In 1959, guerrilla fighters led by Fidel Castro and Ch

# MMR

In [20]:
query = "Where is the cuba? Where is other country near the cuba?"
found_docs = vectordb.max_marginal_relevance_search(query, k=10, fetch_k=10)
for i, doc in enumerate(found_docs):
    print(doc.metadata, "\n")

docs = vectordb.similarity_search(query, k=10)
for doc in docs:
    print(doc.metadata)

{'chunk': 0.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'} 

{'chunk': 22.0, 'source': 'https://simple.wikipedia.org/wiki/Argentina', 'title': 'Argentina', 'wiki-id': '54'} 

{'chunk': 18.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'} 

{'chunk': 6.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'} 

{'chunk': 2.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'} 

{'chunk': 1.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'} 

{'chunk': 4.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'} 

{'chunk': 24.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'} 

{'chunk': 22.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'} 

{'chunk': 5.0, 'source': 'https://simple.wikipedia.

# Multi Query

In [23]:
import logging
import os
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_pinecone import PineconeVectorStore

# create embedding API and llm
os.environ["OPENAI_API_KEY"] = "{YOUR_OPENAI_KEY}"
embedding = OpenAIEmbeddings()
llm = ChatOpenAI()

# Connect database
vectordb = PineconeVectorStore(index=index, embedding=embedding)

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

query = "Where is the cuba? Where is other country near the cuba?"
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectordb.as_retriever(), llm=llm
)
docs = retriever_from_llm.get_relevant_documents(query=query)
for doc in docs:
    print(doc.metadata)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. Can you provide the location of Cuba?', '2. What countries are located near Cuba?', '3. Can you tell me about the neighboring countries of Cuba?']


{'chunk': 0.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 18.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 2.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 1.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 22.0, 'source': 'https://simple.wikipedia.org/wiki/Argentina', 'title': 'Argentina', 'wiki-id': '54'}
{'chunk': 5.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
{'chunk': 4.0, 'source': 'https://simple.wikipedia.org/wiki/Cuba', 'title': 'Cuba', 'wiki-id': '178'}
