In [1]:
!pip install -qU \
  langchain==0.0.335 \
  pinecone-client==2.2.4 \
  openai \
  datasets \
  tiktoken


In [2]:
from langchain.retrievers.you import YouRetriever
from langchain.chains import RetrievalQA
from langchain.chat_models.openai import ChatOpenAI
import os

# Keep any key that is already exported in the environment; only fall back to
# the placeholder (which must be replaced with a real key) when nothing is set.
# Assigning unconditionally would clobber a real key with the placeholder text.
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY") or "<<OPENAI_API_KEY>>"
os.environ["YDC_API_KEY"] = os.environ.get("YDC_API_KEY") or "<<YDC_API_KEY>>"

# chat model used by the QA chain built later in the notebook
llm = ChatOpenAI(model="gpt-3.5-turbo-16k")

# web-search retriever backed by the You.com API
you_retriever = YouRetriever()


## Create Pinecone Index

In [3]:
from datasets import load_dataset

# load the "train" split of the pre-chunked AI arXiv papers dataset
dataset = load_dataset("jamescalam/ai-arxiv-chunked", split="train")

# show the dataset summary (features and row count)
dataset

Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 41584
})

In [4]:
import pinecone

# get API key from app.pinecone.io and environment from console
# Values exported as PINECONE_API_KEY / PINECONE_ENVIRONMENT take precedence so
# real credentials never need to be written into the notebook; the placeholders
# are only used when the variables are unset and must then be replaced by hand.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY") or "<<PINECONE_API_KEY>>",
    environment=os.environ.get("PINECONE_ENVIRONMENT") or "<<PINECONE_ENV>>"
)

In [5]:
import time

index_name = 'you-pinecone'

# create the index on first run only, then block until Pinecone reports it ready
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    # poll once per second until initialization has finished
    while True:
        if pinecone.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# handle used for all later operations against the index
index = pinecone.Index(index_name)

In [9]:
from langchain.embeddings.openai import OpenAIEmbeddings

# constructor options collected up front, then expanded into the call;
# an empty `disallowed_special` tuple means no special token is disallowed
embedding_kwargs = {
    "model": "text-embedding-ada-002",
    "disallowed_special": (),
}
embed_model = OpenAIEmbeddings(**embedding_kwargs)

In [None]:
from tqdm.auto import tqdm  # for progress bar

# Embed every chunk of the dataset and upsert it into the Pinecone index in
# batches: each vector is (id, embedding, metadata), where the metadata keeps
# the chunk text plus its source and title.
data = dataset.to_pandas()  # this makes it easier to iterate over the dataset

batch_size = 100

for i in tqdm(range(0, len(data), batch_size)):
    i_end = min(len(data), i+batch_size)
    # get batch of data
    batch = data.iloc[i:i_end]
    # generate unique ids for each chunk (column-wise: one pass, no iterrows)
    ids = [
        f"{doi}-{chunk_id}"
        for doi, chunk_id in zip(batch['doi'], batch['chunk-id'])
    ]
    # get text to embed
    texts = batch['chunk'].tolist()
    # embed text
    embeds = embed_model.embed_documents(texts)
    # get metadata to store in Pinecone
    metadata = [
        {'text': text,
         'source': source,
         'title': title}
        for text, source, title in zip(texts, batch['source'], batch['title'])
    ]
    # add to Pinecone (materialized as a list of (id, values, metadata) tuples)
    index.upsert(vectors=list(zip(ids, embeds, metadata)))

  0%|          | 0/416 [00:00<?, ?it/s]

In [10]:
from langchain.vectorstores import Pinecone

# name of the metadata field that holds each record's raw text
text_field = "text"

# wrap the Pinecone index in a LangChain vector store (queries are embedded
# with `embed_model.embed_query`), then expose that store as a retriever
vectorstore = Pinecone(index, embed_model.embed_query, text_field)
pinecone_retriever = vectorstore.as_retriever()



In [11]:
# one entry per retriever: a name, a description saying when to use it,
# and the retriever object itself

# vector-store retriever over the arXiv chunks
pinecone_info = {
    "name": "pinecone",
    "description": "use this tool when you need information about LLMs (llama 2, gpt-4, etc) or ML",
    "retriever": pinecone_retriever,
}

# web-search retriever for everything else
you_info = {
    "name": "you.com",
    "description": "use this tool for general purpose queries that can be found on the web",
    "retriever": you_retriever,
}

retriever_infos = [pinecone_info, you_info]

In [12]:
from langchain.chains.router.multi_retrieval_qa import MultiRetrievalQAChain

# build the multi-retriever QA chain from the retriever entries and the chat
# model; verbose=True prints the chain trace, including the chosen retriever
retrieval_qa = MultiRetrievalQAChain.from_retrievers(
    retriever_infos=retriever_infos,
    llm=llm,
    verbose=True,
)

In [13]:
# an LLM-related question
llm_question = {"input": "tell me about the llama 2 llm"}
res = retrieval_qa.invoke(llm_question)



[1m> Entering new MultiRetrievalQAChain chain...[0m




pinecone: {'query': 'tell me about the llama 2 LLM'}
[1m> Finished chain.[0m


In [14]:
# display the full response dict: original input, routed query, and generated result
res

{'input': 'tell me about the llama 2 llm',
 'query': 'tell me about the llama 2 LLM',
 'result': 'Llama 2 is a collection of pretrained and fine-tuned large language models (LLMs). These LLMs range in scale from 7 billion to 70 billion parameters. The main focus of Llama 2 is on optimizing the LLMs for dialogue use cases. \n\nAccording to the developers, Llama 2 models outperform open-source chat models on most benchmarks that were tested. They also claim that based on their humane evaluations for helpfulness and safety, Llama 2 models may be a suitable substitute for closed-source models.\n\nThe approach to fine-tuning and safety of the Llama 2 models is described in detail in the research work. However, the specific details of the fine-tuning and safety methods are not mentioned in the given context.\n\nOverall, Llama 2 aims to provide high-performing language models for dialogue applications, offering improved performance compared to existing open-source models.'}

In [15]:
# a general-knowledge question
web_question = {"input": "who is the German chancellor now?"}
res = retrieval_qa.invoke(web_question)



[1m> Entering new MultiRetrievalQAChain chain...[0m




you.com: {'query': 'who is the current German chancellor?'}
[1m> Finished chain.[0m


In [16]:
# display the response dict for the second question (input, routed query, result)
res

{'input': 'who is the German chancellor now?',
 'query': 'who is the current German chancellor?',
 'result': 'The current German chancellor is Olaf Scholz.'}