In [None]:
! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain youtube-transcript-api pytube langchain-huggingface

In [None]:
import getpass
import os

# LangSmith tracing configuration.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# Never hard-code secrets in the notebook. Keep any key that is already set in
# the environment and prompt (without echo) only for the ones that are missing.
for _key_name in ('LANGCHAIN_API_KEY', 'OPENAI_API_KEY'):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass.getpass(f'{_key_name}: ')

In [None]:
import uuid

from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import uuid

from langchain.storage import InMemoryByteStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.retrievers.multi_vector import MultiVectorRetriever

<mark> Part 12: Multi-representation Indexing

In [None]:
# Blog posts to index; each URL is loaded and its documents are appended to `docs`.
urls = [
    "https://lilianweng.github.io/posts/2017-06-21-overview/",
    "https://lilianweng.github.io/posts/2024-02-05-human-data-quality/",
]

docs = []
for url in urls:
    loader = WebBaseLoader(url)
    docs.extend(loader.load())

In [None]:
# Chat model that writes the summaries (client-side retries disabled).
llm = ChatOpenAI(model="gpt-3.5-turbo", max_retries=0)

# Prompt asking for a short, keyword-rich summary of the text bound to {doc}.
template_summary = """
Summarize the following document in maximum 100 words.
Please include important/key words in summary.

{doc}
"""
prompt_summary = ChatPromptTemplate.from_template(template_summary)

# Chain: Document -> {'doc': page_content} -> prompt -> LLM -> plain string.
summary_chain = (
    {'doc': lambda document: document.page_content}
    | prompt_summary
    | llm
    | StrOutputParser()
)

In [None]:
# Run the summary chain over every loaded document, with max_concurrency set to 5.
summaries = summary_chain.batch(docs, config={"max_concurrency": 5})
summaries

['The document provides an overview of deep learning, focusing on its models like Convolutional Neural Networks, Recurrent Neural Networks, Autoencoders, Reinforcement Learning, and Generative Adversarial Networks. It explains why deep learning works now due to the availability of more data and powerful computers. The post also mentions toolkits, libraries, and resources for learning deep learning, and suggests starting with the book "Deep Learning" by Goodfellow, Bengio, and Courville. It emphasizes the importance of curiosity and passion in staying updated with the field. Key terms include deep learning models, reinforcement learning, generative adversarial network, and toolkits.',
 'The document discusses the importance of high-quality human data for training deep learning models. It covers topics such as human raters, the wisdom of the crowd, rater agreement, data quality for model training, influence functions, prediction changes during training, and noisy cross-validation. Variou

In [None]:
# Embedding model handed to the Chroma collection that indexes the summaries.
embeddings = HuggingFaceEmbeddings(
    model_name="thenlper/gte-small",
)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/68.1k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/66.7M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Metadata key that links each summary to its full parent document
id_key = "doc_id"

# Storage layer for the parent documents
store = InMemoryByteStore()

# Vector store that indexes the summaries
vectorstore = Chroma(
    embedding_function=embeddings,
    collection_name="summaries",
)

# Multi-vector retriever built on the vector store and the byte store,
# using `id_key` as the shared document identifier
retriever = MultiVectorRetriever(
    id_key=id_key,
    byte_store=store,
    vectorstore=vectorstore,
)

# One fresh UUID string per loaded document
doc_ids = [str(uuid.uuid4()) for _ in range(len(docs))]
print(doc_ids)

['6bc5627b-6687-44fc-b765-da0694f85305', '871fed88-cb64-41fa-9a45-9d31de5543a0']


In [None]:
# Wrap every summary in a Document tagged with the id of its source document
summary_docs = [
    Document(page_content=summary_text, metadata={id_key: parent_id})
    for parent_id, summary_text in zip(doc_ids, summaries)
]
display(summary_docs[0])

# Index the summaries in the vector store, then store the full documents
# in the docstore under the same ids
retriever.vectorstore.add_documents(summary_docs)
retriever.docstore.mset([(parent_id, parent_doc) for parent_id, parent_doc in zip(doc_ids, docs)])

Document(metadata={'doc_id': '6bc5627b-6687-44fc-b765-da0694f85305'}, page_content='The document provides an overview of deep learning, focusing on its models like Convolutional Neural Networks, Recurrent Neural Networks, Autoencoders, Reinforcement Learning, and Generative Adversarial Networks. It explains why deep learning works now due to the availability of more data and powerful computers. The post also mentions toolkits, libraries, and resources for learning deep learning, and suggests starting with the book "Deep Learning" by Goodfellow, Bengio, and Courville. It emphasizes the importance of curiosity and passion in staying updated with the field. Key terms include deep learning models, reinforcement learning, generative adversarial network, and toolkits.')

In [None]:
# Search the summary index directly: the single best hit is a summary Document,
# not the full parent document.
query = "Memory in agents"
sub_docs = vectorstore.similarity_search(query=query, k=1)
sub_docs[0]

Document(metadata={'doc_id': '6bc5627b-6687-44fc-b765-da0694f85305'}, page_content='The document provides an overview of deep learning, focusing on its models like Convolutional Neural Networks, Recurrent Neural Networks, Autoencoders, Reinforcement Learning, and Generative Adversarial Networks. It explains why deep learning works now due to the availability of more data and powerful computers. The post also mentions toolkits, libraries, and resources for learning deep learning, and suggests starting with the book "Deep Learning" by Goodfellow, Bengio, and Courville. It emphasizes the importance of curiosity and passion in staying updated with the field. Key terms include deep learning models, reinforcement learning, generative adversarial network, and toolkits.')

In [None]:
# Query through the multi-vector retriever and preview the first 500 characters
# of the top result.
# `invoke` is the supported entry point; `get_relevant_documents` is deprecated.
# The former `n_results=1` argument is dropped because it is not a
# MultiVectorRetriever search parameter (the number of vector-store hits is
# configured through the retriever's `search_kwargs`, e.g. {"k": 1}).
retrieved_docs = retriever.invoke(query)
retrieved_docs[0].page_content[0:500]

"\n\n\n\n\n\nAn Overview of Deep Learning for Curious People | Lil'Log\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nLil'Log\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nPosts\n\n\n\n\nArchive\n\n\n\n\nSearch\n\n\n\n\nTags\n\n\n\n\nFAQ\n\n\n\n\nemojisearch.app\n\n\n\n\n\n\n\n\n\n      An Overview of Deep Learning for Curious People\n    \nDate: June 21, 2017  |  Estimated Reading Time: 12 min  |  Author: Lilian Weng\n\n\n \n\n\nTable of Contents\n\n\n\nWhy Does Deep Learning Work Now?\n\nDeep Learning Models\n\nConvolutional Neural Network\n\nRecurrent Neural Network\n\nRNN: Sequence-t"

<mark> Part 13: RAPTOR