## 1. Install dependencies

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# Version bounds: this notebook uses `llama_index.core` together with `ServiceContext`
# (llama-index 0.10.x) and the Deep Lake v3 API (`deeplake.deepcopy`, managed tensor_db).
# `llama-index-vector-stores-deeplake` and `python-dotenv` are imported below, so install them too.
%pip install -q "deeplake<4" langchain openai tiktoken "llama-index>=0.10,<0.11" python-dotenv
%pip install -q llama-index-embeddings-openai llama-index-llms-openai llama-index-vector-stores-deeplake

In [1]:
# Read key/value pairs from a local `.env` file into the process environment
# (presumably the OpenAI / Activeloop credentials used by later cells - confirm).
import dotenv

dotenv.load_dotenv()

True

## 2. Download data

In [2]:
# Fetch the sample essay that the rest of the notebook chunks and indexes.
# --fail: exit non-zero on an HTTP error instead of saving the error page as the essay.
# --location: follow redirects.
!mkdir -p 'data/paul_graham/'
!curl --fail --location 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -o 'data/paul_graham/paul_graham_essay.txt'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 75042  100 75042    0     0  52257      0  0:00:01  0:00:01 --:--:-- 52221


## 3. Create chunks

In [6]:
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.readers import SimpleDirectoryReader

# Load everything found in the essay directory, then split it into nodes (chunks).
essay_reader = SimpleDirectoryReader("./data/paul_graham/")
documents = essay_reader.load_data()

node_parser = SimpleNodeParser.from_defaults(chunk_size=512)
nodes = node_parser.get_nodes_from_documents(documents)

# Node ids default to random UUIDs; replace them with positional ids so that
# every run of the notebook produces the same ids.
for position in range(len(nodes)):
    nodes[position].id_ = f"node_{position}"

document_count, node_count = len(documents), len(nodes)
print(f"Number of Documents: {document_count}")
print(f"Number of nodes: {node_count} with the current chunk size of {node_parser.chunk_size}")

Number of Documents: 1
Number of nodes: 61 with the current chunk size of 512


## 4. Create Deep Lake vector store

In simple terms, do the following steps:

1. Chunk the document into smaller pieces (already done in step 3).
2. Use an embedding model to create an embedding for each chunk.
3. Save these embeddings to a local vector store using Deep Lake.

In [21]:
from llama_index.core import VectorStoreIndex, ServiceContext, StorageContext
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Local Deep Lake dataset that will hold the chunk texts, metadata and embeddings.
# `overwrite=True` presumably replaces any dataset already present at this path.
dataset_path = "./data/paul_graham/deep_lake_db"
vector_store = DeepLakeVectorStore(
    dataset_path=dataset_path,
    overwrite=True,
)

# The chat model answers questions from retrieved context; the embedding model
# turns each chunk into a vector.
llm = OpenAI(model="gpt-3.5-turbo-1106")
embed_model = OpenAIEmbedding()
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)

# Route index storage to the Deep Lake store created above.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Building the index embeds every node and uploads it to the dataset.
vector_index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
    service_context=service_context,
    show_progress=True,
)

  service_context = ServiceContext.from_defaults(
  from .autonotebook import tqdm as notebook_tqdm
Generating embeddings: 100%|██████████| 61/61 [00:02<00:00, 20.41it/s]


Uploading data to deeplake dataset.


100%|██████████| 61/61 [00:00<00:00, 234.47it/s]

Dataset(path='./data/paul_graham/deep_lake_db', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
   text       text      (61, 1)      str     None   
 metadata     json      (61, 1)      str     None   
 embedding  embedding  (61, 1536)  float32   None   
    id        text      (61, 1)      str     None   





## 5. Upload local db to cloud

Once the embeddings have been stored in your local database, you can also upload them to Deep Lake's cloud infrastructure.

In [22]:
import deeplake
import os

# Local dataset written by the vector-store step.
local = "./data/paul_graham/deep_lake_db"

# Activeloop user/organisation that will own the cloud copies. Set the
# ACTIVELOOP_USERNAME environment variable (a convention of this notebook, e.g. in
# your .env file) to upload under your own account; the default is the original
# author's account.
username = os.environ.get("ACTIVELOOP_USERNAME", "akshatsingh1718")

hub_path = f"hub://{username}/optimization_paul_graham"
hub_managed_path = f"hub://{username}/optimization_paul_graham_managed"

# First upload our local vector store
deeplake.deepcopy(local, hub_path, overwrite=True)
# Create a managed vector store under a different name
# (runtime={"tensor_db": True} requests the managed database used for deep memory below).
deeplake.deepcopy(hub_path, hub_managed_path, overwrite=True, runtime={"tensor_db": True})

Copying dataset: 96%|█████████▋| 27/28 [00:26<00:00


This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/akshatsingh1718/optimization_paul_graham
Your Deep Lake dataset has been successfully created!


Copying dataset: 96%|█████████▋| 27/28 [00:36<00:01


This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/akshatsingh1718/optimization_paul_graham_managed
Your Deep Lake dataset has been successfully created!


Dataset(path='hub://akshatsingh1718/optimization_paul_graham_managed', tensors=['embedding', 'id', 'metadata', 'text'])

In [23]:
# Open the managed dataset that was just created in Deep Lake's cloud as a vector store.
# It is opened read-only and without overwriting, so the uploaded data is left untouched.
db = DeepLakeVectorStore(
    dataset_path=hub_managed_path,
    read_only=True,
    overwrite=False,
)

Deep Lake Dataset in hub://akshatsingh1718/optimization_paul_graham_managed already exists, loading from the storage


## 6. Fetch docs and ids from the vector store

In [25]:
# Pull the stored chunk texts and their ids out of the underlying Deep Lake dataset.
_fetch_options = dict(fetch_chunks=True, aslist=True)
_text_payload = db._vectorstore.dataset.text.data(**_fetch_options)
_id_payload = db._vectorstore.dataset.id.data(**_fetch_options)

docs = _text_payload['value']
ids = _id_payload['value']
print(len(docs))

61


## 7. Generate synthetic training dataset

In [26]:
from openai import OpenAI

client = OpenAI()

def generate_question(text):
    """Ask the chat model for one question that can be answered from `text`.

    Args:
        text: The context (one chunk of the document) the question must be based on.

    Returns:
        The generated question, or the sentinel string "No question generated"
        when the request fails or the model returns no usable content. Callers
        compare against that exact sentinel.
    """
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo-1106",
            messages=[
                {
                    "role": "system",
                    "content": "You are a world class expert for generating questions based on provided context. \
                        You make sure the question can be answered by the text.",
                },
                {
                    "role": "user",
                    "content": text,
                },
            ],
        )
        question = response.choices[0].message.content
    except Exception as exc:
        # Best-effort: keep the sentinel contract, but do not swallow
        # KeyboardInterrupt/SystemExit, and surface why the call failed.
        print(f"Question generation failed: {exc!r}")
        return "No question generated"

    # The message content can be missing or blank; treat that as "no question" rather
    # than handing None or an empty string to the training set.
    if not question or not question.strip():
        return "No question generated"
    return question

In [27]:
import random
from tqdm import tqdm


def generate_queries(
    docs: list[str],
    ids: list[str],
    n: int,
    max_consecutive_failures: int = 10,
) -> tuple[list[str], list[list[tuple[str, int]]]]:
    """Build a synthetic (question, relevance) dataset from randomly drawn chunks.

    Args:
        docs: Chunk texts to generate questions from.
        ids: Chunk ids; `ids[i]` is the id of `docs[i]`.
        n: Number of questions to generate.
        max_consecutive_failures: Abort after this many failed generations in a
            row instead of retrying (and calling the API) forever.

    Returns:
        `(questions, relevances)`, each of length `n`. `relevances[i]` is
        `[(chunk_id, 1)]`: the id of the chunk question `i` was generated from,
        with 1 marking that chunk as relevant.

    Raises:
        ValueError: If `docs` is empty while `n > 0`, or there are fewer ids than docs.
        RuntimeError: If question generation fails `max_consecutive_failures`
            times in a row.
    """
    if len(ids) < len(docs):
        # Every drawn document needs a label; otherwise indexing `ids` fails mid-run.
        raise ValueError(f"Got {len(docs)} docs but only {len(ids)} ids")
    if n > 0 and not docs:
        raise ValueError("Cannot generate queries from an empty list of docs")

    questions = []
    relevances = []
    consecutive_failures = 0

    # The context manager closes the progress bar even if generation aborts.
    with tqdm(total=n) as pbar:
        while len(questions) < n:
            # 1. randomly draw a piece of text and its chunk id
            r = random.randrange(len(docs))
            text, label = docs[r], ids[r]

            # 2. generate a question for it; skip the draw if generation failed
            question = generate_question(text)
            if question == "No question generated":
                print("No question generated")
                consecutive_failures += 1
                if consecutive_failures >= max_consecutive_failures:
                    raise RuntimeError(
                        f"Question generation failed {consecutive_failures} times in a row; "
                        "check the API key, model name and network connection."
                    )
                continue
            consecutive_failures = 0

            # 3. record the question together with its relevance label
            questions.append(question)
            relevances.append([(label, 1)])
            pbar.update(1)

    return questions, relevances

In [28]:
# Build the training set: synthetic questions, each labelled with the chunk it came from.
training_set = generate_queries(docs, ids, n=40)
questions, relevances = training_set

question_count = len(questions)  # expected: 40
print(question_count)
print(questions[0])

100%|██████████| 40/40 [00:37<00:00,  1.06it/s]

40
What inspired the author to write another book on Lisp and what did the author imagine achieving from it?





## 8. Deep Memory Training

The deep memory model trains on (question, context) pairs to better understand which types of questions should return which types of contexts.

The model optimizes queries by transforming them into a space optimized for the specific use case.

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding

openai_embeddings = OpenAIEmbedding()

# Start a deep memory training job on the managed dataset.
# `embedding_function` receives a list of texts and must return one embedding per text.
# LlamaIndex's OpenAIEmbedding exposes that as `get_text_embedding_batch`
# (`embed_documents` is the LangChain name and does not exist on this class).
job_id = db._vectorstore.deep_memory.train(
    queries=questions,
    relevance=relevances,
    embedding_function=openai_embeddings.get_text_embedding_batch,
)

In [None]:
# Check the progress of the training job started by the training cell.
# `job_id` is the value returned by `deep_memory.train`; to inspect a job from an
# earlier session, pass its id string here instead.
db._vectorstore.deep_memory.status(job_id=job_id)

## 9. Inference using Deep Memory

In [None]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding


query = "What are the main things Paul worked on before college?"

# Models used to embed the query and to write the final answer.
llm = OpenAI(model="gpt-3.5-turbo-1106")
embed_model = OpenAIEmbedding()
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)

# Query the managed cloud dataset, since that is where the deep memory model was trained.
db = DeepLakeVectorStore(dataset_path=hub_managed_path, overwrite=False, read_only=True)

# No explicit storage_context here: the index must be backed by `db`, not by the local
# `vector_store` from the indexing step. `from_vector_store` derives the storage
# context from the store it is given.
vector_index = VectorStoreIndex.from_vector_store(
    db,
    service_context=service_context,
    show_progress=True,
)

# `deep_memory=True` is forwarded to the vector store so retrieval uses the trained model.
query_engine = vector_index.as_query_engine(
    similarity_top_k=3,
    vector_store_kwargs={"deep_memory": True},
)
response_vector = query_engine.query(query)
print(response_vector.response)

## 10. Evaluations (Deep memory vs Vanilla)

In [None]:
# Generate validation queries (a fresh random draw, separate from the training set)
validation_questions, validation_relevances = generate_queries(docs, ids, n=40)

# Launch the evaluation function.
# `embedding_function` receives a list of texts and must return one embedding per text;
# on LlamaIndex's OpenAIEmbedding that is `get_text_embedding_batch`
# (`embed_documents` is the LangChain name and does not exist on this class).
recalls = db._vectorstore.deep_memory.evaluate(
    queries=validation_questions,
    relevance=validation_relevances,
    embedding_function=openai_embeddings.get_text_embedding_batch,
)