In [7]:
import os
import time
from typing import List
from bs4 import BeautifulSoup as Soup
import numpy as np
import ray
from langchain.document_loaders import ReadTheDocsLoader
from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader

from langchain.embeddings.base import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
# from local_embeddings import LocalHuggingFaceEmbeddings
import pinecone
import time
import os
from tqdm.auto import tqdm
from uuid import uuid4
import demoConfig

In [8]:
# add Pinecone API key from app.pinecone.io
api_key = demoConfig.pinecone_api_key
# set Pinecone environment - find next to API key in console
env = demoConfig.pinecone_env
index_name = demoConfig.pinecone_index_name
# initialize pinecone client
pinecone.init(api_key=api_key, environment=env)

In [10]:
ray.init(ignore_reinit_error=True)

2023-09-02 14:58:10,000	INFO worker.py:1452 -- Connecting to existing Ray cluster at address: 10.0.30.137:6379...
2023-09-02 14:58:10,000	INFO worker.py:1474 -- Calling ray.init() again after it has already been called.


0,1
Python version:,3.9.15
Ray version:,2.5.1
Dashboard:,http://session-qpyhypyxh4newtctgy74s7rshz.i.anyscaleuserdata.com


In [35]:
url = "https://aws.amazon.com/sagemaker/"
loader = RecursiveUrlLoader(url=url, max_depth=4, extractor=lambda x: Soup(x, "html.parser").text)

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=300,
    chunk_overlap=100,
    length_function=len,
)

In [36]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [37]:
# Stage one: read all the docs, split them into chunks.
st = time.time()
print("Loading documents ...")
docs = loader.load()
# Theoretically, we could use Ray to accelerate this, but it's fast enough as is.
chunks = text_splitter.create_documents(
    [doc.page_content for doc in docs], metadatas=[doc.metadata for doc in docs]
)
et = time.time() - st
print(f"Time taken: {et} seconds. {len(chunks)} chunks generated")

Loading documents ...
Time taken: 8.614511013031006 seconds. 4127 chunks generated


In [7]:
# Create or Reinitialize Pinecone index

import time

index_name = "llama-2-7b-example"

if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

pinecone.create_index(name=index_name, dimension=384, metric="cosine")
# wait for index to finish initialization
while not pinecone.describe_index(index_name).status["ready"]:
    time.sleep(1)

In [38]:
@ray.remote(num_gpus=0.1)
def process_shard(shard, embeddings):
    import pinecone
    # initialize pinecone client
    pinecone.init(api_key=api_key, environment=env)
    print(f"Starting process_shard of {len(shard)} chunks.")
    st = time.time()
    # batch_size = 100  # can increase but needs larger instance size otherwise instance runs out of memory
    result = Pinecone.from_documents(shard, embeddings, index_name=index_name)
    et = time.time() - st
    print(f"Shard completed in {et} seconds.")
    return None

In [39]:
# Stage two: embed the docs.
db_shards = 8
print(f"Loading chunks into vector store ... using {db_shards} shards")
st = time.time()
shards = np.array_split(chunks, db_shards)

futures = [process_shard.remote(shards[i], embeddings) for i in range(db_shards)]
# results = ray.get(futures)

et = time.time() - st
print(f"Shard processing complete. Time taken: {et} seconds.")

Loading chunks into vector store ... using 8 shards
Shard processing complete. Time taken: 0.8955435752868652 seconds.
[2m[36m(process_shard pid=164537)[0m Shard completed in 2.654496431350708 seconds.[32m [repeated 8x across cluster][0m
[2m[36m(process_shard pid=168854)[0m Starting process_shard of 516 chunks.[32m [repeated 8x across cluster][0m
