In [None]:
import dotenv
from envs import env
dotenv.load_dotenv()

In [None]:
from langchain.llms import OpenAI

llm = OpenAI(temperature=0.9)

In [None]:
from langchain.document_loaders import SeleniumURLLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone

In [None]:
from langchain.document_loaders import DirectoryLoader

loader = DirectoryLoader('/workspace/carcamp', glob="**/*.html", silent_errors=True, load_hidden=True)

In [None]:
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings()

In [None]:
import pinecone 

# initialize pinecone
pinecone.init(
    api_key=env("PINECONE_API_KEY"),  # find at app.pinecone.io
    environment=env("PINECONE_ENV")  # next to api key in console
)

index_name = "codeagi"

docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name)



In [None]:
query = "class Dealer"
res_docs = docsearch.similarity_search(query)

In [None]:
chain.run(input_documents=res_docs, question=query)

In [None]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

prompt_template = """Use the context below to use write Python code about the following topic:
    Context: {context}
    Task: {topic}
    Code:
    ```{language}
    """

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "topic", "language"]
)

llm = OpenAI(temperature=0)

chain = LLMChain(llm=llm, prompt=PROMPT, max_tokens_limit=500)

In [None]:
def generate_code(topic, language):
    docs = docsearch.similarity_search(topic, k=4)
    inputs = [{"context": doc.page_content, "topic": topic, "language": language} for doc in docs]
    txt = chain.apply(inputs)[0]['text']
    return txt.replace("'''", "")
    # return f"""```{language} {txt}"""

In [40]:
generate_code('Rewrite the Dealer Django model to be more efficient', 'python')

"\nclass Dealer(NameSlugModel):\n    street_address = models.CharField(max_length=200)\n    city = models.CharField(max_length=100)\n    state = models.CharField(max_length=100)\n    zip_code = models.PositiveIntegerField()\n    phone_number = models.CharField(max_length=30, blank=True, null=True)\n    email = models.EmailField()\n    dealer_number = models.CharField(max_length=100)\n    stripe_customer_id = models.UUIDField(blank=True, null=True)\n    aws_api_key = models.UUIDField(blank=True, null=True)\n    status = models.CharField(choices=DEALER_STATUS, default='unpaid', max_length=200)\n    website = models.URLField(blank=True, null=True)\n    ```"

## Delete the CodeAGI Index

In [None]:
pinecone.Index(index_name=index_name).delete(delete_all=True)

In [None]:
import tiktoken
enc = tiktoken.get_encoding("cl100k_base")

In [None]:
enc.encode('Test statement')

In [None]:
st = """
## Original Author

The writer's main points are that vector databases are popular for large language models, but there is a cost to introducing new infrastructure; however, the writer suggests that it might not be necessary to have a vector database. He argues that for N entities, it is possible to calculate the top k most similar entities to a given entity with O(N) complexity instead of pre-calculating and storing in a vector database, which takes O(N^2) complexity. He performed a benchmark to demonstrate the effect of the O(N) complexity and concluded that the O(N) complexity could be reasonable depending on the size of N and the latency requirements. He then discussed potential counterarguments before concluding that the right tool for the job is usually the tool you already have.

## Replies

	
jeadie 4 days ago | next [–]

I'm skeptical about some vector databases these days, but your article misses a few import points when it comes to LLMs.
1. To use LLMs effectively, you often need to generate and store more than 1 vector per document. 10 million vectors may only be 100,000 documents. This may still be enough for alot of small problems. 2. Pgvector currently has great limitations on recall/latency because underlying its ANN its using IVF (I'm currently working on adding HNSW-IVF and HNSW support to PGVector). In some cases, even elasticsearch can have issues with scale (the problem comes from the constraint of one ANN index per index segment, and immutability). 3. Pre-calculate seems like the wrong word to describe HNSW graph construction.

I think a point you miss that is important to consider for LLM + vector DBs is the fact that so much of the complexity of these uses cases cannot be captured by the vector DB (e.g. pinecone, chroma, qdrant, etc). I think there are some more end to end systems, at least in search, attempting to solve this (e.g Marqo, maybe Weaviate). Overall, I like the article. It makes a worthwhile claim and counterpoint to all the vector DB hype.

reply

	
kacperlukawski 4 days ago | parent | next [–]

I'd love to hear more about your thoughts on the complexity that cannot be, in your opinion, captured by the vector DB. I probably didn't get your point.
Disclaimer: I work for Qdrant, and we believe a database should be just a database. I remember attempting to move logic to the database layer and coupling neural encoders into the vector database sounds the same.

reply

	
isoprophlex 4 days ago | parent | prev | next [–]

> 1. To use LLMs effectively, you often need to generate and store more than 1 vector per document.
Could you elaborate this point for me? What would cause the 1 document -> ~ 100 vectors blowup; do you store vector embeddings for sections of the document, or use multiple models to create several types of vectors?

reply

	
Dachande663 4 days ago | root | parent | next [–]

If you look at something like LangChain[0], it supports/recommends splitting larger documents into smaller chunks. In this way, when doing something like semantic search you can get the specific paragraph/section that holds the closest relevance, rather than having to read the entire document again (think of a 100 page PDF).
https://python.langchain.com/en/latest/modules/indexes/text_...

reply

	
jeadie 4 days ago | root | parent | prev | next [–]

This is generally very context/use case specific. In general, if a document is a `Dict[str, Any]`, then you either have to have one (or multiple) vector(s) per field, unless you want to combine vectors across fields (it's not self-evident how you'd best do that). In saying that, specific reason's to do this (or why I've done it in the past).
1. Chunking long text fields in documents so as to get a better semantic vector for them (also you can only fit so much into an LLM). 2. Differently to 1. chunking long text fields (or even chunking images, audio, etc), is one way to perform highlighting. It helps to answer the question, for example, for a given document what about it was the reason it was returned? You can then point to the area in the image/text/audio that was most relevant. 3. You may want to run different LLMs on different fields (perhaps a separate multi-modal LLM vs a standard text LLM), or like another comment said have different transforms/representations of the same field.

Perhaps 100 vectors is non-standard, but definitely not unseen.

reply

	
jkb79 4 days ago | root | parent | next [–]

Only Vespa allows you to index multiple vectors per schema field, avoiding duplicating all the meta data of the document into the "chunk", and avoids maintaining the document to chunk fan-out. See https://blog.vespa.ai/semantic-search-with-multi-vector-inde...
reply

	
Sai_ 4 days ago | root | parent | prev | next [–]

I’m not a data scientist but I think I know why one document could lead to many vectors.
(Happy to be corrected and/or schooled.)

A vector is a list of numbers each of which represents weight accorded to a certain word along a certain dimension.

Let’s take an example.

Is an “apple” a “positive” or a “negative” thing? Most people would associate positivity with apples. So, for the general population, the vector for “apple” along the 0-1 continuum where 0 represents negative sentiment and 1 represents positive sentiment would be something like [0.8].

Let’s add one more dimension. Is an apple associated with computers (1) or not (0)? For the majority of the world where Windows has a massive market share, “apple” would recall a fruit, not a sleek laptop. Therefore, the vector for apple along the computer/non-computer dimension is probably [0.3].

Taking this together, apple = [0.8, 0.3] where positionally, 0.8 is the value for positive/negative sentiment while 0.3 is computer/non-computer.

Agree?

(Hoping you do)

But that [0.8, 0.3] vector is for the general population.

Would a bible literalist who publishes blogs on bible stories feel the same way?

For someone like that, the notion of the original sin could taint their sentiments towards the apple. So they might weight an apple at 0.2 on the positive/negative line. Since they’re bloggers, it’s more likely they associate apple with computers so they might call it 0.5. Therefore, their apple vector is [0.2, 0.5].

Extend this to more content and you’ll see why there are more than one vector.

At least that’s how I understood it. Happy to be corrected and/or schooled.

reply

	
ruslandanilin 4 days ago | root | parent | next [–]

In my opinion, you could represent "apple" as a vector, for example, [0.99, 0.3, 0.7] in relation to [fruits, computers, religion]. Then, you can create different user vectors that describe the interests of various groups. For instance, the general population might have a vector like [0.8, 0.2, 0.1], geeks as [0.6, 0.95, 0.05], and religious people as [0.7, 0.1, 0.95].
By creating these user vectors, you can compare them with the "apple" vector and find the best match using ANN. This approach allows you to determine which group is most interested in a given context or aspect of the word "apple." The ANN will help you identify similarities or patterns in the user vectors and the "apple" vector to find the most relevant matches.

Thank you

reply

	
Sai_ 3 days ago | root | parent | next [–]

I don’t know what ANN is but your comment raises two questions in my mind -
1. Where did your first vector of [0.99, 0.3, 0.7] come from? You later present the concept of user vectors which are vectors for different cohorts of users but don’t name the first vector as a user vector.

2. I feel my example of vectors for “general population users” and “bible literalist blogger” user aligns with your “user vector” concept.

reply

	
thanatropism 4 days ago | root | parent | prev | next [–]

Modern text embeddings are not word-based like that.
reply

	
Sai_ 3 days ago | root | parent | next [–]

If my understanding and explanation are directionally correct, I’m happy. I’ll be the first one to admit I’m not a data scientist.
Do you have a good example of how an actual data scientist would present the idea of vectors as applied to sentences/documents to a layperson?

reply

	
teaearlgraycold 4 days ago | root | parent | prev | next [–]

Storing one document as one embedding is like making a movie poster the average of all frames in the film.
reply

	
jkb79 4 days ago | root | parent | next [–]

That is a very good analogy!
reply

	
teaearlgraycold 3 days ago | root | parent | next [–]

:D Thanks!
reply

	
amitport 4 days ago | root | parent | prev | next [–]

One thing others didn't mention is that "document" is a general term but in some cases (e.g., question answering) the typical document can be a very short paragraph and take much less memory than the vector. Also note that with some ML architecture the vector is very large (e.g., an entire very layer output)
reply

	
ethanahte 4 days ago | parent | prev | next [–]

Hi, author here.
1. You make a great point about longer documents requiring multiple vectors which I should've mentioned in the post. Depending on your use case, this can certainly explode your dataset size! 2. Good to know about the pgvector limitations -- I haven't used it yet. 3. I guess "index" would be the more database-y term. That said, one thing I'll call out is that you have to re-index if you ever change your embedding model, and indexing can be slow. It took me ~20-30 minutes to index the 10 million embeddings in my benchmark.

reply

	
kordlessagain 4 days ago | root | parent | next [–]

I'm interested if anyone has some hard data on the "best" size of the document "fragments" that are used for embedding into a dense vector.
Obviously, embedding single words probably aren't particularly useful for reassembling portions of a document for submission to an LLM in the prompt. I'm currently pondering on what size of string is best for embedding, and considering a variable size might be one option.

Testing with strings around 512 characters seem to do pretty well, but it may be storing multiple lengths of similar runs in the document might be a better way to do it.

reply

Summarize the text above.

The original author discussed the cost of introducing new infrastructure for vector databases, suggesting that it might not be necessary if one can calculate the most similar entities with O(N) instead of O(N^2) complexity. Other commenters discussed the complexity of LLMs and vector databases, the need to generate more than one vector per document, and the benefits of chunking documents for semantic search. They also compared different user vectors and discussed the need for ML architectures with large vectors. In conclusion, the right tool for the job is usually the tool you already have.
"""

In [None]:
enc.encode(st)

In [None]:

def chunk_text(text, limit): 
    chunked_text = [] 
    current_chunk = "" 
    for word in text.split(): 
        current_chunk += word + " " 
        if len(current_chunk) > limit: 
            chunked_text.append(current_chunk) 
            current_chunk = "" 
    if current_chunk != "": 
        chunked_text.append(current_chunk) 
    return chunked_text 

In [None]:
chunk_text(st, 500)