In [None]:
pip install -r ./requirements.txt -q

In [None]:
import os
from dotenv import find_dotenv, load_dotenv

# Locate the .env file first, then load it; override=True is passed so values
# from the file take precedence over variables that are already set.
env_file = find_dotenv()
load_dotenv(env_file, override=True)

# Embeddings
Core of LLM applications, text embeddings are numeric representations of text and are used in NLP and ML tasks

The distance between 2 embeddings or 2 vectors measures their relatedness which translates to the relatedness between the text concepts they represent.
* Similar embeddings or vectors represent similar concepts

**Text Classification:** assigning a label to a piece of text
**Text Clustering:** grouping together pieces of text that are similar in meaning
**Question-Answering:** answering a question posed in natural language

# Introduction to Vector Databases
Challenges
- Efficient data processing
- Many of the latest AI apps rely on **vector embeddings**.
  - Vector embeddings are numeric representations of text that carry semantic information. They represent text as a set of numbers in a high-dimensional space, and the numbers capture the meaning of the words in the text.

## Vector Database Solutions
- Pinecone
- Chroma
- Milvus
- qdrant

## Vector Databases
- New type of database, designed to store and query unstructured data
- Unstructured data is data that does not have a fixed schema, such as text, images, and audio

### Pipeline for Vector Databases
Vector databases use a combination of different optimized algorithms that all participate in Approximate Nearest Neighbor (ANN) search.

### High Level Flow
1. Content is converted into Vector Embedding using embedding model
2. Vector Embedding is then indexed into the Vector Database
3. Application will make a query which is similarly converted to a Vector Embedding using the same embedding model
4. Query Vector Embedding is used to query Vector Database for similar content using ANN
5. Query result is returned

Example: if a company wanted to query its documents:
1. Each document would be converted to a Vector Embedding through an Embedding Model
2. The document Vector Embeddings would be stored in a Vector Database, with each vector embedding referencing its source document
3. Finally, the company can query against the Vector Database

# Pinecone
Used in OPL stack (OpenAI, Pinecone, LangChain)

## Indexes
An index is the highest-level organizational unit of vector data in Pinecone. It accepts and stores vectors, serves queries over the vectors it contains, and does other operations over its contents.

**Serverless Indexes:** You don't configure or manage any compute or storage resources (they scale automatically)
**Pod-based Indexes:** You choose one or more pre-configured units of hardware (pods)

## Working with Vectors
(See code below on how to create/delete indexes and insert vectors into indexes)

## Namespaces
Pinecone allows you to partition the vectors in an index into namespaces. Queries and other operations are scoped to a specific namespace allowing different requests to search different subsets of your index

**Key information about namespaces:**
- Every index consists of 1 or more namespaces
- Each vector exists in only 1 namespace
- Namespaces are uniquely identified by a namespace name
- The default namespace is represented by the empty string and is used if no specific namespace is specified


In [None]:
pip install -q pinecone-client

In [None]:
pip install --upgrade -q pinecone-client

In [None]:
pip show pinecone-client

In [None]:
from pinecone import Pinecone
# Create the client without arguments so the API key is taken from the environment.
# If the env var is not available, pass the key explicitly instead:
#   pc = Pinecone(api_key='YOUR_API_KEY')
pc = Pinecone()
# List the indexes the client can currently see.
pc.list_indexes()

In [None]:
# Create Pinecone Index (only when no index with this name exists yet)
from pinecone import PodSpec

index_name = 'langchain'
existing_index_names = pc.list_indexes().names()

if index_name in existing_index_names:
  print(f'Index {index_name} already exists!')
else:
  print(f'Creating index {index_name}')
  # Pod-based index in the 'gcp-starter' environment.
  pod_spec = PodSpec(environment='gcp-starter')
  pc.create_index(
    name=index_name,
    dimension=1536,  # the vectors upserted later in this notebook have 1536 components
    metric='cosine',
    spec=pod_spec
  )
  print('Index Created!')

In [None]:
# List the indexes again to check the outcome of the creation cell.
pc.list_indexes()

In [None]:
# Delete Pinecone Index
# NOTE(review): the cells that follow open this index with pc.Index(...); if this
# cell deletes it, re-run the creation cell before continuing.
index_name = 'langchain'
index_is_present = index_name in pc.list_indexes().names()

if not index_is_present:
  print(f'Index {index_name} does not exist.')
else:
  print(f'Deleting index {index_name} ...')
  pc.delete_index(index_name)
  print('Done')

In [None]:
# Describe Pinecone Index
# Get a handle to the index named by index_name and display its statistics.
# NOTE(review): index_name comes from an earlier cell; if the delete cell was just
# run, the index presumably has to be re-created first.
index = pc.Index(index_name)
index.describe_index_stats()

In [None]:
# inserting vectors
import random

# 5 random vectors with 1536 components each (the dimension used when the index is created)
vectors = [[random.random() for _ in range(1536)] for _ in range(5)]
ids = list('abcde')

index_name='langchain'
index = pc.Index(index_name)
# Pair each id with its vector. The pairs are materialized as a list because the
# client documents a list of tuples, and a zip object can only be consumed once.
index.upsert(vectors=list(zip(ids, vectors)))

In [None]:
# updating vectors
# Upsert again under the existing id 'c', this time with a 1536-component
# vector of 0.5s, to update what is stored for that id.
index.upsert(vectors=[('c', [0.5] * 1536)])

In [None]:
# fetch vector by ID
# No namespace is passed, so ids 'c' and 'd' are looked up in the default namespace.
index.fetch(ids=['c', 'd'])

In [None]:
# delete vectors by ID
# Removes ids 'b' and 'c'; no namespace is passed, so this targets the default namespace.
index.delete(ids=['b','c'])

In [None]:
# Re-check the index statistics after deleting 'b' and 'c'.
index.describe_index_stats()

In [None]:
# query
# Build a random 1536-component vector to search with.
# Relies on `random` having been imported by an earlier cell.
query_vector = [random.random() for _ in range(1536)]

In [None]:
# Query and return top 3 most similar vectors
# (the index is created with metric='cosine' earlier in this notebook)
index.query(
  vector=query_vector,
  top_k=3, # return only the 3 closest matches
  include_values=False # presumably leaves the stored vector values out of the results; confirm in the client docs
)

### Namespaces

In [None]:
# Inspect the index before adding namespaced data
# NOTE(review): nothing in this cell empties the index; vectors left by earlier
# cells will still be counted in the statistics.
import random

index = pc.Index('langchain')
# Printed explicitly: only a cell's last expression is displayed automatically,
# so a bare call in the middle of the cell would show nothing.
print(index.describe_index_stats())

# insert random vectors into default namespace (no namespace argument is given)
vectors = [[random.random() for _ in range(1536)] for _ in range(5)]
ids = list('abcde')
# Materialize the (id, vector) pairs as a list, the form the client documents.
index.upsert(vectors=list(zip(ids, vectors)))

In [None]:
# add 3 vectors to 'first-namespace' namespace
vectors = [[random.random() for _ in range(1536)] for _ in range(3)]
ids = list('xyz')
# Materialize the (id, vector) pairs as a list, the form the client documents,
# instead of passing a one-shot zip iterator.
index.upsert(vectors=list(zip(ids, vectors)), namespace='first-namespace')

In [None]:
# add 2 vectors to 'second-namespace' namespace
vectors = [[random.random() for _ in range(1536)] for _ in range(2)]
ids = list('qp')
# Materialize the (id, vector) pairs as a list, the form the client documents,
# instead of passing a one-shot zip iterator.
index.upsert(vectors=list(zip(ids, vectors)), namespace='second-namespace')

In [None]:
# shows namespaces in index
# After the upserts above, the statistics should list the default namespace
# together with 'first-namespace' and 'second-namespace'.
index.describe_index_stats()

In [None]:
# A fetch is scoped to one namespace: to read a vector stored outside the default
# namespace, the namespace must be passed explicitly in the fetch call.
index.fetch(ids=['x'], namespace='first-namespace')

In [None]:
# Delete the single vector 'x' from 'first-namespace'; the namespace is passed
# explicitly, as in the fetch call.
index.delete(ids=['x'], namespace='first-namespace')

In [None]:
# Delete every vector in 'first-namespace' (delete_all=True);
# this will also delete the namespace itself.
index.delete(delete_all=True, namespace='first-namespace')

In [None]:
# Final look at the index statistics after the namespace deletions.
index.describe_index_stats()

# Splitting and Embedding Text Using LangChain