# Embeddings

Date: 2024/09/10

In [1]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_text_splitters import SpacyTextSplitter
import glob

# Glob pattern selecting the plain-text documents to embed.
DOCUMENTS = "../doc/*.txt"

# OpenAI embedding model used to vectorise every text chunk.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# The splitter runs on the spaCy pipeline "en_core_web_lg", which must be
# installed beforehand:
#   python3 -m spacy download en_core_web_lg
text_spliter = SpacyTextSplitter(pipeline="en_core_web_lg", chunk_size=600)

# Chroma vector store backed by the embeddings above, persisted under ./db.
vector_store = Chroma(persist_directory="./db", embedding_function=embeddings)

In [2]:
# Rebuild the vector store from scratch: remove every stored entry, then
# split and embed each document matched by DOCUMENTS.
from pathlib import Path

# Clear all
# Only the ids are needed, so request no documents/metadatas from the store.
ids = vector_store.get(include=[])['ids']
if ids:
    # One batched delete instead of one call per id; skipped when nothing is
    # stored (NOTE(review): an empty id list may be rejected by Chroma).
    vector_store.delete(ids=ids)

# Show the store contents after clearing (expected to be empty).
print(vector_store.get())

# Document paths
file_paths = glob.glob(DOCUMENTS)

# Add embeddings
for path in file_paths:
    # The file name without its extension is used as the scene identifier;
    # Path.stem works regardless of the platform's path separator.
    scene_id = Path(path).stem

    # NOTE(review): assumes the documents are UTF-8 encoded -- confirm.
    with open(path, encoding="utf-8") as f:
        text = f.read()

    texts = text_spliter.split_text(text)
    if not texts:
        # Nothing to embed for this file.
        continue

    # Supply one metadata dict per chunk so that every chunk of the document
    # carries its scene_id, not just the first one.
    vector_store.add_texts(
        texts,
        metadatas=[{'scene_id': scene_id} for _ in texts],
    )

{'ids': [], 'embeddings': None, 'metadatas': [], 'documents': [], 'uris': None, 'data': None, 'included': ['metadatas', 'documents']}


