# Getting some Python libraries set up
#### Step 1: Environment setup

First, let us install the Python libraries this notebook needs.

In [1]:
! pip install --upgrade pip
! pip install -q chromadb
! pip install -q langchain
! pip install -q torch
! pip install -q InstructorEmbedding
! pip install -q sentence_transformers
! pip install -q pandas



# Generate and Ingest "product" embeddings into Vector DB

Uses Chroma DB and LangChain with a locally persisted database. 
Store embeddings and documents, then use them again later.

In [3]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.document_loaders import telegram


# Load and process documents

Next we split documents into small chunks. This is so we can find the most relevant chunks for a query and pass only those into the LLM.

## Load and initialize the instructor model


In [9]:
# Name of the Instructor embedding model to load, and the device to run it on.
EMBEDDING_MODEL_NAME = "hkunlp/instructor-large"
device_type = "cpu"

# Embedding function used when documents are written to the vector store.
embeddings = HuggingFaceInstructEmbeddings(model_name=EMBEDDING_MODEL_NAME, model_kwargs=dict(device=device_type))

load INSTRUCTOR_Transformer
max_seq_length  512


## Read the CSV file in chunks, generate embeddings for each chunk, then persist them into Chroma DB (vector database)


In [7]:
# Embed the CSV rows and store them in Chroma.
# Supplying a persist_directory will store the embeddings on disk.

import pandas as pd
from chromadb.config import Settings

PERSIST_DIRECTORY = 'db'
CHROMA_SETTINGS = Settings(
    anonymized_telemetry=False,
    is_persistent=True,
)

chunksize = 10  # number of CSV rows read (and embedded) per batch

# Open the persisted store once and reuse it for every batch, instead of
# constructing a new Chroma store/client for each chunk of the CSV.
vectordb = Chroma(
    embedding_function=embeddings,
    persist_directory=PERSIST_DIRECTORY,
    client_settings=CHROMA_SETTINGS,
)

for chunk in pd.read_csv('products_export_xs.csv', chunksize=chunksize):
    # Render the batch as text and split it into lines; [1:] drops the
    # header line so only the data rows are kept.
    # NOTE(review): a cell value containing a newline would be split across
    # several lines here - confirm the export has no multi-line fields.
    chunk_as_list = chunk.to_string().split('\n')[1:]
    print(len(chunk_as_list))
    documents = telegram.text_to_docs(chunk_as_list)
    if not documents:
        # Nothing to embed for this batch; skip the write.
        continue
    vectordb.add_documents(documents)

# Drop the store handle once ingestion is finished, as the original cell did.
vectordb = None

10
10
10
10
10
10
10
10
10
9




==================================================================================================================================================
==================================================================================================================================================
## CLEANUP the db or specific collection(s)

In [8]:
# To clean up, you can delete the collection
#vectordb.delete_collection()
#vectordb.persist()

# Or just nuke the persist directory
#!rm -rf db/