- The Price is Right

# Vector Store in RAG 

In [None]:
#import packages 
import os 
import re
import math 
import json
from tqdm import tqdm
import matplotlib.pyplot as plt
from dotenv import load_dotenv
from huggingface_hub import login
import numpy as np 
import pickle 
from sentence_transformers import SentenceTransformer 
from datasets import load_dataset
import chromadb
from items import Item
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from openai import OpenAI

#importing class from modules
from items import Item
from testing import Tester

In [3]:
# Load variables from a local .env file into the process environment;
# override=True lets .env values replace anything already set.
load_dotenv(override=True)

# The previous `os.environ[key] = os.getenv(key)` raised
# "TypeError: str expected, not NoneType" when a key was missing and was a
# no-op when it was present. Report missing keys by name instead.
for key in ('OPENAI_API_KEY', 'ANTHROPIC_API_KEY', 'HF_TOKEN'):
    if not os.getenv(key):
        print(f"Warning: {key} is not set; add it to .env if a later cell needs it")

# Directory holding the persistent Chroma vector store
DB = "products_vectorstore"

In [4]:
# Log in to HuggingFace with the token loaded from the environment.
# add_to_git_credential=True also stores the token in the git credential helper.
hf_token = os.environ['HF_TOKEN']
login(hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [5]:
# Load the pickled pricer training set from train.pkl in the working directory.
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# only load a train.pkl that you produced yourself or otherwise trust.
# Presumably the entries are `Item` objects (imported from items) — confirm.
with open('train.pkl', 'rb') as file:
    train = pickle.load(file)

In [6]:
# Show the full training prompt of the first item (question, product text, price line)
print(train[0].prompt)

How much does this cost to the nearest dollar?

Star Micronics USB Thermal Receipt Printer with Device and Mfi USB Ports, Auto-cutter, and Internal Power Supply - Gray (Renewed)
High-speed printing of 43 receipts per minute with easy to setup USB connection - just Plug and Print; USB serial number feature means the PC will detect the on its Windows platform using any USB port Includes PromoPRNT promotion printing service allowing you to automatically create printed promotions in addition to receipts Drop-In and Print clamshell design allows for fast and easy paper loading; patented De-Curl function always delivers a flat receipt futurePRNT Software allows you to customize receipts to keep customers coming back time and time again with professional graphics (logos, coupons), word-triggered advertising, and more The small footprint and embedded power supply saves precious counter space Brand Star Mic

Price is $207.00


# Chroma Datastore

In [7]:
# Open the persistent Chroma client backed by the directory named in DB
client = chromadb.PersistentClient(path=DB)

In [8]:
# Start from a clean slate: drop the collection if a previous run left one behind.
collection_name = "products"

# list_collections() returns Collection objects in some chromadb releases and
# plain name strings in others (0.6.x), where reading `.name` fails.
# Accept both shapes.
existing_collection_names = [
    entry if isinstance(entry, str) else entry.name
    for entry in client.list_collections()
]
if collection_name in existing_collection_names:
    client.delete_collection(collection_name)
    print(f"Deleted existing Collection:{collection_name}")

# Create a fresh, empty collection to receive the product embeddings
collection = client.create_collection(collection_name)

Deleted existing Collection:products


# Hugging Face SentenceTransformer for Embeddings

In [None]:
# Load the sentence-embedding model used for every document in the vector store
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Dry run of the encoder: encode a one-element list and keep its single vector
vector = model.encode(["Well Hi there"])[0]

In [11]:
# Inspect the raw embedding values.
# NOTE(review): this prints the whole array; vector.shape or vector[:5] would be shorter.
vector 

array([-9.46715772e-02,  4.27620076e-02,  5.51620498e-02, -5.10970887e-04,
        1.16202980e-02, -6.80130422e-02,  2.76405867e-02,  6.06974587e-02,
        2.88531017e-02, -1.74128339e-02, -4.94346246e-02,  2.30993051e-02,
       -1.28614437e-02, -4.31402586e-02,  2.17509698e-02,  4.26548198e-02,
        5.10500371e-02, -7.79727101e-02, -1.23247243e-01,  3.67455892e-02,
        4.54119081e-03,  9.47938412e-02, -5.53098843e-02,  1.70641653e-02,
       -2.92872209e-02, -4.47124578e-02,  2.06784271e-02,  6.39320314e-02,
        2.27427725e-02,  4.87789586e-02, -2.33500893e-03,  4.72859032e-02,
       -2.86259297e-02,  2.30624489e-02,  2.45130286e-02,  3.95681411e-02,
       -4.33176868e-02, -1.02316663e-01,  2.79874611e-03,  2.39304528e-02,
        1.61556639e-02, -8.99080746e-03,  2.07256041e-02,  6.40123039e-02,
        6.89179078e-02, -6.98361844e-02,  2.89758621e-03, -8.10989439e-02,
        1.71122830e-02,  2.50659091e-03, -1.06529087e-01, -4.87733483e-02,
       -1.67762171e-02, -

In [21]:
# Recover the bare product text from a training prompt by stripping the
# question/price scaffolding that was added for proprietary-model training.
def description(item):
    """Return the product text of ``item.prompt`` without the prompt wrapper.

    Every occurrence of the leading question (with its trailing blank line)
    is removed, then everything from the first "Price is $" marker (with its
    preceding blank line) onward is dropped. A prompt without the marker is
    returned whole, minus the question.
    """
    question = "How much does this cost to the nearest dollar?\n\n"
    price_marker = "\n\nPrice is $"
    without_question = item.prompt.replace(question, "")
    product_text, _, _ = without_question.partition(price_marker)
    return product_text

In [22]:
# Dry run: the first item's prompt with the question and price line stripped
print(description(train[0]))

Star Micronics USB Thermal Receipt Printer with Device and Mfi USB Ports, Auto-cutter, and Internal Power Supply - Gray (Renewed)
High-speed printing of 43 receipts per minute with easy to setup USB connection - just Plug and Print; USB serial number feature means the PC will detect the on its Windows platform using any USB port Includes PromoPRNT promotion printing service allowing you to automatically create printed promotions in addition to receipts Drop-In and Print clamshell design allows for fast and easy paper loading; patented De-Curl function always delivers a flat receipt futurePRNT Software allows you to customize receipts to keep customers coming back time and time again with professional graphics (logos, coupons), word-triggered advertising, and more The small footprint and embedded power supply saves precious counter space Brand Star Mic


In [None]:
# Embed every training item's description and load it into Chroma in batches.
BATCH_SIZE = 1000  # items encoded and written per collection.add call

for i in tqdm(range(0, len(train), BATCH_SIZE)):
    # Slice once so documents, metadata and ids are all built from the same items
    batch = train[i:i + BATCH_SIZE]
    documents = [description(item) for item in batch]
    # Convert the encoder output to nested lists of Python floats before adding
    vectors = model.encode(documents).astype(float).tolist()
    metadatas = [{"category": item.category, "price": item.price} for item in batch]
    # Size the id list from the batch itself. The old range(i, i+1000) produced
    # 1000 ids even when the final batch was shorter, giving more ids than documents.
    ids = [f"doc_{j}" for j in range(i, i + len(batch))]
    collection.add(
        ids=ids,
        documents=documents,
        embeddings=vectors,
        metadatas=metadatas
    )

## Visualization

In [48]:
# Upper bound on the number of records pulled from the vector store for plotting
MAXIMUM_DATAPOINTS = 10_000

In [49]:
# Re-open the persistent Chroma store. This repeats the DB/client setup from
# earlier in the notebook — presumably so the visualization section can run
# without re-running ingestion; confirm.
DB = "products_vectorstore"
client = chromadb.PersistentClient(path=DB)

In [50]:
# Fetch the 'products' collection populated above.
# NOTE(review): get_or_create presumably returns a new empty collection when
# 'products' does not exist, which would leave nothing to plot — confirm.
collection=client.get_or_create_collection('products')

In [51]:
# Product categories to plot, paired index-for-index with COLORS below
CATEGORIES = [
    'Appliances',
    'Automotive',
    'Cell_Phones_and_Accessories',
    'Electronics',
    'Musical_Instruments',
    'Office_Products',
    'Tools_and_Home_Improvement',
    'Toys_and_Games',
]
# Marker color per category: COLORS[k] is the color for CATEGORIES[k]
COLORS = [
    'red',
    'blue',
    'brown',
    'orange',
    'yellow',
    'green',
    'purple',
    'cyan',
]

In [52]:
# Pull up to MAXIMUM_DATAPOINTS records (embeddings, text, metadata) out of the vector store
result = collection.get(
    limit=MAXIMUM_DATAPOINTS,
    include=['embeddings', 'documents', 'metadatas'],
)

# Embeddings as one numpy array, ready for dimensionality reduction
vectors = np.array(result['embeddings'])
# Document text stored with each embedding
documents = result['documents']
# Category label of each record, read from its metadata
categories = [meta['category'] for meta in result['metadatas']]
# Plot color of each record: the COLORS entry at its category's position in CATEGORIES
colors = [COLORS[CATEGORIES.index(category)] for category in categories]

- 2D Plot

In [53]:
# Initialise t-SNE (t-distributed Stochastic Neighbor Embedding) to project the
# embeddings down to 2 dimensions; random_state is fixed so reruns use the same seed
tsne = TSNE(n_components=2, random_state=42, n_jobs=-1)

# Fit on the embeddings and return their 2D coordinates, one row per vector
reduced_vectors = tsne.fit_transform(vectors)

In [54]:
# Create the 2D scatter plot of the t-SNE projection, one marker per document,
# colored by product category
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:,0],
    y=reduced_vectors[:,1],
    mode='markers',
    marker=dict(size=3,color=colors,opacity=0.7),
)])

# Axis titles go directly on the layout for a 2D figure. The previous
# `scene=dict(...)` only configures 3D subplots, so the 2D axes had no labels.
fig.update_layout(
    title='2D Chroma Vectorstore Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=1200,
    height=800,
    margin=dict(r=20,b=10,l=10,t=40)
)
fig.show()

- 3D Plot 

In [56]:
# Initialise t-SNE (t-distributed Stochastic Neighbor Embedding) again, this
# time projecting to 3 dimensions; random_state is fixed so reruns use the same seed
tsne = TSNE(n_components=3, random_state=42, n_jobs=-1)

# Fit on the embeddings and return their 3D coordinates.
# Note: this overwrites the 2D `reduced_vectors` computed earlier.
reduced_vectors = tsne.fit_transform(vectors)

In [57]:
# Render the 3D t-SNE projection as an interactive scatter plot,
# one marker per document, colored by product category
marker_style = dict(size=3, color=colors, opacity=0.7)
points_3d = go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=marker_style,
)
fig = go.Figure(data=[points_3d])

# Title, 3D axis labels (via `scene`), canvas size and margins
axis_labels = dict(xaxis_title='x', yaxis_title='y', zaxis_title='z')
fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene=axis_labels,
    width=1200,
    height=800,
    margin=dict(r=20, b=10, l=10, t=40),
)

fig.show()