In [115]:
import os

In [116]:
from pinecone import Pinecone

In [117]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment; the os.getenv
# lookups in later cells (PINECONE_KEY, PINECONE_ENV, GROQ_API_KEY) read them from there.
load_dotenv()

True

In [118]:
import json

## VectorDB Init

In [119]:
# Pinecone connection settings.
# The key and environment are read from the process environment and are None
# when the variable is not set; the index name is fixed for this notebook.
PINECONE_INDEX_NAME = "default"
PINECONE_KEY = os.environ.get("PINECONE_KEY")
PINECONE_ENV = os.environ.get("PINECONE_ENV")

In [120]:
from pinecone import PodSpec

# Index parameters. They mirror the index recorded in this cell's output
# (dimension 1024, cosine metric), which is the index the 'WhereIsAI/UAE-Large-V1'
# embeddings are upserted into later in the notebook. The previous values
# (1536 / dotproduct) did not match that index.
EMBEDDING_DIMENSION = 1024
SIMILARITY_METRIC = "cosine"

pc = Pinecone(api_key=PINECONE_KEY)
index_list = pc.list_indexes()

# Create the index only when *this* index is missing. Testing for an empty
# project would skip creation whenever any unrelated index already exists,
# and describe_index() below would then fail.
if PINECONE_INDEX_NAME not in index_list.names():
    if not PINECONE_ENV:
        raise RuntimeError(
            "PINECONE_ENV must be set to create the index "
            "(for example the pod environment name of your project)."
        )
    print("Creating index...")
    # The class-based Pinecone client requires a `spec` describing where the
    # index is hosted; PINECONE_ENV supplies the pod environment.
    pc.create_index(
        name=PINECONE_INDEX_NAME,
        dimension=EMBEDDING_DIMENSION,
        metric=SIMILARITY_METRIC,
        spec=PodSpec(environment=PINECONE_ENV),
    )

# Show the index configuration, then open a handle used for upserts/queries.
print(pc.describe_index(PINECONE_INDEX_NAME))
index = pc.Index(PINECONE_INDEX_NAME)

{'dimension': 1024,
 'host': 'default-09vtdyw.svc.gcp-starter.pinecone.io',
 'metric': 'cosine',
 'name': 'default',
 'spec': {'pod': {'environment': 'gcp-starter',
                  'pod_type': 'starter',
                  'pods': 1,
                  'replicas': 1,
                  'shards': 1}},
 'status': {'ready': True, 'state': 'Ready'}}


In [121]:
# Load the business record that will be embedded and stored in the vector DB.
business_data_path = 'business_data.json'
# Decode explicitly as UTF-8 instead of relying on the platform's default
# locale encoding, which can mis-read non-ASCII text on some systems.
with open(business_data_path, encoding='utf-8') as f:
    data = json.load(f)

print(data)

{'company_name': 'XYZ Corporation', 'id': 2, 'revenue': '$20,000,000', 'industry': 'Product Technology', 'USP': 'Best AI solutions', 'sales': '$10,000,000', 'profit': '$4,000,000', 'employees': 200, 'age': '2 years'}


## Embeddings

In [122]:
import torch
from angle_emb import AnglE

# Load the 'WhereIsAI/UAE-Large-V1' encoder with CLS pooling.
angle = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1', pooling_strategy='cls')
# Move the model to the GPU only when one is available, so the notebook also
# runs on CPU-only machines instead of failing on an unconditional .cuda().
if torch.cuda.is_available():
    angle = angle.cuda()

# The whole record is embedded as one text: str(data) is the dict's Python repr.
# With to_numpy=True the result is a NumPy array; the recorded output shows a
# 2-D array holding a single row for this single input.
emb_vectors = angle.encode(str(data), to_numpy=True)
print(emb_vectors)


[[ 0.25822684 -0.10153604  0.47885266 ...  0.0107529  -0.3698256
   0.34391046]]


In [123]:
# One vector per record: the record's own id (as a string) is the vector id
# and the full record is stored alongside it as metadata.
ids = [str(data['id'])]
meta_data = [data]

# zip() silently truncates to the shortest input, so fail loudly on a mismatch.
assert len(ids) == len(emb_vectors) == len(meta_data), \
    "ids, embeddings and metadata must have the same length"

# Build a concrete list (a bare zip iterator is exhausted after one use, so
# re-running the upsert would send nothing) and turn each NumPy row into a
# plain list of floats.
to_upsert = [
    (vector_id, vector.tolist(), metadata)
    for vector_id, vector, metadata in zip(ids, emb_vectors, meta_data)
]
index.upsert(vectors=to_upsert)

{'upserted_count': 1}

## Retrieval QA

In [124]:
from groq import Groq

# Groq API client for the retrieval-QA section; the key is taken from the
# GROQ_API_KEY environment variable (None when it is not set).
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

In [125]:
# import numpy as np
# user_question = 'What is Product Technology'
# question_vector = angle.encode(str(user_question), to_numpy=True)
# print(type(question_vector))
# print(question_vector)

In [126]:
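# NOTE (disabled draft): angle.encode(...) returns a 2-D array (one row per input),
# so question_vector.tolist() is already a nested list. Wrapping it in another
# list nests it one level too deep for `vector=`, which expects a flat list of
# floats; use question_vector[0].tolist() when re-enabling this cell.
# similar_vector_ids = index.query(vector=[question_vector.tolist()], top_k=5)['matches']
# print(similar_vector_ids)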