In [None]:
from openai import OpenAI
import pinecone
from dotenv import load_dotenv
import os
from tqdm.auto import tqdm
from time import sleep
from datasets import load_dataset

In [None]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Initialize OpenAI
# OpenAI() is built without arguments, so it has to find its credentials on
# its own -- presumably from OPENAI_API_KEY in the environment (TODO confirm).
client = OpenAI()

MODEL = "text-embedding-ada-002"
# Vector size the Pinecone index is created with; it has to match the length
# of the vectors MODEL returns (1536 for text-embedding-ada-002).
EMBEDDING_DIM = 1536

# Initialize connection to pinecone
pinecone.init(
  api_key=os.environ.get("PINECONE_API_KEY"),
  environment=os.environ.get("PINECONE_API_ENV")
)

index_name = "zonsearch"

# Check if 'zonsearch' index already exists (only create index if not)
if index_name not in pinecone.list_indexes():
  # dimension must be a plain integer; the previous len(1536) raised TypeError
  pinecone.create_index(index_name, dimension=EMBEDDING_DIM)
# Connect to index
index = pinecone.Index(index_name)

In [None]:
def get_embedding(text, model="text-embedding-ada-002"):
  """Return the embedding of a single piece of text.

  Newlines in `text` are replaced with spaces before the request is sent.

  Args:
    text: The string to embed.
    model: Name of the embedding model passed to the embeddings endpoint.

  Returns:
    The `embedding` attribute of the first item in the response's `data`.
  """
  single_line = text.replace("\n", " ")
  response = client.embeddings.create(input=[single_line], model=model)
  first_item = response.data[0]
  return first_item.embedding

In [None]:
# Test connections to OpenAI and Pinecone
embed = get_embedding("test")
# Show the vector's length and a short preview instead of every component,
# which would flood the cell output.
print(len(embed), embed[:5])

# Last expression of the cell: displays the index statistics from Pinecone.
index.describe_index_stats()

In [None]:
# Amazon product data from HuggingFace.co (the 'train' split only)
data = load_dataset(
  'bprateek/amazon_product_description',
  split='train',
)
data

In [None]:
# Display the record at index 50 as a spot check of the loaded dataset.
data[50]

In [None]:
# Cleaning up the data
#
# Build one plain dict per product, keeping only the fields used later and
# substituting the string 'None' wherever the dataset holds a missing value.

# output key -> column name in the source dataset
field_map = {
  'id': 'Uniq Id',
  'name': 'Product Name',
  'category': 'Category',
  'description': 'About Product',
  'link': 'Product Url',
}

new_data = []

for row_idx in tqdm(range(len(data))):
  row = data[row_idx]
  new_data.append({
    key: ('None' if row[column] is None else row[column])
    for key, column in field_map.items()
  })

In [None]:
# Preview the first few cleaned records rather than rendering the whole list.
new_data[:5]

In [None]:
# Overwrite the first entry's description in place with a fixed sample text.
# NOTE(review): this discards the original description of that product --
# presumably a manual test fixture; confirm before upserting real data.
new_data[0].update({
  'description': '7 inch Chef Knife with maple handle. Made in Japan. Damascus steel blade. Comes with a sheath.',
})

In [None]:
# Embed every cleaned entry individually (one API request per entry).
embeddings = []
failed_ids = []  # ids of entries whose embedding request raised

for entry in tqdm(new_data, desc="embedding"):
    # Combine 'name', 'category', and 'description'
    combined_text = f"{entry['name']} | {entry['category']} | {entry['description']}"

    # Generate embedding
    try:
        vector = client.embeddings.create(input=[combined_text], model=MODEL).data[0].embedding
    except Exception as e:
        # Best effort: skip this entry, but record which one failed so it can
        # be retried and so the gap in 'embeddings' is not silent.
        print(f"Error embedding {entry['id']}: {e}")
        failed_ids.append(entry['id'])
    else:
        embeddings.append(vector)

# 'embeddings' now holds one vector per successfully embedded entry, in input
# order. Entries listed in 'failed_ids' are absent, so when failures occurred
# the positions in 'embeddings' no longer line up with those in 'new_data'.

In [None]:
# Report how many vectors were produced instead of printing every component.
print(f"{len(embeddings)} embeddings generated")

In [None]:
# Embedding and upserting product data (Only need to run once)
#
# Walks new_data in batches: embeds each batch's descriptions in one request,
# then upserts (id, vector, metadata) tuples into the Pinecone index.

batch_size = 100  # how many embeddings we create and insert at once
max_retries = 5   # attempts per batch before giving up
retry_delay = 5   # seconds to wait between attempts

# Cover every entry; the previous range(0, 1, batch_size) only ever ran the
# first batch.
for i in tqdm(range(0, len(new_data), batch_size)):
  # find end of batch
  i_end = min(len(new_data), i+batch_size)
  meta_batch = new_data[i:i_end]
  # get ids
  ids_batch = [x['id'] for x in meta_batch]
  # get texts to encode
  texts = [x['description'] for x in meta_batch]

  # Keep the whole response object: it carries one item per input text.
  # Retries are bounded so a permanent failure (bad key, invalid input)
  # surfaces as an error instead of looping forever.
  res = None
  for attempt in range(1, max_retries + 1):
    try:
      res = client.embeddings.create(input=texts, model=MODEL)
      break
    except Exception as e:
      print(f"Embedding batch starting at {i} failed (attempt {attempt}/{max_retries}): {e}")
      if attempt == max_retries:
        raise
      sleep(retry_delay)

  # One vector per text in the batch
  embeds = [record.embedding for record in res.data]

  # cleanup metadata ('id' is passed separately as the vector id)
  meta_batch = [{
    'name': x['name'],
    'category': x['category'],
    'description': x['description'],
    'link': x['link']
  } for x in meta_batch]
  to_upsert = list(zip(ids_batch, embeds, meta_batch))
  # upsert to Pinecone
  index.upsert(vectors=to_upsert)


In [None]:
# Search phrase to look up in the index.
# NOTE(review): left empty here as a placeholder; fill it in before running --
# the embeddings request presumably rejects an empty input (TODO confirm).
query = ""

# Embed the query with the same client and model used for the product data.
# (The module-level openai.Embedding API used before is not available here:
# only the OpenAI class is imported, and the notebook works through `client`.)
query_response = client.embeddings.create(
    input=[query],
    model=MODEL
)

# retrieve from Pinecone
xq = query_response.data[0].embedding

# get the closest matches, including their stored metadata
res = index.query(vector=xq, top_k=5, include_metadata=True)
res