In [2]:
import pandas as pd
import pinecone
from dotenv import load_dotenv
import os

# Populate os.environ from a .env file; later cells read MODEL_NAME,
# PINECONE_API, PINECONE_ENV and IDX_NAME from the environment.
load_dotenv()

# Offer table that the rest of the notebook cleans, embeds and uploads.
OFFERS_CSV = 'data/offer_retailer.csv'
df = pd.read_csv(OFFERS_CSV)

Remove non-ASCII characters -- Pinecone doesn't accept non-ASCII characters in vector IDs, and the offer text is used as the ID below

In [3]:
def keep_ascii(text):
    """Return `text` with every non-ASCII character dropped.

    Characters are kept in their original order; an empty string is
    returned when nothing in `text` is ASCII.
    """
    ascii_only = filter(str.isascii, text)
    return ''.join(ascii_only)
# Strip non-ASCII characters from every offer string, element by element.
df['OFFER'] = df['OFFER'].map(keep_ascii)

Handle missing data by replacing missing values with empty strings

Note: only "RETAILER" has missing values

In [4]:
# Replace every missing value in the frame with an empty string.
df = df.fillna(value="")

Load model for generating embeddings

In [5]:
from sentence_transformers import SentenceTransformer, util
# The embedding model to load is selected through the MODEL_NAME environment variable.
model_name = os.environ['MODEL_NAME']
model = SentenceTransformer(model_name)


  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# Encode one dummy sentence and read the embedding width off the result;
# this value is used as the dimension of the Pinecone index.
probe_embeddings = model.encode(["text"])
dim = probe_embeddings.shape[1]
dim

384

In [7]:
# Authenticate first: on a fresh kernel no Pinecone call can succeed before
# pinecone.init() has been given the API key and environment.
pinecone.init(api_key=os.environ['PINECONE_API'], environment=os.environ['PINECONE_ENV'])

# List the indexes that already exist so the next step can decide whether
# a new one has to be created.
idxs = pinecone.list_indexes()
idxs

['fetchapp']

In [8]:
import time

# Configure the client (calling init again with the same credentials is harmless).
pinecone.init(api_key=os.environ['PINECONE_API'], environment=os.environ['PINECONE_ENV'])

index_name = os.environ['IDX_NAME']

# Ask Pinecone for the current list here, after init, instead of trusting a
# list captured by an earlier cell that may be stale.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=dim, metric='cosine')

    # Poll until the index reports ready rather than sleeping a fixed 120 s:
    # returns as soon as possible and fails loudly if creation hangs.
    deadline = time.monotonic() + 300
    while not pinecone.describe_index(index_name).status['ready']:
        if time.monotonic() > deadline:
            raise TimeoutError(f"Pinecone index {index_name!r} was not ready after 300 s")
        time.sleep(5)

In [9]:
# Connect to the same index that was checked/created above (named by IDX_NAME),
# instead of a hard-coded name that could drift from the environment setting.
index = pinecone.Index(os.environ['IDX_NAME'])

Prepare data for pushing into Pinecone

I am adding metadata to Pinecone as well

Pinecone accepts data in the following format:

```
[(id, embedding, {metadata_key: value}), (id, embedding, {metadata_key: value}), (id, embedding, {metadata_key: value}), ...]
```

In [10]:
# Encode every offer. A plain list is passed so that embeddings line up with
# rows by position, independent of the DataFrame's index labels.
offer_texts = df['OFFER'].tolist()
offer_embeddings = model.encode(offer_texts).tolist()

# One (id, vector, metadata) tuple per row: the offer text is the id, and the
# brand and retailer travel along as metadata.
# NOTE(review): rows that share the same OFFER text share the same id, so they
# presumably overwrite each other on upsert -- confirm this is intended.
pinecone_data = [
    (offer, embedding, {'BRAND': brand, 'RETAILER': retailer})
    for offer, embedding, brand, retailer in zip(
        offer_texts, offer_embeddings, df['BRAND'], df['RETAILER']
    )
]


In [11]:
# Push all (id, vector, metadata) tuples to the index in a single request.
# NOTE(review): the recorded outputs show 384 upserted but 375 stored vectors,
# which looks like repeated ids collapsing into one vector -- confirm.
index.upsert(vectors=pinecone_data)

{'upserted_count': 384}

In [12]:
# Fetch the index statistics (dimension, fullness, vector counts) to verify the upload.
index_stats = index.describe_index_stats()
index_stats

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 375}},
 'total_vector_count': 375}

Testing to see if it works:

In [13]:
# Encode a single string (not a one-element list) so the query vector is a
# flat list of floats rather than a nested [[...]] list.
query_vector = model.encode("Aldi eggs").tolist()

# Retrieve the 3 nearest offers; only ids and metadata are needed, not the vectors.
res = index.query(
    vector=query_vector,
    top_k=3,
    include_values=False,
    include_metadata=True,
)

# The id of each match is the offer text it was stored under.
[match.id for match in res['matches']]

['Egglife Egg White Wraps at Aldi',
 'Envy Apples pre-packed bags',
 'Envy Apples, pre-packed bags']