In [None]:
import pandas as pd
import pinecone
from dotenv import load_dotenv
import os

# Load environment variables (via python-dotenv) before any cell reads
# os.environ, then read the offers table from disk.
load_dotenv()
offers_csv_path = 'data/offer_retailer.csv'
df = pd.read_csv(offers_csv_path)

Remove non-ASCII characters -- Pinecone doesn't accept non-ASCII characters


In [None]:
def keep_ascii(text):
    """Return `text` with every non-ASCII character dropped.

    Iterates over `text` item by item, keeps the items for which
    `isascii()` is true, and joins them back into a single string.
    """
    kept = []
    for ch in text:
        if ch.isascii():
            kept.append(ch)
    return ''.join(kept)


# Strip non-ASCII characters from every value in the OFFER column.
df['OFFER'] = df['OFFER'].map(keep_ascii)

Handle missing data by replacing missing values with empty strings

Note: only "RETAILER" has missing values


In [None]:
# Replace every missing value in the frame with an empty string.
df = df.fillna(value="")

Load model for generating embeddings


In [None]:
from sentence_transformers import SentenceTransformer, util
# The embedding model is chosen through the MODEL_NAME environment variable.
model_name = os.environ['MODEL_NAME']
model = SentenceTransformer(model_name)

In [None]:
# Embed a throwaway string and read the length of the resulting vector;
# this is the dimension later passed to the index creation call.
sample_embedding = model.encode(["text"])[0]
dim = sample_embedding.shape[0]
dim

In [None]:
from pinecone import Pinecone

# Credentials come from the environment, never from the notebook itself.
pinecone_api_key = os.environ['PINECONE_API']
pinecone_environment = os.environ['PINECONE_ENV']

pc = Pinecone(api_key=pinecone_api_key, environment=pinecone_environment)

# Fetch the indexes that currently exist and show them as the cell output.
idxs = pc.list_indexes()
idxs

In [None]:
import time

from pinecone import Pinecone, ServerlessSpec

index_name = os.environ['IDX_NAME']

# Compare against the list of index *names*. In the Pinecone v3+ client,
# list_indexes() yields index descriptions rather than plain strings, so
# testing the name against the raw listing does not detect an existing index.
if index_name not in pc.list_indexes().names():

    pc.create_index(
        index_name, dimension=dim, metric='cosine',
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

    # Wait for the index to become ready instead of sleeping blindly.
    # Polling stops as soon as Pinecone reports readiness and gives up after
    # 120 seconds, so the wait is never longer than the previous fixed sleep.
    deadline = time.monotonic() + 120
    while time.monotonic() < deadline:
        if pc.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

In [None]:
# Open a handle to the index named by the IDX_NAME environment variable.
target_index_name = os.environ['IDX_NAME']
index = pc.Index(target_index_name)

Prepare data for pushing into Pinecone

I am also adding metadata to Pinecone

Pinecone accepts data in the following format:

```
[(key, embedding, {metadata: value}), (key, embedding, {metadata: value}), (key, embedding, {metadata: value}), ...]
```


In [None]:
# Encode from a plain Python list so that neither the encoder nor the pairing
# below depends on the DataFrame's index labels being 0..n-1.
offers = df['OFFER'].tolist()
res = model.encode(offers).tolist()

# One (id, vector, metadata) tuple per row: the offer text is the id, and the
# brand and retailer are stored as metadata. Rows are paired by position via
# zip instead of using a row label as a list index.
pinecone_data = [
    (offer, embedding, {'BRAND': brand, 'RETAILER': retailer})
    for offer, embedding, brand, retailer
    in zip(offers, res, df['BRAND'], df['RETAILER'])
]

In [None]:
# Upload in batches rather than one request, so a large dataset does not
# exceed Pinecone's per-request size limits.
index.upsert(vectors=pinecone_data, batch_size=100)

In [None]:
# Fetch the index statistics and show them as this cell's output.
index_stats = index.describe_index_stats()
index_stats

Testing to see if it works:


In [None]:
# Encode the query as a single string so the embedding is a flat list of
# floats. Encoding a one-element list gives a nested [[...]] list, which is
# not the shape the `vector` argument expects.
query_vector = model.encode("Aldi eggs").tolist()

res = index.query(
    vector=query_vector,
    top_k=3,
    include_values=False,
    include_metadata=True
)

# IDs of the returned matches, in the order Pinecone returned them.
[match.id for match in res['matches']]