In [None]:
from pinecone.grpc import PineconeGRPC
import os
from datetime import datetime
import ray
import numpy as np
import time
import random
import sys

In [2]:
# Prefer a key that is already exported in the environment so the secret never
# has to be saved inside the notebook; the placeholder literal is only a
# fallback for when the variable is not set.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "YOUR-API-KEY-HERE")
# (Re-)export the key so that code reading it from the environment, such as the
# `ray.init` runtime_env, sees the same value.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [None]:
# Initialize Ray, forwarding the Pinecone API key from this process's
# environment to the runtime environment's `env_vars`.
ray.init(
    runtime_env=dict(
        env_vars={"PINECONE_API_KEY": os.environ.get("PINECONE_API_KEY")},
    ),
)

In [None]:
# Name of the Pinecone index that receives the upserts.
index_name = "YOUR-PINECONE-INDEX-NAME-HERE"

# Base row count; the Ray Data cell multiplies it to size each `upload` call.
batch_size = 350

# Per-request limits, from https://www.pinecone.io/blog/working-at-scale/
MAX_BATCH_LENGTH = 300
MAX_BYTES_SIZE_PER_REQUEST = 2 * 1024 ** 2  # 2 MiB

In [None]:
def format_records(row):
    """Reshape one flattened row into the record layout used for upserting.

    Undoes `flatten_metadata_col()` from `2_merge_embeddings.ipynb`: the
    `metadata_*` columns and `text` are folded into a single nested
    `metadata` dict. `row` is modified in place and is also returned.
    """
    nested_metadata = dict(
        document_id=row["metadata_document_id"],
        # A raw np.ndarray of floats in rand_coeff is rejected on upsert with:
        # "Metadata value must be a string, number, boolean or list of strings"
        # so every coefficient is stored as a string instead.
        rand_coeff=list(map(str, row["metadata_rand_coeff"].tolist())),
        source=row["metadata_source"],
        timestamp=row["metadata_timestamp"],
        text=row["text"],
    )
    row["metadata"] = nested_metadata

    # The flattened columns now live inside `metadata`; drop the originals.
    for flattened_key in (
        "metadata_document_id",
        "metadata_rand_coeff",
        "metadata_source",
        "metadata_timestamp",
        "text",
    ):
        del row[flattened_key]

    return row

def process_batch(batch):
    """Convert a column-oriented batch into a list of formatted row records.

    `batch` maps each column name to a sequence of values. The i-th entries of
    all columns are zipped into one row dict, which is then passed through
    `format_records`.
    """
    column_names = list(batch.keys())
    columns = [batch[name] for name in column_names]
    return [
        format_records(dict(zip(column_names, row_values)))
        for row_values in zip(*columns)
    ]


In [None]:
# Utilities to calculate row byte size, used in determining how 
# many rows to batch together for the upsert call.
def _get_row_size(row):
    """Estimate one record's size in bytes: its id, vector values and metadata."""
    id_size = sys.getsizeof(row["id"])
    values_size = row["values"].nbytes
    return id_size + values_size + _get_row_md_size(row)

def _get_row_md_size(row):
    md = row["metadata"]
    return (
        # from https://www.pinecone.io/blog/working-at-scale/,
        # get utf-8 encoded size to get the most accurate estimate for grpc call.
        # + sys.getsizeof(md["document_id"]) + sys.getsizeof(md["source"])
        + sys.getsizeof(md["document_id"]) + len(md["source"].encode('utf-8'))
        + len(md["text"].encode('utf-8')) + md["timestamp"].nbytes
        # + sys.getsizeof(md["text"]) + md["timestamp"].nbytes
        + sum([sys.getsizeof(c) for c in md["rand_coeff"]])
    )

def chunker(seq, max_bytes=None, max_length=None, row_size_fn=None):
    """Yield consecutive, non-empty batches of `seq` that respect the request limits.

    Rows are packed greedily in their original order. The current batch is
    closed when adding the next row would reach `max_bytes`, or when the batch
    already holds `max_length` rows. Chunking by byte size avoids errors like:
    UNKNOWN:Error received from peer ipv4:52.41.228.72:443 grpc_message:"Request size 4MB exceeds the maximum supported size of 2MB"

    Args:
        seq: Iterable of row records.
        max_bytes: Byte budget per batch; defaults to `MAX_BYTES_SIZE_PER_REQUEST`.
        max_length: Maximum number of rows per batch; defaults to `MAX_BATCH_LENGTH`.
        row_size_fn: Callable returning the estimated byte size of one row;
            defaults to `_get_row_size`.

    Yields:
        Lists of rows. An empty list is never yielded: an empty `seq` produces
        no batches, and a row that on its own reaches `max_bytes` is emitted as
        a single-row batch so the caller decides how to handle it.
    """
    # Defaults are resolved at call time so that edits to the notebook-level
    # constants take effect without redefining this function.
    if max_bytes is None:
        max_bytes = MAX_BYTES_SIZE_PER_REQUEST
    if max_length is None:
        max_length = MAX_BATCH_LENGTH
    if row_size_fn is None:
        row_size_fn = _get_row_size

    curr_batch = []
    curr_batch_size = 0
    for row in seq:
        row_size = row_size_fn(row)
        fits = (
            curr_batch_size + row_size < max_bytes
            and len(curr_batch) < max_length
        )
        # Only close a batch that actually holds rows; otherwise an oversized
        # first row would cause an empty batch to be emitted.
        if curr_batch and not fits:
            yield curr_batch
            curr_batch = []
            curr_batch_size = 0
        curr_batch.append(row)
        curr_batch_size += row_size
    if curr_batch:
        yield curr_batch

def split_batch(batch, n):
    """Split `batch` into at most `n` contiguous, non-empty sub-batches.

    This is a generator. Every sub-batch has `ceil(len(batch) / n)` items,
    except possibly the last one, which holds the remainder. When `n` is
    larger than `len(batch)` each item becomes its own sub-batch. An empty
    `batch` yields nothing.

    Raises:
        ValueError: if `batch` is non-empty and `n` is not positive.
    """
    total = len(batch)
    if total == 0:
        return
    if n <= 0:
        raise ValueError(f"n must be a positive integer, got {n}")
    # Ceiling division: a floor here could give a step of 0 (n > total) or
    # produce more than n sub-batches when total is not a multiple of n.
    sub_batch_size = -(-total // n)
    for start in range(0, total, sub_batch_size):
        yield batch[start:start + sub_batch_size]

In [None]:
def upload(big_batch):
    """Upsert one large batch of embeddings to Pinecone and report counters.

    `big_batch` is converted to row records with `process_batch` and split
    into request-sized batches with `chunker`. Each batch is upserted with a
    basic wait-and-retry scheme (up to 5 attempts). When a batch is rejected
    with a payload size error, retrying is skipped and every vector of that
    batch is uploaded individually instead, so that only the offending
    vectors are lost.

    Args:
        big_batch: Column-oriented batch (column name -> values), as passed
            to this function by `map_batches`.

    Returns:
        Dict of single-element numpy arrays:
        - 'upserted': number of vectors Pinecone reported as upserted.
        - 'errors_payload_size': vectors rejected individually for exceeding
          the size limit.
        - 'errors_other': vectors that failed for any other reason, either
          after all batch retries or during an individual upload.
    """
    client = PineconeGRPC(api_key=PINECONE_API_KEY)
    index = client.Index(index_name)
    total_vectors = 0
    num_failures_payload_size = 0
    num_failures_other = 0
    max_attempts_per_batch = 5
    # Substring of the error message that identifies a payload size rejection.
    payload_size_marker = "which exceeds the limit of"
    data = process_batch(big_batch)

    for batch in chunker(data):
        if not batch:
            # Nothing to send for an empty batch.
            continue

        upsert_successful = False
        upload_vector_individually = False
        recent_exception = None

        for attempt in range(1, max_attempts_per_batch + 1):
            try:
                result = index.upsert(vectors=batch)
                total_vectors += result.upserted_count
                upsert_successful = True
                break
            except Exception as e:
                recent_exception = e
                print(f"===> Exception on attempt {attempt}/{max_attempts_per_batch}: {e}")

                if payload_size_marker in str(e):
                    # Re-sending the same oversized request is not expected to
                    # help, so switch to per-vector uploads right away rather
                    # than sleeping and consuming a retry. This also ensures
                    # the fallback runs when the error shows up on the last
                    # attempt.
                    upload_vector_individually = True
                    print("===> Payload size exceeds limit, try uploading individually next")
                    break

                if attempt < max_attempts_per_batch:
                    # wait at least 5 seconds per retry
                    wait_s = max(5, random.randrange(0, 16 * 2 ** (attempt - 1)))
                    print(f"===> Wait {wait_s} for next retry attempt")
                    time.sleep(wait_s)

        if upload_vector_individually:
            curr_success = 0
            for vector in batch:
                try:
                    result = index.upsert(vectors=[vector])
                    total_vectors += result.upserted_count
                    curr_success += result.upserted_count
                except Exception as e:
                    recent_exception = e
                    if payload_size_marker in str(e):
                        num_failures_payload_size += 1
                        print(f"===> Embedding with id {vector['id']} exceeds size limit: {e}")
                        print(f"===> Embedding text: {vector['metadata']['text']}")
                    else:
                        # Keep unrelated failures out of the payload-size
                        # counter so the final summary stays meaningful.
                        num_failures_other += 1
                    print(f"===> Exception when uploading individual embedding: {e}")
                    print(f"===> Embedding: {vector}")
            # after an individual upload attempt, move on to next batch,
            # even if not all the vectors succeeded.
            print(f"===> Successfully uploaded {curr_success}/{len(batch)} individual embeddings")
        elif not upsert_successful:
            num_failures_other += len(batch)
            print(f"===> Exception after {max_attempts_per_batch} attempts: {recent_exception}")

    return {
        'upserted': np.array([total_vectors]),
        'errors_payload_size': np.array([num_failures_payload_size]),
        'errors_other': np.array([num_failures_other])
    }

In [None]:
# Location (prefix) of the merged embeddings files produced by
# notebook `2_merge_embeddings.ipynb`.
merged_output_prefix = "YOUR-MERGED-EMBEDDINGS-BUCKET-HERE"

# Half-open range [f_start_index, f_end_index) of merged files to upsert.
# The values below, for example, cover all 1413 merged files.
f_start_index, f_end_index = 0, 1413

merged_output_paths = [
    "/".join((merged_output_prefix, f"{file_no}.parquet"))
    for file_no in range(f_start_index, f_end_index)
]

In [None]:
# Main Ray Data code to run upsert.
# Wall-clock start; `duration` below is measured against it.
start_t = datetime.now()

# Read the merged parquet files and apply `upload` to batches of
# `batch_size * 20` rows. `upload` itself re-chunks each batch into
# request-sized pieces before upserting.
embedded_ds_read = ray.data.read_parquet(merged_output_paths)
new_ds = embedded_ds_read.map_batches(
    upload, 
    batch_size=batch_size * 20,
)
# Add up the per-batch counters returned by `upload`.
# NOTE(review): presumably this aggregation is what actually triggers execution
# of the pipeline, so `duration` covers the full upsert — confirm against the
# Ray Data execution model.
summary = new_ds.sum(['upserted', 'errors_other', 'errors_payload_size'])

duration = datetime.now() - start_t
print(f"===> Finished upserting {len(merged_output_paths)} merged files in {duration}")
# Each total is formatted with a thousands separator for readability.
print("===> Summary:", {k: f"{v: ,}" for k,v in summary.items()})

In [None]:
# Check upserted index info. This may not reflect the exact number of vectors upserted
# from above, since there seems to be some delay in registering the vectors.
client = PineconeGRPC(PINECONE_API_KEY)
index = client.Index(index_name)
# Print the index description, then the index statistics.
print(client.describe_index(index_name))
print(index.describe_index_stats())