In [26]:
import os
import time
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI

# It is highly recommended to set these environment variables before running the script.
# export PINECONE_API_KEY="YOUR_API_KEY"
# export OPENAI_API_KEY="YOUR_API_KEY"

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY environment variable not set.")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY environment variable not set.")

openai_client = OpenAI(api_key=OPENAI_API_KEY)
pinecone_client = Pinecone(api_key=PINECONE_API_KEY)

# Default records (you can pass different records into semantic_search)
records = [
    {
        "_id": "rec1",
        "title": "Exp X",
        "type": "Simple Promotion",
        "start_date": "2025-06-18",
        "end_date": "2025-06-30",
        "items": [
            {
                "promotion_id": "PROMO437",
                "component_id": "COMP437",
                "item_id": "ITEM001",
                "discount_type": "% Off",
                "discount_value": "30"
            },
            {
                "promotion_id": "PROMO437",
                "component_id": "COMP437",
                "item_id": "ITEM021",
                "discount_type": "% Off",
                "discount_value": "30"
            },
            {
                "promotion_id": "PROMO437",
                "component_id": "COMP437",
                "item_id": "ITEM041",
                "discount_type": "% Off",
                "discount_value": "30"
            }
        ]
    }
]

# Pinecone index details
index_name = "semantic-search-demo"
dimension = 1536
metric = "cosine"
cloud = "aws"
region = "us-east-1"



In [27]:
def create_embeddings(texts):
    """
    Creates OpenAI embeddings for a list of texts.
    Returns a list of embedding vectors (one per input text).
    """
    response = openai_client.embeddings.create(
        input=texts,
        model="text-embedding-ada-002"
    )
    return [embedding.embedding for embedding in response.data]



In [28]:
def upsert_records_to_pinecone(index, records_to_upsert):
    """
    Upserts the provided records to the Pinecone index.
    Uses each record's 'title' to create embeddings (simple example).
    """
    print("Creating and upserting records...")
    texts_to_embed = [rec['title'] for rec in records_to_upsert]
    embeddings = create_embeddings(texts_to_embed)

    to_upsert = []
    for i, record in enumerate(records_to_upsert):
        vector_id = record['_id']
        vector_embedding = embeddings[i]
        vector_metadata = {
            "title": record["title"],
            "type": record.get("type"),
            "start_date": record.get("start_date"),
            "end_date": record.get("end_date"),
            "items": record.get("items")
        }
        to_upsert.append((vector_id, vector_embedding, vector_metadata))

    index.upsert(vectors=to_upsert)
    print("Upsert complete.")
    time.sleep(3)  # Give a moment for upsert to be visible



In [29]:
def query_pinecone_index(index, query_text, top_k=5):
    """
    Queries the Pinecone index with the provided query_text and returns structured matches.
    """
    print(f"Querying with text: '{query_text}'")
    query_vector = create_embeddings([query_text])[0]

    search_results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True
    )

    matches = []
    if not getattr(search_results, "matches", None):
        print("No matches found.")
        return matches

    print("\n--- Search Results ---")
    for match in search_results.matches:
        print(f"ID: {match.id}")
        print(f"Score: {match.score}")
        print("Metadata:")
        for key, value in (match.metadata or {}).items():
            print(f"  {key}: {value}")
        print("-" * 20)

        matches.append({
            "id": match.id,
            "score": match.score,
            "metadata": match.metadata
        })
    return matches



In [34]:
def ensure_index_exists(name):
    """
    Create index if it doesn't exist, and wait until it's ready.
    """
    existing = pinecone_client.list_indexes()  # returns a list of names in Pinecone v3+
    if name not in existing:
        print(f"Creating Pinecone index '{name}'...")
        pinecone_client.create_index(
            name=name,
            dimension=dimension,
            metric=metric,
            spec=ServerlessSpec(cloud=cloud, region=region)
        )
        print("Index created successfully.")
        # Wait for the index to be ready
        while not pinecone_client.describe_index(name).status['ready']:
            time.sleep(1)
            print("Waiting for index to be ready...")


In [35]:
def semantic_search(query_text, records_to_index=None, upsert=True, top_k=5, delete_index_after=False):
    """
    Run a semantic search given a query text.
    - query_text: the query string to search with (this is the parameter you asked to pass).
    - records_to_index: optional list of records to upsert (defaults to the module 'records').
    - upsert: if True, upserts records before querying (set False if the index already has data).
    - top_k: number of results to return.
    - delete_index_after: if True, deletes the index at the end (useful for ephemeral runs).
    Returns a list of match dicts: [{'id', 'score', 'metadata'}, ...]
    """
    if records_to_index is None:
        records_to_index = records

    ensure_index_exists(index_name)
    index = pinecone_client.Index(index_name)
    print(f"Connected to index '{index_name}'")

    if upsert and records_to_index:
        upsert_records_to_pinecone(index, records_to_index)

    matches = query_pinecone_index(index, query_text, top_k=top_k)

    if delete_index_after:
        print(f"\nDeleting index '{index_name}'...")
        pinecone_client.delete_index(index_name)
        print("Index deleted.")

    return matches

In [36]:
query = "What is the duration of the promotion?"
results = semantic_search(query)
print("Search Results:", results)

Creating Pinecone index 'semantic-search-demo'...
Index created successfully.
Connected to index 'semantic-search-demo'
Creating and upserting records...


PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Tue, 16 Sep 2025 10:40:58 GMT', 'Content-Type': 'application/json', 'Content-Length': '150', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '507', 'x-pinecone-request-id': '4178752704764965618', 'x-envoy-upstream-service-time': '81', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Metadata value must be a string, number, boolean or list of strings, got '[{\"component_id\"...' for field 'items'","details":[]}
