In [21]:
with open('description.txt', 'r') as file:
    text = file.read()


In [22]:
import json
import re

# First, let's clean and parse the JSON data properly
def clean_json_data(text):
    """Clean the JSON string and parse it properly"""
    # Remove unicode characters like \u00a0
    cleaned_text = re.sub(r'\\u00a0', '', text)
    # Parse the JSO
    data = json.loads(cleaned_text)
    return data

# Clean the data
products = clean_json_data(text)
print(f"Found {len(products)} products")
print(f"First product model: {products[0]['model_no']}")


Found 100 products
First product model: I-7547


In [23]:
def parse_description(desc):
    # Extract short and full descriptions separately
    full_desc_match = re.search(r"Full Description:\s*(.*?)(?:\.|$)", desc)
    short_desc_match = re.search(r"Short Description:\s*(.*?)(?:\.|$)", desc)
    
    # Extract all other key-value style entries
    key_value_pairs = re.findall(r"([\w\s/]+?):\s*(.*?)(?:\.|$)", desc)

    parsed = {
        "full_description": full_desc_match.group(1) if full_desc_match else "",
        # "short_description": short_desc_match.group(1) if short_desc_match else "",
        "specs": {k.strip(): v.strip() for k, v in key_value_pairs 
                  if k not in ["Full Description", "Short Description"]}
    }
    return parsed

# Apply parsing
for item in products:
    parsed = parse_description(item["description"])
    item["full_description"] = parsed["full_description"]
    # item["short_description"] = parsed["short_description"]
    item["specs"] = parsed["specs"]
    del item["description"]
    specs = item.get("specs", {})
    if "Fi" in specs:
        # Move value to correct key
        specs["Wi-Fi"] = specs.pop("Fi")

In [24]:
def clean_spec_value(key, value):
    value = value.strip()
    
    # Normalize "No XYZ support" → "not supported"
    if re.match(r'no\s+[a-zA-Z0-9\-/ ]+\s+support', value, flags=re.IGNORECASE):
        return "not supported"
    
    # Normalize "XYZ not available" → "not available"
    if re.search(r'not available', value, flags=re.IGNORECASE):
        return "not available"

    return value

def specs_to_string(specs):
    parts = []
    for k, v in specs.items():
        cleaned = clean_spec_value(k, v)
        parts.append(f"{k}: {cleaned}")
    return ". ".join(parts)



In [25]:
products

[{'model_no': 'I-7547',
  'full_description': 'Ethernet(TCP/IP) to HART Converter',
  'specs': {'Short Description': 'HART Converter',
   'Serial Port': 'not available',
   'Ethernet': 'Supports TCI and IP ethernet port',
   'USB': 'No USB port available',
   'Channels': 'Supports 8 HART devices',
   'Memory': 'Memory not available',
   'Input Protocol': 'Supports HART input protocol',
   'Output Protocol': 'Supports Ethernet TCP and IP output protocol',
   'Cloud Connectivity': 'Cloud connectivity via external modem',
   'SIM Card': 'No SIM support available',
   'Micro SD': 'No MicroSD support',
   'Wi-Fi': 'No Wi-Fi support'}},
 {'model_no': 'I-7567',
  'full_description': 'USB to HART converter',
  'specs': {'Short Description': 'HART Converter',
   'Serial Port': 'not available',
   'Ethernet': 'not available',
   'USB': 'No USB port available',
   'Channels': 'Supports 8 HART devices',
   'Memory': 'Memory not available',
   'Input Protocol': 'Supports HART input protocol',
   'O

In [26]:
def create_embedding_text(item):
    parts = [
        f"Model No: {item['model_no']}",
        # f"Short Description: {item['short_description']}",
        f"Full Description: {item['full_description']}",
        specs_to_string(item["specs"])  # Cleaned specs as one string
    ]
    return ". ".join(parts)

def normalize_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)              # Collapse multiple spaces
    text = re.sub(r'\s([:;,.])', r'\1', text)      # Remove space before punctuation
    text = text.strip()
    return text

# Add new field for embedding
for item in products:
    item["embedding_text"] = create_embedding_text(item)
    item["embedding_text"] = item["embedding_text"].lower()
    item["embedding_text"] = normalize_text(item["embedding_text"])

products

[{'model_no': 'I-7547',
  'full_description': 'Ethernet(TCP/IP) to HART Converter',
  'specs': {'Short Description': 'HART Converter',
   'Serial Port': 'not available',
   'Ethernet': 'Supports TCI and IP ethernet port',
   'USB': 'No USB port available',
   'Channels': 'Supports 8 HART devices',
   'Memory': 'Memory not available',
   'Input Protocol': 'Supports HART input protocol',
   'Output Protocol': 'Supports Ethernet TCP and IP output protocol',
   'Cloud Connectivity': 'Cloud connectivity via external modem',
   'SIM Card': 'No SIM support available',
   'Micro SD': 'No MicroSD support',
   'Wi-Fi': 'No Wi-Fi support'},
  'embedding_text': 'model no: i-7547. full description: ethernet(tcp/ip) to hart converter. short description: hart converter. serial port: not available. ethernet: supports tci and ip ethernet port. usb: no usb port available. channels: supports 8 hart devices. memory: not available. input protocol: supports hart input protocol. output protocol: supports eth

In [27]:
docs = [item["embedding_text"] for item in products]


In [28]:
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

# Set your keys
os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY_HERE"   # OpenAI key
os.environ["PINECONE_API_KEY"] = "YOUR_PINECONE_API_KEY_HERE"


In [29]:
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

In [31]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="YOUR_PINECONE_API_KEY_HERE")

In [None]:
index_name = "model-prompter"

if not pc.has_index(index_name):
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed={
            "model":"llama-text-embed-v2",
            "field_map":{"text": "chunk_text"}
        }
    )

In [32]:
# index_name = "model-prompter"

# # Create index if not exists
# if index_name not in pc.list_indexes().names():
#     pc.create_index(
#         name=index_name,
#         dimension=1536,  # for text-embedding-3-small
#         metric='euclidean',
#         spec=pc.serverless.ServerlessSpec(
#             cloud='aws',
#             region='us-east-1'
#         )
#     )

# # index = pc.Index(index_name)


# # Delete the old index
# # pc.delete_index("model-prompter")

# # pc.create_index(
# #     name="model-prompter",
# #     dimension=1536,
# #     metric="euclidean",
# #     spec=ServerlessSpec(
# #         cloud="aws",
# #         region="us-east-1"
# #     )
# # )

In [37]:
docs = [item["embedding_text"] for item in products]
print(docs)

['model no: i-7547. full description: ethernet(tcp/ip) to hart converter. short description: hart converter. serial port: not available. ethernet: supports tci and ip ethernet port. usb: no usb port available. channels: supports 8 hart devices. memory: not available. input protocol: supports hart input protocol. output protocol: supports ethernet tcp and ip output protocol. cloud connectivity: cloud connectivity via external modem. sim card: not supported. micro sd: not supported. wi-fi: not supported', 'model no: i-7567. full description: usb to hart converter. short description: hart converter. serial port: not available. ethernet: not available. usb: no usb port available. channels: supports 8 hart devices. memory: not available. input protocol: supports hart input protocol. output protocol: supports usb output protocol. cloud connectivity: cloud connectivity via external modem. sim card: not supported. micro sd: not supported. wi-fi: not supported', 'model no: i-7570. full descript

In [None]:
def extract_model_id(text):
    match = re.search(r"model no:\s*([a-zA-Z0-9\-_.]+)", text, re.IGNORECASE)
    return match.group(1) if match else None

# Build properly formatted documents
formatted_docs = []
for doc in docs:
    model_id = extract_model_id(doc)
    if model_id:
        formatted_docs.append({"id": model_id, "chunk_text": doc})

formatted_docs

[{'id': 'i-7547.',
  'chunk_text': 'model no: i-7547. full description: ethernet(tcp/ip) to hart converter. short description: hart converter. serial port: not available. ethernet: supports tci and ip ethernet port. usb: no usb port available. channels: supports 8 hart devices. memory: not available. input protocol: supports hart input protocol. output protocol: supports ethernet tcp and ip output protocol. cloud connectivity: cloud connectivity via external modem. sim card: not supported. micro sd: not supported. wi-fi: not supported'},
 {'id': 'i-7567.',
  'chunk_text': 'model no: i-7567. full description: usb to hart converter. short description: hart converter. serial port: not available. ethernet: not available. usb: no usb port available. channels: supports 8 hart devices. memory: not available. input protocol: supports hart input protocol. output protocol: supports usb output protocol. cloud connectivity: cloud connectivity via external modem. sim card: not supported. micro sd: 

In [None]:
index = pc.Index("model-prompter")
index.insert(records=formatted_docs)

AttributeError: 'Index' object has no attribute 'insert'

In [None]:
# docs = [item["embedding_text"] for item in products]
# embeddings = get_embeddings(docs)

In [None]:
vectors = []

for i, item in enumerate(products):
    vectors.append({
        "id": item["model_no"],  # unique ID per model
        "values": embeddings[i],  # corresponding embedding
        "metadata": {
           
            "full_description": item["full_description"],
            "embedding_text": item["embedding_text"]
        }
    })

# Upload in batches (for large uploads)
index.upsert(vectors=vectors)


PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Tue, 15 Jul 2025 11:48:56 GMT', 'Content-Type': 'application/json', 'Content-Length': '104', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '3743', 'x-pinecone-request-id': '5524559877245460507', 'x-envoy-upstream-service-time': '51', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Vector dimension 1536 does not match the dimension of the index 2048","details":[]}
