In [None]:

from langchain.embeddings.openai import OpenAIEmbeddings
# from langchain.vectorstores import Pinecone
# from langchain.llms import OpenAI

from openai import OpenAI
import json
import re


In [3]:
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
with open('description.txt', 'r') as file:
    text = file.read()


# First, let's clean and parse the JSON data properly
def clean_json_data(text):
    """Clean the JSON string and parse it properly"""
    # Remove unicode characters like \u00a0
    cleaned_text = re.sub(r'\\u00a0', '', text)
    # Parse the JSON
    data = json.loads(cleaned_text)
    return data

# Clean the data
products = clean_json_data(text)
print(f"Found {len(products)} products")
print(f"First product model: {products[0]['model_no']}")

def parse_description(desc):
    # Extract short and full descriptions separately
    full_desc_match = re.search(r"Full Description:\s*(.*?)(?:\.|$)", desc)
    short_desc_match = re.search(r"Short Description:\s*(.*?)(?:\.|$)", desc)
    
    # Extract all other key-value style entries
    key_value_pairs = re.findall(r"([\w\s/]+?):\s*(.*?)(?:\.|$)", desc)

    parsed = {
        "full_description": full_desc_match.group(1) if full_desc_match else "",
        # "short_description": short_desc_match.group(1) if short_desc_match else "",
        "specs": {k.strip(): v.strip() for k, v in key_value_pairs 
                  if k not in ["Full Description", "Short Description"]}
    }
    return parsed

# Apply parsing
for item in products:
    parsed = parse_description(item["description"])
    item["full_description"] = parsed["full_description"]
    # item["short_description"] = parsed["short_description"]
    item["specs"] = parsed["specs"]
    del item["description"]
    specs = item.get("specs", {})
    if "Fi" in specs:
        # Move value to correct key
        specs["Wi-Fi"] = specs.pop("Fi")

def clean_spec_value(key, value):
    value = value.strip()
    
    # Normalize "No XYZ support" → "not supported"
    if re.match(r'no\s+[a-zA-Z0-9\-/ ]+\s+support', value, flags=re.IGNORECASE):
        return "not supported"
    
    # Normalize "XYZ not available" → "not available"
    if re.search(r'not available', value, flags=re.IGNORECASE):
        return "not available"

    return value

def specs_to_string(specs):
    parts = []
    for k, v in specs.items():
        cleaned = clean_spec_value(k, v)
        parts.append(f"{k}: {cleaned}")
    return ". ".join(parts)

def create_embedding_text(item):
    parts = [
        f"Model No: {item['model_no']}",
        # f"Short Description: {item['short_description']}",
        f"Full Description: {item['full_description']}",
        specs_to_string(item["specs"])  # Cleaned specs as one string
    ]
    return ". ".join(parts)

def normalize_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)              # Collapse multiple spaces
    text = re.sub(r'\s([:;,.])', r'\1', text)      # Remove space before punctuation
    text = text.strip()
    return text

# Add new field for embedding
for item in products:
    item["embedding_text"] = create_embedding_text(item)
    item["embedding_text"] = item["embedding_text"].lower()
    item["embedding_text"] = normalize_text(item["embedding_text"])


docs = [item["embedding_text"] for item in products]

import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec

# Set your keys
os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY_HERE"   # OpenAI key
os.environ["PINECONE_API_KEY"] = "YOUR_PINECONE_API_KEY_HERE"



Found 100 products
First product model: I-7547


In [6]:
embeddings=OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])
embeddings

  embeddings=OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])


OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x1192846a0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x11a6ecbb0>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='YOUR_OPENAI_API_KEY_HERE', openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None)

In [None]:

# 1. Initialize clients
client = OpenAI(api_key="YOUR_OPENAI_API_KEY_HERE")

pc = Pinecone(api_key="YOUR_PINECONE_API_KEY_HERE", environment="us-east-1")


In [25]:
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
import re


index_name = "model-prompter"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name= index_name,
        dimension=3072,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

index = pc.Index("model-prompter")

def extract_model_id(text):
    match = re.search(r"model no:\s*([a-zA-Z0-9\-_.]+)", text, re.IGNORECASE)
    return match.group(1) if match else None

# Embed and upsert
records = []
for doc in docs:
    model_id = extract_model_id(doc)
    if model_id:
        response = client.embeddings.create(
            model="text-embedding-3-large",
            input=[doc]
        )
        embedding = response.data[0].embedding
        records.append({
            "id": model_id,
            "values": embedding,
            "metadata": {"text": doc}
        })

index.upsert(vectors=records)

{'upserted_count': 100}

In [23]:
query_text = "USB to RS232 converter"

# Embed the query
query_embedding = client.embeddings.create(
    model="text-embedding-3-large",
    input=[query_text]
).data[0].embedding

# Query Pinecone
results = index.query(
    vector=query_embedding,
    top_k=5,
    include_metadata=True
)

# Display results
for match in results['matches']:
    print(f"✅ ID: {match['id']}\n🔹 Score: {match['score']:.4f}\n📝 Text: {match['metadata']['text'][:150]}\n---")


✅ ID: usb232.
🔹 Score: 0.6565
📝 Text: model no: usb232. full description: rs232 to usb onverter. short description: rs232 converter. serial port: supports rs232 serial port. ethernet: not 
---
✅ ID: usb
🔹 Score: 0.6168
📝 Text: model no: usb 2514. full description: usb to 4-port rs-232 converter. short description: usb to rs232 converter. serial port: supports rs232 serial po
---
✅ ID: usb485.
🔹 Score: 0.5920
📝 Text: model no: usb485. full description: rs485 to usb onverter. short description: rs485 converter. serial port: supports rs485 serial port. ethernet: not 
---
✅ ID: i-7561u.
🔹 Score: 0.5899
📝 Text: model no: i-7561u. full description: high-speed usb to isolated rs-232/422/485 converter with ca-usb18 cable. short description: usb to serial convert
---
✅ ID: i-7520a.
🔹 Score: 0.5217
📝 Text: model no: i-7520a. full description: isolated rs-232 to rs-422/485 converter. short description: serial converter. serial port: supports rs232, 422, a
---


Name: pinecone
Version: 7.3.0
Summary: Pinecone client and SDK
Home-page: https://www.pinecone.io
Author: Pinecone Systems, Inc.
Author-email: support@pinecone.io
License: Apache-2.0
Location: /Users/akshayeiyer/Documents/My Projects/Langchain/VectorDB Pinecone project/venv/lib/python3.10/site-packages
Requires: certifi, pinecone-plugin-assistant, pinecone-plugin-interface, python-dateutil, typing-extensions, urllib3
Required-by: 


In [9]:
import serpapi

results = serpapi.search({
    "q": "analog and digital i/o to modbus rtu",
    "location": "Austin, Texas, United States", 
    "hl": "en",
    "gl": "us",
    "engine": "google",
    "api_key": "0405f5209eae5d423ed1543842c123c70f1d3ee6341c6a557e644f500e7d3eac"
})

print(results)

{[37m[39;49;00m
[37m    [39;49;00m[94m"search_metadata"[39;49;00m:[37m [39;49;00m{[37m[39;49;00m
[37m        [39;49;00m[94m"id"[39;49;00m:[37m [39;49;00m[33m"6879ed6c018c48d204358bb1"[39;49;00m,[37m[39;49;00m
[37m        [39;49;00m[94m"status"[39;49;00m:[37m [39;49;00m[33m"Success"[39;49;00m,[37m[39;49;00m
[37m        [39;49;00m[94m"json_endpoint"[39;49;00m:[37m [39;49;00m[33m"https://serpapi.com/searches/0056cf78c405db8f/6879ed6c018c48d204358bb1.json"[39;49;00m,[37m[39;49;00m
[37m        [39;49;00m[94m"pixel_position_endpoint"[39;49;00m:[37m [39;49;00m[33m"https://serpapi.com/searches/0056cf78c405db8f/6879ed6c018c48d204358bb1.json_with_pixel_position"[39;49;00m,[37m[39;49;00m
[37m        [39;49;00m[94m"created_at"[39;49;00m:[37m [39;49;00m[33m"2025-07-18 06:45:00 UTC"[39;49;00m,[37m[39;49;00m
[37m        [39;49;00m[94m"processed_at"[39;49;00m:[37m [39;49;00m[33m"2025-07-18 06:45:26 UTC"[39;49;00m,[37m[39;49;00m
[37

In [2]:
extract_model_info("https://www.plc-io.com/it/home/moduli-i-o/products.1.5.2.sp.uw?l=2&gad_source=1&gad_campaignid=714218552&gbraid=0AAAAADql6FQ8HZRQKO3j8iSICJ5NqYvu4&gclid=CjwKCAjwvuLDBhAOEiwAPtF0VmCqa9lidhuz5PK8F1NEczfUSfVk5IBb-tRI-Ll3aE4qlByG2N0meBoCr8UQAvD_BwE")

[]