In [1]:
import openai
import langchain
import pinecone
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.llms import OpenAI
import os

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
with open('description.txt', 'r') as file:
    text = file.read()


In [4]:
text

'[\n  {\n    "model_no": "I-7547\\u00a0",\n    "description": "Full Description: Ethernet(TCP/IP) to HART Converter. Short Description: HART Converter. Serial Port: not available. Ethernet: Supports TCI and IP ethernet port. USB: No USB port available. Channels: Supports 8 HART devices. No Analog Input ports. No Analog Output ports. No Digital Input ports. No Digital Output ports. Memory: Memory not available. Input Protocol: Supports HART input protocol. Output Protocol: Supports Ethernet TCP and IP output protocol. Cloud Connectivity: Cloud connectivity via external modem. SIM Card: No SIM support available. Wi-Fi: No Wi-Fi support. Micro SD: No MicroSD support."\n  },\n  {\n    "model_no": "I-7567\\u00a0",\n    "description": "Full Description: USB to HART converter. Short Description: HART Converter. Serial Port: not available. Ethernet: not available. USB: No USB port available. Channels: Supports 8 HART devices. No Analog Input ports. No Analog Output ports. No Digital Input port

In [5]:
import json
import re

# First, let's clean and parse the JSON data properly
def clean_json_data(text):
    """Clean the JSON string and parse it properly"""
    # Remove unicode characters like \u00a0
    cleaned_text = re.sub(r'\\u00a0', '', text)
    # Parse the JSON
    data = json.loads(cleaned_text)
    return data

# Clean the data
products = clean_json_data(text)
print(f"Found {len(products)} products")
print(f"First product model: {products[0]['model_no']}")


Found 100 products
First product model: I-7547


In [6]:
products

[{'model_no': 'I-7547',
  'description': 'Full Description: Ethernet(TCP/IP) to HART Converter. Short Description: HART Converter. Serial Port: not available. Ethernet: Supports TCI and IP ethernet port. USB: No USB port available. Channels: Supports 8 HART devices. No Analog Input ports. No Analog Output ports. No Digital Input ports. No Digital Output ports. Memory: Memory not available. Input Protocol: Supports HART input protocol. Output Protocol: Supports Ethernet TCP and IP output protocol. Cloud Connectivity: Cloud connectivity via external modem. SIM Card: No SIM support available. Wi-Fi: No Wi-Fi support. Micro SD: No MicroSD support.'},
 {'model_no': 'I-7567',
  'description': 'Full Description: USB to HART converter. Short Description: HART Converter. Serial Port: not available. Ethernet: not available. USB: No USB port available. Channels: Supports 8 HART devices. No Analog Input ports. No Analog Output ports. No Digital Input ports. No Digital Output ports. Memory: Memory 

In [7]:
def parse_description(desc):
    # Extract short and full descriptions separately
    full_desc_match = re.search(r"Full Description:\s*(.*?)(?:\.|$)", desc)
    short_desc_match = re.search(r"Short Description:\s*(.*?)(?:\.|$)", desc)
    
    # Extract all other key-value style entries
    key_value_pairs = re.findall(r"([\w\s/]+?):\s*(.*?)(?:\.|$)", desc)

    parsed = {
        "full_description": full_desc_match.group(1) if full_desc_match else "",
        # "short_description": short_desc_match.group(1) if short_desc_match else "",
        "specs": {k.strip(): v.strip() for k, v in key_value_pairs 
                  if k not in ["Full Description", "Short Description"]}
    }
    return parsed

# Apply parsing
for item in products:
    parsed = parse_description(item["description"])
    item["full_description"] = parsed["full_description"]
    # item["short_description"] = parsed["short_description"]
    item["specs"] = parsed["specs"]
    del item["description"]
    specs = item.get("specs", {})
    if "Fi" in specs:
        # Move value to correct key
        specs["Wi-Fi"] = specs.pop("Fi")

In [8]:
def clean_spec_value(key, value):
    value = value.strip()
    
    # Normalize "No XYZ support" → "not supported"
    if re.match(r'no\s+[a-zA-Z0-9\-/ ]+\s+support', value, flags=re.IGNORECASE):
        return "not supported"
    
    # Normalize "XYZ not available" → "not available"
    if re.search(r'not available', value, flags=re.IGNORECASE):
        return "not available"

    return value

def specs_to_string(specs):
    parts = []
    for k, v in specs.items():
        cleaned = clean_spec_value(k, v)
        parts.append(f"{k}: {cleaned}")
    return ". ".join(parts)



In [9]:
products

[{'model_no': 'I-7547',
  'full_description': 'Ethernet(TCP/IP) to HART Converter',
  'specs': {'Short Description': 'HART Converter',
   'Serial Port': 'not available',
   'Ethernet': 'Supports TCI and IP ethernet port',
   'USB': 'No USB port available',
   'Channels': 'Supports 8 HART devices',
   'Memory': 'Memory not available',
   'Input Protocol': 'Supports HART input protocol',
   'Output Protocol': 'Supports Ethernet TCP and IP output protocol',
   'Cloud Connectivity': 'Cloud connectivity via external modem',
   'SIM Card': 'No SIM support available',
   'Micro SD': 'No MicroSD support',
   'Wi-Fi': 'No Wi-Fi support'}},
 {'model_no': 'I-7567',
  'full_description': 'USB to HART converter',
  'specs': {'Short Description': 'HART Converter',
   'Serial Port': 'not available',
   'Ethernet': 'not available',
   'USB': 'No USB port available',
   'Channels': 'Supports 8 HART devices',
   'Memory': 'Memory not available',
   'Input Protocol': 'Supports HART input protocol',
   'O

In [12]:
products_trial = products
def create_embedding_text(item):
    parts = [
        f"Model No: {item['model_no']}",
        # f"Short Description: {item['short_description']}",
        f"Full Description: {item['full_description']}",
        specs_to_string(item["specs"])  # Cleaned specs as one string
    ]
    return ". ".join(parts)

def normalize_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)              # Collapse multiple spaces
    text = re.sub(r'\s([:;,.])', r'\1', text)      # Remove space before punctuation
    text = text.strip()
    return text

# Add new field for embedding
for item in products:
    item["embedding_text"] = create_embedding_text(item)
    item["embedding_text"] = item["embedding_text"].lower()
    item["embedding_text"] = normalize_text(item["embedding_text"])

products

[{'model_no': 'I-7547',
  'full_description': 'Ethernet(TCP/IP) to HART Converter',
  'specs': {'Short Description': 'HART Converter',
   'Serial Port': 'not available',
   'Ethernet': 'Supports TCI and IP ethernet port',
   'USB': 'No USB port available',
   'Channels': 'Supports 8 HART devices',
   'Memory': 'Memory not available',
   'Input Protocol': 'Supports HART input protocol',
   'Output Protocol': 'Supports Ethernet TCP and IP output protocol',
   'Cloud Connectivity': 'Cloud connectivity via external modem',
   'SIM Card': 'No SIM support available',
   'Micro SD': 'No MicroSD support',
   'Wi-Fi': 'No Wi-Fi support'},
  'embedding_text': 'model no: i-7547. full description: ethernet(tcp/ip) to hart converter. short description: hart converter. serial port: not available. ethernet: supports tci and ip ethernet port. usb: no usb port available. channels: supports 8 hart devices. memory: not available. input protocol: supports hart input protocol. output protocol: supports eth

In [27]:
# def chunk_text(text, chunk_size=800, chunk_overlap=50):
#     text_splitter = RecursiveCharacterTextSplitter(
#         chunk_size = chunk_size,
#         chunk_overlap = chunk_overlap)
    
#     doc = text_splitter.split_documents (text)
#     return doc
docs = []
for items in products:
    docs.append(items['embedding_text'])

print(docs)



['model no: i-7547. full description: ethernet(tcp/ip) to hart converter. short description: hart converter. serial port: not available. ethernet: supports tci and ip ethernet port. usb: no usb port available. channels: supports 8 hart devices. memory: not available. input protocol: supports hart input protocol. output protocol: supports ethernet tcp and ip output protocol. cloud connectivity: cloud connectivity via external modem. sim card: not supported. micro sd: not supported. wi-fi: not supported', 'model no: i-7567. full description: usb to hart converter. short description: hart converter. serial port: not available. ethernet: not available. usb: no usb port available. channels: supports 8 hart devices. memory: not available. input protocol: supports hart input protocol. output protocol: supports usb output protocol. cloud connectivity: cloud connectivity via external modem. sim card: not supported. micro sd: not supported. wi-fi: not supported', 'model no: i-7570. full descript

In [15]:
embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x1257d3230>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x1257d3b60>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [16]:
vectors = embeddings.embed_query("What is the model number of products with memory not available and supports 8 HART devices?")

In [18]:
len(vectors)

1536

pinecone_key = "YOUR_PINECONE_API_KEY_HERE"

## Pinecone Vector database

Pinecone provides a long term embedding memory for high performing AI applications
- Good for scaling


In [25]:
# #Vector DB Search in Pinecone
# pinecone.init(api_key="YOUR_PINECONE_API_KEY_HERE",
#               environmemnt="llama-text-embed-v2",
#               )
# index_name = "model-prompter"

In [24]:
from pinecone import Pinecone, ServerlessSpec
pc = Pinecone(
        api_key= "YOUR_PINECONE_API_KEY_HERE"
              )

if 'model-prompter' not in pc.list_indexes().names():
        pc.create_index(
            name='model-prompter',
            dimension=1536,
            metric='euclidean',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-east-1'
            )
        )

In [28]:
index = Pinecone.from_documents(docs, embeddings, index_name="model-prompter")

AttributeError: from_documents is not a top-level attribute of the Pinecone class provided by pinecone's official python package developed at https://github.com/pinecone-io/pinecone-python-client. You may have a name collision with an export from another dependency in your project that wraps Pinecone functionality and exports a similarly named class. Please refer to the following knowledge base article for more information: https://docs.pinecone.io/troubleshooting/pinecone-attribute-errors-with-langchain


In [29]:
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

  client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])


In [30]:
def get_embeddings(texts, batch_size=100):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        response = client.embeddings.create(
            input=batch,
            model="text-embedding-3-small"
        )
        batch_embeddings = [r.embedding for r in response.data]
        all_embeddings.extend(batch_embeddings)
    return all_embeddings

In [31]:
docs = [item["embedding_text"] for item in products]
embeddings = get_embeddings(docs)

AttributeError: 'OpenAI' object has no attribute 'embeddings'