In [None]:
from pinecone import Pinecone,ServerlessSpec
import os
from dotenv import load_dotenv
import time
load_dotenv()

In [None]:
# Pull both service credentials out of the process environment.
# Either value is None when the corresponding variable is not set.
pinecone_api = os.environ.get("PINECONE_API_KEY")
gemini_api = os.environ.get("GEMINI_API_KEY")

In [None]:
# Connect to Pinecone and make sure the scratch index exists before use.
pc = Pinecone(api_key=pinecone_api)
index_name = "new"

if pc.has_index(index_name):
    print("Index aldready esists")
else:
    # Index with a hosted embedding model: the record field "chunk_text"
    # is mapped to the model's "text" input.
    embed_config = {
        "model": "llama-text-embed-v2",
        "field_map": {"text": "chunk_text"},
    }
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed=embed_config,
    )
    print("Created new index")

# Handle used to talk to the index.
index = pc.Index(index_name)

In [None]:
# Echo the index handle as the cell output.
index

In [None]:
# One sample record. "chunk_text" is the field the index's embed config maps
# to the embedding model's text input.
records = [
    {
        "id": "1",
        "chunk_text": "A paper which summarized most peft techniques. Lora Qlora techniqies are one of the best peft techniques.",
    },
]

In [None]:
# Another handle for the same index name; the cells below use this one.
dense_index = pc.Index(index_name)

In [None]:
# Upsert the sample records into the "example-namespace" namespace.
dense_index.upsert_records("example-namespace", records)

In [None]:
# Fetch the index statistics and show them as the cell output.
stats = dense_index.describe_index_stats()
stats

In [None]:
# Free-text query passed to the search cell below.
query = "best peft methods"

In [None]:
# Run a text search against the dense index and have the reranker reorder
# the candidates by their "chunk_text" field.
search_query = {"top_k": 10, "inputs": {"text": query}}
rerank_options = {
    "model": "bge-reranker-v2-m3",
    "top_n": 10,
    "rank_fields": ["chunk_text"],
}
reranked_results = dense_index.search(
    namespace="example-namespace",
    query=search_query,
    rerank=rerank_options,
)

# One line per hit: id, score rounded to two decimals, stored text.
hits = reranked_results['result']['hits']
for hit in hits:
    print(f"id: {hit['_id']}, score: {round(hit['_score'], 2)}, text: {hit['fields']['chunk_text']}")

In [None]:
# Inspect the raw list of hits from the search response.
reranked_results['result']['hits']

In [None]:
# Collect every hit as its own dict. A single shared dict would be
# overwritten on each iteration and end up holding only the last hit.
output = []
for hit in reranked_results['result']['hits']:
  output.append({
    "id": hit["_id"],
    "score": round(hit['_score'], 2),  # two decimals is enough for display
    "text": hit['fields']['chunk_text'],
  })
  print(f"id: {hit['_id']}, score: {round(hit['_score'], 2)}, text: {hit['fields']['chunk_text']}")

In [None]:
# Display `output` as built by the loop in the previous cell.
output

In [None]:
import json
# Serialize `output` to a JSON string.
ans = json.dumps(output)

In [None]:
# Tear down the scratch index. Use `index_name` rather than repeating the
# literal so this always deletes the same index the cells above created.
pc.delete_index(index_name)

In [None]:
# from pinecone import Pinecone, ServerlessSpec
# import json
# from typing import Literal

# class VectorDatabase:
#   def __init__(self, pinecone_api_key: str, index_name: str):
#     self.pc = Pinecone(api_key=pinecone_api_key)
#     self.index_name = index_name
#     self.index = self.setup()
    
#   def setup(self):
#     if not self.pc.has_index(self.index_name):
#       self.pc.create_index_for_model(
#           name=index_name,
#           cloud="aws",
#           region="us-east-1",
#           embed={
#               "model":"llama-text-embed-v2",
#               "field_map":{"text": "chunk_text"}
#           }
#       )
#     return self.pc.Index(self.index_name)
  
#   def insert(self, id: str, summary: str, title: str, namespace: Literal["paper","dataset","algo"]):
#     records = [{
#       "id": id,
#       "chunk_text": summary,
#       "metadata": {"title": title}
#     }]
#     self.index.upsert_records(namespace, records)
    
#   def retreive(self , k:int , query: str, namespace: Literal["paper","dataset","algo"]):
#     reranked_results = self.index.search(
#       namespace=namespace,
#       query={
#         "top_k": k,
#         "inputs": {"text": query}
#       },
#       rerank={
#         "model": "bge-reranker-v2-m3",
#         "top_n": k,
#         "rank_fields": ["chunk_text"]
#       }   
#     )
#     output = []
#     for hit in reranked_results['result']['hits']:
#       output.append({
#         "id": hit["_id"],
#         "score": round(hit['_score'], 2),
#         "text": hit['fields']['chunk_text']
#       })
#     return json.dumps(output, indent=2)


In [1]:
from pinecone import Pinecone, ServerlessSpec
from typing import Literal
import json
import time
class VectorDatabase:
  """
  A class to interact with a Pinecone vector database, specifically designed
  for serverless indexes with automatic embedding generation.

  The index is created on demand with the ``llama-text-embed-v2`` embedding
  model, with the record field ``chunk_text`` mapped to the model's text
  input. Searches are reranked with ``bge-reranker-v2-m3``.
  """
  def __init__(self, pinecone_api_key: str, index_name: str, ready_wait_seconds: float = 10):
    """
    Initializes the Pinecone client and sets up the index.

    Args:
      pinecone_api_key: Your API key for Pinecone.
      index_name: The name of the index to use or create.
      ready_wait_seconds: Seconds to pause after a *new* index has been
        created, before the handle is returned. Values <= 0 disable the
        pause. No pause is applied when the index already existed.
    """
    self.pc = Pinecone(api_key=pinecone_api_key)
    self.index_name = index_name
    # Must be set before _setup() runs, which reads it.
    self.ready_wait_seconds = ready_wait_seconds
    self.index = self._setup()

  def _setup(self):
    """
    Creates the index if it does not exist yet and returns a handle to it.

    Returns:
      The object returned by ``self.pc.Index(self.index_name)``.
    """
    created = not self.pc.has_index(self.index_name)
    if created:
      self.pc.create_index_for_model(
          name=self.index_name,
          cloud="aws",
          region="us-east-1",
          embed={
              "model":"llama-text-embed-v2",
              "field_map":{"text": "chunk_text"}
          }
      )
      print("Index is created sucessfully")
    else:
      print("Index was aldready present")
    idx = self.pc.Index(self.index_name)
    # Only a freshly created index gets the settling pause; sleeping when the
    # index was already present just delays every construction for nothing.
    if created and self.ready_wait_seconds > 0:
      time.sleep(self.ready_wait_seconds)
    return idx

  def insert(self, id: str, summary: str, title: str, namespace: Literal["paper", "dataset", "algo"]):
    """
    Inserts a record into the index. The text is embedded automatically by Pinecone.

    Args:
      id: The unique identifier for the record.
      summary: The text content to be embedded and stored (sent as ``chunk_text``).
      title: The title of the document, stored as an extra ``title`` field.
      namespace: The namespace to insert the record into.
    """
    records = [{
      "id": id,
      "chunk_text": summary,
      "title": title
    }]
    # Upsert directly: no index-stats round trip is needed before writing.
    self.index.upsert_records(namespace,records)
    print(f"Successfully inserted record with id: {id} into namespace: {namespace}")

  def retrieve(self , k:int , query: str, namespace: Literal["paper","dataset","algo"]):
    """
    Searches one namespace with a text query and reranks the results.

    Args:
      k: Number of candidates to fetch (``top_k``) and to keep after
        reranking (``top_n``).
      query: The free-text search query.
      namespace: The namespace to search in.

    Returns:
      A JSON string (indented by 2) encoding a list of objects with the keys
      ``id``, ``score`` (rounded to two decimals) and ``text``, in the order
      the hits are returned by the search call.
    """
    reranked_results = self.index.search(
      namespace=namespace,
      query={
        "top_k": k,
        "inputs": {"text": query}
      },
      rerank={
        "model": "bge-reranker-v2-m3",
        "top_n": k,
        "rank_fields": ["chunk_text"]
      }
    )
    output = [
      {
        "id": hit["_id"],
        "score": round(hit['_score'], 2),
        "text": hit['fields']['chunk_text']
      }
      for hit in reranked_results['result']['hits']
    ]
    return json.dumps(output, indent=2)

  def clear_all(self):
    """
    Deletes the entire index from your Pinecone project.

    ``self.index`` is not reset by this call, so the handle still refers to
    the index that was just deleted.
    """
    print(f"Deleting index '{self.index_name}' entirely...")
    self.pc.delete_index(self.index_name)
    print("Index deleted successfully.")

In [2]:
import os
from dotenv import load_dotenv
# Load environment variables via python-dotenv, then read the Pinecone key.
load_dotenv()
pinecone_api_key = os.environ.get("PINECONE_API_KEY")

# Build the wrapper around the "anu" index (created if it does not exist yet).
db = VectorDatabase(
    index_name="anu",
    pinecone_api_key=pinecone_api_key,
)


Index is created sucessfully


In [3]:
# db.index.describe_index_stats()

In [4]:
# Store one paper summary in the "paper" namespace.
summary_text = "This paper introduces a novel algorithm for time-series forecasting using attention mechanisms."
db.insert(
    namespace="paper",
    id="paper_001",
    title="Attention-based Forecasting",
    summary=summary_text,
)


Successfully inserted record with id: paper_001 into namespace: paper


In [5]:
# db.index.describe_index_stats()

In [6]:
# Store a dataset description as a second record, in the "dataset" namespace.
summary_text2 = "This dataset contains climate data including rainfall, temperature, and humidity over 30 years."
db.insert(
    namespace="dataset",
    id="dataset_001",
    title="Climate Dataset",
    summary=summary_text2,
)

Successfully inserted record with id: dataset_001 into namespace: dataset


In [7]:
# db.index.describe_index_stats()

In [8]:
# Ask for the single best match for the query within the "paper" namespace.
query = "time series forecasting with attention"
results = db.retrieve(namespace="paper", query=query, k=1)

# `results` is a JSON string, so it prints as indented text.
print("Search Results:", results)


Search Results: [
  {
    "id": "paper_001",
    "score": 0.99,
    "text": "This paper introduces a novel algorithm for time-series forecasting using attention mechanisms."
  }
]


In [9]:
# Tear down: deletes the index this `db` was constructed with ("anu").
db.clear_all()

Deleting index 'anu' entirely...
Index deleted successfully.
