# Import Libraries

In [2]:
import os
from langchain_openai import OpenAIEmbeddings
import hashlib
from pinecone import Pinecone
from datetime import date
from pinecone import ServerlessSpec
import time
from langchain_pinecone import PineconeVectorStore

# API Keys

In [8]:
# Credentials are read from the process environment; each value is None
# when the corresponding variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

# Initialization

In [9]:
# Pinecone client, authenticated with PINECONE_API_KEY
pc = Pinecone(api_key=PINECONE_API_KEY)
# index = pc.Index("echo-openai")

# OpenAI embedding model, authenticated with OPENAI_API_KEY
EMBEDDINGS = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
    model='text-embedding-3-small',
)

# Partitioning

### Index

In [None]:
# Pinecone Initialization
pc = Pinecone(api_key=PINECONE_API_KEY)

# The organization name is used directly as the Pinecone index name.
index_name = input("Give organization name")

# Create the index only when no index with this name exists yet.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # NOTE(review): 1536 must equal the embedding size produced by the
        # model configured in EMBEDDINGS — confirm if the model changes.
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ),
        deletion_protection="disabled"
    )
    # Poll once per second until the new index reports that it is ready.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)
    # Fix: the original message was missing the space after the name.
    print(index_name + " is created successfully.")
else:
    print("Organization already exists.")

# Fix: bind the index handle in BOTH branches. Previously `index` was only
# assigned when the index had just been created, so for an existing
# organization it stayed undefined (or pointed at a previously used index).
index = pc.Index(index_name)

Organization already exists.


### Upsert

In [3]:
# The meeting title is used as the Pinecone namespace for the upsert.
# This cell reads `index_name`, so the Index cell must have been run first.
namespace = input("Give meeting title")

# NOTE(review): `documents` is still a placeholder string. `from_documents`
# expects a list of LangChain Document objects — replace it with the real
# transcript documents before running this cell.
docsearch = PineconeVectorStore.from_documents(
    documents="transcript ni siya",
    index_name=index_name,
    # Fix: pass the OpenAIEmbeddings object, not the string literal
    # "embeddings", which has no embedding methods.
    embedding=EMBEDDINGS,
    namespace=namespace
)

# NOTE(review): presumably waits for the upserted vectors to become
# queryable — confirm that a fixed 5 second pause is sufficient.
time.sleep(5)

NameError: name 'index_name' is not defined

### Query

In [14]:
index = pc.Index("echo-openai")
namespace = "USJ-R"

# Fix: pre-bind `query` so the final print cannot raise NameError when the
# namespace holds no vectors and the loop body never runs.
query = None

# `index.list` yields the vector IDs of the namespace in batches. For each
# batch, the first ID is used as the query vector and its 4 nearest
# neighbours are fetched together with their values and metadata.
for ids in index.list(namespace=namespace):
    query = index.query(
        id=ids[0],
        namespace=namespace,
        top_k=4,
        include_values=True,
        include_metadata=True
    )

# As before, only the result for the last batch is printed.
if query is None:
    print(f"No vectors found in namespace {namespace!r}.")
else:
    print(query)

{'matches': [{'id': '02322b814df2e040435198dacf5f4b0dcf4118fde053c6a5c28ae80e53a249c0',
              'metadata': {'date': '2024-09-13',
                           'text': '[{"Gien":" Hello my name is Gien.  Hello '
                                   'my name is Vandil."},{"a":" This is a test '
                                   'recording."},{"Gien":" This is a  '
                                   'Fantastic."}]',
                           'title': 'Update on development'},
              'score': 1.00033283,
              'values': [-7.58200476e-05,
                         0.00808506086,
                         0.0391380601,
                         -0.0411629416,
                         -0.0185276624,
                         -0.0443449,
                         -0.0222736932,
                         0.0140150702,
                         0.0113393338,
                         -0.0282470919,
                         0.0154903401,
                         -0.0396876708,
       

# Chunking

In [None]:
def chunk_text(text, max_chunk_size=500):
    """Group the lines of *text* into chunks of at most about *max_chunk_size* characters.

    The text is split on newlines; each line is stripped and appended to the
    current chunk. A new chunk is started when adding the next line would
    push the current one past the limit. A single line longer than
    *max_chunk_size* is never split; it becomes its own oversized chunk.

    Args:
        text: The text to split (one sentence/utterance per line).
        max_chunk_size: Soft upper bound on chunk length, in characters.

    Returns:
        A list of non-empty chunk strings; lines inside a chunk are joined
        with newlines. Empty or whitespace-only input yields an empty list.
    """
    chunks = []
    current_chunk = ""

    for line in text.split("\n"):
        line = line.strip()
        # Fix: compare against the length of the line being added. The
        # original used len(sentences), i.e. the number of lines in the whole
        # text, so chunk sizes had nothing to do with the actual content.
        # The "+ 2" margin is kept from the original implementation.
        if current_chunk and len(current_chunk) + len(line) + 2 > max_chunk_size:
            finished = current_chunk.strip()
            # Skip chunks that consist only of blank lines.
            if finished:
                chunks.append(finished)
            current_chunk = ""
        current_chunk += line + "\n"

    # Whatever is left over forms the last chunk (unless it is blank).
    remainder = current_chunk.strip()
    if remainder:
        chunks.append(remainder)

    return chunks

# chunked_text = chunk_text(text=text1)
# print(chunked_text)

# Generate Embeddings

In [None]:
def generate_embeddings(texts):
    """
    Embed every entry of `texts` with the module-level EMBEDDINGS model
    and return the result of `embed_documents`.
    """
    vectors = EMBEDDINGS.embed_documents(texts)
    print("Generating embeddings: Done!")
    return vectors

# chunked_text_embeddings = generate_embeddings(texts=chunked_text)
# print(chunked_text_embeddings)

# Combine Vector and Text

In [None]:
def generate_short_id(content):
    """
    Return a deterministic ID for `content`: the SHA-256 hex digest of its
    UTF-8 encoding. Despite the name, this is the full 64-character digest.
    """
    digest = hashlib.sha256(content.encode("utf-8"))

    print("Generating short id: Done!")
    return digest.hexdigest()

def combine_vector_and_text(texts, meeting_title, date, text_embeddings):
    """
    Pair each text with its embedding and build one record per pair.

    Each record is a dict with an "id" (from generate_short_id of the text),
    the embedding under "values", and a "metadata" dict holding the text,
    the meeting title and the date. Non-string text, title and date values
    are converted with str(). Pairing uses zip, so extra items in the longer
    of `texts` / `text_embeddings` are ignored.
    """
    def _as_text(value):
        # Leave strings untouched; stringify everything else.
        return value if isinstance(value, str) else str(value)

    data_with_metadata = []

    for raw_text, vector in zip(texts, text_embeddings):
        chunk = _as_text(raw_text)
        meeting_title = _as_text(meeting_title)
        date = _as_text(date)

        data_with_metadata.append(
            {
                "id": generate_short_id(chunk),
                "values": vector,
                "metadata": {"text": chunk, "title": meeting_title, "date": date},
            }
        )

    print("Combining vector and text: Done!")
    return data_with_metadata

# data_with_meta_data = combine_vector_and_text(texts=chunked_text, meeting_title=meeting_title, date=date, text_embeddings=chunked_text_embeddings)
# print(data_with_meta_data)

# Upsert to Pinecone

In [None]:
def upsert_data_to_pinecone(data_with_metadata, namespace_name):
    """
    Write the given vector records to the module-level `index`, inside the
    namespace `namespace_name`.
    """
    index.upsert(namespace=namespace_name, vectors=data_with_metadata)
    print("Upserting vectors to Pinecone: Done!")

# upsert_data_to_pinecone(data_with_metadata=data_with_meta_data, namespace_name=organization)
# index.describe_index_stats()

# MAIN

In [None]:
# NOTE(review): this function shadows the `Pinecone` client class imported
# from the `pinecone` package. Once this cell has run, re-running any cell
# that calls `Pinecone(api_key=...)` will call this function instead and
# fail. The name is kept so existing callers keep working; consider renaming.
def Pinecone(texts, meeting_title, namespace='USJ-R'):
    """
    Run the full storage pipeline for one meeting transcript.

    The transcript is chunked, each chunk is embedded, the chunks are
    combined with their embeddings and metadata (meeting title and today's
    date), and the resulting records are upserted to Pinecone.

    Args:
        texts: The transcript text to store.
        meeting_title: Title stored in each record's metadata.
        namespace: Pinecone namespace to upsert into. Defaults to 'USJ-R',
            the value that was previously hard-coded.
    """
    # The storage date is taken at call time.
    today = str(date.today())

    chunked_text = chunk_text(text=texts)
    chunked_text_embeddings = generate_embeddings(texts=chunked_text)
    data_with_meta_data = combine_vector_and_text(
        texts=chunked_text,
        meeting_title=meeting_title,
        date=today,
        text_embeddings=chunked_text_embeddings,
    )
    upsert_data_to_pinecone(data_with_metadata=data_with_meta_data, namespace_name=namespace)

# Store one transcript under the given meeting title.
# NOTE(review): `text1` is not defined in the visible part of this notebook —
# confirm it is assigned before this cell runs.
Pinecone(texts=text1, meeting_title="Kickoff Meeting for Software Development Project")

# Chatbot