# Import Libraries

In [27]:
import os
from langchain_openai import OpenAIEmbeddings
import hashlib
from pinecone import Pinecone
from datetime import date

# API Keys

In [28]:
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

# Initialization

In [29]:
organization_name = input("Input organization name")
meeting_title = input("Input meeting title")

# Pinecone Initialization
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(organization_name)

# OpenAI Initialization
EMBEDDINGS = OpenAIEmbeddings(model='text-embedding-3-small', openai_api_key=OPENAI_API_KEY)

### Sample Transcript

In [30]:
sample_transcript = """Czech: Hello my name is Czech.     
Gian: Hello my name is Gian.
Shaundyl: Hello my name is Shaundyl.
Czech (Team Lead): Alright, everyone, thanks for joining today’s meeting. We have about 10 minutes to go over the final details before the product launch. Let's start with the progress update. Bob, how are we doing on the development front?
Shaundyl (Developer): Things are looking good. We’ve implemented all the major features. I’m currently working on the final round of bug fixes. I should be done with it by tomorrow, but I need the QA team to give it another pass afterward.
Czech: Great to hear that. Gian, how are we looking on the project timeline? Any changes or concerns?
Gian (Project Manager): We’re on track, but barely. The marketing materials were delayed by two days, but I managed to align the social media schedule to compensate for the delay. As long as the development and testing stay on track, we should meet the launch date. We might want to allocate some buffer time for any last-minute issues though.
Czech: Makes sense. Shaundyl, do you think we’ll have time for a final round of testing before we push live?
Shaundyl: We should. I’ll aim to finish bug fixes by midday tomorrow. If QA can start immediately after, we’ll have 24 hours for testing before the go-live. I’ll stay available for any hotfixes, just in case.
Czech: Perfect. Gian, can you make sure the QA team is on standby for tomorrow afternoon?
Gian: Absolutely. I’ll notify them as soon as we’re done here. I’ll also double-check the launch checklist to make sure nothing’s been missed.
Czech: Great. And one last thing – how are we handling customer support on launch day? Any special preparations?
Gian: We’ve set up a dedicated support channel for the product and briefed the customer support team on the common issues we’re anticipating. We’ll also monitor social media for any unexpected feedback.
Czech: Sounds like we’re in good shape. Thanks, everyone. Let’s aim to regroup tomorrow for a final status check. Anything else before we wrap up?
Shaundyl: Nothing from my side. I’ll update you if any blockers come up.
Gian: I’m all set. Let’s get this done!
Czech: Alright then, thanks again! Talk tomorrow.
"""

# Chunking

### Original Chunking

In [15]:
def chunk_text(text, max_chunk_size=500):
    # Ensure each text ends with a newline to correctly split sentences
    if not text.endswith("\n"):
        text += "\n"

    # Split text into sentence
    sentences = text.split("\n")
    chunks = []
    current_chunk = ""

    # Iterate over sentence and assemble chunks
    for sentence in sentences:
        # Check if adding the current sentence exceeds the maximum chunk size
        if (len(current_chunk) + len(sentences) + 2 > max_chunk_size and current_chunk):
            # Add the current chunk to the list and start a new chunk
            chunks.append(current_chunk.strip())
            current_chunk = ""
        # Add the current sentence to the current chunk
        current_chunk += sentence.strip() + "\n"
    # Add any remaining text as the last chunk
    if (current_chunk):
        chunks.append(current_chunk.strip())

    return chunks # type: list[str]

chunked_text = chunk_text(text=sample_transcript)
print("\n".join(f"Chunk {i+1}:\n{chunk}" for i, chunk in enumerate(chunked_text)))

Chunk 1:
Czech: Hello my name is Czech.
Gian: Hello my name is Gian.
Shaundyl: Hello my name is Shaundyl.
Czech (Team Lead): Alright, everyone, thanks for joining today’s meeting. We have about 10 minutes to go over the final details before the product launch. Let's start with the progress update. Bob, how are we doing on the development front?
Shaundyl (Developer): Things are looking good. We’ve implemented all the major features. I’m currently working on the final round of bug fixes. I should be done with it by tomorrow, but I need the QA team to give it another pass afterward.
Chunk 2:
Czech: Great to hear that. Gian, how are we looking on the project timeline? Any changes or concerns?
Gian (Project Manager): We’re on track, but barely. The marketing materials were delayed by two days, but I managed to align the social media schedule to compensate for the delay. As long as the development and testing stay on track, we should meet the launch date. We might want to allocate some buffe

### Recursive Chunking

In [42]:
def chunk_text_recursive(text, max_chunk_size=500):
    # Helper function for recursive chunking
    def recursive_chunk(sentences, current_chunk=""):
        # Base case: if no sentences are left, return the current chunk
        if not sentences:
            return [current_chunk.strip()] if current_chunk.strip() else []

        # Extract the next sentence
        sentence = sentences[0]
        remaining_sentences = sentences[1:]

        # Check if adding the current sentence exceeds the max_chunk_size
        if len(current_chunk) + len(sentence) + 1 > max_chunk_size:
            # Return the current chunk and continue with the next sentences
            return [current_chunk.strip()] + recursive_chunk(remaining_sentences, "")
        else:
            # Add the current sentence and continue recursively
            return recursive_chunk(remaining_sentences, current_chunk + sentence.strip() + "\n")

    # Ensure each text ends with a newline for sentence splitting
    if not text.endswith("\n"):
        text += "\n"

    # Split text into sentences by newline and filter out empty sentences
    sentences = [sentence for sentence in text.split("\n") if sentence.strip()]

    # Start recursive chunking
    return recursive_chunk(sentences)

chunked_text = chunk_text_recursive(text=sample_transcript)
print("\n".join(f"Chunk {i+1}:\n{chunk}" for i, chunk in enumerate(chunked_text)))

Chunk 1:
Czech: Hello my name is Czech.
Gian: Hello my name is Gian.
Shaundyl: Hello my name is Shaundyl.
Czech (Team Lead): Alright, everyone, thanks for joining today’s meeting. We have about 10 minutes to go over the final details before the product launch. Let's start with the progress update. Bob, how are we doing on the development front?
Chunk 2:
Czech: Great to hear that. Gian, how are we looking on the project timeline? Any changes or concerns?
Gian (Project Manager): We’re on track, but barely. The marketing materials were delayed by two days, but I managed to align the social media schedule to compensate for the delay. As long as the development and testing stay on track, we should meet the launch date. We might want to allocate some buffer time for any last-minute issues though.
Chunk 3:
Shaundyl: We should. I’ll aim to finish bug fixes by midday tomorrow. If QA can start immediately after, we’ll have 24 hours for testing before the go-live. I’ll stay available for any hotf

# Generate Embeddings

In [43]:
def generate_embeddings(texts):
    """
    Generate embeddings for a list of text.
    """
    embedded = EMBEDDINGS.embed_documents(texts)

    print("Generating embeddings: Done!")
    return embedded

chunked_text_embeddings = generate_embeddings(texts=chunked_text)
print(chunked_text_embeddings)

Generating embeddings: Done!
[[0.014478643424808979, 0.018023155629634857, 0.016808252781629562, -0.014416019432246685, -0.04786970093846321, 0.0037981390487402678, -0.007627589628100395, 0.04889673367142677, 0.014228147454559803, -0.07474787533283234, 0.014804287813603878, -0.036021262407302856, -0.02513722889125347, -0.014340871013700962, 0.005200914107263088, 0.03754928335547447, -0.09278355538845062, 0.004461952019482851, -0.04185780882835388, 0.013338888995349407, 0.01020769402384758, 0.0143032968044281, -0.01809830404818058, 0.06392646580934525, 0.01130987424403429, -0.011397548019886017, -0.029257882386446, -0.008379076607525349, -0.009268335998058319, -0.040229588747024536, -0.005852202419191599, -0.013388987630605698, 0.011616731993854046, -0.007139123510569334, 0.010433140210807323, 0.004800121299922466, 0.018336275592446327, 0.023396285250782967, -0.015818795189261436, 0.004984861705452204, -0.03787492960691452, -0.010182644240558147, 0.01556829921901226, -0.0222565308213233

# Combine Vector and Text

In [44]:
def generate_short_id(content):
    """
    Generate a short ID based on the content using SHA-256 hash.
    """
    hash_obj = hashlib.sha256()
    hash_obj.update(content.encode("utf-8"))

    print("Generating short id: Done!")
    return hash_obj.hexdigest()

def combine_vector_and_text(texts, meeting_title, text_embeddings):
    """
    Process a list of texts along with their embeddings.
    """
    today = str(date.today())
    
    data_with_metadata = []

    for doc_text, embedding in zip(texts, text_embeddings):
        if not isinstance(doc_text, str):
            doc_text = str(doc_text)

        if not isinstance(meeting_title, str):
            meeting_title = str(meeting_title)

        if not isinstance(today, str):
            today = str(today)

        text_id = generate_short_id(doc_text)
        data_item = {
            "id": text_id,
            "values": embedding,
            "metadata": {"text": doc_text, "title": meeting_title, "date": today},
        }

        data_with_metadata.append(data_item)

    print("Combining vector and text: Done!")
    return data_with_metadata

data_with_meta_data = combine_vector_and_text(texts=chunked_text, meeting_title=meeting_title, text_embeddings=chunked_text_embeddings)
print(data_with_meta_data)

Generating short id: Done!
Generating short id: Done!
Generating short id: Done!
Generating short id: Done!
Combining vector and text: Done!
[{'id': 'e773312289360121c6069407784bbc732f6f8d4505330eb28deaaa0add6a0cd7', 'values': [0.014478643424808979, 0.018023155629634857, 0.016808252781629562, -0.014416019432246685, -0.04786970093846321, 0.0037981390487402678, -0.007627589628100395, 0.04889673367142677, 0.014228147454559803, -0.07474787533283234, 0.014804287813603878, -0.036021262407302856, -0.02513722889125347, -0.014340871013700962, 0.005200914107263088, 0.03754928335547447, -0.09278355538845062, 0.004461952019482851, -0.04185780882835388, 0.013338888995349407, 0.01020769402384758, 0.0143032968044281, -0.01809830404818058, 0.06392646580934525, 0.01130987424403429, -0.011397548019886017, -0.029257882386446, -0.008379076607525349, -0.009268335998058319, -0.040229588747024536, -0.005852202419191599, -0.013388987630605698, 0.011616731993854046, -0.007139123510569334, 0.010433140210807323,

# Upsert to Pinecone

In [45]:
def upsert_data_to_pinecone(data_with_metadata, namespace):
    """
    Upsert data with metadata into a Pinecone index.
    """
    index.upsert(vectors=data_with_metadata, namespace=namespace)
    print("Upserting vectors to Pinecone: Done!")

upsert_data_to_pinecone(data_with_metadata=data_with_meta_data, namespace=meeting_title)
index.describe_index_stats()

Upserting vectors to Pinecone: Done!


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

# MAIN

In [None]:
def store_to_pinecone(texts, meeting_title):
    today = str(date.today()) # INITIALIZATION FOR DATE (DYNAMIC) BASED ON STORING

    chunked_text = chunk_text(text=texts)
    chunked_text_embeddings = generate_embeddings(texts=chunked_text)
    data_with_meta_data = combine_vector_and_text(texts=chunked_text, meeting_title=meeting_title, date=today,  text_embeddings=chunked_text_embeddings)
    upsert_data_to_pinecone(data_with_metadata=data_with_meta_data, namespace_name=meeting_title)

store_to_pinecone(texts=sample_transcript, meeting_title=index,)

Generating embeddings: Done!
Generating short id: Done!
Generating short id: Done!
Generating short id: Done!
Generating short id: Done!
Generating short id: Done!
Combining vector and text: Done!


TypeError: upsert_data_to_pinecone() got an unexpected keyword argument 'namespace_name'