## Step 1: Load, chunk, and index the source text in Pinecone

In [1]:
from langchain_community.document_loaders import ConcurrentLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Input location and chunking parameters, named so they are easy to find and tune.
SOURCE_PATH = "/home/leapfrog/llm/github/kss/data_team_kss/aggregated_content.txt"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# Load the source and keep the text of the first loaded document only.
loader = ConcurrentLoader.from_filesystem(SOURCE_PATH, glob="**/*.txt")
loaded_data = loader.load()
leapfrog_confluence_data = loaded_data[0].page_content

# Recursive chunking splits the text hierarchically with a list of separators:
# when one separator leaves a chunk that is still too large, the splitter
# retries on that chunk with the next separator. Chunks therefore are not
# exactly equal in size, but they aim for a similar size.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
splits = text_splitter.create_documents([leapfrog_confluence_data])

In [None]:
# Preview only the first few chunks; displaying the whole list floods the notebook output.
splits[:3]

In [2]:
# Number of chunks produced by the text splitter.
len(splits)

34

In [4]:
from langchain_cohere import CohereEmbeddings

import os
from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail fast with a readable message when a credential is absent. The previous
# `os.environ[k] = os.getenv(k)` did nothing when the key was set and raised an
# opaque TypeError (environment values must be str, not None) when it was not.
missing_keys = [
    key for key in ('COHERE_API_KEY', 'PINECONE_API_KEY') if not os.getenv(key)
]
if missing_keys:
    raise ValueError(
        f"Missing API keys: {', '.join(missing_keys)}. Please check your environment variables."
    )

# Name the embedding model explicitly instead of relying on the library default:
# the Pinecone index is created with dimension 4096, which is the output size of
# embed-english-v2.0, and newer langchain-cohere releases require `model`.
embeddings = CohereEmbeddings(model="embed-english-v2.0")
index_name = "index-data-team-kss-slack"

In [5]:
from pinecone import Pinecone, ServerlessSpec

import time

# The client reads PINECONE_API_KEY from the environment.
pc = Pinecone()

# Must equal the output size of the embedding model (embed-english-v2.0 -> 4096).
# https://docs.cohere.com/docs/embed-2
EMBEDDING_DIMENSION = 4096

if index_name not in pc.list_indexes().names():
    # Create the index only when it does not exist yet, so re-running the cell is safe.
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIMENSION,
        metric='cosine',
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )
    # Index creation is asynchronous; wait until it reports ready so that the
    # upsert in the following cell does not hit an index that is still initializing.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

index = pc.Index(index_name)

In [7]:
from langchain_pinecone import PineconeVectorStore
from langchain_community.vectorstores import Pinecone

# Reuse the index when it already holds vectors; otherwise embed and upload the chunks.
# Both branches go through PineconeVectorStore: the langchain_community `Pinecone`
# wrapper is deprecated, and its name shadows the `pinecone.Pinecone` client class
# imported earlier, which breaks a re-run of the index-creation cell.
if index.describe_index_stats()['total_vector_count'] > 0:
    # Data is already present: attach to the existing index without re-embedding.
    vectorstore = PineconeVectorStore.from_existing_index(
        index_name,
        embeddings,
    )
    print("A vector index with this name already exists. It will be utilized.")
else:
    # Empty index: embed every chunk in `splits` and upsert it.
    vectorstore = PineconeVectorStore.from_documents(
        splits,
        embeddings,
        index_name=index_name
    )
    print("Vector index created successfully.")

print('--------------------------')
print(index.describe_index_stats())

A vector index with this name already exists. It will be utilized.
----------------------------
{'dimension': 4096,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 34}},
 'total_vector_count': 34}


## Step 2: Retrieve context from Pinecone and answer questions with the LLM

In [2]:
# Import required libraries
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_cohere import CohereEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import Pinecone

# Load credentials from environment variables
# Pull variables from a local .env file (if any) into the environment.
load_dotenv()

# Read the three credentials this notebook needs in one pass.
COHERE_API_KEY, PINECONE_API_KEY, GROQ_API_KEY = (
    os.getenv(key_name)
    for key_name in ('COHERE_API_KEY', 'PINECONE_API_KEY', 'GROQ_API_KEY')
)

# Stop here if any credential is unset or empty.
if not (COHERE_API_KEY and PINECONE_API_KEY and GROQ_API_KEY):
    raise ValueError("Missing API keys. Please check your environment variables.")

In [3]:
# Name the embedding model explicitly: query vectors must have the same size as
# the vectors stored in the index (dimension 4096, i.e. embed-english-v2.0), and
# newer langchain-cohere releases require `model` to be passed.
embeddings = CohereEmbeddings(model="embed-english-v2.0")
index_name = "index-data-team-kss-slack"

try:
    # Attach to the already-populated index and expose it as a retriever.
    docsearch = Pinecone.from_existing_index(index_name, embeddings)
    retriever = docsearch.as_retriever()
except Exception as e:
    # Chain the original exception so the root cause stays in the traceback.
    raise ValueError(f"Error retrieving index '{index_name}': {str(e)}") from e

# temperature=0 is set for the chat model used to answer questions.
llm = ChatGroq(temperature=0, model_name="llama3-8b-8192")


In [4]:
# Question sent to the retriever and the LLM. The commented-out lines are
# alternative sample questions; uncomment one (and comment the active line) to try it.
# question = 'tell me about latest Leaptalk on ai?'
question = 'When was the last session related to health and who hosted it?'
# question = 'I was interested in ai initiatives, what can i can do for it?'

In [5]:

# Fetch the documents the retriever returns for the current question.
context = retriever.invoke(question)
# Concatenate the text of every retrieved document, one per line.
context_str = "\n".join([doc.page_content for doc in context])

# System prompt: behavior rules for the assistant followed by the retrieved
# context. This is an f-string, so `context_str` is substituted right here.
system = f"""
<rules>
NO MATTER WHAT, STRICTLY FOLLOW THESE RULES FOR EVERY QUESTION.
Answer QUESTION related to "Leapfrog Technology" that is in information provided.
	For example,
		- Leave policy
		- Company values, vision, mission, and strategy
		- Etc
Answer casual greetings and conversation QUESTION.
	For example,
		Human: Hey!
		AI: Hello! How can I help?
</rules>
Never give me any answers that are not mentioned inside the <rules></rules> above. If I asked about things not related to `Leapfrog Technology` like programming, literature, general knowledge question, business ideas, yourself, AI Model, OpenAI, anthropic, claude, version etc. respond that you only know about `Leapfrog Technology`.
You are AskRibby, an AI chatbot created by `Leapfrog Technology` which provide a detailed answer to a my QUESTION by analyzing the entire CONTEXT.
Do not use past history as a knowledge base for follow-ups or suggestions.
CONTEXT for the QUESTION is provided below.

{context_str}
"""

In [6]:
def process_question(system, question):
    """Stream the model's answer to `question` to standard output.

    Args:
        system: Fully rendered system prompt (rules plus retrieved context).
        question: The user's question.

    Errors raised while streaming are caught and reported with a printed
    message instead of being re-raised.
    """
    # Pass both texts as template *values* rather than as the template itself.
    # Used as templates, any "{" or "}" in the retrieved context or in the
    # question would be parsed as a placeholder and break prompt formatting.
    prompt = ChatPromptTemplate.from_messages(
        [("system", "{system}"), ("human", "{question}")]
    )
    chain_instance = prompt | llm

    try:
        for chunk in chain_instance.stream({"system": system, "question": question}):
            if chunk.content is not None:
                print(chunk.content, end='')
    except Exception as e:
        print(f"Error processing the question: {str(e)}")

process_question(system, question)


According to the provided context, the last session related to health was the Ergonomics Session, which was hosted by Leapfrog Technology in collaboration with Sarwanidan Clinic and CAMS Nepal. The session was scheduled to take place on May 10th, 2024, from 4:00 PM to 5:00 PM at Mustang Hall, Wing B.