<a href="https://colab.research.google.com/github/ankitrahejagatech/GenAISprint/blob/main/sprint1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Ensure the necessary libraries are installed
!pip install langchain openai chromadb pandas requests tiktoken langchain-community

import os
import pandas as pd
from io import StringIO
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chains import VectorDBQAWithSourcesChain
from langchain.callbacks import get_openai_callback



In [None]:
# Set OpenAI API key
# Only fall back to the inline value when no non-empty key is already present
# in the environment, so a key exported by the shell or runtime is not
# overwritten by the empty placeholder below.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = ""  # Replace with your valid API key

# Sample CSV data for a single user's daily healthcare information over 7 days
# (2024-06-02 through 2024-06-08, newest day first, 8 readings per day)
csv_data = """
date,time,blood_pressure,sugar_level,steps,calories_burned,notes
2024-06-08,08:00,120/80,90,500,50,Felt good after morning walk.
2024-06-08,10:00,115/78,85,300,30,Had a light breakfast.
2024-06-08,12:00,118/80,95,200,20,Worked at desk.
2024-06-08,14:00,122/82,100,100,10,Lunch with friends.
2024-06-08,16:00,125/85,110,150,15,Afternoon snack.
2024-06-08,18:00,120/80,105,400,40,Evening walk.
2024-06-08,20:00,115/78,98,250,25,Dinner at home.
2024-06-08,22:00,118/80,92,100,10,Read a book before bed.
2024-06-07,08:00,130/85,95,450,45,Felt tired in the morning.
2024-06-07,10:00,125/80,90,250,25,Had a big breakfast.
2024-06-07,12:00,120/80,100,150,15,Worked at desk.
2024-06-07,14:00,123/83,105,100,10,Lunch alone.
2024-06-07,16:00,126/87,115,100,10,Felt sleepy.
2024-06-07,18:00,122/80,110,350,35,Evening run.
2024-06-07,20:00,118/78,105,200,20,Light dinner.
2024-06-07,22:00,116/79,97,100,10,Watched TV before bed.
2024-06-06,08:00,125/82,85,500,50,Morning walk.
2024-06-06,10:00,118/80,80,300,30,Healthy breakfast.
2024-06-06,12:00,120/82,90,200,20,Desk work.
2024-06-06,14:00,123/84,95,150,15,Lunch with colleagues.
2024-06-06,16:00,122/80,105,150,15,Afternoon tea.
2024-06-06,18:00,120/78,100,400,40,Evening jog.
2024-06-06,20:00,115/76,95,250,25,Home dinner.
2024-06-06,22:00,117/78,90,150,15,Read book before bed.
2024-06-05,08:00,130/85,100,400,40,Feeling tired.
2024-06-05,10:00,125/80,95,300,30,Skipped breakfast.
2024-06-05,12:00,120/82,105,200,20,Worked through lunch.
2024-06-05,14:00,124/85,110,100,10,Quick lunch.
2024-06-05,16:00,126/86,120,150,15,Afternoon meeting.
2024-06-05,18:00,122/80,115,350,35,Evening gym.
2024-06-05,20:00,118/78,108,200,20,Light dinner.
2024-06-05,22:00,116/79,100,100,10,Watched movie.
2024-06-04,08:00,125/82,90,500,50,Morning run.
2024-06-04,10:00,120/80,85,300,30,Healthy breakfast.
2024-06-04,12:00,118/80,95,200,20,Desk work.
2024-06-04,14:00,122/82,100,100,10,Lunch with friends.
2024-06-04,16:00,125/85,110,150,15,Afternoon walk.
2024-06-04,18:00,120/80,105,400,40,Evening jog.
2024-06-04,20:00,115/78,98,250,25,Dinner at home.
2024-06-04,22:00,118/80,92,100,10,Read book before bed.
2024-06-03,08:00,130/85,95,450,45,Morning yoga.
2024-06-03,10:00,125/80,90,250,25,Big breakfast.
2024-06-03,12:00,120/80,100,150,15,Worked at desk.
2024-06-03,14:00,123/83,105,100,10,Lunch alone.
2024-06-03,16:00,126/87,115,100,10,Afternoon nap.
2024-06-03,18:00,122/80,110,350,35,Evening run.
2024-06-03,20:00,118/78,105,200,20,Light dinner.
2024-06-03,22:00,116/79,97,100,10,Watched TV.
2024-06-02,08:00,125/82,85,500,50,Felt great after morning walk.
2024-06-02,10:00,118/80,80,300,30,Healthy breakfast.
2024-06-02,12:00,120/82,90,200,20,Desk work.
2024-06-02,14:00,123/84,95,150,15,Lunch with colleagues.
2024-06-02,16:00,122/80,105,150,15,Afternoon tea.
2024-06-02,18:00,120/78,100,400,40,Evening jog.
2024-06-02,20:00,115/76,95,250,25,Home dinner.
2024-06-02,22:00,117/78,90,150,15,Read book before bed.
"""

# Pipeline settings, read by run_rag_pipeline() and load_documents()
CONFIG = dict(
    # --- input data ---
    csv_data=csv_data,  # The inline sample CSV text defined above
    csv_fields=[  # Columns of the CSV that are joined into each document
        "date",
        "time",
        "blood_pressure",
        "sugar_level",
        "steps",
        "calories_burned",
        "notes",
    ],
    # --- chunking ---
    chunk_strategy="recursive",  # Either "character" or "recursive"
    chunk_size=1000,  # Size of each chunk
    chunk_overlap=200,  # Overlap between consecutive chunks
    # --- embedding / retrieval ---
    embedding_model="text-embedding-ada-002",  # Model name handed to the embedding engine
    top_k_responses=5,  # How many top responses to retrieve
    # --- optional pipeline stages ---
    clean_data=True,  # Run clean_documents() on the loaded documents
    advanced_retrieval=True,  # Retrieve through advanced_retrieve()
    rerank_responses=True,  # Run rerank_responses() on the retrieved results
    use_rewriter=True,  # Run rewrite_query() on the question
    embedding_engine_type="openai",  # Which embedding engine to build, e.g. "openai"
    # --- the question to ask ---
    query_prompt="How was my health today, and has this improved in the last 7 days?",
)

def load_documents(config):
    """Load the CSV text from ``config`` and turn each row into a Document.

    Args:
        config: Mapping providing ``"csv_data"`` (CSV text including the
            header row) and ``"csv_fields"`` (column names to include, in
            the order they should appear in the document text).

    Returns:
        A list with one ``Document`` per CSV row. ``page_content`` is the
        row's selected, non-missing field values joined by single spaces;
        ``metadata`` is an empty dict.
    """
    from langchain.schema import Document

    # Load CSV data
    print("Loading CSV data...")
    # dtype=str keeps every value as the text written in the CSV while still
    # leaving missing cells as NaN. Converting with astype(str) first would
    # turn NaN into the literal string "nan", which dropna() cannot remove.
    df = pd.read_csv(StringIO(config["csv_data"]), dtype=str)
    selected = df[config["csv_fields"]]

    # One Document per row, skipping missing cells instead of embedding "nan".
    documents = [
        Document(page_content=" ".join(row.dropna()), metadata={})
        for _, row in selected.iterrows()
    ]
    print(f"Loaded {len(documents)} documents.")
    return documents

def clean_documents(documents):
    """Cleaning stage of the pipeline (currently a pass-through).

    Prints progress messages and returns the very same ``documents`` object
    it was given; no cleaning rules are implemented yet.
    """
    print("Cleaning documents...")
    # Placeholder: real cleaning logic would transform ``documents`` here.
    print("Documents cleaned.")
    return documents

def chunkify_documents(documents, strategy="character", chunk_size=1000, overlap=200):
    """Split documents into text chunks using the specified strategy.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string.
        strategy: ``"character"`` to use ``CharacterTextSplitter`` or
            ``"recursive"`` to use ``RecursiveCharacterTextSplitter``.
        chunk_size: Passed to the splitter as ``chunk_size``.
        overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        A flat list of chunk strings, in document order.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names.
    """
    print("Chunkifying documents...")
    if strategy == "character":
        text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    elif strategy == "recursive":
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    else:
        # Fail with a clear message instead of hitting an unbound
        # ``text_splitter`` later (or silently returning no chunks).
        raise ValueError(
            f"Unknown chunk strategy {strategy!r}; expected 'character' or 'recursive'."
        )

    chunks = []
    for doc in documents:
        # Use dot notation to access 'page_content' attribute
        chunks.extend(text_splitter.split_text(doc.page_content))
    print(f"Created {len(chunks)} chunks.")
    return chunks

def embed_chunks(chunks, embedding_model, embedding_engine_type="openai"):
    """Embed the chunks using the specified embedding model and engine.

    Args:
        chunks: Iterable of chunks; each one is converted with ``str()``
            before being embedded.
        embedding_model: Model name passed to the embedding engine.
        embedding_engine_type: Which engine to build. Only ``"openai"``
            (``OpenAIEmbeddings``) is implemented.

    Returns:
        A ``(embeddings, embedding_engine)`` tuple: the result of
        ``embedding_engine.embed_documents(chunks)`` and the engine itself.

    Raises:
        ValueError: If ``embedding_engine_type`` is not supported.
    """
    print(f"Embedding chunks using {embedding_engine_type} engine with model {embedding_model}...")
    if embedding_engine_type != "openai":
        # No other engines exist yet; fail clearly rather than calling
        # ``embed_documents`` on ``None``.
        raise ValueError(
            f"Unsupported embedding engine type {embedding_engine_type!r}; "
            "only 'openai' is implemented."
        )
    embedding_engine = OpenAIEmbeddings(model=embedding_model)

    # Ensure chunks are strings before embedding
    chunks = [str(chunk) for chunk in chunks]

    embeddings = embedding_engine.embed_documents(chunks)
    print(f"Embedded {len(chunks)} chunks.")
    return embeddings, embedding_engine

def store_in_vectordb(documents, embeddings, embedding_engine):
    """Build a Chroma vector store from ``documents``.

    Args:
        documents: Documents handed to ``Chroma.from_documents``.
        embeddings: Accepted for interface compatibility but not read by
            this function; only the documents and the engine are passed on.
        embedding_engine: Embedding object passed to Chroma as ``embedding``.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    print("Storing embeddings in VectorDB...")

    # Chroma receives the engine itself rather than the pre-computed vectors.
    store = Chroma.from_documents(documents=documents, embedding=embedding_engine)

    print("Embeddings stored in VectorDB.")
    return store

def rewrite_query(query):
    """Query-rewriting stage of the pipeline (currently a pass-through).

    Prints the original and the "rewritten" query and returns the query
    unchanged; no rewriting logic is implemented yet.
    """
    print("Rewriting query...")
    # Placeholder: a real rewriter would derive ``new_query`` from ``query``.
    new_query = query
    print(f"Original query: {query}")
    print(f"Rewritten query: {new_query}")
    return new_query

def advanced_retrieve(vectordb, query_embedding, top_k=5):
    """Retrieve the ``top_k`` most similar documents together with their scores.

    Args:
        vectordb: Vector store to search. It must provide
            ``similarity_search_with_score`` (text queries) and
            ``similarity_search_by_vector_with_relevance_scores``
            (vector queries).
        query_embedding: Either the query's embedding vector (a sequence of
            floats) or the raw query text.
        top_k: Number of results to request.

    Returns:
        Whatever the vector store's scored search returns; for Chroma this
        is a list of ``(document, score)`` tuples.
    """
    print("Using advanced retrieval strategies...")
    if isinstance(query_embedding, str):
        # Raw text: let the vector store embed the query itself.
        return vectordb.similarity_search_with_score(query_embedding, k=top_k)
    # A pre-computed vector must go through the by-vector search;
    # ``similarity_search`` only accepts query text and fails on a vector.
    return vectordb.similarity_search_by_vector_with_relevance_scores(
        query_embedding, k=top_k
    )

def rerank_responses(responses):
    """Reranking stage of the pipeline (currently a pass-through).

    Prints a progress message and returns the same ``responses`` object in
    its original order; no reranking logic is implemented yet.
    """
    print("Reranking responses...")
    # Placeholder: a real reranker would reorder ``responses`` here.
    return responses

def validate_response(response):
    """Response-validation stage of the pipeline (currently a pass-through).

    Prints progress messages and returns ``response`` unchanged; no
    validation criteria are implemented yet.
    """
    print("Validating response...")
    # Placeholder: real validation would inspect ``response`` here.
    print("Response validated.")
    return response

def guardrail_validation(question):
    """Input-guardrail stage of the pipeline (currently a pass-through).

    Prints progress messages and returns ``question`` unchanged; no safety
    or appropriateness checks are implemented yet.
    """
    print("Validating input question...")
    # Placeholder: real guardrail checks would inspect ``question`` here.
    print("Input question validated.")
    return question

# Main RAG function
def run_rag_pipeline(config):
    """Run the RAG pipeline from loading documents to printing validated responses.

    Steps, in order: load documents, optionally clean them, chunk and embed
    them, store the documents in the vector store, validate and optionally
    rewrite the question, retrieve the top matches, optionally rerank them,
    then validate and print each response with its score.

    Args:
        config: Mapping with the keys read below (``clean_data``,
            ``chunk_strategy``, ``chunk_size``, ``chunk_overlap``,
            ``embedding_model``, ``embedding_engine_type``, ``query_prompt``,
            ``use_rewriter``, ``advanced_retrieval``, ``top_k_responses``,
            ``rerank_responses``) plus those read by ``load_documents``.

    Returns:
        None. Results are printed.
    """
    print("Starting RAG process...")

    # Load and process documents
    documents = load_documents(config)

    # Clean documents if specified
    if config["clean_data"]:
        documents = clean_documents(documents)

    # Chunkify documents
    chunks = chunkify_documents(
        documents,
        strategy=config["chunk_strategy"],
        chunk_size=config["chunk_size"],
        overlap=config["chunk_overlap"],
    )

    # Embed chunks
    embeddings, embedding_engine = embed_chunks(
        chunks, config["embedding_model"], config["embedding_engine_type"]
    )

    # Store embeddings in VectorDB
    vectordb = store_in_vectordb(documents, embeddings, embedding_engine)

    # Validate the input question taken from config
    question = guardrail_validation(config["query_prompt"])

    # Rewrite query if specified
    if config["use_rewriter"]:
        question = rewrite_query(question)

    # Retrieve responses. The text-based search methods take the query string
    # and embed it with the store's own embedding function; handing them a
    # pre-computed embedding vector fails with a TypeError.
    if config["advanced_retrieval"]:
        responses = advanced_retrieve(vectordb, question, top_k=config["top_k_responses"])
    else:
        responses = vectordb.similarity_search_with_score(question, k=config["top_k_responses"])

    # Rerank responses if specified
    if config["rerank_responses"]:
        responses = rerank_responses(responses)

    # Validate and print responses
    for item in responses:
        # Retrievers may yield (document, score) pairs or bare documents;
        # report a score of None when none was provided.
        if isinstance(item, tuple):
            response, score = item
        else:
            response, score = item, None
        validated_response = validate_response(response)
        print(f"Response: {validated_response}\nScore: {score}\n")

    print("RAG process completed.")

# Run the pipeline with the module-level CONFIG, but only when this code is
# executed directly (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    run_rag_pipeline(CONFIG)



Starting RAG process...
Loading CSV data...
Loaded 56 documents.
Cleaning documents...
Documents cleaned.
Chunkifying documents...
Created 56 chunks.
Embedding chunks using openai engine with model text-embedding-ada-002...
Embedded 56 chunks.
Storing embeddings in VectorDB...
Embeddings stored in VectorDB.
Validating input question...
Input question validated.
Rewriting query...
Original query: How was my health today, and has this improved in the last 7 days?
Rewritten query: How was my health today, and has this improved in the last 7 days?
Using advanced retrieval strategies...


TypeError: expected string or buffer