## RAG

### Indexing
1. **Load**: First we need to load our data. This is done with CSV loaders (`CSVLoader`).
2. **Split**: Text splitters break large Documents into smaller chunks. This is useful both for indexing data and for passing it in to a model, since large chunks are harder to search over and won't fit in a model's finite context window.
3. **Store**: We need somewhere to store and index our splits, so that they can later be searched over. This is often done using a VectorStore and Embeddings model.

### Retrieval and generation
4. **Retrieve**: Given a user input, relevant splits are retrieved from storage using a Retriever.
5. **Generate**: A ChatModel / LLM produces an answer using a prompt that includes the question and the retrieved data.

## Package Installation

In [None]:
!pip install langchain langchain_community langchain_chroma langchain-openai langchainhub gradio

## Use OpenAI API Key

In [None]:
import getpass
import os

# Ask for the OpenAI API key only when it is not already configured, so that
# re-running this cell (or running with the key exported in the environment)
# neither prompts again nor overwrites the existing value. getpass hides the
# typed key from the notebook output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

from langchain_openai import ChatOpenAI

# Initialize the chat model (GPT-4o) used later for answer generation
llm = ChatOpenAI(model="gpt-4o")


## Load data from CSV

In [None]:
import os
import csv
from langchain_community.document_loaders.csv_loader import CSVLoader

# Folder that holds the source data files
database_folder = "Database"

# Build the full path of every directory entry, then keep only regular files
# (sub-directories are filtered out)
entry_paths = [os.path.join(database_folder, entry) for entry in os.listdir(database_folder)]
files = list(filter(os.path.isfile, entry_paths))

# Accumulates the documents loaded from all CSV files
all_docs = []

# Read the column names of a CSV file
def detect_csv_structure(file_path):
    """Return the first row of the CSV file at ``file_path`` as a list of strings.

    The file is opened as UTF-8 and only its first record is parsed. An empty
    file raises ``StopIteration`` because there is no first row to read.
    """
    with open(file_path, newline='', encoding='utf-8') as handle:
        rows = csv.reader(handle)
        return next(rows)

# Load every CSV file in the folder into LangChain documents (one per data row)
for file in files:
    # Compare the extension case-insensitively so "DATA.CSV" is not skipped
    if not file.lower().endswith(".csv"):
        continue

    header = detect_csv_structure(file)
    print(f"Detected columns in {file}: {header}")
    if not header:
        # A blank first line yields no column names, so there is no column to
        # use as the document source; skip instead of failing on header[0]
        print(f"Skipping {file}: no header row found")
        continue

    # Prefer the "URL" column as each document's source; otherwise fall back
    # to the first column (adjust as needed for your data)
    source_column = "URL" if "URL" in header else header[0]
    loader = CSVLoader(
        file_path=file,
        source_column=source_column,
        # Read the file with the same encoding detect_csv_structure used
        encoding="utf-8",
        # "fieldnames" is deliberately NOT passed: when it is supplied,
        # csv.DictReader stops treating the first line as the header and
        # returns it as a data row, which would add a junk document made of
        # the column names for every file.
        csv_args={
            "delimiter": ",",
            "quotechar": '"',
        },
    )
    docs = loader.load()
    print(f"Loaded {len(docs)} documents from {file}")
    all_docs.extend(docs)

print(f"Total documents loaded: {len(all_docs)}")


## Split data into chunks

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping chunks. add_start_index=True asks
# the splitter to record where each chunk starts within its source document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
all_splits = text_splitter.split_documents(all_docs)

print(f"Total splits created: {len(all_splits)}")
# Preview the first chunk; guarded so an empty document set does not raise
# IndexError
if all_splits:
    print(all_splits[0])


## Store documents and embeddings in the vector database

In [None]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
import time

import hashlib
import json

# Initialize the embeddings
embedding = OpenAIEmbeddings(model='text-embedding-3-small')

# Initialize the vector store for storing document embeddings; it is persisted
# on disk under ./chroma_db, so its contents survive kernel restarts
vectorstore = Chroma(
    collection_name="my_collection",
    embedding_function=embedding,
    persist_directory="./chroma_db"
)

batch_size = 10


def _split_id(doc):
    """Return a deterministic ID built from a split's metadata and text."""
    fingerprint = json.dumps(doc.metadata, sort_keys=True, default=str) + "\n" + doc.page_content
    return hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()


# Add document splits to the vector store in batches. Each split gets a
# content-derived ID: because the collection is persisted, re-running this cell
# with auto-generated IDs would store every chunk again and retrieval would
# start returning duplicates; stable IDs make the ingestion repeatable.
for i in range(0, len(all_splits), batch_size):
    batch = all_splits[i: i + batch_size]
    vectorstore.add_documents(batch, ids=[_split_id(doc) for doc in batch])
    # Pause between batches to stay gentle on the embeddings API; there is
    # nothing to wait for after the final batch
    if i + batch_size < len(all_splits):
        time.sleep(1)


## Similarity search

In [None]:
from scipy import spatial

# Testing question
question = "What is global investor expand engagement?"

# Create a retriever object that returns the 5 most similar splits
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# Retrieve documents relevant to the question
retrieved_docs = retriever.invoke(question)

print(f"Total documents retrieved: {len(retrieved_docs)}")

# Calculate and print similarity scores.
# The question is embedded once, and all retrieved documents are embedded in a
# single batched request instead of one request per document.
question_embedding = embedding.embed_query(question)
doc_embeddings = (
    embedding.embed_documents([doc.page_content for doc in retrieved_docs])
    if retrieved_docs
    else []  # nothing retrieved: skip the embeddings call entirely
)
for rank, (doc, doc_embedding) in enumerate(zip(retrieved_docs, doc_embeddings), start=1):
    # scipy returns cosine *distance*; 1 - distance is the cosine similarity
    similarity = 1 - spatial.distance.cosine(question_embedding, doc_embedding)
    print(f"Document {rank} (Similarity: {similarity:.4f}):\n{doc.page_content}\n")

## Reply generation

In [None]:
from langchain import hub

# Fetch the predefined "rlm/rag-prompt" prompt from the LangChain hub
prompt = hub.pull("rlm/rag-prompt")

# Render the prompt with literal placeholder strings (not real data) to see
# what the filled-in template looks like
placeholder_inputs = {
    "context": "{retrieved_docs}",
    "question": "{your_question}",
    "reference": "{source}",
}
example_messages = prompt.invoke(placeholder_inputs).to_messages()

first_message = example_messages[0]
print(first_message.content)


## Customize the prompt

In [None]:
from langchain_core.prompts import PromptTemplate

# Text of the custom prompt; {context}, {question} and {source_column} are
# filled in when the prompt is invoked
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer and followed by the reference contains URL source (if it doesn't contain URL don't put any reference).

{context}

Question: {question}

Helpful Answer:

Reference:
- {source_column}"""

# Build the prompt object from the template text
prompt = PromptTemplate.from_template(template)

# Preview the rendered prompt using the top retrieved document and the test
# question
preview_inputs = {
    "context": retrieved_docs[0].page_content,
    "question": question,
    "source_column": 'URL',  # Replace with actual source column
}
example_messages = prompt.invoke(preview_inputs).to_messages()

print(example_messages[0].content)


## LCEL (LangChain Expression Language)

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Turn retrieved documents into the context string used by the prompt
def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# Inputs handed to the prompt: the retrieved context (formatted as text), the
# question passed through unchanged, and the fixed source column name
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
    "source_column": lambda _: "URL",  # Provide source_column
}

# RAG (Retrieval-Augmented Generation) chain: inputs -> prompt -> LLM -> plain string
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

# Stream the answer for the test question, printing each piece as it arrives
for piece in rag_chain.stream(question):
    print(piece, end="", flush=True)


## Gradio interface setup

In [None]:
import gradio as gr
import datetime
import csv
import os

# Function to log interactions to a CSV file
def log_interaction_csv(user_message, bot_message, vote_message=None, log_file="chat_log.csv"):
    """Append a chat interaction to a CSV log, or record a vote on the latest one.

    Args:
        user_message: Question text to log (ignored when ``vote_message`` is given).
        bot_message: Answer text to log (ignored when ``vote_message`` is given).
        vote_message: When truthy, no row is appended; the value is written to the
            last column of the most recent logged interaction instead.
        log_file: Path of the CSV log file.

    File, CSV and decoding errors are reported with ``print`` rather than raised,
    so a logging failure never interrupts the chat flow.
    """
    header = ["Timestamp", "Query", "Answer", "Satisfaction"]

    try:
        # A missing file and a zero-byte file both still need the header row
        is_new = not os.path.isfile(log_file) or os.path.getsize(log_file) == 0

        if not vote_message:
            timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            with open(log_file, "a", newline="", encoding="utf-8") as file:
                writer = csv.writer(file)
                if is_new:
                    writer.writerow(header)
                writer.writerow([timestamp, user_message, bot_message, ""])
            return

        if is_new:
            # Nothing to vote on yet; just make sure the log has its header
            with open(log_file, "w", newline="", encoding="utf-8") as file:
                csv.writer(file).writerow(header)
            return

        # Read, modify, then rewrite using separate handles instead of seeking
        # and truncating inside a single append-mode handle
        with open(log_file, "r", newline="", encoding="utf-8") as file:
            rows = list(csv.reader(file))

        # Walk back from the end to the newest non-blank data row (row 0 is
        # the header) so a trailing blank line cannot swallow the vote
        for row in reversed(rows[1:]):
            if row:
                # Pad short rows so the vote lands in the Satisfaction column
                row.extend([""] * (len(header) - len(row)))
                row[-1] = vote_message
                break
        else:
            return  # header only: no interaction to attach the vote to

        with open(log_file, "w", newline="", encoding="utf-8") as file:
            csv.writer(file).writerows(rows)

    except (OSError, csv.Error, UnicodeError) as e:
        print(f"Error writing to file: {e}")

# Function to generate a chatbot response
def chatbot_response(question):
    """Run the RAG chain on ``question`` and return the streamed pieces joined into one string."""
    return "".join(rag_chain.stream(question))

# Function to handle user votes on responses
def vote(data: gr.LikeData):
    """Record a like/dislike event in the chat log and echo it to stdout."""
    if data.liked:
        vote_message = "Liked"
    else:
        vote_message = "Disliked"
    # Empty query/answer: only the vote is written to the log
    log_interaction_csv("", "", vote_message)
    print(vote_message)

# Gradio interface setup: a chat window, an input row, and the event wiring
with gr.Blocks() as demo:
    gr.Markdown("## TABC - ChatBot V.0.1\nThe database of chatbot (V.0.1) now contains detailed information about the GRESB foundation, its impact, and sustainability focus.")
    chatbot = gr.Chatbot(label="Ask me anything!")

    with gr.Row():
        txt = gr.Textbox(show_label=False, placeholder="Enter your question here...")
        submit_btn = gr.Button("Send")
        retry_btn = gr.Button("Regenerate")  # Re-asks the last question

    def user_message(message, history):
        """Append the user's message with a pending answer and clear the textbox."""
        history.append((message, None))
        return history, ""

    def bot_response(history):
        """Answer the latest pending question, store the answer and log the exchange."""
        pending_question = history[-1][0]
        answer = chatbot_response(pending_question)
        history[-1] = (pending_question, answer)

        # Log the interaction
        log_interaction_csv(pending_question, answer)

        return history

    def retry(history):
        """Drop the last exchange and ask its question again."""
        if not history:
            return history, ""
        # Remove the last interaction, keeping only its question
        last_question = history.pop()[0]
        # Re-run it through the same two steps a normal submission uses
        history, _ = user_message(last_question, history)
        return bot_response(history), ""

    # Pressing Enter and clicking "Send" behave the same: first show the user
    # message, then fill in the bot's answer
    txt.submit(
        user_message, [txt, chatbot], [chatbot, txt], queue=False
    ).then(bot_response, chatbot, chatbot)
    submit_btn.click(
        user_message, [txt, chatbot], [chatbot, txt], queue=False
    ).then(bot_response, chatbot, chatbot)
    retry_btn.click(retry, chatbot, [chatbot, txt])
    # Like/dislike buttons on chatbot messages
    chatbot.like(vote, None, None)

# Launch the Gradio interface
demo.launch()


In [None]:
# List of examples
"""
1. What is global investor expand engagement?
2. Give me some information about ticker named "REG"
3. How did [Fund Name X] perform in the first quarter of 2020? (Replace [Fund Name X] with an actual fund from your data)
4. Which fund in the [Region Y] region showed the strongest growth in 2021?
5. What was the average performance of [Sector Z] funds in the second half of 2022?

"""
