# Notebook for exploring variations on LLM, embeddings, RAG arch and prompt

## DBs to retrieve from

In [None]:
# Names of the vector databases available for retrieval; the experiments
# below pick subsets of this list (e.g. by slicing) or name them directly.
dbs = ["full_docs", "fragments_docs", "posts_forum", "threads_forum"]

## Options of embedding, chat and vectorstore to be tested

In [None]:
# OpenAI embedding models under test
embedding_models = [
    "text-embedding-3-small",
    "text-embedding-3-large",
    "text-embedding-ada-002",
]

# OpenAI chat models under test
chat_models = [
    "gpt-3.5-turbo-0125",
    "gpt-4o",
]

# Anthropic chat models under test
chat_models_claude = [
    "claude-3-sonnet-20240229",
]

# vector store backends under test (FAISS only)
vectorstores = ['faiss']

## Imports

In [None]:
# general
import pandas as pd
import numpy as np
import os, asyncio, time, re
from getpass import getpass
from datetime import datetime
import tiktoken # metrics
import nest_asyncio
nest_asyncio.apply()
from typing import Callable

# embedding and chat
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
# openai api key
openai_api_key = getpass("Enter the OpenAI API key: ")

# Anthropic chat
os.environ["ANTHROPIC_API_KEY"] = getpass("Enter your Anthropic API key: ")
from langchain_anthropic import ChatAnthropic

# vectorstore
if 'faiss' in vectorstores:
    from langchain_community.vectorstores import FAISS

# for tracking
import weave
from weave import Evaluation

## Import test data 

In [None]:
# Location of the question/answer test set, relative to this notebook.
test_path = "../002_create_test_dataset/questions_test_dataset.csv"
test_dataset = pd.read_csv(test_path)

# drop origin
#test_dataset = test_dataset.drop(columns=['origin'])

# rename the reference answer column so it matches the `expected` parameter
# of the scorer (eval_model)
test_dataset = test_dataset.rename(columns={'answer': 'expected'})

# sample up to 50 questions with a fixed seed; capping at the dataset size
# avoids the ValueError DataFrame.sample raises when asked for more rows
# than exist (sampling without replacement)
test_dataset = test_dataset.sample(min(50, len(test_dataset)), random_state=42)

# as a list of row dicts
test_dataset = test_dataset.to_dict(orient='records')
test_dataset

## General definitions for accessing data and creating the model

In [None]:
def load_db(dbs, model_embeddings, vectorstore = 'faiss'):
    """Load the named vector databases and merge them into a single store.

    Args:
        dbs: names of the databases to load; each one is read from
            ``dbs/{name}_db/faiss/{model_embeddings}``.
        model_embeddings: name of the OpenAI embedding model. It selects both
            the on-disk index directory and the embedding function.
        vectorstore: vector store backend. Only ``'faiss'`` is supported.

    Returns:
        The first loaded FAISS store, with every other store merged into it.

    Raises:
        ValueError: if ``vectorstore`` is not supported or ``dbs`` is empty.
    """
    # Validate before doing any work, so a bad argument produces a clear
    # error instead of an unbound local or an IndexError further down.
    if vectorstore != 'faiss':
        raise ValueError(f"Unsupported vectorstore: {vectorstore!r}")
    names = list(dbs)
    if not names:
        raise ValueError("At least one database name is required")

    embeddings = OpenAIEmbeddings(model=model_embeddings, openai_api_key=openai_api_key)
    db_paths = [f"dbs/{name}_db/faiss/{model_embeddings}" for name in names]
    # NOTE(security): allow_dangerous_deserialization=True opts in to loading
    # pickled data from disk; only point this at index files you trust.
    stores = [
        FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
        for db_path in db_paths
    ]

    # merge_from mutates the first store in place, absorbing the others
    db = stores[0]
    for other in stores[1:]:
        db.merge_from(other)

    return db

In [None]:
@weave.op()
def build_chat(chat_pars, prompt_template):
    """Create the OpenAI chat model and the prompt-to-model chain.

    Args:
        chat_pars: keyword arguments forwarded to ``ChatOpenAI``.
        prompt_template: template string handed to
            ``ChatPromptTemplate.from_template``.

    Returns:
        A ``(chain, llm)`` tuple, where ``chain`` pipes the prompt into ``llm``.
    """
    template = ChatPromptTemplate.from_template(prompt_template)
    model = ChatOpenAI(openai_api_key=openai_api_key, **chat_pars)
    return template | model, model
    
@weave.op()
def build_retriever(dbs_name, embeddings_name, vectorstore = 'faiss', retriever_pars = None):
    """Load the requested databases and expose them as a retriever.

    Args:
        dbs_name: names of the databases to load (passed to ``load_db``).
        embeddings_name: name of the embedding model (passed to ``load_db``).
        vectorstore: vector store backend. Only ``'faiss'`` is supported.
        retriever_pars: keyword arguments forwarded to ``as_retriever``;
            ``None`` means no extra arguments.

    Returns:
        The retriever produced by ``as_retriever`` on the merged store.

    Raises:
        ValueError: if ``vectorstore`` is not supported.
    """
    # Fail early with a clear message instead of reaching an unbound local.
    if vectorstore != 'faiss':
        raise ValueError(f"Unsupported vectorstore: {vectorstore!r}")

    db = load_db(dbs_name, embeddings_name, vectorstore)
    # None (rather than a shared mutable {}) is the default; normalise it here
    return db.as_retriever(**(retriever_pars or {}))

class RAGModel(weave.Model):
    """Simple RAG pipeline: one retriever feeding one OpenAI chat chain."""

    structure : str = "simple-rag" # just a retriever and a llm

    # databases to merge and the embedding model used to query them
    dbs_name : list
    embeddings_name : str

    # vector store backend and keyword arguments for its retriever
    vectorstore : str
    retriever_pars : dict

    # prompt template string and keyword arguments for the chat model
    prompt_template : str
    chat_pars : dict[str, str|int]

    @weave.op()
    def predict(self, question: str):
        """Retrieve context for ``question`` and generate an answer.

        Returns a dict with the stringified retrieved context under
        ``"context"`` and the model reply text under ``"answer"``.
        """
        # the retriever and the chain are rebuilt on every call
        retriever = build_retriever(self.dbs_name, self.embeddings_name, self.vectorstore, self.retriever_pars)
        chain, _ = build_chat(self.chat_pars, self.prompt_template)

        if self.vectorstore == 'faiss':
            context = retriever.invoke(question)

        chain_input = {"context": context, "question": question}
        reply = chain.invoke(chain_input)

        return {"context": str(context), "answer": reply.content}

    def ask(self, question: str):
        """Return only the answer text for ``question``."""
        return self.predict(question)["answer"]
    

## Metrics

In [None]:
# reference embedding
reference_embedding = "text-embedding-ada-002"
reference_embedding = OpenAIEmbeddings(model=reference_embedding, openai_api_key=openai_api_key)

# reference chat
reference_chat = "gpt-4o"
reference_chat = ChatOpenAI(model=reference_chat, openai_api_key=openai_api_key)

# reference tokenization
reference_tokenization = tiktoken.get_encoding("cl100k_base")

In [None]:
# EVALUATE THE RETRIEVER
def calc_context_recall(context, question, expected):
    """Estimate how much of the expected answer is supported by the context.

    The reference chat model first breaks ``expected`` into fundamental
    statements, then judges each statement against ``context`` with a final
    Yes/No verdict per line. The score is the share of "Yes" verdicts.
    info: https://aclanthology.org/2024.eacl-demo.16.pdf

    Args:
        context: retrieved context, as text.
        question: the question being answered.
        expected: the answer whose meaning should be found in the context.

    Returns:
        ``n_yes / (n_yes + n_no)``, or 0 when no verdict could be parsed.
    """
    # we try to find a set of fundamental statements that encompass the meaning of the answer
    statements = reference_chat.invoke(
        f"Given a question and answer, return the fundamental statements from the answer's meaning. \n question: {question} \n answer: {expected}"
    ).content
    # we try to find if the statements are supported by the context
    veredicts = reference_chat.invoke(
        f"Consider the given context and following statements, then determine whether they are supported by the information present in the context. Provide a brief explanation for each statement before arriving at the verdict (Yes/No). Let the veredict be the final word of each line. \n\n <context> \n {context} <\\context> \n\n <statements> \n {statements} \n <\\statements>"
    ).content

    # Take the last alphabetic word of each line and compare it exactly.
    # A substring test would count words such as "know" or "unknown" as "no";
    # extracting letters only also ignores punctuation/markdown ("Yes.", "**No**").
    n_yes = 0
    n_no = 0
    for line in veredicts.split("\n"):
        words = re.findall(r"[a-z]+", line.lower())
        if not words:
            continue
        if words[-1] == "yes":
            n_yes += 1
        elif words[-1] == "no":
            n_no += 1

    total = n_yes + n_no
    if total == 0:
        # no parsable verdict at all
        return 0

    return n_yes / total

def calc_context_conciseness(context, expected):
    """Compare the size of the context with the size of the expected answer.

    Args:
        context: retrieved context, as text.
        expected: the reference answer.

    Returns:
        Token count of ``expected`` divided by token count of ``context``
        (both counted with the reference tokenizer), or 0.0 when the context
        has no tokens.
    """
    num_tokens_context = len(reference_tokenization.encode(context))
    if num_tokens_context == 0:
        # An empty context has no meaningful ratio; fall back to 0 (as
        # calc_context_recall does) instead of raising ZeroDivisionError.
        return 0.0

    num_tokens_expected = len(reference_tokenization.encode(expected))
    return num_tokens_expected / num_tokens_context

# EVALUATE THE LLM
def calc_answer_relevance(answer, question):
    """Score how closely the answer addresses the question.

    The reference chat model generates a question from ``answer``; the score
    is the cosine similarity between the reference embeddings of that
    generated question and of the real ``question``.
    info: https://aclanthology.org/2024.eacl-demo.16.pdf
    """
    generated_question = reference_chat.invoke(
        f"Generate a question for the given answer. \n answer: {answer}"
    ).content

    generated_vec = np.array(reference_embedding.embed_query(generated_question))
    question_vec = np.array(reference_embedding.embed_query(question))

    # cosine similarity
    norms = np.linalg.norm(generated_vec) * np.linalg.norm(question_vec)
    return np.dot(generated_vec, question_vec) / norms

def calc_faithfulness(context, question, answer):
    """Score whether the answer's meaning is contained in the context.

    Same procedure as the context recall, with the generated ``answer``
    taking the place of the expected answer.
    info: https://aclanthology.org/2024.eacl-demo.16.pdf
    """
    return calc_context_recall(context=context, question=question, expected=answer)


# EVALUATE END-TO-END
def calc_answer_semantic_similarity(answer, expected):
    """Cosine similarity between the answer and the expected answer.

    Both texts are embedded with the reference embedding model.
    info: https://docs.ragas.io/en/latest/concepts/metrics/semantic_similarity.html
    """
    answer_vec = np.array(reference_embedding.embed_query(answer))
    expected_vec = np.array(reference_embedding.embed_query(expected))

    norms = np.linalg.norm(answer_vec) * np.linalg.norm(expected_vec)
    return np.dot(answer_vec, expected_vec) / norms

def calc_answer_conciseness(answer, expected):
    """Compare the size of the answer with the size of the expected answer.

    Args:
        answer: the generated answer.
        expected: the reference answer.

    Returns:
        Token count of ``expected`` divided by token count of ``answer``
        (both counted with the reference tokenizer), or 0.0 when the answer
        has no tokens.
    """
    num_tokens_answer = len(reference_tokenization.encode(answer))
    if num_tokens_answer == 0:
        # An empty answer would divide by zero and abort the scoring of the
        # whole example; fall back to 0 as calc_context_recall does.
        return 0.0

    num_tokens_expected = len(reference_tokenization.encode(expected))
    return num_tokens_expected / num_tokens_answer

# ALL
def calc_metrics(question, expected, context, answer):
    """Compute every metric for one example, grouped by pipeline stage.

    Returns a dict with the groups ``"retriever"``, ``"llm"`` and
    ``"end-to-end"``, each mapping metric names to scores.
    """
    retriever_scores = {
        "context_recall": calc_context_recall(context, question, expected),
        "context_conciseness": calc_context_conciseness(context, expected),
    }
    llm_scores = {
        "answer_relevance": calc_answer_relevance(answer, question),
        "faithfulness": calc_faithfulness(context, question, answer),
    }
    end_to_end_scores = {
        "answer_semantic_similarity": calc_answer_semantic_similarity(answer, expected),
        "answer_conciseness": calc_answer_conciseness(answer, expected),
    }

    return {
        "retriever": retriever_scores,
        "llm": llm_scores,
        "end-to-end": end_to_end_scores,
    }

In [None]:
@weave.op()
def eval_model(question: str, expected: str, model_output: dict) -> dict:
    """Scorer: compute all metrics from one model output.

    ``model_output`` must hold the ``"context"`` and ``"answer"`` keys, as
    returned by the models' ``predict`` methods.
    """
    context = model_output["context"]
    answer = model_output["answer"]
    return calc_metrics(question, expected, context, answer)

In [None]:
def run_rag_evaluation(rag, test_dataset=test_dataset):
    """Evaluate ``rag.predict`` on the test dataset with ``eval_model``.

    The model configuration is attached to the run through
    ``weave.attributes``. Nothing is returned.
    """
    evaluation = Evaluation(dataset=test_dataset, scorers=[eval_model])

    run_attributes = {
        'dbs': rag.dbs_name,
        'embeddings': rag.embeddings_name,
        'chat_pars': rag.chat_pars,
        'prompt_template': rag.prompt_template,
        'retriever_pars': rag.retriever_pars,
        'vectorstore': rag.vectorstore,
        "structure": rag.structure,
    }
    with weave.attributes(run_attributes):
        asyncio.run(evaluation.evaluate(rag.predict))

## First Tests

In [None]:
# initialise Weave tracking under the name 'op-ai-tools'
weave.init('op-ai-tools')

In [None]:
# Keyword arguments for the chat model; chat_models[1] is "gpt-4o".
chat_pars = {
    "model": chat_models[1],
    "temperature": 0,
    #"max_tokens": None,
    #"timeout": None,
    "max_retries": 2
}

# The doubled braces are f-string escapes: the resulting string contains the
# literal {context} and {question} placeholders that build_chat passes to
# ChatPromptTemplate.from_template.
prompt_template = f"""Answer politely the question at the end, using only the following context. The user is not necessarily a specialist, so please avoid jargon and explain any technical terms.

<context>
{{context}} 
</context>

Question: {{question}}
"""

# Simple RAG over dbs[1:3] (fragments_docs and posts_forum), using
# embedding_models[2] (text-embedding-ada-002) and 6 retrieved documents.
rag = RAGModel(
    dbs_name = dbs[1:3],
    embeddings_name = embedding_models[2],
    chat_pars=chat_pars,
    prompt_template = prompt_template,
    retriever_pars = {
        "search_kwargs" : {'k': 6}
    },
    vectorstore = 'faiss'
)

# evaluation is disabled for this configuration; uncomment to run it
#run_rag_evaluation(rag)

## Multi-Retriever tests

In [None]:
class RAGModel_multiRetriever(weave.Model):
    """RAG pipeline with one retriever per database.

    Each database is queried with its own retriever parameters; the
    per-database results go through ``output_filter`` before being handed
    to the chat chain as context.
    """

    # database name -> keyword arguments for that database's retriever
    dbs_retriever_pars : dict[str, dict]
    embeddings_name : str

    vectorstore : str
    prompt_template : str
    chat_pars : dict[str, str|int]

    # called as output_filter(context=..., question=...) on the per-database results
    output_filter : Callable

    structure : str = "multi-retriever-rag_"  # multiple retrievers are combined for generating the context

    @weave.op()
    def predict(self, question: str):
        """Retrieve from every database, filter, and generate an answer.

        Returns a dict with the stringified filtered context under
        ``"context"`` and the model reply text under ``"answer"``.
        """
        # one retriever per database, each with its own parameters
        retrievers = {
            db_name: build_retriever([db_name], self.embeddings_name, self.vectorstore, pars)
            for db_name, pars in self.dbs_retriever_pars.items()
        }

        chain, _ = build_chat(self.chat_pars, self.prompt_template)

        if self.vectorstore == 'faiss':
            context = {
                db_name: db_retriever.invoke(question)
                for db_name, db_retriever in retrievers.items()
            }

        context = self.output_filter(context = context, question = question)

        chain_input = {"context": context, "question": question}
        reply = chain.invoke(chain_input)

        return {"context": str(context), "answer": reply.content}

In [None]:
def run_rag_multiret_evaluation(rag, test_dataset=test_dataset):
    """Evaluate a multi-retriever model on the test dataset with ``eval_model``.

    The model configuration is attached to the run through
    ``weave.attributes``; the logged structure name is the model's
    ``structure`` followed by the output filter's ``__name__``.
    Nothing is returned.
    """
    evaluation = Evaluation(dataset=test_dataset, scorers=[eval_model])

    db_names = list(rag.dbs_retriever_pars.keys())
    structure_name = rag.structure + rag.output_filter.__name__
    run_attributes = {
        'dbs': db_names,
        'embeddings': rag.embeddings_name,
        'chat_pars': rag.chat_pars,
        'prompt_template': rag.prompt_template,
        'retriever_pars': rag.dbs_retriever_pars,
        'vectorstore': rag.vectorstore,
        "structure": structure_name,
    }
    with weave.attributes(run_attributes):
        asyncio.run(evaluation.evaluate(rag.predict))

In [None]:
def no_filter(context, question):
    """Output filter that returns the retrieved context unchanged.

    ``question`` is accepted only to match the output-filter call signature.
    """
    return context

In [None]:
def filter_trustable_posts(context, question):
    """Print the retrieved context and return it unchanged.

    NOTE(review): despite the name, no filtering happens here yet; this
    looks like a placeholder for a trust-based filter - confirm.
    """
    print(context)
    return context

In [None]:
# Keyword arguments for the chat model; chat_models[1] is "gpt-4o".
chat_pars = {
    "model": chat_models[1],
    "temperature": 0,
    #"max_tokens": None,
    #"timeout": None,
    "max_retries": 2
}

# The doubled braces are f-string escapes: the resulting string contains the
# literal {context} and {question} placeholders that build_chat passes to
# ChatPromptTemplate.from_template.
prompt_template = f"""Answer politely the question at the end, using only the following context. The user is not necessarily a specialist, so please avoid jargon and explain any technical terms.

<context>
{{context}} 
</context>

Question: {{question}}
"""

# Multi-retriever RAG: 4 documents from posts_forum plus 2 from
# fragments_docs, with no post-retrieval filtering (no_filter).
rag = RAGModel_multiRetriever(
    dbs_retriever_pars = {
        "posts_forum": {
            "search_kwargs" : {
                'k': 4,
                # presumably restricts results to posts whose trust_level
                # metadata is one of these values - confirm against the
                # FAISS retriever filter semantics
                'filter': {
                    'trust_level': [2, 3, 4, 5]
                }
            }
        },
        "fragments_docs": {
            "search_kwargs" : {'k': 2}
        }
    },
    embeddings_name = embedding_models[2],
    chat_pars=chat_pars,
    prompt_template = prompt_template,
    output_filter = no_filter,
    vectorstore = 'faiss'
)

# evaluation is disabled for this configuration; uncomment to run it
#run_rag_multiret_evaluation(rag)

## Contextual Compression Tests

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

class RAGModel_contextualcompression(weave.Model):
    """RAG pipeline whose retriever output is compressed by the chat model.

    The base retriever is wrapped in a ``ContextualCompressionRetriever``
    whose compressor is an ``LLMChainExtractor`` built from the same chat
    model that generates the answer.
    """

    structure : str = "contextual-compression-rag"  # use contextual compression for improving the retriever

    # databases to merge and the embedding model used to query them
    dbs_name : list
    embeddings_name : str

    vectorstore : str
    prompt_template : str
    chat_pars : dict[str, str|int]

    # keyword arguments for the base retriever
    retriever_pars : dict

    @weave.op()
    def predict(self, question: str):
        """Retrieve compressed context for ``question`` and generate an answer.

        Returns a dict with the stringified context under ``"context"`` and
        the model reply text under ``"answer"``.
        """
        base_retriever = build_retriever(self.dbs_name, self.embeddings_name, self.vectorstore, self.retriever_pars)
        chain, llm = build_chat(self.chat_pars, self.prompt_template)

        if self.vectorstore == 'faiss':
            compression_retriever = ContextualCompressionRetriever(
                base_compressor=LLMChainExtractor.from_llm(llm),
                base_retriever=base_retriever,
            )
            context = compression_retriever.invoke(question)

        chain_input = {"context": context, "question": question}
        reply = chain.invoke(chain_input)

        return {"context": str(context), "answer": reply.content}

In [None]:
# Keyword arguments for the chat model; chat_models[1] is "gpt-4o".
chat_pars = {
    "model": chat_models[1],
    "temperature": 0,
    #"max_tokens": None,
    #"timeout": None,
    "max_retries": 2
}

# The doubled braces are f-string escapes: the resulting string contains the
# literal {context} and {question} placeholders that build_chat passes to
# ChatPromptTemplate.from_template.
prompt_template = f"""Answer politely the question at the end, using only the following context. The user is not necessarily a specialist, so please avoid jargon and explain any technical terms.

<context>
{{context}} 
</context>

Question: {{question}}
"""

# Contextual-compression RAG over full_docs and threads_forum, using
# embedding_models[2] (text-embedding-ada-002) and 3 retrieved documents.
rag = RAGModel_contextualcompression(
    dbs_name = ["full_docs", "threads_forum"],
    embeddings_name = embedding_models[2],
    chat_pars=chat_pars,
    prompt_template = prompt_template,
    retriever_pars = {
        "search_kwargs" : {'k': 3}
    },
    vectorstore = 'faiss'
)

# evaluation is disabled for this configuration; uncomment to run it
#run_rag_evaluation(rag)

## Claude Tests

In [None]:
@weave.op()
def build_retriever(dbs_name, embeddings_name, vectorstore = 'faiss', retriever_pars = None):
    """Load the requested databases and expose them as a retriever.

    This re-declares the ``build_retriever`` defined earlier in the notebook.

    Args:
        dbs_name: names of the databases to load (passed to ``load_db``).
        embeddings_name: name of the embedding model (passed to ``load_db``).
        vectorstore: vector store backend. Only ``'faiss'`` is supported.
        retriever_pars: keyword arguments forwarded to ``as_retriever``;
            ``None`` means no extra arguments.

    Returns:
        The retriever produced by ``as_retriever`` on the merged store.

    Raises:
        ValueError: if ``vectorstore`` is not supported.
    """
    # Fail early with a clear message instead of reaching an unbound local.
    if vectorstore != 'faiss':
        raise ValueError(f"Unsupported vectorstore: {vectorstore!r}")

    db = load_db(dbs_name, embeddings_name, vectorstore)
    # None (rather than a shared mutable {}) is the default; normalise it here
    return db.as_retriever(**(retriever_pars or {}))

class RAGModel_Claude(weave.Model):
    """Simple RAG pipeline: one retriever feeding an Anthropic chat model."""

    structure : str = "simple-rag" # just a retriever and a llm

    # databases to merge and the embedding model used to query them
    dbs_name : list
    embeddings_name : str

    # vector store backend and keyword arguments for its retriever
    vectorstore : str
    retriever_pars : dict

    # prompt_template is called as prompt_template(context=..., question=...)
    # and its result is passed to the chat model; chat_pars are the keyword
    # arguments for ChatAnthropic
    prompt_template : Callable
    chat_pars : dict[str, str|int]

    @weave.op()
    def predict(self, question: str):
        """Retrieve context for ``question`` and generate an answer.

        Returns a dict with the stringified retrieved context under
        ``"context"`` and the model reply content under ``"answer"``.
        """
        retriever = build_retriever(self.dbs_name, self.embeddings_name, self.vectorstore, self.retriever_pars)
        llm = ChatAnthropic(**self.chat_pars)

        if self.vectorstore == 'faiss':
            context = retriever.invoke(question)

        messages = self.prompt_template(context=context, question=question)
        reply = llm.invoke(messages)

        return {"context": str(context), "answer": reply.content}

    def ask(self, question: str):
        """Return only the answer for ``question``."""
        return self.predict(question)["answer"]
    

In [None]:
# Keyword arguments for ChatAnthropic; chat_models_claude[0] is
# "claude-3-sonnet-20240229".
chat_pars = {
    "model": chat_models_claude[0],
    "temperature": 0,
    "max_tokens": 1024,
    "timeout": 60,
    "max_retries": 2
}

def prompt_template(context, question):
    """Build the chat messages as a list of ``(role, content)`` pairs.

    The system message carries the instructions followed by ``context``;
    the human message is ``question`` itself.
    """
    system_message = f"You are a helpful assistant that helps access information about the Optimism Collective. Please provide polite and informative answers. Be assertive. The human is not necessarily a specialist, so please avoid jargon and explain any technical terms. \n\n Following there are some fragments retrieved from the Optimism Governance Forum and Optimism Documentation. This is expected to contain relevant information to answer the human question: \n\n {context}"
    return [("system", system_message), ("human", question)]

# Claude-based simple RAG over dbs[1:3] (fragments_docs and posts_forum),
# using embedding_models[2] (text-embedding-ada-002) and 6 retrieved documents.
rag = RAGModel_Claude(
    dbs_name = dbs[1:3],
    embeddings_name = embedding_models[2],
    prompt_template = prompt_template,
    retriever_pars = {
        "search_kwargs" : {'k': 6}
    },
    vectorstore = 'faiss',
    chat_pars = chat_pars
)

# unlike the earlier configurations, this evaluation is actually run
run_rag_evaluation(rag)