# Notebook for exploring variations of LLM, embeddings, RAG architecture, and prompt

## Databases to retrieve from

In [1]:
# names of the document databases available for retrieval
dbs = ["full_docs", "fragments_docs", "posts_forum"]

## Embedding, chat, and vectorstore options to be tested

In [2]:
# OpenAI embedding models to compare
embedding_models = [
    "text-embedding-3-small",
    "text-embedding-3-large",
    "text-embedding-ada-002",
]

# OpenAI chat models to compare
# NOTE(review): "gpt-3.5-turbo-instruct" looks like a completions-style model rather
# than a chat model — confirm it works with the chat client before selecting it
chat_models = [
    "gpt-3.5-turbo-0125",
    "gpt-3.5-turbo-instruct",
    "gpt-4o",
    "gpt-4o-2024-05-13",
]

# vector store backends to compare (FAISS only)
vectorstores = ["faiss"]

## Imports

In [3]:
# general
import pandas as pd
import numpy as np
import os
from getpass import getpass
from datetime import datetime

# embedding and chat
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
# openai api key: reuse the key already exported in the environment and only
# prompt interactively when it is missing, so the notebook can be re-run
# unattended; the key is never hardcoded or echoed
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass("Enter the OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = openai_api_key

# vectorstore
if 'faiss' in vectorstores:
    from langchain_community.vectorstores import FAISS

# for tracking
import weave
from weave import Evaluation

# the metrics
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

  from .autonotebook import tqdm as notebook_tqdm


## Import test data 

In [8]:
test_path = "../002_create_test_dataset/questions_test_dataset.csv"

# Read the question set, discard the 'origin' column, expose the reference
# answers under the 'ground_truth' name, keep a reproducible sample of 50
# questions, and convert the result to a dict of column lists.
test_dataset = (
    pd.read_csv(test_path)
    .drop(columns=["origin"])
    .rename(columns={"answer": "ground_truth"})
    .sample(50, random_state=42)
    .to_dict(orient="list")
)
test_dataset

{'question': ['How can I get involved in the Optimism Collective?',
  'Can Optimism currently censor user transactions?',
  'Who are the members of the proposed Decentralized Finance Governance Committee for Optimism?',
  'What were some of the outcomes and feedback from Season 3 of the Grants Council?',
  "Why is there a 'no-sale' rule for growth experiments?",
  'What kind of activities can the Optimism Foundation perform due to its legal status?',
  'What types of contributions are eligible for RetroPGF?',
  'Where can I see applications for voting on Retro Funding?',
  'Do I need to claim my tokens for Airdrop #2?',
  'What are the main governance structures introduced in Season 5?',
  'How does a non-grant proposal proceed to a vote in the Optimism Governance Forum?',
  'What were some of the key activities and contributions of Women Biz to the Optimism ecosystem?',
  'Who can join alliances and participate in missions?',
  "What will happen after the Working Constitution's period

## General definitions for accessing data and creating model

In [5]:
def load_db(dbs, model_embeddings, vectorstore = 'faiss'):
    """Load the named vector databases and merge them into a single store.

    Parameters
    ----------
    dbs : iterable of str
        Database names; each one is read from
        ``dbs/{name}_db/faiss/{model_embeddings}``.
    model_embeddings : str
        Name of the embedding model; used both to build the embedding client
        and to select the on-disk index folder.
    vectorstore : str, default 'faiss'
        Vector store backend. Only ``'faiss'`` is supported.

    Returns
    -------
    The store loaded for the first name, with every other store merged into it.

    Raises
    ------
    ValueError
        If ``vectorstore`` is not supported or ``dbs`` is empty.
    """
    # Validate before doing any work: previously an unknown backend fell through
    # to ``return db`` with ``db`` unbound, and an empty list hit an IndexError.
    if vectorstore != 'faiss':
        raise ValueError(f"Unsupported vectorstore {vectorstore!r}; only 'faiss' is available.")
    db_names = list(dbs)
    if not db_names:
        raise ValueError("At least one database name is required.")

    embeddings = OpenAIEmbeddings(model=model_embeddings, openai_api_key=openai_api_key)

    db_paths = [f"dbs/{name}_db/faiss/{model_embeddings}" for name in db_names]
    # NOTE(review): allow_dangerous_deserialization=True opts in to loading
    # serialized index files; only point this at index folders you created or trust.
    stores = [
        FAISS.load_local(db_path, embeddings, allow_dangerous_deserialization=True)
        for db_path in db_paths
    ]

    # Fold every additional store into the first one (the first store is mutated).
    merged, *others = stores
    for other in others:
        merged.merge_from(other)

    return merged

In [6]:
class RAGModel():
    """Retrieval-augmented question answering on top of the loaded vector database.

    The instance keeps the merged database returned by ``load_db`` and a chain
    built as ``prompt | llm``; questions are answered by retrieving documents
    for the question and invoking the chain with them as context.
    """

    @weave.op()
    def __init__(self, dbs, model_embeddings, chat_pars, prompt_template, vectorstore = 'faiss'):
        """Load the databases and build the prompt/LLM chain.

        Parameters
        ----------
        dbs : list of str
            Names of the databases to load and merge.
        model_embeddings : str
            Embedding model name passed to ``load_db``.
        chat_pars : dict
            Keyword arguments forwarded to ``ChatOpenAI``.
        prompt_template : str
            Template text given to ``ChatPromptTemplate.from_template``; it is
            invoked with the keys ``context`` and ``question``.
        vectorstore : str, default 'faiss'
            Vector store backend passed to ``load_db``.
        """
        self.dbs_name = dbs
        self.embeddings_name = model_embeddings
        self.vectorstore_name = vectorstore

        self.db = load_db(dbs, model_embeddings, vectorstore)

        prompt = ChatPromptTemplate.from_template(prompt_template)
        llm = ChatOpenAI(**chat_pars, openai_api_key=openai_api_key)
        self.chain = prompt | llm

    @weave.op()
    def find_similar_docs(self, query, **retriever_kwargs):
        """Return the documents retrieved for ``query``.

        ``retriever_kwargs`` are forwarded to ``self.db.as_retriever``.

        Raises
        ------
        ValueError
            If the configured vector store is not ``'faiss'`` (previously this
            case silently returned ``None``).
        """
        if self.vectorstore_name != 'faiss':
            raise ValueError(f"Unsupported vectorstore {self.vectorstore_name!r}; only 'faiss' is available.")
        retriever = self.db.as_retriever(**retriever_kwargs)
        return retriever.invoke(query)

    @weave.op()
    def get_answer(self, question: str, retriever_kwargs=None):
        """Retrieve context for ``question`` and run the chain on it.

        Parameters
        ----------
        question : str
            The user question.
        retriever_kwargs : dict, optional
            Keyword arguments for the retriever; ``None`` means no extra options.

        Returns
        -------
        tuple
            ``(context, response)``: the retrieved documents and the chain output.
        """
        # None instead of a shared mutable {} default
        context = self.find_similar_docs(question, **(retriever_kwargs or {}))

        # NOTE(review): the retrieved document objects are handed to the prompt
        # as-is; presumably they are stringified (metadata included) when the
        # template is rendered — consider joining their page_content instead.
        response = self.chain.invoke(
            {
                "context": context,
                "question": question,
            }
        )

        return context, response

    def ask(self, question: str, retriever_kwargs=None):
        """Answer ``question`` and return only the text content of the response."""
        _, response = self.get_answer(question, retriever_kwargs)
        return response.content

    def evaluate_on(self, test_dataset, retriever_kwargs=None, **evaluate_kwargs):
        """Answer every question of ``test_dataset`` and score the results.

        Parameters
        ----------
        test_dataset : dict
            Column-oriented data with at least a ``'question'`` entry; it is
            copied, not modified.
        retriever_kwargs : dict, optional
            Keyword arguments for the retriever, applied to every question.
        **evaluate_kwargs
            Extra keyword arguments forwarded unchanged to ``evaluate``
            (for instance ``raise_exceptions=False``). With none given the call
            is identical to ``evaluate(dataset)``.

        Returns
        -------
        Whatever ``evaluate`` returns for the dataset extended with the
        ``'contexts'`` and ``'answer'`` columns.
        """
        answers = []
        contexts = []
        for q in test_dataset['question']:
            context, response = self.get_answer(q, retriever_kwargs)
            answers.append(response.content)
            # keep only the text of each retrieved document
            contexts.append([c.page_content for c in context])

        test_data = test_dataset.copy()
        test_data['contexts'] = contexts
        test_data['answer'] = answers
        return evaluate(
            Dataset.from_dict(test_data),
            **evaluate_kwargs
        )
    
    

## Tests

In [9]:
#weave.init('first-test')
# Prompt text; the doubled braces leave {context} and {question} as placeholders
# to be filled in when the chain is invoked.
prompt_template = f"""Answer politely the question at the end, using only the following context. The user is not necessarily a specialist, so please avoid jargon and explain any technical terms.

<context>
{{context}} 
</context>

Question: {{question}}
"""

# Chat settings: first chat model in the list, temperature 0, up to 2 retries.
chat_pars = dict(
    model=chat_models[0],
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Build the model on the first database and the first embedding model only.
rag = RAGModel(
    dbs=[dbs[0]],
    model_embeddings=embedding_models[0],
    chat_pars=chat_pars,
    prompt_template=prompt_template,
)

# quick manual check (disabled)
#rag.ask("what is optimism?")

# score the model on the sampled test questions
rag.evaluate_on(test_dataset)

Evaluating:  26%|██▌       | 51/200 [03:07<09:09,  3.69s/it]
Exception in thread Thread-6:
Traceback (most recent call last):
  File "/Users/victor/opt/anaconda3/envs/bleu-chatbot/lib/python3.12/threading.py", line 1073, in _bootstrap_inner
    self.run()
  File "/Users/victor/opt/anaconda3/envs/bleu-chatbot/lib/python3.12/site-packages/ragas/executor.py", line 95, in run
    results = self.loop.run_until_complete(self._aresults())
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/victor/opt/anaconda3/envs/bleu-chatbot/lib/python3.12/asyncio/base_events.py", line 687, in run_until_complete
    return future.result()
           ^^^^^^^^^^^^^^^
  File "/Users/victor/opt/anaconda3/envs/bleu-chatbot/lib/python3.12/site-packages/ragas/executor.py", line 83, in _aresults
    raise e
  File "/Users/victor/opt/anaconda3/envs/bleu-chatbot/lib/python3.12/site-packages/ragas/executor.py", line 78, in _aresults
    r = await future
        ^^^^^^^^^^^^
  File "/Users/vict

ExceptionInRunner: The runner thread which was running the jobs raised an exeception. Read the traceback above to debug it. You can also pass `raise_exceptions=False` incase you want to show only a warning message instead.

In [None]:
# Disabled draft: the whole cell is one triple-quoted string, so none of the code
# below runs; the cell only evaluates to (and echoes) the string itself.
# The draft builds a one-example dataset from the first test question, defines a
# weave scorer that runs `evaluate` on a single-row Dataset, and runs a weave
# `Evaluation` over a wrapper around `rag.get_answer`.
# NOTE(review): inside the draft, `score` reads `ground_truth` from the enclosing
# scope rather than from the example being scored — confirm before re-enabling.
"""question = test_dataset['question'][0]
ground_truth = test_dataset['ground_truth'][0]

examples = [
    {
        "question": question,
        "ground_truth": ground_truth
    }

]

# Define any custom scoring function
@weave.op()
def score(question: str, model_output):
    contexts, answer = model_output["contexts"], model_output["answer"]
    print(model_output)
    test_data = Dataset.from_dict({
        "question": [question],
        "contexts": [contexts],
        "answer": [answer],
        "ground_truth": [ground_truth]
    })

    return evaluate(
        test_data,
        raise_exceptions=False
    )

@weave.op()
def function_to_evaluate(question: str):
    context, answer = rag.get_answer(question)
    context = [c.page_content for c in context]
    answer = answer.content

    return {"answer": answer, "contexts": context}

# Score your examples using scoring functions
evaluation = Evaluation(
    dataset=examples, scorers=[score]
)

# Start tracking the evaluation
# Run the evaluation
await evaluation.evaluate(function_to_evaluate)"""

'question = test_dataset[\'question\'][0]\nground_truth = test_dataset[\'ground_truth\'][0]\n\nexamples = [\n    {\n        "question": question,\n        "ground_truth": ground_truth\n    }\n\n]\n\n# Define any custom scoring function\n@weave.op()\ndef score(question: str, model_output):\n    contexts, answer = model_output["contexts"], model_output["answer"]\n    print(model_output)\n    test_data = Dataset.from_dict({\n        "question": [question],\n        "contexts": [contexts],\n        "answer": [answer],\n        "ground_truth": [ground_truth]\n    })\n\n    return evaluate(\n        test_data,\n        raise_exceptions=False\n    )\n\n@weave.op()\ndef function_to_evaluate(question: str):\n    context, answer = rag.get_answer(question)\n    context = [c.page_content for c in context]\n    answer = answer.content\n\n    return {"answer": answer, "contexts": context}\n\n# Score your examples using scoring functions\nevaluation = Evaluation(\n    dataset=examples, scorers=[score