## 前提条件
- 本地安装jupyter notebook <br>
执行`pip3 install notebook` <br>

- 运行notebook（不能在SageMaker中运行，因为无法打开后续的评估页面）<br>
cd 到当前repo所在的目录 <br>
执行 `jupyter notebook`

## 安装trulens-eval

In [None]:
!pip install trulens-eval==0.18.3 llama_index==0.8.69 -q

## 导入包

In [None]:
import uuid
import time
import json
from trulens_eval import Tru
from trulens_eval.tru_custom_app import instrument
# Optional: uncomment to call reset_database() before a fresh run
# (presumably clears previously recorded results -- confirm before using).
# Tru().reset_database()
# Shared Tru handle; later cells use it for the leaderboard and the dashboard.
tru = Tru()

## 定义RAG App
- 需要实现retrieve和query两个接口，trulens会记录这两个接口的输入和结果
- Api gateway有时候会超时，所以直接使用lambda
- 执行该脚本时需要对main func lambda 有invoke权限

### 更换成对应的main func arn

In [None]:
# Region and account that host the main-function Lambda; replace the
# placeholder with the real AWS account id before running.
region = 'us-west-2'
account_id = '<account_id_placeholder>'

# Full ARN of the Ask_Assistant Lambda that later cells invoke.
main_func = f'arn:aws:lambda:{region}:{account_id}:function:Ask_Assistant'
main_func

In [None]:
import boto3

# Lambda client used to invoke the main function directly.
# The keyword must be `region_name`; the previous spelling `regin_name` made
# boto3.client() raise TypeError, so the client was never created.
lambda_client = boto3.client('lambda', region_name=region)

# Model name sent to the RAG backend in every request payload.
# MODEL_NAME = "claude-instant"
MODEL_NAME = "claude-v2"

class RAG_from_scratch:
    """Client for the remote RAG Lambda, instrumented for TruLens.

    ``retrieve`` and ``query`` are decorated with ``@instrument`` so that
    their inputs and results are recorded. Both delegate to
    ``call_remote_service``, which invokes the Lambda named by the
    module-level ``main_func`` through ``lambda_client``.
    """

    @instrument
    def retrieve(self, query: str) -> list:
        """Return the ``doc`` field of every knowledge item recalled for ``query``."""
        results = self.call_remote_service(query, retrieve_only=True)
        return [result['doc'] for result in results]

    def call_remote_service(self, query: str, retrieve_only: bool = False, max_token: int = 1024):
        """Invoke the main-function Lambda synchronously and unpack its reply.

        Args:
            query: Prompt text sent as the ``prompt`` field.
            retrieve_only: Forwarded in the payload; when true the recalled
                knowledge list is returned instead of the generated answer.
            max_token: Forwarded as ``max_tokens``.

        Returns:
            ``body[0]['extra_info']['recall_knowledge']`` when
            ``retrieve_only`` is true, otherwise
            ``body[0]['choices'][0]['text']``.

        Raises:
            RuntimeError: If the Lambda reports a function error.
        """
        # Build the request payload; a fresh msgid is generated per call.
        payload = {
            "msgid": str(uuid.uuid4()),
            "chat_name": "OnlyForDEBUG",
            "prompt": query,
            "use_qa": True,
            "multi_rounds": False,
            "hide_ref": True,
            "use_stream": False,
            "max_tokens": max_token,
            "retrieve_only": retrieve_only,
            "temperature": 0.01,
            "use_trace": False,
            "system_role": "",
            "system_role_prompt": "",
            "model_name": MODEL_NAME,
            "template_id": "1702434088941-4073e3",
            "username": "test",
        }
        start = time.time()
        response = lambda_client.invoke(
            FunctionName=main_func,
            InvocationType='RequestResponse',
            Payload=json.dumps(payload),
        )
        print(f'time cost:{time.time()-start}')
        payload_json = json.loads(response.get('Payload').read())

        # A failed invocation still returns normally from invoke(); the failure
        # is flagged via 'FunctionError' and the payload holds the error
        # details. Surface that instead of failing later with KeyError: 'body'.
        if response.get('FunctionError'):
            raise RuntimeError(
                f'Lambda {main_func} returned an error: {payload_json}'
            )

        body = payload_json['body']
        first = body[0]
        if retrieve_only:
            return first['extra_info']['recall_knowledge']
        return first['choices'][0]['text']

    @instrument
    def query(self, query: str) -> str:
        """Answer ``query`` via the remote service and return the answer text."""
        # The retrieved context is not used locally; retrieve() is still called
        # so that the instrumented call (its arguments and return value) is
        # part of the record the feedback selectors read from.
        self.retrieve(query)
        return self.call_remote_service(query)

rag = RAG_from_scratch()

### 测试下是否能跑通

In [None]:
rag.query('怎么提交FOOB？')

## 使用Claude 作为评估器的基础模型
- 当前trulens-eval-0.18.3版本不支持claude，因此需要对provider进行重载

In [None]:
from trulens_eval.feedback.provider.bedrock import Bedrock as fBedrock
from typing import Dict, Optional, Sequence

class NewBedrock(fBedrock):
    """Bedrock feedback provider that sends prompts in Claude's format.

    Overrides ``_create_chat_completion`` so that requests use the
    Human/Assistant prompt layout and read the ``completion`` field of the
    response. The default model id is ``anthropic.claude-v2:1``.
    """

    model_id: str = "anthropic.claude-v2:1"

    def __init__(
        self,
        *args,
        model_id: str = "anthropic.claude-v2:1",
        **kwargs
    ):
        """Forward all arguments to the base provider, defaulting ``model_id``."""
        super().__init__(
            *args,
            model_id=model_id,
            **kwargs
        )

    def convert_messages(self, messages: list) -> str:
        """Flatten chat messages into one newline-separated prompt string.

        Only messages whose ``role`` is ``system`` or ``user`` contribute;
        any other role is skipped. Each kept message is followed by a
        newline character.
        """
        # Use a real newline here: the previous '\\n' literal appended a
        # backslash followed by 'n', which ended up verbatim in the prompt.
        return ''.join(
            msg['content'] + '\n'
            for msg in messages
            if msg['role'] in ('system', 'user')
        )

    # LLMProvider requirement
    def _create_chat_completion(
        self,
        prompt: Optional[str] = None,
        messages: Optional[Sequence[Dict]] = None,
        **kwargs
    ) -> str:
        """Send the prompt to the Bedrock model and return its completion text.

        Args:
            prompt: Prompt text; takes precedence when non-empty.
            messages: Chat messages, flattened with ``convert_messages`` when
                ``prompt`` is empty.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The ``completion`` field of the model response.
        """
        assert self.endpoint is not None

        if not prompt and messages:
            prompt = self.convert_messages(messages)

        print('*********** prompt to claude:***********\n', prompt)
        import json

        # Request body in the Human/Assistant text-completion layout.
        body = json.dumps({
            "prompt": f"\n\nHuman: {prompt}\n\nAssistant:",
            "max_tokens_to_sample": 2000,
            "temperature": 0.1,
            "top_p": 0.9,
        })

        modelId = self.model_id

        response = self.endpoint.client.invoke_model(body=body, modelId=modelId)

        response_body = json.loads(response.get('body').read()).get('completion')

        print('*********** claude response:***********\n', response_body)

        return response_body
    

In [None]:
llm_provider = NewBedrock(model_id='anthropic.claude-v2:1',region_name=region)

## 如果使用openai（可选）
- 如果使用上面的Claude作为评估模型，请跳过下面这个单元格，否则它会覆盖已创建的llm_provider
- 在下面的单元格中填入api key
- 使用llm_provider = fOpenAI()

In [None]:
import os
from trulens_eval.feedback.provider.openai import OpenAI as fOpenAI

# NOTE: replace the placeholder with a real API key before running this cell;
# the assignment overwrites any OPENAI_API_KEY already set in the environment.
os.environ["OPENAI_API_KEY"] = "OPENAI_API_KEY_PLACEHOLDER"
# Rebinds llm_provider: running this cell replaces any provider that an
# earlier cell assigned to the same name.
llm_provider = fOpenAI(model_engine='gpt-4-1106-preview')

## 导入golden set文件
- golden set文件需要自己准备，是一个excel文件，第一行为表头，且只能包含两列，依次是问题和答案

In [None]:
import pandas as pd

In [None]:
df = pd.read_excel('golden_set.xlsx')

In [None]:
# Convert every spreadsheet row into the {'query', 'response'} record used by
# the ground-truth feedback. Each row must unpack into exactly two values:
# the question first, then the expected answer.
golden_set = [
    {'query': question, 'response': answer}
    for question, answer in df.values
]

In [None]:
from trulens_eval import Feedback, Select
from trulens_eval.feedback import Groundedness
from trulens_eval.feedback import GroundTruthAgreement
from trulens_eval.feedback.provider.openai import OpenAI as fOpenAI
from trulens_eval.feedback.provider.bedrock import Bedrock as fBedrock
from trulens_eval.feedback.provider.endpoint.bedrock import BedrockEndpoint as BedrockEndpoint
import numpy as np

# The provider is whatever `llm_provider` currently refers to. Uncomment the
# line below to switch to the OpenAI provider instead.
# llm_provider = fOpenAI()

# Ground-truth feedback: built from the golden set and the provider's
# agreement_measure, attached with on_input_output().
f_groundtruth = (
    Feedback(GroundTruthAgreement(golden_set,provider=llm_provider).agreement_measure, name = "Ground Truth").on_input_output()
)

# Groundedness helper backed by the same provider.
grounded = Groundedness(groundedness_provider=llm_provider)
# Groundedness feedback: first argument is the collected return value of the
# recorded retrieve() call, second is the app output; results are combined
# with the helper's grounded_statements_aggregator.
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons, name = "Groundedness")
    .on(Select.RecordCalls.retrieve.rets.collect())
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

# Answer relevance: the `query` argument of the recorded retrieve() call
# versus the app output.
f_qa_relevance = (
    Feedback(llm_provider.relevance_with_cot_reasons, name = "Answer Relevance")
    .on(Select.RecordCalls.retrieve.args.query)
    .on_output()
)

# Context relevance: the `query` argument of the recorded retrieve() call
# versus the collected retrieve() return value, aggregated with np.mean.
f_context_relevance = (
    Feedback(llm_provider.qs_relevance_with_cot_reasons, name = "Context Relevance")
    .on(Select.RecordCalls.retrieve.args.query)
    .on(Select.RecordCalls.retrieve.rets.collect())
    .aggregate(np.mean)
)

In [None]:
from trulens_eval import TruCustomApp
from datetime import datetime
# Local time at which this cell runs, formatted for use inside the app id.
timestamp = datetime.now()
timestamp_str = timestamp.strftime("%Y-%m-%d %H:%M:%S")


# The app id combines the run timestamp and the backend model name, so
# re-running this cell produces a different id.
app_id = f'RAG_{timestamp_str}_{MODEL_NAME}'
# Wrap the rag instance together with the four feedback functions.
tru_rag = TruCustomApp(rag,
    app_id = app_id,
    feedbacks = [f_groundtruth,f_groundedness,f_qa_relevance,f_context_relevance]
    ) # available feedbacks: f_groundtruth, f_groundedness, f_qa_relevance, f_context_relevance
    

### 先用一个query简单测试一下

In [None]:
with tru_rag as recording:
    rag.query('aws cleanrooms是什么？')

### 使用golden set测试集测试

In [None]:
import time
# Run every golden-set question through the RAG app inside the tru_rag
# context manager.
with tru_rag as recording:
    for i,item in enumerate(golden_set):
        print(f"run query[{i}] [{item['query']}]")
        rag.query(item['query'])
        # Pause one second between queries (presumably to avoid throttling
        # the backend -- confirm).
        time.sleep(1)

## 查看得分结果

In [None]:
tru.get_leaderboard(app_ids=[])

## 启动仪表板

In [None]:
tru.run_dashboard()

## 关闭仪表板

In [None]:
tru.stop_dashboard()