In [None]:
#Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#SPDX-License-Identifier: MIT-0

In [None]:
# Restore the Bedrock prompt ids/ARNs previously persisted with %store
# (presumably by an earlier notebook of this workshop — run that one first if these are missing).
%store -r sentiment_categorisation_prompt_id
%store -r sentiment_categorisation_prompt_arn
%store -r summarisation_prompt_id
%store -r summarisation_prompt_arn
%store -r extraction_prompt_id
%store -r extraction_prompt_arn

# Langfuse

Langfuse provides observability/tracing, a UI for inspecting traces, prompt management, a playground for testing LLMs and prompts, monitoring/analytics (alpha), an evaluation pipeline and user feedback management.

More info here:
https://langfuse.com/docs

(Tests done with Langfuse 2.65.1)


## Deployment options

Deploy locally (recommended) or use the SaaS version

How to deploy Locally: https://langfuse.com/docs/deployment/local

Otherwise, sign up for an account here: https://langfuse.com/

For those who want to deploy it on AWS, Aaron Su developed a CDK for that: https://github.com/aaronsu11/langfuse-on-aws

## Create a new project and add a new API key

<img src="../static/langfuse/langfuse1.png" width="800px"/>

In [None]:
!pip install -q langfuse==2.39.3
!pip install -q boto3==1.34.149
!pip install -q langchain==0.2.11 

!pip install -q langchain-community==0.2.10
!pip install -q -U langchain-aws==0.1.12

In [None]:
import boto3

#adding our utils library to sys path
# (the path is relative, so the notebook must be run from its own directory)
import sys
sys.path.append("../src/utils/")
import llm_utils

# One boto3 session shared by the Bedrock clients created below.
session = boto3.Session()
# Client for the 'bedrock-runtime' service.
bedrock_runtime_client = session.client(service_name='bedrock-runtime')
# Client for the 'bedrock' service.
bedrock_client = session.client(service_name='bedrock')

We are using a locally deployed version for this workshop but feel free to use the SaaS version if easier.

## Retrieve prompts from Bedrock prompt template management for test

In [None]:
import boto3
# Client for the 'bedrock-agent' service, which exposes get_prompt.
bedrock_agent_client = boto3.client('bedrock-agent')

#retrieve the prompt for test
# sentiment_categorisation_prompt_id is restored by the %store -r cell at the top of the notebook.
sentiment_prompt_info = bedrock_agent_client.get_prompt(
    promptIdentifier=sentiment_categorisation_prompt_id,
)


In [None]:
# Both values are read from the text template configuration of the first variant.
sentiment_text_config = sentiment_prompt_info['variants'][0]['templateConfiguration']['text']
input_variables_sentiment = sentiment_text_config['inputVariables']
prompt_template_sentiment = sentiment_text_config['text']

In [None]:
# Show the declared input variables, then the template text, one per line.
print(input_variables_sentiment, prompt_template_sentiment, sep="\n")

# Tracing & Observability

In [None]:
#load transcripts from generated folder to use in our tests
# (records are read further down through their 'transcript' key)
transcripts = llm_utils.load_jsonlines_file("../generated/transcripts/transcripts.jsonl")

In [None]:
#replace with your own. we're using a locally deployed instance in this workshop
#replace with your own. we're using a locally deployed instance in this workshop
# API keys must not be committed in the notebook: they are read from the
# environment instead. Export LANGFUSE_SECRET_KEY / LANGFUSE_PUBLIC_KEY (and
# optionally LANGFUSE_HOST) before starting Jupyter.
# Any key that was previously committed here should be treated as exposed and rotated.
import os
import warnings

secret_key = os.environ.get("LANGFUSE_SECRET_KEY")
public_key = os.environ.get("LANGFUSE_PUBLIC_KEY")
# Default to the locally deployed instance used in this workshop.
host = os.environ.get("LANGFUSE_HOST", "http://localhost:3000")

if not (secret_key and public_key):
    # Warn here so the cause is obvious before the first Langfuse call is made.
    warnings.warn(
        "LANGFUSE_SECRET_KEY and/or LANGFUSE_PUBLIC_KEY are not set; "
        "set them in the environment before creating the Langfuse clients."
    )

In [None]:
from langfuse import Langfuse
from langfuse.callback import CallbackHandler

# Connection settings shared by the two Langfuse entry points below.
langfuse_connection = {
    "secret_key": secret_key,
    "public_key": public_key,
    "host": host,
}

# Client for the low level python APIs.
langfuse_client = Langfuse(**langfuse_connection)

# Callback handler passed to the langchain API.
langfuse_handler = CallbackHandler(**langfuse_connection)

### Using Langchain as a wrapper

In [None]:
from langchain_aws import ChatBedrockConverse

# Model and inference settings; these names are reused by the tracing cells
# further down when they record the model parameters.
model_id = "anthropic.claude-3-haiku-20240307-v1:0"
max_tokens, temperature, top_p = 400, 0, 1

# Chat model wrapper used by the chains in this notebook.
llm = ChatBedrockConverse(model_id=model_id, max_tokens=max_tokens, temperature=temperature, top_p=top_p)

Note that the code below does not work because the `parent_run_id` attribute was introduced in a recent version of LangChain and the installed version of Langfuse does not support it yet. It might work by the time you run this notebook.

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# LCEL example: prompt -> model -> plain-string output.
# Dedicated names are used so this cell does not clobber the `prompt` / `chain`
# variables defined by the LLMChain example.
joke_prompt = ChatPromptTemplate.from_template("tell me a joke about {topic}")

joke_chain = joke_prompt | llm | StrOutputParser()

# `callbacks` must be a *list* of handlers. Passing the bare handler makes
# LangChain treat it as a callback manager, which appears to be what triggers
# the "no attribute 'parent_run_id'" error.
response = joke_chain.invoke({"topic": "bears"}, config={"callbacks": [langfuse_handler]})
response

Alternative option using the LangChain 0.1.x-style APIs (`LLMChain`).

In [None]:
# Import from the submodules: the root-level `from langchain import ...`
# shortcuts are deprecated and emit warnings.
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Prompt built from the Bedrock template retrieved earlier.
prompt = PromptTemplate(input_variables=input_variables_sentiment, template=prompt_template_sentiment)

# Create the chain
chain = LLMChain(llm=llm, prompt=prompt)

# The first transcript is the running example for the rest of the notebook.
example_transcript = transcripts[0]['transcript']

# The Langfuse handler is passed as a callback so the run is traced.
output = chain.run(transcript=example_transcript, callbacks=[langfuse_handler])

print(output)


NOTES: When checking your trace in the UI, make sure you navigate the tree on the right hand side to display all nested information.

<img src = "../static/langfuse/langfuse_trace.png" width=800px />



## Using the low level python API

In [None]:
# Create generation in Langfuse
# Messages and parameters recorded on the generation.
sentiment_messages = [
    {"role": "system", "content": ""},
    {"role": "user", "content": prompt.format(transcript=example_transcript)},
]
sentiment_model_parameters = {"maxTokens": max_tokens, "temperature": temperature, "topP": top_p}

# Create generation in Langfuse
generation = langfuse_client.generation(
    name="sentiment-generation",
    model=model_id,
    model_parameters=sentiment_model_parameters,
    input=sentiment_messages,
    metadata={"interface": "jupyter notebook"},
)

# Run the model without the Langfuse callback; the result is attached manually below.
output = chain.run(transcript=example_transcript)

# Record the output on the generation and set its end_time.
generation.end(output=output)

# Make sure all pending requests are sent before the process exits.
langfuse_client.flush()

NOTES: notice the slightly different trace tree structure and where the information is stored.

<img src = "../static/langfuse/langfuse_trace2.png" width=800px />

# Prompt Management

In [None]:
#reminder of our test prompt info
# (its 'name' and 'arn' fields are reused when creating the Langfuse prompt below)
sentiment_prompt_info

### Create a prompt

In [None]:
# Create a chat prompt
# The prompt is registered under the same name as the Bedrock prompt, with the "staging" label.
# NOTE(review): the user message is built with .format(transcript=example_transcript), so the
# example transcript is stored inside the prompt itself rather than a template placeholder.
# The later chat_prompt.compile(transcript=...) call would then have nothing left to substitute.
# TODO confirm whether the raw template (with Langfuse-style placeholders) should be stored instead.
langfuse_prompt = langfuse_client.create_prompt(
    name=sentiment_prompt_info['name'],
    type="chat",
    prompt=[{"role": "system", "content": ""}, {"role": "user", "content": prompt_template_sentiment.format(transcript=example_transcript)}],
    labels=["staging"],
    config={
        "model": model_id,
        "temperature": temperature,
        "supported_languages": ["en"],
        "aws_arn_equivalent" : sentiment_prompt_info['arn']  # FYI, you can add any kind of metadata to the prompt
    },  
)

<img src = "../static/langfuse/langfuse_prompt.png" width=800px />

In [None]:
# Display the object returned by create_prompt.
langfuse_prompt

In [None]:
# Get the version of the chat prompt labelled "staging"
# (the label argument is not needed when fetching the production prompt)
chat_prompt = langfuse_client.get_prompt(sentiment_prompt_info['name'], 
                                         type="chat", # type arg infers the prompt type (default is 'text')
                                         label="staging", #not needed if you get a production prompt
                                         fallback=[{"role": "system", "content": "Default prompt"}]) #useful if somehow the service is not responding
# Show the prompt messages, the attached config, and whether the fallback was used.
print(chat_prompt.prompt)
print(chat_prompt.config)
print(chat_prompt.is_fallback)

In [None]:
# Insert variables into chat prompt template
# (the compiled result is displayed as the cell output and reused by the next generation cell)
compiled_chat_prompt = chat_prompt.compile(transcript=example_transcript)
compiled_chat_prompt

### Prompt integration with the generation trace api:

In [None]:
# Same generation as before, now linked to the Langfuse-managed prompt.
generation = langfuse_client.generation(
    name="sentiment-generation",
    prompt=chat_prompt,  # links this generation to the managed prompt version
    model=model_id,
    model_parameters={"maxTokens": max_tokens, "temperature": temperature, "topP": top_p},
    # compiled_chat_prompt comes from a chat-type prompt, so it is already a list
    # of role/content messages. It is recorded as-is instead of being nested
    # inside the "content" of another user message.
    input=compiled_chat_prompt,
    metadata={"interface": "jupyter notebook"}
)

# Run the model without the Langfuse callback; the result is attached manually below.
output = chain.run(transcript=example_transcript)

# Record the output on the generation and set its end_time.
generation.end(output=output)

# To ensure that all requests are sent before the process exits, call flush()
langfuse_client.flush()

<img src = "../static/langfuse/langfuse_prompt_int.png" width=800px />

# Evaluation and Scoring

## Manual scoring UI

More info here: https://langfuse.com/docs/scores/annotation

NOTES:
- You need to create your score categories manually; nothing is provided out of the box with the "free" version.
- There is no task or queue management for annotating generations.

<img src = "../static/langfuse/langfuse_eval.png" width=800px />


<img src = "../static/langfuse/langfuse_eval2.png" width=800px />



## User feedback at chatbot level via the LangfuseWeb SDK

https://langfuse.com/docs/scores/user-feedback



## Automated scoring/evaluation capabilities

This seems to be done primarily via integration of external solutions/libraries and the output is stored in langfuse for the record.

See below an example with langchain evaluation libraries

In [None]:
#we start by fetching the generations
def fetch_all_pages(name=None, user_id = None, limit=50):
    """Return every generation matching the filters, following pagination.

    Pages are requested from the notebook-level ``langfuse_client`` starting
    at page 1, ``limit`` items at a time, until a page comes back empty.

    Args:
        name: generation name to filter on (no filter when None).
        user_id: user id to filter on (no filter when None).
        limit: page size used for each request.

    Returns:
        A list with the items of all pages, in the order they were returned.
    """
    def _page_items(page_number):
        # One request per page; only the page's ``data`` is of interest.
        return langfuse_client.get_generations(
            name=name, limit=limit, user_id=user_id, page=page_number
        ).data

    collected = []
    page_number = 1
    batch = _page_items(page_number)
    while batch:
        collected.extend(batch)
        page_number += 1
        batch = _page_items(page_number)

    return collected


# Collect every generation recorded under the name used by the generation cells above.
generations = fetch_all_pages(name="sentiment-generation")

In [None]:
from langchain.evaluation import load_evaluator

def execute_eval_and_score(llm, eval_criteria, langfuse_client, generations_to_score=None):
    """Evaluate generations against criteria and store the scores in Langfuse.

    Every generation that has both an input and an output is evaluated once
    per criterion with a LangChain "criteria" evaluator. The score is printed
    and recorded in Langfuse next to the generation, with the evaluator's
    reasoning as the comment.

    Args:
        llm: model passed to ``load_evaluator`` to run the evaluations.
        eval_criteria: iterable of criterion names (e.g. "relevance"); each
            name is also used as the score name.
        langfuse_client: client used to record the scores and flush them.
        generations_to_score: generations to evaluate. Defaults to the
            notebook-level ``generations`` variable, which is what this
            function read before the parameter existed.
    """
    if generations_to_score is None:
        generations_to_score = generations

    # Build each evaluator once instead of once per (generation, criterion) pair.
    evaluators = [
        (criterion, load_evaluator("criteria", criteria=criterion, llm=llm))
        for criterion in eval_criteria
    ]

    try:
        for generation in generations_to_score:
            # Nothing to evaluate without both sides of the exchange.
            if not (generation.input and generation.output):
                continue

            for criterion, evaluator in evaluators:
                eval_result = evaluator.evaluate_strings(
                    prediction=generation.output,
                    input=generation.input,
                )

                # Single quotes inside the f-string keep this valid before Python 3.12.
                print(f"criterion: {criterion} : {eval_result['score']}")

                # Store the score/reasoning alongside the generation.
                langfuse_client.score(
                    name=criterion,
                    trace_id=generation.trace_id,
                    observation_id=generation.id,
                    value=eval_result["score"],
                    comment=eval_result["reasoning"],
                )
    finally:
        # One flush for the whole batch, also when an evaluation fails midway.
        langfuse_client.flush()
 

In [None]:
# Criteria handed to the "criteria" evaluator; each name is also used as the score name in Langfuse.
eval_criteria = ["relevance", "coherence"]

execute_eval_and_score(llm, eval_criteria, langfuse_client)

<img src = "../static/langfuse/langfuse_eval3.png" width=800px />