In [None]:
%pip install -Uq evaluate
%pip install -Uq rouge-score
%pip install -Uq bleu
%pip install -Uq ragas
%pip install -Uq langchain-aws

In [None]:
import json
import os
from typing import Any, Dict, List, Optional

# External Dependencies:
import pandas as pd  # For working with tabular data
import boto3, uuid
from botocore.response import StreamingBody

from datasets import Dataset, load_dataset
from random import sample
from asyncio import run

import evaluate as hf_evaluate

# Langchain
from langchain_aws.chat_models.sagemaker_endpoint import ChatSagemakerEndpoint, ChatModelContentHandler
from langchain_core.messages import HumanMessage, AIMessageChunk, SystemMessage
from langchain_aws.embeddings import BedrockEmbeddings
from langchain_community.embeddings import SagemakerEndpointEmbeddings


# Sagemaker
import sagemaker
from sagemaker.huggingface import HuggingFaceModel
from sagemaker.huggingface import get_huggingface_llm_image_uri
from sagemaker.utils import name_from_base
from sagemaker import get_execution_role

# RAGAS
import ragas
from ragas.run_config import RunConfig
from ragas.metrics.base import MetricWithLLM, MetricWithEmbeddings
from ragas import evaluate as ragas_evaluate
from ragas.metrics import Faithfulness, ResponseRelevancy
from ragas.metrics import answer_relevancy, faithfulness, context_precision
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.dataset_schema import SingleTurnSample

In [None]:

# Load the two reference-overlap metrics used below (BLEU and ROUGE) through
# the Hugging Face `evaluate` package.
bleu, rouge = (hf_evaluate.load(metric_id) for metric_id in ("bleu", "rouge"))

In [None]:
# Evaluation records come from the local `eval.json` file, read with the
# `json` dataset builder and taken from its "train" split.
evaluation_dataset = load_dataset(
    "json",
    split="train",
    data_files="eval.json",
)

In [None]:
# Gather the reference answers and each model's generations in dataset order.
ground_truth, base_predictions, sft_predictions = [], [], []
for eval_item in evaluation_dataset:
    ground_truth += [eval_item["ground_truth"]]
    base_predictions += [eval_item['base']]
    sft_predictions += [eval_item['tuned']]

# BLEU and ROUGE for the base model's outputs.
base_bleu_results = bleu.compute(references=ground_truth, predictions=base_predictions)
base_rouge_results = rouge.compute(references=ground_truth, predictions=base_predictions)

# BLEU and ROUGE for the fine-tuned model's outputs.
sft_bleu_results = bleu.compute(references=ground_truth, predictions=sft_predictions)
sft_rouge_results = rouge.compute(references=ground_truth, predictions=sft_predictions)

# One merged dict per model; on a key collision the ROUGE entry wins.
base_scores = {**base_bleu_results, **base_rouge_results}
sft_scores = {**sft_bleu_results, **sft_rouge_results}

print("=======BASE MODEL=======", base_scores, sep="\n")
print("=======TUNED MODEL=======", sft_scores, sep="\n")

In [None]:
# Build a comparison table of the scalar BLEU/ROUGE scores: base vs. tuned,
# with the absolute and relative change per metric.

# Entries of the merged score dicts that are skipped in the comparison
# ("precisions" is a list, the others are BLEU length bookkeeping values).
skipped_keys = {"length_ratio", "precisions", "brevity_penalty", "translation_length", "reference_length"}

data = {'dimension': [], 'base': [], 'tuned': [], 'delta': [], 'delta_percent': []}

for key in base_scores.keys():
    if key in skipped_keys:
        continue

    base_value = base_scores[key]
    tuned_value = sft_scores[key]
    delta = tuned_value - base_value
    # A base score of exactly 0 (possible for BLEU) has no defined relative
    # change; report NaN instead of raising ZeroDivisionError.
    delta_percent = (delta / base_value) * 100 if base_value != 0 else float("nan")

    data['dimension'].append(key)
    data['base'].append(base_value)
    data['tuned'].append(tuned_value)
    data['delta'].append(delta)
    data['delta_percent'].append(delta_percent)

df = pd.DataFrame(data)

df

## LLM-as-a-judge Metrics

In [None]:
# Deploy the judge LLM (Qwen2.5-1.5B-Instruct) to a SageMaker real-time endpoint.

# Container image for the "huggingface" LLM serving backend, version 3.0.1.
llm_image = get_huggingface_llm_image_uri(
  "huggingface",
  version="3.0.1"
)


role = get_execution_role()
print(role)

# Environment passed to the serving container.
hub = {
    'HF_TASK': 'text-generation',
    'HF_MODEL_ID': 'Qwen/Qwen2.5-1.5B-Instruct'
}

model_for_deployment = HuggingFaceModel(
    role=role,
    env=hub,
    image_uri=llm_image,
)

# Deploy under the fixed name that the judge-client cell further down connects
# to. A timestamped `name_from_base(...)` name would never match that
# hard-coded endpoint name, so the client would invoke a non-existent endpoint.
# NOTE: re-running this cell while the endpoint still exists will fail; delete
# the endpoint first or skip this cell.
endpoint_name = "qwen25-judge-model"

instance_type = "ml.g5.2xlarge"
number_of_gpu = 1  # not consumed anywhere in this cell
health_check_timeout = 300

model_for_deployment.deploy(
    endpoint_name=endpoint_name,
    initial_instance_count=1,
    instance_type=instance_type,
    container_startup_health_check_timeout=health_check_timeout,
    routing_config = {
        "RoutingStrategy":  sagemaker.enums.RoutingStrategy.LEAST_OUTSTANDING_REQUESTS
    }
)

In [None]:
# Deploy the embedding model (gte-large-en-v1.5) to a SageMaker real-time endpoint.
# Names specific to this deployment are kept distinct from the judge-LLM
# deployment so that neither cell overwrites the other's image, environment
# or endpoint name.

# Container image for the "huggingface-tei" (text embeddings) backend, version 1.4.
tei_image = get_huggingface_llm_image_uri(
  "huggingface-tei",
  version="1.4"
)


role = get_execution_role()
print(role)

# Environment passed to the serving container.
embedding_hub = {
    'HF_MODEL_ID': 'Alibaba-NLP/gte-large-en-v1.5'
}

embedding_model_for_deployment = HuggingFaceModel(
    role=role,
    env=embedding_hub,
    image_uri=tei_image,
)

# Deploy under the fixed name that the embeddings-client cell further down
# connects to; a timestamped `name_from_base(...)` name would never match it.
# NOTE: re-running this cell while the endpoint still exists will fail; delete
# the endpoint first or skip this cell.
embed_endpoint_name = "gte-large-en-v1-5-embedding"

instance_type = "ml.g5.2xlarge"
number_of_gpu = 1  # not consumed anywhere in this cell
health_check_timeout = 300

embedding_model_for_deployment.deploy(
    endpoint_name=embed_endpoint_name,
    initial_instance_count=1,
    instance_type=instance_type,
    container_startup_health_check_timeout=health_check_timeout,
    routing_config = {
        "RoutingStrategy":  sagemaker.enums.RoutingStrategy.LEAST_OUTSTANDING_REQUESTS
    }
)

### Deploy Qwen 

In [None]:
# import metrics
metrics=[
        ragas.metrics.answer_relevancy,
        ragas.metrics.faithfulness,
        ragas.metrics.context_precision,
        ragas.metrics.context_recall,
        ragas.metrics.answer_similarity,
        ragas.metrics.answer_correctness,
    ]

In [None]:
# util function to init Ragas Metrics
def init_ragas_metrics(metrics, llm, embedding, run_config=None):
    """Attach the judge LLM / embedding model to each metric and initialise it.

    Args:
        metrics: Iterable of RAGAS metric objects.
        llm: Object assigned to ``metric.llm`` for every metric that is a
            ``MetricWithLLM`` instance.
        embedding: Object assigned to ``metric.embeddings`` for every metric
            that is a ``MetricWithEmbeddings`` instance.
        run_config: Optional ``RunConfig`` passed to ``metric.init``. When
            omitted, a fresh default ``RunConfig()`` is created per metric,
            which is the behaviour of the original three-argument call.

    Returns:
        None. The metrics are modified in place, and one line is printed for
        each LLM / embedding assignment.
    """
    for metric in metrics:
        if isinstance(metric, MetricWithLLM):
            print(metric.name + " llm")
            metric.llm = llm
        if isinstance(metric, MetricWithEmbeddings):
            print(metric.name + " embedding")
            metric.embeddings = embedding
        # Only build a default config when the caller did not supply one.
        metric.init(run_config if run_config is not None else RunConfig())

In [None]:
from botocore.config import Config

# SageMaker runtime client with a connection pool of 20, plus the name of the
# endpoint that serves the judge model.
runtime_client_config = Config(max_pool_connections=20)
sm = boto3.Session().client("sagemaker-runtime", config=runtime_client_config)
endpoint_name = "qwen25-judge-model"

class ChatContentHandler(ChatModelContentHandler):
    """Content handler for a chat endpoint that replies with server-sent events.

    Requests are sent as a JSON body holding the chat messages with
    ``"stream": True``. Responses are read as ``data:``-prefixed lines whose
    JSON payloads carry the generated text in ``choices[0].delta.content``.
    """

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt, model_kwargs: Dict) -> bytes:
        """Serialise the messages and model parameters into the request body.

        Args:
            prompt: Chat messages placed under the ``"messages"`` key.
            model_kwargs: Extra parameters merged into the body; a ``"stream"``
                entry here overrides the default ``True``.

        Returns:
            The UTF-8 encoded JSON body.
        """
        body = {
            "messages": prompt,
            "stream": True,
            **model_kwargs  # Ensure all model parameters are passed
        }
        return json.dumps(body).encode("utf-8")

    def transform_output(self, output: StreamingBody) -> AIMessageChunk:
        """Concatenate the streamed content deltas into a single message.

        Args:
            output: Response body exposing ``iter_lines()``.

        Returns:
            An ``AIMessageChunk`` holding the joined text of all content
            deltas. If processing raises, the chunk instead holds
            ``"Error processing response: <error>"`` (kept as a best-effort
            fallback so that the caller always receives a message).
        """
        sse_prefix = "data:"
        stop_token = "[DONE]"
        try:
            all_content = []

            # Process streaming response line by line
            for raw_line in output.iter_lines():
                if not raw_line:
                    continue
                line = raw_line.decode("utf-8").strip()

                # Skip lines that are not SSE data events
                if not line.startswith(sse_prefix):
                    continue

                # Strip the prefix whether or not a space follows "data:".
                payload = line[len(sse_prefix):].strip()

                # The end-of-stream sentinel is not JSON, so it has to be
                # detected before parsing.
                if payload == stop_token:
                    break

                try:
                    json_data = json.loads(payload)
                except json.JSONDecodeError:
                    # Ignore malformed chunks and keep reading.
                    continue
                if not isinstance(json_data, dict):
                    continue

                choices = json_data.get("choices") or [{}]
                delta = choices[0].get("delta") or {}
                content = delta.get("content")

                # Also honour a stop token delivered as delta content.
                if content == stop_token:
                    break

                # Chunks without text (missing or null content) are skipped
                # rather than failing the whole response.
                if content:
                    all_content.append(content)

            # Join all chunks into a single string
            full_response = "".join(all_content)
            return AIMessageChunk(content=full_response)
        except Exception as e:
            return AIMessageChunk(content=f"Error processing response: {str(e)}")


# Judge LLM client: a LangChain chat model backed by the SageMaker endpoint
# named in `endpoint_name`, using the streaming content handler defined above.
chat_content_handler = ChatContentHandler()

# Sampling parameters merged into every request body by the content handler.
# NOTE(review): the handler sends a "messages"-style payload; confirm that the
# serving container honours "max_new_tokens" / "do_sample" for that request
# shape rather than silently ignoring them.
judge_model_kwargs = {
    "temperature": 0.7,  # Adjust temperature for balanced randomness
    "max_new_tokens": 1200,  # Ensure sufficient token generation
    "top_p": 0.95,  # Use nucleus sampling for diversity
    "do_sample": True,  # Enable sampling for generative tasks
}

llm = ChatSagemakerEndpoint(
    name="Testmodel",
    client=sm,
    endpoint_name=endpoint_name,
    content_handler=chat_content_handler,
    model_kwargs=judge_model_kwargs,
)

In [None]:
from langchain_community.embeddings.sagemaker_endpoint import EmbeddingsContentHandler

class EmbedContentHandler(EmbeddingsContentHandler):
    """Content handler translating between text lists and the embedding endpoint's JSON."""

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, inputs: list[str], model_kwargs: Dict) -> bytes:
        """
        Transforms the input into bytes that can be consumed by SageMaker endpoint.
        Args:
            inputs: List of input strings.
            model_kwargs: Additional keyword arguments to be passed to the endpoint.
        Returns:
            The transformed bytes input: a UTF-8 JSON object with the texts
            under the "inputs" key and `model_kwargs` merged in alongside.
        """
        input_str = json.dumps({"inputs": inputs, **model_kwargs})
        return input_str.encode("utf-8")

    def transform_output(self, output: Any) -> List[List[float]]:
        """
        Transforms the endpoint response into a list of embeddings.
        Args:
            output: The response body from the SageMaker endpoint, either a
                file-like object exposing `read()` or the raw bytes / str.
        Returns:
            The decoded JSON document, returned as-is (it is not unwrapped
            from any enclosing key).
        Note:
            Presumably the outer list has one entry per input string and each
            inner list is one embedding vector; verify against the endpoint.
        """
        # Accept both a stream and an already-read payload.
        raw = output.read() if hasattr(output, "read") else output
        if isinstance(raw, (bytes, bytearray)):
            raw = raw.decode("utf-8")
        return json.loads(raw)


embeddings_content_handler = EmbedContentHandler()

In [None]:
# Embeddings client backed by the SageMaker embedding endpoint.
embed_endpoint_name = "gte-large-en-v1-5-embedding"

# No `model_kwargs` are passed: the content handler merges them verbatim into
# the request body, and text-generation sampling settings (temperature, top_p,
# max_new_tokens, do_sample) have no meaning for an embeddings request.
embed = SagemakerEndpointEmbeddings(
    endpoint_name=embed_endpoint_name,
    client=sm,
    content_handler=embeddings_content_handler,
)

In [None]:
# Wrap the LangChain judge model and embeddings for RAGAS, then attach them
# to every metric in `metrics`.
ragas_judge_llm = LangchainLLMWrapper(llm)
ragas_judge_embeddings = LangchainEmbeddingsWrapper(embed)
init_ragas_metrics(metrics, llm=ragas_judge_llm, embedding=ragas_judge_embeddings)

In [None]:
# Read the JSON Lines file: one JSON record per non-blank line.
data = []
with open('full_eval.json', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        # Blank lines (e.g. a trailing newline at end of file) are not
        # records and would make json.loads raise.
        if not line:
            continue
        data.append(json.loads(line))



# Metrics scored in the evaluation runs below.
metrics = [
    ragas.metrics.answer_relevancy,
    ragas.metrics.faithfulness,
    ragas.metrics.answer_correctness,
]

# Upper bound on the number of records scored per model.
sample_size = 2000

In [None]:
# RAGAS evaluation of the BASE model's answers.

# Column-oriented batch that is turned into the evaluation dataset.
evaluation_batch = {
    "question": [],
    "contexts": [],
    "answer": [],
    "reference": []
}

# BASE: missing fields fall back to an empty string.
for item in data:
    evaluation_batch["question"].append(item.get("question", ""))
    evaluation_batch["contexts"].append([item.get("context", "")])  # Wrap in list
    evaluation_batch["answer"].append(item.get("base", ""))
    evaluation_batch["reference"].append(item.get("original_answer", ""))



# Create the full dataset first
ds_full = Dataset.from_dict(evaluation_batch)

# Shuffle with a fixed seed and keep at most `sample_size` rows. The count is
# capped at the dataset length because `select` raises on out-of-range indices.
ds = ds_full.shuffle(seed=42).select(range(min(sample_size, len(ds_full))))

# Run evaluation
base_evals_results = ragas_evaluate(
    ds,
    llm=LangchainLLMWrapper(llm),
    embeddings=LangchainEmbeddingsWrapper(embed),
    metrics=metrics,
)

print(base_evals_results)

In [None]:
# RAGAS evaluation of the TUNED model's answers.

# Column-oriented batch that is turned into the evaluation dataset.
evaluation_batch = {
    "question": [],
    "contexts": [],
    "answer": [],
    "reference": []
}

# TUNED: missing fields fall back to an empty string.
for item in data:
    evaluation_batch["question"].append(item.get("question", ""))
    evaluation_batch["contexts"].append([item.get("context", "")])  # Wrap in list
    evaluation_batch["answer"].append(item.get("tuned", ""))
    evaluation_batch["reference"].append(item.get("original_answer", ""))

# Create the full dataset first
ds_full = Dataset.from_dict(evaluation_batch)

# Shuffle with a fixed seed and keep at most `sample_size` rows. The count is
# capped at the dataset length because `select` raises on out-of-range indices.
ds = ds_full.shuffle(seed=42).select(range(min(sample_size, len(ds_full))))


# Run evaluation
tuned_evals_results = ragas_evaluate(
    ds,
    llm=LangchainLLMWrapper(llm),
    embeddings=LangchainEmbeddingsWrapper(embed),
    metrics=metrics,
)

print(tuned_evals_results)

In [None]:
# Aggregate the base model's per-sample scores into {metric_name: mean score}.
# The numeric columns of the per-sample frame are taken to be the metric
# scores; the mean skips NaN entries. This replaces parsing the printed repr
# as JSON, which depends on the repr format and fails when a score is `nan`.
base_evals_results_dict = (
    base_evals_results.to_pandas()
    .select_dtypes(include="number")
    .mean()
    .to_dict()
)
base_evals_results_dict

In [None]:
# Aggregate the tuned model's per-sample scores into {metric_name: mean score}.
# The numeric columns of the per-sample frame are taken to be the metric
# scores; the mean skips NaN entries. This replaces parsing the printed repr
# as JSON, which depends on the repr format and fails when a score is `nan`.
tuned_evals_results_dict = (
    tuned_evals_results.to_pandas()
    .select_dtypes(include="number")
    .mean()
    .to_dict()
)
tuned_evals_results_dict

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Grouped bar chart comparing the aggregated RAGAS scores of both models.

# Display label -> key in the aggregated result dicts. Dedicated names are
# used here so that `metrics` (the RAGAS metric objects) and `base_scores`
# (the BLEU/ROUGE dict) from earlier cells are not overwritten, which would
# break re-running those cells.
metric_labels = {
    "Answer Relevancy": "answer_relevancy",
    "Faithfulness": "faithfulness",
    "Answer Correctness": "answer_correctness",
}
base_ragas_scores = [base_evals_results_dict[key] for key in metric_labels.values()]
tuned_scores = [tuned_evals_results_dict[key] for key in metric_labels.values()]

x = np.arange(len(metric_labels))  # label locations
width = 0.35  # width of the bars

fig, ax = plt.subplots(figsize=(8, 5))
bars1 = ax.bar(x - width/2, base_ragas_scores, width, label='Base Model')
bars2 = ax.bar(x + width/2, tuned_scores, width, label='Tuned Model')

# Labels and title
ax.set_ylabel('Score')
ax.set_title('RAGAS Evaluation: Base vs Tuned Model')
ax.set_xticks(x)
ax.set_xticklabels(list(metric_labels))
ax.set_ylim(0, 1)
ax.legend()

# Add value labels on top
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax.annotate(f'{height:.2f}', xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3), textcoords="offset points",
                    ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()