In [None]:
!pip install -qU langchain_openai langchain_core langchain deepeval watermark

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.6/1.6 MB[0m [31m50.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
#import dependencies
import os

from watermark import watermark

from google.colab import userdata

from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

from deepeval import evaluate
from deepeval.metrics import (
    AnswerRelevancyMetric, HallucinationMetric,
    PromptAlignmentMetric, GEval
    )
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

In [None]:
# Copy the secret named 'OPENAI_API_KEY4' from Colab userdata into the
# OPENAI_API_KEY environment variable.
os.environ.update({'OPENAI_API_KEY': userdata.get('OPENAI_API_KEY4')})

## Model

In [None]:
# create openai chat model
# temperature=0 keeps generations as repeatable as the API allows, so metric
# scores stay comparable between runs of this notebook
model = ChatOpenAI(model="gpt-4o", temperature=0)

# create basic string output parser
parser = StrOutputParser()

# create prompt template
# The system message carries only the standing instruction; the question is
# supplied once, through the user message, instead of being repeated in both.
system_template = """
    You are a personal assistant that helps answer questions from the user.
    """

prompt_template = ChatPromptTemplate.from_messages(
    [("system", system_template), ("user", "{question}")]
)

# using LCEL to create chain: prompt -> chat model -> plain string
chain = prompt_template | model | parser

## Evaluation
- We will try four metrics from DeepEval:
  - Answer Relevancy
  - Hallucination
  - Prompt Alignment
  - G-Eval

### Answer Relevancy
- required arguments
  - input
  - actual_output

In [None]:
# run the question through the chain and display the model's answer
nlu_nlg_inputs = {"question": "what is the different between NLU and NLG in NLP ?"}
llm_result = chain.invoke(nlu_nlg_inputs)
llm_result

"In Natural Language Processing (NLP), Natural Language Understanding (NLU) and Natural Language Generation (NLG) are two distinct components that serve different purposes:\n\n1. **Natural Language Understanding (NLU):**\n   - **Purpose:** NLU focuses on comprehending and interpreting human language input. It involves analyzing the structure and meaning of the language to extract meaningful information.\n   - **Key Tasks:** These include tasks like intent recognition, entity extraction, sentiment analysis, and language comprehension.\n   - **Examples:** When you ask a virtual assistant a question, NLU is responsible for understanding what you are asking and identifying the relevant details.\n\n2. **Natural Language Generation (NLG):**\n   - **Purpose:** NLG is concerned with creating human-like language responses from structured data or information. It focuses on the production of text that is both coherent and contextually appropriate.\n   - **Key Tasks:** These include tasks like tex

In [None]:
# Build the test case from the question and the chain's answer, then the
# metric that scores it.
test_case = LLMTestCase(
    actual_output=llm_result,
    input="what is the different between NLU and NLG in NLP ?",
)
metric = AnswerRelevancyMetric(include_reason=True, model="gpt-4o", threshold=0.7)

# score and reason are read from the metric object after measure() has run
metric.measure(test_case)
print(metric.score, metric.reason, sep="\n")

Output()

1.0
The score is 1.00 because the response perfectly addresses the distinction between NLU (Natural Language Understanding) and NLG (Natural Language Generation) in NLP with complete relevance and focus. Fantastic job!


### Hallucination
- required arguments:
  - input
  - actual_output
  - context

In [None]:
# run the question through the chain and display the model's answer
witcher_inputs = {"question": "what is the The Witcher 4 from CD Projekt Red ?"}
llm_result = chain.invoke(witcher_inputs)
llm_result

'As of my last update in October 2023, CD Projekt Red has announced the development of a new game in The Witcher series, often referred to by fans and media as "The Witcher 4." However, the official title and specific details about the game have not been fully disclosed. CD Projekt Red has confirmed that this new installment will mark the beginning of a new saga in The Witcher universe, indicating that it may explore new storylines and characters beyond those of Geralt of Rivia, the protagonist of the previous games. The game will be developed using Unreal Engine 5, which suggests significant advancements in graphics and gameplay capabilities. Further details, including release date, plot, and setting, are expected to be revealed closer to the game\'s launch.'

In [None]:
# Reference text the answer is checked against.
# Replace this with the actual documents that you are passing as input to your LLM.
context = [
    "The Witcher IV is an upcoming action role-playing game developed by CD Projekt Red and published by CD Projekt. It is the planned first installment of a new trilogy in The Witcher series and is set several years after the events of The Witcher 3: Wild Hunt."
]

metric = HallucinationMetric(threshold=0.5)
test_case = LLMTestCase(
    context=context,
    actual_output=llm_result,
    input="what is the The Witcher 4 from CD Projekt Red ?",
)

# score and reason are read from the metric object after measure() has run
metric.measure(test_case)
print(metric.score, metric.reason, sep="\n")

Output()

0.0
The score is 0.00 because the actual output fully aligns with the context without any contradictions, confirming the development of a new game in The Witcher series by CD Projekt Red as stated in the context.


### Prompt Alignment
- required arguments
  - input
  - actual_output

In [None]:
# create prompt template
# The system message holds only the translation instruction; the text to
# translate arrives once, via the user message, rather than in both places.
system_template_prompt_alignment = """
    You are a personal assistant that translates input from the user into the French language.
    """

prompt_template_alignment = ChatPromptTemplate.from_messages(
    [("system", system_template_prompt_alignment), ("user", "{input}")]
)

# using LCEL to create chain: prompt -> chat model -> plain string
chain_prompt_alignment = prompt_template_alignment | model | parser

In [None]:
# send the English sentence through the translation chain and display the result
translation_inputs = {"input": "Hello how are you today ?"}
llm_result = chain_prompt_alignment.invoke(translation_inputs)
llm_result

"Bonjour, comment allez-vous aujourd'hui ?"

In [None]:
# NOTE(review): the instruction below says "italian" while the chain's system prompt
# says "french" — presumably deliberate, to demonstrate a failing alignment score; confirm.
alignment_instructions = [
    "You are a personal assistant that translate input from user to italian language"
]

test_case = LLMTestCase(
    actual_output=llm_result,
    input="Hello how are you today ?",
)
metric = PromptAlignmentMetric(
    include_reason=True,
    model="gpt-4o",
    prompt_instructions=alignment_instructions,
)

# score and reason are read from the metric object after measure() has run
metric.measure(test_case)
print(metric.score, metric.reason, sep="\n")

Output()

0.0
The score is 0.00 because the LLM translated the input to French instead of Italian, not aligning with the specified prompt instructions.


### G-Eval
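- required arguments
  - input
  - actual_output
  - expected_output (needed in this example because `evaluation_params` includes `LLMTestCaseParams.EXPECTED_OUTPUT`)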

In [None]:
# run the question through the chain and display the model's answer
geval_inputs = {"question": "what is the The Witcher 4 from CD Projekt Red ?"}
llm_result = chain.invoke(geval_inputs)
llm_result

'"The Witcher 4" is a highly anticipated project from CD Projekt Red, the developers behind the critically acclaimed Witcher series. While the game is not officially titled "The Witcher 4," CD Projekt Red announced in March 2022 that they are working on a new installment in The Witcher series. This new game will kick off a new saga in the Witcher universe, separate from the original trilogy that followed the story of Geralt of Rivia.\n\nThe game is expected to be developed using Unreal Engine 5, marking a shift from the company\'s proprietary REDengine technology. This decision is part of a strategic partnership with Epic Games. Details about the plot, characters, and setting of this new Witcher game have not been fully disclosed as of October 2023. However, fans can expect the same rich storytelling, deep world-building, and immersive gameplay that the series is known for.'

In [None]:
# The reference answer is a single string: LLMTestCase.expected_output takes a str
# (unlike context, which takes a list of strings). Passing a list would hand the
# judge a list repr and can be rejected by stricter deepeval releases.
expected_output = "The Witcher IV is an upcoming action role-playing game developed by CD Projekt Red and published by CD Projekt. It is the planned first installment of a new trilogy in The Witcher series and is set several years after the events of The Witcher 3: Wild Hunt."

# Custom LLM-judged metric: compares the actual output with the expected output
# following the evaluation steps listed below.
correctness_metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradict any facts in 'expected output'",
        "You should also heavily penalize the actual output if it is not correct based on the expected output",
        "Vague language, or contradicting OPINIONS, are OK"
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT],
)

test_case = LLMTestCase(
    input="what is the The Witcher 4 from CD Projekt Red ?",
    actual_output=llm_result,
    expected_output=expected_output
)

# score and reason are read from the metric object after measure() has run
correctness_metric.measure(test_case)
print(correctness_metric.score)
print(correctness_metric.reason)

Output()

0.7698826271357894
The actual output aligns with the expected output by confirming a new Witcher game is in development by CD Projekt Red, starting a new saga. However, it lacks mention of the game's setting being several years after The Witcher 3: Wild Hunt, which is detailed in the expected output.


In [None]:
%load_ext watermark
%watermark --machine --python --iversions

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Python implementation: CPython
Python version       : 3.10.12
IPython version      : 7.34.0

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.1.85+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

deepeval        : 2.0.5
langchain_core  : 0.3.25
watermark       : 2.5.0
langchain_openai: 0.2.12
google          : 2.0.3

