In [19]:
%pip install -Uqq langchain[all] langsmith langchain-core langchain-community python-dotenv vllm

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_PROJECT'] = 'LLM_EVAL'

In [2]:
from datasets import load_dataset, DatasetDict

## Dataset Creation

In [9]:
twitter_test_set = load_dataset("MAdAiLab/twitter_disaster", split="test")
twitter_test_set
df = twitter_test_set.to_pandas()
df.columns

Index(['text', 'label'], dtype='object')

In [10]:
from langsmith import Client
import os
import pandas as pd

client = Client()

# df = pd.read_parquet('path/to/your/myfile.parquet')
input_keys = ['text'] # replace with your input column names
output_keys = ['label'] # replace with your output column names

dataset = client.upload_dataframe(
    df=df,
    input_keys=input_keys,
    output_keys=output_keys,
    name="MAdAiLab/twitter_disaster",
    description="Test set of the Twitter Disaster dataset",
    data_type="kv" # The default
)

## Setup a Custom Evaluator 

In [13]:
from typing import List
from langsmith.schemas import Example, Run
from langsmith.evaluation import evaluate
from langchain.prompts import PromptTemplate
# from langchain_community.llms import VLLM

In [41]:
# Define the evaluation function
def accuracy_evaluator(runs: List[Run], examples: List[Example]) -> dict:
    correct_predictions = 0
    total_predictions = 0

    for run, example in zip(runs, examples):
        reference = example.outputs["label"]
        prediction = run.outputs["prediction"]

        if prediction == reference:
            correct_predictions += 1
        total_predictions += 1

    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0.0

    return {"key": "accuracy", "score": accuracy}

# Define the predict function using the VLLM and prompt template
def predict(inputs: dict):
    input_text = inputs["text"]
    prompt_template = """Given the following tweet:"{text}"
    0: negative
    1: positive 
    What is your answer? Please respond with 0 or 1.
    Answer: 
    """
    prompt = prompt_template.format(text=input_text)
    response = llm.invoke(prompt)
    response = response.strip()[0]
    prediction = int(response)
    # Assuming the VLLM response is either "true" or "false"
    # if response == "0":
    #     prediction = 0
    # elif response == "1":
    #     prediction = 1
    # else:
    #     prediction = None
    return {"prediction": prediction}

In [42]:
predict({"text": "I am happy"})

Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:13<00:00, 13.11s/it]


{'prediction': 1}

In [4]:
from langchain_community.llms import VLLM

llm = VLLM(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    tensor_parallel_size=1,
    trust_remote_code=True,
    enforce_eager=True,
    gpu_memory_utilization=0.99,
    enable_prefix_caching=True,
    temperature=0,
    max_tokens=1,
)


# print(llm.invoke("What is the capital of France ?"))

INFO 05-21 19:33:31 llm_engine.py:100] Initializing an LLM engine (v0.4.2) with config: model='meta-llama/Meta-Llama-3-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=meta-llama/Meta-Llama-3-8B-Instruct)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 05-21 19:33:32 utils.py:660] Found nccl from library /scratch/user/u.ap164907/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 05-21 19:33:33 selector.py:81] Cannot use FlashAttention-2 backend because the flash_attn package is not found. Please install it for better performance.
INFO 05-21 19:33:33 selector.py:32] Using XFormers backend.
INFO 05-21 19:33:56 weight_utils.py:199] Using model weights format ['*.safetensors']
INFO 05-21 19:34:04 model_runner.py:175] Loading model weights took 14.9595 GB
INFO 05-21 19:34:06 gpu_executor.py:114] # GPU blocks: 13153, # CPU blocks: 2048
INFO 05-21 19:34:07 model_runner.py:937] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 05-21 19:34:07 model_runner.py:941] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utiliza

In [5]:
prompt_template = """Given the following tweet:

"{text}"

0: negative
1: positive

What is your answer? Please respond with 0 or 1.

Answer: 
"""

In [27]:
llm.invoke(prompt_template.format(text="I am happy."), max_tokens=1)

Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 36.20it/s]


'1'

In [43]:
evaluate(
    predict, # Your classifier
    data="MAdAiLab/twitter_disaster",
    summary_evaluators=[accuracy_evaluator],

)

View the evaluation results for experiment: 'new-volcano-28' at:
https://smith.langchain.com/o/bab91b6c-3888-5d72-a6b2-b190045814f6/datasets/e259ed08-46ac-47f3-9f3d-cc3d2ddf75a0/compare?selectedSessions=0ef65405-cf38-47d8-8706-da14aaa54d07




0it [00:00, ?it/s]


[A

[A[A


[A[A[A



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A













[A[A[A[A[A[A[A[A[A[A[A[A[A[A














[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A


















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A



















Processed prompts:   0%|                                                                                                                                                          | 0/9 [00:00<?, ?it/s]





















<ExperimentResults new-volcano-28>