## In-Context Learning with Llama3-8b


Make sure to mount your Google drive if the data files are there and you're running this on Colab. Otherwise, you can specify the file paths directly in the code.


In [1]:
# Authenticate with huggingface because some models like llama3 are gated
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
# If running on colab: install the necessary packages 
!pip install vllm flash-attn

Collecting vllm
  Downloading vllm-0.4.2-cp310-cp310-manylinux1_x86_64.whl (67.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.7/67.7 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flash-attn
  Downloading flash_attn-2.5.8.tar.gz (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m74.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ninja (from vllm)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
Collecting fastapi (from vllm)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai (from vllm)
  Downloading openai-1.26.0-py3-none-any.whl (314 kB

In [None]:
import numpy as np
from vllm import SamplingParams, LLM
from tqdm import tqdm
from enum import Enum
from sklearn.metrics import f1_score
import json
from transformers import AutoTokenizer
import os
import pandas as pd


class InferenceMode(Enum):
    GREEDY = "greedy"
    TOP100 = "top100"

In [3]:
# Storing label tokens in case we use different models so we don't run the tokenizer all the time
LABEL_TOKENS_DIRPATH = "/content/drive/MyDrive/CSCI 567 Final Project/label_tokens"
# Where the instructions/prompts are
DATA_CONFIGS_DIRPATH = "/content/drive/MyDrive/CSCI 567 Final Project/configs"
# Where the models are stored
MODELS_DOWNLOAD_DIRPATH = "/content/drive/MyDrive/CSCI 567 Final Project/models"
# Where the predictions are stored
PREDS_OUTPUT_DIRPATH = "/content/drive/MyDrive/CSCI 567 Final Project/preds"

train_dataset_path = "/content/drive/MyDrive/CSCI 567 Final Project/data/CNN-and-Essays-Datasets/xsum-gen-n=5000-model=llama3-8b.csv"
test_data_dirpath = "/content/drive/MyDrive/CSCI 567 Final Project/data/CNN-and-Essays-Datasets/xsum-gen-n=5000-model=llama3-8b.csv"

dataset_name = "xsum"  # "daigt", "hewlett"
n_labeled = 4
# xsum: 9900, daigt: 10000, hewlett: 7000, cnn: 5200, gp2-essays: 2700
n_test_samples = 9900
inference_mode = InferenceMode.TOP100
label_names = ["0", "1"]
model_name = "llama3-8b"

## Data


In [None]:
def format_one_example_for_inference(d: dict, prompt_template: str, label=None):
    return prompt_template.replace("<TEXT>", d["text"])


def format_one_labeled_example(d: dict, prompt_template: str):
    return prompt_template.replace("<TEXT>", d["text"]).replace("<LABEL>", str(d["label"]))


def generate_inference_examples(
    test_data: list[dict],
    prompt_template: str,
) -> list[str]:
    """Generate prompts for inference
    Args:
        data: list[dict]
            The data to generate prompts for
        inference_mode: InferenceMode
            The inference mode to use
    """
    return [format_one_example_for_inference(item, prompt_template) for item in test_data]


def load_demontration_prompt_template(data_config_filepath: str) -> str:
    """Load the demonstration prompt template from the data config file
    Args:
        data_config_filepath: str
            The path to the data config JSON file
    Returns:
        str
            The demonstration prompt template
    """
    with open(data_config_filepath, "r") as f:
        config = json.load(f)
    return config["demonstration_prompt"]


def load_inference_prompt_template(data_config_filepath: str) -> str:
    """Load the inference prompt template from the data config file
    Args:
        data_config_filepath: str
            The path to the data config JSON file
    Returns:
        str
            The inference prompt template
    """
    with open(data_config_filepath, "r") as f:
        config = json.load(f)
    return config["inference_prompt"]


def format_prefix(labeled_data: list[dict], task_instruction: str, prompt_template: str) -> str:
    """Format the prefix for the prompt, which contains labeled examples.
    Args:
        labeled_data: list[dict]
            The labeled data
        prompt_template: str
            The prompt template for labeled exampless
    Returns:
        str
            The formatted prefix
    """
    if not labeled_data:  # For zero-shot learning
        return ""

    prefix = [task_instruction]

    for item in labeled_data:
        prefix.append(format_one_labeled_example(item, prompt_template))

    assert len(prefix) > 0
    return "\n".join(prefix)


def load_data(
    labeled_data_filepath: str,
    test_data_filepath: str,
    dataset_name: str,
    n_labeled: int,
    n_test_samples: int,
):
    """Load the data and generate prompts for inference
    Args:
        labeled_data_filepath: str
            The path to the labeled data
        test_data_filepath: str
            The path to the test data
        dataset_name: str
            The name of the dataset
        n_labeled: int
            The number of labeled examples to use
        n_test_samples: int
            The number of test examples to use
    Returns:
        tuple[list[str], list[str], list[int]]
            The prompts, true labels, and sample ids
    """
    data_config_filepath = f"{DATA_CONFIGS_DIRPATH}/{dataset_name}_config.json"
    demonstrations = load_file(labeled_data_filepath)[:n_labeled]
    if n_test_samples == -1:
        test_data = load_file(test_data_filepath)
    else:
        if labeled_data_filepath == test_data_filepath:
            print(
                "Detected same files for training and testing. Picking different subsets in order to not have overlaps"
            )
            test_data = load_file(test_data_filepath)[-n_test_samples:]
        else:
            test_data = load_file(test_data_filepath)[:n_test_samples]

    demonstration_prompt_template = load_demontration_prompt_template(data_config_filepath)
    inference_prompt_template = load_inference_prompt_template(data_config_filepath)
    task_instruction = load_instruction(data_config_filepath)

    true_labels = [str(item["label"]) for item in test_data]

    # Prefix contains the labeled examples
    prefix = format_prefix(demonstrations, task_instruction, demonstration_prompt_template)

    test_examples = generate_inference_examples(test_data, inference_prompt_template)

    if len(prefix) > 0:  # few-shot case
        prompts = [prefix + "\n" + example for example in test_examples]
    else:  # zero-shot case
        prompts = test_examples

    return prompts, true_labels  # , sample_ids


def get_label_tokens(model_name, model_path, dataset_name, label_names) -> dict[str, int]:
    """Get the token ids for the labels in the dataset from a pre-existing file. Generate them and store them if they don't exist.
    Args:
        model_name: str
            The name of the model
        model_path: str
            The path to the model. Used only if we need to load the tokenizer model.
        dataset_name: str
            The name of the dataset
        label_names: list[str]
            The names of the labels
    Returns:
        dict[str, int]
            A map from a label to its token id
    """
    label_tokens_filepath = f"{LABEL_TOKENS_DIRPATH}/{model_name}_{dataset_name}_label_tokens.json"
    if os.path.exists(label_tokens_filepath):
        with open(label_tokens_filepath, "r") as f:
            label_tokens = json.load(f)
            print(f"Loaded label tokens: {label_tokens}")
    else:
        print(f"Label tokens file not found, generating for model: {model_name}")
        hf_tokenizer = AutoTokenizer.from_pretrained(model_path)

        # Encode each label and skip the first token because it's the prefix space
        label_tokens = {label: hf_tokenizer.encode(label)[1:] for label in label_names}

        print(f"Generated file tokens: {label_tokens}")

        # Store them in a dictionary for easy access
        with open(label_tokens_filepath, "w") as f:
            json.dump(label_tokens, f)
        print(f"Stored file tokens to: {label_tokens_filepath}")

    return label_tokens


def load_instruction(data_config_filepath: str) -> str:
    """Load the task instruction from the data config file
    Args:
        data_config_filepath: str
            The path to the data config JSON file
    Returns:
        str
            The task instruction that comes before the examples in the prompt
    """
    with open(data_config_filepath, "r") as f:
        config = json.load(f)
    return config["instruction"]


def load_file(filename: str) -> list[dict]:
    """Load a file into a list of dictionaries
    Args:
        filename: str
            The path to the file
    Returns:
        list[dict]
            The list of data points as dictionaries
    """
    print(f"Loading: {filename}...")
    if filename.endswith(".jsonl"):
        data = pd.read_json(filename, orient="records", encoding="utf-8").to_dict("records")
    elif filename.endswith(".csv"):
        data = pd.read_csv(filename, encoding="utf-8").to_dict("records")

    print(f"Detected: {len(data)} rows.")
    if "generated" in data[0]:
        for datapoint in data:
            datapoint["label"] = datapoint["generated"]

    data0 = [d for d in data if d["label"] == 0]
    data0 = sorted(data0, key=lambda kv: len(kv["text"].split()))
    data1 = [d for d in data if d["label"] == 1]
    data1 = sorted(data1, key=lambda kv: len(kv["text"].split()))

    print(
        f"Label split: {len(data0) / len(data)} human-written, {len(data1) / len(data)} AI-generated"
    )

    all_data = []
    for d0, d1 in zip(data0, data1):
        all_data.append(d0)
        all_data.append(d1)

    # In an imbalanced dataset the previous loop will only run for the length of the shortest list, so we add this to add the remaining samples
    if len(data0) != len(data1):
        if len(data0) < len(data1):
            smol = data0
            huge = data1
        elif len(data0) > len(data1):
            smol = data1
            huge = data0

        for i in range(len(smol), len(huge)):
            all_data.append(huge[i])

    # NOTE: We're hardcoding this for a dataset we know contains extremely long essays to not run into a context length error
    if "CNN Dataset.csv" in filename or "Essays Dataset.csv" in filename:
        all_data = all_data[:5210]

    print(f"Final number of datapoints: {len(all_data)=}")
    return all_data

In [59]:
prompts, true_labels = load_data(
    labeled_data_filepath=train_dataset_path,
    test_data_filepath=test_data_dirpath,
    dataset_name=dataset_name,
    n_labeled=n_labeled,
    n_test_samples=n_test_samples,
    inference_mode=inference_mode,
    label_names=label_names,
)

Loading: /content/drive/MyDrive/CSCI 567 Final Project/data/CNN-and-Essays-Datasets/CNN Dataset.csv...
Detected: 5330 rows.
Label split: 0.5628517823639775 human-written, 0.4371482176360225 AI-generated
Final number of datapoints: len(all_data)=5210
Detected same files for training and testing. Picking different subsets in order to not have overlaps
Loading: /content/drive/MyDrive/CSCI 567 Final Project/data/CNN-and-Essays-Datasets/CNN Dataset.csv...
Detected: 5330 rows.
Label split: 0.5628517823639775 human-written, 0.4371482176360225 AI-generated
Final number of datapoints: len(all_data)=5210


## Model


In [38]:
def write_ood_and_missing_labels_to_file(
    preds_output_filepath, ood_predictions, labels_notin_top100
):
    preds_output_filepath = preds_output_filepath.replace(".jsonl", "-ood-missing-labels.txt")

    # Store ood predictions in the same predictions file
    if ood_predictions:
        with open(preds_output_filepath, "a") as f:
            f.write(f"{'-' * 80}\n")
            f.write("Out-of-distribution predictions:\n")
            for i, pred in ood_predictions:
                f.write(f"i: {i}, ood_pred: {pred}\n")
            f.write(f"{'-' * 80}\n")

    # Store number of labels not in the top 100 tokens in the same predictions file
    if labels_notin_top100:
        with open(preds_output_filepath, "a") as f:
            f.write(f"{'-' * 80}\n")
            f.write("Labels not in top 100 tokens:\n")
            for i, label in labels_notin_top100:
                f.write(f"pred idx: {i}, label: {label}\n")
            f.write(f"{'-' * 80}\n")


def do_inference(
    inference_mode: InferenceMode,
    llm: LLM,
    sampling_params: SamplingParams,
    prompts: list[str],
    preds_output_filepath: str,
    true_labels: list[str],
    label_tokens: dict[str, list[int]],
) -> tuple[list[str], list[int], list[tuple[int, str]], list[tuple[int, str]]]:
    """Perform inference on the model.
    Args:
        inference_mode: InferenceMode
            The mode of inference
        llm: LLM
            The language model
        sampling_params: SamplingParams
            The sampling parameters
        prompts: list[str]
            The prompts for the model
        preds_output_filepath: str
            The path to the file where the predictions will be written
        true_labels: list[str]
            The true labels
        label_tokens: dict[str, list[int]]
            A map from a label to its token ids
    Returns:
        tuple[list[str], list[str], list[tuple[int, str]], list[tuple[int, str]]]
            The predictions, true labels, labels not in the top 100 tokens, and OOD predictions
    """
    label2idx = {label: idx for idx, label in enumerate(list(label_tokens.keys()))}

    print("Generating outputs...")
    outputs = llm.generate(prompts, sampling_params)

    predictions = []
    ood_predictions = []
    labels_notin_top100 = []

    if inference_mode == InferenceMode.GREEDY:
        print("Greedy inference")

        for i, output in enumerate(tqdm(outputs)):
            generated_text = output.outputs[0].text.strip()
            if generated_text not in label2idx:
                ood_predictions.append((i, generated_text))

            # Take into account case where instead of the model generates "\n" instead of "Input:", as if it were to write a new example
            if "Input:" in generated_text:
                pred = generated_text.split("Input:")[0]
            else:
                pred = generated_text

            pred_obj = {
                "generated_text": generated_text,
                "pred": pred,
                "true_label": true_labels[i],
            }

            with open(preds_output_filepath, "a") as f:
                f.write(json.dumps(pred_obj) + "\n")

            predictions.append(pred)

        print(f"Number of OOD predictions: {len(ood_predictions)}")

        write_ood_and_missing_labels_to_file(
            preds_output_filepath, ood_predictions, labels_notin_top100
        )

    elif inference_mode == InferenceMode.TOP100:
        print("Top100 inference")

        for i, output in enumerate(tqdm(outputs)):
            generated_text = output.outputs[0].text
            logprobs = get_logprobs_from_output(output.outputs[0], label_tokens)
            label_probs, missing_labels = get_label_probs(logprobs, label2idx)
            if missing_labels:
                if len(missing_labels) == len(label2idx):
                    ood_predictions.append((i, generated_text))
                for missing_label in missing_labels:
                    labels_notin_top100.append((i, missing_label))

            idx2label = {v: k for k, v in label2idx.items()}
            pred = idx2label[np.argmax(label_probs)].replace("_", "")
            pred_obj = {
                "generated_text": generated_text,
                "pred": pred,
                "true_label": true_labels[i],
                # "id": sample_ids[i],
                "logprobs": logprobs,
            }

            with open(preds_output_filepath, "a") as f:
                f.write(json.dumps(pred_obj) + "\n")

            predictions.append(pred)

        print(
            f"Number of OOD predictions: {len(ood_predictions)}, number of labels not in top 100 tokens: {len(labels_notin_top100)}"
        )

        write_ood_and_missing_labels_to_file(
            preds_output_filepath, ood_predictions, labels_notin_top100
        )

    else:
        raise ValueError(f"Invalid inference mode: {inference_mode}")

    return predictions, true_labels, labels_notin_top100, ood_predictions


def get_sampling_params(inference_mode: InferenceMode):
    """Get the sampling parameters for the model.
    Args:
        inference_mode: InferenceMode
            The mode of inference
    Returns:
        SamplingParams
            The sampling parameters
    """
    if inference_mode == InferenceMode.GREEDY:
        params = SamplingParams(temperature=0.0, stop="\n")
    elif inference_mode == InferenceMode.TOP100:
        params = SamplingParams(temperature=0.0, logprobs=100, max_tokens=1)
    else:
        raise ValueError(f"Invalid inference mode: {inference_mode}")

    return params


def get_logprobs_from_output(output, label_tokens):
    """Get the logprobs and the probs of the labels from the output of the LLM model.
    Args:
        output: CompletionOutput
            The output of the LLM model for a single example
        label_tokens: dict[str, int]
            A map from a label to its token id
    Returns:
        dict[str, tuple]
            A map from a label to a tuple of (logprob, prob, index in top 100 tokens)
    """
    log_probs = output.logprobs
    # Get the logprobs of the labels
    label_logprobs = {}
    for label, label_tokens in label_tokens.items():
        # We're only interested in the first token of the label, since this should only be called for single token labels
        label_token = label_tokens[0]
        # We're iterating through the dictionary because it is in descending order of logprob, so we can also get the index
        for i, token in enumerate(log_probs[0].keys()):
            if label_token == token:
                label_logprobs[label] = (
                    log_probs[0][label_token].logprob,
                    np.exp(log_probs[0][label_token].logprob),
                    i,
                )  # logprob, index in top 100 tokens

    return dict(sorted(label_logprobs.items(), key=lambda kv: kv[1][-1]))


def get_label_logprobs_from_prompt(label_token_values, prompt_logprobs) -> list[tuple]:
    """Get the logprobs of the labels from the prompt logprobs.
    Args:
        label_token_values: list[int]
            The token values of a given label
        prompt_logprobs: list[dict[int, float]]
            A list of dictionaries, where each dictionary contains a map from a token to its logprob
    Returns:
        list[tuple]
            A list of tuples, where each tuple contains a label token and its logprob
    """
    label_logprobs = []  # [(token: logprob)]

    # Get the last N elements of the prompt logprobs list, where N is the number of tokens in the word we're looking for
    n_last_logprobs = prompt_logprobs[-len(label_token_values) :]
    # print(f"Prompt logprobs: {n_last_logprobs}")
    for i, tokendict in enumerate(n_last_logprobs):

        # Sanity check, technically we should never hit this
        if label_token_values[i] not in tokendict:
            print(f"{i=}")
            print(f"Prompt logprobs: {tokendict}")
            print(f"Label tokens: {label_token_values}")
            raise ValueError(f"Token {label_token_values[i]} not found in the prompt logprobs")

        label_logprobs.append((label_token_values[i], tokendict[label_token_values[i]]))

    return label_logprobs


def get_label_probs(logprobs, label2idx):
    """Get the non-normalized predicted probabilities for each label.
    Args:
        logprobs: dict[str, tuple]
            A map from a label to a tuple of (logprob, prob, index in top 100 tokens)
        label2idx: dict[str, int]
            A map from a label to its index in the list of labels.
    Returns:
        list[list[float]], list[tuple]
            A list of lists, where each list contains the non-normalized predicted probabilities for each label.
            Also returns a list of tuples containing the indices of the test samples that had missing labels, plus the missing label.
    """
    missing_labels = []

    # Get the probs only and ensure that all labels are within the top 100 tokens in the prediction
    pred_probs = [0] * len(label2idx)
    for label in label2idx:
        if label in logprobs:
            pred_probs[label2idx[label]] = logprobs[label][1]  # non-normalized
        else:
            # print(f"Label not found: {label}")
            missing_labels.append(label)

    return pred_probs, missing_labels


def get_model_config(model_name: str):
    """Get the configuration for the model.
    Args:
        model_name: str
            The name of the model
    Returns:
        dict
            The configuration for the model
    """
    model_config = {}
    if model_name == "mixtral-8x7b":
        # gpu_memory_utilization=0.6, tensor_parallel_size=4
        model_config = {
            "model": "mistralai/Mixtral-8x7B-v0.1",
            "gpu_memory_utilization": 0.6,
            "tensor_parallel_size": 4,
        }
    elif model_name == "gemma-7b":
        # gpu_memory_utilization=0.45, max_model_len=4096
        model_config = {
            "model": "google/gemma-7b",
            "gpu_memory_utilization": 0.45,
            "max_model_len": 4096,
        }
    elif model_name == "mistral-7b":
        # gpu_memory_utilization=0.7
        model_config = {
            "model": "mistralai/Mistral-7B-v0.1",
            "gpu_memory_utilization": 0.7,
        }
    elif model_name == "LLaMA2-13B":
        # tensor_parallel_size=2
        model_config = {
            "model": "meta-llama/Llama-2-13b-hf",
            "tensor_parallel_size": 2,
        }
    elif model_name == "LLaMA2-70B":
        # gpu_memory_utilization=0.85, tensor_parallel_size=4
        model_config = {
            "model": "meta-llama/Llama-2-70b-hf",
            "gpu_memory_utilization": 0.9,
            "tensor_parallel_size": 4,
        }
    elif model_name == "LLaMA2-7B":
        model_config = {
            "model": "meta-llama/Llama-2-7b-hf",
        }
    elif model_name == "llama3-8b":
        model_config = {
            "model": "meta-llama/Meta-Llama-3-8B",
        }
    elif model_name == "mistral-7b-instruct-v2":
        model_config = {
            "model": "mistralai/Mistral-7B-Instruct-v0.2",
        }
    elif model_name == "llama3-8b-instruct":
        model_config = {
            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
        }
    else:
        raise ValueError(f"Model name not recognized: {model_name}")

    model_config["download_dir"] = MODELS_DOWNLOAD_DIRPATH
    model_config["max_logprobs"] = 100
    return model_config

In [39]:
model_config = get_model_config(model_name)
sampling_params = get_sampling_params(inference_mode)

llm = LLM(**model_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

INFO 05-06 21:58:33 llm_engine.py:100] Initializing an LLM engine (v0.4.2) with config: model='meta-llama/Meta-Llama-3-8B', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir='/content/drive/MyDrive/CSCI 567 Final Project/models', load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=meta-llama/Meta-Llama-3-8B)


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

INFO 05-06 21:58:37 utils.py:660] Found nccl from library /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 05-06 21:58:38 selector.py:81] Cannot use FlashAttention-2 backend because the flash_attn package is not found. Please install it for better performance.
INFO 05-06 21:58:38 selector.py:32] Using XFormers backend.
INFO 05-06 21:58:40 weight_utils.py:199] Using model weights format ['*.safetensors']
INFO 05-06 22:04:59 model_runner.py:175] Loading model weights took 14.9595 GB
INFO 05-06 22:05:00 gpu_executor.py:114] # GPU blocks: 9557, # CPU blocks: 2048
INFO 05-06 22:05:02 model_runner.py:937] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 05-06 22:05:02 model_runner.py:941] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing e

In [40]:
label_tokens = get_label_tokens(model_name, model_config["model"], dataset_name, label_names)
label_tokens = dict(sorted(label_tokens.items(), key=lambda x: x[0]))

label_tokens

Loaded label tokens: {'0': [15], '1': [16]}


{'0': [15], '1': [16]}

In [60]:
preds_output_filepath = f"{PREDS_OUTPUT_DIRPATH}/test-preds-dataset={test_data_dirpath.split('/')[-1].replace('.csv', '')}-model={model_name}.jsonl"
predictions, true_labels, labels_notin_top100, ood_predictions = do_inference(
    inference_mode=inference_mode,
    llm=llm,
    sampling_params=sampling_params,
    prompts=prompts,
    preds_output_filepath=preds_output_filepath,
    true_labels=true_labels,
    sample_ids=None,
    label_tokens=label_tokens,
)

Generating outputs...


Processed prompts: 100%|██████████| 5200/5200 [05:45<00:00, 15.04it/s]


Top100 inference


100%|██████████| 5200/5200 [00:18<00:00, 274.22it/s]


Number of OOD predictions: 1, number of labels not in top 100 tokens: 2


## Evaluation


In [61]:
def evaluate(true_labels, predictions):
    print("Evaluating...")
    uncalibrated_acc = np.mean(
        [1 if pred == true_label else 0 for pred, true_label in zip(predictions, true_labels)]
    )
    micro_f1 = f1_score(true_labels, predictions, average="micro")
    weighted_f1 = f1_score(true_labels, predictions, average="weighted")

    print(f"Uncalibrated accuracy: {uncalibrated_acc:.4f}")
    print(f"Uncalibrated micro F1 score: {micro_f1:.4f}")
    print(f"Uncalibrated weighted F1 score: {weighted_f1:.4f}")
    return uncalibrated_acc, micro_f1, weighted_f1

In [62]:
uncalibrated_acc, micro_f1, weighted_f1 = evaluate(true_labels, predictions)

Evaluating...
Uncalibrated accuracy: 0.5465
Uncalibrated micro F1 score: 0.5465
Uncalibrated weighted F1 score: 0.5476


In [63]:
from collections import Counter

Counter(predictions)

Counter({'0': 2719, '1': 2481})