In [None]:
!pip install datasets transformers

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
import numpy as np
from tqdm import tqdm
import csv  # Import csv for writing to file

# --- Data: SST-5, first 2000 rows of the train and test splits ---------------
dataset = load_dataset("SetFit/sst5")
train_data, test_data = (dataset[split][:2000] for split in ("train", "test"))
print("DATASET LOADED!")

# --- Device: prefer CUDA when torch reports it as available ------------------
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# --- Embedding model: tokenizer and encoder come from the same checkpoint ----
embedder_id = "intfloat/e5-large-v2"
tokenizer = AutoTokenizer.from_pretrained(embedder_id)
embed_model = AutoModel.from_pretrained(embedder_id)
embed_model = embed_model.to(device)
# Inference only: switch the encoder to evaluation mode.
embed_model.eval()

# Helper function to compute mean-pooled embeddings for a list of texts
def encode_texts(text_list, prefix, batch_size=64):
    """Embed texts with the embedding model using attention-masked mean pooling.

    Texts are encoded in mini-batches so that memory use is bounded by
    ``batch_size`` rather than by the size of the whole corpus.

    Args:
        text_list: Iterable of raw strings to embed.
        prefix: String prepended to every text before tokenization
            (callers in this notebook pass "passage: " or "query: ").
        batch_size: Number of texts sent through the encoder per forward pass.

    Returns:
        A 2-D tensor with one row per input text, located on ``device``.
        An empty input yields a tensor with zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    prefixed = [f"{prefix}{text}" for text in text_list]
    if not prefixed:
        # Nothing to encode: return an empty matrix with the encoder's width.
        return torch.empty((0, embed_model.config.hidden_size), device=device)

    pooled_chunks = []
    for start in range(0, len(prefixed), batch_size):
        inputs = tokenizer(prefixed[start:start + batch_size],
                           padding=True, truncation=True, max_length=512,
                           return_tensors="pt").to(device)
        with torch.no_grad():
            last_hidden = embed_model(**inputs).last_hidden_state
            # Broadcast the mask over the hidden dimension so padded
            # positions contribute nothing to the sum.
            attention_mask = inputs.attention_mask.unsqueeze(-1)
            sum_embeddings = (last_hidden * attention_mask).sum(dim=1)
            # Number of real tokens per text; clamp guards against a
            # division by zero for a row made only of padding.
            lengths = attention_mask.sum(dim=1).clamp(min=1)
            pooled_chunks.append(sum_embeddings / lengths)
    return torch.cat(pooled_chunks, dim=0)

# Collect the raw review strings of both splits.
train_texts = list(train_data["text"])
test_texts = list(test_data["text"])

# Each entry is (prefix, texts, human-readable name). Training reviews are
# embedded with the "passage: " prefix, test reviews with "query: ".
text_sets = [("passage: ", train_texts, "Training"), ("query: ", test_texts, "Test")]
embeddings = [
    encode_texts(split_texts, split_prefix)
    for split_prefix, split_texts, _split_name in tqdm(text_sets, desc="Computing embeddings")
]
train_embs, test_embs = embeddings

# Host-side NumPy copies used by the selection routines.
train_embs_np = train_embs.cpu().numpy()
test_embs_np = test_embs.cpu().numpy()
N, d = train_embs_np.shape  # number of train examples, embedding dimension
M = len(test_embs_np)       # number of test examples

def select_ceil_simple(z, K, lambda_val):
    """Greedily select up to K demonstration indices using the CEIL Simple (No Log) rule.

    In every round each not-yet-selected training embedding ``x`` is scored as
    ``x.z + lambda_val * (1 + x^T V_inv x)`` and the arg-max is selected.
    ``V_inv`` starts as ``I / lambda_val`` and is updated after every pick.

    Args:
        z: 1-D query embedding with the same width as ``train_embs_np``.
        K: Maximum number of indices to select.
        lambda_val: Weight of the second score term; also sets the initial
            ``V_inv``. Must be non-zero.

    Returns:
        List of row indices into ``train_embs_np`` in selection order. The
        list is shorter than K when the pool holds fewer than K rows.
    """
    # The pool size is read from the matrix itself so it cannot drift out of
    # sync with a separately maintained global counter.
    n_pool, d = train_embs_np.shape
    V_inv = (1.0 / lambda_val) * np.eye(d)
    selected = []
    remaining = np.ones(n_pool, dtype=bool)
    for _ in range(K):
        # Stop once the pool is exhausted (same guard as select_submodular);
        # without it np.argmax would raise on an empty score array.
        if not np.any(remaining):
            break
        X_rem = train_embs_np[remaining]
        scores_query = X_rem.dot(z)
        # Row-wise quadratic form x^T V_inv x for every remaining candidate.
        B = X_rem.dot(V_inv)
        xTvix = np.sum(B * X_rem, axis=1)
        scores = scores_query + lambda_val * (1.0 + xTvix)
        idx = np.argmax(scores)
        # Map the position inside the filtered view back to a global row id.
        global_idx = np.nonzero(remaining)[0][idx]
        selected.append(global_idx)
        remaining[global_idx] = False
        # Sherman-Morrison rank-one update: V_inv <- (V + x x^T)^-1.
        x_sel = train_embs_np[global_idx]
        u = V_inv.dot(x_sel)
        V_inv = V_inv - np.outer(u, u) / (1.0 + np.dot(x_sel, u))
    return selected

def select_submodular(z, K, lambda_val):
    """Greedy demonstration selection with the Submodular (No Log) rule.

    Every round scores each unselected training embedding ``x`` as
    ``(x^T V_inv z)^2 / (1 + x^T V_inv x) + lambda_val * (1 + x^T V_inv x)``,
    takes the best one, and folds it into ``V_inv``. Selection ends after K
    rounds or as soon as no candidate is left.

    Returns:
        List of selected row indices of ``train_embs_np`` in pick order.
    """
    dim = train_embs_np.shape[1]
    inv_cov = (1.0 / lambda_val) * np.eye(dim)
    available = np.ones(N, dtype=bool)
    chosen = []
    for _round in range(K):
        candidate_ids = np.nonzero(available)[0]
        if candidate_ids.size == 0:
            # Pool exhausted before K picks were made.
            break
        candidates = train_embs_np[available]
        projected = candidates.dot(inv_cov)
        # Per-candidate quadratic form x^T V_inv x.
        quad = (projected * candidates).sum(axis=1)
        denom = 1.0 + quad
        # Per-candidate x^T V_inv z.
        relevance = projected.dot(z)
        scores = (relevance * relevance) / denom + lambda_val * denom
        winner = candidate_ids[np.argmax(scores)]
        chosen.append(winner)
        available[winner] = False
        # Sherman-Morrison rank-one update of the inverse with the new pick.
        picked = train_embs_np[winner]
        u = inv_cov.dot(picked)
        inv_cov = inv_cov - np.outer(u, u) / (1.0 + picked.dot(u))
    return chosen

# Causal language model used for in-context prediction; the tokenizer and the
# weights are loaded from the same checkpoint id.
llm_id = "EleutherAI/gpt-neo-2.7B"
tokenizer_llm = AutoTokenizer.from_pretrained(llm_id)
model_llm = AutoModelForCausalLM.from_pretrained(llm_id)
model_llm = model_llm.to(device)

# Class names ordered so that the list position is the integer label id,
# plus the reverse lookup from name to id.
label_names = ["very negative", "negative", "neutral", "positive", "very positive"]
label_to_id = dict(zip(label_names, range(len(label_names))))

# Modified function to predict label and return additional information
def predict_label_for_test(test_index, demo_indices):
    """Build a few-shot prompt, greedily decode a few tokens and map them to a label id.

    Args:
        test_index: Row position of the review to classify in ``test_data``.
        demo_indices: Row positions in ``train_data`` used as demonstrations,
            in the order they should appear in the prompt.

    Returns:
        A 3-tuple ``(label_id, prompt, generated)``: the predicted label id,
        the full prompt string, and the raw decoded continuation. When no
        label name can be recognised in the continuation the prediction
        falls back to "neutral".
    """
    demo_blocks = []
    for idx in demo_indices:
        text = train_data["text"][idx]
        label = label_names[train_data["label"][idx]]
        demo_blocks.append(f"Review: {text}\nSentiment: {label}\n")
    test_text = test_data["text"][test_index]
    # The prompt stops right after the colon. Ending on a space would make the
    # tokenizer emit a stand-alone space token, so the model would have to
    # continue with a label that has no leading space, unlike every
    # demonstration above it.
    prompt = "".join(demo_blocks) + f"Review: {test_text}\nSentiment:"

    # Tokenize once and reuse the encoding for generation and for locating
    # the start of the newly generated tokens.
    inputs = tokenizer_llm(prompt, return_tensors="pt").to(device)
    prompt_len = len(inputs["input_ids"][0])
    # Greedy decoding. pad_token_id is passed explicitly to avoid the
    # per-call "Setting pad_token_id to eos_token_id" notice; early_stopping
    # is omitted because it only affects beam search.
    output = model_llm.generate(**inputs,
                                max_new_tokens=3,
                                do_sample=False,
                                pad_token_id=tokenizer_llm.eos_token_id)
    generated = tokenizer_llm.decode(output[0][prompt_len:], skip_special_tokens=True)

    pred = generated.strip().lower()
    # Try longer names first so "very positive" is not mistaken for its
    # substring "positive" (same for "very negative" / "negative").
    candidates = sorted(label_names, key=len, reverse=True)
    predicted_label = next((name for name in candidates if pred.startswith(name)), None)
    if predicted_label is None:
        predicted_label = next((name for name in candidates if name in pred), None)
    if predicted_label is None:
        predicted_label = "neutral"
    return label_to_id[predicted_label], prompt, generated

# Evaluate on test set with progress bar and write to CSV
lambda_val = 1.0
M_eval = min(400, M)
correct_ceil = 0
correct_submod = 0

# The prompts written below contain free-form review text, so the file is
# opened with an explicit UTF-8 encoding instead of the platform default.
with open('results.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['ceil_indices', 'submod_indices', 'ceil_prompt', 'submod_prompt', 'ceil_output', 'submod_output'])
    for t in tqdm(range(M_eval), desc="Evaluating test samples"):
        q_emb = test_embs_np[t]
        # Both selectors get the same query embedding, budget (10 demos) and lambda.
        ceil_indices = select_ceil_simple(q_emb, K=10, lambda_val=lambda_val)
        submod_indices = select_submodular(q_emb, K=10, lambda_val=lambda_val)
        pred_ceil, prompt_ceil, generated_ceil = predict_label_for_test(t, ceil_indices)
        pred_submod, prompt_submod, generated_submod = predict_label_for_test(t, submod_indices)
        true_label = test_data["label"][t]
        if pred_ceil == true_label:
            correct_ceil += 1
        if pred_submod == true_label:
            correct_submod += 1
        # Convert indices to comma-separated strings and write to CSV
        ceil_indices_str = ','.join(map(str, ceil_indices))
        submod_indices_str = ','.join(map(str, submod_indices))
        writer.writerow([ceil_indices_str, submod_indices_str, prompt_ceil, prompt_submod, generated_ceil, generated_submod])

# Fraction of the M_eval evaluated test reviews that each method got right.
acc_ceil = correct_ceil / M_eval
acc_submod = correct_submod / M_eval
print()
print()
print(f"Accuracy with CEIL Simple (lambda={lambda_val}): {acc_ceil:.3f}")
print(f"Accuracy with Submodular (lambda={lambda_val}): {acc_submod:.3f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/421 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


train.jsonl:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

dev.jsonl:   0%|          | 0.00/171k [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/343k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8544 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1101 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2210 [00:00<?, ? examples/s]

DATASET LOADED!


tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Computing embeddings: 100%|██████████| 2/2 [00:22<00:00, 11.32s/it]


tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating test samples:   0%|          | 1/400 [00:04<31:08,  4.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating test samples:   0%|          | 2/400 [00:10<34:07,  5.15s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating test samples:   1%|          | 3/400 [00:13<30:01,  4.54s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating test samples:   1%|          | 4/400 [00:18<28:47,  4.36s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene



Accuracy with CEIL Simple (lambda=1.0): 0.265
Accuracy with Submodular (lambda=1.0): 0.258





In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Sweep lambda from 0.5 to 2.0 with step 0.1.
# linspace fixes both endpoints and the number of points, which avoids the
# endpoint ambiguity of arange with a fractional step.
lambda_values = np.linspace(0.5, 2.0, 16)
acc_ceil_list = []
acc_submod_list = []

M_eval = min(400, M)  # use 400 test examples (or full test set if smaller)
for lambda_val in lambda_values:
    correct_ceil = 0
    correct_submod = 0
    for t in range(M_eval):
        q_emb = test_embs_np[t]
        # Select demos for each method with current lambda
        ceil_idxs = select_ceil_simple(q_emb, K=10, lambda_val=lambda_val)
        submod_idxs = select_submodular(q_emb, K=10, lambda_val=lambda_val)
        # predict_label_for_test returns (label_id, prompt, generated_text).
        # Only the label id is comparable with the integer gold label;
        # comparing the whole tuple would never match.
        pred_ceil = predict_label_for_test(t, ceil_idxs)[0]
        pred_submod = predict_label_for_test(t, submod_idxs)[0]
        true_label = test_data["label"][t]
        if pred_ceil == true_label:
            correct_ceil += 1
        if pred_submod == true_label:
            correct_submod += 1
    # Record accuracy for this lambda
    acc_ceil = correct_ceil / M_eval
    acc_submod = correct_submod / M_eval
    acc_ceil_list.append(acc_ceil)
    acc_submod_list.append(acc_submod)
    print(f"lambda={lambda_val:.1f}: CEIL acc={acc_ceil:.3f}, Submod acc={acc_submod:.3f}")

# Plot accuracy vs lambda for both methods
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(lambda_values, acc_ceil_list, marker='o', label='CEIL Simple (No Log)')
ax.plot(lambda_values, acc_submod_list, marker='s', label='Submodular (No Log)')
ax.set_title('Accuracy vs. Diversity Weight (λ) on SST-5')
ax.set_xlabel('Diversity weight λ')
ax.set_ylabel('Accuracy')
ax.set_ylim(0.0, 1.0)
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

KeyboardInterrupt: 