In [1]:
!pip install datasets sentence-transformers transformers tqdm

import re
import numpy as np
import torch
from datasets import load_dataset
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM



In [2]:
# ---------- Key Functions for Experiment (Cell 2) ----------

def build_dpp_kernel(demo_embeds, test_embed, scale_factor=1.0):
    """
    Constructs the DPP kernel matrix L' given demo embeddings and a test embedding.
    L'[i,j] = r[i] * sim_mat[i,j] * r[j], where
    r[i] = dot(demo_embeds[i], test_embed) and
    sim_mat = demo_embeds @ demo_embeds.T.
    """
    n = len(demo_embeds)
    r = np.einsum("ij,j->i", demo_embeds, test_embed)  # relevance scores
    sim_mat = demo_embeds @ demo_embeds.T                # similarity among demos
    Lprime = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            Lprime[i, j] = r[i] * sim_mat[i, j] * r[j]
    return Lprime

def dpp_map_inference(Lprime, k_select=4):
    """
    Greedy MAP inference for DPP.
    Iteratively selects demos that maximize the incremental gain in log-determinant.
    """
    n = Lprime.shape[0]
    selected = []
    current_logdet = 0.0

    def submatrix_logdet(indices):
        if len(indices) == 0:
            return 0.0
        subm = Lprime[np.ix_(indices, indices)]
        return np.linalg.slogdet(subm)[1]  # logabsdet

    candidates = set(range(n))
    for _ in range(k_select):
        best_gain = -1e9
        best_item = None
        for i in candidates:
            new_set = selected + [i]
            gain = submatrix_logdet(new_set) - current_logdet
            if gain > best_gain:
                best_gain = gain
                best_item = i
        selected.append(best_item)
        candidates.remove(best_item)
        current_logdet += best_gain
    return selected

# Original submodular method with logarithm term
def submodular_diverse_select(demo_embeds, test_embed, k=4, lambd=1.0):
    """
    Selects demos using a λ-weighted rule that promotes both relevance and diversity.
    Score = (test_embed^T * V_S^-1 * x)^2 / (1 + x^T * V_S^-1 * x) + λ * log(1 + x^T * V_S^-1 * x)
    V_S is updated iteratively starting from lambd*I.
    """
    n = demo_embeds.shape[0]
    d = demo_embeds.shape[1]
    selected = []
    V_S = lambd * np.eye(d)
    candidate_indices = set(range(n))
    for _ in range(k):
        invV_S = np.linalg.inv(V_S)
        best_val = -1e9
        best_idx = None
        for i in candidate_indices:
            x = demo_embeds[i]
            num = (test_embed @ invV_S @ x)**2
            denom = 1.0 + (x @ invV_S @ x)
            score = num / denom + lambd * np.log(denom)
            if score > best_val:
                best_val = score
                best_idx = i
        selected.append(best_idx)
        x_sel = demo_embeds[best_idx]
        V_S = V_S + np.outer(x_sel, x_sel)
        candidate_indices.remove(best_idx)
    return selected

# New function: submodular method without the logarithm term
def submodular_diverse_select_no_log(demo_embeds, test_embed, k=4, lambd=1.0):
    """
    Selects demos using a λ-weighted rule that promotes both relevance and diversity.
    Score = (test_embed^T * V_S^-1 * x)^2 / (1 + x^T * V_S^-1 * x) + λ * (1 + x^T * V_S^-1 * x)
    V_S is updated iteratively starting from lambd*I.
    """
    n = demo_embeds.shape[0]
    d = demo_embeds.shape[1]
    selected = []
    V_S = lambd * np.eye(d)
    candidate_indices = set(range(n))
    for _ in range(k):
        invV_S = np.linalg.inv(V_S)
        best_val = -1e9
        best_idx = None
        for i in candidate_indices:
            x = demo_embeds[i]
            num = (test_embed @ invV_S @ x)**2
            denom = 1.0 + (x @ invV_S @ x)
            score = num / denom + lambd * denom
            if score > best_val:
                best_val = score
                best_idx = i
        selected.append(best_idx)
        x_sel = demo_embeds[best_idx]
        V_S = V_S + np.outer(x_sel, x_sel)
        candidate_indices.remove(best_idx)
    return selected

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
llm_model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")

def llm_classify_with_prompt(selected_demos, test_sentence):
    """
    Build a strict prompt using the selected demos and the test sentence.
    The prompt instructs the model to output only '0' for negative or '1' for positive.

    Prompt format:
      "Below are some movie review examples with their sentiment (output only '0' or '1'):
       1) <sentence> => <label>
       ...
       Now classify this new movie review:
       <test_sentence>
       Answer (0/1 only):"

    Returns the prompt and the LLM's output.
    """
    demo_str = ""
    for i, (sent, lbl) in enumerate(selected_demos, 1):
        demo_str += f"{i}) {sent} => {lbl}\n"
    prompt = (
        "Below are some movie review examples with their sentiment (output only '0' for negative and '1' for positive):\n"
        f"{demo_str}\n"
        "Now classify this new movie review:\n"
        f"{test_sentence}\n"
        "Please only give a number and nothing else."
        "Answer (0/1 only):"
    )
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        output_ids = llm_model.generate(inputs.input_ids, max_new_tokens=20, do_sample=False)
    result = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return prompt, result.strip()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# ---------- Cell 3: Synthetic Dataset & Inference Comparison with Powerful Embedder ----------

# Synthetic dataset: demonstration texts and labels
demo_texts = [
    "I absolutely loved this film, it was brilliant",
    "The movie was terrible and a waste of time",
    "What a wonderful and uplifting story for cat lovers",
    "I hated every second of this dull flick about soccer",
    "Brilliant performance by the lead actor in a silent drama",
    "A boring script that put me to sleep after 10 minutes",
    "Incredible visuals of outer space and a mesmerizing plot",
    "Lackluster direction with no redeeming qualities in the dialogue",
    "One of the best experiences in cinema for sci-fi enthusiasts",
    "Mediocre acting, nothing special here",
    "Loved the music and the cinematography, especially the violin solos",
    "The dialogue was cheesy and cringe-worthy, especially the romance bits",
]
demo_labels = [
    "positive", "negative", "positive", "negative", "positive", "negative",
    "positive", "negative", "positive", "negative", "positive", "negative"
]

# Synthetic test inputs (e.g., movie reviews)
test_inputs = [
    "The film was engaging but the pacing was off.",
    "I found the movie to be an utter disappointment.",
    "An absolute masterpiece with brilliant acting.",
    "The storyline was weak and the dialogues were cringe-worthy."
]

# Use an extremely powerful, high-dimensional embedder
# "all-roberta-large-v1" produces 1024-dim embeddings
embed_model_name = "sentence-transformers/all-roberta-large-v1"
embedder = SentenceTransformer(embed_model_name)
demo_embeddings = embedder.encode(demo_texts, convert_to_numpy=True)
print("Demo embeddings computed. Shape:", demo_embeddings.shape)

# Evaluate the demo selection methods on the synthetic test set.
for test_input in test_inputs:
    test_embed = embedder.encode([test_input], convert_to_numpy=True)[0]

    # Run CEIL's DPP MAP Inference
    Lprime = build_dpp_kernel(demo_embeddings, test_embed)
    dpp_indices = dpp_map_inference(Lprime, k_select=4)
    dpp_demos = [(demo_texts[i], demo_labels[i]) for i in dpp_indices]

    # Run Submodular Method (with log)
    submod_indices = submodular_diverse_select(demo_embeddings, test_embed, k=4, lambd=1.0)
    submod_demos = [(demo_texts[i], demo_labels[i]) for i in submod_indices]

    # Run Submodular Method (no log)
    submod_no_log_indices = submodular_diverse_select_no_log(demo_embeddings, test_embed, k=4, lambd=1.0)
    submod_no_log_demos = [(demo_texts[i], demo_labels[i]) for i in submod_no_log_indices]

    print("------------------------------------------------------------")
    print("Test Input:", test_input)
    print("CEIL DPP Selected Indices:", dpp_indices)
    print("CEIL DPP Demos:", dpp_demos)
    print("Submodular (with log) Selected Indices:", submod_indices)
    print("Submodular (with log) Demos:", submod_demos)
    print("Submodular (no log) Selected Indices:", submod_no_log_indices)
    print("Submodular (no log) Demos:", submod_no_log_demos)
    print("------------------------------------------------------------\n")


Demo embeddings computed. Shape: (12, 1024)
------------------------------------------------------------
Test Input: The film was engaging but the pacing was off.
CEIL DPP Selected Indices: [1, 10, 8, 7]
CEIL DPP Demos: [('The movie was terrible and a waste of time', 'negative'), ('Loved the music and the cinematography, especially the violin solos', 'positive'), ('One of the best experiences in cinema for sci-fi enthusiasts', 'positive'), ('Lackluster direction with no redeeming qualities in the dialogue', 'negative')]
Submodular (with log) Selected Indices: [1, 10, 5, 2]
Submodular (with log) Demos: [('The movie was terrible and a waste of time', 'negative'), ('Loved the music and the cinematography, especially the violin solos', 'positive'), ('A boring script that put me to sleep after 10 minutes', 'negative'), ('What a wonderful and uplifting story for cat lovers', 'positive')]
Submodular (no log) Selected Indices: [1, 10, 2, 5]
Submodular (no log) Demos: [('The movie was terrible 

In [4]:
from datasets import load_dataset
from tqdm import tqdm
import re

# Load SST2 dataset (from GLUE)
ds_sst2 = load_dataset("nyu-mll/glue", "sst2")
# Use first 1000 examples for demo set and next 1000 for test set
demo_set = ds_sst2["train"].select(range(1000))
test_set = ds_sst2["train"].select(range(1000, 1100))
print("Demo set size:", len(demo_set))
print("Test set size:", len(test_set))

# For SST2, the text is in "sentence" and label is either 0 or 1.
demo_texts = [ex["sentence"] for ex in demo_set]
demo_labels = [str(ex["label"]) for ex in demo_set]  # convert label to string for prompt use

# Compute embeddings for demo set using the high-dimensional embedder defined in Cell 2
demo_embeddings = embedder.encode(demo_texts, convert_to_numpy=True)
print("Demo embeddings computed. Shape:", demo_embeddings.shape)

# Evaluation: We'll use both methods to select demos and then use the LLM to classify test examples.
total_correct_dpp = 0
total_correct_submod = 0
total_correct_submod_nolog = 0
total_cases = len(test_set)

for instance in tqdm(test_set, desc="Evaluating SST2"):
    test_sentence = instance["sentence"]
    gt_label = instance["label"]  # 0 or 1 (as integer)
    test_embed = embedder.encode([test_sentence], convert_to_numpy=True)[0]

    # Method A: CEIL's DPP MAP Inference
    Lprime = build_dpp_kernel(demo_embeddings, test_embed)
    dpp_indices = dpp_map_inference(Lprime, k_select=4)
    dpp_demos = [(demo_texts[i], demo_labels[i]) for i in dpp_indices]
    _, dpp_output = llm_classify_with_prompt(dpp_demos, test_sentence)
    try:
        dpp_pred = int(re.findall(r"(\d+)", dpp_output)[-1])
    except:
        dpp_pred = None

    # Method B: Submodular Method (with log)
    submod_indices = submodular_diverse_select(demo_embeddings, test_embed, k=4, lambd=1.0)
    submod_demos = [(demo_texts[i], demo_labels[i]) for i in submod_indices]
    _, submod_output = llm_classify_with_prompt(submod_demos, test_sentence)
    try:
        submod_pred = int(re.findall(r"(\d+)", submod_output)[-1])
    except:
        submod_pred = None

    # Method C: Submodular Method (no log)
    submod_nolog_indices = submodular_diverse_select_no_log(demo_embeddings, test_embed, k=4, lambd=1.0)
    submod_nolog_demos = [(demo_texts[i], demo_labels[i]) for i in submod_nolog_indices]
    _, submod_nolog_output = llm_classify_with_prompt(submod_nolog_demos, test_sentence)
    try:
        submod_nolog_pred = int(re.findall(r"(\d+)", submod_nolog_output)[-1])
    except:
        submod_nolog_pred = None

    if dpp_pred is not None and dpp_pred == gt_label:
        total_correct_dpp += 1
    if submod_pred is not None and submod_pred == gt_label:
        total_correct_submod += 1
    if submod_nolog_pred is not None and submod_nolog_pred == gt_label:
        total_correct_submod_nolog += 1

    # print("------------------------------------------------------------")
    # print(f"Test Sentence: {test_sentence}")
    # print(f"Ground Truth Label: {gt_label}")
    # print("\nLLM Output (CEIL DPP Method):", dpp_output)
    # print("LLM Output (Submodular with log Method):", submod_output)
    # print("LLM Output (Submodular without log Method):", submod_nolog_output)
    # print("------------------------------------------------------------\n")

dpp_acc = 100 * total_correct_dpp / total_cases
submod_acc = 100 * total_correct_submod / total_cases
submod_nolog_acc = 100 * total_correct_submod_nolog / total_cases

print("\n============================================================")
print(f"Final Accuracy over {total_cases} test cases:")
print(f"  CEIL's DPP Method Accuracy: {dpp_acc:.2f}%")
print(f"  Submodular Method (with log) Accuracy: {submod_acc:.2f}%")
print(f"  Submodular Method (no log) Accuracy: {submod_nolog_acc:.2f}%")


Demo set size: 1000
Test set size: 100
Demo embeddings computed. Shape: (1000, 1024)


Evaluating SST2:   0%|          | 0/100 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_to


Final Accuracy over 100 test cases:
  CEIL's DPP Method Accuracy: 48.00%
  Submodular Method (with log) Accuracy: 49.00%
  Submodular Method (no log) Accuracy: 49.00%



