# No-Data Algorithm 

>Applications to West Frisian

Implementation of the second experiment of the paper, where we take a low-resource language and a standard rubric and show that this thing can detect lies.

In [1]:
# Shared functions/imports that you'll need
import json
import random
from tqdm import tqdm

from llmclient import LLMClient, get_llm_response

from no_data_utils import print_metrics
from no_data_prompts import (get_evaluator_prompt_single_criteria,
                             get_evaluator_prompt_all_criteria,
                             ALL_CRIT_DEFAULT_RESPONSE)


random.seed(123)
rounds = 3 # EV Rounds

def retrieve(prompt, llm, DEFAULT_RESPONSE={"Label": 0}, return_raw=False):
    '''
    Get the response from the LLM, accounting for failures and returning default responses as appropriate.
    raw is for debugging only.
    '''
    response = None
    tmp_response = None
    tries = 0
    while True:
        if tries > 5: break
        try:
            response = get_llm_response(llm, prompt)
            tmp_response = response
            response = response.replace("```json", "").replace("```", "")
            response = json.loads(response.strip(), strict=False)
            if response is not None:
                break
        except:
            response = None
            tries += 1
    if response is None:
        if return_raw:
            return DEFAULT_RESPONSE, tmp_response
        return DEFAULT_RESPONSE, True
    return response, False


nl_data = json.load(open("west_frisian_dataset_with_rubric_shuffled.json", "r", encoding="utf-8")) #nl stands for "natural-language", not the language code for Dutch (nl-NL)
other_exemplars = json.load(open("other_frisian_exemplars.json", "r", encoding="utf-8"))
good_exemplars = json.load(open("good_frisian_exemplars.json", "r", encoding="utf-8"))

# Ok so we use train for test and test to generate exemplars. 
# Code is too spaghetti to fix now
train_data, test_data = nl_data[500:], nl_data[:500]

## Baseline -- The Known Case

We need to get the known cases. From here we'll characterise the model and note that this is the absolute maximum a model can get _if_ you knew the data.
We will test multiple prompts.

### All criteria

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def baseline_all_criteria(models, use_other=False):
    suff = "other_rubric" if use_other else "good_rubric"

    for MODEL in models:
        params = {"max_tokens": 128, "temperature": 0}
        llm = LLMClient(params, MODEL)
        labels = []
        for ix in tqdm(range(len(train_data)), desc=MODEL):
            entry = train_data[ix]
            pred_entry = {k:v for k, v in entry.items()}

            # No breakdown
            llm.update_params(params)
            prompt = get_evaluator_prompt_all_criteria(entry, request_breakdown=False, 
                                                       request_reasons=False, use_other=use_other)
            response, _ = retrieve(prompt, llm)
            labels.append(response["Label"])
            pred_entry["NoBreakdown"] = response
            # Breakdown no reasons
            llm.update_params(params)
            prompt = get_evaluator_prompt_all_criteria(entry, request_breakdown=True, 
                                                       request_reasons=False, use_other=use_other)
            response, _ = retrieve(prompt, llm, DEFAULT_RESPONSE={"c1": 0, "c2a": 0, "c2b": 0, "c3": 0, "c4": 0, "c5": 0})
            pred_entry["BreakdownNoReasons"] = response

            # Breakdown + reasons
            prompt = get_evaluator_prompt_all_criteria(entry, request_breakdown=True, 
                                                       request_reasons=True, use_other=use_other)
            response, _ = retrieve(prompt, llm, DEFAULT_RESPONSE=ALL_CRIT_DEFAULT_RESPONSE)
            pred_entry["BreakdownReasons"] = response

            with open(f'{MODEL}_all_crit_{suff}.json', "a", encoding="utf-8") as f:
                f.write(json.dumps(pred_entry, ensure_ascii=False) + "\n")


def score_all_crit(models):
    true_labels = []
    for e in train_data:
        true_labels.append(e["Label"])

    for model in models:
        for rub in [("all_crit_good_rubric", "IF"), ("all_crit_other_rubric", "OOF")]:
            print(f" --- {rub[-1]} ---")
            pred_data = [json.loads(l) for l in open(f"{model}_{rub[0]}.json", "r", encoding="utf-8").readlines()]
            breakdown_no_reasons, breakdown_reasons = {}, {}
            no_breakdown = []
            breakdown_no_reasons = []
            breakdown_reasons = []

            for e in pred_data:
                no_breakdown.append(e["NoBreakdown"]["Label"])
                flip1, flip2 = False, False
                for crit in ["c1", "c2a", "c2b", "c3", "c4", "c5"]:
                    if e["BreakdownNoReasons"][crit] == 0:
                        flip1 = True
                        break
                    if e["BreakdownReasons"][crit] == 0:
                        flip2 = True
                        break
                breakdown_no_reasons.append(0 if flip1 else 1)
                breakdown_reasons.append(0 if flip2 else 1)

            print(model)
            print("--- no breakdown ---")
            acc = accuracy_score(no_breakdown, true_labels)
            f1 = f1_score(true_labels, no_breakdown)
            print(acc, f1)

            print("--- breakdown no reasons ---")
            acc = accuracy_score(breakdown_no_reasons, true_labels)
            f1 = f1_score(true_labels, breakdown_no_reasons)
            print(acc, f1)

            print("--- breakdown reasons ---")
            acc = accuracy_score(breakdown_reasons, true_labels)
            f1 = f1_score(true_labels, breakdown_reasons)
            print(acc, f1)


models = ["gpt-41-longco-2025-04-14"]
baseline_all_criteria(models, use_other=False)
baseline_all_criteria(models, use_other=True)

score_all_crit(models)

dev-gpt-41-longco-2025-04-14: 100%|██████████| 515/515 [2:17:10<00:00, 15.98s/it]
dev-gpt-41-longco-2025-04-14: 100%|██████████| 515/515 [2:21:16<00:00, 16.46s/it]

 --- IF ---
dev-gpt-41-longco-2025-04-14
--- no breakdown ---
0.7631067961165049 0.8038585209003215
--- breakdown no reasons ---
0.5339805825242718 0.6816976127320955
--- breakdown reasons ---
0.46601941747572817 0.0
 --- OOF ---
dev-gpt-41-longco-2025-04-14
--- no breakdown ---
0.5009708737864078 0.007722007722007722
--- breakdown no reasons ---
0.4854368932038835 0.00749063670411985
--- breakdown reasons ---
0.5145631067961165 0.6719160104986877





### Per-criterion

In [None]:
def baseline_per_criteria(models, use_other=False):
    suff = "other_rubric" if use_other else "good_rubric"

    for MODEL in models:
        params = {"max_tokens": 128, "temperature": 0}
        llm = LLMClient(params, MODEL)
        labels, labels_reasons, true_labels = {}, {}, {}
        for ix in tqdm(range(len(train_data)), desc=MODEL):
            entry = train_data[ix]
            pred_entry = {k:v for k, v in entry.items()}

            pred_entry["NoReasons"] = {}
            for crit in ["c1", "c2a", "c2b", "c3", "c4", "c5"]:
                prompt = get_evaluator_prompt_single_criteria(entry, 
                                                              crit, 
                                                              request_reasons=False,
                                                              use_other=use_other)
                response, _ = retrieve(prompt, llm, DEFAULT_RESPONSE={crit: 0})
                pred_entry["NoReasons"][crit] = response[crit]

                if crit not in labels: labels[crit] = []
                labels[crit].append(pred_entry["NoReasons"][crit])
                if crit not in true_labels: true_labels[crit] = []
                true_labels[crit].append(entry["Rubric"][crit])

            pred_entry["Reasons"] = {}
            for crit in ["c1", "c2a", "c2b", "c3", "c4", "c5"]:
                # Breakdown no reasons
                prompt = get_evaluator_prompt_single_criteria(entry, 
                                                              crit, request_reasons=True,
                                                              use_other=use_other)
                response, _ = retrieve(prompt, llm, DEFAULT_RESPONSE={crit: 0, crit + "_reason": "FAIL"})
                pred_entry["Reasons"][crit] = response[crit]

                if crit not in labels_reasons: labels_reasons[crit] = []
                labels_reasons[crit].append(pred_entry["Reasons"][crit])

            with open(f'{MODEL}_per_crit_{suff}.json', "a", encoding="utf-8") as f:
                f.write(json.dumps(pred_entry, ensure_ascii=False) + "\n")


def score_per_criteria(models):

    true_labels_per_crit = {k: [] for k in ["c1", "c2a", "c2b", "c3", "c4", "c5"]}
    true_labels = []
    for e in train_data:
        true_labels.append(e["Label"])
        for crit in ["c1", "c2a", "c2b", "c3", "c4", "c5"]:
            true_labels_per_crit[crit].append(e["Rubric"][crit])

    for model in models:
        for rub in [("per_crit_good_rubric", "IF"), ("per_crit_other_rubric", "OOF")]:
            print(f" ==== {rub[-1]} ===")
            pred_data = [json.loads(l) for l in open(f"{model}_{rub[0]}.json", "r", encoding="utf-8").readlines()]

            print(f"------- {model} ---------")
            no_reasons, reasons = [], []
            preds_per_crit_reasons = {k: [] for k in ["c1", "c2a", "c2b", "c3", "c4", "c5"]}
            preds_per_crit_no_reasons = {k: [] for k in ["c1", "c2a", "c2b", "c3", "c4", "c5"]}
            for e in pred_data:
                # Label prediction
                flip1, flip2 = False, False
                for crit in ["c1", "c2a", "c2b", "c3", "c4", "c5"]:
                    preds_per_crit_reasons[crit].append(e["Reasons"][crit])
                    preds_per_crit_no_reasons[crit].append(e["NoReasons"][crit])
                    if e["Reasons"][crit] == 0:
                        flip1 = True
                    if e["NoReasons"][crit] == 0:
                        flip2 = True
                reasons.append(0 if flip1 else 1)
                no_reasons.append(0 if flip2 else 1)

            print("--- no reasons ---")
            acc = round(accuracy_score(no_reasons, true_labels), 2)
            f1 =  round(f1_score(no_reasons, true_labels), 2)
            for k, v in preds_per_crit_no_reasons.items():
                c_acc = round(accuracy_score(preds_per_crit_no_reasons[k], 
                                            true_labels_per_crit[k]), 2)
                c_f1 = round(f1_score(preds_per_crit_no_reasons[k], 
                                    true_labels_per_crit[k]), 2)
                print(f" --- {k}: {c_acc}, {c_f1}")

            print("--- reasons ---")
            acc = round(accuracy_score(reasons, true_labels), 2)
            f1 = round(f1_score(reasons, true_labels), 2)
            print(acc, f1)
            for k, v in preds_per_crit_reasons.items():
                c_acc = round(accuracy_score(preds_per_crit_reasons[k], 
                                            true_labels_per_crit[k]), 2)
                c_f1 = round(f1_score(preds_per_crit_reasons[k], 
                                    true_labels_per_crit[k]), 2)
                print(f" --- {k}: {c_acc}, {c_f1}")


models = ["dev-gpt-41-longco-2025-04-14"]
baseline_per_criteria(models, use_other=False)
baseline_per_criteria(models, use_other=True)
score_per_criteria(models)

dev-gpt-41-longco-2025-04-14: 100%|██████████| 515/515 [3:15:44<00:00, 22.81s/it]
dev-gpt-41-longco-2025-04-14: 100%|██████████| 515/515 [2:48:29<00:00, 19.63s/it]

 ==== IF ===
------- dev-gpt-41-longco-2025-04-14 ---------
--- no reasons ---
 --- c1: 0.97, 0.99
 --- c2a: 0.81, 0.89
 --- c2b: 0.86, 0.91
 --- c3: 0.93, 0.96
 --- c4: 0.95, 0.97
 --- c5: 0.85, 0.92
--- reasons ---
0.8 0.81
 --- c1: 0.95, 0.97
 --- c2a: 0.75, 0.86
 --- c2b: 0.85, 0.9
 --- c3: 0.92, 0.96
 --- c4: 0.95, 0.97
 --- c5: 0.8, 0.89
 ==== OOF ===
------- dev-gpt-41-longco-2025-04-14 ---------
--- no reasons ---
 --- c1: 0.03, 0.02
 --- c2a: 0.68, 0.81
 --- c2b: 0.2, 0.02
 --- c3: 0.83, 0.91
 --- c4: 0.87, 0.91
 --- c5: 0.11, 0.12
--- reasons ---
0.5 0.0
 --- c1: 0.03, 0.02
 --- c2a: 0.67, 0.8
 --- c2b: 0.19, 0.01
 --- c3: 0.84, 0.91
 --- c4: 0.71, 0.78
 --- c5: 0.15, 0.2





# No-Data Algorithm

### Shared stuff

In [2]:
from no_data_prompts import (get_generator_prompt,
                             get_evaluator_prompt_all_criteria,
                             get_evaluator_prompt_single_criteria,
                             good_rubric_nl,
                             other_rubric_nl,
                             get_generator_prompt_with_picker)


def no_data_algorithm_with_flips(X, evaluator, generator, max_rounds=3, phi=0.3, 
                                 use_other=False):
    '''
    The actual implementation of the No-Data algorithm, flipping labels.
    '''
    labels = []
    flips = []
    successes = []
    for i in tqdm(range(len(X))):
        x = X[i]
        success, label, reason = ev_protocol(x, evaluator, generator, max_rounds, 
                                             use_other=use_other)
        if success:
            flips.append([False, reason] if "Random" not in reason else [True, reason])
            labels.append(label)
            successes.append(1)
        else:
            successes.append(0)
            if random.uniform(0, 1) < phi:
                labels.append(label)
                flips.append([False, "Random flip"])
            else:
                label = not label
                labels.append(label)
                flips.append([True, "Random flip"])

    return labels, successes, flips


def ev_protocol(x, evaluator, generator, max_rounds, use_other=False):
    ''' 
    Quick and dirty implementation of the EV Protocol. In here, 
    we understand lying as generating a datapoint that is not within
    the correct phenomenon. 
    '''
    # Overwrite the criteria in the criterion-free style
    x = label_criteria_for(x, use_other=False)

    for _ in range(max_rounds):
        # Step 1: generate x \cong x' based on the belief of y
        y_tilde = evaluator(x, call_verifier=False, use_other=use_other)
        # Always generate with the rubric you know
        x_tilde = generator(x, y_tilde, use_other=use_other)
        # Step 2: get the challenge
        check = random.choice([check1, check2])
        # Step 3: git checkd
        success, reason = check(x, x_tilde, y_tilde=y_tilde, use_other=use_other, 
                                evaluator=evaluator)
        # Last bugfix: if the label is zero, the checks should fail... right?
        if not success: break
    return success, y_tilde, (reason, x, x_tilde)


In [3]:
def label_criteria_for(x, use_other=False):
    crits = {}
    for i, c in enumerate(["c1", "c2a", "c2b", "c3", "c4", "c5"]):
        prompt = get_evaluator_prompt_single_criteria(entry=x, 
                                                      criterion=c, 
                                                      request_reasons=True, 
                                                      use_other=use_other)
        label_x, _ = retrieve(prompt, llm_verifier,  
                              DEFAULT_RESPONSE={c: 0, f"{c}_reason": "FAIL"})
        crits[c] = label_x[c]
    x["Rubric"] = crits
    return x


def check1(x, x_tilde, y_tilde=None, evaluator=None, round=None, use_other=False):
    ''' 
    Assert that a subset matching the criteria exists in both
    x_tilde and x.
    I.e., verify the isomorphism for IF. 
    This one is trickier to implement because we need to know in advance whether 
    the rubric is decomposable.
    '''
    faileds = 0
    for i, c in enumerate(["c1", "c2a", "c2b", "c3", "c4", "c5"]):
        if use_other and c == "c3":
            if int(x_tilde["Rubric"][c]) != int(x["Rubric"]["c2a"]^x["Rubric"]["c2b"]):
                faileds += 1
        else:
            if int(x["Rubric"][c]) != int(x_tilde["Rubric"][c]):
                faileds += 1

    if faileds > 0:
        return False, f"Ch1 Failed {faileds} times"
    return True, ""


def check2(x, x_tilde, y_tilde, evaluator=None, round=None, use_other=False):
    ''' 
    Assert that the output of the criteria matches in both cases.
    Take in an estimated and given datapoints x_tilde, x, and a rubric.
    Since we do not know the label, we exact match (up to MAX_C2_ALLOWABLE_FAILS) the criterion.
    Return True/False and the reason.
    '''
    def get_encoding(c):
        enc = "".join([str(v) for k, v in c.items() if "reason" not in k])
        return enc

    faileds = 0    
    tilde_encoding = get_encoding(x_tilde["Rubric"])

    for i, c in enumerate(["c1", "c2a", "c2b", "c3", "c4", "c5"]):
        label_x = x["Rubric"]
        if int(label_x[c]) != int(tilde_encoding[i]):
            faileds += 1

    if faileds > 0:
        return False, f"Ch1 Failed {faileds} times"
    return True, ""


def get_score_from_crits(r):
    label = 1
    for crit in ["c1", "c2a", "c2b", "c3", "c4", "c5"]:
        if r[crit] == 0:
            label = 0
            break
    return label


def llm_evaluator_fn(x, call_verifier=False, use_other=False):
    """ Take in a datapoint, return an estimated label y_tilde.
    General purpose function to get a label using the (unknown) aggregator function.
    Note that the LLM 'knows' this aggregator.
    This is the EVALUATOR estimating the label, NOT the verifier
    """
    
    if not call_verifier:
        prompt = get_evaluator_prompt_all_criteria(x,
                                                   request_breakdown=True,
                                                   request_reasons=True, 
                                                   use_other=use_other)
        response, _ = retrieve(prompt, llm_evaluator, DEFAULT_RESPONSE=ALL_CRIT_DEFAULT_RESPONSE)
    else:
        prompt = get_evaluator_prompt_all_criteria(x,
                                                   request_breakdown=True,
                                                   request_reasons=True, 
                                                   use_other=False) # Never true for the verifier
        response, _ = retrieve(prompt, llm_verifier, DEFAULT_RESPONSE=ALL_CRIT_DEFAULT_RESPONSE)

    label_x = get_score_from_crits(response)
    return label_x


def llm_generator_fn(x, y_tilde, use_other=False):
    """ Take in a datapoint, estimated label y_tilde, and a rubric in the form of "use other".
    Return an x_tilde.
    """
    default_response = {
        "Prompt": "",
        "Output": ""
    }
    # First get the criteria, according to itself
    crits = {}
    for i, c in enumerate(["c1", "c2a", "c2b", "c3", "c4", "c5"]):
        prompt = get_evaluator_prompt_single_criteria(entry=x, 
                                                      criterion=c, 
                                                      request_reasons=True, 
                                                      use_other=use_other) 
        label_x, fail_state = retrieve(prompt, llm_evaluator, DEFAULT_RESPONSE={c: 0, f"{c}_reason": "FAIL"})
        if fail_state: print(f"evaluator fail {c}")
        crits[c] = label_x[c]
        crits[c + "_reason"] = label_x[c + "_reason"]

    prompt = get_generator_prompt(x, y_tilde,
                                  criteria=crits, 
                                  use_other=use_other)
    response, fail_state = retrieve(prompt, llm_generator, DEFAULT_RESPONSE=default_response)
    if "Output" not in response or "Prompt" not in response:
        response = default_response
    response["Rubric"] = crits
    response["Label"] = y_tilde
    return response


### Experiments

In [None]:
verifier_model = "gpt-41-longco-2025-04-14"
evaluator_model = "gpt-41-longco-2025-04-14"

params = {"max_tokens": 256, "temperature": 0}
llm_evaluator = LLMClient(params, evaluator_model)
params = {"max_tokens": 5000, "temperature": 0}
llm_generator = LLMClient(params, evaluator_model)
params = {"max_tokens": 256, "temperature": 0}
llm_verifier = LLMClient(params, verifier_model)

In [5]:
phi = 0.7 

good_flips, other_flips = 0, 0
test_set = [(None, p["Label"]) for p in train_data]
good_flips, other_flips = None, None

good_labels, good_successes, good_flips = no_data_algorithm_with_flips(train_data, 
                                                evaluator=llm_evaluator_fn,
                                                generator=llm_generator_fn,
                                                phi=phi,
                                                max_rounds=rounds,
                                                use_other=False)
print_metrics(good_labels, good_successes, test_set, "In", good_flips)

other_labels, other_successes, other_flips = no_data_algorithm_with_flips(train_data, 
                                                  evaluator=llm_evaluator_fn,
                                                  generator=llm_generator_fn,
                                                  phi=phi,
                                                  max_rounds=rounds,
                                                  use_other=True)

print_metrics(other_labels, other_successes, test_set, "Out", other_flips)

100%|██████████| 515/515 [12:15:54<00:00, 85.74s/it]   


In-phenomenon test score: 0.744 | Successes: 86.796 | F1: 78.073 | Flips: 4.854


100%|██████████| 515/515 [9:29:35<00:00, 66.36s/it]   

Out-phenomenon test score: 0.491 | Successes: 1.165 | F1: 39.07 | Flips: 33.592





In [None]:
phi = 0.3

good_flips, other_flips = 0, 0
test_set = [(None, p["Label"]) for p in train_data]
good_flips, other_flips = None, None

good_labels, good_successes, good_flips = no_data_algorithm_with_flips(train_data, 
                                                evaluator=llm_evaluator_fn,
                                                generator=llm_generator_fn,
                                                phi=phi, 
                                                use_other=False,
                                                max_rounds=rounds)
print_metrics(good_labels, good_successes, test_set, "In", good_flips)

other_labels, other_successes, other_flips = no_data_algorithm_with_flips(train_data, 
                                                                        evaluator=llm_evaluator_fn,
                                                                        generator=llm_generator_fn,
                                                                        phi=phi,
                                                                        max_rounds=rounds,
                                                                        use_other=True)

print_metrics(other_labels, other_successes, test_set, "Out", other_flips)

100%|██████████| 515/515 [12:57:41<00:00, 90.61s/it]   


In-phenomenon test score: 0.724 | Successes: 87.767 | F1: 76.568 | Flips: 9.32


100%|██████████| 515/515 [9:34:32<00:00, 66.94s/it]   

Out-phenomenon test score: 0.493 | Successes: 1.359 | F1: 57.835 | Flips: 70.291





In [None]:
phi = 0.9

good_flips, other_flips = 0, 0
test_set = [(None, p["Label"]) for p in train_data]
good_flips, other_flips = None, None

good_labels, good_successes, good_flips = no_data_algorithm_with_flips(train_data, 
                                                evaluator=llm_evaluator_fn,
                                                generator=llm_generator_fn,
                                                phi=phi, 
                                                use_other=False,
                                                max_rounds=rounds)

print_metrics(good_labels, good_successes, test_set, "In", good_flips)

other_labels, other_successes, other_flips = no_data_algorithm_with_flips(train_data, 
                                                  evaluator=llm_evaluator_fn,
                                                  generator=llm_generator_fn,
                                                  phi=phi,
                                                  use_other=True,
                                                  max_rounds=rounds)

print_metrics(other_labels, other_successes, test_set, "Out", other_flips)

100%|██████████| 515/515 [12:08:51<00:00, 84.92s/it]   


In-phenomenon test score: 0.769 | Successes: 86.214 | F1: 79.796 | Flips: 1.748


100%|██████████| 515/515 [9:28:20<00:00, 66.22s/it]   

Out-phenomenon test score: 0.507 | Successes: 1.165 | F1: 16.447 | Flips: 9.126



