In [None]:
# Load IPython's autoreload extension and select mode 2, which reloads
# imported modules before each cell executes, so edits to the project
# package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import functools
import re

import pandas as pd
from IPython.display import display

from cryptic.evaluation.evaluations import GeorgeHoQA
from cryptic.models.oai import OpenAIQA
from cryptic.models.selectors import SelfConsistencySelector
from cryptic.models.validators import SubWordConsistencyValidator
from cryptic.prompts.interface import PromptInterface

In [None]:
# Path of the clue CSV handed to the GeorgeHoQA evaluation.
CSV_FILENAME = "data/examples/qc_1711.csv"
# Clue text used to smoke-test prompt injection.
DUMMY_CLUE = "A farmer's son"
# Passed to GeorgeHoQA as `num_clues` / `num_answers`; NUM_ANSWERS also bounds
# the per-answer display loop later in the notebook.
NUM_CLUES = 3
NUM_ANSWERS = 2

Check that the clue and the number of letters are injected into the prompt template correctly:

In [None]:
# Interface used by every later cell to build prompts and parse model output.
prompt_interface = PromptInterface("cryptic.prompts.cot.ben_v1")

# Letter count injected alongside the dummy clue. The value looks arbitrary:
# it only needs to be visible in the rendered prompt.
dummy_num_letters = 5
prompt = prompt_interface.inject_prompt(DUMMY_CLUE, dummy_num_letters)

# print() rather than a bare expression so line breaks in the prompt render.
print(prompt)

Check that answer extraction works correctly on the template's example output:

In [None]:
# Use the "example_output" entry of the prompt templates as input for the
# extraction check in the next cell.
out = prompt_interface.prompt_templates["example_output"]

In [None]:
# Parse the example output; the result is indexed by key in later cells
# (e.g. "wordplay").
res = prompt_interface.extract_answer(out)
res

In [None]:
# Run the interface's decomposition on the extracted "wordplay" field.
prompt_interface.decompose(res["wordplay"])

In [None]:
# Validator built on the same prompt interface; it is registered with the
# model under the name "consistent_decomposition".
validator = SubWordConsistencyValidator(prompt_interface)

In [None]:
# Evaluation over the clue CSV, configured with the notebook-level
# NUM_ANSWERS / NUM_CLUES settings.
evaluation = GeorgeHoQA(CSV_FILENAME, num_answers=NUM_ANSWERS, num_clues=NUM_CLUES)

In [None]:
# Keep each validator next to the name its result columns are selected by, so
# the two lists handed to the model cannot drift out of step.
validators_by_name = {"consistent_decomposition": validator}

# NOTE(review): "text-davinci-003" has been retired by OpenAI; confirm which
# model this should target before re-running.
model = OpenAIQA(
    prompt_interface=prompt_interface,
    model_name="text-davinci-003",
    max_tokens=64,
    validators=list(validators_by_name.values()),
    validator_names=list(validators_by_name),
)

In [None]:
# Peek at the first three rows of the frame backing the evaluation.
df = evaluation.qa_frame.df
df.head(3)

First feed a single clue through the API to look at model output and check that we are correctly parsing responses.

The parsing might need to be tweaked, e.g. by stripping newline and space characters,
to convert responses into a standard answer format.

In [None]:
# Draw a single row from the evaluation frame and unpack the two fields the
# model call needs.
# NOTE(review): no seed is supplied here, so the sampled clue may differ
# between runs; confirm whether qa_frame.sample accepts one.
sampled_row = evaluation.qa_frame.sample(1).df.iloc[0]
clue, num_letters = sampled_row[["clue", "num_letters"]]

In [None]:
# Request NUM_ANSWERS completions so this spot check uses the same setting as
# the full evaluation run instead of a separately hard-coded count.
response = model.get_response(clue, num_letters, num_answers=NUM_ANSWERS)
response

In [None]:
# Raw text of the first returned choice. This rebinds `out`, which earlier
# held the template's example output.
out = response["choices"][0]["text"]

In [None]:
# Parse the first choice's text, already bound to `out` by the previous cell,
# rather than indexing into `response` a second time.
res = prompt_interface.extract_answer(out)
res

In [None]:
# Decompose the "wordplay" field parsed from the live model response.
prompt_interface.decompose(res["wordplay"])

In [None]:
# Apply the validator to the answer, predicted definition and wordplay parsed
# from this response.
validator.validate(res["answer"], res["predicted_definition"], res["wordplay"])

Once we're confident that answers are being extracted correctly, we can run the whole set of clues through the model.

In [None]:
# Run the evaluation against the model. Returns the metrics and a frame of
# answers whose per-answer columns carry a "-<k>" suffix (see the display
# loop below).
metrics, answer_df = evaluation.run(model)

In [None]:
# Show the validator names; they are used below to pick validator columns.
model.validator_names

In [None]:
# Base names of the per-answer prediction columns.
pred_cols = ["prediction", "predicted_definition", "wordplay"]

# Columns to display for each answer: the predictions, one column per
# validator, then the constraint flag.
vis_cols = [
    *pred_cols,
    *model.validator_names,
    "prediction_satisfies_constraints",
]

In [None]:
# Show one table per answer slot: the clue and gold answer followed by that
# slot's "-<k>"-suffixed prediction and validator columns.
for answer_idx in range(NUM_ANSWERS):
    suffixed_cols = [f"{col}-{answer_idx}" for col in vis_cols]
    display(answer_df[["clue", "answer", *suffixed_cols]])

I want to somehow use the validator columns to prioritise answers.

I think it might be useful to push the dataframe to long form.

In [None]:
# Reshape from one "-<k>"-suffixed column per answer slot to one row per
# (rowid, slot). The numeric suffix becomes the "index" level of the result.
# Relies on `pd` being imported with the notebook's other imports.
# NOTE(review): assumes `answer_df` has a "rowid" column that uniquely
# identifies each clue; confirm against evaluation.run.
long_df = pd.wide_to_long(
    answer_df,
    stubnames=vis_cols,
    i="rowid",
    j="index",
    sep="-",
)

In [None]:
# Inspect the reshaped long-form frame.
long_df

In [None]:
# NOTE(review): this rebinds `pred_cols` to the same list contents as
# `vis_cols` (predictions + validator names + constraint flag), so the name no
# longer means "prediction columns only". No cell visible after this one reads
# it; confirm whether it is still needed.
pred_cols = ["prediction", "predicted_definition", "wordplay"] + model.validator_names + ["prediction_satisfies_constraints"]

In [None]:
# Selector configured with the model's validator names.
selector = SelfConsistencySelector(model.validator_names)

In [None]:
# Apply the selector to the wide-form answer frame from the evaluation run.
selector.select(answer_df)