This notebook contains code to generate feedback from the Collie dataset. It depends on having the Collie repo (https://github.com/princeton-nlp/Collie) cloned in the same directory as this notebook. 

This notebook takes five of the datasets and corresponding tasks from Collie, namely:
wiki_c07/guten_c07 - contain words
wiki_c04 - character count
ccnews_c08 - all sentences have 1st word (actual: response starts with given 1st word)
ccnews_c06a - sentences have at least set number of words with at most set number of characters
wiki_c09 - sentence count, no words (actual: only the no-words constraint)

However, because it is difficult to distinguish the end of a sentence from an abbreviation followed by a period and a space, we exclude the sentence constraints or modify them.

In [4]:
import dill
from pathlib import Path
import sys

In [5]:
# Put the cloned Collie repo on the import path before loading the dump
# (presumably so dill can resolve Collie's own classes while unpickling -- TODO confirm).
# The membership check keeps sys.path free of duplicate entries when this cell is re-run.
if "Collie" not in sys.path:
    sys.path.append("Collie")
# NOTE(review): dill.load, like pickle.load, can execute arbitrary code stored in the
# file; only load all_data.dill from a Collie checkout you trust.
with open("Collie/data/all_data.dill", "rb") as f:
    all_data = dill.load(f)

In [6]:
# open() does not create missing parent directories, so make sure the output
# folder exists first (a fresh checkout would otherwise raise FileNotFoundError).
out_path = Path("generated_out") / "collie_gen.py"
out_path.parent.mkdir(parents=True, exist_ok=True)
# Plain "w" is enough because this handle is only ever written to; the explicit
# UTF-8 encoding keeps any non-ASCII text in the generated entries identical on
# every platform. The handle stays open: later cells append entries and close it.
out = open(out_path, "w", encoding="utf-8")
out.write("from dataset.feedback import *\n\ncollie_feedback = [\n")

52

In [7]:
def get_feedback(prompt, domain, categories, metric, metric_value, comparison="Comparison.greater_than"):
    """Render one ``Feedback(...)`` entry as Python source text.

    Args:
        prompt: Text emitted as the ``content`` string literal.
        domain: Text emitted as the ``domain`` string literal.
        categories: Source text inserted verbatim after ``categories=``.
        metric: Source text inserted verbatim after ``metric=``.
        metric_value: Source text inserted verbatim after ``metric_value=``.
        comparison: Source text inserted verbatim after ``comparison=``.

    Returns:
        A string that starts with a newline, contains the indented
        ``Feedback(...)`` call and ends with ``),`` plus a newline, ready to be
        written inside a list literal.
    """
    import json  # local import keeps this cell runnable on its own

    # json.dumps produces a double-quoted literal with quotes, backslashes and
    # newlines escaped, which is also a valid Python string literal. Without this,
    # a prompt/domain containing '"' would break the generated file.
    # ensure_ascii=False leaves non-ASCII text readable instead of \u-escaping it.
    content_literal = json.dumps(str(prompt), ensure_ascii=False)
    domain_literal = json.dumps(str(domain), ensure_ascii=False)
    return """
    Feedback(
        content={content},
        domain={domain},
        effect="placeholder to fix ValidationError",
        scope=Scope.regional,
        categories={categories},
        type=Type.quantitative,
        metric={metric},
        metric_value={metric_value},
        comparison={comparison}
    ),
""".format(content=content_literal, domain=domain_literal, categories=categories, metric=metric, metric_value=metric_value, comparison=comparison)

In [8]:
import numpy as np

# "Contain words" task: sample wiki_c07 examples and write one Feedback entry each.
key = "wiki_c07"
entries = all_data[key]
# Seeded generator so a Restart & Run All regenerates the same sample.
rng = np.random.default_rng(0)
# Clamp the sample size: choice(..., replace=False) raises ValueError when
# asked for more items than the population holds.
chosen = rng.choice(len(entries), size=min(20, len(entries)), replace=False)
for i in chosen:
    obj = entries[i]
    topic = obj['metadata']["title"]
    # Keep the text after "containing the word" and drop its last character
    # (presumably the sentence-final period -- TODO confirm against the Collie prompts).
    words = obj['prompt'].split("containing the word")[1][:-1].strip()
    prompt = f"When talking about the book {topic}, make sure all sentences contain the words {words}"
    domain = f"Talking about the book {topic}"
    out_str = get_feedback(prompt, domain, f"['collie', '{key}']", "Metric.contains_all_strings", f"[{words}]")
    out.write(out_str + "\n")

In [9]:
# Character-count task: sample wiki_c04 examples and write one Feedback entry each.
key = "wiki_c04"
entries = all_data[key]
# Seeded generator so a Restart & Run All regenerates the same sample.
rng = np.random.default_rng(0)
# Clamp the sample size: choice(..., replace=False) raises ValueError when
# asked for more items than the population holds.
chosen = rng.choice(len(entries), size=min(20, len(entries)), replace=False)
for i in chosen:
    obj = entries[i]
    topic = obj['metadata']["title"]
    # Keep the text after "sentence with" and drop its last character
    # (presumably the sentence-final period -- TODO confirm against the Collie prompts).
    words = obj['prompt'].split("sentence with")[1][:-1].strip()
    prompt = f"When talking about the book {topic}, make sure all sentences have {words}"
    domain = f"Talking about the book {topic}"
    # The number sits between "exactly" and "characters" in the constraint text.
    length = words.split("exactly")[1].split("characters")[0].strip()
    # NOTE(review): the constraint says "exactly N characters" but get_feedback's
    # default comparison is Comparison.greater_than -- confirm this is intended.
    out_str = get_feedback(prompt, domain, f"['collie', '{key}']", "Metric.is_length", length)
    out.write(out_str + "\n")

In [10]:
# First-word task: pool the wiki_c08 and ccnews_c08 examples, sample from the
# pool and write one Feedback entry each.
key = "ccnews_c08/wiki_c08"
arr = [*all_data["wiki_c08"], *all_data["ccnews_c08"]]
# Seeded generator so a Restart & Run All regenerates the same sample.
rng = np.random.default_rng(0)
# Clamp the sample size: choice(..., replace=False) raises ValueError when
# asked for more items than the population holds.
chosen = rng.choice(len(arr), size=min(20, len(arr)), replace=False)
for i in chosen:
    obj = arr[i]
    topic = obj['metadata']["title"]
    # Keep the text after "1st word to be" and drop its last character
    # (presumably the sentence-final period -- TODO confirm against the Collie prompts).
    words = obj['prompt'].split("1st word to be")[1][:-1].strip()
    prompt = f"When talking about {topic}, make sure the first sentences has a 1st word of {words}"
    domain = f"Talking about {topic}"
    # The lower-cased text is inserted verbatim as the metric_value expression.
    out_str = get_feedback(prompt, domain, f"['collie', '{key}']", "Metric.first_words", words.lower())
    out.write(out_str + "\n")

In [11]:
# Word-count / word-length task: sample ccnews_c06a examples and write one
# Feedback entry each.
key = "ccnews_c06a"
entries = all_data[key]
# Seeded generator so a Restart & Run All regenerates the same sample.
rng = np.random.default_rng(0)
# Clamp the sample size: choice(..., replace=False) raises ValueError when
# asked for more items than the population holds.
chosen = rng.choice(len(entries), size=min(20, len(entries)), replace=False)
for i in chosen:
    obj = entries[i]
    topic = obj['metadata']["title"]
    targets = obj["targets"]
    # targets[0] is used as the minimum word count and targets[1] as the maximum
    # word length, matching the order of the two metrics passed below.
    min_words, max_chars = targets[0], targets[1]
    prompt = f"When talking about how {topic}, make sure all sentences have at least {min_words} words with all words having at most {max_chars} characters"
    domain = f"Talking about how {topic}"
    # NOTE(review): one default comparison (Comparison.greater_than) is emitted for
    # an "at least" / "at most" pair of metrics -- confirm this is intended.
    out_str = get_feedback(prompt, domain, f"['collie', '{key}']", "[Metric.word_count,Metric.word_length]", f"[{min_words}, {max_chars}]")
    out.write(out_str + "\n")

In [12]:
# Forbidden-words task: sample wiki_c09 examples and write one Feedback entry each.
key = "wiki_c09"
entries = all_data[key]
# Seeded generator so a Restart & Run All regenerates the same sample.
rng = np.random.default_rng(0)
# Clamp the sample size: choice(..., replace=False) raises ValueError when
# asked for more items than the population holds.
chosen = rng.choice(len(entries), size=min(20, len(entries)), replace=False)
for i in chosen:
    obj = entries[i]
    topic = obj['metadata']["title"]
    targets = obj["targets"]
    # Only targets[1:] is used; targets[0] is skipped (presumably the sentence
    # count this notebook deliberately ignores -- TODO confirm).
    banned = targets[1:]
    # Human-readable form for the prompt: 'a', 'b', 'c'
    words = "'" + "', '".join(banned) + "'"
    prompt = f"When talking about {topic}, do not use the words {words}"
    domain = f"Talking about {topic}"
    # The list's repr is inserted verbatim as the metric_value expression.
    out_str = get_feedback(prompt, domain, f"['collie', '{key}']", "Metric.contains_none_strings", str(banned))
    out.write(out_str + "\n")

In [13]:
# Terminate the generated `collie_feedback` list literal, then release the file handle.
print("]", file=out)
out.close()