In [1]:
import csv
import os
from collections import defaultdict

from transformers import AutoTokenizer

import config
import lexicon
import prompt
import utils



In [2]:
# Relative path to the annotated lemma CSV that the concepts are loaded from.
LEMMA_PATH = "../data/things/things-lemmas-annotated.csv"

In [3]:
# Load all unique concepts, keyed by lemma; rows flagged with remove == "1" are skipped.
# newline="" is how the csv module expects its input file to be opened, and the
# explicit encoding keeps the load independent of the platform default.
with open(LEMMA_PATH, "r", newline="", encoding="utf-8") as f:
    concepts = {
        row["lemma"]: utils.lemma2concept(row)
        for row in csv.DictReader(f)
        if row["remove"] != "1"
    }

In [4]:
# Fields of a concept record:
#   lemma            - identifier
#   singular         - singular form
#   plural           - plural form
#   article          - singular form with an appropriate article (a/an)
#   generic          - whether generic statements about this concept use the
#                      plural form ("p") or the singular form ("s")
#   taxonomic_phrase - taxonomic phrase for this concept
#                      (is a type of / are a type of)
concepts["panda"]

Concept(lemma='panda', singular='panda', plural='pandas', article='a panda', generic='p', taxonomic_phrase='are a type of')

In [5]:
# Surface forms as they are expressed in our stimuli
# (you might have to remove "a/an" from these).
tuple(concepts[lemma].generic_surface_form() for lemma in ("panda", "garlic"))

('pandas', 'garlic')

In [6]:
# Pair a concept with a property.

# The property is given a name plus its singular and plural predicate forms.
prop = lexicon.Property(
    property_name="daxable", singular="is daxable", plural="are daxable"
)

# Render the property sentence for each example concept.
tuple(concepts[lemma].property_sentence(prop) for lemma in ("panda", "garlic"))


('pandas are daxable', 'a garlic is daxable')

In [9]:
# Create stimuli from prompts.

# Look up the prompt settings stored under this key in config.PROMPTS.
template_config = config.PROMPTS["variation-qa-1"]
prompt_template = prompt.Prompt(
    **{field: template_config[field] for field in ("template", "zero_shot")}
)

# Generate the stimulus for one premise/conclusion pair.
prompt_template.create_stimulus(
    premise=concepts["panda"],
    conclusion=concepts["animal"],
    prop=prop,
)

'Answer the question. Given that pandas are daxable, is it true that animals are daxable?\nAnswer with Yes/No.\n'

In [12]:
# Generate the stimulus in the format that chat models expect.
# The cache directory defaults to the original cluster path, but can be redirected
# by setting the TOKENIZER_CACHE_DIR environment variable, so the cell is not tied
# to one machine's filesystem.
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    cache_dir=os.environ.get("TOKENIZER_CACHE_DIR", "/home/shared/km_cache"),
)

# Passing the tokenizer makes create_stimulus return the chat-formatted prompt.
prompt_template.create_stimulus(
    premise=concepts["panda"],
    conclusion=concepts["animal"],
    prop=prop,
    tokenizer=tokenizer,
)

'<s> [INST] Given that pandas are daxable, is it true that animals are daxable? Answer with Yes/No: [/INST]'