This notebook saves prompt templates.

In [2]:
import datasets
import json
from pathlib import Path

# web_nlg

In [12]:
dataset = datasets.load_dataset("web_nlg", "release_v3.0_en", "train", cache_dir = "../../cache")
train_df = dataset["train"].to_pandas()
icl_per_prompt = 1

instructions = [
    "Following is a set of knowledge graph triples. Generate a coherent piece of text that contains all of the information in the triples. Only use information from the provided triples.",
    "Generate a coherent paragraph incorporating the information from the given set of knowledge graph triples.",
    "Formulate a cohesive passage using the details provided in the set of knowledge graph triples.",
]

prompt_fpath = Path("assets/web_nlg_1_shot.json") 
prompts = []
for i in range(3):
    icl_df = train_df.sample(icl_per_prompt, random_state=i)
    prompt = instructions[i] + "\n\n"
    for row in icl_df.to_dict(orient="records"):

        text = row["lex"]["text"][0]
        
        prompt += "### Triples\n"
        for triple in row["modified_triple_sets"]['mtriple_set'][0]:
            prompt += f"{triple}\n"
        prompt += "\n### Text\n" + text + "\n"
        prompt += "\n### Triples\n{triples}\n\n### Text"
    prompts.append(prompt)

prompt_fpath.write_text(json.dumps(prompts, indent=4))

1082

# e2e_nlg

## 5 ICL

In [2]:
icl_per_prompt = 2
INSTRUCTION = "Transform the meaning representation into a sentence."
dataset = datasets.load_dataset("e2e_nlg", split="train", cache_dir = "../../cache").to_pandas()
dataset.head(3)

prompt_fpath = Path("assets/e2e_nlg_1_shot.json") 
prompts = []
for i in range(5):
    icl_df = dataset.sample(icl_per_prompt, random_state=i)
    prompt = INSTRUCTION + "\n\n"
    for row in icl_df.to_dict(orient="records"):
        prompt += "Meaning representation: {meaning_representation}\nNatural language: {human_reference}\n\n".format(**row)
    prompt += "Meaning representation: {meaning_representation}\nNatural language:"
    prompts.append(prompt)

prompt_fpath.write_text(json.dumps(prompts, indent=4))

3611

## 7 ICL

In [3]:
icl_per_prompt = 2
INSTRUCTION = "Transform the meaning representation into a sentence."
dataset = datasets.load_dataset("e2e_nlg", split="train", cache_dir = "../../cache").to_pandas()
dataset.head(3)

prompt_fpath = Path("assets/e2e_nlg_1_shot_7_prompts.json") 
prompts = []
for i in range(7):
    icl_df = dataset.sample(icl_per_prompt, random_state=i)
    prompt = INSTRUCTION + "\n\n"
    for row in icl_df.to_dict(orient="records"):
        prompt += "Meaning representation: {meaning_representation}\nNatural language: {human_reference}\n\n".format(**row)
    prompt += "Meaning representation: {meaning_representation}\nNatural language:"
    prompts.append(prompt)

prompt_fpath.write_text(json.dumps(prompts, indent=4))

4926

## 3 ICL

In [25]:
icl_per_prompt = 2
INSTRUCTION = "Transform the meaning representation into a sentence."
dataset = datasets.load_dataset("e2e_nlg", split="train", cache_dir = "../../cache").to_pandas()
dataset.head(3)

Unnamed: 0,meaning_representation,human_reference
0,"name[The Vaults], eatType[pub], priceRange[mor...",The Vaults pub near Café Adriatic has a 5 star...
1,"name[The Cambridge Blue], eatType[pub], food[E...","Close to Café Brazil, The Cambridge Blue pub s..."
2,"name[The Eagle], eatType[coffee shop], food[Ja...",The Eagle is a low rated coffee shop near Burg...


In [24]:
prompt_fpath = Path("assets/e2e_nlg_1_shot_3_prompts.json") 
prompts = []
for _ in range(3):
    icl_df = dataset.sample(icl_per_prompt)
    prompt = INSTRUCTION + "\n\n"
    for row in icl_df.to_dict(orient="records"):
        prompt += "Meaning representation: {meaning_representation}\nNatural language: {human_reference}\n\n".format(**row)
    prompt += "Meaning representation: {meaning_representation}\nNatural language:"
    prompts.append(prompt)

prompt_fpath.write_text(json.dumps(prompts, indent=4))

2151

10 ICL

In [4]:
icl_per_prompt = 2
INSTRUCTION = "Transform the meaning representation into a sentence."
dataset = datasets.load_dataset("e2e_nlg", split="train", cache_dir = "../../cache").to_pandas()
dataset.head(3)

prompt_fpath = Path("assets/e2e_nlg_1_shot_10_prompts.json") 
prompts = []
for i in range(10):
    icl_df = dataset.sample(icl_per_prompt, random_state=i)
    prompt = INSTRUCTION + "\n\n"
    for row in icl_df.to_dict(orient="records"):
        prompt += "Meaning representation: {meaning_representation}\nNatural language: {human_reference}\n\n".format(**row)
    prompt += "Meaning representation: {meaning_representation}\nNatural language:"
    prompts.append(prompt)

prompt_fpath.write_text(json.dumps(prompts, indent=4))

6986

# cnn/dailymail

In [36]:
icl_per_prompt = 2
INSTRUCTIONS = [
    "Write a short summary of the article.",
    "What are the highlights of the article?",
    "Produce a few-sentence description of the article."
]

dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train", cache_dir = "../../cache").to_pandas()
dataset.head(3)

Unnamed: 0,article,highlights,id
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...,ee8871b15c50d0db17b0179a6d2beab35065f1e9
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa...",06352019a19ae31e527f37f7571c6dd7f0c5da37


In [37]:
prompt_fpath = Path("assets/cnn_dailymail_0_shot.json") 
prompts = []
for instruction in INSTRUCTIONS:
    icl_df = dataset.sample(icl_per_prompt)
    prompt = "{article}" + f"\n\n{instruction}\nSummary:"
    prompts.append(prompt)

prompt_fpath.write_text(json.dumps(prompts, indent=4))

221

In [2]:
INSTRUCTIONS = [
    "Write a short summary of the article.",
    "What are the highlights of the article?",
    "Produce a few-sentence description of the article.",
    "Summarize the main takeaways from the article.",
    "What are the main points of the above article?
]

dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train", cache_dir = "../../cache").to_pandas()
dataset.head(3)

prompt_fpath = Path("assets/cnn_dailymail_0_shot_5_prompts.json") 
prompts = []
for instruction in INSTRUCTIONS:
    prompt = "{article}" + f"\n\n{instruction}\nSummary:"
    prompts.append(prompt)

prompt_fpath.write_text(json.dumps(prompts, indent=4))

SyntaxError: unterminated string literal (detected at line 6) (4097407462.py, line 6)

# xsum

In [4]:
INSTRUCTION = "Summarize the article."
icl_per_prompt = 1
dataset = datasets.load_dataset("EdinburghNLP/xsum", split="train", cache_dir = "../../cache").to_pandas()
dataset.head(3)

Unnamed: 0,document,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,35951548


In [5]:
prompt_fpath = Path("assets/xsum_1_shot.json") 
prompts = []
for _ in range(3):
    icl_df = dataset.sample(icl_per_prompt)
    prompt = INSTRUCTION + "\n\n"
    for row in icl_df.to_dict(orient="records"):
        prompt += "Article: {document}\nSummary: {summary}\n\n".format(**row)
    prompt += "Article: {document}\nSummary:"
    prompts.append(prompt)

prompt_fpath.write_text(json.dumps(prompts, indent=4))

7906

In [3]:
INSTRUCTION = "Summarize the article."
icl_per_prompt = 1
dataset = datasets.load_dataset("EdinburghNLP/xsum", split="train", cache_dir = "../../cache").to_pandas()

prompt_fpath = Path("assets/xsum_1_shot_5_prompts.json") 
prompts = []
for _ in range(5):
    icl_df = dataset.sample(icl_per_prompt)
    prompt = INSTRUCTION + "\n\n"
    for row in icl_df.to_dict(orient="records"):
        prompt += "Article: {document}\nSummary: {summary}\n\n".format(**row)
    prompt += "Article: {document}\nSummary:"
    prompts.append(prompt)

prompt_fpath.write_text(json.dumps(prompts, indent=4))

Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████████| 304M/304M [00:01<00:00, 202MB/s]
Downloading data: 100%|████████████████████████████████████████████████████████████████████████████████████| 16.7M/16.7M [00:00<00:00, 45.0MB/s]
Downloading data: 100%|████████████████████████████████████████████████████████████████████████████████████| 17.0M/17.0M [00:00<00:00, 33.7MB/s]
Generating train split: 100%|████████████████████████████████████████████████████████████████| 204045/204045 [00:01<00:00, 191960.47 examples/s]
Generating validation split: 100%|█████████████████████████████████████████████████████████████| 11332/11332 [00:00<00:00, 207794.40 examples/s]
Generating test split: 100%|███████████████████████████████████████████████████████████████████| 11334/11334 [00:00<00:00, 192706.72 examples/s]


15593

# gsm8k

In [6]:
cots = [
        """Question: Ivan has a bird feeder in his yard that holds two cups of birdseed. Every week, he has to refill the emptied feeder. Each cup of birdseed can feed fourteen birds, but Ivan is constantly chasing away a hungry squirrel that steals half a cup of birdseed from the feeder every week. How many birds does Ivan’s bird feeder feed weekly?
Let's think step by step
The squirrel steals 1/2 cup of birdseed every week, so the birds eat 2 - 1/2 = 1 1/2 cups of birdseed.
Each cup feeds 14 birds, so Ivan’s bird feeder feeds 14 * 1 1/2 = 21 birds weekly.
The answer is 21""",
        """Question: Samuel took 30 minutes to finish his homework while Sarah took 1.3 hours to finish it. How many minutes faster did Samuel finish his homework than Sarah?
Let's think step by step
Since there are 60 minutes in 1 hour, then 1.3 hours is equal to 1.3 x 60 = 78 minutes.
Thus, Samuel is 78 – 30 = 48 minutes faster than Sarah.
The answer is 48""",
        """Question: Julia bought 3 packs of red balls, 10 packs of yellow balls, and 8 packs of green balls. There were 19 balls in each package. How many balls did Julie buy in all?
Let's think step by step
The total number of packages is 3 + 10 + 8 = 21.
Julia bought 21 × 19 = 399 balls.
The answer is 399""",
        """Question: Lexi wants to run a total of three and one-fourth miles. One lap on a particular outdoor track measures a quarter of a mile around. How many complete laps must she run?
Let's think step by step
There are 3/ 1/4 = 12 one-fourth miles in 3 miles.
So, Lexi will have to run 12 (from 3 miles) + 1 (from 1/4 mile) = 13 complete laps.
The answer is 13""",
        """Question: Asia bought a homecoming dress on sale for $140. It was originally priced at $350. What percentage off did she get at the sale?
Let's think step by step
Asia saved $350 - $140 = $210 on the dress.
That means she saved $210 / $350 = 0.60 or 60% off on the dress.
The answer is 60""",
    ]

In [12]:
prompts = []
for cot in cots:
    prompt = f"Answer the math problem.\n\n{cot}\n\n"
    prompt += "Questions: {question}\nLet's think step by step"
    prompts.append(prompt)

prompt_fpath = Path("assets/gsm8k_1_shot_cot_simple.json") 
prompt_fpath.write_text(json.dumps(prompts, indent=4))

2336

# common gen

In [9]:
dataset = datasets.load_dataset("allenai/common_gen", split="train", cache_dir = "../../cache").to_pandas()
icl_per_prompt = 2

instructions = [
        "Following is a set of words. Generate a coherent sentence that contains all of the words.",
        "Generate a coherent sentence incorporating the words from the given set.",
        "Formulate a cohesive sentence using the words provided.",
        "Create a unified sentence that encompasses all the words.",
        "Compose a seamless sentence integrating the words given.",
]

dataset.head(3)

Unnamed: 0,concept_set_idx,concepts,target
0,0,"[ski, mountain, skier]",Skier skis down the mountain
1,0,"[ski, mountain, skier]",A skier is skiing down a mountain.
2,0,"[ski, mountain, skier]",Three skiers are skiing on a snowy mountain.


In [13]:
prompt_fpath = Path("assets/common_gen_1_shot_2_prompts.json") 
prompts = []
for i in range(5):
    icl_df = dataset.sample(icl_per_prompt, random_state=i)
    prompt = instructions[i] + "\n"
    for row in icl_df.to_dict(orient="records"):
        prompt += "\n### Words\n"
        prompt += str(row["concepts"]) + "\n"
        prompt += "\n### Sentence\n" + row["target"] + "\n"
    prompt += "\n### Words\n{concepts}\n\n### Sentence"
    prompts.append(prompt)

prompt_fpath.write_text(json.dumps(prompts, indent=4))

1766

## Squad

In [2]:
prompt_fpath = Path("assets/squad.json") 
prompts = ["{context}\n\nQuestion: {question}\nAnswer:"]
prompt_fpath.write_text(json.dumps(prompts, indent=4))

52

In [15]:
icl_per_prompt = 2
INSTRUCTIONS = [
    "Answer the question given the context. Be as brief as possible.",
    "Respond to the inquiry considering the surrounding circumstances. Make sure your answer is correct.",
    "You are an LLM who is supposed to be as factual as possible. Address the question in light of the given context.",
]
    

dataset = datasets.load_dataset("hazyresearch/based-squad", split="validation", cache_dir = "../../cache").to_pandas()
dataset = dataset.iloc[:len(dataset) // 2] # Train is first half
dataset.head(3)

Unnamed: 0,doc_id,text,value,title,context,question
0,56be4db0acb8001400a502ec,...,Denver Broncos,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...
1,56be4db0acb8001400a502ee,...,"Santa Clara, California",Super_Bowl_50,Super Bowl 50 was an American football game to...,Where did Super Bowl 50 take place?
2,56be4db0acb8001400a502ef,...,Denver Broncos,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team won Super Bowl 50?


In [17]:
prompt_fpath = Path("assets/squad_1_shot.json") 
prompts = [
    """Answer the question given the context. Be as brief as possible.

Context: While experimenting, Tesla inadvertently faulted a power station generator, causing a power outage. In August 1917, Tesla explained what had happened in The Electrical Experimenter: "As an example of what has been done with several hundred kilowatts of high frequency energy liberated, it was found that the dynamos in a power house six miles away were repeatedly burned out, due to the powerful high frequency currents set up in them, and which caused heavy sparks to jump through the windings and destroy the insulation!"
Question: What did Tesla accidentally cause?
Answer: power outage

Context: {context}
Question: {question}
Answer:""", 

    """Respond to the inquiry considering the surrounding circumstances. Make sure your answer is correct.

Context: {context}
Question: {question}
Answer:""",

    """You are an LLM who is supposed to be as factual as possible. Address the question in light of the given context.

Question: {question}
Context: {context}
Answer:"""
    
]
for idx in range(3):
    continue
    icl_df = dataset.sample(icl_per_prompt, random_state = idx)
    prompt = INSTRUCTIONS[idx] + "\n\n"
    for row in icl_df.to_dict(orient="records"):
        prompt += "Context: {context}\nQuestion: {question}\nAnswer: {value}\n\n".format(**row)
    prompt += "Context: {context}\nQuestion: {question}\nAnswer:"
    print(prompt)
    print("-"*30)
    prompts.append(prompt)

prompt_fpath.write_text(json.dumps(prompts, indent=4))

1065

## Definition Extraction

In [3]:
prompt_fpath = Path("assets/definition_extraction_0_shot.json") 
prompts = [
    "For the sentence below, identify the term that is being defined.\n\nSentence: {text}\nTerm:",
    "{text}\n\nQuestion: What are the term(s) being defined in the sentence above?\nAnswer:",
    "{text}\n\nWhat word or phrase is being defined in the sentence above?"
]
prompt_fpath.write_text(json.dumps(prompts, indent=4))

272