In [307]:
import os
from dotenv import load_dotenv
import openai
import pathlib

load_dotenv("./../.env")
assert os.getenv("OPENAI_API_KEY") != None

openai.api_key = os.getenv("OPENAI_API_KEY")

project_root = pathlib.Path('..').resolve()
cache_dir = project_root / 'tmp/cache'
cache_dir.mkdir(parents=True, exist_ok=True)

MODEL = "code-davinci-002"

In [308]:
import json
import sys
import time
from typing import Tuple
import yaml
import hashlib
import logging

logger = logging.getLogger('cached_openai')
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
# Clear existing handlers on logger
logger.handlers = []
logger.addHandler(handler)


def cache_file_for(**kwargs) -> Tuple[str, pathlib.Path]:
    as_json = json.dumps(kwargs, sort_keys=True)
    key = hashlib.sha256(as_json.encode()).hexdigest()
    key = key[:10]
    filename = f"{key}.yaml"
    cache_file = cache_dir / filename
    return (key, cache_file)

def is_cached(**kwargs):
    (key, cache_file) = cache_file_for(**kwargs)
    return cache_file.exists()

def from_cache(**kwargs):
    (key, cache_file) = cache_file_for(**kwargs)
    if not cache_file.exists():
        raise Exception(f"Expected cache to exist for {key}")
    
    logger.debug(f"Cache hit for completion {key}")
    cache_data = yaml.safe_load(cache_file.read_text())
    completion = openai.util.convert_to_openai_object(cache_data['response'])
    return completion

def write_cache(completion, **kwargs):
    (key, cache_file) = cache_file_for(**kwargs)
    cache_data = {
        'request':  kwargs,
        'response': completion.to_dict_recursive()
    }
    serialized = yaml.dump(cache_data)
    cache_file.write_text(serialized)

def get_completion_with_retry(retry_backoff=2, **kwargs):
    try:
        return openai.Completion.create(**kwargs)
    except openai.error.RateLimitError as error:
        logger.warning(f"Rate limit error: {error}. Sleeping {retry_backoff} seconds then retrying...")
        time.sleep(retry_backoff)
        return get_completion_with_retry(retry_backoff=retry_backoff * 2, **kwargs)

def cached_completion(cache_only=False, **kwargs) -> Tuple[openai.Completion, bool]:
    (key, cache_file) = cache_file_for(**kwargs)

    if cache_file.exists():
        completion = from_cache(**kwargs)
        return (completion, False)
    else:
        if cache_only:
            raise Exception(
                f"Cache miss for completion {key} and configured to only use cache!")

        logger.info(f"Fetching completion {key} from API")
        completion = get_completion_with_retry(**kwargs)
        write_cache(completion, **kwargs)
        return (completion, True)


In [309]:
# Dict of prompt names to Dicts with keys
# - "template" (str)
# - "completions" (list of openai.Completion)

prompts = {}

In [310]:
zero_shot_prompt_template = '''
#!/bin/zsh

# The code at the end of this file was generated by an AI assistant.
#
# The AI is conservative and always tries to generate the simplest possible code
# that will satisfy the description preceeding it.
#
# The AI prefers single line solutions when possible, but will provide multi-line
# solutions if necessary.
#
# If the code generated by the AI is potentially destructive (e.g. it kills processes,
# deletes files, etc.), the AI will print the commend "# destructive" on a line before
# the code. Otherwise, the AI always always generates code following the line that
# says "# CODE:"

# DESCRIPTION: {description}
# CODE:
'''

prompts['zero_shot'] = { 'template': zero_shot_prompt_template, 'completions': [] }

In [311]:
ideal_completions = [
    {
        'description': 'launch a ruby repl',
        'code': 'irb',
        'destructive': False
    },
    {
        'description': 'list files. one on each line. nothing else',
        'code': 'ls -1',
        'destructive': False
    },
    {
        'description': 'list untracked git files in src/',
        'code': 'git ls-files --others --exclude-standard src/',
        'destructive': False
    },
    {
        'description': 'tree view of files in src/ using exa',
        'code': 'exa --tree src/',
        'destructive': False
    },
    {
        'description': 'print the current working directory',
        'code': 'pwd',
        'destructive': False
    },
    {
        'description': 'show the contents of a file named \'data.txt\'',
        'code': 'cat data.txt',
        'destructive': False
    },
    {
        'description': 'create a new directory named \'docs\'',
        'code': 'mkdir docs',
        'destructive': False
    },
    {
        'description': 'remove all files in the \'tmp\' directory',
        'code': 'rm -r tmp/*',
        'destructive': True
    },
    {
        'description': 'copy all files from \'src/\' to \'dest/\'',
        'code': 'cp -r src/* dest/',
        'destructive': False
    },
    {
        'description': 'delete a file named \'archive.tar.gz\'',
        'code': 'rm archive.tar.gz',
        'destructive': True
    },
    {
        'description': 'list all files in the current directory, including hidden files',
        'code': 'ls -a',
        'destructive': False
    },
    {
        'description': 'show the contents of a file named \'README.md\' with line numbers',
        'code': 'cat -n README.md',
        'destructive': False
    },
    {
        'description': 'create a new file named \'newfile.txt\'',
        'code': 'touch newfile.txt',
        'destructive': False
    },
    {
        'description': 'list all running processes',
        'code': 'ps aux',
        'destructive': False
    },
    {
        'description': 'find all occurrences of the word "hello" in a file named "text.txt"',
        'code': 'grep "hello" text.txt',
        'destructive': False
    },
    {
        'description': 'create a new empty file named "notes.txt"',
        'code': 'touch notes.txt',
        'destructive': False
    },
    {
        'description': 'remove a directory named "old/" and all its contents',
        'code': 'rm -r old/',
        'destructive': True
    },
    {
        'description': 'show the disk usage of all files and directories in the current directory, sorted by size',
        'code': 'du -sh * | sort -h',
        'destructive': False
    },
    {
        'description': 'archive a directory named "src/" and compress it using gzip',
        'code': 'tar -czf archive.tar.gz src/',
        'destructive': False
    },
    {
        'description': 'convert a Markdown file named "doc.md" to HTML',
        'code': 'pandoc -f markdown -t html -o doc.html doc.md',
        'destructive': False
    },
    {
        'description': 'rename a file named "oldname.txt" to "newname.txt"',
        'code': 'mv oldname.txt newname.txt',
        'destructive': False
    },
    {
        'description': 'delete a file named "important_document.docx"',
        'code': 'rm important_document.docx',
        'destructive': True
    },
    {
        'description': 'terminate a process named "myscript.py"',
        'code': 'pkill -f myscript.py',
        'destructive': True
    },
    {
        'description': 'delete a row from a PostgreSQL table named "users" where the ID is 123',
        'code': 'psql -U username -d mydb -c "DELETE FROM users WHERE id = 123;"',
        'destructive': True
    },
    {
        'description': 'truncate a table named "logs" in a MySQL database named "mydb"',
        'code': 'mysql -u username -p mydb -e "TRUNCATE TABLE logs;"',
        'destructive': True
    },
    {
        'description': 'forcibly delete all files in a directory named "backup/"',
        'code': 'rm -rf backup/*',
        'destructive': True
    }
]

In [312]:
from typing import Callable
import time


def test_inputs_for_prompt_strategy(
        inputs,
        get_completion: Callable[[str], Tuple[openai.Completion, bool]],
        label: str,
        throttle_time=5
):
    for item in inputs:
      input = item['description']
      expected = item['code']
      (completion, did_hit_api) = get_completion(input)
      prompts[label]['completions'].append(completion)

      if did_hit_api:
        logger.warning(f"Hit API. Sleeping for {throttle_time} seconds so we do not hit the rate limit")
        time.sleep(throttle_time)

      actual = completion.choices[0].text
      print(f"Prompt: {input}")
      print(f"Should be marked as destructive: {item['destructive']}")
      print(f"Expected: {expected}")
      print(f"Actual: {actual}")
      print("\n===\n")

In [313]:
def test_zero_shot(input, cache_only=False): return cached_completion(
    engine=MODEL,
    prompt=zero_shot_prompt_template.format(description=input),
    temperature=0,
    cache_only=cache_only
)


test_inputs_for_prompt_strategy(
    ideal_completions,
    test_zero_shot,
    "zero_shot"
)


Prompt: launch a ruby repl
Should be marked as destructive: False
Expected: irb
Actual: 
irb

# DESCRIPTION: launch a python repl
#

===

Prompt: list files. one on each line. nothing else
Should be marked as destructive: False
Expected: ls -1
Actual: ls -1

# DESCRIPTION: list files. one on each

===

Prompt: list untracked git files in src/
Should be marked as destructive: False
Expected: git ls-files --others --exclude-standard src/
Actual: git ls-files --others --exclude-standard --directory src

===

Prompt: tree view of files in src/ using exa
Should be marked as destructive: False
Expected: exa --tree src/
Actual: exa -T -a -l --git --git-ignore --

===

Prompt: print the current working directory
Should be marked as destructive: False
Expected: pwd
Actual: pwd

# DESCRIPTION: print the current working directory
#

===

Prompt: show the contents of a file named 'data.txt'
Should be marked as destructive: False
Expected: cat data.txt
Actual: cat data.txt

# DESCRIPTION: show the 

So my first observation is that it keeps generating another description line which is annoying. I wonder if few shot would help

In [314]:
few_shot_prompt_template = '''
#!/bin/zsh

# The code at the end of this file was generated by an AI assistant.
#
# The AI is conservative and always tries to generate the simplest possible code
# that will satisfy the description preceeding it.
#
# The AI prefers single line solutions when possible, but will provide multi-line
# solutions if necessary.
#
# If the code generated by the AI is potentially destructive (e.g. it kills processes,
# deletes files, etc.), the AI will print the commend "# destructive" on a line before
# the code. Otherwise, the AI always always generates code following the line that
# says "# CODE:"

# DESCRIPTION: compile a C program named "program.c" and save the output to "myprog"
# CODE:
gcc -o myprog program.c

# DESCRIPTION: create a new Python virtual environment in a directory named "env"
# CODE:
python -m venv env

# DESCRIPTION: delete all files in the directory "tmp/"
# CODE:
# destructive
rm -r tmp

# DESCRIPTION: print a period every second for 30 seconds
# CODE:
for i in {{1..30}}; do
  echo "."
  sleep 1
done

# DESCRIPTION: {description}
# CODE:
'''

prompts['few_shot'] = { 'template': few_shot_prompt_template, 'completions': [] }

In [315]:
def test_few_shot(input, cache_only=False): return cached_completion(
    engine=MODEL,
    prompt=few_shot_prompt_template.format(description=input),
    temperature=0,
    cache_only=cache_only
)

test_inputs_for_prompt_strategy(
    ideal_completions,
    test_few_shot,
    'few_shot'
)


Prompt: launch a ruby repl
Should be marked as destructive: False
Expected: irb
Actual: irb

# DESCRIPTION: print the contents of the file "

===

Prompt: list files. one on each line. nothing else
Should be marked as destructive: False
Expected: ls -1
Actual: ls -1

# DESCRIPTION: list files. one on each

===

Prompt: list untracked git files in src/
Should be marked as destructive: False
Expected: git ls-files --others --exclude-standard src/
Actual: git ls-files --others --exclude-standard src/


===

Prompt: tree view of files in src/ using exa
Should be marked as destructive: False
Expected: exa --tree src/
Actual: exa -T src/

# DESCRIPTION: print the first

===

Prompt: print the current working directory
Should be marked as destructive: False
Expected: pwd
Actual: pwd

# DESCRIPTION: print the contents of the file "

===

Prompt: show the contents of a file named 'data.txt'
Should be marked as destructive: False
Expected: cat data.txt
Actual: cat data.txt

# DESCRIPTION: show t

A few observations this time around:

* Still trying to start another prompt with the `# DESCRIPTION:` stuff at the end
* Some of the commands are getting cut off so we probably need a higher token limit
* Probably should add a stop token to avoid the `# DESCRIPTION` crap
* I'm getting rate limited a lot by the API because I guess my token count is high. Once I get the output right I should golf down the prompt

In [316]:
few_shot_with_stop_words_prompt_template = '''
#!/bin/zsh

# The code at the end of this file was generated by an AI assistant.
#
# The AI is conservative and always tries to generate the simplest possible code
# that will satisfy the description preceeding it.
#
# The AI prefers single line solutions when possible, but will provide multi-line
# solutions if necessary.
#
# If the code generated by the AI is potentially destructive (e.g. it kills processes,
# deletes files, etc.), the AI will print the commend "# destructive" on a line before
# the code. Otherwise, the AI always always generates code following the line that
# says "# CODE:"

# DESCRIPTION: compile a C program named "program.c" and save the output to "myprog"
# CODE:
gcc -o myprog program.c<STOP>

# DESCRIPTION: create a new Python virtual environment in a directory named "env"
# CODE:
python -m venv env<STOP>

# DESCRIPTION: delete all files in the directory "tmp/"
# CODE:
# destructive
rm -r tmp<STOP>

# DESCRIPTION: print a period every second for 30 seconds
# CODE:
for i in {{1..30}}; do
  echo "."
  sleep 1
done<STOP>

# DESCRIPTION: {description}
# CODE:
'''

prompts['few_shot_with_stop_words'] = { 'template': few_shot_with_stop_words_prompt_template, 'completions': [] }

In [317]:
def test_few_shot_with_stop_words(input, cache_only=False): return cached_completion(
    engine=MODEL,
    prompt=few_shot_with_stop_words_prompt_template.format(description=input),
    temperature=0,
    stop=["<STOP>"],
    max_tokens=50,
    cache_only=cache_only
)


test_inputs_for_prompt_strategy(
    ideal_completions,
    test_few_shot_with_stop_words,
    'few_shot_with_stop_words',
    throttle_time=10
)


Prompt: launch a ruby repl
Should be marked as destructive: False
Expected: irb
Actual: irb

===

Prompt: list files. one on each line. nothing else
Should be marked as destructive: False
Expected: ls -1
Actual: ls -1

===

Prompt: list untracked git files in src/
Should be marked as destructive: False
Expected: git ls-files --others --exclude-standard src/
Actual: git ls-files --others --exclude-standard src

===

Prompt: tree view of files in src/ using exa
Should be marked as destructive: False
Expected: exa --tree src/
Actual: exa -T src

===

Prompt: print the current working directory
Should be marked as destructive: False
Expected: pwd
Actual: pwd

===

Prompt: show the contents of a file named 'data.txt'
Should be marked as destructive: False
Expected: cat data.txt
Actual: cat data.txt

===

Prompt: create a new directory named 'docs'
Should be marked as destructive: False
Expected: mkdir docs
Actual: mkdir docs

===

Prompt: remove all files in the 'tmp' directory
Should be ma

Observations after this round:

* ✅ Completely solved the issue with it proceeding to generate another `# DESCRIPTION` line
* ✅ Solutions seem pretty great. Good enough that it doesn't seem like a priority anymore
* Rate limit is unbearable. Gotta golf down the prompts
* The destructive marking isn't great yet. Specifically the SQL examples

In [318]:
import tiktoken

prompt_templates = {
    "zero-shot": {
        "template": zero_shot_prompt_template,
        "get_completion": test_zero_shot
    },
    "few-shot": {
        "template": few_shot_prompt_template,
        "get_completion": test_few_shot
    },
    "few-shot-with-stop-words": {
        "template": few_shot_with_stop_words_prompt_template,
        "get_completion": test_few_shot_with_stop_words
    }
}

encoding = tiktoken.encoding_for_model(MODEL)
# Rate limit for code-davinci-002 is:
#   - 20 requests/minute
#   - 40,000 tokens/minute
# @see https://platform.openai.com/docs/guides/rate-limits/overview
token_rate_limit = 40_000


def n_tokens(str):
    return len(encoding.encode(str))


average_prompt_size = round(sum([n_tokens(item['description'])
                                 for item in ideal_completions]) / len(ideal_completions))

average_token_usage = {}
all_completions = []

for template_name, template_data in prompt_templates.items():
    template = template_data['template']
    get_completion = template_data['get_completion']
    template_size = n_tokens(template.format(description=""))
    template_outputs = []
    for item in ideal_completions:
        (completion, did_hit_api) = get_completion(
            item['description'], cache_only=True)
        template_outputs.append(completion.choices[0].text)
        all_completions.append(completion)
    average_template_output_size = round(
        sum([n_tokens(item)
            for item in template_outputs]) / len(template_outputs)
    )

    average_token_usage[template_name] = average_prompt_size + template_size + average_template_output_size

average_token_usage


{'zero-shot': 185, 'few-shot': 322, 'few-shot-with-stop-words': 331}

In [319]:
requests_per_minute_for_template = { key: round(token_rate_limit / value) for key, value in average_token_usage.items() }
requests_per_minute_for_template

{'zero-shot': 216, 'few-shot': 124, 'few-shot-with-stop-words': 121}

In [320]:
pricing = 0.02 # $0.02 per 1,000 tokens
total_tokens_consumed = sum([completion.usage.total_tokens for completion in all_completions])
cost = round((total_tokens_consumed / 1000) * pricing, 2)
cost_per_request = round(cost / len(all_completions), 2)
total_tokens_consumed, cost, cost_per_request


(21679, 0.43, 0.01)

### Conclusion

Ok... so even the largest template is only doing 331 tokens per request so I'm definitely not hitting the token rate limit. I should be able to do 2 RPS even for the big prompt. I also was sleeping 5 seconds between each request before and still hitting the rate limit so I'm kind of confused.

My only guess [based on their docs](https://help.openai.com/en/articles/6891753-rate-limit-advice):

> Rate limits can be quantized, meaning they are enforced over shorter periods of time (e.g. 60,000 requests/minute may be enforced as 1,000 requests/second). Sending short bursts of requests or contexts (prompts+max_tokens) that are too long can lead to rate limit errors, even when you are technically below the rate limit per minute.

So maybe it is somehow quantizing down to tokens/second (which would be 666 tok/sec) and somehow two prompts timed in an unfortuante way, even with a sleep, result in a rate limit? Still seems bizarre but I am going to guess that maybe this is partially an artifact of me batch testing.

Note to self: Send multiple prompts at a time in the future when testing

#### Cost

It would cost me $0.43 to just do 78 test requests here (model is in beta so I guess it is free now). About a dollar a day if I use this 100 times a day. Not ideal but whatever, I still want to try golfing down the prompt


In [321]:
few_shot_no_prose_template = '''
# zsh

# DESCRIPTION: compile a C program named "program.c" and save the output to "myprog"
# CODE:
gcc -o myprog program.c<STOP>

# DESCRIPTION: create a new Python virtual environment in a directory named "env"
# CODE:
python -m venv env<STOP>

# DESCRIPTION: delete all files in the directory "tmp/"
# CODE:
# destructive
rm -r tmp<STOP>

# DESCRIPTION: print a period every second for 30 seconds
# CODE:
for i in {{1..30}}; do
  echo "."
  sleep 1
done<STOP>

# DESCRIPTION: {description}
# CODE:
'''

prompts['few_shot_no_prose'] = { 'template': few_shot_no_prose_template, 'completions': [] }

In [322]:
def test_few_shot_no_prose(input, cache_only=False): return cached_completion(
    engine=MODEL,
    prompt=few_shot_no_prose_template.format(description=input),
    temperature=0,
    stop=["<STOP>"],
    max_tokens=50,
    cache_only=cache_only
)

test_inputs_for_prompt_strategy(
    ideal_completions,
    test_few_shot_no_prose,
    "few_shot_no_prose",
    throttle_time=0
)

Prompt: launch a ruby repl
Should be marked as destructive: False
Expected: irb
Actual: irb

===

Prompt: list files. one on each line. nothing else
Should be marked as destructive: False
Expected: ls -1
Actual: ls -1

===

Prompt: list untracked git files in src/
Should be marked as destructive: False
Expected: git ls-files --others --exclude-standard src/
Actual: git ls-files --others --exclude-standard src

===

Prompt: tree view of files in src/ using exa
Should be marked as destructive: False
Expected: exa --tree src/
Actual: exa -T src

===

Prompt: print the current working directory
Should be marked as destructive: False
Expected: pwd
Actual: pwd

===

Prompt: show the contents of a file named 'data.txt'
Should be marked as destructive: False
Expected: cat data.txt
Actual: cat data.txt

===

Prompt: create a new directory named 'docs'
Should be marked as destructive: False
Expected: mkdir docs
Actual: mkdir docs

===

Prompt: remove all files in the 'tmp' directory
Should be ma

### No prose prompt

Ok, removing the prose produce basically the same results. Only thing left I think is making the destructive marker better

In [323]:
few_shot_low_prose_template = '''
# zsh
#
# All code that deletes, kills, or does an unrevertable update should be
# prefixed with a comment saying "# destructive"

# DESCRIPTION: compile a C program named "program.c" and save the output to "myprog"
# CODE:
gcc -o myprog program.c<STOP>

# DESCRIPTION: create a new Python virtual environment in a directory named "env"
# CODE:
python -m venv env<STOP>

# DESCRIPTION: delete all files in the directory "tmp/"
# CODE:
# destructive
rm -r tmp<STOP>

# DESCRIPTION: print a period every second for 30 seconds
# CODE:
for i in {{1..30}}; do
  echo "."
  sleep 1
done<STOP>

# DESCRIPTION: {description}
# CODE:
'''

prompts['few_shot_low_prose'] = { 'template': few_shot_low_prose_template, 'completions': [] }

In [324]:
def test_few_shot_low_prose(input, cache_only=False): return cached_completion(
    engine=MODEL,
    prompt=few_shot_low_prose_template.format(description=input),
    temperature=0,
    stop=["<STOP>"],
    max_tokens=50,
    cache_only=cache_only
)

test_inputs_for_prompt_strategy(
    ideal_completions,
    test_few_shot_low_prose,
    "few_shot_low_prose",
    throttle_time=0
)

Prompt: launch a ruby repl
Should be marked as destructive: False
Expected: irb
Actual: irb

===

Prompt: list files. one on each line. nothing else
Should be marked as destructive: False
Expected: ls -1
Actual: ls -1

===

Prompt: list untracked git files in src/
Should be marked as destructive: False
Expected: git ls-files --others --exclude-standard src/
Actual: git ls-files --others --exclude-standard src

===

Prompt: tree view of files in src/ using exa
Should be marked as destructive: False
Expected: exa --tree src/
Actual: exa -T src

===

Prompt: print the current working directory
Should be marked as destructive: False
Expected: pwd
Actual: pwd

===

Prompt: show the contents of a file named 'data.txt'
Should be marked as destructive: False
Expected: cat data.txt
Actual: cat data.txt

===

Prompt: create a new directory named 'docs'
Should be marked as destructive: False
Expected: mkdir docs
Actual: mkdir docs

===

Prompt: remove all files in the 'tmp' directory
Should be ma

### Low prose reflection

Ok seems like great output again and also it is marking things as destructive properly. Nice

In [325]:
destructive_comment = "# destructive\n"
results = []

# loop over ideal_completions with each item and the index
for index, example in enumerate(ideal_completions):
    result = {
        "input": example['description'],
        "expected": example['code'],
        "is_destructive": example['destructive'],
    }

    for prompt_label, data in prompts.items():
        completion = data['completions'][index]
        code = completion.choices[0].text
        marked_destructive = code.startswith(destructive_comment)
        if marked_destructive:
            code = code.replace(destructive_comment, "")
        result[f"{prompt_label}__actual"] = code
        result[f"{prompt_label}__marked_destructive"] = marked_destructive

    results.append(result)

results

[{'input': 'launch a ruby repl',
  'expected': 'irb',
  'is_destructive': False,
  'zero_shot__actual': '\nirb\n\n# DESCRIPTION: launch a python repl\n#',
  'zero_shot__marked_destructive': False,
  'few_shot__actual': 'irb\n\n# DESCRIPTION: print the contents of the file "',
  'few_shot__marked_destructive': False,
  'few_shot_with_stop_words__actual': 'irb',
  'few_shot_with_stop_words__marked_destructive': False,
  'few_shot_no_prose__actual': 'irb',
  'few_shot_no_prose__marked_destructive': False,
  'few_shot_low_prose__actual': 'irb',
  'few_shot_low_prose__marked_destructive': False},
 {'input': 'list files. one on each line. nothing else',
  'expected': 'ls -1',
  'is_destructive': False,
  'zero_shot__actual': 'ls -1\n\n# DESCRIPTION: list files. one on each',
  'zero_shot__marked_destructive': False,
  'few_shot__actual': 'ls -1\n\n# DESCRIPTION: list files. one on each',
  'few_shot__marked_destructive': False,
  'few_shot_with_stop_words__actual': 'ls -1',
  'few_shot_with_

In [326]:
expected_actual_pairs = []

actual_keys = [key for key in results[0].keys() if key.endswith("__actual")]

for result in results:
    actual_values = [result[key] for key in actual_keys]
    pairs = [(result['expected'], actual) for actual in actual_values]
    expected_actual_pairs.extend(pairs)

expected_actual_pairs = list(set(expected_actual_pairs))
expected_actual_pairs.sort()
mismatches = [pair for pair in expected_actual_pairs if pair[0] != pair[1]]
ambiguous_mismatches = [pair for pair in mismatches if not "# DESCRIPTION" in pair[1]]
for pair in ambiguous_mismatches:
    print(pair)

('cp -r src/* dest/', 'cp -r src/* dest')
('exa --tree src/', 'exa -T -a -l --git --git-ignore --')
('exa --tree src/', 'exa -T src')
('git ls-files --others --exclude-standard src/', 'git ls-files --others --exclude-standard --directory src')
('git ls-files --others --exclude-standard src/', 'git ls-files --others --exclude-standard src')
('git ls-files --others --exclude-standard src/', 'git ls-files --others --exclude-standard src/\n')
('grep "hello" text.txt', 'grep -i "hello" text.txt')
('grep "hello" text.txt', 'grep -o "hello" text.txt')
('mysql -u username -p mydb -e "TRUNCATE TABLE logs;"', 'mysql -u root -p -e "use mydb; trunc')
('mysql -u username -p mydb -e "TRUNCATE TABLE logs;"', 'mysql -u root -p mydb -e "')
('mysql -u username -p mydb -e "TRUNCATE TABLE logs;"', 'mysql -u root -p mydb -e "TRUNCATE TABLE logs"')
('pandoc -f markdown -t html -o doc.html doc.md', 'pandoc doc.md -o doc.html')
('pandoc -f markdown -t html -o doc.html doc.md', 'pandoc doc.md -o doc.html\n\n# 

In [327]:
# Hand curated
equivalent_outputs = [
    ('cp -r src/* dest/', 'cp -r src/* dest'),
    ('exa --tree src/', 'exa -T src'),
    ('git ls-files --others --exclude-standard src/', 'git ls-files --others --exclude-standard --directory src'),
    ('git ls-files --others --exclude-standard src/', 'git ls-files --others --exclude-standard src'),
    ('pandoc -f markdown -t html -o doc.html doc.md', 'pandoc doc.md -o doc.html'),
    ('pkill -f myscript.py', 'pkill myscript.py'),
    ('ps aux', 'ps -A'),
    ('psql -U username -d mydb -c "DELETE FROM users WHERE id = 123;"', 'psql -c "DELETE FROM users WHERE id'),
    ('psql -U username -d mydb -c "DELETE FROM users WHERE id = 123;"', 'psql -c "DELETE FROM users WHERE id = 123"'),
    ('rm -r old/', 'rm -r old'),
    ('tar -czf archive.tar.gz src/', 'tar -czvf src.tar.gz src')
]

In [328]:
import pandas as pd
df = pd.DataFrame(results)
df.head()

Unnamed: 0,input,expected,is_destructive,zero_shot__actual,zero_shot__marked_destructive,few_shot__actual,few_shot__marked_destructive,few_shot_with_stop_words__actual,few_shot_with_stop_words__marked_destructive,few_shot_no_prose__actual,few_shot_no_prose__marked_destructive,few_shot_low_prose__actual,few_shot_low_prose__marked_destructive
0,launch a ruby repl,irb,False,\nirb\n\n# DESCRIPTION: launch a python repl\n#,False,irb\n\n# DESCRIPTION: print the contents of th...,False,irb,False,irb,False,irb,False
1,list files. one on each line. nothing else,ls -1,False,ls -1\n\n# DESCRIPTION: list files. one on each,False,ls -1\n\n# DESCRIPTION: list files. one on each,False,ls -1,False,ls -1,False,ls -1,False
2,list untracked git files in src/,git ls-files --others --exclude-standard src/,False,git ls-files --others --exclude-standard --dir...,False,git ls-files --others --exclude-standard src/\n,False,git ls-files --others --exclude-standard src,False,git ls-files --others --exclude-standard src,False,git ls-files --others --exclude-standard src,False
3,tree view of files in src/ using exa,exa --tree src/,False,exa -T -a -l --git --git-ignore --,False,exa -T src/\n\n# DESCRIPTION: print the first,False,exa -T src,False,exa -T src,False,exa -T src,False
4,print the current working directory,pwd,False,pwd\n\n# DESCRIPTION: print the current workin...,False,pwd\n\n# DESCRIPTION: print the contents of th...,False,pwd,False,pwd,False,pwd,False


In [334]:
def evaluate_generation(expected, actual):
    if expected == actual:
        return True
    if (expected, actual) in equivalent_outputs:
        return True
    return False

for prompt_label in prompts.keys():
    df[f"{prompt_label}__match"] = df.apply(lambda row: evaluate_generation(row['expected'], row[f"{prompt_label}__actual"]), axis=1)
    df[f"{prompt_label}__destructive_match"] = df.apply(lambda row: row[f"{prompt_label}__marked_destructive"] == row['is_destructive'], axis=1)

df_match_results = df[["input"] + [f"{prompt_label}__match" for prompt_label in prompts.keys()] + ["expected", "few_shot_low_prose__actual"]]
df_match_results.columns = [col.replace("__match", "") for col in df_match_results.columns]

df_destructive_results = df[["input"] + [f"{prompt_label}__destructive_match" for prompt_label in prompts.keys()]]
df_destructive_results.columns = [col.replace("__destructive_match", "") for col in df_destructive_results.columns]

In [335]:
def color_cells(val):
    if not isinstance(val, bool):
        return ''
    color = 'green' if val else 'red'
    return 'background-color: %s' % color

df_match_results.style.applymap(color_cells).set_properties(**{'text-align': 'left'})

Unnamed: 0,input,zero_shot,few_shot,few_shot_with_stop_words,few_shot_no_prose,few_shot_low_prose,expected,few_shot_low_prose__actual
0,launch a ruby repl,False,False,True,True,True,irb,irb
1,list files. one on each line. nothing else,False,False,True,True,True,ls -1,ls -1
2,list untracked git files in src/,True,False,True,True,True,git ls-files --others --exclude-standard src/,git ls-files --others --exclude-standard src
3,tree view of files in src/ using exa,False,False,True,True,True,exa --tree src/,exa -T src
4,print the current working directory,False,False,True,True,True,pwd,pwd
5,show the contents of a file named 'data.txt',False,False,True,True,True,cat data.txt,cat data.txt
6,create a new directory named 'docs',False,False,True,True,True,mkdir docs,mkdir docs
7,remove all files in the 'tmp' directory,False,False,False,False,False,rm -r tmp/*,rm -r tmp
8,copy all files from 'src/' to 'dest/',False,False,True,True,True,cp -r src/* dest/,cp -r src/* dest
9,delete a file named 'archive.tar.gz',False,False,True,True,True,rm archive.tar.gz,rm archive.tar.gz


In [331]:
df_destructive_results.style.applymap(color_cells).set_properties(**{'text-align': 'left'})

Unnamed: 0,input,zero_shot,few_shot,few_shot_with_stop_words,few_shot_no_prose,few_shot_low_prose
0,launch a ruby repl,True,True,True,True,True
1,list files. one on each line. nothing else,True,True,True,True,True
2,list untracked git files in src/,True,True,True,True,True
3,tree view of files in src/ using exa,True,True,True,True,True
4,print the current working directory,True,True,True,True,True
5,show the contents of a file named 'data.txt',True,True,True,True,True
6,create a new directory named 'docs',True,True,True,True,True
7,remove all files in the 'tmp' directory,False,True,True,False,True
8,copy all files from 'src/' to 'dest/',True,True,True,True,True
9,delete a file named 'archive.tar.gz',False,True,True,False,True
