# Pilot GPT-3 transformation experiments

In [1]:
__author__ = "Christopher Potts"

## Set-up

In [2]:
import openai
import os
import json
import pandas as pd
import random
import utils

In [3]:
# Read the OpenAI API key from the environment; `os.getenv` yields None when
# OPENAI_API_KEY is unset.
# NOTE(review): `openai_key` is not referenced again in the visible cells --
# presumably `utils` or the `openai` package picks the key up on its own; confirm.
openai_key = os.getenv('OPENAI_API_KEY')

## Materials

In [4]:
def load_materials(filename="materials.txt", preposition="though", embedding=""):
    """Build PP/PiPP item dicts from a file holding one source line per item.

    Every line of `filename` is handed to `utils.item` together with
    `preposition` and `embedding`. From the mapping it returns, the first
    entry under 'PP (No Filler/No Gap)' and the first entry under
    'PiPP (Filler/Gap)' are kept.

    Parameters
    ----------
    filename : str
        Path of the text file to read.
    preposition : str
        Forwarded to `utils.item` and recorded on every item.
    embedding : str
        Forwarded to `utils.item` and recorded on every item.

    Returns
    -------
    list of dict
        One dict per input line with the keys 'PP', 'PiPP', 'embedding'
        and 'preposition'.
    """
    with open(filename) as handle:
        lines = handle.read().splitlines()

    def build(line):
        # `utils.item` supplies the candidate variants; only the first of
        # each kind is used.
        variants = utils.item(line, preposition=preposition, embedding=embedding)
        return {
            'PP': variants['PP (No Filler/No Gap)'][0],
            "PiPP": variants['PiPP (Filler/Gap)'][0],
            'embedding': embedding,
            'preposition': preposition,
        }

    return [build(line) for line in lines]

In [5]:
# Load the stress-test items as a list of row dicts, then tag each row with
# the same 'embedding' / 'preposition' fields the regular materials carry.
stress = pd.read_csv("materials-stress-test.csv", index_col=0).to_dict(orient="records")
for d in stress:
    source = d['PP'].lower()
    d.update(
        # Any occurrence of "that" in the PP sentence marks the row as embedded.
        embedding="embedding" if "that" in source else "",
        # Rows whose PP sentence lacks "though" are labelled "as".
        preposition="though" if "though" in source else "as",
    )

## Prompts

In [6]:
def prompt_template(demos, target):
    """Assemble the few-shot prompt for one target sentence.

    Parameters
    ----------
    demos : iterable of dict
        Demonstrations; each must provide 'PP' (shown as the Input) and
        'PiPP' (shown as the Output).
    target : str
        Sentence placed after the final "Input:" marker.

    Returns
    -------
    str
        The instructions, the demonstrations, the request line and the
        target, separated by blank lines and ending in "Output:" with no
        trailing whitespace.
    """
    header = (
        "You are an expert grammarian. "
        "Your task is to convert the Input example to a new Output by applying a transformation. "
        "Here are some examples of these transformations: "
    )
    shots = "\n\n".join(
        "Input: {}\nOutput: {}".format(demo['PP'], demo['PiPP']) for demo in demos
    )
    sections = [
        header,
        shots,
        "Now apply the transformation to this input:",
        f"Input: {target}\nOutput:",
    ]
    return "\n\n".join(sections)

In [7]:
# Build a small worked example: two hand-written demonstrations plus one
# target sentence, rendered into a single prompt string.
ex = prompt_template(
    demos=[
        {"PP": "Though they were happy, they said no.",
         "PiPP": "Happy though they were, they said no."},
        {"PP": "I am, though it seems odd, friends with a robot.",
         "PiPP": "I am, odd though it seems, friends with a robot."}
    ],
    target="Though they felt sad, they smiled."
)

In [8]:
# Show the rendered prompt exactly as it will be sent.
print(ex)

You are an expert grammarian. Your task is to convert the Input example to a new Output by applying a transformation. Here are some examples of these transformations: 

Input: Though they were happy, they said no.
Output: Happy though they were, they said no.

Input: I am, though it seems odd, friends with a robot.
Output: I am, odd though it seems, friends with a robot.

Now apply the transformation to this input:

Input: Though they felt sad, they smiled.
Output:


In [9]:
# Send the example prompt through `utils.run_gpt3` with temperature 0.0 and a
# 50-token cap; the response's 'gen_text' entry is inspected in the next cell.
ex_result = utils.run_gpt3([ex], engine="text-davinci-003", temperature=0.0, max_tokens=50)

In [10]:
# Raw generated text, before the stripping done by `get_prediction`.
ex_result['gen_text']

' Sad though they felt, they smiled.'

## Utilities

In [11]:
def get_demos(exs, exclude=set([]), k=4):
    cands = [ex for ex in exs if ex not in exclude]
    demos = random.sample(cands, k)
    return demos

In [12]:
def experiment(examples, name, demopool=None, k=4, engine="text-davinci-003"):
    """Run the transformation task on every example and save the results.

    For each item, `k` demonstrations are drawn from `demopool` (never the
    item itself), a prompt is built from the item's 'PP' sentence, and the
    model's stripped output is compared by exact match with the item's
    'PiPP' sentence.

    Parameters
    ----------
    examples : list of dict
        Items with at least the keys 'PP', 'PiPP', 'embedding' and
        'preposition'. Must be non-empty, since the output filename is
        derived from the first result.
    name : str
        Label used in the output filename.
    demopool : list of dict or None
        Pool to draw demonstrations from; defaults to `examples`.
    k : int
        Number of demonstrations per prompt.
    engine : str
        Engine name passed to `utils.run_gpt3` and recorded on each result.

    Returns
    -------
    list of dict
        One dict per example: the `utils.run_gpt3` response updated with the
        item's own fields plus 'engine', 'gold', 'prediction' and 'correct'.

    Side effects
    ------------
    Writes the results as JSON to
    ``results/transformations-{name}-{engine}-{preposition}-{emb}.json``,
    creating the ``results`` directory if it does not exist.
    """
    results = []
    demopool = examples if demopool is None else demopool
    for item in examples:
        demos = get_demos(demopool, [item], k)
        pipp = item['PiPP']
        prompt = prompt_template(demos, item['PP'])
        result = utils.run_gpt3([prompt], engine=engine, temperature=0.0, max_tokens=50)
        prediction = get_prediction(result)
        result.update(item)
        result['engine'] = engine
        result['gold'] = pipp
        result['prediction'] = prediction
        result['correct'] = pipp == prediction
        results.append(result)
    # Label the file from the data itself. The preposition used to come from
    # a module-level loop variable, which is undefined on a fresh kernel and
    # stale when the function is called outside that loop.
    prep = results[0]['preposition']
    emb = "embedding" if results[0]['embedding'] else "noembedding"
    # Make sure the target directory exists so completed API calls are not
    # lost to a FileNotFoundError at write time.
    os.makedirs("results", exist_ok=True)
    output_filename = os.path.join(
        "results", f"transformations-{name}-{engine}-{prep}-{emb}.json")
    with open(output_filename, "wt") as f:
        json.dump(results, f, indent=4, sort_keys=True)
    return results

def get_prediction(result):
    """Return the 'gen_text' entry of `result` with surrounding whitespace removed."""
    generated = result['gen_text']
    return generated.strip()

In [13]:
def assessment(results):
    """Summarize one experiment run.

    Parameters
    ----------
    results : list of dict
        Per-item results, each with a 'correct' flag and the keys 'engine',
        'preposition' and 'embedding'.

    Returns
    -------
    dict
        'Correct', 'Total' and 'Accuracy (EM)' for the run, followed by the
        'engine', 'preposition' and 'embedding' values of the first result.
    """
    n_correct = sum(row['correct'] for row in results)
    n_total = len(results)
    summary = {
        "Correct": n_correct,
        "Total": n_total,
        "Accuracy (EM)": n_correct / n_total,
    }
    # The run-level labels are read from the first result only.
    summary.update(
        (field, results[0][field]) for field in ("engine", "preposition", "embedding")
    )
    return summary

In [14]:
def get_errors(results):
    """Collect the items whose prediction differs from the gold output.

    Returns a list with one dict per mismatching result, reduced to the
    keys 'PP', 'prediction' and 'gold' (in the order they occur in the
    original result).
    """
    fields = {"PP", "prediction", "gold"}
    errors = []
    for row in results:
        if row['gold'] != row['prediction']:
            errors.append({name: value for name, value in row.items() if name in fields})
    return errors

## Experiments

### Regular materials

In [15]:
# Run the regular materials for every engine x preposition x embedding
# combination (2 engines, 3 prepositions, 2 embedding settings) and keep
# each run's list of per-item results. Each `experiment` call also writes
# its own JSON file under results/.
regular_results = []

for engine in ("text-davinci-001", "text-davinci-003"):
    for prep in ("as", "though", "asas"):
        for emb in ("", "they said that we knew that"):
            materials = load_materials(preposition=prep, embedding=emb)
            results = experiment(materials, "regular", engine=engine)
            regular_results.append(results)

In [16]:
# One summary row (counts, exact-match accuracy, condition labels) per run.
regular_assess = pd.DataFrame([assessment(r) for r in regular_results])

In [17]:
# Emit the condition labels and accuracy (rounded to two decimals) as a LaTeX table.
print(regular_assess[['engine', 'preposition', 'embedding', 'Accuracy (EM)']].round(2).to_latex(index=None))

\begin{tabular}{lllr}
\toprule
           engine & preposition &                    embedding &  Accuracy (EM) \\
\midrule
 text-davinci-001 &          as &                              &           0.70 \\
 text-davinci-001 &          as &  they said that we knew that &           0.64 \\
 text-davinci-001 &      though &                              &           0.48 \\
 text-davinci-001 &      though &  they said that we knew that &           0.55 \\
 text-davinci-001 &        asas &                              &           0.70 \\
 text-davinci-001 &        asas &  they said that we knew that &           0.88 \\
 text-davinci-003 &          as &                              &           0.67 \\
 text-davinci-003 &          as &  they said that we knew that &           0.64 \\
 text-davinci-003 &      though &                              &           0.70 \\
 text-davinci-003 &      though &  they said that we knew that &           0.70 \\
 text-davinci-003 &        asas &              

  print(regular_assess[['engine', 'preposition', 'embedding', 'Accuracy (EM)']].round(2).to_latex(index=None))


In [18]:
# Reload the saved results for the text-davinci-003 / asas / embedding run.
# NOTE(review): the name `best` suggests this was the top-scoring condition;
# the accuracy table above is truncated, so confirm against the full table.
with open("results/transformations-regular-text-davinci-003-asas-embedding.json") as f:
    best = json.load(f)

In [19]:
# Print the mismatching items of `best` as nested LaTeX `examples`
# environments: one outer \item per error, with one labelled inner \item
# for each kept field.
print("\\begin{examples}")
for d in get_errors(best):
    print("\\item\n\\begin{examples}")
    for k, v in d.items():
        print(f"\\item[{k}] {v}")
    print("\\end{examples}")
print("\\end{examples}")

\begin{examples}
\item
\begin{examples}
\item[PP] We liked the end of the movie, as they said that we knew that it was tragic.
\item[gold] We liked the end of the movie, as tragic as they said that we knew that it was.
\item[prediction] As tragic as they said that we knew that it was, we liked the end of the movie.
\end{examples}
\item
\begin{examples}
\item[PP] The proposal is still being assessed, as they said that we knew that it seemed inspired.
\item[gold] The proposal is still being assessed, as inspired as they said that we knew that it seemed.
\item[prediction] As inspired as they said that we knew that it seemed, the proposal is still being assessed.
\end{examples}
\item
\begin{examples}
\item[PP] They skipped the movie, as they said that we knew that it seemed exciting.
\item[gold] They skipped the movie, as exciting as they said that we knew that it seemed.
\item[prediction] As exciting as they said that we knew that it seemed, they skipped the movie.
\end{examples}
\end{examples}

### Stress test

#### With stress-test demos

In [20]:
# Run the stress-test items with both engines. No `demopool` is passed, so
# demonstrations are drawn from the other stress-test items.
stress_results = {}

for engine in ("text-davinci-001", "text-davinci-003"):
    stress_results[engine] = experiment(stress, "stresstest", engine=engine)

In [21]:
# Summary for text-davinci-001; the label fields come from the first stress item only.
assessment(stress_results["text-davinci-001"])

{'Correct': 1,
 'Total': 9,
 'Accuracy (EM)': 0.1111111111111111,
 'engine': 'text-davinci-001',
 'preposition': 'though',
 'embedding': ''}

In [22]:
# Summary for text-davinci-003; the label fields come from the first stress item only.
assessment(stress_results["text-davinci-003"])

{'Correct': 3,
 'Total': 9,
 'Accuracy (EM)': 0.3333333333333333,
 'engine': 'text-davinci-003',
 'preposition': 'though',
 'embedding': ''}

In [23]:
# Stress-test items that text-davinci-003 did not reproduce exactly.
stress_err = get_errors(stress_results["text-davinci-003"])

In [24]:
# Display the mismatches (input, gold output, model prediction).
stress_err

[{'PP': 'They are lost, though they seem happy, healthy, and in control.',
  'gold': 'They are lost, happy, healthy, and in control though they seem.',
  'prediction': 'Happy, healthy, and in control though they seem, they are lost.'},
 {'PP': 'As she was a top linguist, she was exceedingly patient.',
  'gold': 'Top linguist as she was, she was exceedingly patient.',
  'prediction': 'Exceedingly patient as she was a top linguist, she was.'},
 {'PP': 'Though they thought that the plan was silly, it actually worked well.',
  'gold': 'Silly as they thought that the plan was, it actually worked well.',
  'prediction': 'Silly though they thought that the plan was, it actually worked well.'},
 {'PP': 'Though they thought that the plan was too contrived to be taken seriously, it actually worked well.',
  'gold': 'Too contrived to be taken seriously though they thought that the plan was, it actually worked well.',
  'prediction': 'Too contrived as they thought that the plan was to be taken ser

#### With regular demos

In [25]:
# Repeat the stress test, this time drawing demonstrations from the regular
# materials: the pool is the concatenation of all six preposition x
# embedding material sets.
stress_results_with_basic_demos = {}

demopool = []
for prep in ("as", "though", "asas"):
    for emb in ("", "they said that we knew that"):
        demopool += load_materials(preposition=prep, embedding=emb)

for engine in ("text-davinci-001", "text-davinci-003"):
    stress_results_with_basic_demos[engine] = experiment(
        stress, "stresstest-basicdemo", demopool=demopool, engine=engine)

In [26]:
# Summary for text-davinci-003 when the demonstrations come from the regular materials.
assessment(stress_results_with_basic_demos["text-davinci-003"])

{'Correct': 2,
 'Total': 9,
 'Accuracy (EM)': 0.2222222222222222,
 'engine': 'text-davinci-003',
 'preposition': 'though',
 'embedding': ''}