# What does GPT-3 really understand about negation?

In [1]:
__author__ = "Christopher Potts"

__date__ = "2023-01-10"

## Set-up

In [2]:
import glob
import openai
import os
import pandas as pd
import re


from tenacity import retry, stop_after_attempt, wait_random_exponential

In [3]:
# Add you API key here to re-run the experiments:

openai.api_key = None

In [4]:
pd.set_option('max_colwidth', 999)
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

## Dataset

In [5]:
def load_dataset(dirname="temp/monli/scone"):
    filenames = glob.glob(f"{dirname}/nmonli_test_edited_full_*.csv")
    dfs = []
    for filename in filenames:
        split = os.path.basename(filename).replace("nmonli_test_edited_full_", "").replace(".csv", "")
        df = pd.read_csv(filename, index_col=0)
        df['split'] = split
        dfs.append(df)
    return pd.concat(dfs)
    
DATASET = load_dataset()

In [6]:
sentence1_colname = 'sentence1_edited'
sentence2_colname = 'sentence2_edited'
gold_colname = 'gold_label_edited'

In [7]:
DATASET = DATASET[['split', sentence1_colname, sentence2_colname, gold_colname]]

In [8]:
overview = DATASET.groupby('split').apply(lambda s: s.groupby(gold_colname).sample(1, random_state=1)).reset_index(drop=True)
overview = overview[['split', sentence1_colname, gold_colname, sentence2_colname]]

In [9]:
# LaTeX version of the above for the write-up:

print(overview.to_latex(index=None))

\begin{tabular}{llll}
\toprule
             split &                                                                              sentence1\_edited & gold\_label\_edited &                                                                                   sentence2\_edited \\
\midrule
    both\_not\_scope &                           A dog not weaing a collar caught a basketball moving not very fast. &        entailment &                                      A dog not weaing a collar caught a ball moving not very fast. \\
    both\_not\_scope &                      The girl, not the boy, got a stuffed dog as a gift that was not wrapped. &           neutral &                      The girl, not the boy, got a stuffed malamute as a gift that was not wrapped. \\
        double\_neg &                                            It is a lie that dog did not catch any basketball. &        entailment &                                                       It is a lie that dog did not catch any ball

In [10]:
DATASET[gold_colname].value_counts()

entailment    400
neutral       400
Name: gold_label_edited, dtype: int64

In [11]:
print(DATASET.split.value_counts().to_latex())

\begin{tabular}{lr}
\toprule
{} &  split \\
\midrule
both\_not\_scope    &    200 \\
one\_scope\_one\_not &    200 \\
double\_neg        &    200 \\
not\_scoped        &    200 \\
\bottomrule
\end{tabular}



In [12]:
print(pd.crosstab(DATASET.split, DATASET[gold_colname]).to_latex())

\begin{tabular}{lrr}
\toprule
gold\_label\_edited &  entailment &  neutral \\
split             &             &          \\
\midrule
both\_not\_scope    &         100 &      100 \\
double\_neg        &         100 &      100 \\
not\_scoped        &         100 &      100 \\
one\_scope\_one\_not &         100 &      100 \\
\bottomrule
\end{tabular}



## Prompts

In [13]:
toy_example = {sentence1_colname: "We didn't eat pizza", sentence2_colname: "We didn't eat food", 'split': 'one_scope_one_not'}

### Conditional questions

In [14]:
def conditional_question_prompt(row, i=None):   
    prem = _mid_sentence_normalization(row[sentence1_colname])
    hyp = _mid_sentence_normalization(row[sentence2_colname])    
    if i is None:
        prompt = f"Is it true that if {prem}, then {hyp}?"
    # Few-shot demonstration case:
    else:
        ans = "Yes" if row[gold_colname] == 'entailment' else "Maybe"
        prompt = f"Q{i}: Is it true that if {prem}, then {hyp}?\nA{i}: {ans}"
    return prompt


def _mid_sentence_normalization(s):
    s = s[0].lower() + s[1: ]
    s = s.rstrip(".")
    return s

In [15]:
print(conditional_question_prompt(toy_example))

Is it true that if we didn't eat pizza, then we didn't eat food?


### Few-shot conditional questions

In [16]:
def few_shot_conditional_question_prompt(row, n=3):
    splitname = row['split']
    demos = _sample_demos(row, n)
    strs = [conditional_question_prompt(row, i+1) for i, row in demos.iterrows()]
    strs.append("Q: " + conditional_question_prompt(row) + "\nA:")
    return "\n\n".join(strs)


def _sample_demos(row, n):
    # Demonstrations are different examples from the same split,
    # and we get 2 per label:
    split = DATASET[
        (DATASET.split == row['split']) & 
        (DATASET[sentence1_colname] != row[sentence1_colname]) & 
        (DATASET[sentence2_colname] != row[sentence2_colname])
    ]        
    demos = split.groupby(gold_colname).sample(2).reset_index()
    return demos

In [17]:
print(few_shot_conditional_question_prompt(toy_example))

Q1: Is it true that if a not so tall person reading a paper is not currently sitting inside a building, then a not so tall person reading a paper is not currently sitting inside a cinema?
A1: Yes

Q2: Is it true that if a not so tall person reading a paper is not currently sitting inside a building, then a not so tall person reading a paper is not currently sitting inside a cathedral?
A2: Yes

Q3: Is it true that if a not so tall person reading a paper is not currently sitting inside a manor, then a not so tall person reading a paper is not currently sitting inside a building?
A3: Maybe

Q4: Is it true that if the girl will not get a stuffed dog as a gift, but not because she failed the exam, then the girl will not get a stuffed mammal as a gift, but not because she failed the exam?
A4: Maybe

Q: Is it true that if we didn't eat pizza, then we didn't eat food?
A:


### Hypothesis questions

In [18]:
def hypothesis_question_prompt(row, i=None):   
    prem = _mid_sentence_normalization(row[sentence1_colname])
    hyp = _mid_sentence_normalization(row[sentence2_colname])    
    if i is None:
        prompt = f"Assume that {prem}. Is it then definitely true that {hyp}? Answer Yes or No."
    # Few-shot demonstration case:
    else:
        ans = "Yes" if row[gold_colname] == 'entailment' else "No"
        prompt = f"Q{i}: Assume that {prem}. Is it then definitely true that {hyp}? Answer Yes or No.\nA{i}: {ans}"
    return prompt

In [19]:
print(hypothesis_question_prompt(toy_example))

Assume that we didn't eat pizza. Is it then definitely true that we didn't eat food? Answer Yes or No.


### Few-shot hypothesis questions

In [20]:
def few_shot_hypothesis_question_prompt(row, n=3):
    splitname = row['split']
    demos = _sample_demos(row, n)
    strs = [hypothesis_question_prompt(row, i+1) for i, row in demos.iterrows()]
    strs.append("Q: " + hypothesis_question_prompt(row) + "\nA:")
    return "\n\n".join(strs)

In [21]:
print(few_shot_hypothesis_question_prompt(toy_example))

Q1: Assume that the players who did not score did not have a ball. Is it then definitely true that the players who did not score did not have a basketball? Answer Yes or No.
A1: Yes

Q2: Assume that the man does not own a mammal and does not own a cat. Is it then definitely true that the man does not own a dog and does not own a cat? Answer Yes or No.
A2: Yes

Q3: Assume that a dog not on the playground did not catch any racquetball. Is it then definitely true that a dog not on the playground did not catch any ball? Answer Yes or No.
A3: No

Q4: Assume that the man does not own a rottweiler and does not own a cat. Is it then definitely true that the man does not own a dog and does not own a cat? Answer Yes or No.
A4: No

Q: Assume that we didn't eat pizza. Is it then definitely true that we didn't eat food? Answer Yes or No.
A:


### Conditional truth evaluation

In [22]:
def conditional_truth_evaluation_prompt(row, i=None):
    prem = _mid_sentence_normalization(row[sentence1_colname])
    hyp = _mid_sentence_normalization(row[sentence2_colname])
    if i is None:    
        prompt = f"If {prem}, then {hyp}. Is this true?" 
    # Few-shot demonstration case:
    else:
        ans = "Yes" if row[gold_colname] == 'entailment' else "Maybe"
        prompt = f"C{i}: If {prem}, then {hyp}. Is this true?\nA{i}: {ans}"
    return prompt

In [23]:
print(conditional_truth_evaluation_prompt(toy_example))

If we didn't eat pizza, then we didn't eat food. Is this true?


In [24]:
def few_shot_conditional_truth_evaluation_prompt(row, n=3):
    splitname = row['split']
    demos = _sample_demos(row, n)
    strs = [conditional_truth_evaluation_prompt(row, i+1) for i, row in demos.iterrows()]
    strs.append("C:" + conditional_truth_evaluation_prompt(row) + "\nA:")
    return "\n\n".join(strs)

### Few-shot conditional truth evaluation

In [25]:
print(few_shot_conditional_truth_evaluation_prompt(toy_example))

C1: If the man does not own a dog and does not own a cat, then the man does not own a maltese and does not own a cat. Is this true?
A1: Yes

C2: If the man does not own a dog and does not own a cat, then the man does not own a terrier and does not own a cat. Is this true?
A2: Yes

C3: If a not so tall person reading a paper is not currently sitting inside a hotel, then a not so tall person reading a paper is not currently sitting inside a building. Is this true?
A3: Maybe

C4: If a not so tall person reading a paper is not currently sitting inside a steakhouse, then a not so tall person reading a paper is not currently sitting inside a building. Is this true?
A4: Maybe

C:If we didn't eat pizza, then we didn't eat food. Is this true?
A:


### Brown et al.-style

Adapted from https://arxiv.org/abs/2005.14165

In [26]:
def brown_et_al_style_prompt(row, i=None):
    prem = row[sentence1_colname]
    hyp = row[sentence2_colname]
    hyp = hyp.rstrip(".")
    if i is None:
        prompt = f"C: {prem}\nQ: {hyp}. Yes, No, or Maybe?"
    # Few-shot demonstration case:
    else:
        ans = "Yes" if row[gold_colname] == 'entailment' else "Maybe"
        prompt = f"C{i}: {prem}\nQ{i}: {hyp}. Yes, No, or Maybe?\nA{i+1}: {ans}"
    return prompt

In [27]:
print(brown_et_al_style_prompt(toy_example))

C: We didn't eat pizza
Q: We didn't eat food. Yes, No, or Maybe?


### Few-shot Brown et al.-style

In [28]:
def few_shot_brown_et_al_style_prompt(row, n=3):
    demos = _sample_demos(row, n) 
    strs = [brown_et_al_style_prompt(row, i+1) for i, row in demos.iterrows()]
    strs.append(brown_et_al_style_prompt(row) + "\nA:")
    return "\n\n".join(strs)   

In [29]:
print(few_shot_brown_et_al_style_prompt(toy_example))

C1: The girl will not get a stuffed dog as a gift, but not because she failed the exam.
Q1: The girl will not get a stuffed hound as a gift, but not because she failed the exam. Yes, No, or Maybe?
A2: Yes

C2: The girl will not get a stuffed dog as a gift, but not because she failed the exam.
Q2: The girl will not get a stuffed chihuahua as a gift, but not because she failed the exam. Yes, No, or Maybe?
A3: Yes

C3: A dog not on the playground did not catch any baseball.
Q3: A dog not on the playground did not catch any ball. Yes, No, or Maybe?
A4: Maybe

C4: The girl will not get a stuffed beagle as a gift, but not because she failed the exam.
Q4: The girl will not get a stuffed dog as a gift, but not because she failed the exam. Yes, No, or Maybe?
A5: Maybe

C: We didn't eat pizza
Q: We didn't eat food. Yes, No, or Maybe?
A:


### Structured

In [30]:
def structured_prompt(row, i=None):
    prem = row[sentence1_colname]
    hyp = row[sentence2_colname]    
    if i is None:    
        prompt = f"P: {prem}\nH: {hyp}\nL:"
    # Few-shot demonstration case:
    else:
        prompt = f"P{i}: {prem}\nH{i}: {hyp}\nL{i}: {row[gold_colname]}"
    return prompt

In [31]:
print(structured_prompt(toy_example))

P: We didn't eat pizza
H: We didn't eat food
L:


### Few-shot structured

In [32]:
def few_shot_structured_prompt(row, n=3):
    demos = _sample_demos(row, n)    
    strs = [structured_prompt(row, i+1) for i, row in demos.iterrows()]
    strs.append(structured_prompt(row))
    return "\n\n".join(strs) 

In [33]:
print(few_shot_structured_prompt(toy_example))

P1: the man does not own a dog and does not own a cat.
H1: the man does not own a malamute and does not own a cat.
L1: entailment

P2: The girl will not get a stuffed dog as a gift, but not because she failed the exam.
H2: The girl will not get a stuffed dachshund as a gift, but not because she failed the exam.
L2: entailment

P3: A not so tall person reading a paper is not currently sitting inside a hospital.
H3: A not so tall person reading a paper is not currently sitting inside a building.
L3: neutral

P4: A dog not on the playground did not catch any baseball.
H4: A dog not on the playground did not catch any ball.
L4: neutral

P: We didn't eat pizza
H: We didn't eat food
L:


### Reasoning

Loosely inspired by https://arxiv.org/pdf/2102.07350.pdf, https://arxiv.org/pdf/2201.11903.pdf, and others.

In [34]:
def reasoning_prompt(row):
    prem = row[sentence1_colname]
    hyp = row[sentence2_colname]
    prem = _mid_sentence_normalization(prem)
    hyp = _mid_sentence_normalization(hyp)
    prompt = f"""Logical and commonsense reasoning exam.

Explain your reasoning in detail, then answer with Yes or No. Your answers should follow this 4-line format:

Premise: <a tricky logical statement about the world>.
Question: <question requiring logical deduction>.
Reasoning: <an explanation of what you understand about the possible scenarios>.
Answer: <Yes or No>.

Premise: {prem}
Question: Can we logically conclude for sure that {hyp}?
Reasoning: Let's think logically step by step. The premise basically tells us that"""
    return prompt

In [35]:
print(reasoning_prompt(toy_example))

Logical and commonsense reasoning exam.

Explain your reasoning in detail, then answer with Yes or No. Your answers should follow this 4-line format:

Premise: <a tricky logical statement about the world>.
Question: <question requiring logical deduction>.
Reasoning: <an explanation of what you understand about the possible scenarios>.
Answer: <Yes or No>.

Premise: we didn't eat pizza
Question: Can we logically conclude for sure that we didn't eat food?
Reasoning: Let's think logically step by step. The premise basically tells us that


## Label inference

In [36]:
def run_gpt3(prompts, engine="text-davinci-002", temperature=0.0, batch_size=10, max_tokens=200):    
    all_responses = []    
    for i in range(0, len(prompts), batch_size):        
        response = completion_with_backoff(
            engine=engine,       
            prompt=prompts[i: i+batch_size],
            temperature=temperature,
            echo=False,
            max_tokens=max_tokens,
            n=1)
        # We'll keep just the response texts:
        all_responses += [d['text'].strip() for d in response['choices']]
    return all_responses

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def completion_with_backoff(**kwargs):
    return openai.Completion.create(**kwargs)

## Inferring labels

In [37]:
def infer_label(s):
    yes = re.compile(r"\byes\b", re.I | re.M)     
    if yes.search(s):
        return "entailment"
    else:
        return "neutral" 

## Experiment wrapper

In [38]:
def run_experiment(samp, prompt_func, engine, max_tokens=200):
    prompt_func_name = prompt_func.__name__        
    samp[prompt_func_name] = samp.apply(prompt_func, axis=1)    
    responses = run_gpt3(list(samp[prompt_func_name].values), engine=engine, max_tokens=max_tokens)
    response_key = engine + "_" + prompt_func_name + "_response"
    pred_key = engine + "_" + prompt_func_name + "_prediction"    
    samp[response_key] = responses    
    samp[pred_key] = samp[response_key].apply(infer_label)    
    return samp

## Results reporting

In [39]:
def report_results(samp, prompt_func, engine):
    accurate = samp[engine + "_" + prompt_func.__name__ + "_prediction"] == samp[gold_colname]
    results = pd.crosstab(samp.split, accurate)
    results.loc['All'] = results.sum(axis=0)    
    acc = results.apply(lambda row: row[True] / row.sum(), axis=1)        
    results['Accuracy'] = acc
    # Some clean-up for the LaTeX output:
    results = results.rename(columns={True: "Correct", False: "Incorrect"})
    results.columns.name = None
    results.index.name = None
    return results

## Experiments

### Prompt conditions

In [40]:
prompt_funcs = (
    conditional_question_prompt,
    few_shot_conditional_question_prompt,
    hypothesis_question_prompt,
    few_shot_hypothesis_question_prompt,
    conditional_truth_evaluation_prompt,
    few_shot_conditional_truth_evaluation_prompt,
    brown_et_al_style_prompt,
    few_shot_brown_et_al_style_prompt,
    structured_prompt,
    few_shot_structured_prompt,
    reasoning_prompt
)

### Experiment loop

In [41]:
engines = ("text-davinci-002", "text-davinci-003")

In [42]:
for engine in engines:    
    for prompt_func in prompt_funcs:
        print(engine, prompt_func.__name__)
        DATASET = run_experiment(DATASET, prompt_func, max_tokens=200, engine=engine)

text-davinci-002 conditional_question_prompt
text-davinci-002 few_shot_conditional_question_prompt
text-davinci-002 hypothesis_question_prompt
text-davinci-002 few_shot_hypothesis_question_prompt
text-davinci-002 conditional_truth_evaluation_prompt
text-davinci-002 few_shot_conditional_truth_evaluation_prompt
text-davinci-002 brown_et_al_style_prompt
text-davinci-002 few_shot_brown_et_al_style_prompt
text-davinci-002 structured_prompt
text-davinci-002 few_shot_structured_prompt
text-davinci-002 reasoning_prompt
text-davinci-003 conditional_question_prompt
text-davinci-003 few_shot_conditional_question_prompt
text-davinci-003 hypothesis_question_prompt
text-davinci-003 few_shot_hypothesis_question_prompt
text-davinci-003 conditional_truth_evaluation_prompt
text-davinci-003 few_shot_conditional_truth_evaluation_prompt
text-davinci-003 brown_et_al_style_prompt
text-davinci-003 few_shot_brown_et_al_style_prompt
text-davinci-003 structured_prompt
text-davinci-003 few_shot_structured_prompt


### Results

These are formatted for dirct use in the report.

In [43]:
for prompt_func in prompt_funcs:
    for engine in engines:
        print("%" * 70, end="\n")    
        sechead = prompt_func.__name__.replace("_", " ").title().replace("Few Shot", "Few-Shot")
        print(f"\\subsection{{{engine}: {sechead}}}",  end="\n\n") 
        p = prompt_func(toy_example)
        p = re.sub(r"\n", r"\\mynewline\n", p, re.M)         
        print(f"\\promptExample{{{p}}}", end="\n\n")        
        print("\\begin{center}")
        print(report_results(DATASET, prompt_func, engine).to_latex(float_format="%.2f"), end="")
        print("\\end{center}", end="\n\n")

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{text-davinci-002: Conditional Question Prompt}

\promptExample{Is it true that if we didn't eat pizza, then we didn't eat food?}

\begin{center}
\begin{tabular}{lrrr}
\toprule
{} &  Incorrect &  Correct &  Accuracy \\
\midrule
both\_not\_scope    &         70 &      130 &      0.65 \\
double\_neg        &         99 &      101 &      0.51 \\
not\_scoped        &         91 &      109 &      0.55 \\
one\_scope\_one\_not &        100 &      100 &      0.50 \\
All               &        360 &      440 &      0.55 \\
\bottomrule
\end{tabular}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{text-davinci-003: Conditional Question Prompt}

\promptExample{Is it true that if we didn't eat pizza, then we didn't eat food?}

\begin{center}
\begin{tabular}{lrrr}
\toprule
{} &  Incorrect &  Correct &  Accuracy \\
\midrule
both\_not\_scope    &         37 &      163 &   

## Error analysis

In [46]:
def error_analysis(samp, prompt_func, engine, splitname):
    prompt_func_name = prompt_func.__name__
    pred_colname = engine + "_" + prompt_func_name + "_prediction"
    response_colname = engine + "_" + prompt_func_name + "_response"
    err_df = samp[(samp[pred_colname] != samp[gold_colname]) & (samp.split == splitname)]
    return err_df[[sentence1_colname, gold_colname, sentence2_colname, prompt_func_name, pred_colname, response_colname]]

In [49]:
#error_analysis(DATASET, reasoning_prompt, "text-davinci-002", "one_scope_one_not")

In [50]:
DATASET.to_json("scone-davinci-results.json", orient='records', indent=4)