In [1]:
# Make sure to create a '.SECRET' file with your API key!
from query import *

In [13]:
filename = 'batch-2.csv'
batch_name = filename.split('.')[0]

# Will only use the sentences
data = load_batch(filename)
data = [str(x[0]) for x in data]

In [34]:
# Get API key & configure OpenAI API
with open('.SECRET') as f:
    secret = f.read().strip()
openai.organization = "org-wAc6SximlF8c4j1M4IepuYrV"
openai.api_key = secret

In [35]:
def callGPT(prompt):
    response = openai.Completion.create(
        engine="text-davinci-003", 
        prompt=prompt, 
        temperature=1,
        top_p=0.96,
        frequency_penalty=0,
        presence_penalty=0,
        max_tokens=256)
    responses = [x['text'] for x in response['choices']]
    if len(responses) > 1:
        print(f"More than one response! {responses}")
    responses = responses[0]
    return responses

###  Zero-shot Eval

In [None]:
asset = load_asset('asset/dataset')
sample_asset(asset, num_samples=1)

[('Human skin hues can range from very dark brown to very pale pink.',
  'Human skin tone ranges from very dark brown to very pale pink.')]

In [36]:
print(prompt % data[0])
print(callGPT(prompt % data[0]))

Please rewrite the following complex sentence in order to make it easier to understand by non-native speakers of English. You can do so by replacing complex words with simpler synonyms (i.e. paraphrasing), deleting unimportant information (i.e. compression), and/or splitting a long complex sentence into several simpler ones. The final simplified sentence needs to be grammatical, fluent, and retain the main ideas of its original counterpart without altering its meaning.

The architecture of Winchester College is a diverse set of architectural styles, reflecting the multiple periods of building from the college's foundation in 1382, through additions in the medieval and Early Modern periods, to a major expansion of accommodation in the Victorian era and then further extensions at the turn of the 20th century and more recently.


Winchester College was established in 1382, and since then its architecture has been composed of a variety of styles, reflecting its expansion in the medieval, E

In [37]:
out = []
for sent in data:
    out.append(callGPT(prompt % sent))

In [38]:
out_cleaned = [sent.replace('\n', '') for sent in out]

In [39]:
with open(f'gpt-outputs/zero-shot-{batch_name}.json', 'w') as f:
    json.dump(out_cleaned, f, indent=4)

### Few-Shot Eval

In [42]:
gpt_input = create_few_shot_input(data[0])
print(gpt_input)
print(callGPT(gpt_input))

Please rewrite the following complex sentence in order to make it easier to understand by non-native speakers of English. You can do so by replacing complex words with simpler synonyms (i.e. paraphrasing), deleting unimportant information (i.e. compression), and/or splitting a long complex sentence into several simpler ones. The final simplified sentence needs to be grammatical, fluent, and retain the main ideas of its original counterpart without altering its meaning.

Examples: 
Input: Gable also earned an Academy Award nomination when he portrayed Fletcher Christian in 1935's Mutiny on the Bounty.
Output: Gable earned an Academy Award nomination for portraying Fletcher Christian in Mutiny on the Bounty.

Input: Pauline returned in the Game Boy remake of Donkey Kong in 1994, and later Mario vs. Donkey Kong 2: March of the Minis in 2006, although the character is now described as "Mario's friend".
Output: Pauline appeared again in the Game Boy remake of Donkey Kong in 1994, and later 

In [43]:
# Danger! Queries OpenAI ~$0.02 per sentence
out = []
for sent in data:
    gpt_input = create_few_shot_input(sent)
    out.append(callGPT(gpt_input))

In [44]:
# Clean outputs
out_cleaned = []
for sent in out:
    cleaned = sent
    if cleaned.startswith(' '):
        cleaned = cleaned[1:]
    if cleaned.startswith('\n\n'):
        cleaned = cleaned[2:]
    out_cleaned.append(cleaned)

In [45]:
with open(f'gpt-outputs/few-shot-{batch_name}.json', 'w') as f:
    json.dump(out_cleaned, f, indent=4)

### Query OpenAI with different sets of examples
The idea is to sample different datasets as the examples but use the same complex sentences. Theoretically the different demonstrations should impact the amount of compression.

In [21]:
simeval_sents[0]

{'original': "In late 2004, Suleman made headlines by cutting Howard Stern's radio show from four Citadel stations, citing Stern's frequent discussions regarding his upcoming move to Sirius Satellite Radio.",
 'simplification': "At the end of 2004, Suleman made headlines by cutting Howard Stern's radio show from four Citadel stations, saying that Stern's frequent discussions about his upcoming move to Sirius Satellite Radio were the reason why.",
 'system': 'systems/asset.test.simp'}

In [29]:
human_systems = set([
    'systems/asset.test.simp',
    'new_systems/turk_corpus_random.txt',
    'new_systems/simple_wiki.txt'
])

# Load data from simpeval
multi_dataset = load_multi_dataset(human_systems)

# Load batch 1 as our simplifications
multi_dataset += [{
    'original': x[0],
    'simplification': x[1],
    'system': 'new_systems/our_human_written'
} for x in load_batch('ourdata/batch-1-human.csv')]
human_systems.add('new_systems/our_human_written')

sample_multi_dataset(multi_dataset, 'systems/asset.test.simp', num_samples=2)

[('Brunstad has several fast food restaurants, a cafeteria-style restaurant, coffee bar, and its own grocery store.',
  'Brunstad is home to a few fast food restaurants, a cafeteria, coffee bar and a grocery store.'),
 ('It will then dislodge itself and sink back to the river bed in order to digest its food and wait for its next meal.',
  'It will then remove itself and sink back to the river bed in order to digest its food and wait for its next meal.')]

In [30]:
print(create_few_shot_input_multi_dataset(data[0], multi_dataset, 'new_systems/our_human_written'))

Please rewrite the following complex sentence in order to make it easier to understand by non-native speakers of English. You can do so by replacing complex words with simpler synonyms (i.e. paraphrasing), deleting unimportant information (i.e. compression), and/or splitting a long complex sentence into several simpler ones. The final simplified sentence needs to be grammatical, fluent, and retain the main ideas of its original counterpart without altering its meaning.

Examples: 
Input: The architecture of Winchester College is a diverse set of architectural styles, reflecting the multiple periods of building from the college's foundation in 1382, through additions in the medieval and Early Modern periods, to a major expansion of accommodation in the Victorian era and then further extensions at the turn of the 20th century and more recently.
Output: The buildings of Winchester College have diverse architectural styles. They reflect multiple periods of building from the college’s found

In [31]:
human_systems

{'new_systems/our_human_written',
 'new_systems/simple_wiki.txt',
 'new_systems/turk_corpus_random.txt',
 'systems/asset.test.simp'}

In [66]:
# Because of the amount of queries, I'm going through each system manually
system = 'new_systems/turk_corpus_random.txt'

In [67]:
# Danger! Queries OpenAI ~$0.02 per sentence
out = []
for sent in data:
    gpt_input = create_few_shot_input_multi_dataset(sent, multi_dataset, system)
    out.append(callGPT(gpt_input))

APIConnectionError: Error communicating with OpenAI

In [70]:
# Clean outputs
out_cleaned = []
for sent in out:
    cleaned = sent
    if cleaned.startswith(' '):
        cleaned = cleaned[1:]
    if cleaned.startswith('\n\n'):
        cleaned = cleaned[2:]
    out_cleaned.append(cleaned)

In [71]:
len(out_cleaned)

3

In [None]:
with open(f'gpt-outputs/cross-dataset-demonstrations/few-shot-{batch_name}-{system.replace("/", "-").replace(".", "-")}.json', 'w') as f:
    json.dump(out_cleaned, f, indent=4)

### Compare Multiple Prompts

In [95]:
prompts = {
"naive": """
Sentence: %s
Simplify the above sentence.
""",
"a_lot": """
Sentence: %s
Simplify the above sentence a lot.
""",
"content_a_lot": """
Sentence: %s
Simplify the content in the above sentence a lot.
""",
"as_possible": """
Sentence: %s
Simplify the above sentence as much as possible.
""",
"asset_instructions": """
You are given a sentence that need to be rewritten so that they use simpler English. This means that you should reduce the number of difficult words or idioms, simplify complex phrasing, delete information that may not be relevant, and make the sentence more straight-forward. This could be accomplished by applying different transformations to the original sentences. In this task, we ask you to use paraphrasing, compression and/or sentence splitting.

Sentence: %s
Simplified sentence: """
}

In [96]:
# Example using the first prompt
sentence = "The proletarians have nothing to lose but their chains."
print(prompts[2][1:].replace('%s', sentence))

KeyError: 2

In [None]:
# sent = data[10]
# prompt_id = 3
# prompt = prompts[prompt_id][1:].replace('%s', sent)

# Danger! This costs ~$0.3 for 20 sentences
out = []
for prompt in prompts:
    prompt_out = []
    for sent in data:
        prompt_out.append(callGPT(prompt[1:].replace('%s', sent)))
    out.append(prompt_out)

In [None]:
out_new = {}
for i in range(len(out)):
    prompt_out_new = []
    for sent in out[i]:
        if len(sent) > 0 and sent[0] == '\n':
            sent = sent[1:]
        prompt_out_new.append(sent)
    out_new[list(prompts.keys())[i]] = prompt_out_new

In [None]:
import json
with open('gpt3-output.json', 'w') as f:
    json.dump(out_new, f)

## Split Sentences for Annotators

In [None]:
# The problem with this is that it may randomly assign the same sentences
# to an annotator. This isn't easy to fix...

In [None]:
# Create split of data for annotators
annotators = [
    'anton', 'ayush', 'kelly', 'rachel', 'vinayak', 'vishnesh'
]

out = {}
for ann in annotators:
    out[ann] = []

seen = {}
tmp = copy.deepcopy(data)

SENT_PER_ANN = int(len(data)*2 / len(annotators))
if (len(data)*2) % len(annotators):
    print("Sentences are non-divisible I think...")

ann_pointer = 0
while len(tmp) != 0:
    sent = random.choice(tmp)
    if sent in out[annotators[ann_pointer]]:
        print('seen before')

    if sent not in seen:
        seen[sent] = 0
    seen[sent] += 1
    out[annotators[ann_pointer]].append(sent)
    ann_pointer += 1
    if ann_pointer == len(annotators):
        ann_pointer = 0
    if seen[sent] == 2:
        tmp.remove(sent)

# Sanity check: annotator with duplicate sentences, should be all
# empty
import collections
sents = [x for x in out.values()]
for a in sents:
    print([count for item, count in collections.Counter(a).items() if count > 1])

[]
[]
[]
[]
[]
[]


In [None]:
# Final sentence split per annotator
for key in out.keys():
    print(key, len(out[key]))

anton 10
ayush 10
kelly 10
rachel 10
vinayak 10
vishnesh 10


In [None]:
annotator = 'vishnesh'

clip = ""
for sent in out[annotator]:
    clip += f'{sent}\n\n\n\n'

import pyperclip
pyperclip.copy(clip)