In [87]:
import json
import random
import openai
import csv
import copy

### Sample ASSET Data

In [18]:
# Loads ASSET's 10 simplifications
def load_asset(path):
    with open(f'{path}/asset.test.orig', encoding='utf-8') as f:
        orig = f.read().splitlines()

    simplifications = []
    for i in range(10):
        with open(f'{path}/asset.test.simp.{i}', encoding='utf-8') as f:
            simplifications.append(f.read().splitlines())

    asset = {}

    counter = 0
    for sent in range(len(orig)):
        asset[counter] = {
            'original': orig[sent],
            'simplifications': [x[counter] for x in simplifications]
        }
        counter += 1
    
    return asset

# Randomly sample 5 ASSET sentences + 1 of their simplifications
def sample_asset(asset, num_samples=5):
    out = []
    for sample in random.sample(list(asset.items()), num_samples):
        out.append((sample[1]['original'], random.choice(sample[1]['simplifications'])))
    return out

asset = load_asset('asset/dataset')
sample_asset(asset, num_samples=1)

[('He settled in London, devoting himself chiefly to practical teaching.',
  'He lived in London. He was a teacher.')]

### Query OpenAI

In [82]:
# Get API key & configure OpenAI API
with open('.SECRET') as f:
    secret = f.read().strip()
openai.organization = "org-wAc6SximlF8c4j1M4IepuYrV"
openai.api_key = secret

# Load data
with open('text-simplification-new-wikipedia.csv', encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader)
    data = [row for row in reader]

# Will only use the sentences
data = [str(x[0]) for x in data]

In [102]:
def callGPT(prompt):
    response = openai.Completion.create(
        engine="text-davinci-002", 
        prompt=prompt, 
        temperature=1,
        top_p=0.96,
        frequency_penalty=0,
        presence_penalty=0,
        max_tokens=256)
    responses = [x['text'] for x in response['choices']]
    if len(responses) > 1:
        print(f"More than one response! {responses}")
    responses = responses[0]
    return responses

In [103]:
# sample_prompt = """
# Sentence: The library needs to be configured with your account's secret key.
# Simplify the above sentence.
# """

# callGPT(sample_prompt)

In [104]:
prompt = "Please rewrite the following complex sentence in order to make it easier to understand by non-native speakers of English. You can do so by replacing complex words with simpler synonyms (i.e. paraphrasing), deleting unimportant information (i.e. compression), and/or splitting a long complex sentence into several simpler ones. The final simplified sentence needs to be grammatical, fluent, and retain the main ideas of its original counterpart without altering its meaning.\n\n%s"
print(prompt % data[0])
print(callGPT(prompt % data[0]))

Please rewrite the following complex sentence in order to make it easier to understand by non-native speakers of English. You can do so by replacing complex words with simpler synonyms (i.e. paraphrasing), deleting unimportant information (i.e. compression), and/or splitting a long complex sentence into several simpler ones. The final simplified sentence needs to be grammatical, fluent, and retain the main ideas of its original counterpart without altering its meaning.

The architecture of Winchester College is a diverse set of architectural styles, reflecting the multiple periods of building from the college's foundation in 1382, through additions in the medieval and Early Modern periods, to a major expansion of accommodation in the Victorian era and then further extensions at the turn of the 20th century and more recently.


The architecture of Winchester College is a diverse set of architectural styles. It reflects the multiple periods of building from the college's foundation in 13

In [105]:
out = []
for sent in data:
    out.append(callGPT(prompt % sent))

In [106]:
out_cleaned = [sent.replace('\n', '') for sent in out]

In [107]:
with open('gpt3-output.json', 'w') as f:
    json.dump(out_cleaned, f, indent=4)

### Query OpenAI with Few-Shot

In [None]:
few_shot_template = """
Examples: %s
"""

example_template = """
Input: %s
Output: %s
"""

input_template = """
Input: %s
Output:
"""

In [None]:
print(few_shot_template % (example_template % sample_asset(asset, num_samples=1)[0]))


Examples: 
Input: The spacecraft consists of two main elements: the NASA Cassini orbiter, named after the Italian-French astronomer Giovanni Domenico Cassini, and the ESA Huygens probe, named after the Dutch astronomer, mathematician and physicist Christiaan Huygens.
Output: The spacecraft's two main elements are the NASA Cassini orbiter, named after the Italian-French astronomer Giovanni Domenico Cassini, and the ESA Huygens probe, named after the Dutch astronomer, mathematician and physicist Christiaan Huygens.




In [154]:
def create_few_shot_input(sent, num_samples=5):
    example_text = ""
    for example in sample_asset(asset, num_samples=num_samples):
        example_text += (example_template % example)
    return (prompt[:-3] + (few_shot_template % example_text) + prompt[:-3] + (input_template % sent)[:-1])
gpt_input = create_few_shot_input(data[0])
print(gpt_input)
print(callGPT(gpt_input))

Please rewrite the following complex sentence in order to make it easier to understand by non-native speakers of English. You can do so by replacing complex words with simpler synonyms (i.e. paraphrasing), deleting unimportant information (i.e. compression), and/or splitting a long complex sentence into several simpler ones. The final simplified sentence needs to be grammatical, fluent, and retain the main ideas of its original counterpart without altering its meaning.

Examples: 
Input: Tajikistan, Turkmenistan and Uzbekistan border Afghanistan to the north, Iran to the west, Pakistan to the south and the People's Republic of China to the east.
Output: Tajikistan, Turkmenistan and Uzbekistan border Afghanistan to the north. They also border Iran to the west, Pakistan to the south and the People's Republic of China to the east.

Input: The two former presidents were later separately charged with mutiny and treason for their roles in the 1979 coup and the 1980 Gwangju massacre.
Output: 

In [147]:
# Danger! Queries OpenAI ~$0.02 per sentence
out = []
for sent in data:
    gpt_input = create_few_shot_input(sent)
    out.append(callGPT(gpt_input))

In [151]:
out_cleaned = []
for sent in out:
    cleaned = sent
    if cleaned.startswith(' '):
        cleaned = cleaned[1:]
    if cleaned.startswith('\n\n'):
        cleaned = cleaned[2:]
    out_cleaned.append(cleaned)

In [152]:
with open('gpt3-few-shot-output.json', 'w') as f:
    json.dump(out_cleaned, f, indent=4)

### Compare Multiple Prompts

In [95]:
prompts = {
"naive": """
Sentence: %s
Simplify the above sentence.
""",
"a_lot": """
Sentence: %s
Simplify the above sentence a lot.
""",
"content_a_lot": """
Sentence: %s
Simplify the content in the above sentence a lot.
""",
"as_possible": """
Sentence: %s
Simplify the above sentence as much as possible.
""",
"asset_instructions": """
You are given a sentence that need to be rewritten so that they use simpler English. This means that you should reduce the number of difficult words or idioms, simplify complex phrasing, delete information that may not be relevant, and make the sentence more straight-forward. This could be accomplished by applying different transformations to the original sentences. In this task, we ask you to use paraphrasing, compression and/or sentence splitting.

Sentence: %s
Simplified sentence: """
}

In [96]:
# Example using the first prompt
sentence = "The proletarians have nothing to lose but their chains."
print(prompts[2][1:].replace('%s', sentence))

KeyError: 2

In [None]:
# sent = data[10]
# prompt_id = 3
# prompt = prompts[prompt_id][1:].replace('%s', sent)

# Danger! This costs ~$0.3 for 20 sentences
out = []
for prompt in prompts:
    prompt_out = []
    for sent in data:
        prompt_out.append(callGPT(prompt[1:].replace('%s', sent)))
    out.append(prompt_out)

In [None]:
out_new = {}
for i in range(len(out)):
    prompt_out_new = []
    for sent in out[i]:
        if len(sent) > 0 and sent[0] == '\n':
            sent = sent[1:]
        prompt_out_new.append(sent)
    out_new[list(prompts.keys())[i]] = prompt_out_new

In [None]:
import json
with open('gpt3-output.json', 'w') as f:
    json.dump(out_new, f)

## Split Sentences for Annotators

In [None]:
# The problem with this is that it may randomly assign the same sentences
# to an annotator. This isn't easy to fix...

In [None]:
# Create split of data for annotators
annotators = [
    'anton', 'ayush', 'kelly', 'rachel', 'vinayak', 'vishnesh'
]

out = {}
for ann in annotators:
    out[ann] = []

seen = {}
tmp = copy.deepcopy(data)

SENT_PER_ANN = int(len(data)*2 / len(annotators))
if (len(data)*2) % len(annotators):
    print("Sentences are non-divisible I think...")

ann_pointer = 0
while len(tmp) != 0:
    sent = random.choice(tmp)
    if sent in out[annotators[ann_pointer]]:
        print('seen before')

    if sent not in seen:
        seen[sent] = 0
    seen[sent] += 1
    out[annotators[ann_pointer]].append(sent)
    ann_pointer += 1
    if ann_pointer == len(annotators):
        ann_pointer = 0
    if seen[sent] == 2:
        tmp.remove(sent)

# Sanity check: annotator with duplicate sentences, should be all
# empty
import collections
sents = [x for x in out.values()]
for a in sents:
    print([count for item, count in collections.Counter(a).items() if count > 1])

[]
[]
[]
[]
[]
[]


In [None]:
# Final sentence split per annotator
for key in out.keys():
    print(key, len(out[key]))

anton 10
ayush 10
kelly 10
rachel 10
vinayak 10
vishnesh 10


In [None]:
annotator = 'vishnesh'

clip = ""
for sent in out[annotator]:
    clip += f'{sent}\n\n\n\n'

import pyperclip
pyperclip.copy(clip)