In [53]:
def edit_dist(s1, s2):
    """Return the Levenshtein edit distance between two sequences.

    Only one row of the dynamic-programming table is kept at a time; the
    shorter sequence is used for the row so it stays as small as possible.

    Args:
        s1: First sequence (anything supporting ``len`` and iteration).
        s2: Second sequence.

    Returns:
        The minimum number of single-element insertions, deletions and
        substitutions needed to turn one sequence into the other.
    """
    # Make s1 the shorter of the two; the distance is symmetric.
    if len(s2) < len(s1):
        s1, s2 = s2, s1

    prev_row = list(range(len(s1) + 1))
    for row_num, ch_long in enumerate(s2, start=1):
        cur_row = [row_num]
        for col, ch_short in enumerate(s1):
            if ch_short == ch_long:
                # Matching elements cost nothing: carry the diagonal value over.
                cost = prev_row[col]
            else:
                # Cheapest of substitution, and the two insert/delete moves.
                cost = 1 + min(prev_row[col], prev_row[col + 1], cur_row[-1])
            cur_row.append(cost)
        prev_row = cur_row
    return prev_row[-1]

def avg(lst, prec=2):
    """Return the arithmetic mean of ``lst`` rounded to ``prec`` decimals.

    Args:
        lst: Sized collection of numbers.
        prec: Number of decimal places passed to ``round``.

    Returns:
        The rounded mean, or ``0`` when ``lst`` is empty.
    """
    count = len(lst)
    if not count:
        return 0
    total = sum(lst)
    return round(total / count, prec)

### Cross-system Comparison

In [115]:
import json

# Maps each demonstration-source key to the short label that appears in the
# few-shot output filenames (few-shot-batch-2-<label>.json).
human_systems = dict([
    ('systems/asset.test.simp', 'asset'),
    ('new_systems/turk_corpus_random.txt', 'turk-corpus'),
    ('new_systems/simple_wiki.txt', 'simple-wiki'),
    ('new_systems/our_human_written', 'our-data'),
])

# Directory the cross-dataset few-shot output files are read from.
path = 'gpt-outputs/cross-dataset-demonstrations'

In [116]:
# Load the few-shot outputs for every system, keyed by the system key.
data = {}
for system, label in human_systems.items():
    # JSON is UTF-8 by specification; pass the encoding explicitly so the
    # result does not depend on the platform's default locale encoding.
    with open(f'{path}/few-shot-batch-2-{label}.json', encoding='utf-8') as f:
        data[system] = json.load(f)

In [117]:
from query import load_batch
# First column of batch-2.csv: the original sentences.
orig = [str(row[0]) for row in load_batch('batch-2.csv')]
# Second column of the human batch file, used below as the gold reference.
gold = [str(row[1]) for row in load_batch('ourdata/batch-2-human.csv')]
# Register the gold reference as an extra "system" so it is scored too.
human_systems['gold/our_human_written'] = 'gold-standard'

In [118]:
# Add the zero-shot outputs as one more system to compare.
# The path has no placeholders, so a plain string is used; the encoding is
# explicit so reading does not depend on the platform default.
with open('gpt-outputs/zero-shot-batch-2.json', encoding='utf-8') as f:
    data['systems/zero-shot'] = json.load(f)
human_systems['systems/zero-shot'] = 'zero-shot'

In [119]:
# Build one dict per sentence index holding every system's output plus the
# gold simplification and the original sentence.
interwoven = []
# Only go as far as the shortest list of system outputs.
shortest = min(len(outputs) for outputs in data.values())
for i in range(shortest):
    entry = {name: outputs[i] for name, outputs in data.items()}
    entry['gold/our_human_written'] = gold[i]
    entry['original'] = orig[i]
    interwoven.append(entry)

In [123]:
def _edit_dists_to(reference_key, sentences, systems):
    """Edit distance from each system's text to a reference field.

    Args:
        reference_key: Key of the per-sentence dict to compare against
            (e.g. ``'original'``).
        sentences: List of dicts, each holding the reference text and one
            text per system key.
        systems: Iterable of system keys to score.

    Returns:
        Dict mapping each system key to its list of edit distances, one per
        sentence, in sentence order.
    """
    return {
        system: [edit_dist(sent[reference_key], sent[system]) for sent in sentences]
        for system in systems
    }


# Same report twice, differing only in which field is the reference.
edit_dists = _edit_dists_to('original', interwoven, human_systems)
average_edit_dist = {k: avg(v) for k, v in edit_dists.items()}
print("distance from original sentence:")
print(average_edit_dist)

edit_dists = _edit_dists_to('gold/our_human_written', interwoven, human_systems)
average_edit_dist = {k: avg(v) for k, v in edit_dists.items()}
print("distance from human written simplification:")
print(average_edit_dist)

distance from original sentence:
{'systems/asset.test.simp': 97.67, 'new_systems/turk_corpus_random.txt': 86.5, 'new_systems/simple_wiki.txt': 96.5, 'new_systems/our_human_written': 97.61, 'gold/our_human_written': 104.89, 'systems/zero-shot': 107.33}
distance from human written simplification:
{'systems/asset.test.simp': 115.33, 'new_systems/turk_corpus_random.txt': 112.06, 'new_systems/simple_wiki.txt': 117.78, 'new_systems/our_human_written': 114.39, 'gold/our_human_written': 0.0, 'systems/zero-shot': 123.72}


In [121]:
# Show every field of the first sentence entry, blank line between fields.
sent = interwoven[0]
for system, text in sent.items():
    print(system)
    print(text, end='\n\n')

systems/asset.test.simp
The club announced on social media that customers managed to restrain the attacker. They also said that they were 'shocked and saddened' by the attack and sent their condolences to the victims and their families.

new_systems/turk_corpus_random.txt
The club said on social media that customers stopped the gunman. It expressed that it was "devastated by the senseless attack on our community" and offered condolences to the victims and their families.

new_systems/simple_wiki.txt
The club announced on social media that customers stopped the shooter. They expressed that they were "heartbroken by the senseless attack on our community" and sent their condolences to the victims and their loved ones.

new_systems/our_human_written
The club reported on social media that customers prevented the gunman. They said they were “devastated” by the attack and offered their condolences to the victims and their families.

systems/zero-shot
The club announced on social media that cu

### `text-davinci-002` vs `text-davinci-003`

In [2]:
import json
# JSON is UTF-8 by specification; pass the encoding explicitly so reading
# does not depend on the platform's default locale encoding.
with open('gpt-outputs/few-shot-batch-1.json', encoding='utf-8') as f:
    data = json.load(f)

In [4]:
# Allows interactive plotting
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import matplotlib.pyplot as plt, random

# This is my first time using iPy, so this is a bit clunky...
def series(i):
    """Print item ``i`` of every value in the global ``data``.

    Each item is printed followed by an empty line.

    Args:
        i: Index applied to each value of ``data``.

    Returns:
        An empty tuple.
    """
    picked = [outputs[i] for outputs in data.values()]
    for text in picked:
        print(text + "\n")
    return ()

# Slider that selects the index handed to `series`.
# NOTE(review): the upper bound is the number of top-level entries in `data`,
# while `series` indexes into each *value* of `data` - confirm the two agree.
sentence_slider = widgets.IntSlider(
    value=0,
    min=0,
    max=len(data) - 1,
    step=1,
    description='Sentence:',
    orientation='horizontal',
)
interact(series, i=sentence_slider)

# data=[data], 
# good_deletion=(-20,20,0.5), 
# good_trivial_insertion=(-20,20,0.5), 
# good_insertion=(-20,20,0.5), 
# good_paraphrase=(-20,20,0.5), 
# good_syntax=(-20,20,0.5), 
# grammar_error=(-20,20,0.5), 
# content_error=(-20,20,0.5),
# size_calculation=['linear', 'log', 'square', 'none'],
# average=[True, False],
# user=['all'] + sorted(list(set([sent['user'] for sent in data]))),
# display_distribution=[True, False]

interactive(children=(IntSlider(value=0, description='Sentence:', max=29), Output()), _dom_classes=('widget-in…

<function __main__.series(i)>

In [20]:
import json
import csv
import os

In [59]:
def load_different_models(batch_num_range):
    """Pair the outputs of two models, sentence by sentence, for each batch.

    For every batch number ``n`` this looks in the current working directory
    for few-shot JSON files (names containing both ``'json'`` and ``'few'``)
    whose name contains ``batch-<n>``. The first such file whose name contains
    ``'davinci-002'`` is stored under ``'text-davinci-002'``; the first one
    whose name does not is stored under ``'text-davinci-003'``. The original
    sentences come from the first column of ``batch-<n>.csv`` (header row
    skipped).

    Args:
        batch_num_range: Iterable of batch numbers to load.

    Returns:
        A list with one dict per original sentence, with keys ``'batch'``,
        ``'original'``, ``'text-davinci-002'``, ``'text-davinci-003'``,
        ``'models-edit-dist'`` (distance between the two model outputs),
        ``'orig-edit-dist'`` (distance from the original to the
        ``'text-davinci-003'`` output) and ``'models-length-change'``
        (length of the 002 output minus length of the 003 output).

    Raises:
        IndexError: If a batch has no matching JSON file for one of the
            models, or a model output list is shorter than the CSV.
    """
    cwd = os.getcwd()
    # The listing does not depend on the batch, so read it once. Sorting makes
    # the "first match" deterministic (os.listdir order is arbitrary).
    few_shot_names = sorted(
        name for name in os.listdir(cwd) if 'json' in name and 'few' in name
    )

    out = []
    for batch_num in batch_num_range:
        batch_tag = f'batch-{batch_num}'
        # Match on the bare filename so directory names in the working
        # directory path cannot influence which files are selected.
        batch_names = [name for name in few_shot_names if batch_tag in name]
        d2 = [name for name in batch_names if 'davinci-002' in name]
        d3 = [name for name in batch_names if 'davinci-002' not in name]
        if not d2:
            raise IndexError(
                f"no few-shot 'davinci-002' JSON file for {batch_tag} in {cwd}")
        if not d3:
            raise IndexError(
                f"no few-shot non-'davinci-002' JSON file for {batch_tag} in {cwd}")

        # os.path.join keeps the path valid on every platform; explicit UTF-8
        # matches how the CSV below is read.
        with open(os.path.join(cwd, d2[0]), encoding='utf-8') as f:
            data_d2 = json.load(f)

        with open(os.path.join(cwd, d3[0]), encoding='utf-8') as f:
            data_d3 = json.load(f)

        # Load the batch CSV; only the first column (the sentence) is used.
        with open(f'{batch_tag}.csv', encoding='utf-8') as f:
            reader = csv.reader(f)
            next(reader)  # skip the header row
            orig = [str(row[0]) for row in reader]

        for i, original in enumerate(orig):
            out.append({
                'batch': batch_num,
                'original': original,
                'text-davinci-002': data_d2[i],
                'text-davinci-003': data_d3[i],
                'models-edit-dist': edit_dist(data_d2[i], data_d3[i]),
                'orig-edit-dist': edit_dist(original, data_d3[i]),
                'models-length-change': len(data_d2[i]) - len(data_d3[i]),
            })

    return out

In [60]:
# Rebind `data` to the per-sentence comparison records for batches 1 and 2.
data = load_different_models(batch_num_range=[1, 2])

In [62]:
# Mean of len(002 output) - len(003 output); positive means 002 is longer.
avg([record['models-length-change'] for record in data])

15.92

In [63]:
# Mean edit distance between the two models' outputs for the same sentence.
avg([record['models-edit-dist'] for record in data])

106.4

In [64]:
# Mean edit distance from the original sentence to the 'text-davinci-003' output.
avg([record['orig-edit-dist'] for record in data])

113.65