### Analyze provided results from paper

In [11]:
import json

# Root directory that the paper's result files are read from.
folder_prefix = '../match/'

# Maps each task/result file stem to the short '<dataset>-gs' name used
# elsewhere in the notebook.  Both strings are derived from the dataset stem.
dataset_names = {
    f'{stem}-sampled-gs_domain-complex-force': f'{stem}-gs'
    for stem in (
        'abt-buy',
        'walmart-amazon',
        'amazon-google',
        'dblp-scholar',
        # 'wdcproducts-80cc-seen' is intentionally left out; its file stem is
        # 'wdcproducts-80cc-seen-sampled-250-gs-2_domain-complex-force'.
        'dblp-acm',
    )
}

def format_output_prod(line):
    """Turn a two-line product prompt into a ``{"Product 1", "Product 2"}`` dict.

    ``line`` must contain exactly one newline (checked with an assert).  From
    each of the two lines the first 12 characters and the final character are
    dropped.
    NOTE(review): presumably each line looks like ``Product N: '<text>'`` so the
    slice removes the label and the surrounding quotes -- confirm against data.
    """
    parts = line.split('\n')
    assert len(parts) == 2
    first, second = parts
    return {"Product 1": first[12:-1], "Product 2": second[12:-1]}

def format_output_pub(line):
    """Turn a two-line publication prompt into a two-key dict.

    ``line`` must contain exactly one newline (checked with an assert).  From
    each of the two lines the first 16 characters and the final character are
    dropped; the results are stored under "Publication 1" / "Publication 2".
    NOTE(review): presumably each line looks like ``Publication N: '<text>'``
    -- confirm against data.
    """
    parts = line.split('\n')
    assert len(parts) == 2
    first, second = parts
    return {"Publication 1": first[16:-1], "Publication 2": second[16:-1]}

In [15]:
# Score the paper's published GPT-4 answers against the sampled test labels.
# Per dataset: bucket every prompt into TP/TN/FP/FN, then derive
# accuracy / precision / recall / F1 keyed by the short dataset name.
true_pos_all, true_neg_all, false_pos_all, false_neg_all = {}, {}, {}, {}
accuracy, f1, precision, recall = {}, {}, {}, {}

for d in dataset_names.keys():
    # Short dataset name: the mapped value without its trailing '-gs'.
    name = dataset_names[d][:-3]

    # read in data
    with open(f'../libem-sample-data/{name}/test.ndjson', 'r') as f:
        dataset = [json.loads(line) for line in f]
    with open(f'{folder_prefix}LLMForEM/prompt-answer-combined/prompts_and_answers/{d}_default_gpt-4-0613_run-1.jsonl', 'r') as f:
        results = [json.loads(line) for line in f]

    # process results
    true_pos, true_neg, false_pos, false_neg, bad_format = [], [], [], [], []
    correct = 0

    for i in range(len(results)):
        # The raw prompt is what gets stored for each example.  Results are
        # paired with dataset rows purely by position; no prompt/record
        # cross-check is performed here.
        data = results[i]['prompt']

        if results[i]['answer'] == 'Yes':
            if (dataset[i]['label'] == 1):
                true_pos.append(data)
                correct += 1
            else:
                false_pos.append(data)
        elif results[i]['answer'] == 'No':
            if (dataset[i]['label'] == 0):
                true_neg.append(data)
                correct += 1
            else:
                false_neg.append(data)
        else:
            # Anything other than an exact 'Yes' / 'No' is collected and
            # rejected by the assert below.
            bad_format.append(results[i])

    assert(len(bad_format) == 0)

    true_pos_all[name], true_neg_all[name], false_pos_all[name], false_neg_all[name] = true_pos, true_neg, false_pos, false_neg
    # Accuracy is the fraction of scored results that were correct.  The
    # denominator must be the number of results, not len(data): `data` is the
    # last prompt string, so its length is a character count.
    accuracy[name], precision[name], recall[name] = correct/len(results), len(true_pos) / (len(true_pos) + len(false_pos)), len(true_pos) / (len(true_pos) + len(false_neg))
    f1[name] = 2 * len(true_pos) / (2 * len(true_pos) + len(false_pos) + len(false_neg))

In [None]:
# Write each confusion-matrix bucket and the F1 scores to
# ./paper-results/<name>.json (the directory must already exist).
for stem, payload in (
    ('true_pos', true_pos_all),
    ('true_neg', true_neg_all),
    ('false_pos', false_pos_all),
    ('false_neg', false_neg_all),
    ('f1', f1),
):
    with open(f'./paper-results/{stem}.json', 'w') as f:
        json.dump(payload, f, indent=4)

In [16]:
import pandas as pd

# Per-dataset table of precision / recall / F1, shown as percentages.
df = pd.DataFrame()
for column, scores in (('Precision', precision), ('Recall', recall), ('F1', f1)):
    # Scale the 0-1 fractions to percentages as each column is added.
    df[column] = pd.Series(scores) * 100
df.round(1)

Unnamed: 0,Precision,Recall,F1
abt-buy,95.1,95.1,95.1
walmart-amazon,84.3,94.3,89.0
amazon-google,63.8,92.7,75.6
dblp-scholar,89.7,87.2,88.4
dblp-acm,94.0,100.0,96.9


### Analyze reproduced paper (no schema) results

In [None]:
import json
# Directory holding the reproduced (no-schema) results.
folder = 'no-schema-gpt4-turbo'

# Maps each task/result file stem to its short '<dataset>-gs' name; both
# strings are derived from the dataset stem.
dataset_names = {
    f'{stem}-sampled-gs_domain-complex-force': f'{stem}-gs'
    for stem in (
        'abt-buy',
        'walmart-amazon',
        'amazon-google',
        'dblp-scholar',
        # 'wdcproducts-80cc-seen' is intentionally left out; its file stem is
        # 'wdcproducts-80cc-seen-sampled-250-gs-2_domain-complex-force'.
        'dblp-acm',
    )
}

In [None]:
def format_output_prod(line):
    """Split a two-line product prompt into a dict keyed "Product 1"/"Product 2".

    Asserts that ``line`` contains exactly two newline-separated parts, then
    trims the first 12 characters and the last character from each part.
    NOTE(review): the offsets presumably strip a ``Product N: '`` label and the
    closing quote -- confirm against the task files.
    """
    parts = line.split('\n')
    assert len(parts) == 2
    return {
        f"Product {number}": text[12:-1]
        for number, text in enumerate(parts, start=1)
    }

def format_output_pub(line):
    """Split a two-line publication prompt into a two-key dict.

    Asserts that ``line`` contains exactly two newline-separated parts, then
    trims the first 16 characters and the last character from each part and
    stores them under "Publication 1" / "Publication 2".
    NOTE(review): the offsets presumably strip a ``Publication N: '`` label and
    the closing quote -- confirm against the task files.
    """
    parts = line.split('\n')
    assert len(parts) == 2
    return {
        f"Publication {number}": text[16:-1]
        for number, text in enumerate(parts, start=1)
    }

In [None]:
# Score the reproduced (no-schema) runs.  Per dataset: bucket every example
# into TP/TN/FP/FN, then derive accuracy / precision / recall / F1.
true_pos_all, true_neg_all, false_pos_all, false_neg_all = {}, {}, {}, {}
accuracy, f1, precision, recall = {}, {}, {}, {}

for d in dataset_names.keys():
    # dblp-acm is scored from the 'Match' / 'Label' fields of its own result
    # records; every other dataset is scored against the LLMForEM task file.
    # NOTE(review): presumably no task file exists for dblp-acm -- confirm.
    is_dblp_acm = d == 'dblp-acm-sampled-gs_domain-complex-force'

    # read in results
    if not is_dblp_acm:
        with open(f'./LLMForEM/tasks/{d}.json', 'r') as f:
            dataset = json.load(f)['examples']
    with open(f"./{folder}/{d}_results.json", "r") as f:
        results = json.load(f)

    # process results
    true_pos, true_neg, false_pos, false_neg = [], [], [], []
    correct = 0

    for i in range(len(results)):
        if not is_dblp_acm:
            # make sure prompt matches and format output
            # (the slice offsets skip the instruction text that precedes the
            # example input in the stored prompt)
            if (d == 'dblp-scholar-sampled-gs_domain-complex-force'):
                assert(results[i]['prompt'][120:] == dataset[i]['input'])
                data = format_output_pub(dataset[i]['input'])
            else:
                assert(results[i]['prompt'][124:] == dataset[i]['input'])
                data = format_output_prod(dataset[i]['input'])

            if (results[i]['answer'][0:3] == 'Yes'):
                if (dataset[i]['target_scores']['Yes'] == 1):
                    true_pos.append(data)
                    correct += 1
                else:
                    false_pos.append(data)
            else:
                if (dataset[i]['target_scores']['No'] == 1):
                    true_neg.append(data)
                    correct += 1
                else:
                    false_neg.append(data)
        else:
            # Store the result record itself.  Previously `data` was never
            # assigned in this branch, so every bucket entry was a stale value
            # left over from the previous dataset.
            data = results[i]

            # NOTE(review): this branch matches a lowercase 'yes' while the
            # branch above matches 'Yes' -- presumably a different result
            # format; confirm against the dblp-acm results file.
            if results[i]['Match'][0:3] == 'yes':
                if results[i]['Label'] == 1:
                    true_pos.append(data)
                    correct += 1
                else:
                    false_pos.append(data)
            else:
                if results[i]['Label'] == 0:
                    true_neg.append(data)
                    correct += 1
                else:
                    false_neg.append(data)

    name = dataset_names[d]

    true_pos_all[name], true_neg_all[name], false_pos_all[name], false_neg_all[name] = true_pos, true_neg, false_pos, false_neg
    # Accuracy is the fraction of scored results that were correct.  The
    # denominator must be the number of results, not len(data): `data` is a
    # single example (a two-key dict), so its length is always 2.
    accuracy[name], precision[name], recall[name] = correct/len(results), len(true_pos) / (len(true_pos) + len(false_pos)), len(true_pos) / (len(true_pos) + len(false_neg))
    f1[name] = 2 * len(true_pos) / (2 * len(true_pos) + len(false_pos) + len(false_neg))

In [None]:
# Write each confusion-matrix bucket and the F1 scores to
# ./<folder>/<name>.json.  Accuracy is deliberately not written out
# (the original accuracy.json dump was disabled).
for stem, payload in (
    ('true_pos', true_pos_all),
    ('true_neg', true_neg_all),
    ('false_pos', false_pos_all),
    ('false_neg', false_neg_all),
    ('f1', f1),
):
    with open(f'./{folder}/{stem}.json', 'w') as f:
        json.dump(payload, f, indent=4)

In [None]:
import pandas as pd

# Per-dataset table of precision / recall / F1, shown as percentages.
df = pd.DataFrame()
for column, scores in (('Precision', precision), ('Recall', recall), ('F1', f1)):
    # Scale the 0-1 fractions to percentages as each column is added.
    df[column] = pd.Series(scores) * 100
df.round(2)

In [None]:
import numpy as np
# Average F1 (in percent) across all datasets in the table above.
df['F1'].mean()

### Analyze with schema results, sampled (test) set only

In [66]:
import json
import numpy as np

# Directory holding the with-schema results.
folder = '../match/with-schema-gpt4-turbo'

# Maps each task file stem to its short '<dataset>-gs' name; both strings are
# derived from the dataset stem.
dataset_names = {
    f'{stem}-sampled-gs_domain-complex-force': f'{stem}-gs'
    for stem in (
        'abt-buy',
        'walmart-amazon',
        'amazon-google',
        'dblp-scholar',
        # 'wdcproducts-80cc-seen' is intentionally left out; its file stem is
        # 'wdcproducts-80cc-seen-sampled-250-gs-2_domain-complex-force'.
        'dblp-acm',
    )
}

# Score the with-schema runs, restricted to pairs that appear in the sampled
# test set.  Per dataset: bucket every kept record into TP/TN/FP/FN, then
# derive accuracy / precision / recall / F1.
true_pos_all, true_neg_all, false_pos_all, false_neg_all = {}, {}, {}, {}
accuracy, precision, recall, f1 = {}, {}, {}, {}

for d_k in dataset_names.keys():
    name = dataset_names[d_k]

    with open(f'../libem-sample-data/{name[:-3]}/test.ndjson', 'r') as fi:
        in_set = [json.loads(line)['pair_id'] for line in fi]
    # Set copy for constant-time membership tests in the loop below; the list
    # is kept because its length is what the sanity check compares against.
    in_set_lookup = set(in_set)

    # read in results
    with open(f"{folder}/{name}_results.json", "r") as f:
        data = json.load(f)

    # process results (the file is already closed; only `data` is needed)
    true_pos, true_neg, false_pos, false_neg = [], [], [], []
    correct, pos, neg = 0, 0, 0

    for d in data:
        # find if in sampled data
        prod1 = d['product 1']
        prod2 = d['product 2']

        # Rebuild the '<id1>#<id2>' pair id from the text before the first
        # '","' in each record.
        # NOTE(review): the start offsets differ (7 vs 6) -- presumably the
        # two records carry different-length prefixes; confirm against data.
        id_pair = prod1[7: prod1.find('","')] + '#' + prod2[6: prod2.find('","')]
        if id_pair not in in_set_lookup:
            continue

        output = {'product 1': d['product 1'], 'product 2': d['product 2']}
        # Labels in this results file are compared as the strings '1' / '0'.
        if d['response'][0:3] == 'Yes':
            if d['label'] == '1':
                true_pos.append(output)
                correct += 1
                pos += 1
            else:
                false_pos.append(output)
                neg += 1
        else:
            if d['label'] == '0':
                true_neg.append(output)
                correct += 1
                neg += 1
            else:
                false_neg.append(output)
                pos += 1

    # make sure filtered set has same length as test set
    assert(len(in_set) == len(true_pos) + len(true_neg) + len(false_pos) + len(false_neg))

    true_pos_all[name], true_neg_all[name], false_pos_all[name], false_neg_all[name] = true_pos, true_neg, false_pos, false_neg
    accuracy[name], precision[name], recall[name] = correct/(pos + neg), len(true_pos) / (len(true_pos) + len(false_pos)), len(true_pos) / (len(true_pos) + len(false_neg))
    f1[name] =  2 * len(true_pos) / (2 * len(true_pos) + len(false_pos) + len(false_neg))

In [None]:
# Write each confusion-matrix bucket and the F1 scores to
# ./<folder>/<name>.json.  Accuracy is deliberately not written out
# (the original accuracy.json dump was disabled).
for stem, payload in (
    ('true_pos', true_pos_all),
    ('true_neg', true_neg_all),
    ('false_pos', false_pos_all),
    ('false_neg', false_neg_all),
    ('f1', f1),
):
    with open(f'./{folder}/{stem}.json', 'w') as f:
        json.dump(payload, f, indent=4)

In [67]:
import pandas as pd

# Per-dataset table of precision / recall / F1, shown as percentages.
df = pd.DataFrame()
for column, scores in (('Precision', precision), ('Recall', recall), ('F1', f1)):
    # Scale the 0-1 fractions to percentages as each column is added.
    df[column] = pd.Series(scores) * 100
df.round(1)

Unnamed: 0,Precision,Recall,F1
abt-buy-gs,96.6,96.1,96.4
walmart-amazon-gs,78.7,95.9,86.4
amazon-google-gs,71.1,98.7,82.6
dblp-scholar-gs,90.6,96.0,93.2
dblp-acm-gs,96.5,99.6,98.0


### Analyze libem run results

In [106]:
import json

# run label -> path of the libem benchmark results file to score
files = {
    # 'Without Learning': '../libem/benchmark/results/???.json',
    'With Learning': '../libem/benchmark/results/???.json',
    }

true_pos_all, true_neg_all, false_pos_all, false_neg_all = {}, {}, {}, {}
accuracy, precision, recall, f1 = {}, {}, {}, {}

for name, file in files.items():

    # read in results
    with open(file, "r") as f:
        data = json.load(f)

    # Sort every record into one confusion-matrix bucket.  A prediction counts
    # as a match when it starts with 'yes'; any other prediction is a non-match.
    true_pos, true_neg, false_pos, false_neg = [], [], [], []

    for d in data:
        said_yes = d['pred'][0:3] == 'yes'
        if said_yes and d['label'] == 1:
            true_pos.append(d)
        elif said_yes:
            false_pos.append(d)
        elif d['label'] == 0:
            true_neg.append(d)
        else:
            false_neg.append(d)

    # Counters derived from the bucket sizes: correct = TP + TN,
    # pos = TP + FN, neg = FP + TN.
    correct = len(true_pos) + len(true_neg)
    pos = len(true_pos) + len(false_neg)
    neg = len(false_pos) + len(true_neg)

    true_pos_all[name], true_neg_all[name], false_pos_all[name], false_neg_all[name] = true_pos, true_neg, false_pos, false_neg
    accuracy[name], precision[name], recall[name] = (
        correct / len(data),
        len(true_pos) / (len(true_pos) + len(false_pos)),
        len(true_pos) / (len(true_pos) + len(false_neg)),
    )
    f1[name] = 2 * len(true_pos) / (2 * len(true_pos) + len(false_pos) + len(false_neg))

In [113]:
import pandas as pd

# Per-run table of precision / recall / F1, shown as percentages.
df = pd.DataFrame()
for column, scores in (('Precision', precision), ('Recall', recall), ('F1', f1)):
    # Scale the 0-1 fractions to percentages as each column is added.
    df[column] = pd.Series(scores) * 100
df.round(1)

Unnamed: 0,Precision,Recall,F1
With Learning,78.9,73.5,76.1


In [16]:
# Output directory for the libem run.
# NOTE(review): '???' looks like a placeholder -- set it to a real directory
# before running this cell.
folder = '???'

# Write each confusion-matrix bucket and the F1 scores to
# ./<folder>/<name>.json.  Accuracy is deliberately not written out
# (the original accuracy.json dump was disabled).
for stem, payload in (
    ('true_pos', true_pos_all),
    ('true_neg', true_neg_all),
    ('false_pos', false_pos_all),
    ('false_neg', false_neg_all),
    ('f1', f1),
):
    with open(f'./{folder}/{stem}.json', 'w') as f:
        json.dump(payload, f, indent=4)