In [34]:
from sentence_transformers import SentenceTransformer
from scipy import spatial
from bs4 import BeautifulSoup
import re
import json
import numpy as np

In [2]:
embedder = SentenceTransformer('bert-base-nli-mean-tokens')

100%|██████████| 405M/405M [00:29<00:00, 13.7MB/s]


In [42]:
def get_embeddings(events, embedder):
    # Corpus with sentence embeddings
    corpus_embeddings = embedder.encode(events)
    return corpus_embeddings

def calc_centriod(events, embedder):
    emds = get_embeddings(events, embedder)
    return (emds, np.array(np.mean(emds, axis=0)))

def calc_min(events, centriod):
    sim = []
    for e in events:
        sim.append((1 - spatial.distance.cosine(e, centriod)))
    return min(sim)

def event_sim(events):
    sim = []
    if len(events)<2:
        return 0
    for i in range(len(events)):
        for j in range(i+1,len(events)):
            sim.append((1 - spatial.distance.cosine(events[i], events[j])))
    return min(sim)    

In [5]:
with open('paraphrase.json') as f:
    scenarios = json.load(f)

In [45]:
scenario_emd = {}
for scene in scenarios:
    scenario_emd[scene] = {}
    for label in scenarios[scene]:
        scenario_emd[scene][label] = {}
        emds, centriod = calc_centriod(scenarios[scene][label], embedder)
        scenario_emd[scene][label]["mean"] = centriod
        scenario_emd[scene][label]["min"] = calc_min(emds, centriod)

In [57]:
def relevance_metric(in_path, scenario_emd):
    with open(in_path) as f:
        lines = f.readlines()
        precision = []
        for scenario in lines:
            splitted = scenario.split(":")
            scenario = splitted[1].rstrip(' <EOS>')
            script = splitted[0].strip().replace("<BOS> here is a sequence of events that happen while ","")
            new_scenario = script + ": "
            scenario = re.sub(r'\d+[.]', '</bevent> <bevent>', scenario)
            scenario = scenario + '</bevent>'
            scenario = scenario.strip().lstrip('</bevent>')
            soup = BeautifulSoup(scenario)
            events = []
            for a in soup.find_all('bevent'):
                events.append(a.string.strip())
            
            correct = 0 
            events_emd = get_embeddings(events, embedder)
            max_label = ""
            max_sim = 0
            for i in range(len(events)):
                for label in scenario_emd[script]:
                    sim = 1 - spatial.distance.cosine(events_emd[i], scenario_emd[script][label]["mean"])
                    if sim > max_sim:
                        max_sim = sim
                        max_label = label
                if max_sim >= scenario_emd[script][max_label]["min"]:
                    #print(script, max_label, events[i])
                    correct+=1
                else:
                    pass#print(script, max_label, events[i])
            precision.append(correct*1.0/len(events))
        return precision

In [56]:
precision = relevance_metric("./outputs/generated_valid_basic_large_g16_epoch1.txt", scenario_emd)

In [55]:
np.mean(precision), np.std(precision)

(0.8172727272727272, 0.29961798450546734)