In [2]:
import os
import numpy as np
import random
import re
import sacrebleu
from bs4 import BeautifulSoup
import editdistance
import sys
from itertools import combinations

# Remove Irrelevant and Deduplicate

In [54]:
def read_file(file_path):
    """Load one integer per line from a text file.

    Args:
        file_path: path of a text file in which every line holds one integer
            (surrounding whitespace is ignored).

    Returns:
        list: the parsed integers, in file order.

    Raises:
        ValueError: if any line, blank lines included, is not an integer.
    """
    with open(file_path) as handle:
        values = [int(raw.strip()) for raw in handle.readlines()]
    return values

def detect_repeated(events):
    """Return the indices of events that exactly repeat an earlier event.

    The first occurrence of every distinct event is kept; each later
    occurrence is reported.  An edit distance of zero is the same thing as
    equality, so a single pass with a set of already-seen events replaces
    the former all-pairs edit-distance comparison.

    Args:
        events: sequence of event strings (any hashable values work).

    Returns:
        list: indices of the repeated events, in ascending order.  Empty
        when ``events`` is empty or has no duplicates.
    """
    seen = set()
    repeated_idx = []
    for idx, event in enumerate(events):
        if event in seen:
            repeated_idx.append(idx)
        else:
            seen.add(event)
    return repeated_idx
        
        
def read_output(in_path, out_path, predictions, repeated=False):
    """Filter the events of every generated scenario and write the result.

    Every line of ``in_path`` is expected to look like ``<prompt>: <events>``
    where the events are either numbered (``1. ... 2. ...``) or wrapped in
    ``<BEVENT> ... <EEVENT>`` markers.  The surviving events are renumbered
    from 1 and written to ``out_path`` as ``<prompt>: 1. ... 2. ... ``, one
    line per input line.

    Args:
        in_path: file with one generated scenario per line.
        out_path: destination file (overwritten).
        predictions: flat sequence of per-event labels, consumed in file
            order, one label per parsed event.  An event is kept when its
            label equals 1.  Only read when ``repeated`` is False.
        repeated: when True, ignore ``predictions`` and instead drop every
            event that exactly repeats an earlier event of the same line.

    Raises:
        IndexError: if a line has no ``": "`` separator, or ``predictions``
            holds fewer labels than there are events.
        AttributeError: if a parsed event has no single text content
            (``Tag.string`` is None), e.g. because it contains nested markup.
    """
    eos = '<EOS>'
    closing_tag = '</bevent>'
    with open(in_path) as f, open(out_path, 'w') as o:
        i = 0  # cursor into ``predictions``; it keeps running across lines
        for line in f:
            # Split on the first ": " only, so a colon inside an event does
            # not cut the scenario short.
            splitted = line.split(": ", 1)
            new_scenario = splitted[0].strip() + ": "

            # Remove the end marker as a suffix.  ``rstrip(' <EOS>')`` strips
            # a *set of characters* and would also eat trailing 'E', 'O' or
            # 'S' letters of the last event.
            scenario = splitted[1].rstrip()
            if scenario.endswith(eos):
                scenario = scenario[:-len(eos)].rstrip()

            if scenario.startswith('<BEVENT>'):
                scenario = re.sub(r'<EEVENT>', '</bevent>', scenario)
            else:
                # Turn "1. a 2. b" into "</bevent> <bevent> a </bevent> <bevent> b </bevent>"
                scenario = re.sub(r'\d+[.]', '</bevent> <bevent>', scenario)
                scenario = (scenario.strip() + ' </bevent>').strip()
                # Drop the stray closing tag produced by the first number
                # (prefix removal, not character-set stripping).
                if scenario.startswith(closing_tag):
                    scenario = scenario[len(closing_tag):]

            # Name the parser explicitly: the default depends on which
            # parsers happen to be installed and triggers a warning.
            soup = BeautifulSoup(scenario, 'html.parser')
            events = [a.string.strip() for a in soup.find_all('bevent')]

            if repeated:
                dropped = set(detect_repeated(events))
                kept = [e for idx, e in enumerate(events) if idx not in dropped]
            else:
                kept = []
                for e in events:
                    if predictions[i] == 1:
                        kept.append(e)
                    i += 1

            for j, e in enumerate(kept, start=1):
                new_scenario += str(j) + ". " + e.strip() + " "
            o.write("{}\n".format(new_scenario))

In [96]:
# Choose the prompt variant and load its validation-split labels
# (one integer per line, parsed by read_file).
prompt = 'ordered'
predictions = read_file(
    './roberta-relevant-classification-e10/valid_{}_predictions.txt'.format(prompt)
)

In [73]:
# Remove irrelevant events: keep only the events whose prediction equals 1.
read_output(
    './outputs/generated_valid_{}_large_g16_epoch1.txt'.format(prompt),
    './outputs/generated_valid_{}_large_g16_epoch1_removed.txt'.format(prompt),
    predictions,
)

In [74]:
# Deduplicate after removing irrelevant events (predictions are not used
# when repeated=True).
read_output(
    './outputs/generated_valid_{}_large_g16_epoch1_removed.txt'.format(prompt),
    './outputs/generated_valid_{}_large_g16_epoch1_removed_deduplicated.txt'.format(prompt),
    predictions,
    repeated=True,
)

# test generated data preparation

In [97]:
# Same two-step clean-up for the test split; relies on `prompt` being set
# by an earlier cell.
predictions = read_file(
    './roberta-relevant-classification-e10/test_{}_predictions.txt'.format(prompt)
)
# Step 1: remove irrelevant events.
read_output(
    './outputs/generated_test_{}_large_g16_epoch1.txt'.format(prompt),
    './outputs/generated_test_{}_large_g16_epoch1_removed.txt'.format(prompt),
    predictions,
)
# Step 2: deduplicate after removing irrelevant events.
read_output(
    './outputs/generated_test_{}_large_g16_epoch1_removed.txt'.format(prompt),
    './outputs/generated_test_{}_large_g16_epoch1_removed_deduplicated.txt'.format(prompt),
    predictions,
    repeated=True,
)

# test fold generated data preparation

In [61]:
# Run the remove-irrelevant / deduplicate pipeline for every prompt variant
# and every fold (1..8).  Note: this rebinds the notebook-level `prompt`
# and `predictions`.
for prompt in ['basic', 'ordered', 'direct', 'describe', 'expect', 'tokens', 'all_tokens']:
    for fold in range(1, 9):
        predictions = read_file(
            './folds/roberta-relevant-classification-fold{0}-e5/{1}_fold{0}_predictions.txt'.format(fold, prompt)
        )
        # Step 1: remove irrelevant events.
        read_output(
            './outputs/folds/generated_test_{1}_large_fold{0}_g16_epoch1.txt'.format(fold, prompt),
            './outputs/folds/generated_test_{1}_large_fold{0}_g16_epoch1_removed.txt'.format(fold, prompt),
            predictions,
        )
        # Step 2: deduplicate after removing irrelevant events.
        read_output(
            './outputs/folds/generated_test_{1}_large_fold{0}_g16_epoch1_removed.txt'.format(fold, prompt),
            './outputs/folds/generated_test_{1}_large_fold{0}_g16_epoch1_removed_deduplicated.txt'.format(fold, prompt),
            predictions,
            repeated=True,
        )

# Novel scenarios

In [64]:
# Novel scenarios, fold 1: load the labels, then clean the generations.
predictions = read_file(
    './folds/roberta-relevant-classification-fold1-e5/new_scenarios_fold1_predictions.txt'
)
# Step 1: remove irrelevant events.
read_output(
    in_path='./outputs/folds/generated_new_scenarios_basic_large_fold1_g16_epoch1.txt',
    out_path='./outputs/folds/generated_new_scenarios_basic_large_fold1_g16_epoch1_removed.txt',
    predictions=predictions,
)
# Step 2: deduplicate after removing irrelevant events.
read_output(
    in_path='./outputs/folds/generated_new_scenarios_basic_large_fold1_g16_epoch1_removed.txt',
    out_path='./outputs/folds/generated_new_scenarios_basic_large_fold1_g16_epoch1_removed_deduplicated.txt',
    predictions=predictions,
    repeated=True,
)

# Search for cycles and Topological sort

In [5]:
#Python program to print topological sorting of a DAG 
from collections import defaultdict 
  
#Class to represent a graph 
class Graph:
    """Directed graph over the integer vertices ``0 .. vertices - 1``.

    Edges are stored as adjacency lists in a ``defaultdict``.  Because of
    that, merely looking a vertex up during a traversal creates an empty
    adjacency list for it.
    """

    def __init__(self, vertices):
        self.V = vertices               # number of vertices
        self.graph = defaultdict(list)  # vertex -> list of successors

    def addEdge(self, u, v):
        """Add the directed edge ``u -> v``."""
        self.graph[u].append(v)

    def topologicalSortUtil(self, v, visited, stack):
        """Depth-first helper of ``topologicalSort``.

        Marks ``v`` in ``visited``, recurses into every successor that has
        not been visited yet, then puts ``v`` at the front of ``stack``.
        """
        visited[v] = True
        for successor in self.graph[v]:
            if not visited[successor]:
                self.topologicalSortUtil(successor, visited, stack)
        # All successors are already in the list, so v goes before them.
        stack.insert(0, v)

    def topologicalSort(self):
        """Return the vertices in topological order.

        Starts a depth-first search from every vertex that is still
        unvisited, in increasing vertex order.  The result is only a valid
        ordering when the graph has no cycle.
        """
        order = []
        seen = [False for _ in range(self.V)]
        for vertex in range(self.V):
            if seen[vertex]:
                continue
            self.topologicalSortUtil(vertex, seen, order)
        return order

    def isCyclicUtil(self, v, visited, recStack):
        """Return True if a cycle is reachable from ``v``.

        ``recStack`` flags the vertices on the current recursion path;
        reaching a vertex that is still flagged there means a cycle.
        """
        visited[v] = True
        recStack[v] = True
        for neighbour in self.graph[v]:
            if visited[neighbour]:
                if recStack[neighbour]:
                    return True
            elif self.isCyclicUtil(neighbour, visited, recStack):
                return True
        # Leaving v: it is no longer part of the current path.
        recStack[v] = False
        return False

    def isCyclic(self):
        """Return True if the graph contains a cycle, else False."""
        seen = [False] * self.V
        on_path = [False] * self.V
        return any(
            not seen[start] and self.isCyclicUtil(start, seen, on_path)
            for start in range(self.V)
        )

In [25]:
# Maps each script name in imperative form to the same name with the
# leading verb in its "-ing" form.
dict_script = {
    "bake a cake": "baking a cake",
    "borrow a book from the library": "borrowing a book from the library",
    "change batteries in an alarm clock": "changing batteries in an alarm clock",
    "fly in an airplane": "flying in an airplane",
    "get a hair cut": "getting a hair cut",
    "go grocery shopping": "going grocery shopping",
    "go on a train": "going on a train",
    "plant a tree": "planting a tree",
    "repair a flat bicycle tire": "repairing a flat bicycle tire",
    "ride on a bus": "riding on a bus",
    "take a bath": "taking a bath",
    "order fastfood online": "ordering fastfood online",
    "cook in a microwave": "cooking in a microwave",
    "answer telephone": "answering telephone",
    "buy from a vending machine": "buying from a vending machine",
    "tie shoe laces": "tying shoe laces",
    "brush teeth": "brushing teeth",
    "make ginger paste": "making ginger paste",
    "go for a wedding": "going for a wedding",
    "attend a wedding": "attending a wedding",
    "wash a car": "washing a car",
    "take out trash": "taking out trash",
    "take a taxi": "taking a taxi",
    "surf the internet": "surfing the internet",
    "watch television": "watching television",
    "go to a club to dance": "going to a club to dance",
    "eat in a fast food restaurant": "eating in a fast food restaurant",
    "pay with a credit card": "paying with a credit card",
    "play tennis": "playing tennis",
    "go to the theater": "going to the theater",
    "take a child to bed": "taking a child to bed",
    "wash dishes": "washing dishes",
    "make a bonfire": "making a bonfire",
    "go to the sauna": "going to the sauna",
    "make coffee": "making coffee",
    "go to the swimming pool": "going to the swimming pool",
    "take a shower": "taking a shower",
    "iron laundry": "ironing laundry",
    "take a driving lesson": "taking a driving lesson",
    "go to the dentist": "going to the dentist",
    "go to a funeral": "going to a funeral",
    "wash one's hair": "washing one's hair",
    "fuel a car": "fueling a car",
    "send food back (in a restaurant)": "sending food back (in a restaurant)",
    "check in at an airport": "checking in at an airport",
    "have a barbecue": "having a barbecue",
    "order a pizza": "ordering a pizza",
    "clean up a flat": "cleaning up a flat",
    "make scrambled eggs": "making scrambled eggs",
    "take the underground": "taking the underground",
    "renovate a room": "renovating a room",
    "cook pasta": "cooking pasta",
    "sew a button": "sewing a button",
    "do laundry": "doing laundry",
    "go bowling": "going bowling",
}

In [22]:
# create graph from the partial predictions
def convert_event_to_node(events):
    """Build the two lookup tables between event texts and vertex ids.

    Args:
        events: sequence of event strings; the position of an event is its
            vertex id.

    Returns:
        tuple: ``(event_to_node, node_to_event)``.  Both use the
        whitespace-stripped event text.  If the same text occurs more than
        once, ``event_to_node`` keeps the last position.
    """
    stripped = [event.strip() for event in events]
    event_to_node = {text: node for node, text in enumerate(stripped)}
    node_to_event = dict(enumerate(stripped))
    return event_to_node, node_to_event

def correct_order(in_path, classification_inp_path, pred_path, out_path):
    """Re-order the events of each scenario from pairwise order predictions.

    For every line of ``in_path`` (``<prompt>: <events>``, events numbered
    or wrapped in ``<BEVENT> ... <EEVENT>``) the events are parsed and one
    graph vertex is created per event.  For each of the n*(n-1)/2 event
    pairs the next line of ``classification_inp_path`` names the two events
    and the matching line of ``pred_path`` gives the direction: a prediction
    of 1 adds the edge first -> second, anything else second -> first.
    Both files are consumed sequentially across all scenarios.

    If the resulting graph has no cycle, the events are written to
    ``out_path`` in topological order, renumbered from 1.  Otherwise a
    diagnostic is printed and the input line is written unchanged.

    Args:
        in_path: file with one scenario per line.
        classification_inp_path: file with one event pair per line; the two
            events are taken from the 2nd and 3rd ``</s>``-separated fields.
            (Presumably the classifier's input file - confirm with the
            script that produces it.)
        pred_path: file with one integer prediction per line, aligned with
            ``classification_inp_path``.
        out_path: destination file (overwritten).

    Raises:
        IndexError: if a line has no ``": "`` separator or the pair /
            prediction files are too short.
        KeyError: if an event named in a pair line is not one of the events
            parsed from the scenario.
    """
    eos = '<EOS>'
    closing_tag = '</bevent>'
    with open(in_path) as f, open(classification_inp_path) as l, open(pred_path) as p, open(out_path, 'w') as o:
        lines = f.readlines()
        partial_orders = l.readlines()
        predictions = p.readlines()
        j = 0  # cursor into partial_orders / predictions, shared by all scenes
        for scene in lines:
            # Split on the first ": " only, so a colon inside an event does
            # not cut the scenario short.
            splitted = scene.split(": ", 1)
            script = splitted[0].strip()

            # Remove the end marker as a suffix; ``rstrip(' <EOS>')`` strips
            # a set of characters and can eat letters of the last event.
            scenario = splitted[1].rstrip()
            if scenario.endswith(eos):
                scenario = scenario[:-len(eos)].rstrip()

            scenario = re.sub(r'\d+[.]', '</bevent> <bevent>', scenario)
            scenario = re.sub(r'<EEVENT>', '</bevent>', scenario)
            scenario = (scenario + '</bevent>').strip()
            # Drop only a leading stray closing tag.  ``lstrip('</bevent>')``
            # strips characters and removed the '<' of a leading '<BEVENT>',
            # which lost the first event of marker-formatted input.
            if scenario.startswith(closing_tag):
                scenario = scenario[len(closing_tag):]

            # Name the parser explicitly: the default depends on which
            # parsers happen to be installed and triggers a warning.
            soup = BeautifulSoup(scenario, 'html.parser')
            events = [a.string for a in soup.find_all('bevent')]

            event_to_node, node_to_event = convert_event_to_node(events)
            g = Graph(len(events))
            # One pair line / prediction per unordered pair of events.
            n_pairs = len(events) * (len(events) - 1) // 2
            for _ in range(n_pairs):
                two_events = partial_orders[j].strip()[0:-1].split('</s>')[1:3]
                node1 = event_to_node[two_events[0].strip()]
                node2 = event_to_node[two_events[1].strip()]
                if int(predictions[j].strip()) == 1:
                    g.addEdge(node1, node2)
                else:
                    g.addEdge(node2, node1)
                j += 1

            if g.isCyclic():
                # Contradictory pairwise predictions: keep the original line.
                print("cannot find an ordering")
                print(scene, g.graph)
                o.write("{}\n".format(scene.strip()))
            else:
                ordered_events = g.topologicalSort()
                scenario = script + ": "
                for idx, node in enumerate(ordered_events):
                    scenario += str(idx+1) + ". " + node_to_event[node] + " "
                o.write("{}\n".format(scenario))

In [24]:
# Re-order the cleaned generations of both splits for every prompt variant.
for split in ['test', 'valid']:
    for prompt in ['expect', 'ordered', 'direct', 'describe', 'tokens', 'all_tokens']:
        in_path = './outputs/generated_{}_{}_large_g16_epoch1_removed_deduplicated.txt'.format(split, prompt)
        classification_inp_path = './data/{}_classification_{}_output_removed_deduplicated.txt'.format(split, prompt)
        pred_path = './roberta-classification-partial-context-all-b16/epoch5/{}_{}_predictions.txt'.format(split, prompt)
        out_path = './outputs/generated_{}_{}_large_g16_epoch1_removed_deduplicated_ordered.txt'.format(split, prompt)
        correct_order(
            in_path=in_path,
            classification_inp_path=classification_inp_path,
            pred_path=pred_path,
            out_path=out_path,
        )

cannot find an ordering
<BOS> these are the things that happen when you bake a cake: 1. put the water in the stove 2. put the sugar in 3. turn on the stove 4. let the sugar melt 5. pour the cake into the water 6. mix the cake and pour it into the cake molds 7. place the molds on the cake 
 defaultdict(<class 'list'>, {0: [1, 2, 3, 4, 5, 6], 2: [1, 3, 4, 5, 6], 1: [3, 4], 5: [1, 3, 4, 6], 6: [1, 3], 4: [3, 6], 3: []})
cannot find an ordering
<BOS> these are the things that happen when you bake a cake: 1. get a mixer 2. get some water 3. place a small bowl of water on the mixer 4. put a small amount of butter into the bowl 5. add sugar 6. mix until the sugar is dissolved 7. place the bowl of water in the oven 8. turn the oven on 9. place the bowl of water on the cake 10. cook the cake for 15 minutes 11. take the cake out of the oven 12. cut the cake with a knife or knife and serve 
 defaultdict(<class 'list'>, {1: [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 0: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 

In [65]:
# Fold tests: re-order the cleaned generations of every fold (1..8) and
# every prompt variant.
for fold in range(1, 9):
    for prompt in ['basic', 'ordered', 'direct', 'describe', 'expect', 'tokens', 'all_tokens']:
        in_path = './outputs/folds/generated_test_{1}_large_fold{0}_g16_epoch1_removed_deduplicated.txt'.format(fold, prompt)
        classification_inp_path = './data/folds/test_classification_{1}_fold{0}_output_removed_deduplicated.txt'.format(fold, prompt)
        pred_path = './folds/roberta-classification-fold{0}-e5/{1}_fold{0}_predictions.txt'.format(fold, prompt)
        out_path = './outputs/folds/generated_test_{1}_large_fold{0}_g16_epoch1_removed_deduplicated_ordered.txt'.format(fold, prompt)
        correct_order(
            in_path=in_path,
            classification_inp_path=classification_inp_path,
            pred_path=pred_path,
            out_path=out_path,
        )

cannot find an ordering
<BOS> here is a sequence of events that happen while baking a cake: 1. get a loaf of bread 2. get a cupcake 3. fill a bowl with water 4. put the water in the bread 5. turn on the oven 6. wait for the cake to be done 7. take the bowl with the water and fill it with milk 8. put the cake in the oven 9. take the cake out of the oven 10. take the pan with the cream on it 11. put the cream in the cake 12. let it mix 13. pour the cream into the cake 14. pour the cake into the cups 15. wait to get all the cake on the cake 
 defaultdict(<class 'list'>, {0: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 2: [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 1: [3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 4: [1, 3, 5, 7, 8, 9, 10, 11, 12, 13, 14], 3: [5, 8, 10, 11, 12, 14], 6: [3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14], 7: [3, 5, 8, 10, 11, 12, 14], 9: [3, 5, 7, 8, 10, 11, 12, 13, 14], 13: [3, 7, 10, 11, 12, 14], 5: [8, 12, 13, 14], 10: [5, 12], 11: [5, 8, 10, 14], 8: [10, 12, 13], 14:

In [29]:
# Re-order the cleaned test generations of the 'basic' prompt.
in_path = (
    './outputs/'
    'generated_test_basic_large_g16_epoch1_removed_deduplicated.txt'
)
classification_inp_path = (
    './data/'
    'test_classification_basic_output_removed_deduplicated.txt'
)
pred_path = (
    './roberta-classification-partial-context-all-b16/epoch5/'
    'test_generated_removed_deduplicated_predictions.txt'
)
out_path = (
    './outputs/'
    'generated_test_basic_large_g16_epoch1_removed_deduplicated_ordered.txt'
)
correct_order(
    in_path=in_path,
    classification_inp_path=classification_inp_path,
    pred_path=pred_path,
    out_path=out_path,
)

cannot find an ordering
<BOS> here is a sequence of events that happen while baking a cake: 1. fill pan with water 2. put oil in pan 3. turn on burner on stove 4. put cake on pan 5. wait for it to harden 6. put butter in pan 7. add sugar 8. beat with a spoon 9. pour batter in pan 10. let cool for at least 30 minutes 11. eat cake 
 defaultdict(<class 'list'>, {0: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 2: [1, 3, 4, 5, 6, 7, 8, 9, 10], 1: [3, 4, 5, 6, 7, 8, 9, 10], 3: [4, 9, 10], 5: [3, 4, 6, 7, 8, 9, 10], 6: [3, 4, 7, 9, 10], 7: [3, 4, 8, 9, 10], 8: [3, 4, 6, 9, 10], 4: [9, 10], 9: [10], 10: []})
cannot find an ordering
<BOS> here is a sequence of events that happen while baking a cake: 1. get a loaf of bread 2. get a pan 3. get butter, salt and sugar 4. place the pan on the stove 5. turn the stove on 6. put the bread in the pan 7. put some sugar and butter in the pan 8. turn on the stove 9. cook the cake 10. take out the pan 11. turn off the stove 12. place the pan in the trash 
 defaultdict(

# New Scenarios

In [107]:
# Re-order the cleaned generations for the new scenarios ('ordered' prompt).
in_path = (
    './outputs/'
    'generated_new_scenarios_ordered_removed_deduplicated.txt'
)
classification_inp_path = (
    './data/'
    'test_classification_new_scenarios_ordered_removed_deduplicated.txt'
)
pred_path = (
    './roberta-classification-partial-context-all-b16/epoch5/'
    'new_scenarios_ordered_predictions.txt'
)
out_path = (
    './outputs/'
    'generated_new_scenarios_ordered_removed_deduplicated_ordered.txt'
)
correct_order(
    in_path=in_path,
    classification_inp_path=classification_inp_path,
    pred_path=pred_path,
    out_path=out_path,
)

cannot find an ordering
<BOS> here is an ordered sequence of events that occur when you order fastfood online: 1. choose the type of food you want 2. place the order 3. make sure the food is delivered 4. choose the delivery service 5. pay for the delivery service 6. wait for the order to come 7. take the delivery 8. eat the food 
 defaultdict(<class 'list'>, {0: [1, 2, 4, 5, 6, 7], 3: [0, 1, 2, 4, 5, 6, 7], 1: [2, 4, 5, 6, 7], 4: [2, 7], 5: [2, 4, 6, 7], 6: [2, 4], 2: [7], 7: [6]})
cannot find an ordering
<BOS> here is an ordered sequence of events that occur when you order fastfood online: 1. go to website 2. select what you want 3. call the company 4. tell them what you want 5. have your credit card or debit card ready 6. wait for them to take your order 7. enjoy the food 
 defaultdict(<class 'list'>, {0: [1, 2, 3, 4, 5, 6], 1: [2, 3, 4, 5, 6], 2: [3, 5, 6], 4: [2, 5, 6], 3: [4, 5, 6], 5: [6]})
cannot find an ordering
<BOS> here is an ordered sequence of events that occur when you or

In [67]:
# Re-order the cleaned generations for the new scenarios, fold 1.
in_path = (
    './outputs/folds/'
    'generated_new_scenarios_basic_large_fold1_g16_epoch1_removed_deduplicated.txt'
)
classification_inp_path = (
    './data/folds/'
    'test_classification_new_scenarios_basic_large_fold1_g16_removed_deduplicated.txt'
)
pred_path = (
    './folds/roberta-classification-fold1-e5/'
    'new_scenarios_fold1_predictions.txt'
)
out_path = (
    './outputs/folds/'
    'generated_new_scenarios_basic_large_fold1_g16_epoch1_removed_deduplicated_ordered.txt'
)
correct_order(
    in_path=in_path,
    classification_inp_path=classification_inp_path,
    pred_path=pred_path,
    out_path=out_path,
)

cannot find an ordering
<BOS> here is a sequence of events that happen while buying from a vending machine: 1. enter the store 2. select a product 3. go to the cashier 4. pay with the card 5. select delivery or take the product 6. get the product when it arrives 7. get out of the store 
 defaultdict(<class 'list'>, {0: [1, 2, 3, 4, 5, 6], 2: [1, 3, 4, 5, 6], 1: [3, 4, 5, 6], 3: [4, 6], 5: [3, 6], 4: [5, 6]})
cannot find an ordering
<BOS> here is a sequence of events that happen while tying shoe laces: 1. take off shoes. 2. put shoes on. 3. pick out a new set of laces. 4. pick up shoe that you want to tie laces. 5. tie shoe laces to shoe. 6. hang up shoes. 
 defaultdict(<class 'list'>, {0: [1, 2, 3, 4, 5], 2: [1, 3, 4, 5], 1: [3, 5], 4: [1, 5], 3: [4, 5]})
cannot find an ordering
<BOS> here is a sequence of events that happen while tying shoe laces: 1. find out what size and color you want your laces to be 2. find a place you will hang your laces so they are not too loose or too tight 3