In [1]:
import os
from transformers import GPT2Tokenizer
import numpy as np
import pandas as pd
import json
# GPT-2 tokenizer; used further down to encode delimiters and action ids into token ids.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

  from .autonotebook import tqdm as notebook_tqdm


## Dataset

In [2]:
# Folder containing the annotation CSV files loaded by `read_annotation`.
DATAFOLDER = 'mistake_labels'
# `os.listdir` returns entries in arbitrary order and also lists anything that
# is not an annotation file (hidden files, checkpoint folders, ...). Keep only
# the CSVs and sort them so that every list derived from this one is
# reproducible across runs and machines.
ALL_CSV_NAMES = sorted(
    name for name in os.listdir(DATAFOLDER) if name.lower().endswith('.csv')
)

def read_annotation(csv_name):
    """Load one annotation file from ``DATAFOLDER`` as a DataFrame.

    Args:
        csv_name: File name (not a full path) of the CSV inside ``DATAFOLDER``.

    Returns:
        The parsed ``pandas.DataFrame``; the first row of the file is used as
        the column header.
    """
    csv_path = os.path.join(DATAFOLDER, csv_name)
    return pd.read_csv(csv_path, header=0)

def check_for_mistake(df):
    """Tell whether a procedure contains at least one mistake annotation.

    Args:
        df: Annotation DataFrame with a string ``label`` column.

    Returns:
        ``True`` if any label contains the substring ``"mistake"``
        (case-insensitive), ``False`` otherwise.
    """
    # na=False: a missing label must count as "no mistake". Without it the
    # mask holds NaN for missing entries, and NaN is truthy for the builtin
    # `any`, which would flag the whole procedure as a mistake.
    mistake_mask = df['label'].str.contains('mistake', case=False, na=False)
    return bool(mistake_mask.any())

def split_csv_files(csv_files):
    """Partition annotation files by whether they contain a mistake.

    Args:
        csv_files: Iterable of CSV file names readable by ``read_annotation``.

    Returns:
        A pair ``(correct_files, mistake_files)``; each list keeps the order in
        which the names appear in ``csv_files``.
    """
    # Bucket index: False -> no mistake found, True -> at least one mistake.
    buckets = {False: [], True: []}
    for name in csv_files:
        has_mistake = bool(check_for_mistake(read_annotation(name)))
        buckets[has_mistake].append(name)
    return buckets[False], buckets[True]

def transform_to_onehot(line):
    """Encode one annotation row as a count vector over ``FULL_VERB_LIST``.

    Args:
        line: An ``(index, row)`` pair as yielded by ``DataFrame.iterrows()``;
            the row must provide the ``this``, ``that`` and ``verb`` fields.

    Returns:
        A tuple with one integer per entry of ``FULL_VERB_LIST``. The positions
        of ``this``, ``that`` and ``verb`` are each incremented by one, so a
        position holds 2 when ``this`` and ``that`` name the same part.

    Raises:
        ValueError: If any of the three values is not in ``FULL_VERB_LIST``.
    """
    row = line[1]
    values = [row[field] for field in ('this', 'that', 'verb')]
    positions = [FULL_VERB_LIST.index(value) for value in values]

    counts = [0] * len(FULL_VERB_LIST)
    for position in positions:
        counts[position] += 1
    return tuple(counts)

def transform_to_str(line):
    """Render one annotation row as a canonical ``verb-part-part`` string.

    Args:
        line: An ``(index, row)`` pair as yielded by ``DataFrame.iterrows()``;
            the row must provide the ``this``, ``that`` and ``verb`` fields.

    Returns:
        ``"<verb>-<part>-<part>"`` where the two parts are in sorted order, so
        swapping ``this`` and ``that`` yields the same string.
    """
    row = line[1]
    first_part, second_part = sorted((row['this'], row['that']))
    return row['verb'] + "-" + first_part + "-" + second_part


def extract_label(line):
    """Return the ``label`` field of an ``(index, row)`` pair from ``iterrows()``."""
    row = line[1]
    return row['label']

# Split the annotation files into mistake-free and mistake-containing procedures.
CORRECT_PROC, MISTAKE_PROC = split_csv_files(ALL_CSV_NAMES)  # 138, 190

# Verbs are fixed by hand; parts are gathered from both part columns of every file.
ALL_VERBS = {'attach', 'detach'}
ALL_PARTS = set()
for csv_file in ALL_CSV_NAMES:
    df = read_annotation(csv_file)
    for part_column in ('this', 'that'):
        ALL_PARTS.update(df[part_column].unique())

# Sorted lists give every verb/part a stable position: verbs first, then parts.
ALL_VERBS = sorted(ALL_VERBS)
ALL_PARTS = sorted(ALL_PARTS)
FULL_VERB_LIST = ALL_VERBS + ALL_PARTS
assert len(FULL_VERB_LIST) == 67

In [3]:
# Collect every distinct action found in the annotations, in both encodings
# (count-vector tuple and "verb-part-part" string).
ALL_ACTIONS = set()
ALL_ACTIONS_STR = set()
for csv_file in ALL_CSV_NAMES:
    for line in read_annotation(csv_file).iterrows():
        ALL_ACTIONS.add(transform_to_onehot(line))
        # transform_to_str sorts the two parts, so the string form is canonical
        ALL_ACTIONS_STR.add(transform_to_str(line))

# Sequence boundary markers: all-zeros / all-ones vectors and their string names.
BOS = "<BOS>"
EOS = "<EOS>"
BOS_TOKEN = (0,) * len(FULL_VERB_LIST)
EOS_TOKEN = (1,) * len(FULL_VERB_LIST)

ALL_ACTIONS |= {BOS_TOKEN, EOS_TOKEN}  # start and end tokens
ALL_ACTIONS_STR |= {BOS, EOS}

# Freeze both vocabularies as sorted lists so each action has a stable index.
ALL_ACTIONS = sorted(ALL_ACTIONS)
ALL_ACTIONS_STR = sorted(ALL_ACTIONS_STR)

In [4]:
# Action string -> integer id, where the id is the position in the sorted ALL_ACTIONS_STR.
symbolic_vocabulary = dict(zip(ALL_ACTIONS_STR, range(len(ALL_ACTIONS_STR))))

In [5]:
# Show the full action-string -> id mapping.
for action_name, action_id in symbolic_vocabulary.items():
    print(action_name, ":", action_id)

<BOS> : 0
<EOS> : 1
attach-arm connector-body : 2
attach-arm connector-boom : 3
attach-arm connector-bulldozer arm : 4
attach-arm connector-bumper : 5
attach-arm connector-cabin : 6
attach-arm connector-chassis : 7
attach-arm connector-excavator arm : 8
attach-arm connector-interior : 9
attach-arm connector-nut : 10
attach-arm connector-push frame : 11
attach-arm-arm : 12
attach-arm-arm connector : 13
attach-arm-blade : 14
attach-arm-body : 15
attach-arm-boom : 16
attach-arm-bucket : 17
attach-arm-cabin : 18
attach-arm-chassis : 19
attach-arm-clamp : 20
attach-arm-hook : 21
attach-arm-interior : 22
attach-arm-jackhammer : 23
attach-arm-nut : 24
attach-arm-push frame : 25
attach-arm-roller : 26
attach-arm-track : 27
attach-arm-turntable base : 28
attach-arm-turntable top : 29
attach-back seat-body : 30
attach-back seat-chassis : 31
attach-back seat-interior : 32
attach-base-base : 33
attach-base-body : 34
attach-base-cabin window : 35
attach-base-chassis : 36
attach-base-container : 37


In [6]:
# Token ids of the delimiters between items, rows and samples.
item_delim = tokenizer.encode(",")
row_delim = tokenizer.encode("\n")
sample_delim = tokenizer.encode("---\n")

# Handpicked: comma-separated number matrices.
# Action id `a` is represented by the first token of the text " a".
# NOTE(review): only the first token is kept; if the tokenizer splits a number
# into several tokens the id is truncated and may collide with another one.
alphabet = [tokenizer.encode(" " + str(a))[0] for a in range(len(ALL_ACTIONS_STR))]

# Build both lookup tables once. The previous lambdas rebuilt a full dict on
# every single call, making each lookup linear in the vocabulary size.
_VALUE_TO_TOKEN = dict(enumerate(alphabet))
# On a duplicated token the last value wins, as it did in the original mapping.
_TOKEN_TO_VALUE = {token: value for value, token in _VALUE_TO_TOKEN.items()}


def value_to_token(x):
    """Return the tokenizer token id that represents action id ``x``.

    Raises:
        KeyError: If ``x`` is not a valid action id.
    """
    return _VALUE_TO_TOKEN[x]


def token_to_value(x):
    """Return the action id represented by tokenizer token id ``x``.

    Raises:
        KeyError: If ``x`` is not part of the alphabet.
    """
    return _TOKEN_TO_VALUE[x]

# Random sampled tokens.
# seed_offset = 0
# np.random.seed(42 + seed_offset)
# alphabet = [int(i) for i in np.random.randint(tokenizer.vocab_size, size=10)]
# value_to_token = lambda x: {i:a for i, a in enumerate(alphabet)}[x]

print("Token Set:", {i: value_to_token(i) for i in range(len(alphabet))})

Token Set: {0: 657, 1: 352, 2: 362, 3: 513, 4: 604, 5: 642, 6: 718, 7: 767, 8: 807, 9: 860, 10: 838, 11: 1367, 12: 1105, 13: 1511, 14: 1478, 15: 1315, 16: 1467, 17: 1596, 18: 1248, 19: 678, 20: 1160, 21: 2310, 22: 2534, 23: 2242, 24: 1987, 25: 1679, 26: 2608, 27: 2681, 28: 2579, 29: 2808, 30: 1542, 31: 3261, 32: 3933, 33: 4747, 34: 4974, 35: 3439, 36: 4570, 37: 5214, 38: 4353, 39: 5014, 40: 2319, 41: 6073, 42: 5433, 43: 5946, 44: 5846, 45: 4153, 46: 6337, 47: 6298, 48: 4764, 49: 5125, 50: 2026, 51: 6885, 52: 6740, 53: 7192, 54: 7175, 55: 5996, 56: 7265, 57: 7632, 58: 7618, 59: 7863, 60: 3126, 61: 8454, 62: 8190, 63: 8093, 64: 5598, 65: 6135, 66: 7930, 67: 8275, 68: 8257, 69: 8644, 70: 4317, 71: 9166, 72: 7724, 73: 8854, 74: 8915, 75: 5441, 76: 8684, 77: 8541, 78: 8699, 79: 9225, 80: 4019, 81: 9773, 82: 9415, 83: 9698, 84: 9508, 85: 7600, 86: 9849, 87: 10083, 88: 9193, 89: 9919, 90: 4101, 91: 10495, 92: 10190, 93: 10261, 94: 10048, 95: 6957, 96: 9907, 97: 10111, 98: 9661, 99: 7388, 100:

In [7]:
def from_act_to_tokens(act):
    """Map an action string to its tokenizer token id.

    The action is first looked up in ``symbolic_vocabulary`` and the resulting
    id is then converted with ``value_to_token``.
    """
    action_id = symbolic_vocabulary[act]
    return value_to_token(action_id)

def from_proc_to_seq(proc, with_labels=False):
    """Convert an annotation DataFrame into its sequence of action strings.

    Args:
        proc: Annotation DataFrame (one row per action).
        with_labels: When true, also collect the ``label`` of every row.

    Returns:
        The list of action strings in row order, or the pair
        ``(actions, labels)`` when ``with_labels`` is true.
    """
    rows = list(proc.iterrows())
    seq = [transform_to_str(row) for row in rows]
    if not with_labels:
        return seq
    labels = [extract_label(row) for row in rows]
    return seq, labels

def from_seq_to_tokens(seq):
    """Encode a list of action strings as tokenizer token ids.

    Each action contributes exactly one token. The encoding of a newline is
    appended afterwards unless ``seq`` holds exactly one action (the callers in
    this notebook pass a single-element list for the expected output).

    Args:
        seq: List of action strings known to ``symbolic_vocabulary``.

    Returns:
        A flat list of token ids.
    """
    tokens = [from_act_to_tokens(action) for action in seq]
    if len(seq) != 1:
        tokens.extend(tokenizer.encode("\n"))
    return tokens

def from_token_to_act(token):
    """Map a tokenizer token id back to the action string it stands for."""
    action_id = token_to_value(token)
    return ALL_ACTIONS_STR[action_id]

def from_tokens_to_seq(tokens):
    """Decode an iterable of token ids into the list of matching action strings."""
    return [from_token_to_act(token) for token in tokens]

def from_seq_to_sentence(seq, input_string="input:\n", output_string="output:\n", input_output=False):
    """Render an action sequence as a textual next-symbol prompt.

    Every action is replaced by its index in ``ALL_ACTIONS_STR``. All actions
    but the last form the input line (``" 145, 149, ..."``); the last action
    is the expected output (``" 143\\n"``).

    Args:
        seq: Non-empty list of action strings present in ``ALL_ACTIONS_STR``.
        input_string: Header written before the input line.
        output_string: Header written before the expected output.
        input_output: When true, return prompt and expected output separately.

    Returns:
        ``(prompt, expected_output)`` when ``input_output`` is true, otherwise
        the two concatenated into a single string.
    """
    history_ids = [" " + str(ALL_ACTIONS_STR.index(action)) for action in seq[:-1]]
    target_id = " " + str(ALL_ACTIONS_STR.index(seq[-1]))

    prompt = input_string + ",".join(history_ids) + "\n" + output_string
    completion = target_id + "\n"
    return (prompt, completion) if input_output else prompt + completion

        

def from_seq_to_token_datasets(correct_csvs=CORRECT_PROC, mistake_csvs=MISTAKE_PROC, max_train_samples=2):
    """Build tokenised few-shot samples and test prompts from annotation files.

    Every procedure is encoded as ``"input:\\n" <all actions but the last>
    "output:\\n" <last action>``.

    Args:
        correct_csvs: List of CSV names used as complete training samples.
        mistake_csvs: CSV names used as test cases; prompt and expected output
            are returned separately.
        max_train_samples: How many entries of ``correct_csvs`` to use, taken
            from the front. Defaults to 2, the limit that was previously
            hard-coded in the loop; pass ``None`` to use every file.

    Returns:
        ``(train_samples, test_inputs, test_outputs)``, three lists of token-id
        lists. ``test_inputs[i]`` and ``test_outputs[i]`` belong to the same file.
    """
    # The headers are identical for every sample, so encode them only once.
    input_header = tokenizer.encode("input:\n")
    output_header = tokenizer.encode("output:\n")

    def _prompt_tokens(seq):
        # Tokens of everything up to and including the "output:" header.
        return input_header + from_seq_to_tokens(seq[:-1]) + output_header

    # Slice up front so files beyond the limit are never read or tokenised
    # (the old loop processed one extra file and then discarded it).
    train_samples = []
    for csv_name in correct_csvs[:max_train_samples]:
        seq = from_proc_to_seq(read_annotation(csv_name))
        train_samples.append(_prompt_tokens(seq) + from_seq_to_tokens(seq[-1:]))

    test_inputs = []
    test_outputs = []
    for csv_name in mistake_csvs:
        seq = from_proc_to_seq(read_annotation(csv_name))
        test_inputs.append(_prompt_tokens(seq))
        test_outputs.append(from_seq_to_tokens(seq[-1:]))
    return train_samples, test_inputs, test_outputs

## Grouping Toys

In [8]:
# Toy grouping table: the file has no header row, so the two columns are
# named explicitly (toy name, toy type).
action_df = pd.read_csv(
    "grouping_actions.csv",
    names=['name', 'type'],
    header=None,
)

In [9]:
# Distinct toy types, in order of first appearance in the grouping table.
toy_types = pd.unique(action_df['type']).tolist()

In [10]:
# Toy type -> list of toy names belonging to that type.
toy_types_dict = {
    toy_type: action_df.loc[action_df['type'] == toy_type, 'name'].tolist()
    for toy_type in toy_types
}

In [11]:
# Display the toy type -> toy names mapping built in the previous cell.
toy_types_dict

{'excavator': ['a01',
  'b04a',
  'b05a',
  'b06c',
  'c01a',
  'c02a',
  'c03a',
  'c04a',
  'c04d',
  'c05a',
  'c06e',
  'c07a',
  'c12c',
  'c13c',
  'c14a'],
 'bulldozer': ['a02',
  'a03',
  'a07',
  'a11',
  'b01a',
  'b04b',
  'b08b',
  'b08c',
  'c02b',
  'c03b',
  'c05b',
  'c13d'],
 'clamp': ['a06', 'a09', 'b02b', 'b08a', 'c14b'],
 'crane': ['a08', 'b05b', 'b06b', 'c01c', 'c06c', 'c08a', 'c12a'],
 'garbage_truck': ['a10', 'a15', 'c07c', 'c08b', 'c10a'],
 'dumper': ['a12',
  'a13',
  'a21',
  'b02a',
  'b05d',
  'b06a',
  'b08d',
  'c01b',
  'c03d',
  'c06a',
  'c09a',
  'c10b',
  'c12e',
  'c13e'],
 'transporter': ['a14', 'c08c', 'c09c'],
 'ladder_truck': ['a16', 'a19', 'b03a', 'c06f', 'c07b', 'c12d'],
 'fire_truck': ['a17', 'a18', 'a20', 'b03b'],
 'car': ['a23', 'a24', 'a26', 'a31', 'c11a'],
 'suv': ['a27', 'a28', 'a29', 'a30', 'c11b'],
 'roller': ['b01b', 'b04d', 'c02c', 'c03f', 'c04c', 'c13a'],
 'jackhammer': ['b04c', 'c03e', 'c04b', 'c13b'],
 'cement_mixer': ['b05c', 'b06

## JSONs

### Regular Jsons

In these files, given the procedure for a toy X, we use as context only the other procedures that are **correct** and assemble the **same toy X**.

In [16]:
# Write one JSON file per procedure into "mistake_jsons". The context
# candidates (pos/neg examples) are the other procedures assembling the same toy.
os.makedirs("mistake_jsons", exist_ok=True)  # the output folder may not exist yet

# File names carry "<user>-<toy>" as their fourth "_"-separated field. Parse
# the toy id once per file so toys are compared exactly rather than with a
# substring test against the whole file name.
toy_by_file = {name: name.split("_")[3].split("-")[1] for name in ALL_CSV_NAMES}

for proc in ALL_CSV_NAMES:
    list_proc = read_annotation(proc)
    # annotate user and toy
    user, toy = proc.split("_")[3].split("-")
    toy_type = action_df[action_df['name'] == toy]['type'].tolist()[0]
    pos_examples = [x.split(".")[0] for x in CORRECT_PROC if toy_by_file[x] == toy and x != proc]
    neg_examples = [x.split(".")[0] for x in MISTAKE_PROC if toy_by_file[x] == toy and x != proc]
    label = "correct" if proc in CORRECT_PROC else "mistake"
    seq, labels = from_proc_to_seq(list_proc, with_labels=True)
    if label == "mistake":
        # Use the same criterion as `check_for_mistake` (case-insensitive
        # substring) so every file classified as a mistake can be truncated.
        first_mistake_idx = next(
            (i for i, lab in enumerate(labels) if "mistake" in str(lab).lower()),
            None,
        )
        if first_mistake_idx is None:
            raise ValueError(f"{proc} is classified as a mistake procedure but has no 'mistake' label")
        # Keep the sequence up to and including the first mistake.
        seq = seq[:first_mistake_idx + 1]
        labels = labels[:first_mistake_idx + 1]
    action_str_input, action_str_output = from_seq_to_sentence(
        seq, input_string="Input Sequence:\n", output_string="Next Symbol:\n", input_output=True
    )
    curr_dict = {
        "user": user,
        "toy": toy,
        "toy_type": toy_type,
        "pos_examples": pos_examples,
        "neg_examples": neg_examples,
        "procedure_label": label,
        "seq": seq,
        "actions_label": labels,
        "input_str": action_str_input,
        "output_str": action_str_output
    }
    with open(os.path.join("mistake_jsons", proc.split(".")[0] + ".json"), "w") as f:
        json.dump(curr_dict, f, indent=4)

In [17]:
# Add a "context_str" field to every JSON in "mistake_jsons": the concatenation
# of all its positive examples, each rendered as input + output + "---\n".
# Sorted and filtered because os.listdir returns entries in arbitrary order and
# may list entries that are not JSON files.
for json_file in sorted(os.listdir("mistake_jsons")):
    if not json_file.endswith(".json"):
        continue
    with open(os.path.join("mistake_jsons", json_file), "r") as f:
        curr_dict = json.load(f)
    context_parts = []
    for neighbour_proc in curr_dict["pos_examples"]:
        with open(os.path.join("mistake_jsons", neighbour_proc + ".json"), "r") as f:
            neighbour_dict = json.load(f)
        context_parts.append(neighbour_dict["input_str"] + neighbour_dict["output_str"] + "---\n")
    curr_dict["context_str"] = "".join(context_parts)
    # The file is rewritten in place with the extra field.
    with open(os.path.join("mistake_jsons", json_file), "w") as f:
        json.dump(curr_dict, f, indent=4)

Example:
```
{
    "user": "9021",
    "toy": "c10a",
    "toy_type": "garbage_truck",
    "pos_examples": [
        "nusar-2021_action_both_9072-c10a_9072_user_id_2021-02-11_112415",
        "nusar-2021_action_both_9085-c10a_9085_user_id_2021-02-22_174720"
    ],
    "neg_examples": [
        "nusar-2021_action_both_9051-c10a_9051_user_id_2021-02-22_120421"
    ],
    "procedure_label": "correct",
    "seq": [
        "attach-connector-container",
        "attach-container-rear roof",
        "attach-container-lid",
        "attach-container-container",
        "attach-chassis-interior",
        "attach-cabin-interior",
        "attach-chassis-container",
        "attach-cabin-roof",
        "attach-bumper-cabin",
        "attach-light-roof",
        "attach-chassis-wheel"
    ],
    "actions_label": [
        "correct",
        "correct",
        "correct",
        "correct",
        "correct",
        "correct",
        "correct",
        "correct",
        "correct",
        "correct",
        "correct"
    ],
    "input_str": "Input Sequence:\n 145, 149, 148, 147, 125, 107, 117, 111, 89, 174\nNext Symbol:\n",
    "output_str": " 143\n",
    "context_str": "Input Sequence:\n 135, 143, 125, 107, 117, 111\nNext Symbol:\n 89\n---\nInput Sequence:\n 148, 147, 149, 125, 102, 117, 89, 174, 111\nNext Symbol:\n 143\n---\n"
}
```

### Modified JSONs
Here we try a different strategy. Since the "context" is often very limited (for some toys there is none at all, because only one procedure exists for them), we enlarge the context with sequences from other toys (with a different name) of the same type (excavator vs. crane vs. dumper, and so on).

Expectations:
- PRO:
    - Larger context, making it easier to find a similar action (instead of predicting completely blind)
- CON:
    - It can be confusing, since not all toys of the same type share the same components

In [24]:
jsons = []
for proc in ALL_CSV_NAMES:
    list_proc = read_annotation(proc)
    # annotate user and toy
    user, toy = proc.split("_")[3].split("-")
    toy_type = action_df[action_df['name'] == toy]['type'].tolist()[0]
    pos_examples = [x.split(".")[0] for x in CORRECT_PROC if toy in x and x != proc]
    neg_examples = [x.split(".")[0] for x in MISTAKE_PROC if toy in x and x != proc]
    # Here we consider also the similar toys:'
    for toy_name in toy_types_dict[toy_type]:
        if toy_name == toy:
            continue
        pos_examples += [x.split(".")[0] for x in CORRECT_PROC if toy_name in x]
        neg_examples += [x.split(".")[0] for x in MISTAKE_PROC if toy_name in x]
    label = "correct" if proc in CORRECT_PROC else "mistake"
    seq, labels= from_proc_to_seq(list_proc, with_labels=True)
    if label == "mistake":
        first_mistake_idx = labels.index("mistake")
        seq = seq[:first_mistake_idx + 1]
        labels = labels[:first_mistake_idx + 1]
    action_str_input, action_str_output = from_seq_to_sentence(seq, input_string="Input Sequence:\n", output_string="Next Symbol:\n", input_output=True)
    # print(proc)
    # print(user, toy)
    # print(pos_examples)
    # print(neg_examples)
    # print(label)
    # print(seq)
    # print(labels)
    # print(action_str_input)
    # print(action_str_output)
    curr_dict = {
        "user": user,
        "toy": toy,
        "toy_type": toy_type,
        "pos_examples": pos_examples,
        "neg_examples": neg_examples,
        "procedure_label": label,
        "seq": seq,
        "actions_label": labels,
        "input_str": action_str_input,
        "output_str": action_str_output
    }
    with open(os.path.join("mistake_jsons_more_context", proc.split(".")[0] + ".json"), "w") as f:
        json.dump(curr_dict, f, indent=4)

['nusar-2021_action_both_9046-b06b_9046_user_id_2021-02-22_105953', 'nusar-2021_action_both_9071-b06b_9071_user_id_2021-02-11_100739', 'nusar-2021_action_both_9015-a08_9015_user_id_2021-02-02_155549', 'nusar-2021_action_both_9044-b05b_9044_user_id_2021-02-05_163057', 'nusar-2021_action_both_9036-c01c_9036_user_id_2021-02-18_092539', 'nusar-2021_action_both_9051-c08a_9051_user_id_2021-02-22_115717', 'nusar-2021_action_both_9081-c08a_9081_user_id_2021-02-12_160626', 'nusar-2021_action_both_9025-c12a_9025_user_id_2021-02-18_111731']
['nusar-2021_action_both_9042-a02_9042_user_id_2021-02-05_111642', 'nusar-2021_action_both_9032-a03_9032_user_id_2021-02-25_155323', 'nusar-2021_action_both_9074-a03_9074_user_id_2021-02-11_151600', 'nusar-2021_action_both_9026-a07_9026_user_id_2021-02-03_162446', 'nusar-2021_action_both_9075-a07_9075_user_id_2021-02-12_093237', 'nusar-2021_action_both_9055-b08b_9055_user_id_2021-02-24_104106', 'nusar-2021_action_both_9034-c02b_9034_user_id_2021-02-23_173828',

In [25]:
# Add a "context_str" field to every JSON in "mistake_jsons_more_context": the
# concatenation of all its positive examples, each rendered as
# input + output + "---\n".
# The loop now lists the directory it actually reads and rewrites; it used to
# list "mistake_jsons", which only worked while both folders held exactly the
# same file names. Neighbours are read from the same folder too, since every
# file there already stores its own "input_str"/"output_str", so this cell no
# longer depends on the regular-JSON cells having been run.
for json_file in sorted(os.listdir("mistake_jsons_more_context")):
    if not json_file.endswith(".json"):
        continue
    with open(os.path.join("mistake_jsons_more_context", json_file), "r") as f:
        curr_dict = json.load(f)
    context_parts = []
    for neighbour_proc in curr_dict["pos_examples"]:
        with open(os.path.join("mistake_jsons_more_context", neighbour_proc + ".json"), "r") as f:
            neighbour_dict = json.load(f)
        context_parts.append(neighbour_dict["input_str"] + neighbour_dict["output_str"] + "---\n")
    curr_dict["context_str"] = "".join(context_parts)
    # The file is rewritten in place with the extra field.
    with open(os.path.join("mistake_jsons_more_context", json_file), "w") as f:
        json.dump(curr_dict, f, indent=4)