In [4]:
import json
import pandas as pd
import os
import re
import seaborn as sns
import math
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from tqdm import tqdm
from transformers import RobertaTokenizerFast
from nltk import word_tokenize, pos_tag, ngrams
from transformers import RobertaTokenizerFast
from pprint import pprint
from wordcloud import WordCloud
import spacy
from tars.alfred.gen.constants import OBJECTS

ModuleNotFoundError: No module named 'wordcloud'

# Metadata Analysis

In [None]:
# Pretrained "roberta-base" tokenizer, loaded once at cell execution and reused by
# get_num_bbpe_tokens for every call.
bbpe_tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

def get_num_tokens(text):
    """Return how many tokens NLTK's ``word_tokenize`` produces for ``text``."""
    tokens = word_tokenize(text)
    return len(tokens)

def get_num_bbpe_tokens(text):
    """Return the length of the ``input_ids`` the module-level ``bbpe_tokenizer`` yields for ``text``.

    NOTE(review): the tokenizer is called with its default arguments, so whatever
    special tokens it adds by default are part of the count — confirm this is intended.
    """
    encoding = bbpe_tokenizer(text)
    input_ids = encoding["input_ids"]
    return len(input_ids)

def freqs_to_df(d, sort=True):
    """Turn a ``{key: count}`` mapping into a one-column ``"Count"`` DataFrame.

    Args:
        d: Mapping from item to its frequency; keys become the index.
        sort: When True (default) rows are ordered by descending count.
            When False the mapping's own order is kept.

    Returns:
        pandas.DataFrame indexed by the mapping keys with a single "Count" column.
    """
    df = pd.DataFrame.from_dict(d, orient="index", columns=["Count"])
    # `sort` used to be accepted but ignored; honour it while keeping the old default.
    if sort:
        df = df.sort_values("Count", ascending=False)
    return df

def check_obj_in_steps(obj_toks_lower, steps, high_idx):
    """Report whether any object token occurs in the step at ``high_idx`` or a neighbour.

    Args:
        obj_toks_lower: Lower-cased tokens of the object name.
        steps: Step description strings.
        high_idx: Index of the step the action belongs to.

    Returns:
        1 if any token is a substring of the lower-cased step at ``high_idx``,
        ``high_idx - 1`` or ``high_idx + 1``; otherwise 0.
    """
    # Only look at indices that really exist: ``high_idx`` can point past the end of
    # ``steps`` (which used to raise IndexError via ``steps[high_idx - 1]``), and a
    # negative index must not silently wrap around to the end of the list.
    candidate_indices = (high_idx, high_idx - 1, high_idx + 1)
    check_steps = [steps[i] for i in candidate_indices if 0 <= i < len(steps)]

    for step in check_steps:
        step_lower = step.lower()
        if any(tok in step_lower for tok in obj_toks_lower):
            return 1
    return 0

def task_dirs(splits, data_path):
    """Yield ``(split, task_dir)`` for every entry listed under each split directory.

    Each split's listing is wrapped in ``tqdm`` so that one progress bar is shown
    per split while the generator is consumed.
    """
    for split_name in splits:
        split_entries = os.listdir(f'{data_path}/{split_name}')
        for entry in tqdm(split_entries):
            yield split_name, entry

In [None]:
# Root directory containing one sub-directory per split.
data_path = "../tars/alfred/data/json_2.1.0"

# Split sub-directories that get scanned.
splits = [
    "train",
    "valid_seen",
    "valid_unseen",
]

# Names given, in order, to the "-"-separated parts of a task directory name.
task_fields = [
    "task_type",
    "focus_object",
    "base_object",
    "dest_object",
    "scene",
]

In [None]:
# Build table for generating dataset statistics
stats_dict = defaultdict(list)                        # column name -> one value per trial
action_freqs = defaultdict(lambda: defaultdict(int))  # split -> api action name -> count
bigram_freqs = {"task": defaultdict(int), "steps": defaultdict(int)}
trigram_freqs = {"task": defaultdict(int), "steps": defaultdict(int)}

for split, task_dir in task_dirs(splits, data_path):
    # The task directory name carries the task fields separated by "-".
    task_values = task_dir.split("-")
    for trial_dir in os.listdir("{}/{}/{}".format(data_path, split, task_dir)):
        stats_dict["split"].append(split)
        stats_dict["task_id"].append(trial_dir)

        for j, field in enumerate(task_fields):
            stats_dict[field].append(task_values[j])

        # Use a context manager so the file handle is released as soon as the JSON is parsed.
        with open("{}/{}/{}/{}/traj_data.json".format(data_path, split, task_dir, trial_dir)) as traj_data_file:
            traj_data = json.load(traj_data_file)
        num_steps_list = []
        num_step_tokens_list = []
        num_task_tokens_list = []
        num_step_bbpe_tokens_list = []
        num_task_bbpe_tokens_list = []

        # common_nouns_freq = defaultdict(lambda: 0)
        # Every annotation contributes one entry to each list; the lists are averaged below.
        for directive in traj_data["turk_annotations"]["anns"]:
            task_desc_toks = word_tokenize(directive["task_desc"])
            num_task_tokens_list.append(len(task_desc_toks))
            for bigram in ngrams(task_desc_toks, 2):
                bigram_freqs["task"][bigram] += 1
            for trigram in ngrams(task_desc_toks, 3):
                trigram_freqs["task"][trigram] += 1
            num_steps_list.append(len(directive["high_descs"]))
            total_steps_toks = 0
            for desc in directive["high_descs"]:
                desc_toks = word_tokenize(desc)
                total_steps_toks += len(desc_toks)
                for bigram in ngrams(desc_toks, 2):
                    bigram_freqs["steps"][bigram] += 1
                for trigram in ngrams(desc_toks, 3):
                    trigram_freqs["steps"][trigram] += 1
            num_step_tokens_list.append(total_steps_toks)

            num_step_bbpe_tokens_list.append(sum([get_num_bbpe_tokens(desc) for desc in directive["high_descs"]]))
            num_task_bbpe_tokens_list.append(get_num_bbpe_tokens(directive["task_desc"]))

        stats_dict["steps"].append(np.mean(num_steps_list))
        stats_dict["total_steps_toks"].append(np.mean(num_step_tokens_list))
        stats_dict["total_steps_bbpe_toks"].append(np.mean(num_step_bbpe_tokens_list))
        stats_dict["task_toks"].append(np.mean(num_task_tokens_list))
        stats_dict["task_bbpe_toks"].append(np.mean(num_task_bbpe_tokens_list))
        stats_dict["images"].append(len(traj_data["images"]))
        stats_dict["actions"].append(len(traj_data["plan"]["low_actions"]))
        stats_dict["high_actions"].append(len(traj_data["plan"]["high_pddl"]))
        stats_dict['total_objects'].append(len(traj_data['scene']['object_poses']))

        nav_count = 0
        interact_count = 0
        mask_cov_per_ac = 0
        interact_step_cov = 0

        for action in traj_data["plan"]["low_actions"]:
            action_freqs[split][action["api_action"]["action"]] += 1
            args = action['discrete_action']['args']
            # Actions carrying a "mask" argument are counted as interactions, all others as navigation.
            if "mask" in args:
                obj_name = action["api_action"]["objectId"].split("|")[0]
                # Split the CamelCase object name into lower-cased words.
                obj_toks_lower = [tok.lower() for tok in re.sub('([a-z])([A-Z])', r'\1 \2', obj_name).split()]
                # Share of annotations whose nearby step descriptions mention the object.
                interact_step_cov += np.mean([check_obj_in_steps(obj_toks_lower, directive["high_descs"], action["high_idx"]) for directive in traj_data["turk_annotations"]["anns"]])
                interact_count += 1
                # Sum the second element of every mask pair.
                # NOTE(review): presumably run-length (start, length) pairs, i.e. a pixel count — confirm.
                mask_cov_per_ac += sum([l for _, l in args['mask']])
            else:
                nav_count += 1

        # A trial without any interaction action would divide by zero; record NaN instead.
        if interact_count:
            # NOTE(review): 300 * 300 is presumably the frame size in pixels — confirm.
            mask_coverage = mask_cov_per_ac / (300 * 300 * interact_count)
            step_coverage = interact_step_cov / interact_count
        else:
            mask_coverage = float("nan")
            step_coverage = float("nan")

        stats_dict["mask_coverage_per_action"].append(mask_coverage)
        stats_dict["nav_actions"].append(nav_count)
        stats_dict["interact_actions"].append(interact_count)
        stats_dict["interact_step_cov"].append(step_coverage)

stats_df = pd.DataFrame(stats_dict)
action_df = pd.DataFrame(action_freqs)
task_bigram_df = freqs_to_df(bigram_freqs["task"])
steps_bigram_df = freqs_to_df(bigram_freqs["steps"])
task_trigram_df = freqs_to_df(trigram_freqs["task"])
steps_trigram_df = freqs_to_df(trigram_freqs["steps"])

In [None]:
# Derive some additional columns
# Ratio columns are computed from the per-trial columns already in stats_df, appended in
# the order listed, and the whole table is then rounded to two decimals.
derived_columns = {
    "toks/step": stats_df["total_steps_toks"] / stats_df["steps"],
    "bbpe_toks/step": stats_df["total_steps_bbpe_toks"] / stats_df["steps"],
    "actions/step": stats_df["actions"] / stats_df["steps"],
    "images/action": stats_df["images"] / stats_df["actions"],
    "nav/interact": stats_df["nav_actions"] / stats_df["interact_actions"],
}

stats_df = stats_df.assign(**derived_columns).round(2)

In [None]:
# Allow up to 100 columns in DataFrame output so the wide stats table is not cut off
# column-wise, then display the table.
pd.set_option('display.max_columns', 100)
stats_df

In [None]:
# Counts of each low-level API action: one row per action name, one column per split.
action_df

In [None]:
# Plot frequency of a categorical field. Useful for task_type and maybe objects.
def plot_freq(col, **kwargs):
    """Bar-plot the relative frequency of each value of ``stats_df[col]``, one bar series per split.

    Extra keyword arguments are forwarded to ``DataFrame.plot.bar``.
    """
    per_split_freqs = {}
    for split_name in ("train", "valid_seen", "valid_unseen"):
        split_rows = stats_df[stats_df["split"] == split_name]
        per_split_freqs[split_name] = split_rows[col].value_counts(normalize=True)

    combined = pd.concat(per_split_freqs, axis=1)
    combined.plot.bar(xlabel=col, ylabel="Relative Frequency", **kwargs)

# Plot histogram of a quantitative field.
def plot_hist(col, **kwargs):
    """Draw one histogram of ``stats_df[col]`` per split on a shared x axis.

    Extra keyword arguments are forwarded to ``DataFrame.hist``.
    """
    axes = stats_df.hist(col, by="split", sharex=True, **kwargs)
    # Depending on how many groups there are, hist may hand back one Axes or an
    # array of Axes; normalise to a flat array so both cases are handled.
    for ax in np.atleast_1d(axes).ravel():
        # sharex hides the tick labels on all but the bottom row; switch them back on.
        ax.tick_params(axis="x", which="both", labelbottom=True)
        ax.set_xlabel(col, visible=True)
        ax.set_ylabel("Frequency")

In [None]:
# Scale every split column by its own total so the bars show relative action frequencies.
action_rel_freq = action_df.div(action_df.sum(axis=0))
action_plot = action_rel_freq.plot.bar(
    figsize=(10, 5),
    xlabel="action",
    ylabel="Relative Frequency",
)

In [None]:
# Relative frequency of each task type, per split.
plot_freq("task_type", figsize=(8, 4))

In [None]:
# Relative frequency of each focus object (second field of the task directory name), per split.
plot_freq("focus_object", figsize=(20, 5))

In [None]:
# Relative frequency of each destination object (fourth field of the task directory name), per split.
plot_freq("dest_object", figsize=(10, 5))

In [None]:
# Per-split histogram of the number of step descriptions per trial (averaged over annotations).
plot_hist("steps", figsize=(10, 5))

In [None]:
# Per-split histogram of the number of entries in each trial's scene object_poses list.
plot_hist("total_objects", figsize=(10, 5))

In [None]:
# Per-split histogram of the summed interaction-mask size, normalised by 300 * 300 and by the
# number of interaction actions in the trial.
plot_hist("mask_coverage_per_action", figsize=(10, 5))

In [None]:
# Per-split histogram of how often an interaction's object name appears in the step
# descriptions around that interaction (averaged over annotations and interaction actions).
plot_hist("interact_step_cov", figsize=(10, 5))

In [None]:
# KDE per split of quantitative fields
# Inspired by https://stackoverflow.com/questions/46045750/python-distplot-with-multiple-distributions

cs = [c for c in stats_df.columns if stats_df[c].dtype != 'O']
splits = stats_df['split'].unique()
# Long-format copy of the numeric columns. NOTE(review): not consumed by the plotting loop below.
df = stats_df[['split'] + cs].melt(['split'], var_name='cols', value_name='vals')
num_cols = 4
# At least one row so plt.subplots gets a valid grid even with very few numeric columns.
num_rows = max(1, math.ceil(len(cs) / num_cols))
# squeeze=False keeps `axs` two-dimensional even when the grid has a single row.
fig, axs = plt.subplots(num_rows, num_cols, figsize=(25, 20), squeeze=False)

for col, ax in enumerate(axs.ravel()):
    if col >= len(cs):
        # Grid cells beyond the last numeric column stay empty; hide them.
        ax.set_visible(False)
        continue
    for s in splits:
        # kdeplot draws the density curve only, replacing the deprecated
        # distplot(hist=False, rug=False); the label feeds the legend.
        sns.kdeplot(stats_df.loc[stats_df['split'] == s, cs[col]], label=s, ax=ax)
    ax.set_xlabel(cs[col])
    ax.legend()


In [None]:
# Mean of quantitative fields
# numeric_only=True skips the string columns (task_id, task_type, ...); without it, recent
# pandas versions raise when asked to average object-dtype columns.
stats_df.groupby('split').mean(numeric_only=True).round(2)

# Textual Analysis

In [None]:
# First 20 rows of the task-description trigram table.
task_trigram_df.head(20)

In [None]:
# First 20 rows of the step-description trigram table.
steps_trigram_df.head(20)

In [None]:
def show_word_cloud(split):
    """Render a word cloud of noun frequencies aggregated over all trials of ``split``.

    Reads the ``common_nouns_freq`` column of the module-level ``stats_df``; every
    cell of that column is iterated as a ``{noun: count}`` mapping.

    Raises:
        KeyError: if ``stats_df`` has no ``common_nouns_freq`` column.
    """
    # Fail with an actionable message instead of a bare pandas KeyError when the
    # per-trial noun frequencies were never added to stats_df.
    if 'common_nouns_freq' not in stats_df.columns:
        raise KeyError(
            "stats_df has no 'common_nouns_freq' column; add a per-trial {noun: count} "
            "dict column to stats_df before calling show_word_cloud"
        )

    d = defaultdict(int)
    for nouns_dict in stats_df.loc[stats_df['split'] == split, 'common_nouns_freq']:
        for noun, count in nouns_dict.items():
            d[noun] += count

    wc = WordCloud(background_color="white", max_words=1000)
    wc.generate_from_frequencies(d)

    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.show()

# Render one word cloud per dataset split.
for cloud_split in ('train', 'valid_seen', 'valid_unseen'):
    show_word_cloud(cloud_split)

In [None]:
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])
common_nouns = defaultdict(set)     # split -> texts of tokens tagged NOUN
common_nouns_bi = defaultdict(set)  # split -> "noun|||noun" for two adjacent NOUN tokens
unk_words = defaultdict(set)        # split -> texts of tokens without a word vector
total_words = defaultdict(int)      # split -> number of spaCy tokens processed

for split, task_dir in task_dirs(splits, data_path):
    for trial_dir in os.listdir("{}/{}/{}".format(data_path, split, task_dir)):
        # Use a context manager so the file handle is released as soon as the JSON is parsed.
        with open("{}/{}/{}/{}/traj_data.json".format(data_path, split, task_dir, trial_dir)) as traj_data_file:
            traj_data = json.load(traj_data_file)

        for directive in traj_data["turk_annotations"]["anns"]:
            # One text per annotation: all step descriptions followed by the task description.
            words = '. '.join(directive['high_descs'] + [directive["task_desc"] + '. '])
            tokens = nlp(words.lower())
            total_words[split] += len(tokens)
            for i, t in enumerate(tokens):
                if t.pos_ == 'NOUN':
                    common_nouns[split].add(t.text)
                    if (i < len(tokens) - 1) and tokens[i + 1].pos_ == 'NOUN':
                        common_nouns_bi[split].add(t.text + '|||' + tokens[i + 1].text)
                if not t.has_vector:
                    unk_words[split].add(t.text)

In [None]:
# print all unknown words in train and validation sets. Note most words are just misspelled, which can probably be corrected
# Whitespace-only entries are collected per split and dropped before printing.
remove_words = defaultdict(lambda: [])
for split_key in unk_words:
    remove_words[split_key] = [word for word in unk_words[split_key] if word.strip() == '']
    unk_words[split_key] = unk_words[split_key].difference(remove_words[split_key])

pprint(dict(unk_words))

In [None]:
# Percentage of unknown words
# Number of distinct unknown words relative to the total token count of the split.
for s in unk_words:
    unk_share = len(unk_words[s]) / total_words[s] if total_words[s] else float('nan')
    # The value is a fraction; format it as a percentage so it matches the label.
    print(f'Percentage of unknown words in {s}: {unk_share:.4%}')

In [None]:
synonyms = {}

# Parse every candidate noun / noun pair once up front. The parsed docs do not depend on
# the object being compared, so re-running nlp() for every object is wasted work.
noun_docs = {}
for split in common_nouns:
    for n in common_nouns[split]:
        if n not in noun_docs:
            noun_docs[n] = nlp(n)

bigram_docs = {}
for split in common_nouns:
    for n in common_nouns_bi[split]:
        phrase = ' '.join(n.split('|||'))
        if phrase not in bigram_docs:
            bigram_docs[phrase] = nlp(phrase)

for ob in tqdm(OBJECTS):
    # Split the CamelCase object name into lower-cased words before parsing.
    ob_tokens = nlp(re.sub('([a-z])([A-Z])', r'\1 \2', ob).lower())

    sims = {n: ob_tokens.similarity(doc) for n, doc in noun_docs.items()}
    # Two-word object names are additionally compared against the noun pairs.
    if len(ob_tokens) == 2:
        sims.update({phrase: ob_tokens.similarity(doc) for phrase, doc in bigram_docs.items()})

    # Candidates ordered by decreasing similarity; keep the best three.
    sims = sorted(sims, key=lambda k: -sims[k])
    synonyms[ob_tokens.text] = sims[:3]

In [None]:
# Top 3 synonyms for every object: the three candidate nouns (or noun pairs) with the highest
# spaCy similarity to the object name, best match first
pprint(synonyms)

In [None]:
# Percentage of objects referred to in instructions compared to objects in the scene

splits = ["train", "valid_seen", "valid_unseen"]
task_fields = ["task_type", "focus_object", "base_object", "dest_object", "scene"]
data_path = "../tars/alfred/data/json_2.1.0"


def _count_mentioned_objects(traj_data):
    """Return ``(mentioned, total)`` object-type counts for one parsed traj_data.json.

    ``total`` is the number of distinct object names in ``scene.object_poses`` (the part
    of ``objectName`` before the first underscore). ``mentioned`` is how many of them
    occur, case-insensitively, as a substring of any step-by-step instruction; each
    name is counted at most once.
    """
    # split("_")[0] keeps the whole name when there is no underscore, whereas slicing
    # with find("_") == -1 would silently drop the last character.
    object_names = {pose["objectName"].split("_")[0] for pose in traj_data['scene']['object_poses']}

    mentioned = set()
    for annotation in traj_data["turk_annotations"]['anns']:
        for instruction in annotation["high_descs"]:
            # Lower-case both sides; the object names are compared in lower case.
            instruction_lower = instruction.lower()
            for name in object_names:
                if name.lower() in instruction_lower:
                    mentioned.add(name.lower())
    return len(mentioned), len(object_names)


# One (mentioned, total, mentioned / total) tuple per trial.
pseudo_attention_metric = []
# Path of the traj_data.json behind each tuple. Paths are stored rather than file objects
# so that no file handle stays open after a trial has been read.
corresponding_files = []

for split in splits:
    split_path = "{}/{}".format(data_path, split)
    try:
        # Named differently from the task_dirs() generator so that helper stays callable.
        split_task_dirs = os.listdir(split_path)
    except OSError as err:
        # Best effort: a missing or unreadable split is reported and skipped.
        print("Skipping split {}: {}".format(split, err))
        continue

    for task_dir in split_task_dirs:
        task_path = "{}/{}".format(split_path, task_dir)
        try:
            trial_dirs = os.listdir(task_path)
        except OSError as err:
            print("Skipping {}: {}".format(task_path, err))
            continue

        for trial_dir in trial_dirs:
            traj_path = "{}/{}/traj_data.json".format(task_path, trial_dir)
            try:
                with open(traj_path) as traj_data_file:
                    traj_data = json.load(traj_data_file)
                count, num_objects = _count_mentioned_objects(traj_data)
            except (OSError, ValueError, KeyError) as err:
                # Unreadable, malformed or incomplete trial: report it and move on to
                # the next one instead of silently abandoning the rest of the split.
                print("Skipping {}: {!r}".format(traj_path, err))
                continue

            if num_objects == 0:
                # The ratio is undefined for a scene without objects.
                print("Skipping {}: no objects in scene".format(traj_path))
                continue

            pseudo_attention_metric.append((count, num_objects, count / num_objects))
            corresponding_files.append(traj_path)
            # Sanity check: a ratio above 1 would mean objects were double-counted.
            if pseudo_attention_metric[-1][-1] > 1:
                print(traj_path)

# reshape(-1, 3) keeps the array two-dimensional even when no trial was found.
pseudo_attention_metric = np.asarray(pseudo_attention_metric, dtype=float).reshape(-1, 3)
# Convert the stored ratios to percent so the values match the axis label and title.
percentages = 100 * pseudo_attention_metric[:, 2]
plt.hist(percentages, bins=10)
plt.ylabel("Frequency")
plt.xlabel("%")
plt.title("Percentage of objects referred to in instructions \n compared to objects in the scene")
plt.show()

# Visual Analysis

In [None]:
from easyimages import EasyImageList

# Sub-directory names, inside each trajectory folder, of the three image kinds loaded below.
seg_dir = 'instance_masks'
rgb_dir = 'high_res_images'
depth_dir = 'depth_images'

# Trajectory folders whose images are loaded (hard-coded paths under the train split).
random_trajs = [
    '../tars/alfred/data/json_2.1.0/train/pick_two_obj_and_place-SoapBottle-None-Cabinet-406/trial_T20190909_145544_288730/',
    '../tars/alfred/data/json_2.1.0/train/pick_and_place_with_movable_recep-Spatula-Pan-CounterTop-13/trial_T20190908_194609_016883/'
]

# ims_dict maps each image directory name to one EasyImageList that accumulates the images
# of that kind from every trajectory in random_trajs.
ims_dict = {}
for d in [rgb_dir, depth_dir, seg_dir]:
    ims = EasyImageList([])
    for t in random_trajs:
        # NOTE(review): relies on `ims.ims` supporting `+=` with the object returned by
        # EasyImageList.from_folder — confirm against the easyimages API.
        ims.ims += EasyImageList.from_folder(os.path.join(t, d))
    ims_dict[d] = ims

In [None]:
# sample random RGB images
# NOTE(review): sample=33 is presumably the number of images rendered — confirm against easyimages.
ims_dict['high_res_images'].html(sample=33)

In [None]:
# sample random depth images
# NOTE(review): sample=33 is presumably the number of images rendered — confirm against easyimages.
ims_dict['depth_images'].html(sample=33)

In [None]:
# sample random segmentation images
# NOTE(review): sample=33 is presumably the number of images rendered — confirm against easyimages.
ims_dict['instance_masks'].html(sample=33)