In [1]:
import os
from itertools import accumulate
from pathlib import Path

import numpy as np
import pandas as pd

import plotly.graph_objects as go

import dash
from dash import dcc
from dash import html

from tqdm.notebook import tqdm

# OKVQA - data investigations

This notebook examines the OKVQA dataset. Both images and questions will be analyzed. 

## 1) Load data
We start by loading the OKVQA datasets downloaded through [MMF](https://github.com/facebookresearch/mmf).


In [2]:
# data load function
def loadData(filename, data_path):
    """Load one ``.npy`` annotation file and return its records as a DataFrame.

    The first element of the stored array is treated as a dict of dataset
    specs and is printed; every remaining element becomes one row of the
    returned frame.

    Parameters
    ----------
    filename : str
        Name of the ``.npy`` file inside ``data_path``.
    data_path : str or os.PathLike
        Directory that contains ``filename``.

    Returns
    -------
    pandas.DataFrame
        One row per record stored after the leading specs entry.
    """
    # NOTE(security): allow_pickle=True unpickles arbitrary Python objects,
    # so only point this at annotation files from a trusted source.
    data = np.load(Path(data_path) / filename, allow_pickle=True)
    dataset_specs, records = data[0], data[1:]

    print("Dataset specs:")
    for key, value in dataset_specs.items():
        print(f"{key}: {value}")

    return pd.DataFrame.from_records(records)

In [3]:
# directory holding the OKVQA annotation files, and the files to read
data_path = Path.home() / '.cache/torch/mmf/data/datasets/okvqa/defaults/annotations/annotations'
filenames = ['imdb_train.npy', 'imdb_test.npy', 'imdb_val.npy', 'imdb_trainval.npy']

# one DataFrame per split, keyed by the part of the file name after "imdb_"
okvqa = {}
for filename in filenames:
    print(f"{'-'*50}\nLoading {filename}...\n")
    stem = filename.partition('.')[0]
    name = stem.split('_')[1]
    okvqa[name] = loadData(filename, data_path)

--------------------------------------------------
Loading imdb_train.npy...

Dataset specs:
create_time: 2020-09-03 13:40:00.451863
dataset_name: okvqa
version: 1.1
has_answer: True
has_gt_layout: False
created_at: 1599165600.4518836
--------------------------------------------------
Loading imdb_test.npy...

Dataset specs:
create_time: 2020-09-03 13:39:59.718679
dataset_name: okvqa
version: 1.1
has_answer: True
has_gt_layout: False
created_at: 1599165599.718699
--------------------------------------------------
Loading imdb_val.npy...

Dataset specs:
create_time: 2020-09-03 13:40:00.642192
dataset_name: okvqa
version: 1.1
has_answer: True
has_gt_layout: False
created_at: 1599165600.6422124
--------------------------------------------------
Loading imdb_trainval.npy...

Dataset specs:
create_time: 2020-09-03 13:40:00.072816
dataset_name: okvqa
version: 1.1
has_answer: True
has_gt_layout: False
created_at: 1599165600.0728385


## 2) Initial investigation

As given by the creators of the [OKVQA dataset](https://okvqa.allenai.org/), there are 14,055 open-ended questions in the dataset. Let's figure out how this relates to the data that we just loaded...

In [4]:
# total number of rows over all four loaded files
sum(okvqa[split].shape[0] for split in ('train', 'test', 'val', 'trainval'))

23064

This is much larger than what is stated by the creators of the OKVQA dataset. But wait a second... One of the datafiles provided by MMF is called `imdb_trainval.npy` - what happens if we ignore this?

In [5]:
# same count, leaving out the combined trainval file
sum(okvqa[split].shape[0] for split in ('train', 'test', 'val'))

14055

Alright! So the trainval is probably just a combination of the training and validation data - let's quickly verify this...

In [6]:
# trainval should have exactly as many rows as train and val together
len(okvqa['train']) + len(okvqa['val']) == len(okvqa['trainval'])

True

Indeed that is true! Let's ignore the separate `imdb_train.npy` and `imdb_val.npy` files, use `imdb_trainval.npy` (the mix of the two) as the training data, and combine it with the test data for inspection.

In [7]:
# Stack the train+val and test frames and renumber the rows 0..n-1.
# reset_index(drop=True) discards the old index directly, so it can never
# remove a real data column that happens to be called "index".
okvqa_full = pd.concat([okvqa['trainval'], okvqa['test']]).reset_index(drop=True)

print(f"Shape of dataframe: {okvqa_full.shape}")
okvqa_full.head()

Shape of dataframe: (14055, 9)


Unnamed: 0,image_id,question_id,question_str,question_tokens,all_answers,answers,image_name,feature_path,ocr_tokens
0,51606,516065,What is the hairstyle of the blond called?,"[what, is, the, hairstyle, of, the, blond, cal...","[pony tail, pony tail, pony tail, pony tail, p...","[pony tail, pony tail, pony tail, pony tail, p...",COCO_train2014_000000051606,COCO_train2014_000000051606.npy,[]
1,81721,817215,How old do you have to be in canada to do this?,"[how, old, do, you, have, to, be, in, canada, ...","[18, 18, 18, 18, 18, 18, 18, 18, 18, 18]","[18, 18, 18, 18, 18, 18, 18, 18, 18, 18]",COCO_train2014_000000081721,COCO_train2014_000000081721.npy,[]
2,480208,4802085,Can you guess the place where the man is playing?,"[can, you, guess, the, place, where, the, man,...","[aspen, aspen, mountain, mountain, mountain, m...","[aspen, aspen, mountain, mountain, mountain, m...",COCO_train2014_000000480208,COCO_train2014_000000480208.npy,[]
3,570618,5706185,Which rail company is named after a town in ne...,"[which, rail, company, is, named, after, a, to...","[santa fe, santa fe, santa fe, santa fe, new e...","[santa fe, santa fe, santa fe, santa fe, new e...",COCO_train2014_000000570618,COCO_train2014_000000570618.npy,[]
4,478903,4789035,Is the boy swimming or doing another water act...,"[is, the, boy, swimming, or, doing, another, w...","[another activity, another activity, another a...","[another activity, another activity, another a...",COCO_train2014_000000478903,COCO_train2014_000000478903.npy,[]


## 3) Questions



In [654]:
def createSunburstVariables(dataset, N, shuffle=True, seed=42):
    """Build the node table for a sunburst chart of question prefixes.

    Every distinct question prefix (first token, first two tokens, ...)
    becomes one node whose value is the number of processed questions that
    start with that prefix. All first tokens hang off a single root node
    with id ``'CLS'``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must have a ``question_tokens`` column holding a sequence of string
        tokens per row.
    N : int
        Maximum number of questions to use (the first ``N`` rows after the
        optional shuffle).
    shuffle : bool, default True
        Shuffle the rows (using ``seed``) before taking the first ``N``.
        With ``False`` the rows are used in their existing order.
    seed : int, default 42
        ``random_state`` passed to ``DataFrame.sample`` when shuffling.

    Returns
    -------
    pandas.DataFrame
        Columns ``ids``, ``labels``, ``parents`` and ``occurrences``. Row 0 is
        the root; its ``occurrences`` is the number of questions actually
        processed, which equals ``N`` whenever ``dataset`` has at least ``N``
        rows.
    """
    def join_tokens(prefix, token, join_char=" "):
        # accumulation operator: extend a prefix string by one token
        return prefix + join_char + token

    ordered = dataset.sample(frac=1, random_state=seed) if shuffle else dataset
    selected = ordered[:N]

    # root node first; the remaining nodes are appended in order of first appearance
    parents, labels, ids = [''], [''], ['CLS']
    prefix_counts = {}

    for token_list in tqdm(selected.question_tokens):
        tokens = list(token_list)
        # prefix_ids[k] is the node id of the first k tokens (k == 0 is the root)
        prefix_ids = list(accumulate(['CLS'] + tokens, join_tokens))

        for parent_id, node_id, token in zip(prefix_ids, prefix_ids[1:], tokens):
            # Register each node exactly once; duplicate ids would break the chart.
            # The dict doubles as the "already seen" lookup, avoiding a set rebuild per token.
            if node_id not in prefix_counts:
                prefix_counts[node_id] = 0
                parents.append(parent_id)
                ids.append(node_id)
                labels.append(token)
            prefix_counts[node_id] += 1

    # The root value is the number of questions really used, so it stays
    # consistent with the child counts even if the dataset has fewer than N rows.
    occurrences = [len(selected)] + [prefix_counts[node_id] for node_id in ids[1:]]

    df = pd.DataFrame({'ids': ids, 'labels': labels, 'parents': parents, 'occurrences': occurrences})
    df['occurrences'] = df['occurrences'].astype(int)

    return df

def plotSunburst(df, N, visulization_depth, width=500, height=500, use_dash=True):
    """Draw a sunburst chart from a node table.

    Parameters
    ----------
    df : pandas.DataFrame
        Node table with ``ids``, ``labels``, ``parents`` and ``occurrences``
        columns.
    N : int
        Denominator for the hover percentage (``occurrences / N * 100``).
    visulization_depth : int
        Passed to the trace as ``maxdepth``. The parameter name keeps its
        original spelling so existing keyword callers continue to work.
    width, height : int, default 500
        Figure size passed to the layout.
    use_dash : bool, default True
        If true, serve the figure from a Dash app; otherwise call
        ``fig.show()``.
    """
    sunburst = go.Sunburst(
        ids=df.ids,
        labels=df.labels,
        parents=df.parents,
        values=df.occurrences,
        text=df.occurrences / N * 100,
        branchvalues="total",
        hovertemplate='<b>%{label} </b> <br> Percentage: %{text:.2f}%',
        domain=dict(column=1),
        # use the argument, not a notebook-level global of a similar name
        maxdepth=visulization_depth,
        insidetextorientation='radial',
    )
    fig = go.Figure(sunburst)

    fig.update_layout(autosize=False,
                      width=width, height=height,
                      margin=dict(t=10, l=10, r=10, b=10),
                      uniformtext=dict(minsize=10, mode='hide'))

    if use_dash:
        app = dash.Dash()
        app.layout = html.Div([dcc.Graph(figure=fig)])
        # Prefer app.run when the installed Dash has it; fall back to
        # run_server on releases that predate app.run.
        run_app = getattr(app, "run", None) or app.run_server
        run_app(debug=True, use_reloader=False)  # Turn off reloader if inside Jupyter
    else:
        fig.show()

In [656]:
# question-prefix sunburst for a sample of the train+val split
N, visualization_depth = 51, 10

temp = createSunburstVariables(dataset=okvqa['trainval'], N=N)
plotSunburst(temp, N, visualization_depth, use_dash=False)

HBox(children=(FloatProgress(value=0.0, max=51.0), HTML(value='')))




In [648]:
# question-prefix sunburst for a sample of the test split
N, visualization_depth = 51, 10

temp = createSunburstVariables(dataset=okvqa['test'], N=N)
plotSunburst(temp, N, visualization_depth, use_dash=False)

HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))




In [647]:
# question-prefix sunburst for a sample of the combined dataset
N, visualization_depth = 31, 10

temp = createSunburstVariables(dataset=okvqa_full, N=N)
plotSunburst(temp, N, visualization_depth, use_dash=False)

HBox(children=(FloatProgress(value=0.0, max=31.0), HTML(value='')))




Deploy dash app in webpage: https://dash.plotly.com/deployment