# Co-occurence and causality matrix calculation for the Epic Kitchens dataset

Causality matrix will be computed by:
- Considering pairs of verb-noun in epic kitchens
- For each pair of verb-noun, adding occurences of each other pair that comes after it
    - version 1: only count what comes right after
    - version 2: count every action that comes after the given pair, but reduce contribution exponentially according to temporal distance

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
from tqdm import tqdm


In [3]:
# Load Epic kitchens train annotations
train_annotations = pd.read_csv('./epic-kitchens-100-annotations/EPIC_100_train.csv')

# Show statistics
print('Train annotations:')
print("Size", train_annotations.shape)

train_annotations.head(20)


Train annotations:
Size (67217, 15)


Unnamed: 0,narration_id,participant_id,video_id,narration_timestamp,start_timestamp,stop_timestamp,start_frame,stop_frame,narration,verb,verb_class,noun,noun_class,all_nouns,all_noun_classes
0,P01_01_0,P01,P01_01,00:00:01.089,00:00:00.14,00:00:03.37,8,202,open door,open,3,door,3,['door'],[3]
1,P01_01_1,P01,P01_01,00:00:02.629,00:00:04.37,00:00:06.17,262,370,turn on light,turn-on,6,light,114,['light'],[114]
2,P01_01_10,P01,P01_01,00:00:23.340,00:00:24.97,00:00:26.20,1498,1572,open drawer,open,3,drawer,8,['drawer'],[8]
3,P01_01_100,P01,P01_01,00:07:57.919,00:07:59.75,00:08:00.88,28785,28852,take cup,take,0,cup,13,['cup'],[13]
4,P01_01_101,P01,P01_01,00:08:00.020,00:08:01.47,00:08:02.21,28888,28932,open cupboard,open,3,cupboard,3,['cupboard'],[3]
5,P01_01_102,P01,P01_01,00:08:01.229,00:08:02.13,00:08:03.00,28927,28980,put cup into cupboard,put-into,5,cup,13,"['cup', 'cupboard']","[13, 3]"
6,P01_01_103,P01,P01_01,00:08:03.919,00:08:05.22,00:08:07.21,29113,29232,take container and lid,take,0,container,21,"['container', 'lid']","[21, 6]"
7,P01_01_104,P01,P01_01,00:08:07.610,00:08:08.38,00:08:09.12,29302,29347,put container on top of counter,put-on,1,container,21,"['container', 'top:counter']","[21, 42]"
8,P01_01_105,P01,P01_01,00:08:09.860,00:08:12.00,00:08:12.74,29520,29564,open container,open,3,container,21,['container'],[21]
9,P01_01_106,P01,P01_01,00:08:12.900,00:08:13.22,00:08:14.37,29593,29662,put container inside container,put-inside,5,container,21,"['container', 'container']","[21, 21]"


In [4]:
# Get count of actions in train set
print(train_annotations['verb'].value_counts())
print()
print(train_annotations['noun'].value_counts())

pick-up        9868
put-down       7726
open           4851
close          3463
take           3412
               ... 
sit-on            1
put-aside         1
walk-around       1
take-in           1
scrub-down        1
Name: verb, Length: 856, dtype: int64

tap                   3567
plate                 2186
knife                 2092
spoon                 1806
cupboard              1786
                      ... 
box:salad                1
cap:milk                 1
oil:more                 1
packaging:parmesan       1
jar:coffee               1
Name: noun, Length: 2032, dtype: int64


In [35]:
def build_causality_matrix(train_annotations):
    '''Build a matrix where each row and column are combinations of verb and nouns, 
    and each cell corresponds to the number of time that column comes right after row.'''

    # Get all unique combinations of verb and noun
    unique_nv_combos = get_unique_nv_combos(train_annotations)

    # Make dict of dict representing causality matrix
    causality_matrix = {}
    for verb_noun in unique_nv_combos:
        causality_matrix[verb_noun] = {}
        for other_verb_noun in unique_nv_combos:
            causality_matrix[verb_noun][other_verb_noun] = 0

    print("Causality matrix initialized")

    # Fill matrix by iterating over train_annotations and counting the number of times each combination appears after another
    for index, row in tqdm(train_annotations.iterrows()):
        verb_noun = row['verb_nouns']
        verb_noun_after = train_annotations.iloc(index+1)['verb_nouns']
        causality_matrix[verb_noun][verb_noun_after] += 1

    return causality_matrix

def add_verb_noun_column(df):
    df["verb_nouns"] = df["verb"] + " " + df["all_nouns"]
    df["verb_nouns"] = df["verb_nouns"].str.replace("[","")
    df["verb_nouns"] = df["verb_nouns"].str.replace("]","")
    df["verb_nouns"] = df["verb_nouns"].str.replace("'","")
    df["verb_nouns"] = df["verb_nouns"].str.replace(",","")
    return df

def get_unique_nv_combos(df):
    df = add_verb_noun_column(df)
    unique_nv_combos = df["verb_nouns"].unique()

    print("len(unique_nv_combos)",len(unique_nv_combos))
    return unique_nv_combos


In [36]:
cm = build_causality_matrix(train_annotations)

  df["verb_nouns"] = df["verb_nouns"].str.replace("[","")
  df["verb_nouns"] = df["verb_nouns"].str.replace("]","")


len(unique_nv_combos) 13656


KeyboardInterrupt: 

In [9]:
# Save causality matrix as json
import json
with open('causality_matrix.json', 'w') as fp:
    json.dump(cm, fp)
    

(495,)