# This notebook finds and extracts all of the Wildlife videos from the overall YT8M dataset and then stores their video and frame data separately for further processing

### Imports

In [1]:
import os
from tqdm import tqdm
import numpy as np
import tensorflow as tf
import pandas as pd
import requests
import re
import json
import pickle
from path import Path

2023-03-31 15:03:57.498334: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-03-31 15:03:58.672147: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-03-31 15:03:58.673766: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
print(tf.__version__)
print(pd.__version__)
! python --version

2.12.0
1.3.4
Python 3.9.7


### Set the base path here

In [None]:
# this should be the path where you are storing the entire YT8M dataset
base_path = Path('/nfs/turbo/seas-nhcarter/human_wildlife_interactions')

In [3]:
# check and make sure we can access the correct directory
video_path = Path(base_path / 'video')
frame_path = Path(base_path / 'frame')

In [4]:
# find the specific wildlife video ids we know we want
# get list of ids from yt8m api
def get_entity_videoIds(entity_name):
    ''' gets a list of video ids in the YT8M training dataset tagged with a given entity(name)'''

    entity_id = entity2id[entity_name]

    url = f'https://storage.googleapis.com/data.yt8m.org/2/j/v/{entity_id}.js'
    response = requests.get(url)
    response.raise_for_status() 

    data = response.text
    pattern = r'\w+'
    ids = re.findall(pattern, data)[2:] # video ids start at index 2 onward
    print(f'{entity_name}({entity_id}): {len(ids)} videos found')

    return ids

new_url = 'https://research.google.com/youtube8m/csv/2/vocabulary.csv'
new_vocab = pd.read_csv(new_url)
animal_df = new_vocab[(new_vocab.Vertical1 == 'Pets & Animals') | (new_vocab.Vertical2 == 'Pets & Animals')] # Pets & Animal only present in V1&2
summary_df = animal_df.groupby(['Name','KnowledgeGraphId']).agg({'TrainVideoCount':'sum'}).reset_index()
entity2id = dict(zip(summary_df.Name, summary_df.KnowledgeGraphId.str[3:]))
videoIds = get_entity_videoIds('Wildlife')

Wildlife(01280g): 4243 videos found


In [5]:
video_list = os.listdir(video_path / 'train')
frame_list = os.listdir(frame_path / 'train')

In [6]:
data_dict = {}

# iterate through all TFRecords
for record in tqdm(video_list):
    path = video_path / 'train' / record
    # iterate through all videos in the record and add to dictionary if it is in the wildlife category
    try:
        for video in tf.compat.v1.python_io.tf_record_iterator(path):
            seq_video = tf.train.Example.FromString(video)
            video_id = seq_video.features.feature['id'].bytes_list.value[0].decode(encoding='UTF-8')
            if video_id in videoIds:
                if video_id not in data_dict.keys():
                    data_dict[video_id] = {'video': video, 'video_example': seq_video}
    except:
        print("Error in {}".format(path))
        continue

  0%|          | 0/3845 [00:00<?, ?it/s]

Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


 58%|█████▊    | 2247/3845 [03:03<01:45, 15.20it/s]

Error in /nfs/turbo/seas-nhcarter/human_wildlife_interactions/video/train/2_video_train_download_plan.json


100%|██████████| 3845/3845 [05:06<00:00, 12.55it/s]


In [7]:
write_path = base_path / 'classifier_video_data/video_data.pkl'
pickle.dump(data_dict, open(base_path / write_path, 'wb'))

In [8]:
frame_dict = {}
# iterate through all TFRecords
for record in tqdm(frame_list):
    path = frame_path / 'train' / record
    # iterate through all videos in the record and add to dictionary if it is in the wildlife category
    try:
        for frame in tf.compat.v1.python_io.tf_record_iterator(path):
            seq_frame = tf.train.Example.FromString(frame)
            seq_example = tf.train.SequenceExample.FromString(frame)
            frame_id = seq_frame.features.feature['id'].bytes_list.value[0].decode(encoding='UTF-8')
            if frame_id in videoIds:
                if frame_id not in frame_dict.keys():
                    frame_dict[frame_id] = {'frame':frame,'frame_example':seq_frame,'sequence_example':seq_example}
    except:
        continue

100%|██████████| 3845/3845 [2:31:59<00:00,  2.37s/it]  


In [9]:
write_path = base_path /'classifier_video_data/frame_data.pkl'
pickle.dump(frame_dict, open(write_path, 'wb'))