In [2]:
from datasets import load_dataset
from datetime import datetime

import os

# Full FineVideo dataset (about 600GB of data): load the "train" split,
# passing num_proc=64 so the preparation work is spread over 64 processes.
dataset = load_dataset("HuggingFaceFV/finevideo", split="train", num_proc=64)

Resolving data files:   0%|          | 0/1357 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1357 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/916 [00:00<?, ?it/s]

In [3]:
dataset

Dataset({
    features: ['mp4', 'json'],
    num_rows: 43751
})

# Process the Dataset

In [4]:
# Keep only the examples whose parent category is exactly "Sports".
# The predicate reads the per-example metadata dict stored under "json";
# num_proc=64 runs the filter in 64 worker processes.
filtered_dataset = dataset.filter(lambda x: x['json']["content_parent_category"] == "Sports", num_proc=64)

In [5]:
filtered_dataset[0]["json"].keys()

dict_keys(['content_fine_category', 'content_metadata', 'content_parent_category', 'duration_seconds', 'original_json_filename', 'original_video_filename', 'resolution', 'text_to_speech', 'text_to_speech_word_count', 'timecoded_text_to_speech', 'youtube_age_limit', 'youtube_categories', 'youtube_channel', 'youtube_channel_follower_count', 'youtube_comment_count', 'youtube_description', 'youtube_like_count', 'youtube_tags', 'youtube_title', 'youtube_upload_date', 'youtube_view_count'])

In [47]:
import numpy as np
from tqdm import tqdm
def _timestamp_to_seconds(timestamp):
    """Convert an ``HH:MM:SS.fff`` timestamp string to whole seconds.

    Fractional seconds are discarded, so ``"00:01:02.900"`` becomes ``62``.

    Raises:
        ValueError: if ``timestamp`` does not match ``%H:%M:%S.%f``.
    """
    parsed = datetime.strptime(timestamp, "%H:%M:%S.%f").time()
    return parsed.hour * 3600 + parsed.minute * 60 + parsed.second


def process_dataset_chunks(dataset, chunk_size=5, max_chunks=10):
    """Pair fixed-length time chunks of every video with one sampled activity.

    For each example, the activities of all scenes are flattened and their
    start/end timestamps converted to ``(start, end)`` tuples in whole
    seconds. The video is then cut into consecutive ``chunk_size``-second
    chunks (at most ``max_chunks``; a chunk that would end after
    ``duration_seconds`` stops the loop), and for every chunk one activity is
    drawn uniformly at random from the activities that started before the end
    of that chunk. If no activity has started yet, the draw falls back to all
    activities of the video. A video without any activity gets an empty
    ``"chunks"`` list.

    The input examples are not modified. Sampling uses NumPy's global RNG, so
    call ``np.random.seed`` beforehand for reproducible output.

    Args:
        dataset: sized iterable of examples. Each example is a mapping with
            ``"mp4"`` and ``"json"`` entries, where ``"json"`` provides
            ``"duration_seconds"`` and ``"content_metadata"]["scenes"]``; each
            scene lists ``"activities"`` carrying a ``"description"`` and a
            ``"timestamp"`` with ``"start_timestamp"``/``"end_timestamp"``.
        chunk_size: chunk length in seconds.
        max_chunks: upper bound on the number of chunks per video.

    Returns:
        A list with one dict per example, in input order:
        ``{"chunks": [{"interval": (start, end), "activity": str}, ...],
        "mp4": <the example's "mp4" value>}``.

    Raises:
        ValueError: if a timestamp does not match ``%H:%M:%S.%f``.
    """
    dataset_list = []
    for example in tqdm(dataset, total=len(dataset)):
        metadata = example["json"]

        # Flatten every scene's activities into (interval, description) pairs.
        # Working on fresh tuples keeps the caller's dicts untouched.
        activities = []
        for scene in metadata["content_metadata"]["scenes"]:
            # A scene whose activity list is None/empty contributes nothing.
            for activity in scene["activities"] or []:
                timestamp = activity["timestamp"]
                interval = (
                    _timestamp_to_seconds(timestamp["start_timestamp"]),
                    _timestamp_to_seconds(timestamp["end_timestamp"]),
                )
                activities.append((interval, activity["description"]))

        max_seconds = metadata["duration_seconds"]
        chunks = []
        # Without activities there is nothing to sample (randint(0, 0) would raise).
        if activities:
            for chunk_index in range(max_chunks):
                chunk_end = chunk_size * (chunk_index + 1)
                if chunk_end > max_seconds:
                    break
                # Activities that have started before this chunk ends.
                started = [item for item in activities if item[0][0] < chunk_end]
                # Fall back to every activity when none has started yet.
                pool = started or activities
                interval, description = pool[np.random.randint(0, len(pool))]
                chunks.append({"interval": interval, "activity": description})

        dataset_list.append({"chunks": chunks, "mp4": example["mp4"]})

    return dataset_list

# The activity paired with each chunk is drawn with np.random, so seed NumPy's
# global RNG first; otherwise every run produces a different dataset.
np.random.seed(0)
# 5-second chunks, at most 20 per video.
processed_dataset = process_dataset_chunks(filtered_dataset, chunk_size=5, max_chunks=20)


        

100%|██████████| 3359/3359 [00:25<00:00, 129.38it/s]


In [49]:
processed_dataset[0]["chunks"]

[{'interval': (0, 1), 'activity': 'Police recruits practice combat'},
 {'interval': (6, 8), 'activity': 'Police officers stand in formation'},
 {'interval': (2, 3),
  'activity': 'Police officers walk together on a sidewalk'},
 {'interval': (2, 3),
  'activity': 'Police officers walk together on a sidewalk'},
 {'interval': (7, 8), 'activity': 'Police recruit #2 runs'},
 {'interval': (24, 25), 'activity': 'Police car navigates an obstacle course'},
 {'interval': (19, 20),
  'activity': 'Police recruits practice combat techniques'},
 {'interval': (2, 3),
  'activity': 'Police officers walk together on a sidewalk'},
 {'interval': (24, 25), 'activity': 'Police car navigates an obstacle course'},
 {'interval': (7, 8), 'activity': 'Police recruit #2 runs'},
 {'interval': (39, 40),
  'activity': 'Shots of the Academy buildings and grounds'},
 {'interval': (27, 28), 'activity': 'Police recruits apply a tourniquet'},
 {'interval': (27, 28), 'activity': 'Police recruits apply a tourniquet'},
 {'

In [None]:
from datasets import Dataset
# Wrap the list of per-video dicts (keys "chunks" and "mp4") in a Hugging Face
# Dataset so it can be written to disk.
processed_dataset_hf = Dataset.from_list(processed_dataset)

PermissionError: [Errno 13] Permission denied: '/sports_dataset'

In [52]:
# Expand "~" to the home directory ourselves instead of leaving a literal "~"
# path component for the storage layer to interpret.
processed_dataset_hf.save_to_disk(os.path.expanduser("~/temp/sports_dataset"))

Saving the dataset (116/116 shards): 100%|██████████| 3359/3359 [00:10<00:00, 307.44 examples/s]


# Analyzing the Dataset

In [6]:
filtered_dataset

Dataset({
    features: ['mp4', 'json'],
    num_rows: 3359
})

In [7]:
# Take the first filtered example and display its fine-grained and parent
# category labels as a pair.
sample = filtered_dataset[0]
tuple(sample['json'][label] for label in ('content_fine_category', 'content_parent_category'))


('Career Highlights', 'Sports')

In [9]:
# Dump the sample's video payload to a file for manual inspection.
# The path is relative to the current working directory (two levels up).
# Assumes sample['mp4'] holds raw bytes, since the file is opened in binary mode.
with open('../../sample.mp4', 'wb') as video_file:
    video_file.write(sample['mp4'])

In [31]:
sample['json'].keys()

dict_keys(['content_fine_category', 'content_metadata', 'content_parent_category', 'duration_seconds', 'original_json_filename', 'original_video_filename', 'resolution', 'text_to_speech', 'text_to_speech_word_count', 'timecoded_text_to_speech', 'youtube_age_limit', 'youtube_categories', 'youtube_channel', 'youtube_channel_follower_count', 'youtube_comment_count', 'youtube_description', 'youtube_like_count', 'youtube_tags', 'youtube_title', 'youtube_upload_date', 'youtube_view_count'])

In [32]:
sample['json']['timecoded_text_to_speech']

[{'end': '00:00:03.560',
  'start': '00:00:00.000',
  'text': ' Good luck and may I assist you?'},
 {'end': '00:00:21.040', 'start': '00:00:18.560', 'text': ' Go! Go!'},
 {'end': '00:00:27.160', 'start': '00:00:26.000', 'text': ' Three.'},
 {'end': '00:00:28.800', 'start': '00:00:27.560', 'text': ' Two.'},
 {'end': '00:00:37.000',
  'start': '00:00:30.000',
  'text': ' Life as a QPS recruit is very fun.'},
 {'end': '00:00:40.000',
  'start': '00:00:37.000',
  'text': ' You can kind of live and breathe economy.'},
 {'end': '00:00:42.000',
  'start': '00:00:40.000',
  'text': ' You go to bed thinking about what you learnt that day.'},
 {'end': '00:00:45.000',
  'start': '00:00:42.000',
  'text': " You wake up thinking about what you're going to learn that in the next day."},
 {'end': '00:00:53.000',
  'start': '00:00:45.000',
  'text': " It's definitely very consuming but it's been amazing learning the ins and outs of policing."},
 {'end': '00:00:59.000',
  'start': '00:00:53.000',
  'te

In [None]:
sample['json']['content_metadata'].keys()

dict_keys(['characterList', 'description', 'fps', 'qAndA', 'scenes', 'storylines', 'title', 'trimmingSuggestions'])

In [34]:
sample['json']['content_metadata']['characterList']

[{'characterId': 'c1',
  'description': 'White female, light brown hair tied in a bun, pale complexion, wearing a blue QPS training shirt.',
  'name': 'Police Recruit #1'},
 {'characterId': 'c2',
  'description': 'White male with short brown hair, wearing a blue QPS training shirt.',
  'name': 'Police Recruit #2'},
 {'characterId': 'c3',
  'description': 'Superintendent, White male with short brown hair and greying temples, wearing a QPS uniform and tie.',
  'name': 'Joe Jaramazovic'},
 {'characterId': 'c4',
  'description': 'Senior Sergeant, White female with short blonde hair and glasses, wearing a QPS uniform.',
  'name': 'Jane Fitzgerald'},
 {'characterId': 'c5',
  'description': 'White male, short brown hair, wearing a QPS uniform and tie.',
  'name': 'Unknown Officer 1'}]

In [35]:
sample['json']['content_metadata']["description"]

'A video analysis of the daily life and experiences of police recruits at the Queensland Police Service Academy.'

In [36]:
sample['json']['content_metadata']["qAndA"]

[{'answer': 'The video starts with police recruits practicing combat techniques, specifically ground fighting and grappling.',
  'question': 'What type of training is shown in the very beginning of the video?'},
 {'answer': 'She mentions the difficulty of transitioning from a job that involved being outdoors all day to the academy environment, which involves long hours of classroom learning and studying.',
  'question': 'What does Police Recruit #1 say is the biggest challenge she faced during her training?'},
 {'answer': 'The video showcases signage displaying the core values of Integrity, Community, Respect and Fairness, and Professionalism, emphasizing their significance in the training and the overall culture of the Queensland Police Service.',
  'question': 'How does the video visually represent the core values of the Queensland Police Service?'},
 {'answer': 'The initial training program lasts for 18 months. The first two months involve 100% supervision, followed by 50% supervisi

In [37]:
sample['json']['content_metadata']["scenes"]

[{'activities': [{'description': 'Police recruits practice combat',
    'timestamp': {'end_timestamp': '00:00:01.000',
     'start_timestamp': '00:00:00.000'}},
   {'description': 'Police officers walk together on a sidewalk',
    'timestamp': {'end_timestamp': '00:00:03.000',
     'start_timestamp': '00:00:02.000'}},
   {'description': 'Police recruits work together on laptops',
    'timestamp': {'end_timestamp': '00:00:04.000',
     'start_timestamp': '00:00:03.000'}},
   {'description': 'Police officers stand in formation',
    'timestamp': {'end_timestamp': '00:00:08.000',
     'start_timestamp': '00:00:06.000'}},
   {'description': 'Police recruit #2 runs',
    'timestamp': {'end_timestamp': '00:00:08.000',
     'start_timestamp': '00:00:07.000'}},
   {'description': 'Police recruits practice shooting',
    'timestamp': {'end_timestamp': '00:00:10.000',
     'start_timestamp': '00:00:09.000'}},
   {'description': 'Police officers ride in a patrol car',
    'timestamp': {'end_times