# Content
## 1. Prepare videos
## 2. Generate raw data list from DATA_PATH
## 3. Confirm that videos are correct

## Prepare videos
Please refer to the [official website](https://deepmind.com/research/open-source/kinetics) and/or the official script to prepare the videos.
Note that the folder structure should look like this:
```
eva-video
├── ...
├── data
│   ├── k400/600/700  -> PATH_TO_Kinetics-400/600/700
│   │   ├── train
│   │   │   ├── ${CLASS_NAME}/${VIDEO_ID}
│   │   ├── val
│   │   │   ├── ${CLASS_NAME}/${VIDEO_ID}
│   ├── k400/600/700/722_train.txt
│   ├── k400/600/700/722_val.txt
│   ├── k722_to_k400/600/700_mapping.npy
├── ...
```

## Generate raw data list from DATA_PATH

- remove duplicates and leaked videos
- map `trainID` (0-721) to 0-399 (k400), 0-599 (k600), 0-699 (k700)

The merged dataset, coined Kinetics-722 (K-722), integrates all valid training samples from Kinetics-400 (K-400), Kinetics-600 (K-600) and Kinetics-700 (K-700).
Notably, for a fair and legal comparison, we removed from the training sets every video that leaks into any of the validation sets, as well as duplicated training videos, based on the `youtube id` of the video.
Accordingly, the cleaned K-722 contains 0.63M training videos, covering 722 human action classes. We also provide [our data list](https://huggingface.co/BAAI/EVA/blob/main/eva%20video%20data%20list.zip).

In [None]:
import os
import pandas as pd
import numpy as np

from decord import VideoReader
from decord import cpu


# Directory that holds the Kinetics folders and receives every generated file.
DATA_ROOT = '../data'

# Train / val sub-directories of each dataset, relative to DATA_ROOT.
PATH_k400 = {'train': 'k400/train', 'val': 'k400/val'}
PATH_k600 = {'train': 'k600/train', 'val': 'k600/val'}
PATH_k700 = {'train': 'k700/train', 'val': 'k700/val'}

# Sorted directory entries (the class folders) of every split of every dataset.
kinetics_class_list = {
    dataset: {
        split: sorted(os.listdir(os.path.join(DATA_ROOT, split_dirs[split])))
        for split in ('train', 'val')
    }
    for dataset, split_dirs in (('k400', PATH_k400), ('k600', PATH_k600), ('k700', PATH_k700))
}

# The merged label space is the union of the validation classes of all three datasets.
total_classes = sorted(
    set().union(*(kinetics_class_list[dataset]['val'] for dataset in ('k400', 'k600', 'k700'))))

# Class name -> index in the alphabetically sorted class list of each label space.
K722_CLASS_MAPPING = dict(zip(total_classes, range(len(total_classes))))

k400_val_classes = sorted(set(kinetics_class_list['k400']['val']))
K400_CLASS_MAPPING = dict(zip(k400_val_classes, range(len(k400_val_classes))))

k600_val_classes = sorted(set(kinetics_class_list['k600']['val']))
K600_CLASS_MAPPING = dict(zip(k600_val_classes, range(len(k600_val_classes))))

k700_val_classes = sorted(set(kinetics_class_list['k700']['val']))
K700_CLASS_MAPPING = dict(zip(k700_val_classes, range(len(k700_val_classes))))

# map 722 classes to 400/600/700 classes:
# each list holds, in K-722 order, the K-722 train IDs of the classes that
# exist in the smaller dataset, so a list position is the smaller dataset's label.
K722_TO_K400_MAPPING = [
    train_id for class_label, train_id in K722_CLASS_MAPPING.items()
    if class_label in K400_CLASS_MAPPING
]
K722_TO_K600_MAPPING = [
    train_id for class_label, train_id in K722_CLASS_MAPPING.items()
    if class_label in K600_CLASS_MAPPING
]
K722_TO_K700_MAPPING = [
    train_id for class_label, train_id in K722_CLASS_MAPPING.items()
    if class_label in K700_CLASS_MAPPING
]

# Persist the three index lists next to the data.
k400_mapping_file = os.path.join(DATA_ROOT, 'k722_to_k400_mapping.npy')
np.save(k400_mapping_file, np.array(K722_TO_K400_MAPPING))
k600_mapping_file = os.path.join(DATA_ROOT, 'k722_to_k600_mapping.npy')
np.save(k600_mapping_file, np.array(K722_TO_K600_MAPPING))
k700_mapping_file = os.path.join(DATA_ROOT, 'k722_to_k700_mapping.npy')
np.save(k700_mapping_file, np.array(K722_TO_K700_MAPPING))

def name_to_trainid(name):
    """Return the K-722 train ID stored for the class called ``name``.

    Raises:
        KeyError: if ``name`` is not a key of ``K722_CLASS_MAPPING``.
    """
    train_id = K722_CLASS_MAPPING[name]
    return train_id

In [None]:
# Video records per dataset and split; 'k722' accumulates the records of all
# three datasets and additionally remembers which dataset each one came from.
kinetics_file_dict = {
    dataset: {'train': [], 'val': []}
    for dataset in ('k400', 'k600', 'k700', 'k722')
}

for dataset_name, split_classes in kinetics_class_list.items():  # k400, {'train': [CLASSES], 'val': [CLASSES]}
    # Adding the train classes to the val classes must not enlarge the set.
    val_classes = split_classes['val']
    assert len(set(split_classes['train']) | set(val_classes)) == len(val_classes)

    for part in ('train', 'val'):
        own_records = kinetics_file_dict[dataset_name][part]
        merged_records = kinetics_file_dict['k722'][part]

        for class_label in split_classes[part]:
            class_dir = os.path.join(DATA_ROOT, dataset_name, part, class_label)
            entries = os.listdir(class_dir)
            train_id = name_to_trainid(class_label)

            for entry in entries:
                # special cases in local dataset: too-short names, archives, hidden files
                is_video = len(entry) >= 2 and not entry.endswith('tar.gz') and not entry.startswith('.')
                if not is_video:
                    continue

                # The first 11 characters of the file name are used as the YouTube id.
                record = {'trainID': train_id, 'label': class_label,
                          'video_id': entry, 'youtube_id': entry[:11]}
                own_records.append(record)
                merged_records.append({**record, 'set': dataset_name})

            print(dataset_name, part, class_label, train_id)


# Column layout of the per-dataset tables, and of the merged table that also
# carries the source dataset in 'set'.
per_dataset_columns = ['trainID', 'label', 'video_id', 'youtube_id']
merged_columns = ['trainID', 'label', 'video_id', 'set', 'youtube_id']

# One raw (unfiltered) table per dataset and split, built from the collected records.
k400_train_raw, k400_val_raw = (
    pd.DataFrame(kinetics_file_dict['k400'][part], columns=per_dataset_columns)
    for part in ('train', 'val'))
k600_train_raw, k600_val_raw = (
    pd.DataFrame(kinetics_file_dict['k600'][part], columns=per_dataset_columns)
    for part in ('train', 'val'))
k700_train_raw, k700_val_raw = (
    pd.DataFrame(kinetics_file_dict['k700'][part], columns=per_dataset_columns)
    for part in ('train', 'val'))
k722_train_raw, k722_val_raw = (
    pd.DataFrame(kinetics_file_dict['k722'][part], columns=merged_columns)
    for part in ('train', 'val'))

# Kinetics400 data list processing
print('-' * 66)
print(f'k400: train: {len(k400_train_raw)}, val: {len(k400_val_raw)}')
# Keep a single row per YouTube id in the training split.
k400_train_raw = k400_train_raw.drop_duplicates(subset=['youtube_id'])
print('after drop repeated video in k400 train set:', len(k400_train_raw))
# Drop training videos whose YouTube id also occurs in the validation split,
# exactly as the k600 / k700 sections do, so the count printed next is real.
k400_train_raw = k400_train_raw[
    ~k400_train_raw['youtube_id'].isin(k400_val_raw['youtube_id'])]
print('after drop leaked video in k400 val set:', len(k400_train_raw))
k400_val_raw = k400_val_raw.drop_duplicates(subset=['youtube_id'])
print('after drop repeated video in k400 val set:', len(k400_val_raw))

# K-722 train ID -> position in K722_TO_K400_MAPPING (i.e. the K-400 label).
# Built once so that every row is a dict lookup instead of a list.index() scan.
k400_label_of = {train_id: position for position, train_id in enumerate(K722_TO_K400_MAPPING)}

# Each output line is "<video path relative to DATA_ROOT> <K-400 label>".
# The `with` statement closes the file; no explicit close() is needed.
with open(f'{DATA_ROOT}/k400_train_raw.txt', 'w') as f:
    for label, video_id in zip(k400_train_raw['label'], k400_train_raw['video_id']):
        f.write(f"{os.path.join('k400/train', label, video_id)} {k400_label_of[name_to_trainid(label)]}\n")
with open(f'{DATA_ROOT}/k400_val_raw.txt', 'w') as f:
    for label, video_id in zip(k400_val_raw['label'], k400_val_raw['video_id']):
        f.write(f"{os.path.join('k400/val', label, video_id)} {k400_label_of[name_to_trainid(label)]}\n")

# Kinetics600 data list processing
print('-' * 66)
print(f'k600: train: {len(k600_train_raw)}, val: {len(k600_val_raw)}')
# Keep a single row per YouTube id in the training split.
k600_train_raw = k600_train_raw.drop_duplicates(subset=['youtube_id'])
print('after drop repeated video in k600 train set:', len(k600_train_raw))
# Drop training videos whose YouTube id also occurs in the validation split.
k600_train_raw = k600_train_raw[
    ~k600_train_raw['youtube_id'].isin(k600_val_raw['youtube_id'])]
print('after drop leaked video in k600 val set:', len(k600_train_raw))
k600_val_raw = k600_val_raw.drop_duplicates(subset=['youtube_id'])
print('after drop repeated video in k600 val set:', len(k600_val_raw))

# K-722 train ID -> position in K722_TO_K600_MAPPING (i.e. the K-600 label).
# Built once so that every row is a dict lookup instead of a list.index() scan.
k600_label_of = {train_id: position for position, train_id in enumerate(K722_TO_K600_MAPPING)}

# Each output line is "<video path relative to DATA_ROOT> <K-600 label>".
# The `with` statement closes the file; no explicit close() is needed.
with open(f'{DATA_ROOT}/k600_train_raw.txt', 'w') as f:
    for label, video_id in zip(k600_train_raw['label'], k600_train_raw['video_id']):
        f.write(f"{os.path.join('k600/train', label, video_id)} {k600_label_of[name_to_trainid(label)]}\n")
with open(f'{DATA_ROOT}/k600_val_raw.txt', 'w') as f:
    for label, video_id in zip(k600_val_raw['label'], k600_val_raw['video_id']):
        f.write(f"{os.path.join('k600/val', label, video_id)} {k600_label_of[name_to_trainid(label)]}\n")

# Kinetics700 data list processing
print('-' * 66)
print(f'k700: train: {len(k700_train_raw)}, val: {len(k700_val_raw)}')
# Keep a single row per YouTube id in the training split.
k700_train_raw = k700_train_raw.drop_duplicates(subset=['youtube_id'])
print('after drop repeated video in k700 train set:', len(k700_train_raw))
# Drop training videos whose YouTube id also occurs in the validation split.
k700_train_raw = k700_train_raw[
    ~k700_train_raw['youtube_id'].isin(k700_val_raw['youtube_id'])]
print('after drop leaked video in k700 val set:', len(k700_train_raw))
k700_val_raw = k700_val_raw.drop_duplicates(subset=['youtube_id'])
print('after drop repeated video in k700 val set:', len(k700_val_raw))

# K-722 train ID -> position in K722_TO_K700_MAPPING (i.e. the K-700 label).
# Built once so that every row is a dict lookup instead of a list.index() scan.
k700_label_of = {train_id: position for position, train_id in enumerate(K722_TO_K700_MAPPING)}

# Each output line is "<video path relative to DATA_ROOT> <K-700 label>".
# The `with` statement closes the file; no explicit close() is needed.
with open(f'{DATA_ROOT}/k700_train_raw.txt', 'w') as f:
    for label, video_id in zip(k700_train_raw['label'], k700_train_raw['video_id']):
        f.write(f"{os.path.join('k700/train', label, video_id)} {k700_label_of[name_to_trainid(label)]}\n")
with open(f'{DATA_ROOT}/k700_val_raw.txt', 'w') as f:
    for label, video_id in zip(k700_val_raw['label'], k700_val_raw['video_id']):
        f.write(f"{os.path.join('k700/val', label, video_id)} {k700_label_of[name_to_trainid(label)]}\n")


# Kinetics722 data list processing
print('-' * 66)
print('k722 train length:', len(kinetics_file_dict['k722']['train']))
print(f"k722 val length: {len(kinetics_file_dict['k722']['val'])}")
print('-' * 66)

# Keep a single row per YouTube id in the merged validation table.
print('drop repeated video in k722 val set')
k722_val_drop = k722_val_raw.drop_duplicates(subset=['youtube_id'])
print('before drop duplicates (val): ', len(k722_val_raw))
print('after drop duplicates (val):', len(k722_val_drop))
print('-' * 66)

# Keep a single row per YouTube id in the merged training table.
print('drop repeated video in k722 train set')
k722_train_drop = k722_train_raw.drop_duplicates(subset=['youtube_id'])
print('before drop duplicates: ', len(k722_train_raw))
print('after drop duplicates:', len(k722_train_drop))
print('-' * 66)

print('drop leaked video in k400/600/700 val set')
# Unfiltered per-dataset tables rebuilt from the collected records.
kinectics_val = {kinectics: pd.DataFrame(kinetics_file_dict[kinectics]['val'], columns=['label', 'video_id', 'youtube_id'])
                 for kinectics in ['k400', 'k600', 'k700']}
# Not read again in this cell; kept so the name stays defined for any later use.
kinectics_train = {
    kinectics: pd.DataFrame(kinetics_file_dict[kinectics]['train'], columns=['label', 'video_id', 'youtube_id']) for
    kinectics in ['k400', 'k600', 'k700']}

# Remove from the merged training table every video whose YouTube id occurs in
# the validation split of any of the three datasets, one dataset at a time.
k722_train_drop_leak = k722_train_drop.copy(deep=True)
for kinetics_name, val_csv in kinectics_val.items():
    k722_train_drop_leak = k722_train_drop_leak[
        ~k722_train_drop_leak['youtube_id'].isin(val_csv['youtube_id'])]
    print(f'after drop {kinetics_name} validation:', len(k722_train_drop_leak))

# Each output line is "<source dataset>/<split>/<label>/<video> <K-722 train ID>".
# The `with` statement closes the file; no explicit close() is needed.
with open(f'{DATA_ROOT}/k722_train_raw.txt', 'w') as f:
    for source, label, video_id in zip(k722_train_drop_leak['set'],
                                       k722_train_drop_leak['label'],
                                       k722_train_drop_leak['video_id']):
        f.write(f"{os.path.join(source, 'train', label, video_id)} {name_to_trainid(label)}\n")
with open(f'{DATA_ROOT}/k722_val_raw.txt', 'w') as f:
    for source, label, video_id in zip(k722_val_drop['set'],
                                       k722_val_drop['label'],
                                       k722_val_drop['video_id']):
        f.write(f"{os.path.join(source, 'val', label, video_id)} {name_to_trainid(label)}\n")

# Final statistics of every cleaned list: number of classes present in the
# training table, and number of training / validation videos.
# Each count is labelled with the dataset it belongs to.
dataset_summary = (
    ('k722', k722_train_drop_leak, k722_val_drop),
    ('k700', k700_train_raw, k700_val_raw),
    ('k600', k600_train_raw, k600_val_raw),
    ('k400', k400_train_raw, k400_val_raw),
)

print('-' * 66)
for position, (dataset_name, train_table, val_table) in enumerate(dataset_summary):
    # Sections after the first are separated by an empty line.
    separator = '\n' if position else ''
    print(f'{separator}right now our {dataset_name}')
    print('class num:', len(train_table['label'].unique()))
    print(f'{dataset_name} train video:', len(train_table['label']))
    print(f'{dataset_name} val video:', len(val_table['label']))
print('-' * 66)

## Confirm that videos are correct

In [None]:
KINECTICS = ['k400', 'k600', 'k700', 'k722']
SETS = ['train', 'val']

# For every "<dataset>_<split>_raw.txt" list, try to open each listed video and
# write two files next to it: "<dataset>_<split>.txt" with the entries that
# opened, and "<dataset>_<split>_error.txt" with the paths that did not.
print('Confirm that videos are correct.')
for kinetics in KINECTICS:
    # The loop variable is deliberately not called `set`: rebinding that name at
    # module level would break every later `set(...)` call in this file.
    for split in SETS:
        print(f'{kinetics}\t{split}:')
        ann_file = os.path.join(DATA_ROOT, f'{kinetics}_{split}_raw.txt')
        print(f'processing {ann_file}')

        # Each line is "<relative video path> <integer label>"; the path itself
        # may contain spaces, so only the last token is taken as the label.
        video_infos = []
        with open(ann_file, 'r') as f:
            for line in f:
                line_split = line.strip().split()
                if not line_split:  # tolerate blank lines in the list
                    continue
                filename = ' '.join(line_split[:-1])
                label = int(line_split[-1])
                video_infos.append(dict(filename=filename, label=label))
        total = len(video_infos)

        correct = []
        error = []
        for video in video_infos:
            file_name = video['filename']
            label = video['label']
            try:
                # Constructing the reader is the check; the object is not used.
                VideoReader(os.path.join(DATA_ROOT, file_name), ctx=cpu(0))
            except Exception:
                # `Exception` rather than a bare except, so Ctrl-C still aborts the run.
                print(f'loading failed {file_name}')
                error.append(file_name)
            else:
                correct.append(f'{file_name} {label}')

        print(f'total: {total}\t correct:{len(correct)}\t error: {len(error)}')

        # Build the output paths from DATA_ROOT directly: `ann_file` already
        # contains DATA_ROOT, so joining DATA_ROOT onto it would duplicate it.
        with open(os.path.join(DATA_ROOT, f'{kinetics}_{split}.txt'), 'w') as f:
            f.writelines(f'{c}\n' for c in correct)

        with open(os.path.join(DATA_ROOT, f'{kinetics}_{split}_error.txt'), 'w') as f:
            f.writelines(f'{e}\n' for e in error)
