# Data Loaders


В этом ноутбуке описывается процесс формирования обучающей выборки, на которой в дальнейшем будет обучаться модель классификации жестов.

Для запуска данного ноутбука необходимо выполнить следующие команды:

1. Загрузить данные с помощью команды ниже — для этого потребуется около 90 GB на диске или виртуальном хранилище.
```bash
sh download_data.sh
```

    - Данная команда автоматически загрузит данные в директорию `./INPUT_DATA/TRAIN_DATA/`.

    - Скачанные `zip`-архивы будут доступны в директории `./INPUT_DATA/ZIP/`.

2. Установить зависимости:

```bash
pip install -r requirements.txt
```

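## Алгоритм

Для каждого кадра из обучающей выборки:

1. Детектор лиц находит все лица на кадре; каждая рамка приводится к виду `[x, y, w, h]`.
2. Детектор рук (Mask R-CNN) находит все руки на кадре.
3. Для каждой пары «лицо — рука» вычисляется площадь пересечения рамки руки с областью вокруг лица, которую возвращает `result_crop` с коэффициентом `crop_coefficient`.
4. Выбирается рука с наибольшей ненулевой площадью пересечения; если такой пары нет, кадр пропускается.
5. Для жеста `mute` (метка 3) рамка руки дополнительно пересчитывается через `result_crop` с коэффициентом `crop_hand_coefficient`.
6. Путь к кадру, имя видео, номер кадра, метка и итоговая рамка `[x1, y1, x2, y2]` добавляются в результат, который периодически сохраняется в JSON-файл.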

## Код

Если используется `google.colab`, то выполните следующий код.

In [1]:
# Colab only: mount Google Drive at /content/drive so that the project files
# stored on the drive become visible to the runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Colab only: install the project dependencies from the requirements file
# stored on the mounted drive.
!pip install -r drive/MyDrive/detectime/requirements.txt

In [2]:
# Colab only: switch the working directory to the project folder on the drive,
# so that the project imports in the following cells can be resolved.
cd drive/MyDrive/detectime/

/content/drive/MyDrive/detectime


Подключим библиотеки.

In [8]:
import os
import sys
import cv2
import json
import logging
import pandas as pd
import seaborn as sns
import torch
import face_detection as fd
from pathlib import Path
import matplotlib.pyplot as plt
from tqdm import tqdm

sys.path.append('.')
from definitions import ROOT_DIR
from detectime.augmentations import (
    result_crop
)
from detectime.maskrcnn import (
    load_model_custom
)
from mrcnn.config import Config
from detectime.utils import read_image

# Use the inline matplotlib backend for this notebook.
%matplotlib inline
# Enable the autoreload extension in mode 2, so that edited project modules
# are re-imported without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Module-level logger; `get_train_data` below reports its progress through it.
log = logging.getLogger(__name__)

{"asctime": "2021-07-07 07:52:15", "name": "matplotlib.pyplot", "filename": "pyplot.py", "levelname": "DEBUG", "message": "Loaded backend module://ipykernel.pylab.backend_inline version unknown."}
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Пропишем пути к источникам данных.

In [4]:
# Top-level project folders.
DATA_PATH = ROOT_DIR / 'data'
NOTEBOOK_PATH = ROOT_DIR / 'notebooks'

# Input data: the frames read by `get_train_data` and the CSV with labels.
INPUT_DATA = DATA_PATH / 'INPUT_DATA'
INPUT_IMAGES_FOLDER = INPUT_DATA / 'TRAIN_DATA'
TRAIN_LABELS = INPUT_DATA / 'train.csv'

# Folders for derived training images.
# NOTE(review): presumably destinations for cropped hands / faces; they are
# not used by the cells visible in this notebook - confirm.
TRAIN_IMG_FOLDER = INPUT_DATA / 'TRAIN_IMG'
SAVE_TRAIN_IMAGES_HANDS = TRAIN_IMG_FOLDER / 'HANDS'
SAVE_TRAIN_IMAGES_FACES = TRAIN_IMG_FOLDER / 'FACES'

# JSON artefacts; `get_train_data` is called with JSON_FOLDER as its output.
JSON_FOLDER = INPUT_DATA / 'JSON'
FACES_JSON_PRETRAINED = JSON_FOLDER / 'train_with_bboxes.json'

# Despite the `_FOLDER` suffix this is the path of the weights *file* of the
# hand detector.
HAND_DETECTION_FOLDER = ROOT_DIR / 'model' / 'mask_rcnn_hand_detection.h5'


Загрузим основной конфиг из файла `./config.yml`.

In [5]:
import yaml
from detectime.utils import convert_dict_to_tuple

CONFIG_PATH = ROOT_DIR / 'config.yml'

# Read the YAML explicitly as UTF-8: without `encoding` the platform default
# is used, which makes parsing of non-ASCII values machine-dependent.
# `safe_load` is kept on purpose - it never instantiates arbitrary objects.
with open(CONFIG_PATH, encoding='utf-8') as f:
    data = yaml.safe_load(f)

# Convert the parsed dictionary so that settings are reachable as attributes
# (e.g. `config.detection.detector_type` in the face-detector cell).
config = convert_dict_to_tuple(dictionary=data)

Загрузим все необходимое для модели обнаружения лиц.

In [6]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device_name = 'cpu'
if torch.cuda.is_available():
    device_name = 'cuda'
device = torch.device(device_name)

# Face detector; the detector type comes from the project config, the
# thresholds and the resolution limit are fixed here.
model_detector_faces = fd.build_detector(
    config.detection.detector_type,
    confidence_threshold=0.5,
    nms_iou_threshold=0.3,
    device=device,
    max_resolution=640,
)
print('device', device_name)

Downloading: "https://folk.ntnu.no/haakohu/RetinaFace_mobilenet025.pth" to /root/.cache/torch/hub/checkpoints/RetinaFace_mobilenet025.pth


  0%|          | 0.00/1.71M [00:00<?, ?B/s]

device cuda


Теперь загрузим все необходимое для модуля обнаружения рук.
Параметр `DETECTION_MIN_CONFIDENCE` можно варьировать, но мы выставим высокий
порог уверенности в том, что найденный объект является рукой, — так мы повысим
качество обучающей выборки.


In [7]:
class HandConfig(Config):
    """Settings of the Mask R-CNN hand detector (overrides of `mrcnn` Config)."""
    # Name of this configuration.
    NAME = "hand"
    # One image per GPU.
    IMAGES_PER_GPU = 1
    # NOTE(review): presumably background + the single `hand` class - confirm
    # against the mrcnn Config documentation.
    NUM_CLASSES = 1 + 1
    # NOTE(review): looks like a training-time setting; presumably irrelevant
    # for the inference done in this notebook - confirm.
    STEPS_PER_EPOCH = 10
    # Deliberately high confidence: only objects that are almost certainly
    # hands are kept, which improves the quality of the training sample.
    DETECTION_MIN_CONFIDENCE = 0.99


class InferenceConfig(HandConfig):
    """Hand-detector settings used to build the inference model below."""
    # Single GPU.
    GPU_COUNT = 1
    # Same value as in HandConfig; repeated here explicitly.
    IMAGES_PER_GPU = 1

# Build the hand detector from the inference config and the path of the
# weights file (HAND_DETECTION_FOLDER points to an .h5 file, not a folder).
# NOTE(review): presumably `load_model_custom` creates the Mask R-CNN model in
# inference mode and loads these weights - confirm in detectime.maskrcnn.
model_detector_hands = load_model_custom(
    InferenceConfig(),
    str(HAND_DETECTION_FOLDER)
)

(None, None, None, 1024)
(None, None, None, 1024)
{"asctime": "2021-07-07 07:45:23", "name": "h5py._conv", "filename": "attrs.py", "levelname": "DEBUG", "message": "Creating converter from 3 to 5"}


Теперь напишем функцию, способ работы которой описан выше в разделе *Алгоритм*.

In [46]:
# Label of the `mute` gesture (see CLASS_NAME2LABEL_DICT); only for this
# label the hand box is re-computed with `crop_hand_coefficient`.
_MUTE_LABEL = 3


def _intersection_area(box_a, box_b):
    """Return the overlap area of two (x1, y1, x2, y2) boxes, 0 if disjoint."""
    left_x, left_y = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
    right_x, right_y = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
    width, height = right_x - left_x, right_y - left_y
    if width <= 0 or height <= 0:
        return 0
    return width * height


def _select_hand(img, faces, hands, crop_coefficient):
    """Return the hand ROI that overlaps a face crop the most, or None.

    `faces` are [x, y, w, h] boxes, `hands` are (y1, x1, y2, x2) ROIs.
    None is returned when there is no face, no hand or no overlap at all.
    On ties the first pair in face-major / hand-minor order wins.
    """
    # `hands` may be a numpy array, so test its length, not its truthiness.
    if len(hands) == 0:
        return None
    best_area, best_hand = 0, None
    for face in faces:
        # The crop depends only on the face, so compute it once per face.
        # NOTE(review): assumes `result_crop` is deterministic - confirm.
        x3, y3, x4, y4 = result_crop(
            img,
            face,
            crop_coefficient=crop_coefficient
        )
        for hand in hands:
            y1, x1, y2, x2 = hand
            area = _intersection_area((x1, y1, x2, y2), (x3, y3, x4, y4))
            if area > best_area:
                best_area, best_hand = area, hand
    return best_hand


def _save_results(results, path):
    """Write the collected items to `path` as indented JSON."""
    with open(str(path), 'w') as file:
        json.dump(results, file, indent=4)


def get_train_data(data_df,
                   detector_faces,
                   detector_hands,
                   output_path,
                   file_name='hands.json',
                   crop_coefficient=1.5,
                   crop_hand_coefficient=1.5,
                   save_per_num_images=100,
                   return_data=True,
                   verbose=False):
    """Find the gesture hand on every frame and store its bounding box.

    For each row of `data_df` the frame is read from INPUT_IMAGES_FOLDER,
    faces and hands are detected, and the hand that overlaps a face crop
    (`result_crop` with `crop_coefficient`) the most is selected. Frames
    without such a hand are skipped. The result is written to
    `output_path / file_name`.

    If that file already exists its content is loaded first, and frames that
    are already present in it are not processed again, so an interrupted run
    can be resumed without producing duplicates. Delete the file to start
    from scratch.

    Args:
        data_df: DataFrame with the columns `frame_path`, `label`,
            `video_name` and `frame_id`. Any index is accepted.
        detector_faces: face detector; `detect(img)` must yield rows of
            (x1, y1, x2, y2, score).
        detector_hands: hand detector; `detect([img], verbose=0)[0]['rois']`
            must hold (y1, x1, y2, x2) boxes.
        output_path: directory of the JSON file; created when missing.
        file_name: name of the JSON file inside `output_path`.
        crop_coefficient: coefficient passed to `result_crop` for faces.
        crop_hand_coefficient: coefficient passed to `result_crop` for the
            hand box of `mute` frames.
        save_per_num_images: write a checkpoint every this many rows;
            0 or None disables the intermediate checkpoints.
        return_data: return the collected items when True.
        verbose: log every stored frame when True.

    Returns:
        A list of dicts with the keys `frame_path`, `video_name`,
        `frame_id`, `label` and `bbox` ([x1, y1, x2, y2]) when `return_data`
        is True, otherwise None.
    """
    # Load from the same file the results are saved to, so that resuming
    # works for any `output_path`, not only for the global JSON_FOLDER.
    output_file = Path(output_path) / file_name
    if output_file.is_file():
        with open(str(output_file), 'r') as file:
            result_arr = json.load(file)
        log.info('loaded file')
    else:
        result_arr = []
    log.info(f'Savedir hands: {output_path}')
    os.makedirs(output_path, exist_ok=True)

    # Frames stored by a previous run; they are skipped below.
    processed = {item.get('frame_path') for item in result_arr}

    # Iterate over plain column values: the row position is then independent
    # of the DataFrame index, which may be non-contiguous after filtering.
    rows = zip(data_df.frame_path.values,
               data_df.label.values,
               data_df.video_name.values,
               data_df.frame_id.values)
    progress = tqdm(rows, total=len(data_df), desc='find hands')

    for position, (image_path, label, video_name, frame_id) in enumerate(progress):
        # Periodic checkpoint of everything collected so far.
        if (save_per_num_images and position > 0
                and position % save_per_num_images == 0):
            _save_results(result_arr, output_file)

        if image_path in processed:
            continue

        img_path = str(INPUT_IMAGES_FOLDER / image_path)
        if not os.path.isfile(img_path):
            log.info(f'NO SUCH FILE {img_path}')
            continue
        img = read_image(img_path)

        # Faces: convert (x1, y1, x2, y2, score) to rounded [x, y, w, h].
        all_faces = []
        for det in detector_faces.detect(img):
            x1, y1, x2, y2, _score = det.tolist()
            all_faces.append(
                [round(x1), round(y1), round(x2 - x1), round(y2 - y1)]
            )
        if not all_faces:
            # Without a face no hand can be selected, so the (expensive)
            # hand detector is not run at all.
            continue

        all_hands = detector_hands.detect([img], verbose=0)[0]['rois']
        final_hand = _select_hand(img, all_faces, all_hands, crop_coefficient)
        if final_hand is None:
            continue

        y1, x1, y2, x2 = final_hand
        if label == _MUTE_LABEL:
            x1, y1, x2, y2 = result_crop(
                img,
                [x1, y1, x2 - x1, y2 - y1],
                crop_coefficient=crop_hand_coefficient
            )
        result_arr.append({
            'frame_path': image_path,
            'video_name': video_name,
            'frame_id': int(frame_id),
            'label': int(label),
            'bbox': [int(x1), int(y1), int(x2), int(y2)]
        })
        if verbose:
            log.info(f'saved picture {image_path}')

    _save_results(result_arr, output_file)
    if return_data:
        return result_arr


Загрузим данные.

In [14]:
# Mapping from the gesture name used in the CSV to its numeric label.
CLASS_NAME2LABEL_DICT = dict(
    no_gesture=0,
    stop=1,
    victory=2,
    mute=3,
    ok=4,
    like=5,
    dislike=6,
)

# Read the labels and add the numeric `label` column derived from
# `class_name`.
train_data = pd.read_csv(str(TRAIN_LABELS)).assign(
    label=lambda df: df['class_name'].map(CLASS_NAME2LABEL_DICT)
)

Исключим данные, помеченные как `no_gesture`, — будем получать данные только для 6 классов.

In [15]:
# Drop the `no_gesture` rows (label 0); only real gestures are kept.
train_data_gestures = train_data.loc[train_data['label'].ne(0)]
print(train_data_gestures.shape)

(175174, 5)


In [16]:
from pathlib import Path

# Collect every file below the frames folder as '<parent dir>/<file name>',
# the same form the `frame_path` column is compared against below.
exists_pictures = [
    '/'.join(Path(dirname, filename).parts[-2:])
    for dirname, _, filenames in os.walk(INPUT_IMAGES_FOLDER)
    for filename in filenames
]

len(exists_pictures)

204547

In [18]:
# Keep only the labelled frames that are actually present on disk and
# renumber the rows from zero.
train = (
    train_data_gestures
    .loc[lambda df: df['frame_path'].isin(exists_pictures)]
    .reset_index(drop=True)
)
train.shape

(174371, 5)

Начнем получение тренировочных данных.

In [50]:
# Run the hand search over all frames; intermediate results are written to
# JSON_FOLDER / 'hands.json' every 100 frames.
data = get_train_data(
    data_df=train,
    detector_faces=model_detector_faces,
    detector_hands=model_detector_hands,
    output_path=JSON_FOLDER,
    file_name='hands.json',
    crop_coefficient=1.5,
    crop_hand_coefficient=1.5,
    save_per_num_images=100,
    return_data=True,
    verbose=False,
)