# Dataset Preparation

- Split each video into its frames and crop them around the DeepLabCut midbody position
- Extract the labels from the DeepLabCut files

In [1]:
import os
import pandas as pd
import numpy as np
import cv2

In [2]:
# Source folders holding the raw videos, the annotation spreadsheets and the tracking CSVs
videos_src_folder = '../data/raw/videos'
excels_src_folder = '../data/raw/excels'
csvs_src_folder = '../data/raw/csvs'

# Destination folder of the image dataset. It gets two subfolders:
# 'features' (cropped frames) and 'labels' (one label CSV per recording).
dataset_dest_folder = '../data/processed/ImageDatasetRGB'
try:
    for subfolder in ('features', 'labels'):
        # makedirs also creates any missing parent folder, and exist_ok=True
        # makes this cell safe to run again on an existing dataset folder.
        os.makedirs(os.path.join(dataset_dest_folder, subfolder), exist_ok=True)
except OSError as err:
    # Only genuine failures (e.g. permissions, a file in the way) end up here
    print(err)
    print('Could not create the dataset destination folders')

In [3]:
# Read the content from the source folders.
# Spreadsheets and CSVs are ordered by the last four characters of the part of
# the file name that precedes its first '.'; videos are ordered by full name.
# The first entry of every ordered listing is discarded.
# NOTE(review): the discarded entry is presumably a non-data file - confirm.
excels = os.listdir(excels_src_folder)
excels.sort(key=lambda name: name.split('.')[0][-4:])
excels = excels[1:]

csvs = os.listdir(csvs_src_folder)
csvs.sort(key=lambda name: name.split('.')[0][-4:])
csvs = csvs[1:]

videos = os.listdir(videos_src_folder)
videos.sort()
videos = videos[1:]

In [7]:
# Half side length, in pixels, of the square crop taken around the midbody
expansion = 80

# Make sure the output folders exist even when this cell is run on its own
labels_folder = os.path.join(dataset_dest_folder, 'labels')
features_folder = os.path.join(dataset_dest_folder, 'features')
os.makedirs(labels_folder, exist_ok=True)

# Each iteration processes one recording: its spreadsheet, tracking CSV and video
for excel, csv, video in zip(excels, csvs, videos):
    # ---- Labels: taken from the spreadsheet ----

    # Read the spreadsheet columns E, F and G (the 3 behaviours)
    df_excel = pd.read_excel(os.path.join(excels_src_folder, excel), header=0, usecols='E,F,G')
    # Drop the first 2 data rows
    df_excel.drop(index=[0, 1], inplace=True)
    # Empty cells become 0
    df_excel.fillna(0, inplace=True)
    # Cells marked with an 'x' become 1
    df_excel.replace(["x", "X"], 1, inplace=True)
    # Lower-case the column titles and replace blanks with '_'
    df_excel.columns = [col.lower().replace(' ', '_') for col in df_excel.columns]
    # Store the labels as integers
    df_excel = df_excel.astype(int)
    # Renumber the rows from 0
    df_excel.reset_index(inplace=True, drop=True)
    # Save the labels as <spreadsheet name>.csv
    df_excel.to_csv(os.path.join(labels_folder, excel.split('.')[0] + '.csv'), index=False)

    # ---- Features: video frames cropped around the midbody position ----

    # The tracking CSV is read with a single header row; its first column
    # becomes the index.
    df = pd.read_csv(os.path.join(csvs_src_folder, csv), header=0)
    print(df.shape)
    df.columns = [col.lower().replace(' ', '_') for col in df.columns]
    df.set_index(df.columns[0], inplace=True)

    # One (y, x) pair per tracking row, i.e. (image row, image column)
    midbody = np.column_stack((df['midbody_y'].values, df['midbody_x'].values))
    midbody = midbody.astype(int)

    print(df_excel.shape)

    # Frames of this recording go to features/<csv name>/
    frames_folder = os.path.join(features_folder, csv.split('.')[0])
    os.makedirs(frames_folder, exist_ok=True)

    vidcap = cv2.VideoCapture(os.path.join(videos_src_folder, video))
    try:
        count = 0
        success, image = vidcap.read()
        while success:
            # A frame without a tracking row cannot be cropped: stop here
            # instead of failing with an IndexError.
            if count >= len(midbody):
                print('%s has more frames than rows in %s; stopped at frame %d' % (video, csv, count))
                break

            # Frame number zero-padded to at least 4 digits
            frame_name = 'frame%04d.jpg' % count

            # Crop a box of side 2 * expansion with the midbody in the middle.
            # Where the box would cross an image border it is shifted back
            # inside, so the crop keeps its size.
            height, width = image.shape[:2]
            center_y, center_x = midbody[count]
            top = max(0, center_y - expansion) - max(0, center_y + expansion - height)
            bottom = min(height, center_y + expansion) + max(0, expansion - center_y)
            left = max(0, center_x - expansion) - max(0, center_x + expansion - width)
            right = min(width, center_x + expansion) + max(0, expansion - center_x)

            # Save the cropped frame
            cv2.imwrite(os.path.join(frames_folder, frame_name), image[top:bottom, left:right])

            success, image = vidcap.read()
            count += 1
    finally:
        # Always free the video handle, even if a frame fails to process
        vidcap.release()

(6600, 86)
(6600, 3)
(6600, 86)
(6600, 3)
(6600, 86)
(6600, 3)
(6600, 86)
(6600, 3)
(6600, 86)
(6600, 3)
(6600, 86)
(6600, 3)
(6600, 86)
(6600, 3)
(6600, 86)
(6600, 3)
(6600, 86)
(6600, 3)
(6600, 86)
(6600, 3)
(6600, 86)
(6600, 3)
(6600, 86)
(6600, 3)
(6600, 86)
(6600, 3)
(6600, 86)
(6600, 3)
(6600, 86)
(6600, 3)
(6600, 86)
(6600, 3)
(6600, 86)
(6600, 3)
