# Split training and validation

In order to assess the accuracy of all our models, we create and save different dataframes corresponding to training and validation phases.

In [5]:
import numpy as np
import os
import pandas as pd
import re
import utils

In [6]:
# this cell MUST be run
# fraction of the labelled videos that goes to training (see the split cells below)
split = 0.8

# seed numpy's global random generator so that every machine builds the same dataframes
np.random.seed(69)

## 1. Split training / validation without temporal organization

We use these dataframes when our model does not exploit any correlation between elements of the same batch. This means that at training time, we load a completely random batch of frames. We also perform oversampling so as to have complete class balance.

In [7]:
# Build the video-level training / validation split and save, for each phase:
#   <phase>_no_temp.pkl     -> all labelled frames of the videos of that phase
#   <phase>_no_temp_os.pkl  -> the same frames, oversampled to balance the classes
# The split is done only if at least one of these four files is missing
# (the previous guard lacked a `not` on its last test, so it re-ran the split
# whenever the last file existed and skipped it when only that file was missing).
no_temp_paths = [
    utils.dfs_path + '/' + phase + '_no_temp' + suffix + '.pkl'
    for phase in ['training', 'validation']
    for suffix in ['', '_os']
]
if not all(os.path.exists(path) for path in no_temp_paths):
    # load all train videos (labelled videos)
    all_train_videos = utils.get_train_test_video_names()['train']
    all_train_labels = pd.read_pickle(utils.labels_path)

    # draw a fraction `split` of the videos (without replacement) for training;
    # the remaining videos form the validation set
    num_train_videos = int(split * len(all_train_videos))
    train_videos = np.array(all_train_videos)[np.random.choice(len(all_train_videos), num_train_videos, replace=False)]
    validation_videos = np.setdiff1d(all_train_videos, train_videos, assume_unique=False)
    train_videos.sort()
    validation_videos.sort()

    # create two subdataframes for training and validation
    training_df = all_train_labels.loc[all_train_labels['videoname'].isin(train_videos)]
    validation_df = all_train_labels.loc[all_train_labels['videoname'].isin(validation_videos)]

    for phase, phase_df in [('training', training_df), ('validation', validation_df)]:
        # work on a copy so that the slices of all_train_labels are not modified
        df = phase_df.copy()

        # add video number of frames as feature (largest frame index of the video + 1)
        df['video_num_frames'] = (df.groupby('videoname')['frame'].transform('max') + 1).astype(int)
        df.to_pickle(utils.dfs_path + '/' + phase + '_no_temp.pkl')

        # oversample df: every class is completed, by sampling its own rows with
        # replacement, up to the size of the most frequent class
        largest_class_size = df['label'].value_counts().max()
        parts = [df]
        for _, group in df.groupby('label'):
            parts.append(group.sample(largest_class_size - len(group), replace=True))
        df = pd.concat(parts)
        df.to_pickle(utils.dfs_path + '/' + phase + '_no_temp_os.pkl')

## 2. Split training / validation with temporal organization

We use these dataframes when our model does exploit correlation between elements of the same batch. In other words, we want each batch to contain frames of the same video.

In [8]:
# number of consecutive rows that form one batch in the temporal dataframes
BATCH_SIZE = 64

# Build the temporally organised dataframes from the '<phase>_no_temp.pkl' files:
# rows are ordered video by video and frame by frame, every video is padded to a
# multiple of BATCH_SIZE rows, and only whole batches are shuffled.
if not os.path.exists(utils.dfs_path + '/training_temp.pkl') or not os.path.exists(utils.dfs_path + '/validation_temp.pkl'):
    for phase in ['training', 'validation']:
        df = pd.read_pickle(utils.dfs_path + '/' + phase + '_no_temp.pkl')

        # pad the end of each video with placeholder rows (frame 10000, label -1,
        # meant to stand for black images) so that its number of rows is a
        # multiple of BATCH_SIZE; we can then process through the LSTM by batch
        # without mixing videos inside a batch.
        # All padding rows are collected first and appended with a single concat:
        # concatenating once per video recopies the whole frame every time and
        # appends empty frames for videos that need no padding.
        padding_rows = []
        for videoname, video_df in df.groupby('videoname', sort=False, observed=True):
            num_rows_to_add = (BATCH_SIZE - (len(video_df) % BATCH_SIZE)) % BATCH_SIZE
            video_num_frames = video_df['video_num_frames'].iloc[0]
            padding_rows.extend(
                {'videoname': videoname, 'frame': 10000, 'label': -1, 'video_num_frames': video_num_frames}
                for _ in range(num_rows_to_add)
            )
        if padding_rows:
            df = pd.concat([df, pd.DataFrame(padding_rows)], ignore_index=True)

        # sort rows by video, then by frame (the padding rows, frame 10000, go last).
        # The key is built from the 12th character from the end of the name and
        # its last 3 characters -- presumably the surgeon digit and the video
        # number of names such as 'RALIHR_surgeon01_fps01_0071'; TODO confirm
        # that every training video name follows this format.
        df['sort'] = df['videoname'].str[-12].astype(int) * 10000 + df['videoname'].str[-3:].astype(int)
        df = df.sort_values(['sort', 'frame'], ascending=True).drop(columns='sort').reset_index(drop=True)

        # shuffle batches: view the row positions as (num_batches, BATCH_SIZE) and
        # shuffle along the first axis only, so the order inside a batch is kept
        assert len(df) % BATCH_SIZE == 0, 'every video must be padded to a multiple of BATCH_SIZE'
        batches = np.array(df.index).reshape(-1, BATCH_SIZE)
        np.random.shuffle(batches)
        shuffled_df = df.loc[batches.ravel()].reset_index(drop=True)

        # save df
        shuffled_df.to_pickle(utils.dfs_path + '/' + phase + '_temp.pkl')

## 3. Save testing df

This is to produce the testing df. As the weights are not updated at test time, we are able to use exactly the same format as the temporal training / validation dataframes. This way we can test all models on this testing df.

In [9]:
# Build the testing dataframe (one row per frame of every test video) in the same
# layout as the temporal training / validation dataframes: rows ordered by video
# and frame, every video padded to a multiple of BATCH_SIZE rows, whole batches
# shuffled. BATCH_SIZE is the constant defined in the temporal split cell.
if not os.path.exists(utils.dfs_path + '/testing.pkl'):
    # list names of all videos in the test set
    surgeon1_test_videonames = ['RALIHR_surgeon01_fps01_' + str(x).zfill(4) for x in range(71,126)]
    surgeon2_test_videonames = ['RALIHR_surgeon02_fps01_' + str(x).zfill(4) for x in range(1,5)]
    surgeon3_test_videonames = ['RALIHR_surgeon03_fps01_0001']
    all_test_videonames = surgeon1_test_videonames + surgeon2_test_videonames + surgeon3_test_videonames

    # generate df with all frames of these videos
    videonames = []
    frames = []
    Ids = [] # id list for kaggle prediction
    for videoname in all_test_videonames:
        # keep only the digits of the name, e.g. 'RALIHR_surgeon01_fps01_0071' -> '01010071',
        # then build the Id prefix '<first 2 digits, zero-padded to 3>-<last 4 digits>-'
        video_id = re.sub('[^0-9]', '',  videoname)
        video_id = video_id[0:2].zfill(3) + '-' + video_id[-4:] + '-'
        for frame in range(utils.count_frames(videoname)):
            videonames.append(videoname)
            frames.append(frame)
            # frame numbers in the Id start at 1 and are zero-padded to 5 digits
            Ids.append(video_id + str(frame + 1).zfill(5))
    df = pd.DataFrame({'videoname' : videonames, 'frame' : frames, 'Id': Ids})
    # add number of frames per video as feature
    df['video_num_frames'] = (df.groupby('videoname')['frame'].transform('max') + 1).astype(int)

    # pad the end of each video with placeholder rows (frame 10000, Id 'fake',
    # meant to stand for black images) so that its number of rows is a multiple
    # of BATCH_SIZE; we can then process through the LSTM by batch without mixing
    # videos inside a batch.
    # All padding rows are collected first and appended with a single concat:
    # concatenating once per video recopies the whole frame every time and
    # appends empty frames for videos that need no padding.
    padding_rows = []
    for videoname, video_df in df.groupby('videoname', sort=False, observed=True):
        num_rows_to_add = (BATCH_SIZE - (len(video_df) % BATCH_SIZE)) % BATCH_SIZE
        video_num_frames = video_df['video_num_frames'].iloc[0]
        padding_rows.extend(
            {'videoname': videoname, 'frame': 10000, 'video_num_frames': video_num_frames, 'Id': 'fake'}
            for _ in range(num_rows_to_add)
        )
    if padding_rows:
        df = pd.concat([df, pd.DataFrame(padding_rows)], ignore_index=True)

    # sort rows by video, then by frame (the padding rows, frame 10000, go last);
    # the key combines the surgeon digit (12th character from the end of the name)
    # with the last 3 digits of the video number
    df['sort'] = df['videoname'].str[-12].astype(int) * 10000 + df['videoname'].str[-3:].astype(int)
    df = df.sort_values(['sort', 'frame'], ascending=True).drop(columns='sort').reset_index(drop=True)

    # shuffle batches: view the row positions as (num_batches, BATCH_SIZE) and
    # shuffle along the first axis only, so the order inside a batch is kept
    assert len(df) % BATCH_SIZE == 0, 'every video must be padded to a multiple of BATCH_SIZE'
    batches = np.array(df.index).reshape(-1, BATCH_SIZE)
    np.random.shuffle(batches)
    shuffled_df = df.loc[batches.ravel()].reset_index(drop=True)

    # save df
    shuffled_df.to_pickle(utils.dfs_path + '/testing.pkl')

## 4. Create master training dfs

In [10]:
# Master training dataframes: for each mode, the training and validation
# dataframes stacked into a single one. Both modes are rebuilt as soon as one of
# the two master files is missing.
master_modes = ['temp', 'no_temp']
master_paths = [utils.dfs_path + '/master_training_' + mode + '.pkl' for mode in master_modes]
if not all(os.path.exists(path) for path in master_paths):
    for mode, master_path in zip(master_modes, master_paths):
        training_df = pd.read_pickle(utils.dfs_path + '/training_' + mode + '.pkl')
        validation_df = pd.read_pickle(utils.dfs_path + '/validation_' + mode + '.pkl')
        # NOTE(review): the index of each part is kept as is, so for 'temp' (both
        # parts are saved with a fresh 0..n-1 index) the result holds duplicate
        # index labels -- confirm that consumers select rows by position.
        master_training_df = pd.concat([training_df, validation_df])
        master_training_df.to_pickle(master_path)