# Tranin Test split

## Find non-standard volume tracings

In [1]:
import pandas as pd
import numpy as np

tracing_df = pd.read_csv('../../dataset/EchoNet/VolumeTracings.csv')
tracing_df.head()

Unnamed: 0,FileName,X1,Y1,X2,Y2,Frame
0,0X100009310A3BD7FC.avi,51.260417,15.348958,64.932292,69.125,46
1,0X100009310A3BD7FC.avi,50.037611,17.167841,53.367222,16.32133,46
2,0X100009310A3BD7FC.avi,49.157378,20.407629,57.090549,18.390722,46
3,0X100009310A3BD7FC.avi,48.538173,23.581055,59.997339,20.667707,46
4,0X100009310A3BD7FC.avi,47.918968,26.75448,62.904129,22.944693,46


In [5]:
file_names = tracing_df['FileName'].unique()
print('Number of files:', len(file_names))
print(file_names)

Number of files: 10025
['0X100009310A3BD7FC.avi' '0X1002E8FBACD08477.avi'
 '0X1005D03EED19C65B.avi' ... '0XFE6E32991136338.avi'
 '0XFE83FF3D3B13C3A.avi' '0XFEBEEFF93F6FEB9.avi']


In [13]:
from tqdm import tqdm

tracing_cnt = dict()
for name in tqdm(file_names):
    frames_cnt = dict()
    tracing_data = tracing_df[tracing_df['FileName'] == name]
    frames = tracing_data['Frame'].values
    for f in frames:
        if f not in frames_cnt:
            frames_cnt[f] = 1
        else:
            frames_cnt[f] += 1
    tracing_cnt[name] = frames_cnt

100%|██████████| 10025/10025 [03:30<00:00, 47.69it/s]


In [14]:
tracing_cnt

{'0X100009310A3BD7FC.avi': {46: 21, 61: 21},
 '0X1002E8FBACD08477.avi': {3: 21, 18: 21},
 '0X1005D03EED19C65B.avi': {24: 21, 35: 21},
 '0X10075961BC11C88E.avi': {91: 21, 108: 21},
 '0X10094BA0A028EAC3.avi': {137: 21, 156: 21},
 '0X100CF05D141FF143.avi': {132: 21, 148: 21},
 '0X100E3B8D3280BEC5.avi': {25: 21, 38: 21},
 '0X100E491B3CD58DE2.avi': {49: 21, 75: 21},
 '0X100F044876B98F90.avi': {56: 21, 72: 21},
 '0X101026B90DAE7E95.avi': {45: 21, 62: 21},
 '0X1012703CDC1436FE.avi': {154: 21, 172: 21},
 '0X1013E8A4864781B.avi': {35: 21, 46: 21},
 '0X1018521A3BC5CDBA.avi': {42: 21, 57: 21},
 '0X101C388397F66EDB.avi': {46: 21, 62: 21},
 '0X101CFC9C5351DCBE.avi': {59: 21, 73: 21},
 '0X101E654AF3FC07A8.avi': {109: 21, 132: 21},
 '0X10267ADF2E644E0.avi': {150: 21, 172: 21},
 '0X102AE9C68B2C46DA.avi': {48: 21, 63: 21},
 '0X102C51641C321436.avi': {0: 21, 15: 21},
 '0X102CFB07F752AAE6.avi': {163: 21, 184: 21},
 '0X102DD1B9BE03716.avi': {52: 21, 71: 21},
 '0X102E453603FA1440.avi': {46: 21, 60: 21},
 '

In [20]:
non_standard_tracings = []
for name, cnt in tqdm(tracing_cnt.items()):
    cnt_values = list(cnt.values())
    if cnt_values[0] == 21 and cnt_values[1] == 21:
        continue
    non_standard_tracings.append(name)
len(non_standard_tracings)

100%|██████████| 10025/10025 [00:00<00:00, 529884.16it/s]


93

## Find files without volume tracing

In [21]:
tot_files_df = pd.read_csv('../../dataset/EchoNet/FileList.csv')
tot_files_df.head()

Unnamed: 0,FileName,EF,ESV,EDV,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split
0,0X100009310A3BD7FC,78.498406,14.881368,69.210534,112,112,50,174,VAL
1,0X1002E8FBACD08477,59.101988,40.383876,98.742884,112,112,50,215,TRAIN
2,0X1005D03EED19C65B,62.363798,14.267784,37.909734,112,112,50,104,TRAIN
3,0X10075961BC11C88E,54.545097,33.143084,72.91421,112,112,55,122,TRAIN
4,0X10094BA0A028EAC3,24.887742,127.581945,169.855024,112,112,52,207,VAL


In [22]:
tot_file_names = tot_files_df['FileName'].values
len(tot_file_names)

10030

In [29]:
no_volume_tracing = list()
for name in tot_file_names:
    n = f'{name}.avi'
    if n not in tracing_cnt:
        no_volume_tracing.append(n)
len(no_volume_tracing)

6

In [33]:
# Check if the files without volume tracing are non-standard

for n in no_volume_tracing:
    if n in non_standard_tracings:
        print(n)

## Data Split

In [51]:
no_training_data = non_standard_tracings + no_volume_tracing
for i in range(len(no_training_data)):
    no_training_data[i] = no_training_data[i].replace('.avi', '')
no_training_data = np.array(no_training_data)
len(no_training_data), no_training_data[:5]

(99,
 array(['0X12430512E2BBCD55', '0X13D1459C51B5C32E', '0X183FA1CAAEA9C545',
        '0X1CF4B07994B62DBB', '0X1E12EEE43FD913E5'], dtype='<U18'))

In [52]:
total_names = list(tot_file_names)
for name in no_training_data:
    if name in total_names:
        total_names.remove(name)
total_names = np.array(total_names)
len(total_names)

9931

In [53]:
train_cnt = int(len(tot_file_names) * 0.6)
val_cnt = int(len(tot_file_names) * 0.2)
test_cnt = len(tot_file_names) - (train_cnt + val_cnt)
print(train_cnt, val_cnt, test_cnt, train_cnt + val_cnt + test_cnt)

6018 2006 2006 10030


In [56]:
# Shuffle the data
import random

random.seed(42)
random.shuffle(total_names)

train_files = total_names[:train_cnt]
val_files = total_names[train_cnt:train_cnt + val_cnt]
test_files = total_names[train_cnt + val_cnt:]
test_files = np.concatenate((test_files, no_training_data)) # Add the files without volume tracing to the test set
len(train_files), len(val_files), len(test_files)

(6018, 2006, 2006)

In [57]:
train_files[:5], val_files[:5], test_files[:5]

(array(['0X650881B7ADEF9668', '0X23B1641319A2AFA2', '0X21780197024E9D79',
        '0X304FEB38430F6379', '0X7FE48E5475ADE2E3'], dtype='<U18'),
 array(['0X348C4CEA595CD540', '0X52D27E1752D12960', '0X79DFA3F8C5BD03E1',
        '0X191F4BD1E4262B78', '0X76F1860F9A3BA0D'], dtype='<U18'),
 array(['0X65753CA4ABFD5B4D', '0X33D4E36501CE4BB3', '0X20A77232EFCBBA15',
        '0X1A0A263B22CCD966', '0X72BA22CA9CD1AEDD'], dtype='<U18'))

## Create symbolic links to video files

In [63]:
import os

os.makedirs('../../dataset/EchoNet/Train', exist_ok=True)
for name in train_files:
    os.symlink(f'../Videos/{name}.avi', f'../../dataset/EchoNet/Train/{name}.avi')
os.makedirs('../../dataset/EchoNet/Val', exist_ok=True)
for name in val_files:
    os.symlink(f'../Videos/{name}.avi', f'../../dataset/EchoNet/Val/{name}.avi')
os.makedirs('../../dataset/EchoNet/Test', exist_ok=True)
for name in test_files:
    os.symlink(f'..//Videos/{name}.avi', f'../../dataset/EchoNet/Test/{name}.avi')

## Save split data as a csv file

In [65]:
split_data = list()
for name in train_files:
    split_data.append((name, 'Train'))
for name in val_files:
    split_data.append((name, 'Val'))
for name in test_files:
    split_data.append((name, 'Test'))

split_df = pd.DataFrame(split_data, columns=['FileName', 'Split'])
split_df.head(), split_df.tail()

(             FileName  Split
 0  0X650881B7ADEF9668  Train
 1  0X23B1641319A2AFA2  Train
 2  0X21780197024E9D79  Train
 3  0X304FEB38430F6379  Train
 4  0X7FE48E5475ADE2E3  Train,
                  FileName Split
 10025  0X234005774F4CB5CD  Test
 10026  0X2DC68261CBCC04AE  Test
 10027  0X35291BE9AB90FB89  Test
 10028  0X6C435C1B417FDE8A  Test
 10029  0X5515B0BD077BE68A  Test)

In [66]:
split_df.to_csv('../../dataset/EchoNet/FileInfo.csv', index=False)