# **Analyze sequences**

In [1]:
import os
import cv2
import subprocess
from tqdm.auto import tqdm
import pandas as pd
from IPython.display import Video, display, HTML
import warnings; warnings.simplefilter("ignore")


from StarfishDataSet import BASE_DIR

df = pd.read_csv("/kaggle/input/tensorflow-great-barrier-reef/train.csv")
df['annotations'] = df['annotations'].apply(eval)
df['n_annotations'] = df['annotations'].str.len()
df['has_annotations'] = df['annotations'].str.len() > 0
df['has_2_or_more_annotations'] = df['annotations'].str.len() >= 2
df['doesnt_have_annotations'] = df['annotations'].str.len() == 0
df['image_path'] = BASE_PATH + "video_" + df['video_id'].astype(str) + "/" + df['video_frame'].astype(str) + ".jpg"

In [2]:
df

Unnamed: 0,video_id,sequence,video_frame,sequence_frame,image_id,annotations,n_annotations,has_annotations,has_2_or_more_annotations,doesnt_have_annotations,image_path
0,0,40258,0,0,0-0,[],0,False,False,True,../input/tensorflow-great-barrier-reef/train_i...
1,0,40258,1,1,0-1,[],0,False,False,True,../input/tensorflow-great-barrier-reef/train_i...
2,0,40258,2,2,0-2,[],0,False,False,True,../input/tensorflow-great-barrier-reef/train_i...
3,0,40258,3,3,0-3,[],0,False,False,True,../input/tensorflow-great-barrier-reef/train_i...
4,0,40258,4,4,0-4,[],0,False,False,True,../input/tensorflow-great-barrier-reef/train_i...
...,...,...,...,...,...,...,...,...,...,...,...
23496,2,29859,10755,2983,2-10755,[],0,False,False,True,../input/tensorflow-great-barrier-reef/train_i...
23497,2,29859,10756,2984,2-10756,[],0,False,False,True,../input/tensorflow-great-barrier-reef/train_i...
23498,2,29859,10757,2985,2-10757,[],0,False,False,True,../input/tensorflow-great-barrier-reef/train_i...
23499,2,29859,10758,2986,2-10758,[],0,False,False,True,../input/tensorflow-great-barrier-reef/train_i...


In [3]:
df['sequence'].unique()

array([40258, 45518, 59337,  8399, 45015, 35305, 53708,   996, 60510,
       44160, 15827, 29424,  8503, 18048, 17665, 60754, 37114, 26651,
       22643, 29859])

In [4]:
df['sequence'].nunique()

20

In [5]:
df.groupby("sequence")['video_id'].nunique()

sequence
996      1
8399     1
8503     1
15827    1
17665    1
18048    1
22643    1
26651    1
29424    1
29859    1
35305    1
37114    1
40258    1
44160    1
45015    1
45518    1
53708    1
59337    1
60510    1
60754    1
Name: video_id, dtype: int64

In [6]:
# Videos 0 and 1 have 8 sequences, while video 2 has 4
df.groupby("video_id")['sequence'].nunique()

video_id
0    8
1    8
2    4
Name: sequence, dtype: int64

In [7]:
df_agg = df.groupby(["video_id", 'sequence']).agg({'sequence_frame': 'count', 'has_annotations': 'sum', 'doesnt_have_annotations': 'sum'})\
           .rename(columns={'sequence_frame': 'Total Frames', 'has_annotations': 'Frames with at least 1 object', 'doesnt_have_annotations': "Frames with no object"})
df_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,Total Frames,Frames with at least 1 object,Frames with no object
video_id,sequence,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,996,923,209,714
0,8399,1423,564,859
0,35305,853,80,773
0,40258,480,285,195
0,45015,617,22,595
0,45518,798,123,675
0,53708,1077,704,373
0,59337,537,156,381
1,8503,2843,1100,1743
1,15827,770,74,696


In [8]:
df_agg.sort_values("Total Frames")

Unnamed: 0_level_0,Unnamed: 1_level_0,Total Frames,Frames with at least 1 object,Frames with no object
video_id,sequence,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,18048,71,71,0
1,17665,87,87,0
1,44160,151,0,151
1,29424,184,0,184
0,40258,480,285,195
0,59337,537,156,381
0,45015,617,22,595
1,15827,770,74,696
0,45518,798,123,675
0,35305,853,80,773


In [9]:
df_agg.sort_values("Frames with at least 1 object")

Unnamed: 0_level_0,Unnamed: 1_level_0,Total Frames,Frames with at least 1 object,Frames with no object
video_id,sequence,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,37114,2800,0,2800
1,44160,151,0,151
1,29424,184,0,184
0,45015,617,22,595
2,26651,1525,29,1496
1,18048,71,71,0
2,29859,2988,71,2917
1,15827,770,74,696
0,35305,853,80,773
1,17665,87,87,0


In [10]:
# image_id is a unique identifier for a row
df['image_id'].nunique() == len(df)

True

In [11]:
df_agg.loc[[(0, 40258)]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Total Frames,Frames with at least 1 object,Frames with no object
video_id,sequence,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,40258,480,285,195


In [12]:
pd.set_option("display.max_rows", 500)
df[df['sequence'] == 40258]

Unnamed: 0,video_id,sequence,video_frame,sequence_frame,image_id,annotations,n_annotations,has_annotations,has_2_or_more_annotations,doesnt_have_annotations,image_path
0,0,40258,0,0,0-0,[],0,False,False,True,../input/tensorflow-great-barrier-reef/train_i...
1,0,40258,1,1,0-1,[],0,False,False,True,../input/tensorflow-great-barrier-reef/train_i...
2,0,40258,2,2,0-2,[],0,False,False,True,../input/tensorflow-great-barrier-reef/train_i...
3,0,40258,3,3,0-3,[],0,False,False,True,../input/tensorflow-great-barrier-reef/train_i...
4,0,40258,4,4,0-4,[],0,False,False,True,../input/tensorflow-great-barrier-reef/train_i...
5,0,40258,5,5,0-5,[],0,False,False,True,../input/tensorflow-great-barrier-reef/train_i...
6,0,40258,6,6,0-6,[],0,False,False,True,../input/tensorflow-great-barrier-reef/train_i...
7,0,40258,7,7,0-7,[],0,False,False,True,../input/tensorflow-great-barrier-reef/train_i...
8,0,40258,8,8,0-8,[],0,False,False,True,../input/tensorflow-great-barrier-reef/train_i...
9,0,40258,9,9,0-9,[],0,False,False,True,../input/tensorflow-great-barrier-reef/train_i...


In [13]:
df['start_cut_here'] = df['has_annotations'] & df['doesnt_have_annotations'].shift(1)  & df['doesnt_have_annotations'].shift(2)
df['end_cut_here'] = df['doesnt_have_annotations'] & df['has_annotations'].shift(1)  & df['has_annotations'].shift(2)
df['sequence_change'] = df['sequence'] != df['sequence'].shift(1)
df['last_row'] =  df.index == len(df)-1
df['cut_here'] = df['start_cut_here'] | df['end_cut_here'] | df['sequence_change'] | df['last_row']

In [14]:
start_idx = 0
for subsequence_id, end_idx in enumerate(df[df['cut_here']].index):
    df.loc[start_idx:end_idx, 'subsequence_id'] = subsequence_id
    start_idx = end_idx

In [15]:
df['subsequence_id'] = df['subsequence_id'].astype(int)

In [16]:
df['subsequence_id'].nunique()

137

In [17]:
drop_cols = ['start_cut_here', 'end_cut_here', 'sequence_change', 'last_row', 'cut_here', 'has_2_or_more_annotations', 'doesnt_have_annotations']
df = df.drop(drop_cols, axis=1)
df.head()

Unnamed: 0,video_id,sequence,video_frame,sequence_frame,image_id,annotations,n_annotations,has_annotations,image_path,subsequence_id
0,0,40258,0,0,0-0,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,1
1,0,40258,1,1,0-1,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,1
2,0,40258,2,2,0-2,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,1
3,0,40258,3,3,0-3,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,1
4,0,40258,4,4,0-4,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,1


In [18]:
df.groupby("subsequence_id")['has_annotations'].mean().round(2).sort_values().value_counts()

0.00    73
1.00    62
0.80     1
0.97     1
Name: has_annotations, dtype: int64

In [19]:
df_subseq_agg = df.groupby("subsequence_id")['has_annotations'].mean()
df_subseq_agg[~df_subseq_agg.isin([0, 1])]

subsequence_id
52    0.972973
53    0.800000
Name: has_annotations, dtype: float64

In [20]:
df[df['subsequence_id'] == 52]

Unnamed: 0,video_id,sequence,video_frame,sequence_frame,image_id,annotations,n_annotations,has_annotations,image_path,subsequence_id
5582,0,53708,9759,874,0-9759,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,52
5583,0,53708,9760,875,0-9760,"[{'x': 919, 'y': 319, 'width': 36, 'height': 31}]",1,True,../input/tensorflow-great-barrier-reef/train_i...,52
5584,0,53708,9761,876,0-9761,"[{'x': 919, 'y': 325, 'width': 36, 'height': 31}]",1,True,../input/tensorflow-great-barrier-reef/train_i...,52
5585,0,53708,9762,877,0-9762,"[{'x': 919, 'y': 332, 'width': 36, 'height': 31}]",1,True,../input/tensorflow-great-barrier-reef/train_i...,52
5586,0,53708,9763,878,0-9763,"[{'x': 917, 'y': 333, 'width': 42, 'height': 35}]",1,True,../input/tensorflow-great-barrier-reef/train_i...,52
5587,0,53708,9764,879,0-9764,"[{'x': 917, 'y': 337, 'width': 42, 'height': 35}]",1,True,../input/tensorflow-great-barrier-reef/train_i...,52
5588,0,53708,9765,880,0-9765,"[{'x': 917, 'y': 341, 'width': 42, 'height': 35}]",1,True,../input/tensorflow-great-barrier-reef/train_i...,52
5589,0,53708,9766,881,0-9766,"[{'x': 917, 'y': 351, 'width': 42, 'height': 35}]",1,True,../input/tensorflow-great-barrier-reef/train_i...,52
5590,0,53708,9767,882,0-9767,"[{'x': 917, 'y': 361, 'width': 42, 'height': 35}]",1,True,../input/tensorflow-great-barrier-reef/train_i...,52
5591,0,53708,9768,883,0-9768,"[{'x': 917, 'y': 371, 'width': 42, 'height': 35}]",1,True,../input/tensorflow-great-barrier-reef/train_i...,52


In [21]:
df[df['subsequence_id'] == 53]

Unnamed: 0,video_id,sequence,video_frame,sequence_frame,image_id,annotations,n_annotations,has_annotations,image_path,subsequence_id
5619,0,53708,9796,911,0-9796,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,53
5620,0,53708,9797,912,0-9797,"[{'x': 482, 'y': 616, 'width': 69, 'height': 57}]",1,True,../input/tensorflow-great-barrier-reef/train_i...,53
5621,0,53708,9798,913,0-9798,"[{'x': 480, 'y': 636, 'width': 69, 'height': 57}]",1,True,../input/tensorflow-great-barrier-reef/train_i...,53
5622,0,53708,9799,914,0-9799,"[{'x': 478, 'y': 657, 'width': 69, 'height': 57}]",1,True,../input/tensorflow-great-barrier-reef/train_i...,53
5623,0,53708,9800,915,0-9800,"[{'x': 478, 'y': 684, 'width': 69, 'height': 35}]",1,True,../input/tensorflow-great-barrier-reef/train_i...,53


In [22]:
df[df['subsequence_id'] == 54]

Unnamed: 0,video_id,sequence,video_frame,sequence_frame,image_id,annotations,n_annotations,has_annotations,image_path,subsequence_id
5624,0,53708,9801,916,0-9801,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,54
5625,0,53708,9802,917,0-9802,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,54
5626,0,53708,9803,918,0-9803,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,54
5627,0,53708,9804,919,0-9804,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,54
5628,0,53708,9805,920,0-9805,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,54
5629,0,53708,9806,921,0-9806,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,54
5630,0,53708,9807,922,0-9807,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,54
5631,0,53708,9808,923,0-9808,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,54
5632,0,53708,9809,924,0-9809,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,54
5633,0,53708,9810,925,0-9810,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,54


# Let's see how a sequence and a subsequence look like as videos!!

In [23]:
! mkdir videos/

In [24]:
def load_image(img_path):
    assert os.path.exists(img_path), f'{img_path} does not exist.'
    img = cv2.imread(img_path)
    return img

def load_image_with_annotations(img_path, annotations):
    img = load_image(img_path)
    if len(annotations) > 0:
        for ann in annotations:
            cv2.rectangle(img, (ann['x'], ann['y']),
                (ann['x'] + ann['width'], ann['y'] + ann['height']),
                (255, 255, 0), thickness=2,)
    return img

def make_video(df, part_id, is_subsequence=False):
    """
    Args:
        - part_id: either a sequence or a subsequence id
    """
    
    if is_subsequence:
        part_str = "subsequence_id"
    else:
        part_str = "sequence"
    
    print(f"Creating video for part={part_id}, is_subsequence={is_subsequence} (querying by {part_str})")
    # partly borrowed from https://github.com/RobMulla/helmet-assignment/blob/main/helmet_assignment/video.py
    fps = 15 # don't know exact value
    width = 1280
    height = 720
    save_path = f'videos/video_{part_str}_{part_id}.mp4'
    tmp_path = f'videos/tmp_video_{part_str}_{part_id}.mp4'
    
    
    output_video = cv2.VideoWriter(tmp_path, cv2.VideoWriter_fourcc(*"MP4V"), fps, (width, height))
    
    df_part = df.query(f'{part_str} == @part_id')
    for _, row in tqdm(df_part.iterrows(), total=len(df_part)):
        img = load_image_with_annotations(row.image_path, row.annotations)
        output_video.write(img)
    
    output_video.release()
    # Not all browsers support the codec, we will re-load the file at tmp_output_path
    # and convert to a codec that is more broadly readable using ffmpeg
    if os.path.exists(save_path):
        os.remove(save_path)
    subprocess.run(
        ["ffmpeg", "-i", tmp_path, "-crf", "18", "-preset", "veryfast", "-vcodec", "libx264", save_path],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL
    )
    os.remove(tmp_path)
    print(f"Finished creating video for {part_id}... saved as {save_path}")
    return save_path

In [25]:
video_path = make_video(df, 40258)

Creating video for part=40258, is_subsequence=False (querying by sequence)


OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'


  0%|          | 0/480 [00:00<?, ?it/s]

Finished creating video for 40258... saved as videos/video_sequence_40258.mp4


In [26]:
Video(video_path, width= 1280/2, height= 720/2)

In [27]:
subsequences = df.loc[df['sequence'] == 40258, 'subsequence_id'].unique()
subsequences

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [28]:
for subsequence in subsequences:
    video_path = make_video(df, subsequence, is_subsequence=True)
    display(HTML(f"<h2>Subsequence ID: {subsequence}</h2>"))
    display(Video(video_path, width= 1280/2, height= 720/2))

Creating video for part=1, is_subsequence=True (querying by subsequence_id)


OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'


  0%|          | 0/16 [00:00<?, ?it/s]

Finished creating video for 1... saved as videos/video_subsequence_id_1.mp4


Creating video for part=2, is_subsequence=True (querying by subsequence_id)


OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'


  0%|          | 0/164 [00:00<?, ?it/s]

Finished creating video for 2... saved as videos/video_subsequence_id_2.mp4


Creating video for part=3, is_subsequence=True (querying by subsequence_id)


OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'


  0%|          | 0/52 [00:00<?, ?it/s]

Finished creating video for 3... saved as videos/video_subsequence_id_3.mp4


Creating video for part=4, is_subsequence=True (querying by subsequence_id)


OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'


  0%|          | 0/58 [00:00<?, ?it/s]

Finished creating video for 4... saved as videos/video_subsequence_id_4.mp4


Creating video for part=5, is_subsequence=True (querying by subsequence_id)


OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'


  0%|          | 0/35 [00:00<?, ?it/s]

Finished creating video for 5... saved as videos/video_subsequence_id_5.mp4


Creating video for part=6, is_subsequence=True (querying by subsequence_id)


OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'


  0%|          | 0/50 [00:00<?, ?it/s]

Finished creating video for 6... saved as videos/video_subsequence_id_6.mp4


Creating video for part=7, is_subsequence=True (querying by subsequence_id)


OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'


  0%|          | 0/11 [00:00<?, ?it/s]

Finished creating video for 7... saved as videos/video_subsequence_id_7.mp4


Creating video for part=8, is_subsequence=True (querying by subsequence_id)


OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'


  0%|          | 0/13 [00:00<?, ?it/s]

Finished creating video for 8... saved as videos/video_subsequence_id_8.mp4


Creating video for part=9, is_subsequence=True (querying by subsequence_id)


OpenCV: FFMPEG: tag 0x5634504d/'MP4V' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'


  0%|          | 0/81 [00:00<?, ?it/s]

Finished creating video for 9... saved as videos/video_subsequence_id_9.mp4


# Generate some common splits based on subsequences¶

In [29]:
from sklearn.model_selection import train_test_split, StratifiedKFold
df.head()

Unnamed: 0,video_id,sequence,video_frame,sequence_frame,image_id,annotations,n_annotations,has_annotations,image_path,subsequence_id
0,0,40258,0,0,0-0,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,1
1,0,40258,1,1,0-1,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,1
2,0,40258,2,2,0-2,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,1
3,0,40258,3,3,0-3,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,1
4,0,40258,4,4,0-4,[],0,False,../input/tensorflow-great-barrier-reef/train_i...,1


In [30]:
df_split  = df.groupby("subsequence_id").agg({'has_annotations': 'max', 'video_frame': 'count'}).astype(int).reset_index()
df_split.head()

Unnamed: 0,subsequence_id,has_annotations,video_frame
0,1,0,16
1,2,1,164
2,3,0,52
3,4,1,58
4,5,0,35


# Train-validation splits for 1%, 5%, 10% and 20%

In [31]:
!mkdir train-validation-split/

In [32]:
def analize_split(df_train, df_val, df):
     # Analize results
    print(f"   Train images                 : {len(df_train) / len(df):.3f}")
    print(f"   Val   images                 : {len(df_val) / len(df):.3f}")
    print()
    print(f"   Train images with annotations: {len(df_train[df_train['has_annotations']]) / len(df[df['has_annotations']]):.3f}")
    print(f"   Val   images with annotations: {len(df_val[df_val['has_annotations']]) / len(df[df['has_annotations']]):.3f}")
    print()
    print(f"   Train images w/no annotations: {len(df_train[~df_train['has_annotations']]) / len(df[~df['has_annotations']]):.3f}")
    print(f"   Val   images w/no annotations: {len(df_val[~df_val['has_annotations']]) / len(df[~df['has_annotations']]):.3f}")
    print()
    print(f"   Train mean annotations       : {df_train['n_annotations'].mean():.3f}")
    print(f"   Val   mean annotations       : {df_val['n_annotations'].mean():.3f}")
    
    print()

In [33]:
for test_size in [0.01, 0.05, 0.1, 0.2]:
    print(f"Generating train-validation split with {test_size*100}% validation")
    df_train_idx, df_val_idx = train_test_split(df_split['subsequence_id'], stratify=df_split["has_annotations"], test_size=test_size, random_state=42)
    df['is_train'] = df['subsequence_id'].isin(df_train_idx)
    df_train, df_val = df[df['is_train']], df[~df['is_train']]
    
    # Print some statistics
    analize_split(df_train, df_val, df)
    
    # Save to file
    f_name = f"train-validation-split/train-{test_size}.csv"
    print(f"Saving file to {f_name}")
    df.to_csv(f_name, index=False)
    print()

Generating train-validation split with 1.0% validation
   Train images                 : 0.996
   Val   images                 : 0.004

   Train images with annotations: 0.989
   Val   images with annotations: 0.011

   Train images w/no annotations: 0.998
   Val   images w/no annotations: 0.002

   Train mean annotations       : 0.505
   Val   mean annotations       : 0.723

Saving file to train-validation-split/train-0.01.csv

Generating train-validation split with 5.0% validation
   Train images                 : 0.946
   Val   images                 : 0.054

   Train images with annotations: 0.966
   Val   images with annotations: 0.034

   Train images w/no annotations: 0.941
   Val   images w/no annotations: 0.059

   Train mean annotations       : 0.527
   Val   mean annotations       : 0.150

Saving file to train-validation-split/train-0.05.csv

Generating train-validation split with 10.0% validation
   Train images                 : 0.918
   Val   images                 : 0.08

In [34]:
!ls -l train-validation-split/

total 12596
-rw-r--r-- 1 root root 3220958 Dec 14 11:07 train-0.01.csv
-rw-r--r-- 1 root root 3222127 Dec 14 11:07 train-0.05.csv
-rw-r--r-- 1 root root 3222782 Dec 14 11:07 train-0.1.csv
-rw-r--r-- 1 root root 3224772 Dec 14 11:07 train-0.2.csv


# Create 5-folds cross validation

In [35]:
df = df.drop("is_train", axis=1)

In [36]:
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2021)
for fold_id, (_, val_idx) in enumerate(kf.split(df_split['subsequence_id'], y=df_split["has_annotations"])):
    subseq_val_idx = df_split['subsequence_id'].iloc[val_idx]
    df.loc[df['subsequence_id'].isin(subseq_val_idx), 'fold'] = fold_id
    
df['fold'] = df['fold'].astype(int)
df['fold'].value_counts(dropna=False)

4    7680
0    5223
1    4030
3    3969
2    2599
Name: fold, dtype: int64

In [37]:
for fold_id in df['fold'].sort_values().unique():
    print("=============================")
    print(f"Analyzing fold {fold_id}")
    df_train, df_val = df[df['fold'] != fold_id], df[df['fold'] == fold_id]
    analize_split(df_train, df_val, df)
    print()

Analyzing fold 0
   Train images                 : 0.778
   Val   images                 : 0.222

   Train images with annotations: 0.828
   Val   images with annotations: 0.172

   Train images w/no annotations: 0.764
   Val   images w/no annotations: 0.236

   Train mean annotations       : 0.578
   Val   mean annotations       : 0.257


Analyzing fold 1
   Train images                 : 0.829
   Val   images                 : 0.171

   Train images with annotations: 0.820
   Val   images with annotations: 0.180

   Train images w/no annotations: 0.831
   Val   images w/no annotations: 0.169

   Train mean annotations       : 0.544
   Val   mean annotations       : 0.326


Analyzing fold 2
   Train images                 : 0.889
   Val   images                 : 0.111

   Train images with annotations: 0.822
   Val   images with annotations: 0.178

   Train images w/no annotations: 0.907
   Val   images w/no annotations: 0.093

   Train mean annotations       : 0.502
   Val   mean an

In [38]:
!mkdir cross-validation/

In [39]:
df.to_csv("cross-validation/train-5folds.csv", index=False)

# Create 10-fold cross validation

In [40]:
n_splits = 10
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2021)
for fold_id, (_, val_idx) in enumerate(kf.split(df_split['subsequence_id'], y=df_split["has_annotations"])):
    subseq_val_idx = df_split['subsequence_id'].iloc[val_idx]
    df.loc[df['subsequence_id'].isin(subseq_val_idx), 'fold'] = fold_id
    
df['fold'] = df['fold'].astype(int)
df['fold'].value_counts(dropna=False)

8    5911
0    3250
2    2410
7    2039
1    1987
6    1841
9    1762
5    1610
3    1570
4    1121
Name: fold, dtype: int64

In [41]:
for fold_id in df['fold'].sort_values().unique():
    print("=============================")
    print(f"Analyzing fold {fold_id}")
    df_train, df_val = df[df['fold'] != fold_id], df[df['fold'] == fold_id]
    analize_split(df_train, df_val, df)
    print()

Analyzing fold 0
   Train images                 : 0.862
   Val   images                 : 0.138

   Train images with annotations: 0.937
   Val   images with annotations: 0.063

   Train images w/no annotations: 0.842
   Val   images w/no annotations: 0.158

   Train mean annotations       : 0.557
   Val   mean annotations       : 0.192


Analyzing fold 1
   Train images                 : 0.915
   Val   images                 : 0.085

   Train images with annotations: 0.894
   Val   images with annotations: 0.106

   Train images w/no annotations: 0.921
   Val   images w/no annotations: 0.079

   Train mean annotations       : 0.520
   Val   mean annotations       : 0.354


Analyzing fold 2
   Train images                 : 0.897
   Val   images                 : 0.103

   Train images with annotations: 0.922
   Val   images with annotations: 0.078

   Train images w/no annotations: 0.891
   Val   images w/no annotations: 0.109

   Train mean annotations       : 0.532
   Val   mean an

In [42]:
df.to_csv("D:/79381/Downloads/train-10folds.csv", index=False)