In [2]:
import pandas as pd

## Get annotations from roboflow in the right format
Reasons for this notebook:

Roboflow writes blank lines for tiles that were marked null (i.e. tiles that have no annotations on them). These blank lines cause errors when running the anchor-optimization scripts and when training the model. Images without annotations should instead have the format:
`/path/to/image.jpg,,,,,` (see the [keras-retinanet CSV dataset documentation](https://github.com/fizyr/keras-retinanet#csv-datasets)). The code below removes these blank lines.

Additionally, Roboflow limits the number of images per free project to 10,000 so we used two projects. The first project is stored in `roboflow2` and the second project is stored in `roboflow_split5`. The below code concatenates the csv files.

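Finally, some accidental annotations were made. These annotations are only a pixel or so wide/tall; the code below removes every box that is less than 10 pixels wide or less than 10 pixels tall (see the EDA section further down for how this cutoff was chosen).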

In [12]:
# Raw annotation CSVs for the train / validation / test splits.
train_path, val_path, test_path = (
    "../Data/image-level-split/train/train_anno.csv",
    "../Data/image-level-split/valid/valid_anno.csv",
    "../Data/image-level-split/test/test_anno.csv",
)

In [15]:
# Annotation CSVs under `roboflow_split5` (currently disabled, so nothing is assigned here).
#train_path2 = "../roboflow_split5/train/_annotations.csv"
#val_path2 = "../roboflow_split5/valid/_annotations.csv"
#test_path2 = "../roboflow_split5/test/_annotations.csv"

In [7]:
def process_csv(input_path, output_path, min_box_size=10):
    """Filter out tiny bounding boxes from a header-less annotation CSV.

    The file at ``input_path`` is read without a header row, its positional
    columns are labelled, and every row whose box is narrower or shorter than
    ``min_box_size`` pixels is dropped. The remaining rows are written to
    ``output_path`` without header or index.

    Parameters
    ----------
    input_path : str or path-like
        Annotation CSV with columns in the order
        image, x1, y1, x2, y2, class[, source image].
    output_path : str or path-like
        Destination of the filtered CSV (no header, no index column).
    min_box_size : int or float, default 10
        Minimum width and height, in pixels, a box must have to be kept.
        See the EDA section of this notebook for how 10 was chosen.

    Returns
    -------
    pandas.DataFrame
        The filtered annotations, keeping the row index of the input file.
    """
    # pd.read_csv skips completely blank lines by default (skip_blank_lines=True),
    # which is what removes the blank lines from the file.
    annotations = pd.read_csv(input_path, header=None)
    print("dataframe 1 shape", annotations.shape)

    # Label the positional columns. Column 6 gets its own name: labelling it
    # "image" as well would duplicate column 0's label, making
    # annotations["image"] return a two-column DataFrame instead of a Series.
    annotations = annotations.rename(
        columns={
            0: "image",
            1: "x1",
            2: "y1",
            3: "x2",
            4: "y2",
            5: "class",
            6: "source_image",
        }
    )

    # Keep only boxes that are at least min_box_size pixels in both dimensions.
    # Rows with missing coordinates compare as False and are dropped as well.
    box_width = (annotations["x1"] - annotations["x2"]).abs()
    box_height = (annotations["y1"] - annotations["y2"]).abs()
    annotations = annotations[(box_width >= min_box_size) & (box_height >= min_box_size)]

    # Save to file and return the dataframe.
    annotations.to_csv(output_path, index=False, header=False)
    return annotations

In [13]:
# Clean each split's annotation file and keep the filtered frames for the EDA below.
train = process_csv(
    input_path=train_path,
    output_path='../Data/image-level-split/train/annotations_final.csv',
)
val = process_csv(
    input_path=val_path,
    output_path='../Data/image-level-split/valid/annotations_final.csv',
)
test = process_csv(
    input_path=test_path,
    output_path='../Data/image-level-split/test/annotations_final.csv',
)

dataframe 1 shape (20242, 7)
dataframe 1 shape (2594, 7)
dataframe 1 shape (2616, 7)


In [29]:
# Display the filtered training annotations returned by process_csv.
train

Unnamed: 0,image,x1,y1,x2,y2,class,image.1
0,5MSL3787-38_png.rf.001a36c03801da4c7a9cdcad71b...,21,70,60,108,pup,5MSL3787
1,5MSL3787-38_png.rf.001a36c03801da4c7a9cdcad71b...,122,99,162,155,pup,5MSL3787
2,5MSL3787-38_png.rf.001a36c03801da4c7a9cdcad71b...,227,141,272,191,pup,5MSL3787
3,5MSL3787-38_png.rf.001a36c03801da4c7a9cdcad71b...,188,231,219,286,pup,5MSL3787
4,5MSL3787-38_png.rf.001a36c03801da4c7a9cdcad71b...,221,252,258,304,pup,5MSL3787
...,...,...,...,...,...,...,...
20237,5MSL3766-2_png.rf.5d5b6ec4a60deb36e95a419e2367...,272,128,336,244,bull,5MSL3766
20238,5MSL3766-2_png.rf.5d5b6ec4a60deb36e95a419e2367...,38,65,79,92,pup,5MSL3766
20239,5MSL3762-55_png.rf.5db7bcf7697d044ecad8383edd7...,110,135,208,237,bull,5MSL3762
20240,5MSL3762-55_png.rf.5db7bcf7697d044ecad8383edd7...,103,383,242,453,bull,5MSL3762


### EDA to determine smallest legitimate bounding box

Here we get a sense of the size of bounding boxes using the annotations in `roboflow2/train` which contains the bulk of our training images.

In [24]:
# Load the raw training annotations (the file has no header row) for the box-size EDA.
# NOTE(review): `train_path1` is not assigned in any cell visible in this notebook --
# presumably it came from a cell that has since been removed. Confirm it is defined
# before this cell; otherwise this line raises NameError on a fresh kernel.
df_train1 = pd.read_csv(train_path1, header=None)
# Label the positional columns. Columns 0 and 6 both receive the label "image",
# so the frame ends up with a duplicated column name.
df_train1.rename(columns={0:"image", 1:"x1", 2:"y1", 3:"x2", 4:"y2", 5:"class", 6:"image"}, inplace=True)
df_train1

Unnamed: 0,image,x1,y1,x2,y2,class,image.1
0,5MSL3787-38_png.rf.001a36c03801da4c7a9cdcad71b...,21,70,60,108,pup,5MSL3787
1,5MSL3787-38_png.rf.001a36c03801da4c7a9cdcad71b...,122,99,162,155,pup,5MSL3787
2,5MSL3787-38_png.rf.001a36c03801da4c7a9cdcad71b...,227,141,272,191,pup,5MSL3787
3,5MSL3787-38_png.rf.001a36c03801da4c7a9cdcad71b...,188,231,219,286,pup,5MSL3787
4,5MSL3787-38_png.rf.001a36c03801da4c7a9cdcad71b...,221,252,258,304,pup,5MSL3787
...,...,...,...,...,...,...,...
20237,5MSL3766-2_png.rf.5d5b6ec4a60deb36e95a419e2367...,272,128,336,244,bull,5MSL3766
20238,5MSL3766-2_png.rf.5d5b6ec4a60deb36e95a419e2367...,38,65,79,92,pup,5MSL3766
20239,5MSL3762-55_png.rf.5db7bcf7697d044ecad8383edd7...,110,135,208,237,bull,5MSL3762
20240,5MSL3762-55_png.rf.5db7bcf7697d044ecad8383edd7...,103,383,242,453,bull,5MSL3762


In [25]:
# Absolute box extents in pixels, stored as new columns on the EDA frame.
df_train1['width'] = df_train1['x1'].sub(df_train1['x2']).abs()
df_train1['height'] = df_train1['y1'].sub(df_train1['y2']).abs()

In [26]:
# The 20 narrowest boxes (sort_values is ascending by default).
df_train1.sort_values(by='width').head(20)

Unnamed: 0,image,x1,y1,x2,y2,class,image.1,width,height
7951,5MSL0068-70_png.rf.a34d53c55d2f29770aae773b74e...,332,419,332,420,cow,5MSL0068,0,1
5642,5MSL3592-19_png.rf.618edbe6d00b0c2edc9e544f635...,283,42,283,43,pup,5MSL3592,0,1
9346,5MSL3517-55_png.rf.b29b469a63c1342efb230aa8b6e...,263,266,263,267,pup,5MSL3517,0,1
886,5MSL4210-125_png.rf.09cc8cabe1d85d0ba993871d88...,214,86,214,87,cow,5MSL4210,0,1
6183,5MSL3748-9_png.rf.67e7afa2e9fafdb9e3e64e7da316...,149,395,149,395,pup,5MSL3748,0,0
4744,5MSL3439-113_png.rf.da0acb1f1cc6d4270675764446...,95,171,95,171,pup,5MSL3439,0,0
9359,5MSL3770-139_png.rf.b2981b7f38b08cc099392476e6...,373,332,373,332,cow,5MSL3770,0,0
12402,5MSL3507-165_png.rf.f799cb548f6f4de8de4bb421d1...,379,172,380,173,pup,5MSL3507,1,1
12739,5MSL3779-141_png.rf.fbbaa46b6a552d2f9d3e4518e5...,389,246,390,246,pup,5MSL3779,1,0
5243,5MSL3519-176_png.rf.e0ac82b1101734d4ae164a5aaf...,250,89,251,90,pup,5MSL3519,1,1


In [10]:
# The 20 shortest boxes (sort_values is ascending by default).
df_train1.sort_values(by='height').head(20)

Unnamed: 0,image,x1,y1,x2,y2,class,width,height
13114,5MSL0019-0_png.rf.120741cd963b30d002adb1d4b882...,452,354,452,354,cow,0,0
9024,5MSL3576-66_png.rf.a5a3531a0b32b2637c379ec32e8...,410,417,413,417,pup,3,0
4991,5MSL3748-9_png.rf.67e7afa2e9fafdb9e3e64e7da316...,149,395,149,395,pup,0,0
8198,5MSL3817-4_png.rf.99ebc1502f240664509650db0242...,403,198,404,198,pup,1,0
9933,5MSL3439-113_png.rf.da0acb1f1cc6d4270675764446...,95,171,95,171,pup,0,0
18599,5MSL3779-141_png.rf.fbbaa46b6a552d2f9d3e4518e5...,389,246,390,246,pup,1,0
15673,5MSL3770-139_png.rf.b2981b7f38b08cc099392476e6...,373,332,373,332,cow,0,0
7674,5MSL3779-94_png.rf.92918e157f8ebcb950fc9e5dce9...,260,345,261,345,pup,1,0
13657,5MSL3749-174_png.rf.1ade9f0908936fd853dc2ae9a0...,417,291,417,291,pup,0,0
4354,5MSL3592-19_png.rf.618edbe6d00b0c2edc9e544f635...,283,42,283,43,pup,0,1


## Quick EDA
Determine the number of elephant seals in each class for each split of the data

In [30]:
# Tally the rows of each class in the training split.
train_class_counts = train['class'].value_counts()
train_cows = int(train_class_counts.get('cow', 0))
train_bulls = int(train_class_counts.get('bull', 0))
train_pups = int(train_class_counts.get('pup', 0))

# NOTE(review): the reported total is len(train), i.e. every row of the frame, not the
# sum of the three class counts. In the recorded output the three counts add up to less
# than the total -- presumably rows with other class labels exist; confirm.
print(
    "In the training dataset, there are", train_bulls, "bulls,",
    train_cows, "cows, and", train_pups,
    "pups for a total of", len(train), "seals.",
)

In the training dataset, there are 1065 bulls, 4501 cows, and 13161 pups for a total of 20225 seals.


In [31]:
# Tally the rows of each class in the validation split.
val_class_counts = val['class'].value_counts()
val_cows = int(val_class_counts.get('cow', 0))
val_bulls = int(val_class_counts.get('bull', 0))
val_pups = int(val_class_counts.get('pup', 0))

# NOTE(review): the reported total is len(val), i.e. every row of the frame, not the
# sum of the three class counts. In the recorded output the three counts add up to less
# than the total -- presumably rows with other class labels exist; confirm.
print(
    "In the validation dataset, there are", val_bulls, "bulls,",
    val_cows, "cows, and", val_pups,
    "pups for a total of", len(val), "seals.",
)

In the validation dataset, there are 112 bulls, 673 cows, and 1592 pups for a total of 2594 seals.


In [32]:
# Tally the rows of each class in the testing split.
test_class_counts = test['class'].value_counts()
test_cows = int(test_class_counts.get('cow', 0))
test_bulls = int(test_class_counts.get('bull', 0))
test_pups = int(test_class_counts.get('pup', 0))

# NOTE(review): the reported total is len(test), i.e. every row of the frame, not the
# sum of the three class counts. In the recorded output the three counts add up to less
# than the total -- presumably rows with other class labels exist; confirm.
print(
    "In the testing dataset, there are", test_bulls, "bulls,",
    test_cows, "cows, and", test_pups,
    "pups for a total of", len(test), "seals.",
)

In the testing dataset, there are 106 bulls, 634 cows, and 1863 pups for a total of 2612 seals.


A final note: In addition to concatenating the csv files, we need to move all tiles into the same directories. The following commands will move images from `roboflow_split5` to `roboflow2`. 