In [1]:
import pandas as pd

## Get annotations from roboflow in the right format
Reasons for this notebook:

Roboflow outputted blank lines for tiles that were marked null (i.e. did not have any annotations on them). These blank lines cause errors when running anchor-optimization scripts and during training of the model. Typically, images without annotations should have the format:
`/path/to/image.jpg,,,,,` See [here](https://github.com/fizyr/keras-retinanet#csv-datasets). The below code removes these blank lines. 

Additionally, Roboflow limits the number of images per free project to 10,000 so we used two projects. The first project is stored in `roboflow2` and the second project is stored in `roboflow_split5`. The below code concatenates the csv files.

Finally, some accidental annotations were made. These annotations are less than a pixel wide/tall and are removed in the below code.

In [2]:
train_path1 = "../roboflow2/train/_annotations.csv"
val_path1 = "../roboflow2/valid/_annotations.csv"
test_path1 = "../roboflow2/test/_annotations.csv"

In [3]:
train_path2 = "../roboflow_split5/train/_annotations.csv"
val_path2 = "../roboflow_split5/valid/_annotations.csv"
test_path2 = "../roboflow_split5/test/_annotations.csv"

In [4]:
def process_csv(path1, path2, output_path): 
    
    # remove the blank lines
    
    df1 = pd.read_csv(path1, header=None)
    #print("dataframe 1 shape", df1.shape)
    df2 = pd.read_csv(path2, header=None)
    #print("dataframe 2 shape", df2.shape)

    # rename columns
    
    df1.rename(columns={0:"image", 1:"x1", 2:"y1", 3:"x2", 4:"y2", 5:"class"}, inplace=True)
    df2.rename(columns={0:"image", 1:"x1", 2:"y1", 3:"x2", 4:"y2", 5:"class"}, inplace=True)
    
    # remove the empty annotations 
    # see our EDA below for how we decided on 10 pixel cut offs.
    
    df1 = df1[abs(df1['x1'] - df1['x2']) > 10]
    df1 = df1[abs(df1['y1'] - df1['y2']) > 10]
    #print("dataframe 1 updated shape", df1.shape)
    df2 = df2[abs(df2['x1'] - df2['x2']) > 10]
    df2 = df2[abs(df2['y1'] - df2['y2']) > 10]
    #print("dataframe 2 updated shape", df2.shape)

    # concatenate the dfs

    df_final = pd.concat([df1, df2])
    #print("final dataframe shape", df_final.shape)

    # save to file and return the dataframe 
    
    df_final.to_csv(output_path, index=False, header=False)
    return df_final

In [5]:
train = process_csv(train_path1, train_path2, '../roboflow2/train/annotations_final.csv')
val = process_csv(val_path1, val_path2, '../roboflow2/valid/annotations_final.csv')
test = process_csv(test_path1, test_path2, '../roboflow2/test/annotations_final.csv')

In [6]:
train

Unnamed: 0,image,x1,y1,x2,y2,class
0,5MSL3522-115_png.rf.312716cae2c0fe29726f0f1541...,271,113,365,187,cow
1,5MSL3522-115_png.rf.312716cae2c0fe29726f0f1541...,152,416,268,468,cow
2,5MSL3522-115_png.rf.312716cae2c0fe29726f0f1541...,181,327,279,394,cow
3,5MSL3522-115_png.rf.312716cae2c0fe29726f0f1541...,76,280,158,418,bull
4,5MSL3522-115_png.rf.312716cae2c0fe29726f0f1541...,183,189,270,249,cow
...,...,...,...,...,...,...
8774,5MSL0073-97_png.rf.79a05602adeca7aabdd4ff34a9b...,279,49,323,108,pup
8775,5MSL0073-97_png.rf.79a05602adeca7aabdd4ff34a9b...,241,47,303,80,pup
8776,5MSL0073-97_png.rf.79a05602adeca7aabdd4ff34a9b...,304,25,374,58,pup
8777,5MSL0073-97_png.rf.79a05602adeca7aabdd4ff34a9b...,181,13,249,48,pup


### EDA to determine smallest legitimate bounding box

Here we get a sense of the size of bounding boxes using the annotations in `roboflow2/train` which contains the bulk of our training images.

In [7]:
df_train1 = pd.read_csv(train_path1, header=None)
df_train1.rename(columns={0:"image", 1:"x1", 2:"y1", 3:"x2", 4:"y2", 5:"class"}, inplace=True)
df_train1

Unnamed: 0,image,x1,y1,x2,y2,class
0,5MSL3522-115_png.rf.312716cae2c0fe29726f0f1541...,271,113,365,187,cow
1,5MSL3522-115_png.rf.312716cae2c0fe29726f0f1541...,152,416,268,468,cow
2,5MSL3522-115_png.rf.312716cae2c0fe29726f0f1541...,181,327,279,394,cow
3,5MSL3522-115_png.rf.312716cae2c0fe29726f0f1541...,76,280,158,418,bull
4,5MSL3522-115_png.rf.312716cae2c0fe29726f0f1541...,183,189,270,249,cow
...,...,...,...,...,...,...
18888,4MSL0120-0_png.rf.ff5e9e891b37ec4a84bcfd8e5582...,350,161,391,195,pup
18889,4MSL0120-0_png.rf.ff5e9e891b37ec4a84bcfd8e5582...,227,124,266,175,pup
18890,4MSL0120-0_png.rf.ff5e9e891b37ec4a84bcfd8e5582...,336,136,448,173,cow
18891,4MSL0120-0_png.rf.ff5e9e891b37ec4a84bcfd8e5582...,37,146,69,192,pup


In [8]:
df_train1['width'] = abs(df_train1['x1'] - df_train1['x2'])
df_train1['height'] = abs(df_train1['y1'] - df_train1['y2'])

In [9]:
df_train1.sort_values('width', ascending=True).head(20)

Unnamed: 0,image,x1,y1,x2,y2,class,width,height
12543,5MSL4210-125_png.rf.09cc8cabe1d85d0ba993871d88...,214,86,214,87,cow,0,1
13657,5MSL3749-174_png.rf.1ade9f0908936fd853dc2ae9a0...,417,291,417,291,pup,0,0
13114,5MSL0019-0_png.rf.120741cd963b30d002adb1d4b882...,452,354,452,354,cow,0,0
15673,5MSL3770-139_png.rf.b2981b7f38b08cc099392476e6...,373,332,373,332,cow,0,0
15678,5MSL3517-55_png.rf.b29b469a63c1342efb230aa8b6e...,263,266,263,267,pup,0,1
9933,5MSL3439-113_png.rf.da0acb1f1cc6d4270675764446...,95,171,95,171,pup,0,0
4991,5MSL3748-9_png.rf.67e7afa2e9fafdb9e3e64e7da316...,149,395,149,395,pup,0,0
4354,5MSL3592-19_png.rf.618edbe6d00b0c2edc9e544f635...,283,42,283,43,pup,0,1
10480,5MSL3519-176_png.rf.e0ac82b1101734d4ae164a5aaf...,250,89,251,90,pup,1,1
18599,5MSL3779-141_png.rf.fbbaa46b6a552d2f9d3e4518e5...,389,246,390,246,pup,1,0


In [10]:
df_train1.sort_values('height', ascending=True).head(20)

Unnamed: 0,image,x1,y1,x2,y2,class,width,height
13114,5MSL0019-0_png.rf.120741cd963b30d002adb1d4b882...,452,354,452,354,cow,0,0
9024,5MSL3576-66_png.rf.a5a3531a0b32b2637c379ec32e8...,410,417,413,417,pup,3,0
4991,5MSL3748-9_png.rf.67e7afa2e9fafdb9e3e64e7da316...,149,395,149,395,pup,0,0
8198,5MSL3817-4_png.rf.99ebc1502f240664509650db0242...,403,198,404,198,pup,1,0
9933,5MSL3439-113_png.rf.da0acb1f1cc6d4270675764446...,95,171,95,171,pup,0,0
18599,5MSL3779-141_png.rf.fbbaa46b6a552d2f9d3e4518e5...,389,246,390,246,pup,1,0
15673,5MSL3770-139_png.rf.b2981b7f38b08cc099392476e6...,373,332,373,332,cow,0,0
7674,5MSL3779-94_png.rf.92918e157f8ebcb950fc9e5dce9...,260,345,261,345,pup,1,0
13657,5MSL3749-174_png.rf.1ade9f0908936fd853dc2ae9a0...,417,291,417,291,pup,0,0
4354,5MSL3592-19_png.rf.618edbe6d00b0c2edc9e544f635...,283,42,283,43,pup,0,1


## Quick EDA
Determine the number of elephant seals in each class for each split of the data

In [11]:
train_cows =  len(train[train['class'] == 'cow'])
train_bulls =  len(train[train['class'] == 'bull'])
train_pups =  len(train[train['class'] == 'pup'])

print("In the training dataset, there are", train_bulls, "bulls,", train_cows, "cows, and", train_pups, 
      "pups for a total of", len(train), "seals.")

In the training dataset, there are 1478 bulls, 6799 cows, and 19355 pups for a total of 27632 seals.


In [12]:
val_cows =  len(val[val['class'] == 'cow'])
val_bulls =  len(val[val['class'] == 'bull'])
val_pups =  len(val[val['class'] == 'pup'])

print("In the validation dataset, there are", val_bulls, "bulls,", val_cows, "cows, and", val_pups, 
      "pups for a total of", len(val), "seals.")

In the validation dataset, there are 151 bulls, 897 cows, and 2429 pups for a total of 3477 seals.


In [13]:
test_cows =  len(test[test['class'] == 'cow'])
test_bulls =  len(test[test['class'] == 'bull'])
test_pups =  len(test[test['class'] == 'pup'])

print("In the testing dataset, there are", test_bulls, "bulls,", test_cows, "cows, and", test_pups, 
      "pups for a total of", len(test), "seals.")

In the testing dataset, there are 159 bulls, 781 cows, and 2152 pups for a total of 3092 seals.
