In [1]:
import pandas as pd

## Get annotations from roboflow in the right format

Roboflow outputted blank lines for tiles that were marked null (i.e. did not have any annotations on them). These blank lines cause errors when running anchor-optimization scripts and during training of the model. The below code removes these blank lines. 

Typically, images without annotations should have the format:
`/path/to/image.jpg,,,,,` See [here](https://github.com/fizyr/keras-retinanet#csv-datasets).

In [14]:
train_path = "../roboflow1/train/_annotations.csv"
val_path = "../roboflow1/valid/_annotations.csv"
test_path = "../roboflow1/test/_annotations.csv"

In [20]:
df_train = pd.read_csv(train_path, header=None)
df_train

Unnamed: 0,0,1,2,3,4,5
0,5MSL0052-104_png.rf.31865184baab286c9691971b7b...,68,333,119,372,pup
1,5MSL0052-104_png.rf.31865184baab286c9691971b7b...,147,313,200,337,pup
2,5MSL0052-104_png.rf.31865184baab286c9691971b7b...,165,278,227,312,pup
3,5MSL0052-104_png.rf.31865184baab286c9691971b7b...,208,261,267,291,pup
4,5MSL0052-104_png.rf.31865184baab286c9691971b7b...,256,258,306,307,pup
...,...,...,...,...,...,...
9781,5MSL3521-69_png.rf.cfdbbfe710ea2b580c2d78f618b...,364,90,412,127,pup
9782,5MSL3521-69_png.rf.cfdbbfe710ea2b580c2d78f618b...,378,235,421,285,pup
9783,5MSL3521-69_png.rf.cfdbbfe710ea2b580c2d78f618b...,466,290,499,348,pup
9784,5MSL3521-69_png.rf.cfdbbfe710ea2b580c2d78f618b...,440,420,472,474,pup


In [13]:
df_train.to_csv("../roboflow1/train/_annotations.csv", index=False, header=False)

In [15]:
df_val = pd.read_csv(val_path, header=None)
df_val

Unnamed: 0,0,1,2,3,4,5
0,5MSL0107-70_png.rf.019d1b977ae589259de46b2b533...,137,56,267,170,bull
1,5MSL3787-37_png.rf.082ac441e98f201c32413ae57b2...,385,190,435,244,pup
2,5MSL3787-37_png.rf.082ac441e98f201c32413ae57b2...,372,61,428,110,pup
3,5MSL3787-37_png.rf.082ac441e98f201c32413ae57b2...,382,408,416,460,pup
4,5MSL3787-37_png.rf.082ac441e98f201c32413ae57b2...,415,430,445,496,pup
...,...,...,...,...,...,...
1123,5MSL0067-14_png.rf.fce682c5d0b115eadb11f08dd02...,203,91,266,186,cow
1124,5MSL0067-14_png.rf.fce682c5d0b115eadb11f08dd02...,382,203,474,261,cow
1125,5MSL0067-14_png.rf.fce682c5d0b115eadb11f08dd02...,325,92,413,130,cow
1126,5MSL0067-14_png.rf.fce682c5d0b115eadb11f08dd02...,401,72,477,138,cow


In [16]:
df_val.to_csv("../roboflow1/valid/_annotations.csv", index=False, header=False)

In [17]:
df_test = pd.read_csv(test_path, header=None)
df_test

Unnamed: 0,0,1,2,3,4,5
0,5MSL3502-47_png.rf.0088bbf4c8ac5b75b85ef0987eb...,78,16,106,80,pup
1,5MSL3502-47_png.rf.0088bbf4c8ac5b75b85ef0987eb...,209,57,256,105,pup
2,5MSL3502-47_png.rf.0088bbf4c8ac5b75b85ef0987eb...,237,88,262,139,pup
3,5MSL3502-47_png.rf.0088bbf4c8ac5b75b85ef0987eb...,245,123,286,158,pup
4,5MSL3502-47_png.rf.0088bbf4c8ac5b75b85ef0987eb...,305,127,359,167,pup
...,...,...,...,...,...,...
1043,5MSL3778-68_png.rf.ffd12ac0ed6ea6be3d41ee9c826...,227,338,265,448,cow
1044,5MSL3778-68_png.rf.ffd12ac0ed6ea6be3d41ee9c826...,161,290,202,379,cow
1045,5MSL3778-68_png.rf.ffd12ac0ed6ea6be3d41ee9c826...,240,280,328,353,cow
1046,5MSL3778-63_png.rf.ffc3aed1eb148ec2ef72201c8a9...,340,72,394,103,pup


In [18]:
df_test.to_csv("../roboflow1/test/_annotations.csv", index=False, header=False)

## Quick EDA
Determine the number of elephant seals in each class for each split of the data

In [33]:
train_cows =  len(df_train[df_train[5] == 'cow'])
train_bulls =  len(df_train[df_train[5] == 'bull'])
train_pups =  len(df_train[df_train[5] == 'pup'])

print("In the training dataset, there are", train_bulls, "bulls,", train_cows, "cows, and", train_pups, "pups in the",
     len(df_train), "total tiles.")

In the training dataset, there are 520 bulls, 2397 cows, and 6869 pups in the 9786 total tiles.


In [32]:
val_cows =  len(df_val[df_val[5] == 'cow'])
val_bulls =  len(df_val[df_val[5] == 'bull'])
val_pups =  len(df_val[df_val[5] == 'pup'])

print("In the validation dataset, there are", val_bulls, "bulls,", val_cows, "cows, and", val_pups, "pups in the",
     len(df_val), "total tiles.")

In the validation dataset, there are 59 bulls, 286 cows, and 783 pups in the 1128 total tiles.


In [34]:
test_cows =  len(df_test[df_test[5] == 'cow'])
test_bulls =  len(df_test[df_test[5] == 'bull'])
test_pups =  len(df_test[df_test[5] == 'pup'])

print("In the testing dataset, there are", test_bulls, "bulls,", test_cows, "cows, and", test_pups, "pups in the",
     len(df_test), "total tiles.")

In the testing dataset, there are 50 bulls, 290 cows, and 708 pups in the 1048 total tiles.
