Import libraries
====

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

# Data Processing

Taken in part from ak-dataprocessing

In [None]:
# Dataset locations and frame geometry (pixels).
TRAIN_PATH = "../input/tensorflow-great-barrier-reef/train_images"
HEIGHT, WIDTH = 720, 1280
image_size = 640


def _frame_number(path):
    """Return the integer made of the digits in the file name of ``path``.

    Only the base name is inspected, so digits in the directory part
    (e.g. ``video_0``) cannot leak into the sort key. Names without any
    digit sort first (-1).
    """
    digits = "".join(ch for ch in os.path.basename(path) if ch.isdigit())
    return int(digits) if digits else -1


def _list_frames(video_dir):
    """Return the full paths of every entry in ``video_dir`` ordered by frame number."""
    return sorted((os.path.join(video_dir, f) for f in os.listdir(video_dir)),
                  key=_frame_number)


# One directory per video; the individual names are kept because later cells use them.
vid_paths = [os.path.join(TRAIN_PATH, f"video_{i}") for i in range(3)]
vid0_path, vid1_path, vid2_path = vid_paths

files_ls = [_list_frames(video_dir) for video_dir in vid_paths]
vid0_ls, vid1_ls, vid2_ls = files_ls

train_df = pd.read_csv("../input/tensorflow-great-barrier-reef/train.csv",
                       sep = r',', skipinitialspace = True)

In [None]:
# Show the first three frame paths of video 0 through both names that refer to it.
head3 = slice(3)
print(vid0_ls[head3])
print("\n", files_ls[0][head3])

In [None]:
# Number of listed frame files versus number of rows in train.csv.
listed_frames = sum(len(frames) for frames in files_ls)
print(listed_frames)
print(len(train_df))

In [None]:
# Peek at the first five rows of the raw annotation table.
train_df.iloc[:5]

#### Helper functions

In [None]:
def get_oldpath(x):
    """Return the path of the source frame described by a row of ``train_df``.

    Parameters
    ----------
    x : row-like object exposing ``video_id`` and ``video_frame``.

    Returns
    -------
    str
        ``<video directory>/<video_frame>.jpg``.

    Raises
    ------
    ValueError
        If ``video_id`` is not 0, 1 or 2 (previously such rows were silently
        mapped to the video_2 directory).
    """
    video_dirs = {0: vid0_path, 1: vid1_path, 2: vid2_path}
    video_id = int(x.video_id)
    if video_id not in video_dirs:
        raise ValueError(f"Unknown video_id {x.video_id!r}; expected 0, 1 or 2")
    return os.path.join(video_dirs[video_id], f"{x.video_frame}.jpg")

def get_newpath(x):
    """Return the flat destination path ``./dataset/<video_id>_<video_frame>.jpg`` for a row."""
    vid, frame = x.video_id, x.video_frame
    return os.path.join("./dataset", f"{vid}_{frame}.jpg")

def get_filename(x):
    """Return the unique image file name ``<video_id>_<video_frame>.jpg`` for a row."""
    vid, frame = x.video_id, x.video_frame
    return f"{vid}_{frame}.jpg"

In [None]:
import ast
from tqdm import tqdm
import shutil

#train_df = train_df[train_df.annotations != "[]"]

# The CSV stores annotations as the string form of a list of dicts.
# Only values that are still strings are parsed, so re-running this cell
# does not fail on rows that were already converted to lists.
train_df["annotations"] = train_df["annotations"].map(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

# Source path of each frame and a file name that is unique across videos.
train_df["filepath"] = train_df.apply(get_oldpath, axis=1)
# train_df["newpath"] = train_df.apply(get_newpath, axis=1)
train_df["filename"] = train_df.apply(get_filename, axis=1)

# Optional (disabled): copy every frame into a flat ./dataset folder.
# os.makedirs("./dataset",exist_ok=True)
# for i in tqdm(range(len(train_df))):
#     src = train_df.iloc[i]["filepath"]
#     dst = train_df.iloc[i]["newpath"]
#     if os.path.exists(dst)==False:
#         shutil.copy(src,dst)

train_df.head(3)

#### Main Dataframe

In [None]:
# Keep only frames with at least one annotation, then give every bounding box
# its own row. The index is reset right away so each box has a unique label.
df = train_df[train_df["annotations"].str.len() != 0].copy()
df = df.explode("annotations").reset_index(drop=True)

# Expand the annotation dicts (keys x, y, width, height) into numeric columns
# in one pass instead of calling a Python lambda for every row.
boxes = pd.DataFrame(df["annotations"].tolist(), index=df.index)

df["width"] = WIDTH
df["height"] = HEIGHT
df["label"] = "starfish"

df["xmin"] = boxes["x"]
df["ymin"] = boxes["y"]
# Boxes may extend past the frame; clamp the far corner to the image size.
df["xmax"] = (boxes["x"] + boxes["width"]).clip(upper=WIDTH)
df["ymax"] = (boxes["y"] + boxes["height"]).clip(upper=HEIGHT)

# Integer box centre as [x, y], stored as a plain Python list per row.
df["center"] = np.column_stack([
    (df["xmax"] + df["xmin"]) // 2,
    (df["ymax"] + df["ymin"]) // 2,
]).tolist()

df = df.drop(["sequence","video_frame","video_id","sequence_frame","image_id","annotations"], axis=1)
df.head(5)

In [None]:
# Frames without any annotation become the negative ("no_starfish") examples.
no_annotation_mask = train_df["annotations"].str.len() == 0
nostar_df = train_df[no_annotation_mask].copy()

nostar_df["width"] = WIDTH
nostar_df["height"] = HEIGHT
nostar_df["label"] = "no_starfish"

# There is no bounding box for these frames, so the box columns are left blank.
for box_col in ("xmin", "ymin", "xmax", "ymax"):
    nostar_df[box_col] = ''

nostar_df = (
    nostar_df
    .drop(columns=["sequence","video_frame","video_id","sequence_frame","image_id","annotations"])
    .reset_index(drop=True)
)
nostar_df.head(5)

In [None]:
import random
import matplotlib.pyplot as plt
import matplotlib.image as npimg
import cv2

Function draw_annotations edited so that all bounding boxes are now drawn.

In [None]:
def draw_annotations(spot_fish_img, img_row):
    """Draw every annotated bounding box of one frame onto an image.

    Parameters
    ----------
    spot_fish_img : image array to draw on (passed to ``cv2.rectangle``).
    img_row : row-like object whose ``"filename"`` selects the boxes in ``df``.

    Returns
    -------
    The image returned by the last ``cv2.rectangle`` call, or the input image
    unchanged when the frame has no boxes.
    """
    frame_boxes = df.loc[df['filename'] == img_row["filename"],
                         ["xmin", "ymin", "xmax", "ymax"]]
    print(f"Number of starfish found is {len(frame_boxes)}")
    for xmin, ymin, xmax, ymax in frame_boxes.itertuples(index=False):
        # Cast to built-in int: some OpenCV builds reject numpy scalar coordinates.
        spot_fish_img = cv2.rectangle(spot_fish_img, (int(xmin), int(ymin)),
                                      (int(xmax), int(ymax)), (255,255,0), 2)
    return spot_fish_img

Choose a random image and plot it side by side with the same image showing the starfish bounding boxes.

In [None]:
# random.randint includes its upper bound, so randint(0, len(df)) could ask for
# row len(df) and raise IndexError; randrange excludes the upper bound.
img_row = df.iloc[random.randrange(len(df))]

original_img = npimg.imread(img_row['filepath'])

fig, axs = plt.subplots(1, 2, figsize=(15, 10))
fig.tight_layout()

axs[0].imshow(original_img)
axs[0].set_title('Original Image')

# Draw on a copy so the untouched frame stays available for the crop test below.
spot_fish_img = original_img.copy()
draw_annotations(spot_fish_img, img_row)

axs[1].imshow(spot_fish_img)
axs[1].set_title('spotfish Image')

# Determine a reasonable cropping size
Calculate the mean starfish height and width as well as the standard deviation.
Crop size is taken to be the mean plus one standard deviation. Assuming that the height and width are each normally distributed, ~84% of starfish will fall within this size along each dimension.

In [None]:
# --- Width: statistics of the annotated boxes and the resulting crop width ---
width_df = df["xmax"] - df["xmin"]
width_mean, width_std = width_df.mean(), width_df.std()
print("max width =", width_df.max(), " min width =", width_df.min(),
      " mean width =", width_mean, "  std width =", width_std)
print("<x>+sigma=", width_mean + width_std)
crop_width = int(width_mean + width_std)
crop_width += crop_width % 2  # bump odd sizes up to the next even number
if crop_width > WIDTH:
    print("crop_width> WIDTH")


# --- Height: same procedure ---
height_df = df["ymax"] - df["ymin"]
height_mean, height_std = height_df.mean(), height_df.std()
print("max height=", height_df.max(), " min height=", height_df.min(),
      " mean height=", height_mean, " std height=", height_std)
print("<x>+sigma=", height_mean + height_std)
crop_height = int(height_mean + height_std)
crop_height += crop_height % 2  # bump odd sizes up to the next even number
if crop_height > HEIGHT:
    print("crop_height> HEIGHT")

print("crop height", crop_height, "crop width", crop_width)

Function to determine the parameters used to crop image

In [None]:
def get_crop_box(center, box_height=None, box_width=None, img_height=None, img_width=None):
    """Return the slice bounds of a fixed-size crop centred on ``center``.

    Parameters
    ----------
    center : sequence ``[x, y]`` giving the crop centre in pixels.
    box_height, box_width : crop size; default to the notebook-level
        ``crop_height`` / ``crop_width``.
    img_height, img_width : image size; default to the notebook-level
        ``HEIGHT`` / ``WIDTH``.

    Returns
    -------
    list
        ``[top, bottom, left, right]``, usable as
        ``img[top:bottom, left:right]``.

    A box that would leave the image is shifted back inside it rather than
    shrunk. Only when the requested box is larger than the image is it
    clamped to the image, so the bounds are never negative.
    """
    box_height = crop_height if box_height is None else box_height
    box_width = crop_width if box_width is None else box_width
    img_height = HEIGHT if img_height is None else img_height
    img_width = WIDTH if img_width is None else img_width

    top = int(center[1] - box_height/2)
    bottom = int(center[1] + box_height/2)
    left = int(center[0] - box_width/2)
    right = int(center[0] + box_width/2)

    # Shift down / up when the box crosses the top / bottom edge.
    if top < 0:
        bottom -= top
        top = 0
    if bottom > img_height:
        top -= bottom - img_height
        bottom = img_height
    # Shift right / left when the box crosses the left / right edge.
    if left < 0:
        right -= left
        left = 0
    if right > img_width:
        left -= right - img_width
        right = img_width

    # Only reachable when the box is larger than the image: a negative start
    # would be read by numpy as an index counted from the end.
    top = max(top, 0)
    left = max(left, 0)

    return [top, bottom, left, right]

# Crop test
Test the cropping pipeline by cropping all starfish which appeared in the previously shown image.

In [None]:
# All boxes that belong to the frame displayed above.
frame_boxes = df[df['filename'] == img_row["filename"]].copy()
print(f"Number of starfish found is {len(frame_boxes)}")

for index in frame_boxes.index:
    # Alternative (disabled): tight crop around the annotated box.
    # img_crop = original_img[frame_boxes.at[index, 'ymin']:frame_boxes.at[index, 'ymax'],
    #                         frame_boxes.at[index, 'xmin']:frame_boxes.at[index, 'xmax']]

    # Equal-sized crop centred on the box (centre rounded up).
    x_lo, x_hi = frame_boxes.at[index, 'xmin'], frame_boxes.at[index, 'xmax']
    y_lo, y_hi = frame_boxes.at[index, 'ymin'], frame_boxes.at[index, 'ymax']
    center = [np.ceil((x_hi + x_lo) / 2), np.ceil((y_hi + y_lo) / 2)]
    crop_box = get_crop_box(center)
    img_crop = original_img[crop_box[0]:crop_box[1], crop_box[2]:crop_box[3]]

    plt.figure()
    plt.imshow(img_crop)


Take training and test sets

In [None]:
# 90 % of the boxes (fixed seed) for training; the remaining rows form the test set.
df_training_clas = df.sample(frac=0.9, random_state=2)
held_out_mask = ~df.index.isin(df_training_clas.index)
df_test_clas = df[held_out_mask]

# Both numbers should agree.
print(len(df) - len(df_training_clas))
print(len(df_test_clas))



Crop training images and save to folder

In [None]:
# os.makedirs also creates the parent ./train directory.
os.makedirs("./train/starfish", exist_ok=True)
os.makedirs("./train/no_starfish", exist_ok=True)

for index in tqdm(df_training_clas.index):
    star_row = df_training_clas.loc[index]
    top, bottom, left, right = get_crop_box(star_row["center"])

    # Positive example: fixed-size crop centred on the annotated starfish.
    # Crops are always (re)written so they match the current crop size.
    filename = f"./train/starfish/{index}_{star_row['filename']}"
    original_img = cv2.imread(star_row["filepath"])
    if original_img is None:
        # cv2.imread returns None instead of raising on a missing/corrupt file.
        raise FileNotFoundError(f"Could not read image {star_row['filepath']}")
    cv2.imwrite(filename, original_img[top:bottom, left:right])

    # Negative example: the same crop window applied to a frame without starfish.
    # The frame is the nostar_df row with the same index label, so nostar_df
    # must contain that label.
    nostar_row = nostar_df.loc[index]
    nostar_filename = f"./train/no_starfish/{index}_{nostar_row['filename']}"
    nostar_original_img = cv2.imread(nostar_row["filepath"])
    if nostar_original_img is None:
        raise FileNotFoundError(f"Could not read image {nostar_row['filepath']}")
    cv2.imwrite(nostar_filename, nostar_original_img[top:bottom, left:right])

Check cropping by plotting random images from images just created

In [None]:
dirlist1 = os.listdir('./train/no_starfish')
dirlist2 = os.listdir('./train/starfish')

fig, axs = plt.subplots(1, 2, figsize=(15, 10))

# random.choice never indexes past the end of the list, unlike
# random.randint(0, len(...)), whose upper bound is inclusive.
filename = f"./train/no_starfish/{random.choice(dirlist1)}"
axs[0].imshow(npimg.imread(filename))
axs[0].set_title(filename)

filename = f"./train/starfish/{random.choice(dirlist2)}"
axs[1].imshow(npimg.imread(filename))
axs[1].set_title(filename)

Crop test images

In [None]:
# os.makedirs also creates the parent ./test directory.
os.makedirs("./test/testfolder", exist_ok=True)

for index in tqdm(df_test_clas.index):
    star_row = df_test_clas.loc[index]
    top, bottom, left, right = get_crop_box(star_row["center"])

    # Positive example: fixed-size crop centred on the annotated starfish.
    # The class is encoded in the file-name prefix because all test crops share one folder.
    filename = f"./test/testfolder/starfish_{index}_{star_row['filename']}"
    original_img = cv2.imread(star_row["filepath"])
    if original_img is None:
        # cv2.imread returns None instead of raising on a missing/corrupt file.
        raise FileNotFoundError(f"Could not read image {star_row['filepath']}")
    cv2.imwrite(filename, original_img[top:bottom, left:right])

    # Negative example: the same crop window applied to a frame without starfish.
    # The frame is the nostar_df row with the same index label, so nostar_df
    # must contain that label.
    nostar_row = nostar_df.loc[index]
    nostar_filename = f"./test/testfolder/nostarfish_{index}_{nostar_row['filename']}"
    nostar_original_img = cv2.imread(nostar_row["filepath"])
    if nostar_original_img is None:
        raise FileNotFoundError(f"Could not read image {nostar_row['filepath']}")
    cv2.imwrite(nostar_filename, nostar_original_img[top:bottom, left:right])

Check cropping by plotting random images from images just created

In [None]:
dirlist1 = os.listdir('./test/testfolder')
print(len(dirlist1))
fig, axs = plt.subplots(1, 2, figsize=(15, 10))

# random.choice never indexes past the end of the list, unlike
# random.randint(0, len(...)), whose upper bound is inclusive.
filename = f"./test/testfolder/{random.choice(dirlist1)}"
axs[0].imshow(npimg.imread(filename))
axs[0].set_title(filename)

filename = f"./test/testfolder/{random.choice(dirlist1)}"
axs[1].imshow(npimg.imread(filename))
axs[1].set_title(filename)

# Classification model using keras

Import libraries

In [None]:

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.utils.vis_utils import plot_model
from keras.utils import np_utils

from keras import backend as K

from keras.preprocessing.image import ImageDataGenerator

Create training and validation sets and load files from directory

In [None]:
# Pixel values are rescaled to [0, 1]; 20 % of ./train is held out for validation.
train_datagen = ImageDataGenerator(rescale=1 / 255.0,validation_split=0.20)
test_datagen = ImageDataGenerator(rescale=1 / 255.0)

batch_size = 8

# Same relative folders the crops were written to, so the notebook does not
# depend on the working directory being /kaggle/working.
train_dir = "./train/"
test_dir = "./test/"

train_generator = train_datagen.flow_from_directory(directory=train_dir,target_size=(crop_height,crop_width),color_mode="rgb",
                                                    batch_size=batch_size,class_mode="categorical",subset='training',shuffle=True,seed=42)

# shuffle=False keeps the batches in the same order as valid_generator.classes,
# which the confusion matrix relies on; the order does not affect validation metrics.
valid_generator = train_datagen.flow_from_directory(directory=train_dir,target_size=(crop_height,crop_width),color_mode="rgb",
                                                    batch_size=batch_size,class_mode="categorical",subset='validation',shuffle=False,seed=42)

# Unlabeled test images, one per batch, in file order so predictions line up with .filenames.
test_generator = test_datagen.flow_from_directory(directory=test_dir,target_size=(crop_height,crop_width),color_mode="rgb",batch_size=1,
                                                  class_mode=None,shuffle=False,seed=42)

Other models used:

model = Sequential()
model.add(Conv2D(32,kernel_size=(3,3),activation='relu',input_shape=(crop_height, crop_width, 3)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(16, activation='relu'))
model.add(Dense(2, activation='sigmoid'))
model.compile(loss="binary_crossentropy",optimizer="adam",metrics=['accuracy'])

print(model.summary())


num_classes = 2

model = Sequential()
model.add(Conv2D(16, kernel_size=(3, 3),activation='relu',input_shape= (crop_height, crop_width, 3)))

model.add(Conv2D(16, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(16, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Dropout(0.35))

model.add(Flatten())

model.add(Dense(64, activation='relu'))

model.add(Dropout(0.5))

model.add(Dense(num_classes, activation='softmax'))

print(model.summary())

Model architecture

In [None]:
num_classes = 2

# Four Conv -> ReLU -> MaxPool stages, then a small dense head with dropout
# and a softmax over the two classes.
model = Sequential([
    Conv2D(32, (3, 3), input_shape=(crop_height, crop_width, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(64, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(32, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(32, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.35),

    Flatten(),
    Dense(64),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model.summary()

In [None]:
#keras.backend.clear_session()

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

# Train on the labeled training generator only. The earlier extra call
# fitted on test_generator, which is created with class_mode=None and so
# yields no labels, and it passed batch_size alongside a generator.
model.fit(train_generator,
    validation_data = valid_generator,
    steps_per_epoch = train_generator.n//train_generator.batch_size,
    validation_steps = valid_generator.n//valid_generator.batch_size,
    epochs=5)

In [None]:
# Recompile the same model with the RMSprop optimizer and fit for 4 epochs.
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Whole batches per epoch for each generator.
train_steps = train_generator.n // train_generator.batch_size
valid_steps = valid_generator.n // valid_generator.batch_size

model.fit(train_generator,
          epochs=4,
          steps_per_epoch=train_steps,
          validation_data=valid_generator,
          validation_steps=valid_steps)


Show confusion matrix

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix and classification report on the validation subset.
# Labels are collected from the same batches that are predicted, so the two
# stay aligned even when the generator shuffles (valid_generator.classes is
# always in file order and would not match shuffled predictions).
batch_probs, batch_labels = [], []
for batch_idx in range(len(valid_generator)):
    batch_images, batch_one_hot = valid_generator[batch_idx]
    batch_probs.append(model.predict(batch_images, verbose=0))
    batch_labels.append(np.argmax(batch_one_hot, axis=-1))

Y_pred = np.concatenate(batch_probs)
y_pred = np.argmax(Y_pred, axis=-1)
y_true = np.concatenate(batch_labels)

# Order follows the class indices assigned from the folder names
# (no_starfish -> 0, starfish -> 1).
target_names = ['No Starfish', 'Starfish']
print('Confusion Matrix')
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
disp.plot(cmap=plt.cm.Blues)
plt.show()

print(classification_report(y_true, y_pred, labels=[0, 1], target_names=target_names))

print(Y_pred)
print(y_pred)
print(y_true)

In [None]:
# These metrics come from the validation generator, not the held-out test
# crops, so they are labeled as validation results.
score = model.evaluate(valid_generator)
print('Validation loss:', score[0])
print('Validation accuracy:', score[1])

Show confusion matrix for test images

In [None]:
# Start from the first file so predictions line up with test_generator.filenames.
test_generator.reset()
prediction = model.predict(test_generator)
print(prediction)

# Predicted class index per image and the matching class (folder) name.
predicted_class_indices = np.argmax(prediction, axis=1)
y_classes = predicted_class_indices
labels = {class_idx: class_name for class_name, class_idx in train_generator.class_indices.items()}
predictions = [labels[k] for k in predicted_class_indices]

# Ground truth is encoded in the file-name prefix written by the cropping cell:
# "nostarfish_..." for negatives, "starfish_..." for positives. Class ids are
# taken from the training generator instead of being hard-coded.
no_star_id = train_generator.class_indices["no_starfish"]
star_id = train_generator.class_indices["starfish"]

filenames = test_generator.filenames
y = [
    no_star_id if os.path.basename(name).split('_')[0] == 'nostarfish' else star_id
    for name in filenames
]

results = pd.DataFrame({"Star y/n": y,
                        "Predictions": predictions})
print(results)


In [None]:
# Confusion matrix of filename-derived labels versus predicted classes on the test crops.
cm = confusion_matrix(y, predicted_class_indices)
disp = ConfusionMatrixDisplay(display_labels=target_names, confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)