In [1]:
from FastSAM.fastsam import *
import cv2
import os
import shutil
from utils import compute_blob_mean_and_covariance
import numpy as np
import matplotlib.pyplot as plt
from utils import plotErrorEllipse
import skimage
from ultralytics import SAM
from PIL import Image
import torch
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from transformers import CLIPProcessor, CLIPModel

import warnings
warnings.filterwarnings('ignore')

import time

- Load FastSAM

In [2]:
# Specify path to checkpoint. (If checkpoint does not exist, the implementation in FastSAM repo downloads it.)
fastSamModel = FastSAM('./FastSAM/Models/FastSAM-x.pt')
# fastSamModel = SAM('sam_b.pt')
DEVICE = 'cpu'

- put your filename here

In [3]:
# Define the file location
# file_location = 'fish_img'
file_location = 'fish_img_1'

# Get all image file names in the directory
image_files = [f for f in os.listdir(file_location) if f.endswith('.jpg') or f.endswith('.png')]

# Setup output folder
outputFolder = 'output_test/'

# Check if output folder exists and create it if necessary
if (not os.path.isdir(outputFolder)):
    os.makedirs(outputFolder)

# Sort the image files by their filenames such that frame0 then frame1 then frame2 ... then frame 10 then 11 then 12, etc
image_files.sort(key=lambda x: int(x.split('.')[0].split('e')[-1]))

# For demonstration purpose, we only use 850~900
image_files = image_files[850:950]

- DORI

In [4]:
# Initialize an empty list to store the average luminance of each superpixel
average_luminance = []

# initialize count
count = 0

potential_fish_frames = []

# For FastSam:
# Specify confidence and IoU parameters (see FastSAM paper or rather YOLO v8 documentation)
conf = 0.5
iou = 0.9

# index=0
Y_OUT = np.zeros((1,len(image_files)+10))

SAM_TIME = 0

FIRST_WRITE_TXT = True

# Loop through each image file
for image_file in image_files:

    START_TIME = time.time()
    
    # Read the image
    image = cv2.imread(os.path.join(file_location, image_file))
    
    # Motion Detection (1):

    luminance = np.mean(image)
    count += 1

    if count < 50:
        average_luminance.append(luminance)
        continue
    
    # # Append the average luminance to the list
    average_luminance.append(luminance)

    luminance_diff = abs(luminance - np.mean(average_luminance[count-50:count]))

    if luminance_diff > 1:
        
        potential_fish_frames.append(count)

        # Run FastSAM
        everything_results = fastSamModel(image, device=DEVICE, retina_masks=True, imgsz=1024, conf=conf, iou=iou,)
        prompt_process = FastSAMPrompt(image, everything_results, device=DEVICE)
        segmask = prompt_process.everything_prompt()
        # print(len(segmask))

        # Segment the images for better classification
        image_array = []
        for mask_index in range(len(segmask)):
            # Masked out the segmentation
            potential_fish_img = image.copy()
            segmask = segmask.type(torch.int32)
            potential_fish_img[~segmask[mask_index,:,:],:] = 0

            # Convert to PIL Image
            image_pil = Image.fromarray(potential_fish_img)
            image_array.append(image_pil)

        # Initialize the CLIP model and processor
        model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
        processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
        
        # Classification labels
        # classes = ['fish', 'aquatic plants', 'trash', 'water', 'land', 'ocean', 'submarine']
        classes = ['fish', 'aquatic plants', 'trash', 'water', 'land', 'ocean', 'submarine']
        
        # Process the image for CLIP (resize, convert to tensor, normalize)
        inputs = processor(text=classes, images=image_array, return_tensors="pt", padding=True)

        # Forward pass: get the image features from CLIP
        outputs = model(**inputs)

        # Classification:
        logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        probs = logits_per_image.softmax(dim=1) 
        
        classification_outputs = torch.argmax(probs, dim=1)
        Y_OUT[:,count] = probs[:,0].max().item()

        END_TIME = time.time()

        # If is classified as 'crab'
        if torch.any(classification_outputs==0):
            # Save the image that is detected and classified with fish in the output Folder
            # cv2.imwrite(outputFolder+"crab_"+image_file, image) 
            if FIRST_WRITE_TXT:
                print("fish iter: " + str(count+850) + " " + str(probs[:,0].max().item()) )
                f = open("{outputFolder}/fish_output.txt".format(outputFolder=outputFolder), "w")
                f.write(str(count) + " " + str(probs[:,0].max().item()))
                f.write('\n')
                f.close()
                FIRST_WRITE_TXT = False
            else:
                print("fish iter: " + str(count+850) + " " + str(probs[:,0].max().item()) )
                f = open("{outputFolder}/fish_output.txt".format(outputFolder=outputFolder), "a")
                f.write(str(count) + " " + str(probs[:,0].max().item()))
                f.write('\n')
                f.close()

        classes = ['crab', 'aquatic plants', 'trash', 'water', 'land', 'ocean', 'submarine']
        
        # Process the image for CLIP (resize, convert to tensor, normalize)
        inputs = processor(text=classes, images=image_array, return_tensors="pt", padding=True)

        # Forward pass: get the image features from CLIP
        outputs = model(**inputs)

        # Classification:
        logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        probs = logits_per_image.softmax(dim=1) 
        
        classification_outputs = torch.argmax(probs, dim=1)
        Y_OUT[:,count] = probs[:,0].max().item()

        # If is classified as 'crab'
        if torch.any(classification_outputs==0):
            # Save the image that is detected and classified with fish in the output Folder
            # cv2.imwrite(outputFolder+"crab_"+image_file, image) 
            if FIRST_WRITE_TXT:
                print("crab iter: " + str(count+850) + " " + str(probs[:,0].max().item()) )
                f = open("{outputFolder}/crab_output.txt".format(outputFolder=outputFolder), "w")
                f.write(str(count) + " " + str(probs[:,0].max().item()))
                f.write('\n')
                f.close()
                FIRST_WRITE_TXT = False
            else:
                print("crab iter: " + str(count+850) + " " + str(probs[:,0].max().item()) )
                f = open("{outputFolder}/crab_output.txt".format(outputFolder=outputFolder), "a")
                f.write(str(count) + " " + str(probs[:,0].max().item()))
                f.write('\n')
                f.close()
    


0: 576x1024 1 object, 466.2ms
Speed: 2.9ms preprocess, 466.2ms inference, 1.2ms postprocess per image at shape (1, 3, 1024, 1024)



fish iter: 900 0.49759554862976074
crab iter: 900 0.33682695031166077


0: 576x1024 1 object, 426.6ms
Speed: 1.5ms preprocess, 426.6ms inference, 0.9ms postprocess per image at shape (1, 3, 1024, 1024)



fish iter: 901 0.4980039596557617
crab iter: 901 0.34223178029060364


0: 576x1024 1 object, 425.6ms
Speed: 1.7ms preprocess, 425.6ms inference, 0.9ms postprocess per image at shape (1, 3, 1024, 1024)



fish iter: 902 0.4966365098953247
crab iter: 902 0.3425745964050293


0: 576x1024 1 object, 426.7ms
Speed: 1.5ms preprocess, 426.7ms inference, 1.6ms postprocess per image at shape (1, 3, 1024, 1024)



fish iter: 903 0.5044446587562561
crab iter: 903 0.35114261507987976


0: 576x1024 1 object, 434.8ms
Speed: 2.7ms preprocess, 434.8ms inference, 1.5ms postprocess per image at shape (1, 3, 1024, 1024)



fish iter: 904 0.45147180557250977
crab iter: 904 0.2864118814468384


0: 576x1024 1 object, 433.2ms
Speed: 2.6ms preprocess, 433.2ms inference, 1.5ms postprocess per image at shape (1, 3, 1024, 1024)



fish iter: 905 0.44639888405799866
crab iter: 905 0.28739604353904724


0: 576x1024 1 object, 427.4ms
Speed: 1.6ms preprocess, 427.4ms inference, 1.1ms postprocess per image at shape (1, 3, 1024, 1024)



fish iter: 906 0.4621506929397583
crab iter: 906 0.27979740500450134


0: 576x1024 1 object, 436.0ms
Speed: 1.5ms preprocess, 436.0ms inference, 1.6ms postprocess per image at shape (1, 3, 1024, 1024)



fish iter: 921 0.37873613834381104


0: 576x1024 1 object, 435.1ms
Speed: 2.7ms preprocess, 435.1ms inference, 1.5ms postprocess per image at shape (1, 3, 1024, 1024)


fish iter: 922 0.3959563970565796
