In [1]:
import matplotlib.pyplot as plt
import os
import random
# external imports
import transformers
from transformers import AutoImageProcessor, AutoModelForDepthEstimation
import torch
import torchvision
import time 
import numpy as np
from PIL import Image
import requests
import datasets
from datasets import load_dataset
from torchvision import datasets, transforms
from tqdm import tqdm
import cv2

KeyboardInterrupt: 

# Intial Tests with DepthAnything


### DepthEstimation Example


In [None]:
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

image_processor = AutoImageProcessor.from_pretrained("LiheYoung/depth-anything-small-hf")
model = AutoModelForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf")

# prepare image for the model
inputs = image_processor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
    predicted_depth = outputs.predicted_depth

# interpolate to original size
prediction = torch.nn.functional.interpolate(
    predicted_depth.unsqueeze(1),
    size=image.size[::-1],
    mode="bicubic",
    align_corners=False,
)

# visualize the prediction
output = prediction.squeeze().cpu().numpy()
formatted = (output * 255 / np.max(output)).astype("uint8")
depth = Image.fromarray(formatted)
depth

### Compute Budget Test

In [None]:
# import the dataset -> stream it so it does not take too long
train_dataset = load_dataset("sayakpaul/nyu_depth_v2", split="train", 
                                streaming = True, trust_remote_code=True)

# load the dataset into a dataloader
dataset = train_dataset.with_format("torch")
dataloader = torch.utils.data.DataLoader(dataset, num_workers=2, batch_size = 16)

image_processor = AutoImageProcessor.from_pretrained("LiheYoung/depth-anything-small-hf")
model = AutoModelForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf")
device = 'cuda' if torch.cuda.is_available() else 'cpu' 
model.to(device)
# prepare image for the model
for i, batch in enumerate(tqdm(dataloader)):
    t0 = time.time()
    image = batch.get('image')
    inputs = image_processor(images=image, return_tensors="pt").to(device)
    inputs.to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        predicted_depth = outputs.predicted_depth

    # interpolate to original size
    prediction = torch.nn.functional.interpolate(
        predicted_depth.unsqueeze(1),
        size= [image.size()[1], image.size()[2]],#image.size(),
        mode="bicubic",
        align_corners=False,
    )

    # visualize the prediction
    # output = prediction.squeeze().cpu().numpy()
    # print(output)
    # formatted = (output * 255 / np.max(output)).astype("uint8")
    # depth = Image.fromarray(formatted)
    # plt.imshow(depth)
    print(f"The time taken is:{time.time()-t0}")

For the small model, processing batches of size 16 took about 0.5 seconds on average. 

# Loading and Investigating Eiffel Tower Dataset

In [None]:
image_path = 'eiffel/2020/' #this is the path to the 2020 images
depth_path = 'eiffel/2020/depth/dense/' # this is the path to 2020 ground truth depths

transform = transforms.Compose([
    transforms.ToTensor()
])

In [None]:
dataset = datasets.ImageFolder(image_path, transform = transform)
dataloader = torch.utils.data.DataLoader(dataset, num_workers = 2, batch_size = 16)
sample = next(iter(dataloader))[0].numpy() # get one image
batch.shape

In [None]:
sample = sample[0]

### Sampling one image and getting the depth

In [None]:
# image = Image.open(sample)

image_processor = AutoImageProcessor.from_pretrained("LiheYoung/depth-anything-small-hf")
model = AutoModelForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf")

# prepare image for the model
inputs = image_processor(images=sample, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
    predicted_depth = outputs.predicted_depth

# interpolate to original size 
prediction = torch.nn.functional.interpolate(
    predicted_depth.unsqueeze(1),
    size=[sample.shape[1], sample.shape[2]],
    mode="bicubic",
    align_corners=False,
)
# visualize the prediction
output = prediction.squeeze().cpu().numpy()
formatted = (output * 255 / np.max(output)).astype("uint8")
depth = Image.fromarray(formatted)
depth

We have a depth map! But it does not say much. Let's process a batch, and compare the images and the found depth map with the ground truth. Below is code from Madhu's test file that I plan to alter to fit our image files -> should not take too long. 
Then I will process the whole dataset and get some metrics. 

In [None]:
def min_max_normalize(image):
    # Get the minimum and maximum pixel values
    min_val = np.min(image)
    max_val = np.max(image)

    # Normalize the image
    normalized_image = (image - min_val) / (max_val - min_val)
    return normalized_image

In [None]:

def sample_eiffel_depth(image_path, depth_path, k, model, image_processor):
    # Get a list of all files in the directory
    image_files = os.listdir(image_path)

    # Shuffle the list of image files
    random.shuffle(image_files)

    # Select k random images
    selected_images = image_files[:k]

    # Create a grid to display the images
    fig, axes = plt.subplots(k, 3, figsize=(10, 2*k))

    for i, img_file in enumerate(selected_images):
        # Load the original image
        img_path = os.path.join(image_path, img_file)
        assert os.path.exists(img_path)

        original_image = Image.open(img_path)
        original_image_display = cv2.imread(img_path)

        img_lab = cv2.cvtColor(original_image_display,cv2.COLOR_BGR2Lab)
        l,a,b = cv2.split(img_lab)

        clahe = cv2.createCLAHE(clipLimit=50, tileGridSize=(8, 8))
        lumen_clahe = clahe.apply(l)
        equ = cv2.equalizeHist(l)

        updated_lab_img = cv2.merge((lumen_clahe,a,b))

        original_image_display = cv2.cvtColor(updated_lab_img,cv2.COLOR_LAB2LBGR)

        # Prepare image for the model
        inputs = image_processor(images=original_image, return_tensors="pt")

        with torch.no_grad():
            # Forward pass through the model
            outputs = model(**inputs)
            predicted_depth = outputs.predicted_depth

        # Interpolate to original size
        prediction = torch.nn.functional.interpolate(
            predicted_depth.unsqueeze(1),
            size=[np.array(original_image).shape[0], np.array(original_image).shape[1]],
            mode="bicubic",
            align_corners=False,
        )

        # Convert depth prediction to numpy array
        depth_output = prediction.squeeze().cpu().numpy()

        # Plot original image
        axes[i, 0].imshow(original_image)
        axes[i, 0].axis('off')
        axes[i, 0].set_title(f'Image{i}',)

        # Plot depth output
        axes[i, 1].imshow(depth_output,cmap = 'plasma')
        axes[i, 1].axis('off')
        axes[i, 1].set_title(f'Depth{i}')

        # Load ground truth depth from TIFF file
        depth_file = 'depth_' + img_file
        depths = os.path.join(depth_path, depth_file)
        depth_image = Image.open(depths)
        depth_array = np.array(depth_image)
        depth_normalized = min_max_normalize(depth_array)


        # Plot ground truth depth
        axes[i, 2].imshow(depth_normalized, cmap='plasma_r')
        axes[i, 2].axis('off')
        axes[i, 2].set_title(f'Ground Truth Depth {i}')
    plt.show()


In [None]:
image_path = 'eiffel/2020/images/'
depth_path = 'eiffel/2020/depth/dense/depth'
image_processor = AutoImageProcessor.from_pretrained("LiheYoung/depth-anything-small-hf")
model = AutoModelForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf")
sample_eiffel_depth(image_path, depth_path, 10, model, image_processor)

# Baseline Results -> No training

We can implement some basic metrics for getting baseline results for the entire dataset. DepthAnything used these metrics.

In [None]:
    def scale_offset(y_pred, y_true):
        scale_factor = np.mean(y_pred) / np.mean(y_true)

        # Adjust the second depth map by the scale factor
        true_scaled = y_pred * scale_factor

         # Calculate the offset
        offset = np.mean(y_pred) - (scale_factor * np.mean(y_true))

        # Adjust the second depth map by the offset
        true_adjusted = true_scaled + offset

        val_min = y_pred.min()
        val_range = y_pred.max() - val_min + 1e-7

        pred_normed = (y_pred - val_min) / val_range

        # apply identical normalization to the denoised image (important!)
        true_adjusted_normed = (true_adjusted - val_min) / val_range

        return pred_normed, true_adjusted_normed

    def absolute_relative_error(y_pred, y_true):
        """
        Calculate the Absolute Relative Error (MARE).

        Parameters:
        y_pred : torch.Tensor
            Predicted depth values.
        y_true : torch.Tensor
            Ground truth depth values.

        Returns:
        float
        Absolute Relative Error (MARE).
        """
        y_pred, y_true = scale_offset(y_pred, y_true)
        # mask = y_true == 0
        # y_true[mask] = 1 
        absolute_relative_error = np.abs(y_pred - y_true) / y_true

        return np.mean(absolute_relative_error)
    
    def root_mean_squared_error(y_pred, y_true, log = False): 
        
        y_pred, y_true = scale_offset(y_pred, y_true)
        if log:
            mask = y_pred > 0.00001
            y_pred = y_pred[mask]
            y_true = y_true[mask]
            y_pred = np.log(y_pred)
            y_true = np.log(y_true)
        mse = np.mean((y_pred - y_true)**2)
        rmse = np.sqrt(mse)
        return rmse

    def delta1_metric(y_pred, y_true, threshold=1.25):
        """
        Calculate the δ1 metric for monocular depth estimation.

        Parameters:
        y_pred : torch.Tensor
            Predicted depth values.
        y_true : torch.Tensor
            Ground truth depth values.
        threshold : float, optional
            Threshold for considering a pixel as correctly estimated (default is 1.25).

        Returns:
        float
            Percentage of pixels for which max(d*/d, d/d*) < threshold.
            
        """
        y_pred, y_true = scale_offset(y_pred.numpy(), y_true.numpy())
        y_pred = torch.from_numpy(y_pred)
        y_true = torch.from_numpy(y_true)
        # Compute element-wise ratios
        ratio_1 = y_true / (y_pred + 1e-7)  # Adding epsilon to avoid division by zero
        ratio_2 = (y_pred + 1e-7) / y_true  # Adding epsilon to avoid division by zero
        
        # Calculate element-wise maximum ratio
        max_ratio = torch.max(ratio_1, ratio_2)
        
        # Count the number of pixels where max_ratio < threshold
        num_correct_pixels = torch.sum(max_ratio < threshold).item()
        
        # Calculate the percentage of pixels satisfying the condition
        total_pixels = y_true.numel()
        percentage_correct = (num_correct_pixels / total_pixels) * 100.0
        
        return percentage_correct
    
    def si_log(y_pred, y_true):
        """
        Calculate the Scale Invarient error that takes into account the global scale of a scene. 
        This metric is sensitive to the relationships between points in the scene, 
        irrespective of the absolute global scale.

        Parameters:
        y_pred : torch.Tensor
            Predicted depth values.
        y_true : torch.Tensor
            Ground truth depth values.
    
        Returns:
        float
            SI Error
            
        """
        bs = y_pred.shape[0]

        y_pred = torch.reshape(y_pred, (bs, -1))
        y_true = torch.reshape(y_true, (bs, -1))

        mask = y_true > 0  # 0=missing y_true
        num_vals = mask.numel()

        log_diff = torch.zeros_like(y_pred)
        log_diff[mask] = torch.log(y_pred[mask]) - torch.log(y_true[mask])
        
        si_log_unscaled = torch.sum(log_diff**2, dim=1) / num_vals - (torch.sum(log_diff, dim=1)**2) / (num_vals**2)
        si_log_score = torch.sqrt(si_log_unscaled) * 100
        
        si_log_score = torch.mean(si_log_score)
        return si_log_score

Let us now get the depths for the whole dataset, and compute baseline results for MAE and delta1. 

In [None]:
class DepthEstimationDataset(torch.utils.data.Dataset):
    def __init__(self, data_dir, label_dir, transforms=None):
        self.data_dir = data_dir
        self.transforms = transforms
        self.image_files = []
        self.mask_files = []
        
        for root, _, files in os.walk(data_dir):
            for file in files:
                if file.endswith('.jpg') or file.endswith('.png'):
                    image_file = os.path.join(root, file)
                    self.image_files.append(image_file)
                    
        for root, _, files in os.walk(label_dir):
            for file in files:
                if file.endswith('.jpg') or file.endswith('.png'):
                    mask_file = os.path.join(root, file)
                    self.mask_files.append(mask_file)

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, index):
        image_file = self.image_files[index]
        mask_file = self.mask_files[index]
        
        image = Image.open(image_file).convert('RGB')
        mask = Image.open(mask_file)
        
        if self.transforms:
            image = self.transforms(image)
            mask = self.transforms(mask)
        
        return image, mask

# Define the data transformations
data_transforms = transforms.Compose([
    #transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Resize((1080, 1920))
    #transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Load the data
train_data = DepthEstimationDataset('eiffel/2020/images/','eiffel/2020/depth/dense/depth', transforms=data_transforms)
# test_data = SegmentationDataset('eiffel/2020/depth/dense/depth', transforms=data_transforms)

# Create data loaders
batch_size = 16
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
# test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [None]:
#https://huggingface.co/docs/transformers/main/en/preprocessing#computer-vision
def evaluate_model(model, dataloader):
    model.eval()
    num_batches = len(dataloader)
    absRel = 0
    rmse = 0
    rmseLog = 0
    delta1 = 0
    si_error = 0
    total = 0
    # Wrap the dataloader with tqdm for progress tracking
    dataloader = tqdm(dataloader, total=num_batches, desc="Evaluation")
    
    for data, labels in dataloader:
        inputs = image_processor(images=data, return_tensors="pt", do_rescale= False).to(device)
        
        
        # no training therefore no calculation of gradients
        with torch.no_grad():
            outputs = model(**inputs)
            predicted_depth = outputs.predicted_depth
            # interpolate to original size
            prediction = torch.nn.functional.interpolate(
                predicted_depth.unsqueeze(1),
                size=[data.shape[2], data.shape[3]],
                mode="bicubic",
                align_corners=False,
            )

            # Convert depth prediction to numpy array and resize to match ground truth depth map size
            depth_output = prediction.squeeze().cpu().numpy()
            labels = labels.squeeze().cpu().numpy()

            # Handle invalid or unexpected depth values
            mask = labels == 0. 
            depth_output[mask] = 0.  # Replace negative or zero values with a small epsilon 
            
            # Calculate metrics
            absRel += np.sum(absolute_relative_error(depth_output, labels))
            
            rmse += np.sum(root_mean_squared_error(depth_output, labels))
            rmseLog += np.sum(root_mean_squared_error(depth_output, labels, log = True))
            
            out_t = torch.from_numpy(depth_output)
            labels_t = torch.from_numpy(labels)
            
            delta1 += np.sum(delta1_metric(out_t, labels_t))
            # si_error += si_log(out_t, labels_t)

            total += data.size(0)
            # Update tqdm progress bar
            dataloader.set_postfix({'absRel': absRel/total, 'RMSE': rmse/total, 'rmseLog': rmseLog/total,
                                    'delta':delta1/total, 'siLog': si_error/total})
        
    total_absRel = absRel/total
    total_rmse = rmse/total
    total_rmseLog = rmseLog/total
    total_delta = delta1/total
    total_si = si_error/total
    
    return total_absRel, total_rmse, total_rmseLog, total_delta, total_si
    
image_processor = AutoImageProcessor.from_pretrained("LiheYoung/depth-anything-small-hf")
model = AutoModelForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf")   
device = 'cuda' if torch.cuda.is_available() else 'cpu' 
model.to(device)
result = evaluate_model(model, train_loader)
result

Delta1 is better when it is close to one.
MAE and RMSE are better when they are close to 0. 
We have a good RMSE, an average AbsRel and a poor Delta1. 
What can we say about these baselines? 

In [None]:
batch_size = 4
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
image_processor = AutoImageProcessor.from_pretrained("LiheYoung/depth-anything-large-hf")
model = AutoModelForDepthEstimation.from_pretrained("LiheYoung/depth-anything-large-hf")   
device = 'cuda' if torch.cuda.is_available() else 'cpu' 
model.to(device)
result_large = evaluate_model(model, train_loader)


In [None]:
result_large

### Summarising the results

In [None]:
# import pandas to use pandas DataFrame
import pandas as pd

small = ("DepthAnything-Small", result[0], result[1], result[2], result[3])
large = ("DepthAnything-Large", result_large[0], result_large[1], result_large[2], result_large[3])

# data in the form of list of tuples
data = [small, large]
 
# create DataFrame using data
df = pd.DataFrame(data, columns =['Model', 'AbsRel', 'RMSE', 'RMSE Log', 'Delta'])
df

### Training

In [None]:
!python PEFT_training/run_training.py


### Testing preprocessing 

In [None]:
image_processor = AutoImageProcessor.from_pretrained("LiheYoung/depth-anything-small-hf")
model = AutoModelForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf")
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
image, label = train_data[5]
label = label.numpy().astype(np.uint8)
label = label.transpose(1, 2, 0)  # Convert to shape (1080, 1920, 3)
label.shape

In [None]:

# Convert the image to grayscale
# gray = cv2.cvtColor(label, cv2.COLOR_BGR2GRAY)

# Apply Canny edge detection
edges_original = cv2.Canny(label, threshold1=300, threshold2=450)  # Adjust thresholds as needed
# Apply Gaussian blur to the detected edges
blurred_edges = cv2.GaussianBlur(edges_original, (5, 5), 0) 
inputs = image_processor(images=image, return_tensors="pt", do_rescale = False)

with torch.no_grad():
    # Forward pass through the model
    outputs = model(**inputs)
    predicted_depth = outputs.predicted_depth

# Interpolate to original size
prediction = torch.nn.functional.interpolate(
    predicted_depth.unsqueeze(1),
    size=[np.array(image).shape[1], np.array(image).shape[2]],
    mode="bicubic",
    align_corners=False,
)

# Convert depth prediction to numpy array
depth_output = prediction.squeeze().cpu().numpy()
# # Visualize the results

fig, axes = plt.subplots(4, 1, figsize=(15, 15))

axes[0].imshow(edges_original, cmap='gray')
axes[0].set_title('Detected edges on label')
axes[0].axis('off')

axes[1].imshow(label, cmap='plasma_r')
axes[1].set_title('Label')
axes[1].axis('off')

axes[2].imshow(blurred_edges, cmap='gray')
axes[2].set_title('Blurred Edges Label')
axes[2].axis('off')

axes[3].imshow(depth_output, cmap='plasma_r')
axes[3].set_title('DepthAnything output')
axes[3].axis('off')

plt.show()

In [None]:
root_mean_squared_error(depth_output, label)

In [None]:
root_mean_squared_error(depth_output, blurred_edges)

Okay we get an improved result. Let's try this. 

In [None]:

def preprocess_labels(batch_labels):
    preprocessed_batch = []
    
    for labels in batch_labels:
        # Prepare labels
        labels = labels.numpy().astype(np.uint8)
        labels = labels.transpose(1, 2, 0)
        
        # Apply Canny edge detection
        edges = cv2.Canny(labels, threshold1=300, threshold2=450)  # Adjust thresholds as needed
        
        # Apply Gaussian blur to the detected edges
        blurred = cv2.GaussianBlur(edges, (5, 5), 0)
        
        # Add the preprocessed image to the batch list
        preprocessed_batch.append(blurred)

    return torch.Tensor(np.array(preprocessed_batch))

In [None]:
#https://huggingface.co/docs/transformers/main/en/preprocessing#computer-vision
def evaluate_model(model, dataloader):
    model.eval()
    num_batches = len(dataloader)
    absRel = 0
    rmse = 0
    rmseLog = 0
    delta1 = 0
    si_error = 0
    total = 0
    # Wrap the dataloader with tqdm for progress tracking
    dataloader = tqdm(dataloader, total=num_batches, desc="Evaluation")
    
    for data, labels in dataloader:
        labels = preprocess_labels(labels)
        inputs = image_processor(images=data, return_tensors="pt", do_rescale= False).to(device)
        
        # no training therefore no calculation of gradients
        with torch.no_grad():
            outputs = model(**inputs)
            predicted_depth = outputs.predicted_depth
            # interpolate to original size
            prediction = torch.nn.functional.interpolate(
                predicted_depth.unsqueeze(1),
                size=[data.shape[2], data.shape[3]],
                mode="bicubic",
                align_corners=False,
            )

            # Convert depth prediction to numpy array and resize to match ground truth depth map size
            depth_output = prediction.squeeze().cpu().numpy()
            labels = labels.squeeze().cpu().numpy()

            # Handle invalid or unexpected depth values
            mask = labels == 0. 
            depth_output[mask] = 0.  # Replace negative or zero values with a small epsilon 
            
            # Calculate metrics
            absRel += np.sum(absolute_relative_error(depth_output, labels))
            
            rmse += np.sum(root_mean_squared_error(depth_output, labels))
            rmseLog += np.sum(root_mean_squared_error(depth_output, labels, log = True))
            
            out_t = torch.from_numpy(depth_output)
            labels_t = torch.from_numpy(labels)
            
            delta1 += np.sum(delta1_metric(out_t, labels_t))
            si_error += si_log(out_t, labels_t)

            total += data.size(0)
            # Update tqdm progress bar
            dataloader.set_postfix({'absRel': absRel/total, 'RMSE': rmse/total, 'rmseLog': rmseLog/total,
                                    'delta':delta1/total, 'siLog': si_error/total})
        
    total_absRel = absRel/total
    total_rmse = rmse/total
    total_rmseLog = rmseLog/total
    total_delta = delta1/total
    total_si = si_error/total
    
    return total_absRel, total_rmse, total_rmseLog, total_delta, total_si
    
image_processor = AutoImageProcessor.from_pretrained("LiheYoung/depth-anything-small-hf")
model = AutoModelForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf")   
device = 'cuda' if torch.cuda.is_available() else 'cpu' 
model.to(device)
result = evaluate_model(model, train_loader)
result

In [None]:
image_processor = AutoImageProcessor.from_pretrained("LiheYoung/depth-anything-large-hf")
model = AutoModelForDepthEstimation.from_pretrained("LiheYoung/depth-anything-large-hf")   
device = 'cuda' if torch.cuda.is_available() else 'cpu' 
model.to(device)
result_large = evaluate_model(model, train_loader)
result_large

### Training again with the changes -> experiment 2, 3


In [15]:
!python PEFT_training/run_training.py 

Length of Dataset: 1610
trainable params: 599040 || all params: 25374913 || trainable%: 2.36
[34m[1mwandb[0m: Currently logged in as: [33mcamilla-james[0m ([33mdolphins[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.17.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/home/mundus/cjames706/underwater_depth_estimation/wandb/run-20240525_201611-2q439v0d[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mTesting with 20 epochs, EiffelTowerDataset lr: 0.0001, warmup: 40, optim: AdamW[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/dolphins/DepthUnderwater_training[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/dolphins/DepthUnderwater_training/runs/2q439v0d[0m
EPOCH 1:
  batch 12 los