# Notebook for developing the QC pipeline
## Setup
### User parameters

In [5]:
# Name of the trained QC model; the model-import cell is meant to load "models/<model_name>.pt".
model_name = "qualityControlV1"

### Imports

In [4]:
import os, sys, pathlib, time, random
# Move working directory to the root of the project
os.chdir("/home/ucloud/EUMothModel")
print("Working directory:", os.getcwd())

# __package__ = ".."
# sys.path.append(os.path.abspath(__package__))

# from rclonemountpy.utils.mount import *
from utils.implicit_mount import *
from utils.dataloader import *

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm

Working directory: /home/ucloud/EUMothModel


In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("Device:", device)

# Reduced-precision dtype that the model and its inputs are cast to further down.
dtype = torch.bfloat16

### CUDA detection & datatype selection

### Mount

In [3]:
# Throughput benchmark: iterate over the remote files and show the cumulative
# size of the fetched local copies, the elapsed time and the resulting speed
# in the progress-bar description.
with IOHandler(verbose = False) as mount:
    mount.cd("AMI_GBIF_Pretraining_Data/root")

    mount.cache_file_index(skip = 1000, nmax = 128*10)

    total_size = 0

    pbar = tqdm(RemotePathIterator(mount, batch_size=128, n_local_files=128*3), total = 128*10, leave = False)
    start_time = time.time()

    batch = []
    for l_file, r_file in pbar:
        batch.append(l_file)
        # Sizes are only accounted for once a full batch of 128 local files
        # has been collected; a trailing partial batch is not counted.
        if len(batch) < 128:
            continue
        total_size += sum(os.path.getsize(file) for file in batch)
        batch = []
        elapsed_time = time.time() - start_time
        # Guard the division so a zero-length interval cannot raise.
        speed = total_size / elapsed_time if elapsed_time > 0 else 0.0
        # Refresh the description once per batch instead of once per file.
        pbar.set_description(f"Total size: {total_size / 1e6:.2f} MB | Elapsed time: {elapsed_time:.2f} s | Speed: {speed / 1e6:.2f} MB/s")

Connected to sftp://asgersvenning%40ecos.au.dk@io.erda.au.dk:2222
Local directory: /tmp/tmpzot0lq87


  0%|          | 0/1280 [00:00<?, ?it/s]

Cleaning up...


In [None]:
# Connection handle created outside a `with` block; the dataset cell below
# passes `backend` to RemotePathIterator, so it presumably has to stay open.
# NOTE(review): nothing visible in this notebook closes this handler - confirm cleanup.
backend = IOHandler(verbose = False)
backend.cd("AMI_GBIF_Pretraining_Data/root")
# Same indexing arguments as the benchmark cell above (skip=1000, nmax=128*10).
backend.cache_file_index(skip = 1000, nmax = 128*10)

### Dataloader

In [None]:
# Pretrained EfficientNetV2-S weights; the transform bundled with them is
# reused as the image preprocessing for the dataset and for `denormalize`.
weights = torchvision.models.EfficientNet_V2_S_Weights.DEFAULT
image_preprocessing = weights.transforms(antialias=True)
def denormalize(tensor, mean=image_preprocessing.mean, std=image_preprocessing.std):
    """Undo per-channel normalization, i.e. compute ``tensor * std + mean``.

    Args:
        tensor: Image batch with the channel axis at dim 1 (the statistics
            are broadcast as shape ``(1, C, 1, 1)``). Any device or dtype.
        mean: Per-channel means; defaults to those of ``image_preprocessing``.
        std: Per-channel standard deviations; defaults to those of
            ``image_preprocessing``.

    Returns:
        A float32 tensor on the CPU with the normalization reversed.
    """
    # The channel count follows the length of `mean`/`std` rather than being
    # fixed to 3, so non-RGB statistics can be passed as well.
    mean = torch.tensor(mean).view(1, -1, 1, 1).to(torch.float32)
    std = torch.tensor(std).view(1, -1, 1, 1).to(torch.float32)
    # Move to CPU and upcast first so the arithmetic runs in float32.
    return tensor.cpu().to(torch.float32) * std + mean

# Iterator over the remote files, reading through the shared `backend` handle.
path_iterator = RemotePathIterator(backend, batch_size=128, n_local_files=128*3)

# Dataset that applies the EfficientNet preprocessing transform.
dataset = RemotePathDataset(remote_path_iterator=path_iterator, transform=image_preprocessing)

# Batched, shuffled loading with four workers.
dataloader = CustomDataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)

In [None]:
# Smoke test: run through the dataloader once and print each batch's shapes.
# The loop variable is `inputs` (not `input`) so the builtin `input()` is not
# shadowed for the rest of the notebook session.
for inputs, label in tqdm(dataloader, leave = True):
    print(inputs.shape, label.shape)

### Model import
The model has been trained on a small hand-annotated dataset. Unfortunately, this is currently quite a messy workflow, since the training is done in Google Colab and not on uCloud, where the rest of the processing will be done. This will be fixed later; it is a consequence of asynchronous software and infrastructure development.

In [None]:
# model: efficientnet_v2_s
model = torchvision.models.efficientnet_v2_s(weights = weights).train(False).half()
num_features = [k for k in [j for j in [i for i in model.children()][0].children()][-1].children()][0].out_channels
num_classes = 3
model.classifier = nn.Sequential(
    nn.Dropout(0.2),
    nn.BatchNorm1d(num_features),
    nn.Linear(num_features, 512),
    nn.BatchNorm1d(512),
    nn.LeakyReLU(),
    nn.Dropout(0.1),
    nn.Linear(512, num_classes),
)
model.load_state_dict(torch.load("models/{model_name}.pt"))
model = model.to(device=device, dtype=dtype)
model.eval()

## Inference

In [None]:
class EmbedEfficientNet(nn.Module):
    """Wrap a model so that it returns flattened features instead of class scores.

    Every direct child module of ``original_model`` except the last one is
    chained into ``self.features``; the wrapped modules are the same objects
    as in ``original_model``, not copies.
    """

    def __init__(self, original_model):
        super().__init__()

        # Keep all top-level layers but the final one (the classifier).
        kept_layers = list(original_model.children())[:-1]
        self.features = nn.Sequential(*kept_layers)

    def forward(self, x):
        """Return the features of ``x`` flattened to shape (batch, -1)."""
        feature_maps = self.features(x)
        batch_size = feature_maps.size(0)
        return feature_maps.view(batch_size, -1)

# Embedding extractor built from `model`'s own layers (all children except the classifier).
embeddings_model = EmbedEfficientNet(model)

In [None]:
# Per-batch results, concatenated into single arrays after the loop.
predictions, images, embeddings = [], [], []

# inference loop
with torch.no_grad():
    for inputs, labels in tqdm(dataloader):
        inputs = inputs.to(device=device, dtype=dtype)
        # Run the backbone once: `embeddings_model` wraps every layer of
        # `model` except the classifier, so applying `model.classifier` to its
        # output gives the same scores as `model(inputs)` without a second
        # pass through the feature extractor.
        features = embeddings_model(inputs)
        scores = model.classifier(features)
        embeddings.append(features.float().cpu().numpy())
        # torch.softmax is numerically stable; a manual exp()/sum() turns into
        # inf/inf = NaN as soon as a logit is large enough to overflow.
        preds = torch.softmax(scores.float(), dim=1)
        predictions.append(preds.cpu().numpy())
        # Store the images in their de-normalized form for plotting.
        images.append(denormalize(inputs).cpu().numpy())

predictions = np.concatenate(predictions)
images = np.concatenate(images)
embeddings = np.concatenate(embeddings)

In [None]:
# Quick inspection: display the first stored image as a flat array of values.
images[0].flatten()

In [None]:
# Plot predictions 
fig, ax = plt.subplots(34, 3, figsize=(15, 100))

for i in range(100):
    this_row = images[i].transpose(1,2,0)
    # clamp
    this_row = np.clip(this_row, 0, 1)
    ax[i//3, i%3].imshow(this_row)
    ax[i//3, i%3].set_title(f"Prediction: {predictions[i][0]:.0%} {predictions[i][1]:.0%} {predictions[i][2]:.0%}")
    ax[i//3, i%3].axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Scale every embedding to unit L2 norm, then scatter-plot the first two
# columns of the U factor returned by torch.pca_lowrank.
embedding_tensor = torch.tensor(embeddings)
row_norms = embedding_tensor.norm(dim=1, keepdim=True)
embeddings_normalized = embedding_tensor / row_norms

projection = torch.pca_lowrank(embeddings_normalized, q=2)[0].numpy()
plt.scatter(projection[:, 0], projection[:, 1])