# Notebook for developing the QC pipeline
## Setup
### User parameters

In [1]:
model_name = "qualityControlV1"

### Imports

In [2]:
import os, sys, pathlib, time, random
# Move working directory to the root of the project
os.chdir("/home/ucloud/EUMothModel")
print("Working directory:", os.getcwd())

# __package__ = ".."
# sys.path.append(os.path.abspath(__package__))

# from rclonemountpy.utils.mount import *
from utils.implicit_mount import *
from utils.dataloader import *

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm

Working directory: /home/ucloud/EUMothModel


In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device:", device)
dtype = torch.bfloat16

Device: cuda:0


### Cuda detection & Datatype selection

### Mount

In [4]:
backend = None

In [5]:
if backend is not None:
    backend.stop()
backend = IOHandler(verbose = False, clean=True)
backend.start()
backend.cd("AMI_GBIF_Pretraining_Data/root")
backend.cache_file_index()

Connected to sftp://asgersvenning%40ecos.au.dk:@io.erda.au.dk:2222
Local directory: /tmp/tmpo7y7374m
IOHandler.start() is unsafe. Use IOHandler.__enter__() instead if possible.
OBS: Remember to call IOHandler.stop() when you are done.


### Dataloader

In [6]:
weights = torchvision.models.EfficientNet_V2_S_Weights.DEFAULT
image_preprocessing = weights.transforms(antialias=True)
def denormalize(tensor, mean=image_preprocessing.mean, std=image_preprocessing.std):
    """Denormalize a tensor."""
    mean = torch.tensor(mean).view(1, 3, 1, 1).to(torch.float32)
    std = torch.tensor(std).view(1, 3, 1, 1).to(torch.float32)
    return tensor.cpu().to(torch.float32) * std + mean

dataset = RemotePathDataset(
    remote_path_iterator=RemotePathIterator(
        backend,
        batch_size=128,
        max_queued_batches=5,
        n_local_files=2*128*5
    ),
    prefetch=4*32,
    transform=image_preprocessing,
    device=device,
    dtype=dtype,
    return_remote_path=True
)
dataloader = CustomDataLoader(dataset=dataset, batch_size=32, shuffle=True, num_workers=4)

### Model import
The model has been trained on a small hand-annotated dataset. Unfortunately this is currently a quite messy workflow, since it is done it Google Colab and not on uCloud, where the rest of the processing will be done. This will be fixed later and is an issue of asyncronous software and infrastructure development.

Once images have passed QC the images can be moved to an appropriate directory, while maintaining the proper subdirectory structure with:
```lftp
mget -d -O sftp://io.erda.au.dk:/AMI_GBIF_Pretraining_Data/NEW_SUPER_DIRECTORY ./PATH_TO_IMAGE
```

In [7]:
QC_dict = {
    0 : "Larvae",
    1 : "Low",
    2 : "High"
}

# model: efficientnet_v2_s
model = torchvision.models.efficientnet_v2_s(weights = weights).train(False).half()
num_features = [k for k in [j for j in [i for i in model.children()][0].children()][-1].children()][0].out_channels
num_classes = 3
model.classifier = nn.Sequential(
    nn.Dropout(0.2),
    nn.BatchNorm1d(num_features),
    nn.Linear(num_features, 512),
    nn.BatchNorm1d(512),
    nn.LeakyReLU(),
    nn.Dropout(0.1),
    nn.Linear(512, num_classes),
)
model.load_state_dict(torch.load(f"models/{model_name}.pt"))
model = model.to(device=device, dtype=dtype)
model = model.eval()
pass

## Creating an inference test collage

In [None]:
# many_inputs = []

# for i, (input, label, path) in tqdm(enumerate(dataloader), leave = True):
#     many_inputs.append(input)
#     if i == 10:
#         break

In [None]:
# print("Preparing")

# probs_max_l = []
# inf_cls_l = []
# inf_cls_name_l = []

# torch.cuda.empty_cache()

# print("Inference started...")

# with torch.no_grad():
#     for input in tqdm(many_inputs, leave = False):
#         logits = model(input)
#         probs = F.softmax(logits, dim=1)

#         inf_cls, probs_max = probs.argmax(dim=1), probs.max(dim=1).values
#         inf_cls, probs_max = inf_cls.cpu(), probs_max.cpu().float()

#         inf_cls_name = [QC_dict[i.item()] for i in inf_cls]

#         probs_max_l.append(probs_max)
#         inf_cls_l.append(inf_cls)
#         inf_cls_name_l.append(inf_cls_name)

# probs_max = torch.cat(probs_max_l, dim=0)
# inf_cls = torch.cat(inf_cls_l, dim=0)
# inf_cls_name = [i for j in inf_cls_name_l for i in j]

# print("Inference done!")

# ## Subset of 900 random images from the dataset and sort them by their predicted class
# # Generate 900 random indices, ensuring they're within the bounds of the dataset
# random_indices = torch.randperm(len(inf_cls))[:900]

# # Sort the random subset by the predicted class
# idx_sort_subset = torch.argsort(inf_cls[random_indices])

# # Map sorted indices back to the original dataset
# idx_sort = random_indices[idx_sort_subset]

# imgs = torch.concat([i.cpu() for i in many_inputs], dim=0)[idx_sort]
# inf_cls_name = [inf_cls_name[i] for i in idx_sort]
# probs_max = probs_max[idx_sort]

# print("Plotting...")

# # Plot image grid of "input" tensor with the predicted class and probability above each image
# grid = torchvision.utils.make_grid(denormalize(imgs), nrow=30).permute(1, 2, 0).numpy()

# fig, ax = plt.subplots(figsize=(100, 100))
# ax.imshow(grid)
# ax.set_xticks([])
# ax.set_yticks([])

# for i, (cls, prob) in enumerate(zip(inf_cls_name, probs_max)):
#     ax.text(
#         i % 30 * 384 + 384 // 2,
#         i // 30 * 384 + 384 // 2,
#         f"{cls} ({prob*100:.1f}%)",
#         color="black",
#         fontsize=16,
#         # Background color; semi-transparent white
#         fontdict={"family": "monospace", "weight": "bold"},
#         bbox=dict(facecolor="white", alpha=0.5, pad=4, edgecolor="none"),
#         horizontalalignment="center",
#         verticalalignment="center",
#     )

# print("Plotting done!")

# print("Rendering...")

# plt.show()

# print("Rendering done!")

In [None]:
# del logits, probs, inf_cls, probs_max, inf_cls_name, idx_sort, idx_sort_subset, random_indices, grid, fig, ax
# del many_inputs, imgs, input

In [8]:
torch.cuda.empty_cache()
mem_info = [i / 10**9 for i in torch.cuda.mem_get_info(device)]
"Memory usage: {:.2f} / {:.2f} GB".format(*mem_info)

'Memory usage: 15.08 / 15.65 GB'

In [9]:
def move_to_dir(paths, dst_dir, client: "IOHandler", verbose=False):
    # Template: mget -d -O sftp://io.erda.au.dk:/AMI_GBIF_Pretraining_Data/NEW_SUPER_DIRECTORY ./PATH_TO_IMAGES
    # The template above is used to copy a file from one location on the remote server to another location on the remote server,
    # while preserving the directory structure of the source location and automatically creating the necessary subdirectories in the destination location.

    cmd_prefix = "mget -d -O sftp://io.erda.au.dk:/AMI_GBIF_Pretraining_Data/"

    cmd_suffix = []
    for path in paths:
        cmd_suffix.append(f"./{path}")
    
    cmd_suffix = " ".join(cmd_suffix)

    cmd = f"{cmd_prefix}{dst_dir} {cmd_suffix}"
    if verbose:
        print("Executing command:", cmd)
    result = client.execute_command(cmd, blocking=True, execute=True)
    if verbose:
        print("Result:", result)


In [10]:
# Create a log file to ensure that the process can be restored if it is interrupted or an error occurs
assert os.path.isdir("logs"), RuntimeError("Log directory does not exist")

# Manual way
log_files = os.listdir("logs")
log_files = [i for i in log_files if i.endswith(".log")]
log_files = [i for i in log_files if i.startswith(model_name)]
this_log_file = f"{model_name}_{len(log_files)}.log"


with open(f"logs/{this_log_file}", "w") as f:
    f.write("")
    for i, (input, label, path) in tqdm(enumerate(dataloader), leave = True, total=len(dataloader)):
        try:
            pred = model(input).argmax(dim=1).cpu()
            pcls = [QC_dict[i.item()] for i in pred]
            NOT_Larvae = [p for p, c in zip(path, pcls) if c != "Larvae"]
            if len(NOT_Larvae) > 0:
                move_to_dir(NOT_Larvae, "without_larvae", backend)
            if i % 100 == 0:
                f.write(f"Batch {i} completed. {len(NOT_Larvae)} images moved to 'without_larvae'.\n")
        except Exception as e:
            print(f"Error at batch {i}: {e}")
            f.write(f"Error at batch {i}: {e}\n")

Buffer min fill: 51, max fill: 76
Producer thread started.
Consumer thread started.
Consumer thread started.
Consumer thread started.
Consumer thread started.


  0%|          | 0/80309 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Inference