<a href="https://colab.research.google.com/github/cam2149/MachineLearningIV/blob/main/Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Authenticate with Kaggle through kagglehub before any download is attempted.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the competition data through kagglehub.
# NOTE(review): the returned value (presumably the local download path) is
# not referenced anywhere else in the visible notebook; config["DATA_DIR"]
# is hard-coded instead. Confirm both point to the same location.
aa_iv_2025_i_object_localization_path = kagglehub.competition_download('aa-iv-2025-i-object-localization')

print('Data source import complete.')


In [None]:
!pip install torchsummary
!pip install -U albumentations

# Imports

In [None]:
import numpy as np
import pandas as pd
import time
import copy

import torch
from torch import nn
from torch import Tensor
from torch.optim import Optimizer
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from multiprocessing import cpu_count

import typing as ty
from numpy.typing import NDArray
import os
import os.path as osp

import albumentations as A
import torchvision
from skimage import io, transform
import cv2
from PIL import Image

from functools import reduce
from torchsummary import summary

import matplotlib.pyplot as plt

# Config

In [None]:
# Global notebook configuration shared by the data-loading and submission cells.
config = {
    # Directory holding the CSV files and the image folder.
    "DATA_DIR": "/kaggle/input/aa-iv-2025-i-object-localization/",
    "WORK_DIR": "/kaggle/working",
    # Image folder, relative to DATA_DIR.
    "IMG_DIR": "images/images",
    # Training annotations file, relative to DATA_DIR.
    "TRAIN_CSV": "train.csv",
    # Class name -> integer id, and its inverse (used to build the submission).
    "obj2id": {"f16": 0, "cougar": 1, "chinook": 2, "ah64": 3, "f15": 4, "seahawk": 5},
    "id2obj": {0: "f16", 1: "cougar", 2: "chinook", 3: "ah64", 4: "f15", 5: "seahawk"},
    # Divisors used to normalise bounding boxes in read_train_csv and to
    # scale the predictions back in the submission cells.
    "h_real": 720,
    "w_real": 1280,
    # NOTE(review): the keys below are not read by any code visible in this
    # notebook (the training cell uses its own w/h/c values).
    "channel": 3,
    "w_resize": 416,
    "h_resize": 416,
    "grayscale": False,
}

In [None]:
# Seed torch and pick the compute device (GPU when available, else CPU).
torch.manual_seed(32)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device}')
# Smoke test: allocate a small tensor on the chosen device, then release it.
test = torch.ones((100, 100)).to(device)
del test
torch.cuda.empty_cache()

# Random seed

In [None]:
def reset_seed(random_seed: int = 42):
    """
    Seed the torch and numpy random generators for reproducible runs.

    Args:
    random_seed: Seed applied to torch (CPU and, when available, CUDA) and
        to numpy's global generator. Defaults to 42.

    When CUDA is available, cuDNN is additionally switched to deterministic
    mode and its auto-tuner (benchmark) is disabled.
    """
    torch.backends.cudnn.enabled = True
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)

    if torch.cuda.is_available():
        # Use the same seed as above instead of a separate hard-coded value.
        torch.cuda.manual_seed(random_seed)
        torch.cuda.manual_seed_all(random_seed)

        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

reset_seed()

# Function to read CSV

In [None]:
def read_train_csv(cfg=None):
    """
    Read the train csv file and return the dataframe with the necessary columns.

    Args:
    cfg: Optional configuration mapping with the keys "DATA_DIR",
        "TRAIN_CSV", "obj2id", "h_real" and "w_real". When omitted, the
        module-level ``config`` dictionary is used.

    Returns:
    df: Dataframe with the columns filename, xmin, ymin, xmax, ymax and
        class_id. Box coordinates are divided by the configured image width
        ("w_real") / height ("h_real"); class names missing from "obj2id"
        map to NaN.

    usage:
    df = read_train_csv()
    df = read_train_csv(config)
    """
    if cfg is None:
        cfg = config

    df = pd.read_csv(osp.join(cfg["DATA_DIR"], cfg["TRAIN_CSV"]))
    df["class_id"] = df["class"].map(cfg["obj2id"])
    columns_f = ["filename", "xmin", "ymin", "xmax", "ymax", "class_id"]
    # Copy the column selection so the assignments below write to an
    # independent frame (avoids pandas' SettingWithCopyWarning).
    df = df[columns_f].copy()
    df[["ymin", "ymax"]] = df[["ymin", "ymax"]] / cfg["h_real"]
    df[["xmin", "xmax"]] = df[["xmin", "xmax"]] / cfg["w_real"]
    return df

# Function to get the mean and standard deviation of the channels

In [None]:
def get_channels_std(ds):
    """
    Average the per-image channel means and standard deviations over a dataset.

    Arguments:
        ds: iterable of samples; each sample is a mapping whose "image" entry
            is an array with the channels on the last axis (the first three
            channels are used).

    Returns:
        (means, stds): two arrays of length 3. Each statistic is computed per
        image and then averaged over the images, i.e. the std is the mean of
        the per-image stds rather than the std of all pixels pooled together.
    """
    n_channels = 3
    mean_acc = np.zeros(n_channels)
    std_acc = np.zeros(n_channels)
    count = 0

    for count, sample in enumerate(ds, start=1):
        # Work in float32 so the statistics are not computed on integer pixels.
        pixels = sample["image"].astype(np.float32)

        for c in range(n_channels):
            plane = pixels[..., c]
            mean_acc[c] += np.mean(plane)
            std_acc[c] += np.std(plane)

    # Turn the running sums into averages over the number of images seen.
    mean_acc /= count
    std_acc /= count

    return mean_acc, std_acc

# Transforms Functions

In [None]:
class ToTensor:
    """
    Turn the sample's image ndarray into a float torch tensor.

    Arguments:
        sample: a dictionary containing:
            image: sample image in format (H, W, C)
    Returns:
        the same dictionary, with the image replaced by a float tensor in
        (C, H, W) format. Other entries are left untouched.
    """

    def __call__(self, sample):
        hwc_image = sample["image"]

        # numpy images are H x W x C (axes 0, 1, 2); torch expects C x H x W.
        chw_tensor = torch.from_numpy(hwc_image.transpose(2, 0, 1))
        sample["image"] = chw_tensor.float()
        return sample


class Normalizer:
    """
    Standardise each image channel: subtract its mean, divide by its std.
    """

    def __init__(self, stds, means):
        """
        Arguments:

            stds: array of length 3 containing the standard deviation of each channel in RGB order.
            means: array of length 3 containing the means of each channel in RGB order.
        """
        self.stds = stds
        self.means = means

    def __call__(self, sample):
        """
        Sample: a dictionary containing:
            image: sample image in format (C, H, W)
        Returns:
            the same dictionary; the first three channels of the image are
            normalized in place, the layout stays (C, H, W).
        """
        img = sample["image"]

        for c in range(3):
            mu, sigma = self.means[c], self.stds[c]
            img[c] = (img[c] - mu) / sigma

        sample["image"] = img
        return sample


class AlbumentationsWrapper:
    """
    Adapter that applies an albumentations-style transform to a sample dict.

    Arguments:
        transform: a callable taking ``image`` and ``bboxes`` keyword
            arguments and returning a mapping with "image" and "bboxes".

    Returns:
        the sample with "image" replaced by the transformed image and "bbox"
        replaced by the transformed boxes as a numpy array. Class ids are not
        forwarded to the transform.
    """

    def __init__(self, transform):
        self.transform = transform

    def __call__(self, sample):
        result = self.transform(image=sample["image"], bboxes=sample["bbox"])
        sample["image"] = result["image"]
        # The transform returns a plain sequence of boxes; store it as an array.
        sample["bbox"] = np.array(result["bboxes"])
        return sample


def common_transforms(means, stds):
    """
    Build the transformations shared by every dataset split.

    Arguments:
        means: array of length 3 containing the means of each channel in RGB order.
        stds: array of length 3 containing the standard deviation of each channel in RGB order.
    Returns:
        a two-element list: tensor conversion followed by channel normalization.
    """
    to_tensor = ToTensor()
    normalizer = Normalizer(stds=stds, means=means)
    return [to_tensor, normalizer]


class TransformComposed:
    """
    Compose a list of transformations.

    Arguments:
        means: array of length 3 containing the means of each channel in RGB order.
        stds: array of length 3 containing the standard deviation of each channel in RGB order.

    Returns:
        a composed transformation.
    """

    def __init__(self, means, stds):
        self.means = means
        self.stds = stds

    def getTransform(self, transforms=None):
        """
        Build the composed pipeline.

        Arguments:
            transforms: optional iterable of albumentations-style transforms;
                each one is wrapped in AlbumentationsWrapper and applied
                before the common transforms. ``None`` (the default) means
                no extra transforms.

        Returns:
            a torchvision ``Compose`` of the wrapped transforms followed by
            the common transforms built from ``means`` and ``stds``.
        """
        # A None default avoids sharing one mutable list across calls.
        extra = [] if transforms is None else list(transforms)
        return torchvision.transforms.Compose(
            [AlbumentationsWrapper(t) for t in extra]
            + common_transforms(self.means, self.stds)
        )

# Required functions to train a pytorch model

## Class to load dataset

In [None]:
# A sample is a mapping from field name ("image", "bbox", ...) to a float array.
# np.float64 is used because the np.float_ alias was removed in NumPy 2.0.
transform_func_inp_signature = ty.Dict[str, NDArray[np.float64]]
# A transform takes a sample and returns a sample.
transform_func_signature = ty.Callable[
    [transform_func_inp_signature],
    transform_func_inp_signature
]

class militarDataset(Dataset):
    """
    Location image dataset.

    Each item is a dictionary with the key "image" and, for labeled data,
    "bbox" (array of shape (1, 4)) and "class_id" (array of shape (1,)).
    """
    def __init__(
        self,
        df: pd.DataFrame,
        root_dir: str,
        labeled: bool = True,
        transform: ty.Optional[transform_func_signature] = None,
        output_size: ty.Optional[tuple] = None
    ) -> None:
        """
        Arguments:
            df: dataframe with a ``filename`` column; when ``labeled`` is
                True it also needs ``class_id`` and the four box coordinates
                in column positions 1 to 4.
            root_dir: directory containing the image files.
            labeled: whether to read the bounding box and class id.
            transform: optional callable applied to the sample dictionary.
            output_size: optional size passed to ``cv2.resize`` as ``dsize``,
                i.e. (width, height).
        """
        self.df = df
        self.root_dir = root_dir
        self.transform = transform
        self.labeled = labeled
        self.output_size = output_size

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx: int) -> ty.Dict[str, ty.Any]:
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Read image
        img_name = os.path.join(self.root_dir, self.df.filename.iloc[idx])
        image = io.imread(img_name)

        if image is None:
            raise FileNotFoundError(f"Image not found: {img_name}")

        if image.ndim == 2:
            # Single-channel image: replicate it to three channels. The
            # GRAY2RGB code is required here; BGR2RGB rejects 2-D input.
            image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
        elif image.shape[2] == 4:
            # RGBA image: drop the alpha channel.
            image = image[:, :, :3]

        # Resize the image when an output size was requested.
        if self.output_size:
            image = cv2.resize(image, self.output_size)

        sample = {'image': image}

        if self.labeled:
            # Read labels. The box is taken by position (columns 1..4), which
            # matches the column order produced by read_train_csv.
            img_class = self.df.class_id.iloc[idx]
            img_bbox = self.df.iloc[idx, 1:5]

            img_bbox = np.array([img_bbox]).astype('float')
            img_class = np.array([img_class]).astype('int')
            sample.update({'bbox': img_bbox, 'class_id': img_class})

        if self.transform:
            sample = self.transform(sample)

        return sample

## Metrics

In [None]:
def iou(y_true: Tensor, y_pred: Tensor):
    """
    Mean intersection-over-union between matching true and predicted boxes.

    Arguments:
        y_true: ground-truth boxes, any shape that flattens to (B, 4), with
            coordinates ordered (xmin, ymin, xmax, ymax).
        y_pred: predicted boxes with the same number of boxes and ordering.

    Returns:
        a scalar tensor: the IoU of each (true_i, pred_i) pair averaged over
        the batch.
    """
    # reshape(-1, 4) keeps the batch dimension even for a batch of one, where
    # squeeze() would collapse the boxes to a 1-D tensor.
    true_boxes = y_true.reshape(-1, 4)
    pred_boxes = y_pred.reshape(-1, 4)

    # Only matching pairs are needed, so the overlap is computed element-wise
    # instead of building the full pairwise matrix and reading its diagonal.
    top_left = torch.maximum(true_boxes[:, :2], pred_boxes[:, :2])
    bottom_right = torch.minimum(true_boxes[:, 2:], pred_boxes[:, 2:])
    overlap = (bottom_right - top_left).clamp(min=0)
    intersection = overlap[:, 0] * overlap[:, 1]

    area_true = (true_boxes[:, 2] - true_boxes[:, 0]) * (true_boxes[:, 3] - true_boxes[:, 1])
    area_pred = (pred_boxes[:, 2] - pred_boxes[:, 0]) * (pred_boxes[:, 3] - pred_boxes[:, 1])
    union = area_true + area_pred - intersection

    return (intersection / union).mean()

def accuracy(y_true: Tensor, y_pred: Tensor):
    """
    Fraction of samples whose highest-scoring class equals the true class.

    Arguments:
        y_true: integer class ids; singleton dimensions are squeezed away.
        y_pred: class scores with the classes on the last axis.

    Returns:
        a scalar float tensor in [0, 1].
    """
    predicted = y_pred.argmax(dim=-1)
    hits = (predicted == y_true.squeeze()).float()
    return hits.sum() / hits.numel()

## Loss function

In [None]:
def loss_fn(y_true, y_preds, alpha: float = 0.5):
    """
    Weighted sum of a classification loss and a box-regression loss.

    Arguments:
        y_true: mapping with 'class_id' (integer targets) and 'bbox' targets.
        y_preds: mapping with 'class_id' logits and 'bbox' predictions.
        alpha: weight of the regression term; the classification term is
            weighted by (1 - alpha).

    Returns:
        a dict with 'loss' (weighted total), 'reg_loss' (MSE on the boxes)
        and 'cls_loss' (cross entropy on the class logits).
    """
    # Classification: a trailing axis is added to the logits so their layout
    # lines up with the targets' extra singleton dimension.
    target_classes = y_true['class_id'].long()
    class_logits = y_preds['class_id'].float().unsqueeze(-1)
    cls_loss = F.cross_entropy(class_logits, target_classes)

    # Regression: singleton dimensions are dropped on both sides.
    target_boxes = y_true['bbox'].float().squeeze()
    predicted_boxes = y_preds['bbox'].float().squeeze()
    reg_loss = F.mse_loss(predicted_boxes, target_boxes)

    weighted = (1 - alpha) * cls_loss + alpha * reg_loss
    return {'loss': weighted, 'reg_loss': reg_loss, 'cls_loss': cls_loss}

## Callbacks

In [None]:
def printer(logs: ty.Dict[str, ty.Any], epoch: int, seconds:int):
    print(f'Epoch #: {epoch} in {seconds} seconds')
    for name, value in logs.items():
        if not name.endswith(f'{epoch}'):
            continue

        if type(value) in [float, int]:
            value = round(value, 4)
        elif type(value) is torch.Tensor:
            value = torch.round(value, decimals=4)

        print(f'\t{name} = {value}')
    print()

## Define model

In [None]:
def get_output_shape(
    model: nn.Sequential, image_dim: ty.Tuple[int, int, int], device: str = "cpu"
) -> ty.Tuple[int, int, int]:
    """
    Return the shape of the model's output for a random input.

    Arguments:
        model: module to probe; it is called as-is (its training/eval mode is
            not changed).
        image_dim: full shape of the random input tensor, including any
            batch dimension the model expects.
        device: device the random input is moved to before the call.

    Returns:
        the ``shape`` of the tensor returned by the model.
    """
    # Only the shape is needed, so skip building an autograd graph.
    with torch.no_grad():
        probe = torch.rand(*(image_dim)).to(device)
        return model(probe).data.shape


class Model(nn.Module):
    def __init__(
        self,
        backbone: nn.Module,
        input_shape: ty.Tuple[int, int, int] = (3, 255, 400),
        n_classes: int = 6,
        device: str = "cpu"
    ):
        """
        Model with one input (image) and two outputs:
            1. Object classification (classification).
            2. Bounding box prediction (regression).

        Arguments:
            backbone: feature extractor whose output feeds both heads.
            input_shape: input shape of the image in format (C, H, W)
            n_classes: number of classes to perform classification with
            device: device used for the probe pass that measures the
            backbone's output size.

        Attributes:
            backbone: network that processes the image; its output is fed
            directly to the linear heads.

            cls_head: MLP on top of the backbone features that predicts the
            classification logits (classification task).

            reg_head: MLP on top of the backbone features that predicts the
            four bounding box coordinates (regression task).
        """
        super().__init__()

        self.input_shape = input_shape

        # When doing transfer learning, this is a pretrained feature extractor.
        self.backbone = backbone

        # Probe the backbone with one random image to learn how many features
        # it emits; that number is the input width of both heads.
        probe_shape = get_output_shape(self.backbone, [1, *input_shape],device)
        n_features = reduce(lambda acc, dim: acc * dim, probe_shape)

        cls_layers = [
            nn.Linear(in_features=n_features, out_features=768),
            nn.ReLU(),
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, n_classes),
        ]
        self.cls_head = nn.Sequential(*cls_layers)

        reg_layers = [
            nn.Linear(in_features=n_features, out_features=768),
            nn.ReLU(),
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 4),
        ]
        self.reg_head = nn.Sequential(*reg_layers)

    def forward(self, x: Tensor) -> ty.Dict[str, Tensor]:
        """Run the backbone once and return both heads' outputs keyed by task."""
        features = self.backbone(x)
        cls_logits = self.cls_head(features)
        pred_bbox = self.reg_head(features)
        return {"bbox": pred_bbox, "class_id": cls_logits}

## Functions to train the model

In [None]:
def normalize_logs(
    logs: ty.Dict[str, ty.Any],
    eval_set: str,
    batch:int,
    epoch:int,
    metrics: ty.Dict[str, ty.Callable[[Tensor, Tensor], Tensor]],
    losses: ty.Optional[ty.Dict[str, Tensor]] = None
) -> ty.Dict[str, ty.Any]:
    """
         Turns the per-epoch running sums stored in the logs into per-batch averages.

         Parameters:
         - logs (ty.Dict[str, ty.Any]): Logs holding entries named '<eval_set>_<name>_<epoch>'.
         - eval_set (str): Name of the evaluation set whose entries are averaged (e.g. 'train' or 'val').
         - batch (int): Number of batches; every selected entry is divided by it.
         - epoch (int): Epoch number of the entries to average.
         - metrics (dict): Maps each task name to a list of (metric_name, metric) pairs; only the names are used.
         - losses (optional dict): Its keys name the loss entries to average. Defaults to None (no loss entries).

         Returns:
         - ty.Dict[str, ty.Any]: The same logs object, updated in place.
    """

    def entry(name: str) -> str:
        return f'{eval_set}_{name}_{epoch}'

    if losses is not None:
        for loss_name in losses:
            logs[entry(loss_name)] /= batch

    for task_metrics in metrics.values():
        for metric_name, _ in task_metrics:
            logs[entry(metric_name)] /= batch

    return logs

def evaluate(
    logs: ty.Dict[str, ty.Any],
    labels: ty.Dict[str, Tensor],
    preds: ty.Dict[str, Tensor],
    eval_set: str,
    metrics: ty.Dict[str, ty.Callable[[Tensor, Tensor], Tensor]],
    losses: ty.Optional[ty.Dict[str, Tensor]] = None,
    isFirstBatch: bool = True,
    epoch:int = 1
) -> ty.Dict[str, ty.Any]:
    """
        Accumulates the losses and metrics of one batch into the logs.

        Args:
            logs (Dict[str, Any]): Running logs; entries are named '<eval_set>_<name>_<epoch>'.
            labels (Dict[str, Tensor]): Ground truth per task. Every key must also be a key of ``metrics``.
            preds (Dict[str, Tensor]): Model predictions per task.
            eval_set (str): Name of the dataset being evaluated (e.g. 'train' or 'val').
            metrics (Dict): Maps each task name to a list of (metric_name, metric) pairs; each metric is
                called as ``metric(label, prediction)``.
            losses (Optional[Dict[str, Tensor]]): Loss values of the batch, keyed by loss name. None skips them.
            isFirstBatch (bool): When True the entries are (re)initialised with this batch's values,
                otherwise the values are added to the existing entries. Defaults to True.
            epoch (int): Current epoch number. Defaults to 1.

        Returns:
            Dict[str, Any]: The same logs object with the updated running sums. Tensor values are
            stored detached from the autograd graph.
    """

    def accumulate(name: str, value: ty.Any) -> None:
        key = f'{eval_set}_{name}_{epoch}'
        is_tensor = torch.is_tensor(value)
        if is_tensor:
            # Logs are for reporting only: drop the autograd graph so it is
            # not kept alive across the whole epoch.
            value = value.detach()
        if isFirstBatch:
            # Clone so later in-place edits of the log entry cannot touch the
            # tensor returned by the loss/metric.
            logs[key] = value.clone() if is_tensor else value
        else:
            # Out-of-place add: never mutate a tensor owned by the caller.
            logs[key] = logs[key] + value

    if losses is not None:
        for loss_name, loss_value in losses.items():
            accumulate(loss_name, loss_value)

    for task_name, label in labels.items():
        for metric_name, metric in metrics[task_name]:
            accumulate(metric_name, metric(label, preds[task_name]))

    return logs

def step(
    model: nn.Module,
    optimizer: Optimizer,
    batch: ty.Dict[str, Tensor],
    loss_fn: ty.Callable[..., ty.Dict[str, Tensor]],
    device: str,
    train: bool = False,
    alpha: float = 0.8,
) -> ty.Tuple[ty.Dict[str, Tensor], ty.Dict[str, Tensor]]:
    """
      Performs a single training or evaluation step on one batch.

      Parameters:
      - model (nn.Module): Model to run; called with the float image tensor.
      - optimizer (Optimizer): Optimizer of the model; only used when ``train`` is True.
      - batch (dict): Collated batch with an 'image' entry plus the label tensors. It is modified
        in place: 'image' is removed and the remaining tensors are moved to ``device``.
      - loss_fn (callable): Called as ``loss_fn(batch, preds, alpha)``; must return a dict with a 'loss' entry.
      - device (str): Device to run the model on (e.g. 'cuda' or 'cpu').
      - train (bool, optional): Whether to backpropagate and update the weights. Defaults to False.
      - alpha (float, optional): Task weight forwarded to ``loss_fn``. Defaults to 0.8.

      Returns:
      - losses (ty.Dict[str, Tensor]): The dictionary returned by ``loss_fn``.
      - preds (ty.Dict[str, Tensor]): The model's predictions for the batch.
    """

    if train:
        optimizer.zero_grad()

    # 'image' is popped on purpose: afterwards the batch holds only labels,
    # which is what the loss and the metric evaluation iterate over.
    img = batch.pop('image').to(device)

    for k in list(batch.keys()):
        batch[k] = batch[k].to(device)

    preds = model(img.float())
    losses = loss_fn(batch, preds, alpha)
    final_loss = losses['loss']

    if train:
        final_loss.backward()
        optimizer.step()

    return losses, preds


def train(
    model: nn.Module,
    optimizer: Optimizer,
    dataset: DataLoader,
    eval_datasets: ty.List[ty.Tuple[str, DataLoader]],
    loss_fn: ty.Callable[..., ty.Dict[str, Tensor]],
    metrics: ty.Dict[str, ty.List[ty.Tuple[str, ty.Callable[[Tensor, Tensor], Tensor]]]],
    callbacks: ty.List[ty.Callable[[ty.Dict[ty.Any, ty.Any], int, float], None]],
    device: str,
    epochs: int = 10,
    early_stopping_loss_val_patience: int = 1
) -> ty.Tuple[ty.Optional[nn.Module], int, ty.Dict[str, ty.Any]]:
    """
       Trains a PyTorch model with the specified settings and callbacks.

       Parameters:
       - model (nn.Module): PyTorch model to be trained.
       - optimizer (Optimizer): PyTorch optimizer for the model.
       - dataset (DataLoader): DataLoader for the training dataset.
       - eval_datasets (list): (name, DataLoader) pairs evaluated after every epoch. One of them must be
         named 'val': its 'val_loss_<epoch>' entry drives checkpointing and early stopping.
       - loss_fn (callable): Loss function, forwarded to ``step``.
       - metrics (dict): Maps each task name to a list of (metric_name, metric) pairs.
       - callbacks (list): Callables invoked after every epoch as ``callback(logs, epoch, seconds)``.
       - device (str): Device to run the model on (e.g. 'cuda' or 'cpu').
       - epochs (int, optional): Maximum number of epochs. Defaults to 10.
       - early_stopping_loss_val_patience (int, optional): Training stops once the validation loss has failed
         to improve for more than this many consecutive epochs. A negative value disables early stopping.
         Defaults to 1.

       Returns:
       - best_model: Deep copy of the model at the epoch with the lowest validation loss
         (None if no epoch improved on the initial infinite loss).
       - best_epoch (int): 1-based number of that epoch (0 if none).
       - logs (Dict[str, ty.Any]): Per-epoch averaged losses and metrics, keyed '<set>_<name>_<epoch>'.

       Side effects:
       - Every time the validation loss improves, the model is saved to 'best_model.pth'
         in the current working directory.
   """
    # Send model to device (GPU or CPU)
    model = model.to(device)
    logs = dict()
    # float("inf") instead of np.Inf, which was removed in NumPy 2.0.
    min_loss = float("inf")
    best_model = None
    best_epoch = 0
    num_epochs_patience = 0

    for epoch in range(epochs):
        epoch_number = epoch + 1
        start_time = time.time()

        # ---- training pass ----
        model.train()
        batch_count = 0
        losses = None
        for batch in dataset:
            losses, preds = step(model, optimizer, batch, loss_fn, device, train=True)
            logs = evaluate(logs, batch, preds, 'train', metrics, losses, batch_count == 0, epoch_number)
            batch_count += 1

        if batch_count:
            logs = normalize_logs(logs, 'train', batch_count, epoch_number, metrics, losses)

        # ---- evaluation pass ----
        model.eval()

        # Avoids calculating gradients in evaluation dataset.
        with torch.no_grad():
            # The loop variable must not be called `dataset`: rebinding that
            # name would make the next epoch train on the evaluation loader.
            for name, eval_loader in eval_datasets:
                # Counters are reset per evaluation set so each set is
                # accumulated and averaged on its own.
                batch_count = 0
                losses = None
                for batch in eval_loader:
                    losses, preds = step(model, optimizer, batch, loss_fn, device, train=False)
                    logs = evaluate(logs, batch, preds, name, metrics, losses, batch_count == 0, epoch_number)
                    batch_count += 1

                if batch_count:
                    logs = normalize_logs(logs, name, batch_count, epoch_number, metrics, losses)

        seconds = time.time() - start_time

        for callback in callbacks:
            callback(logs, epoch_number, seconds)

        # model checkpoint
        val_loss = logs[f'val_loss_{epoch_number}']
        if val_loss < min_loss:
            best_model = copy.deepcopy(model)
            best_epoch = epoch_number
            torch.save(model, "best_model.pth")
            min_loss = val_loss
            num_epochs_patience = 0
        else:
            num_epochs_patience += 1

        if early_stopping_loss_val_patience > -1 and num_epochs_patience > early_stopping_loss_val_patience:
            print(f"----- patience for early stopping {early_stopping_loss_val_patience} exceeded")
            break

    return best_model, best_epoch, logs

# Split train and validation

In [None]:
# Re-seed the random generators right before the train/validation split.
reset_seed()

In [None]:
# Load the annotations and hold out 25% of the rows for validation,
# stratified by class id so both splits keep the class proportions.
df = read_train_csv()
train_df, val_df = train_test_split(
    df, stratify=df["class_id"], test_size=0.25, random_state=42
)
print(f'training set shape: {train_df.shape}')
print(f'validation set shape: {val_df.shape}')

# Define transforms

In [None]:
# Compute the normalisation statistics on the training split only.
root_dir = osp.join(config["DATA_DIR"], config["IMG_DIR"])
# No transform and no resize here: the statistics come from the images as read from disk.
train_ds = militarDataset(train_df, root_dir)
means, stds = get_channels_std(train_ds)
# ToTensor + Normalizer, shared by the train and eval pipelines below.
common_transforms_I = common_transforms(means,stds)

In [None]:
# Box format handed to albumentations. The 'albumentations' format is
# presumably normalised (xmin, ymin, xmax, ymax), matching the normalisation
# done in read_train_csv - confirm against the albumentations docs.
# No label fields are declared because class ids are not passed to the transforms.
bbox_params = A.BboxParams(format="albumentations", label_fields=[])

In [None]:
# Augmentation pipeline for the training split.
# NOTE(review): HorizontalFlip, Defocus and Downscale use p=1, so they are
# applied to every training image while validation images are left
# untouched; every transform with p=0 is effectively disabled. Confirm this
# is the intended experiment setup.
train_data_augmentations = A.Compose([
    A.HorizontalFlip(p=1),
    A.Rotate(limit=45, p=0.5),
    A.AutoContrast(p=0),
    A.Defocus(p=1),
    A.Downscale(p=1),
    A.GaussNoise(p=0),
    A.GaussianBlur(p=0),
    A.HueSaturationValue(p=0),
    A.ISONoise(p=0),
    A.PlanckianJitter(p=0),
    A.PlasmaShadow(p=0),
    A.Posterize(p=0),
    A.RandomFog(p=0),
    A.RandomSnow(p=0),
    A.RandomSunFlare(p=0),
    A.SaltAndPepper(p=0),
    A.Sharpen(p=0),
    A.ZoomBlur(p=0)
    ],
    bbox_params=bbox_params
)

# Training: augmentations first, then tensor conversion and normalisation.
train_transforms = torchvision.transforms.Compose(
    [
        AlbumentationsWrapper(train_data_augmentations),
    ] + common_transforms_I
)

# Evaluation: tensor conversion and normalisation only.
eval_transforms = torchvision.transforms.Compose(common_transforms_I)

# Transfer Learning

In [None]:
class FeatureExtractor(nn.Module):
    """
    Backbone built from a pretrained classifier's convolutional part.

    The wrapped model must expose ``features`` (an iterable of layers) and
    ``avgpool``, as the VGG-16 model used in this notebook does. The forward
    pass returns one flattened feature vector per image, with dropout applied.
    """

    def __init__(self, model):
        super().__init__()
        # Feature layers of the source model, re-wrapped in a fresh Sequential.
        self.features = nn.Sequential(*model.features)
        # Average pooling layer of the source model.
        self.pooling = model.avgpool
        # Collapse the pooled maps into a one-dimensional vector per image.
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # features -> pooling -> flatten -> dropout
        pooled = self.pooling(self.features(x))
        return self.dropout(self.flatten(pooled))

## Classification models

The following classification models are available, with or without pre-trained weights:

* AlexNet
* ConvNeXt
* DenseNet
* EfficientNet
* EfficientNetV2
* GoogLeNet
* Inception V3
* MaxVit
* MNASNet
* MobileNet V2
* MobileNet V3
* RegNet
* ResNet
* ResNeXt
* ShuffleNet V2
* SqueezeNet
* SwinTransformer
* VGG
* VisionTransformer
* Wide ResNet

### EfficientNetV2

In [None]:
# from torchvision.models import efficientnet_v2_l, EfficientNet_V2_L_Weights

# Load the efficient net v2 model
# env2_model = efficientnet_v2_l(weights=EfficientNet_V2_L_Weights.DEFAULT)
# pretrained_model = FeatureExtractor(env2_model).to(device)
# pretrained_model

### vgg16

In [None]:
from torchvision.models import vgg16, VGG16_Weights


# Load the vgg16 model with its default pretrained weights and keep only its
# feature part (convolutions + average pooling) as the backbone.
# NOTE(review): nothing in the visible code freezes the backbone parameters,
# so they are updated together with the heads during training.
vgg16_model = vgg16(weights=VGG16_Weights.DEFAULT, progress=True)
pretrained_model = FeatureExtractor(vgg16_model).to(device)
# Notebook display of the module structure.
pretrained_model

## Object detection

* Faster R-CNN
* FCOS
* RetinaNet
* SSD
* SSDlite

### RetinaNet

In [None]:
#from torchvision.models.detection import retinanet_resnet50_fpn_v2, RetinaNet_ResNet50_FPN_V2_Weights

#retina = retinanet_resnet50_fpn_v2(weights=RetinaNet_ResNet50_FPN_V2_Weights.DEFAULT, progress=True)
#pretrained_model = FeatureExtractor(retina).to(device)
#pretrained_model

# Initialize model

In [None]:
# Notebook display: shape of the first training image (train_ds has no
# transform or resize at this point).
train_ds[0]['image'].shape

In [None]:
# Hparams
batch_size = 32
lr = 0.001
w = 256
h = 256
c = 3

# Data
train_ds = militarDataset(train_df, root_dir=root_dir, transform=train_transforms,output_size=(w,h))
val_ds = militarDataset(val_df, root_dir=root_dir, transform=eval_transforms,output_size=(w,h))

train_data = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=cpu_count())
val_data = DataLoader(val_ds, batch_size=batch_size, num_workers=cpu_count())

# Model
model = Model(pretrained_model,(c,h,w),device=device).to(device)
summary(model, model.input_shape)

# Optimizer
optimizer = torch.optim.Adam(lr=lr, params=model.parameters())

# Training

In [None]:
# Train for up to 50 epochs. `model` is rebound to the best checkpoint
# returned by train(), not to the weights of the last epoch.
model, best_epoch,logs = train(
    model,
    optimizer,
    train_data,
    # The evaluation set must be named 'val': train() reads 'val_loss_<epoch>'.
    eval_datasets=[('val', val_data)],
    loss_fn=loss_fn,
    # Metrics per task: IoU for the boxes, accuracy for the class ids.
    metrics={
        'bbox': [('iou', iou)],
        'class_id': [('accuracy', accuracy)]
    },
    callbacks=[printer],
    device=device,
    epochs=50,
    early_stopping_loss_val_patience=5
)

# Plot epochs

In [None]:
def _log_value_to_numpy(value: ty.Any) -> ty.Any:
    """Convert a log value to something matplotlib can plot (tensors -> numpy)."""
    if torch.is_tensor(value):
        return value.detach().cpu().numpy()
    return value


def plot_losses_and_accuracies(logs: ty.Dict[str, ty.Any]):
    """
    Plot the train/val loss, accuracy and IoU curves stored in the logs.

    Arguments:
        logs: training logs keyed '<set>_<name>_<epoch>', e.g. 'train_loss_3'
            or 'val_iou_1'. Only the sets 'train' and 'val' and the names
            'loss', 'accuracy' and 'iou' are plotted; other entries (such as
            'train_reg_loss_3') are ignored.

    Each value is paired with the epoch number parsed from its own key, so
    the result does not depend on the insertion order of the dictionary.
    Values may be tensors (on any device, with or without grad) or numbers.
    """
    panels = [("Loss", "loss"), ("Accuracy", "accuracy"), ("IoU", "iou")]
    splits = [("train", "Train"), ("val", "Val")]

    # series name -> {epoch -> value}
    series = {
        f"{split}_{metric}": {}
        for _, metric in panels
        for split, _ in splits
    }

    # Extract the data from the dictionary
    for key, value in logs.items():
        name, _, epoch = key.rpartition('_')
        if name in series and epoch.isdigit():
            series[name][int(epoch)] = _log_value_to_numpy(value)

    # Create the figure: one subplot per metric on a 2 x 2 grid
    plt.figure(figsize=(12, 8))

    for position, (title, metric) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        for split, split_label in splits:
            history = series[f"{split}_{metric}"]
            epochs = sorted(history)
            plt.plot(
                epochs,
                [history[epoch] for epoch in epochs],
                marker='o',
                label=f'{split_label} {title}',
            )
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(title)
        plt.legend()
        plt.grid()

    # Adjust the layout
    plt.tight_layout()

    # Show the figure
    plt.show()

In [None]:
# Report which epoch produced the returned model and plot the training curves.
print(f'best model found on epoch {best_epoch}')
plot_losses_and_accuracies(logs)

# Save

In [None]:
# Save the whole model object (not just its state_dict) to the working directory.
torch.save(model, "pretrained_model.pth")

# Submission

In [None]:
# Perform inference on the GPU when one is available, otherwise on the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
# Make sure dropout is disabled while predicting
model.eval()

test_root_dir = osp.join(config['DATA_DIR'], config['IMG_DIR'])
test_df = pd.read_csv(osp.join(config['DATA_DIR'], "test.csv"))

# Unlabeled dataset: samples only carry the image, preprocessed like the validation data
test_ds = militarDataset(test_df, root_dir=test_root_dir, labeled=False, transform=eval_transforms,output_size=(w,h))
# batch_size=1 and no shuffling keep the predictions aligned with test_df rows
test_data = DataLoader(test_ds, batch_size=1, num_workers=cpu_count(), shuffle=False)

class_preds = []
bbox_preds = []

# No gradients are needed for inference; this avoids keeping autograd graphs in memory
with torch.no_grad():
    for batch in test_data:
        batch_preds = model(batch['image'].float().to(device))

        class_pred = batch_preds['class_id'].argmax(-1).detach().cpu().numpy()
        bbox_pred = batch_preds['bbox'].detach().cpu().numpy()

        # Drop the batch dimension: one class id and one 4-value box per image
        class_preds.append(class_pred.squeeze())
        bbox_preds.append(bbox_pred.squeeze())

In [None]:
# Stack the per-image predictions into arrays (one row per test image).
class_preds = np.array(class_preds)
bbox_preds = np.array(bbox_preds)

In [None]:
# Start the submission table, indexed by test image filename.
submission = pd.DataFrame(
    index=test_df.filename,
    data={
        'class_id': class_preds,
        }
)

In [None]:
# Scale the predicted boxes back by the same width/height used to normalise
# the training boxes (bbox column order: xmin, ymin, xmax, ymax).
submission["xmin"] = bbox_preds[:, 0]*config['w_real']
submission["ymin"] = bbox_preds[:, 1]*config['h_real']
submission["xmax"] = bbox_preds[:, 2]*config['w_real']
submission["ymax"] = bbox_preds[:, 3]*config['h_real']
# Translate the numeric class ids back to class names.
submission['class']=submission['class_id'].replace(config['id2obj'])

In [None]:
# Notebook display: how many test images were assigned to each class.
submission['class'].value_counts()

In [None]:
# Write the submission file; the filename index is written as the first column.
submission.to_csv('submission.csv')