<a href="https://colab.research.google.com/github/cam2149/MachineLearningIV/blob/main/Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
#kagglehub.login()
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [2]:
# prompt: Eliminar todo el contenido de la carpeta /content/kaggle

!rm -rf /content/kaggle


In [3]:
!kaggle competitions download -c aa-iv-2025-i-object-localization
print('Data source import complete.')

Downloading aa-iv-2025-i-object-localization.zip to /content
 24% 7.00M/29.1M [00:00<00:00, 71.1MB/s]
100% 29.1M/29.1M [00:00<00:00, 172MB/s] 
Data source import complete.


In [4]:
# prompt: Descomprimir el archivo unzip aa-iv-2025-i-object-localization.zip dentro de una carperta llamada kaggle

!mkdir kaggle
!mkdir kaggle/working
!unzip aa-iv-2025-i-object-localization.zip -d kaggle


Archive:  aa-iv-2025-i-object-localization.zip
  inflating: kaggle/images/images/image_00001.jpeg  
  inflating: kaggle/images/images/image_00002.jpeg  
  inflating: kaggle/images/images/image_00003.jpeg  
  inflating: kaggle/images/images/image_00004.jpeg  
  inflating: kaggle/images/images/image_00005.jpeg  
  inflating: kaggle/images/images/image_00007.jpeg  
  inflating: kaggle/images/images/image_00011.jpeg  
  inflating: kaggle/images/images/image_00012.jpeg  
  inflating: kaggle/images/images/image_00013.jpeg  
  inflating: kaggle/images/images/image_00014.jpeg  
  inflating: kaggle/images/images/image_00015.jpeg  
  inflating: kaggle/images/images/image_00016.jpeg  
  inflating: kaggle/images/images/image_00017.jpeg  
  inflating: kaggle/images/images/image_00018.jpeg  
  inflating: kaggle/images/images/image_00019.jpeg  
  inflating: kaggle/images/images/image_00020.jpeg  
  inflating: kaggle/images/images/image_00021.jpeg  
  inflating: kaggle/images/images/image_00022.jpeg  

In [5]:
!pip install torchsummary
!pip install -U albumentations



# Imports

In [6]:
import numpy as np
import pandas as pd

import torch
from torch import nn
from torch import Tensor
from torch.optim import Optimizer
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, tqdm_notebook
tqdm_notebook().pandas()
from sklearn.model_selection import train_test_split
from multiprocessing import cpu_count

import typing as ty
from numpy.typing import NDArray
import os
import os.path as osp

import albumentations as A
import torchvision
from skimage import io, transform
import cv2
from PIL import Image

from functools import reduce
from torchsummary import summary

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tqdm_notebook().pandas()


0it [00:00, ?it/s]

# Config

In [7]:
config = {
    "DATA_DIR": "/content/kaggle/",
    "WORK_DIR": "/content/kaggle/working",
    "IMG_DIR": "images/images",
    "TRAIN_CSV": "train.csv",
    "obj2id": {"f16": 0, "cougar": 1, "chinook": 2, "ah64": 3, "f15": 4, "seahawk": 5},
    "id2obj": {0: "f16", 1: "cougar", 2: "chinook", 3: "ah64", 4: "f15", 5: "seahawk"},
    "h_real": 720,
    "w_real": 1280,
    "channel": 3,
    "w_resize": 234,
    "h_resize": 416,
    "grayscale": False,
}

In [8]:
torch.manual_seed(32)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device}')
test = torch.ones((100, 100)).to(device)
del test
torch.cuda.empty_cache()

Using cuda


In [9]:
# Random seed

In [10]:
def reset_seed():
    random_seed = 42
    torch.backends.cudnn.enabled = True
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(42)
        torch.cuda.manual_seed_all(42)

        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

reset_seed()

# Function to read CSV

In [11]:
def read_train_csv():
    """
    Read the train csv file and return the dataframe with the necessary columns.

    Args:
    config: Config object

    Returns:
    df: Dataframe with the necessary columns

    usage:
    df = read_train_csv(config)
    """
    df = pd.read_csv(osp.join(config["DATA_DIR"], config["TRAIN_CSV"]))
    df["class_id"] = df["class"].map(config["obj2id"])
    columns_f = ["filename", "xmin", "ymin", "xmax", "ymax", "class_id"]
    df = df[columns_f]
    df[["ymin", "ymax"]] = df[["ymin", "ymax"]].div(config["h_real"], axis=0)
    df[["xmin", "xmax"]] = df[["xmin", "xmax"]].div(config["w_real"], axis=0)
    return df

# Function to get the mean and standard deviation of the channels

In [12]:
def get_channels_std(ds):
    """
    Get the standard deviation and mean of each channel in the data.
    """
    means = np.zeros(3)
    stds = np.zeros(3)
    n_images = 0

    for x in ds:
        img = x["image"].astype(
            np.float32
        )  # Asegúrate de que la imagen está en float para cálculos precisos
        n_images += 1

        for channel in range(3):
            channel_pixels = img[..., channel]
            # Acumular la suma y suma de cuadrados para calcular la media y desviación estándar
            means[channel] += np.mean(channel_pixels)
            stds[channel] += np.std(channel_pixels)

    # Calcular la media y desviación estándar final
    means /= n_images
    stds /= n_images

    return means, stds

# Transforms Functions

In [13]:
class ToTensor(object):
    """
    Convert ndarrays in sample to Tensors for pytorch.

    Arguments:
        sample: a dictionary containing:
            image: sample image in format (H, W, C)
    Returns:
        the image in (C, H, W) format.
    """

    def __call__(self, sample):
        image = sample["image"]

        # swap color axis because
        # numpy image: H x W x C (0,1,2)
        # torch image: C x H x W
        image = image.transpose((2, 0, 1))
        image = torch.from_numpy(image).float()
        sample.update({"image": image})
        return sample


class Normalizer(object):
    """
    Normalize the image by subtracting the mean and dividing by the standard deviation.
    """

    def __init__(self, stds, means):
        """
        Arguments:

            stds: array of length 3 containing the standard deviation of each channel in RGB order.
            means: array of length 3 containing the means of each channel in RGB order.
        """
        self.stds = stds
        self.means = means

    def __call__(self, sample):
        """
        Sample: a dicitonary containing:
            image: sample image in format (C, H, W)
        Returns:
            the image in (C, H, W) format with the channels normalized.
        """
        image = sample["image"]

        for channel in range(3):
            image[channel] = (image[channel] - self.means[channel]) / self.stds[channel]

        sample["image"] = image
        return sample


class AlbumentationsWrapper(object):
    """
    Albumentations Wrapper

    Arguments:
        transform: an albumentations transform receiving an image and bounding boxes.

    Returns:
        the image transformed by the transform object.
    """

    def __init__(self, transform):
        self.transform = transform

    def __call__(self, sample):
        transformed = self.transform(
            image=sample["image"],
            bboxes=sample["bbox"],
            # category_ids=sample['class_id']
        )
        sample["image"] = transformed["image"]
        sample["bbox"] = np.array(transformed["bboxes"])
        return sample


def common_transforms(means, stds):
    """
    Common transformations for the image.
    Arguments:
        means: array of length 3 containing the means of each channel in RGB order.
        stds: array of length 3 containing the standard deviation of each channel in RGB order.
    Returns:
        a list of transformations.
    """
    return [
        ToTensor(),
        Normalizer(
            means=means,
            stds=stds,
        ),
    ]


class TransformComposed:
    """
    Compose a list of transformations.

    Arguments:
        means: array of length 3 containing the means of each channel in RGB order.
        stds: array of length 3 containing the standard deviation of each channel in RGB order.

    Returns:
        a composed transformation.
    """

    def __init__(self, means, stds):
        self.means = means
        self.stds = stds

    def getTransform(self, transforms=[]):
        return torchvision.transforms.Compose(
            [AlbumentationsWrapper(t) for t in transforms]
            + common_transforms(self.means, self.stds)
        )

# Required functions to train a pytorch model

## Class to load dataset

In [14]:
transform_func_inp_signature = ty.Dict[str, NDArray[np.float_]]
transform_func_signature = ty.Callable[
    [transform_func_inp_signature],
    transform_func_inp_signature
]

class militarDataset(Dataset):
    """
    Location image dataset
    """
    def __init__(
        self,
        df: pd.DataFrame,
        root_dir: str,
        labeled: bool = True,
        transform: ty.Optional[ty.List[transform_func_signature]] = None,
        output_size: ty.Optional[tuple] = None  # Añadir parámetro para tamaño de salida
    ) -> None:
        self.df = df
        self.root_dir = root_dir
        self.transform = transform
        self.labeled = labeled
        self.output_size = output_size  # Almacenar el tamaño de salida

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx: int) -> transform_func_signature:
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Read image
        img_name = os.path.join(self.root_dir, self.df.filename.iloc[idx])
        #img_name = os.path.join(self.root_dir, self.df.iloc[idx]['filename'])
        image = io.imread(img_name)
        #image = cv2.imread(img_name)


        #print(f"Dimensiones originales de la imagen: {image.shape}")  # Agregar para depuración
        if image is None:
            raise FileNotFoundError(f"Image not found: {img_name}")

        if image.ndim == 2:  # Si la imagen está en escala de grises
            image = cv2.cvtColor(image,cv2.COLOR_BGR2RGB)  # Convertir a RGB
        elif image.shape[2] == 4:  # Si la imagen es RGBA
            image = image[:, :, :3]

        # Redimensionar la imagen si se especifica un tamaño de salida
        if self.output_size:
            image = cv2.resize(image, self.output_size)  # Redimensionar la imagen

        sample = {'image': image}

        if self.labeled:
            # Read labels
            img_class = self.df.class_id.iloc[idx]
            img_bbox = self.df.iloc[idx, 1:5]

            img_bbox = np.array([img_bbox]).astype('float')
            img_class = np.array([img_class]).astype('int')
            sample.update({'bbox': img_bbox, 'class_id': img_class})

        if self.transform:
            sample = self.transform(sample)

        return sample

## Metrics

In [15]:
def iou(y_true: Tensor, y_pred: Tensor):
    pairwise_iou = torchvision.ops.box_iou(y_true.squeeze(), y_pred.squeeze())
    result = torch.trace(pairwise_iou) / pairwise_iou.size()[0]
    return result

def accuracy(y_true: Tensor, y_pred: Tensor):
    pred = torch.argmax(y_pred, axis=-1)
    y_true = y_true.squeeze()
    correct = torch.eq(pred, y_true).float()
    total = torch.ones_like(correct)
    result = torch.divide(torch.sum(correct), torch.sum(total))
    return result

## Loss function

In [16]:
def loss_fn(y_true, y_preds, alpha: float = 0.5):
    cls_y_true, cls_y_pred = y_true['class_id'].long(), y_preds['class_id'].float().unsqueeze(-1)
    reg_y_true, reg_y_pred = y_true['bbox'].float().squeeze(), y_preds['bbox'].float().squeeze()

    cls_loss = F.cross_entropy(cls_y_pred, cls_y_true)

    reg_loss = F.mse_loss(reg_y_pred, reg_y_true)
    # Adds weights to both tasks
    total_loss = (1 - alpha) * cls_loss + alpha * reg_loss
    return dict(loss=total_loss, reg_loss=reg_loss,cls_loss=cls_loss)

## callbacks

In [17]:
def printer(logs: ty.Dict[str, ty.Any]):
    # print every 10 steps
    if logs['iters'] % 10 != 0:
        return
    print('Iteration #: ',logs['iters'])
    for name, value in logs.items():
        if name == 'iters':
            continue

        if type(value) in [float, int]:
            value = round(value, 4)
        elif type(value) is torch.Tensor:
            value = torch.round(value, decimals=4)

        print(f'\t{name} = {value}')
    print()

## Define model

In [18]:
def get_output_shape(
    model: nn.Sequential, image_dim: ty.Tuple[int, int, int], device: str = "cpu"
) -> ty.Tuple[int, int, int]:
    return model(torch.rand(*(image_dim)).to(device)).data.shape


class Model(nn.Module):
    def __init__(
        self,
        backbone: nn.Module,
        input_shape: ty.Tuple[int, int, int] = (3, 255, 400),
        n_classes: int = 6,
        device: str = "cpu"
    ):
        """
        Model with one input (image) and two outputs:
            1. Digit classification (classification).
            2. Bounding box prediction (regression).

        Arguments:
            input_shape: input shape of the image in format (C, H, W)
            n_classes: number of classes to perfrom classification with
            backbone: Initial model to extract features from the image and pass to clasification and regresion heads.

        Attributes:
            backbone: ConvNet that process the image and
            returns a flattened vector with the information of the
            activations.

            cls_head: MLP that receives the flattened input from the backbone
            and predicts the classification logits for the classes (classficiation task).

            reg_head: MLP that receives the flattened input from the backbone
            and predicts the coordinates of the predicted bounding box (regression task).
        """
        super().__init__()

        self.input_shape = input_shape

        # When doing transfer learning, use pretrained model instead of custom backbone
        self.backbone = backbone

        backbone_output_shape = get_output_shape(self.backbone, [1, *input_shape],device)
        backbone_output_features = reduce(lambda x, y: x * y, backbone_output_shape)

        self.cls_head = nn.Sequential(
            nn.Linear(in_features=backbone_output_features, out_features=768),
            nn.ReLU(),
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, n_classes),
        )

        self.reg_head = nn.Sequential(
            nn.Linear(in_features=backbone_output_features, out_features=768),
            nn.ReLU(),
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 4),
        )

    def forward(self, x: Tensor) -> ty.Dict[str, Tensor]:
        features = self.backbone(x)
        cls_logits = self.cls_head(features)
        pred_bbox = self.reg_head(features)
        predictions = {"bbox": pred_bbox, "class_id": cls_logits}
        return predictions

## Functions to train the model

In [19]:
def evaluate(
    logs: ty.Dict[str, ty.Any],
    labels: ty.Dict[str, Tensor],
    preds: ty.Dict[str, Tensor],
    eval_set: str,
    metrics: ty.Dict[str, ty.Callable[[Tensor, Tensor], Tensor]],
    losses: ty.Optional[ty.Dict[str, Tensor]] = None,
) -> ty.Dict[str, ty.Any]:

    if losses is not None:
        for loss_name, loss_value in losses.items():
            logs[f'{eval_set}_{loss_name}'] = loss_value

    for task_name, label in labels.items():
        for metric_name, metric in metrics[task_name]:
            value = metric(label, preds[task_name])
            logs[f'{eval_set}_{metric_name}'] = value

    return logs

def step(
    model: Model,
    optimizer: Optimizer,
    batch: militarDataset,
    loss_fn: ty.Callable[[ty.Dict[str, torch.Tensor]], torch.Tensor],
    device: str,
    train: bool = False,
) -> ty.Tuple[ty.Dict[str, Tensor], ty.Dict[str, Tensor]]:

    if train:
        optimizer.zero_grad()

    #img = batch['image'].to(device)
    img = batch.pop('image').to(device)

    for k in list(batch.keys()):
        batch[k] = batch[k].to(device)

    preds = model(img.float())
    losses = loss_fn(batch, preds)
    final_loss = losses['loss']

    if train:
        final_loss.backward()
        optimizer.step()

    return losses, preds


def train(
    model: Model,
    optimizer: Optimizer,
    dataset: DataLoader,
    eval_datasets: ty.List[ty.Tuple[str, DataLoader]],
    loss_fn: ty.Callable[[ty.Dict[str, torch.Tensor]], torch.Tensor],
    metrics: ty.Dict[str, ty.Callable[[Tensor, Tensor], Tensor]],
    callbacks: ty.List[ty.Callable[[ty.Dict[ty.Any, ty.Any]], None]],
    device: str,
    train_steps: 100,
    eval_steps: 10,
) -> Model:
    # Send model to device (GPU or CPU)
    model = model.to(device)
    iters = 0
    iterator = iter(dataset)
    assert train_steps > eval_steps, 'Train steps should be greater than the eval steps'

    while iters <= train_steps:
        logs = dict()
        logs['iters'] = iters
        try:
            batch = next(iterator)
        except StopIteration:
            iterator = iter(dataset)
            batch = next(iterator)
        # Send batch to device
        losses, preds = step(model, optimizer, batch, loss_fn, device, train=True)
        logs = evaluate(logs, batch, preds, 'train', metrics, losses)

        # Eval every eval_steps iterations
        if iters % eval_steps == 0:
            # Evaluate
            # Deactives layers that only needed to train
            # https://discuss.pytorch.org/t/model-eval-vs-with-torch-no-grad/19615
            model.eval()

            # Avoids calculating gradients in evaluation dataset.
            with torch.no_grad():

                for name, dataset in eval_datasets:

                    for batch in dataset:
                        losses, preds = step(model, optimizer, batch, loss_fn, device, train=False)
                        logs = evaluate(logs, batch, preds, name, metrics, losses)

        for callback in callbacks:
            callback(logs)

        iters += 1

    return model

# Split train and validation

In [20]:
reset_seed()

In [21]:
df = read_train_csv()
train_df, val_df = train_test_split(
    df, stratify=df["class_id"], test_size=0.25, random_state=42
)
print(f'training set shape: {train_df.shape}')
print(f'validation set shape: {val_df.shape}')

training set shape: (141, 6)
validation set shape: (48, 6)


In [22]:
#Se crean cuatro listas vacías para almacenar:
list_image = list(df.filename)

#Este bucle itera sobre cada nombre de archivo en la lista list_image. tqdm es una librería que muestra una barra de progreso para visualizar el avance del bucle

for i in tqdm(list_image): ## tqdm(list_image)dura 40 segundos
    ruta_imagen = osp.join(config["DATA_DIR"], config["IMG_DIR"], i)
    imagen = io.imread(ruta_imagen)
    shapes = imagen.shape
    dimen = imagen.ndim
    imagen = Image.open(ruta_imagen)
    w, h = imagen.size
    imagen = imagen.resize((w, h), Image.Resampling.LANCZOS)
    imagen.save(ruta_imagen)




100%|██████████| 189/189 [00:08<00:00, 23.46it/s]


# Define transforms

In [23]:
root_dir = osp.join(config["DATA_DIR"], config["IMG_DIR"])
train_ds = militarDataset(train_df, root_dir)
means, stds = get_channels_std(train_ds)
common_transforms_I = common_transforms(means,stds)

In [24]:
bbox_params = A.BboxParams(format="albumentations", label_fields=[])

In [25]:
train_data_augmentations = A.Compose([
    A.HorizontalFlip(p=0),
    A.Rotate(limit=45, p=0),
    A.AutoContrast(p=0),
    A.Defocus(p=0),
    A.Downscale(p=0),
    A.GaussNoise(p=0),
    A.GaussianBlur(p=0),
    A.HueSaturationValue(p=0),
    A.ISONoise(p=0),
    A.PlanckianJitter(p=0),
    A.PlasmaShadow(p=0.5),
    A.Posterize(p=0),
    A.RandomFog(p=0),
    A.RandomSnow(p=0),
    A.RandomSunFlare(p=0),
    A.SaltAndPepper(p=0),
    A.Sharpen(p=0),
    A.ZoomBlur(p=0)
    ],
    bbox_params=bbox_params
)

train_transforms = torchvision.transforms.Compose(
    [
        AlbumentationsWrapper(train_data_augmentations),
    ] + common_transforms_I
)

eval_transforms = torchvision.transforms.Compose(common_transforms_I)

# Transfer Learning

In [26]:
class FeatureExtractor(nn.Module):
    def __init__(self, model):
        super(FeatureExtractor, self).__init__()
        # Extract VGG-16 Feature Layers
        self.features = list(model.features)
        self.features = nn.Sequential(*self.features)
        # Extract VGG-16 Average Pooling Layer
        self.pooling = model.avgpool
        # Convert the image into one-dimensional vector
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        # It will take the input 'x' until it returns the feature vector called 'out'
        out = self.features(x)
        out = self.pooling(out)
        out = self.flatten(out)
        out = self.dropout(out)
        return out

## vgg16

In [27]:
from torchvision.models import vgg16, VGG16_Weights


# Load the vgg16 model
vgg16_model = vgg16(weights=VGG16_Weights.DEFAULT, progress=True)
pretrained_model = FeatureExtractor(vgg16_model).to(device)
pretrained_model

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /root/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100%|██████████| 528M/528M [00:07<00:00, 74.1MB/s]


FeatureExtractor(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=

## EfficientNetV2

In [28]:
# from torchvision.models import efficientnet_v2_l, EfficientNet_V2_L_Weights

# Load the efficient net v2 model
# env2_model = efficientnet_v2_l(weights=EfficientNet_V2_L_Weights.DEFAULT)
# pretrained_model = FeatureExtractor(env2_model).to(device)
# pretrained_model

# Initialize model

In [29]:
train_ds[0]['image'].shape

(720, 1280, 3)

In [30]:
# Hparams
batch_size = 32
lr = 0.001
w = 256
h = 455
c = 3

# Data
train_ds = militarDataset(train_df, root_dir=root_dir, transform=train_transforms,output_size=(w,h))
val_ds = militarDataset(val_df, root_dir=root_dir, transform=eval_transforms,output_size=(w,h))

train_data = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=cpu_count())
val_data = DataLoader(val_ds, batch_size=batch_size, num_workers=cpu_count())

# Model
model = Model(pretrained_model,(c,h,w),device=device).to(device)
summary(model, model.input_shape)

# Optimizer
optimizer = torch.optim.Adam(lr=lr, params=model.parameters())

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 455, 256]           1,792
              ReLU-2         [-1, 64, 455, 256]               0
            Conv2d-3         [-1, 64, 455, 256]          36,928
              ReLU-4         [-1, 64, 455, 256]               0
         MaxPool2d-5         [-1, 64, 227, 128]               0
            Conv2d-6        [-1, 128, 227, 128]          73,856
              ReLU-7        [-1, 128, 227, 128]               0
            Conv2d-8        [-1, 128, 227, 128]         147,584
              ReLU-9        [-1, 128, 227, 128]               0
        MaxPool2d-10         [-1, 128, 113, 64]               0
           Conv2d-11         [-1, 256, 113, 64]         295,168
             ReLU-12         [-1, 256, 113, 64]               0
           Conv2d-13         [-1, 256, 113, 64]         590,080
             ReLU-14         [-1, 256, 

# Training

In [31]:
model = train(
    model,
    optimizer,
    train_data,
    eval_datasets=[('val', val_data)],
    loss_fn=loss_fn,
    metrics={
        'bbox': [('iou', iou)],
        'class_id': [('accuracy', accuracy)]
    },
    callbacks=[printer],
    device=device,
    train_steps=130,
    eval_steps=10
)

Iteration #:  0
	train_loss = 1.0384999513626099
	train_reg_loss = 0.265500009059906
	train_cls_loss = 1.811400055885315
	train_iou = 0.0001
	train_accuracy = 0.09380000084638596
	val_loss = 1.0160000324249268
	val_reg_loss = 0.09350000321865082
	val_cls_loss = 1.938599944114685
	val_iou = 0.1789
	val_accuracy = 0.125

Iteration #:  10
	train_loss = 0.9714999794960022
	train_reg_loss = 0.17890000343322754
	train_cls_loss = 1.7640999555587769
	train_iou = 0.0003
	train_accuracy = 0.1875
	val_loss = 0.9976000189781189
	val_reg_loss = 0.21119999885559082
	val_cls_loss = 1.784000039100647
	val_iou = 0.0
	val_accuracy = 0.1875

Iteration #:  20
	train_loss = 0.9157000184059143
	train_reg_loss = 0.4153999984264374
	train_cls_loss = 1.4158999919891357
	train_iou = 0.0313
	train_accuracy = 0.5
	val_loss = 0.9010999798774719
	val_reg_loss = 0.13449999690055847
	val_cls_loss = 1.6677000522613525
	val_iou = 0.0074
	val_accuracy = 0.5

Iteration #:  30
	train_loss = 0.3720000088214874
	train_reg_l

# Save

In [32]:
torch.save(model, "pretrained_model.pth")

# Submission

In [33]:
# Perform inference on cpu in order to avoid memory problems
device = 'cuda'
model = model.to(device)

test_root_dir = osp.join(config['DATA_DIR'], "images/images")
test_df = pd.read_csv(osp.join(config['DATA_DIR'], "test.csv"))

test_ds = militarDataset(test_df, root_dir=test_root_dir, labeled=False, transform=eval_transforms,output_size=(w,h))#
test_data = DataLoader(test_ds, batch_size=1, num_workers=cpu_count(), shuffle=False)

class_preds = []
bbox_preds = []

for batch in test_data:
    batch_preds = model(batch['image'].float().to(device))

    class_pred = batch_preds['class_id'].argmax(-1).detach().cpu().numpy()
    bbox_pred = batch_preds['bbox'].detach().cpu().numpy()

    class_preds.append(class_pred.squeeze())
    bbox_preds.append(bbox_pred.squeeze())

In [34]:
class_preds = np.array(class_preds)
bbox_preds = np.array(bbox_preds)

In [35]:
submission = pd.DataFrame(
    index=test_df.filename,
    data={
        'class_id': class_preds,
        }
)

In [36]:
submission["xmin"] = bbox_preds[:, 0]*config['w_real']
submission["ymin"] = bbox_preds[:, 1]*config['h_real']
submission["xmax"] = bbox_preds[:, 2]*config['w_real']
submission["ymax"] = bbox_preds[:, 3]*config['h_real']
submission['class']=submission['class_id'].replace(config['id2obj'])

In [37]:
submission['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
chinook,15
f16,14
cougar,12
ah64,11
seahawk,6
f15,5


In [38]:
submission.to_csv('submission.csv')