_____
**Credits:**
- Parts of this notebook are based on [this](https://www.kaggle.com/code/carloalbertobarbano/pytorch-ensemble-pretrained-baselines-training) great notebook by [Carlo Alberto](https://www.kaggle.com/carloalbertobarbano).
- Parts of this notebook are based on [this](https://www.kaggle.com/code/hwigeon/dino-pretrained-weight-inference) great notebook by [HWIGEON OH](https://www.kaggle.com/hwigeon).
_____

## Embeddings Brute Force

In this notebook we try to find the best combination of embeddings simply by trial and error. 
For each submission we combine multiple pretrained backbones (EfficientNet and DINO ViT) and observe the resulting score. 

Use it as a reference to check what works well (and what doesn't work well). 

**Make sure to check previous versions so you can compare different ensembles and base models**

In [1]:
import gc
import os
import sys
import torch
import torch.nn as nn
from zipfile import ZipFile
from functools import partial
from torchvision import models
from torchvision import transforms

### Since we don't have internet..
..we have to "bring along" the trained models' weights as datasets. 
I made multiple datasets containing pretrained models of multiple backbones. (see: supported models down below)

**Making the dataset**

Making this dataset was extremely easy:
- I simply executed this notebook with the internet turned on. 
- Saved everything that was downloaded on to the machine. 
- Uploaded the cached files as a separate dataset.
- Turned the internet off. 
- (profit)


In [2]:
!mkdir -p ~/.cache/torch/hub/checkpoints/

# Torchvision::EfficientNet
!cp -r ../input/torchvision-efficientnets/* ~/.cache/torch/hub/checkpoints/

# Hub::DINO-VIT
!cp -r ../input/torchhub-dino-vit/hub/* ~/.cache/torch/hub/
sys.path.append('/root/.cache/torch/hub/facebookresearch_dino_main')

In [3]:
# code from https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
# Little fix for jit

"""
Mostly copy-paste from timm library.
https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
"""
import math
from functools import partial

import torch
import torch.nn as nn

from utils import trunc_normal_


def drop_path(x, drop_prob: float = 0., training: bool = False):
    """Stochastic depth: randomly zero whole samples of ``x`` and rescale the rest.

    Args:
        x: Input tensor; its first dimension is treated as the per-sample axis.
        drop_prob: Probability of dropping a sample.
        training: Dropping is only applied when this is True.

    Returns:
        ``x`` itself when ``drop_prob`` is 0 or ``training`` is False; otherwise a
        tensor in which every sample is either zeroed or divided by ``1 - drop_prob``.
    """
    if not training or drop_prob == 0.:
        return x
    survive_prob = 1 - drop_prob
    # One random draw per sample, broadcastable over all remaining dimensions
    # (works with tensors of any rank, not just 2D ConvNet activations).
    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    keep_mask = torch.rand(mask_shape, dtype=x.dtype, device=x.device)
    # uniform[0, 1) + survive_prob floors to 1 with probability survive_prob, else 0.
    keep_mask = (survive_prob + keep_mask).floor_()
    return x.div(survive_prob) * keep_mask


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Args:
        drop_prob: Probability of dropping a sample. ``None`` (the default)
            disables dropping entirely.
    """
    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        """Apply ``drop_path`` to ``x`` using this module's probability and training flag."""
        # With the default drop_prob=None, drop_path would evaluate `1 - None`
        # in training mode and raise TypeError; treat None as "never drop".
        if self.drop_prob is None:
            return x
        return drop_path(x, self.drop_prob, self.training)


class Mlp(nn.Module):
    """Two-layer feed-forward network: Linear -> activation -> dropout -> Linear -> dropout.

    Args:
        in_features: Size of the input's last dimension.
        hidden_features: Width of the hidden layer; falls back to ``in_features``.
        out_features: Size of the output's last dimension; falls back to ``in_features``.
        act_layer: Zero-argument callable that builds the activation module.
        drop: Dropout probability used after both linear layers.
    """
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        # Any falsy width (None or 0) falls back to the input width.
        output_dim = out_features if out_features else in_features
        hidden_dim = hidden_features if hidden_features else in_features
        self.fc1 = nn.Linear(in_features, hidden_dim)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Run ``x`` through both layers; the same dropout module is applied twice."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))


class Attention(nn.Module):
    """Multi-head self-attention that returns both the output and the attention map.

    Args:
        dim: Size of the input's last dimension.
        num_heads: Number of attention heads; each head gets ``dim // num_heads`` channels.
        qkv_bias: Whether the fused query/key/value projection has a bias.
        qk_scale: Score scale; when falsy, ``(dim // num_heads) ** -0.5`` is used.
        attn_drop: Dropout probability applied to the attention weights.
        proj_drop: Dropout probability applied after the output projection.
    """
    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        per_head_dim = dim // num_heads
        self.scale = qk_scale or per_head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        """Attend over ``x`` of shape (B, N, C).

        Returns:
            A tuple ``(out, attn)``: ``out`` has shape (B, N, C) and ``attn`` holds
            the post-softmax (and post-dropout) weights of shape (B, heads, N, N).
        """
        batch, tokens, channels = x.shape
        qkv = self.qkv(x)
        # (B, N, 3*C) -> (3, B, heads, N, C // heads)
        qkv = qkv.reshape(batch, tokens, 3, self.num_heads, channels // self.num_heads)
        qkv = qkv.permute(2, 0, 3, 1, 4)
        query, key, value = qkv[0], qkv[1], qkv[2]

        scores = torch.matmul(query, key.transpose(-2, -1)) * self.scale
        attn = self.attn_drop(scores.softmax(dim=-1))

        # Merge the heads back into the channel dimension.
        out = torch.matmul(attn, value).transpose(1, 2).reshape(batch, tokens, channels)
        out = self.proj_drop(self.proj(out))
        return out, attn


class Block(nn.Module):
    """Pre-norm transformer block: self-attention and MLP, each wrapped in a residual connection.

    Args:
        dim: Token embedding size.
        num_heads: Number of attention heads.
        mlp_ratio: Hidden width of the MLP as a multiple of ``dim``.
        qkv_bias, qk_scale, attn_drop: Forwarded to ``Attention``.
        drop: Dropout probability for the attention projection and the MLP.
        drop_path: Stochastic-depth probability; values <= 0 use ``nn.Identity``.
        act_layer: Activation constructor for the MLP.
        norm_layer: Constructor for the two normalisation layers.
    """
    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop,
        )

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        # The attention module also returns its weight map; it is not needed here.
        attn_out, _attn_weights = self.attn(self.norm1(x))
        x = x + self.drop_path(attn_out)
        mlp_out = self.mlp(self.norm2(x))
        return x + self.drop_path(mlp_out)


class PatchEmbed(nn.Module):
    """Image to Patch Embedding.

    Splits an image into non-overlapping ``patch_size`` x ``patch_size`` patches
    with a strided convolution and returns one ``embed_dim`` vector per patch.
    """
    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        patches_per_side = img_size // patch_size
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = patches_per_side * patches_per_side

        # kernel == stride, so every patch is projected exactly once.
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        """Map a (B, C, H, W) image batch to (B, num_patches, embed_dim) tokens."""
        # The unpacking enforces a 4-D input; the individual sizes are not used.
        _batch, _chans, _height, _width = x.shape
        patch_grid = self.proj(x)
        return patch_grid.flatten(2).transpose(1, 2)


class VisionTransformer(nn.Module):
    """Vision Transformer.

    Embeds an image into patch tokens, prepends a learnable [CLS] token, adds
    (optionally interpolated) positional embeddings, and runs ``depth``
    transformer blocks followed by a final normalisation layer.

    Args:
        img_size: Sequence whose first element is the reference image side used
            to size the positional embedding.
        patch_size: Side length of each square patch.
        in_chans: Number of input image channels.
        num_classes: Size of the classifier head; 0 makes the head an identity.
        embed_dim: Token embedding size.
        depth: Number of transformer blocks.
        num_heads, mlp_ratio, qkv_bias, qk_scale: Forwarded to every ``Block``.
        drop_rate: Dropout probability for tokens and inside the blocks.
        attn_drop_rate: Dropout probability for attention weights.
        drop_path_rate: Maximum stochastic-depth rate (linearly increased per block).
        norm_layer: Constructor for normalisation layers.
    """
    def __init__(self, img_size=[224], patch_size=16, in_chans=3, num_classes=0, embed_dim=768, depth=12,
                 num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
                 drop_path_rate=0., norm_layer=nn.LayerNorm, **kwargs):
        super().__init__()
        self.num_features = self.embed_dim = embed_dim

        self.patch_embed = PatchEmbed(
            img_size=img_size[0], patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # One positional embedding per patch plus one for the [CLS] token.
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_rate)

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
        self.blocks = nn.ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer)
            for i in range(depth)])
        self.norm = norm_layer(embed_dim)

        # Classifier head
        self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()

        trunc_normal_(self.pos_embed, std=.02)
        trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise Linear layers (truncated normal, zero bias) and LayerNorm (weight 1, bias 0)."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def interpolate_pos_encoding(self, x, w: int, h: int):
        """Return positional embeddings matching the token count of ``x``.

        The stored embedding is returned unchanged when the number of patch
        tokens matches and ``w == h``; otherwise the patch part is resized with
        bicubic interpolation and the [CLS] embedding is re-attached in front.
        """
        npatch = x.shape[1] - 1
        N = self.pos_embed.shape[1] - 1
        if npatch == N and w == h:
            return self.pos_embed
        class_pos_embed = self.pos_embed[:, 0]
        patch_pos_embed = self.pos_embed[:, 1:]
        dim = x.shape[-1]
        w0 = w // self.patch_embed.patch_size
        h0 = h // self.patch_embed.patch_size
        # we add a small number to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        w0, h0 = w0 + 0.1, h0 + 0.1
        patch_pos_embed = nn.functional.interpolate(
            # The stored patch embeddings are laid out as a sqrt(N) x sqrt(N) grid.
            patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2),
            scale_factor=[float(w0 / math.sqrt(N)), float(h0 / math.sqrt(N))],
            mode='bicubic',
        )
        assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

    def prepare_tokens(self, x):
        """Turn an image batch into the token sequence fed to the transformer blocks."""
        B, nc, w, h = x.shape
        x = self.patch_embed(x)  # patch linear embedding

        # add the [CLS] token to the embed patch tokens
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)

        # add positional encoding to each token
        x = x + self.interpolate_pos_encoding(x, w, h)

        return self.pos_drop(x)

    def forward(self, x):
        """Return the normalised [CLS] token embedding for every image in the batch."""
        x = self.prepare_tokens(x)
        for blk in self.blocks:
            x = blk(x)
        x = self.norm(x)
        return x[:, 0]

    def get_last_selfattention(self, x):
        """Return the attention weights computed by the last block."""
        x = self.prepare_tokens(x)
        for i, blk in enumerate(self.blocks):
            if i < len(self.blocks) - 1:
                x = blk(x)
            else:
                # Block.forward takes no `return_attention` argument, so calling
                # blk(x, return_attention=True) raised TypeError. Query the
                # block's attention sub-module directly on the normalised input.
                _, attn = blk.attn(blk.norm1(x))
                return attn

    def get_intermediate_layers(self, x, n=1):
        """Return the normalised output tokens of the last ``n`` blocks, in block order."""
        x = self.prepare_tokens(x)
        # we return the output tokens from the `n` last blocks
        output = []
        for i, blk in enumerate(self.blocks):
            x = blk(x)
            if len(self.blocks) - i <= n:
                output.append(self.norm(x))
        return output

def vit_tiny(patch_size=16, **kwargs):
    """Build a ViT-Tiny ``VisionTransformer`` (embed_dim=192, depth=12, 3 heads).

    Extra keyword arguments are forwarded to ``VisionTransformer``.
    """
    layer_norm = partial(nn.LayerNorm, eps=1e-6)
    return VisionTransformer(
        patch_size=patch_size,
        embed_dim=192,
        depth=12,
        num_heads=3,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=layer_norm,
        **kwargs)


def vit_small(patch_size=16, **kwargs):
    """Build a ViT-Small ``VisionTransformer`` (embed_dim=384, depth=12, 6 heads).

    Extra keyword arguments are forwarded to ``VisionTransformer``.
    """
    layer_norm = partial(nn.LayerNorm, eps=1e-6)
    return VisionTransformer(
        patch_size=patch_size,
        embed_dim=384,
        depth=12,
        num_heads=6,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=layer_norm,
        **kwargs)


def vit_base(patch_size=16, **kwargs):
    """Build a ViT-Base ``VisionTransformer`` (embed_dim=768, depth=12, 12 heads).

    Extra keyword arguments are forwarded to ``VisionTransformer``.
    """
    layer_norm = partial(nn.LayerNorm, eps=1e-6)
    return VisionTransformer(
        patch_size=patch_size,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=layer_norm,
        **kwargs)

### Bruteforce Configurations

**The fun part:** Now we get to test out different bruteforce configurations. 
We start by choosing which backbone architectures to use and whether to blend their embeddings with the mean or the median. 


I will update this later with different approaches and ideas. 

_____

#### Supported Models 
###### (Last update: 07.17.2022)

**EfficientNet**

- `b1` - EfficientNet B1
- `b2` - EfficientNet B2
- `b3` - EfficientNet B3
- `b4` - EfficientNet B4
- `b5` - EfficientNet B5
- `b6` - EfficientNet B6
- `b7` - EfficientNet B7

**Vision Transformers (ViT) + DINO**
 
- `dino_vitb8` - DINO ViT B8
- `dino_vits8` - DINO ViT S8
- `dino_vits16` - DINO ViT S16
- `dino_vitb16` - DINO ViT B16
_____

In [4]:
# How the per-backbone embeddings are combined: 'mean' or 'median'.
BATCH_BLENDING_TYPE = 'mean'

# Backbones to ensemble. Keys must exist in the encoder lookup tables defined
# further down (EfficientNet 'b0'..'b7' and the 'dino_vit*' variants).
BACKBONES = [
    'b1',
    'b4',
    'dino_vits8',
    'dino_vitb8',
]

#### EfficientNet Model

> Credit: [Notebook](https://www.kaggle.com/code/carloalbertobarbano/pytorch-ensemble-pretrained-baselines-training) by [Carlo Alberto](https://www.kaggle.com/carloalbertobarbano).

In [5]:
class EfficientNet(nn.Module):
    """EfficientNet backbone whose classifier is swapped for pooling down to 64 values.

    Preprocessing (resize, centre crop, scaling and normalisation) happens
    inside ``forward`` so that it is part of the module itself.

    Args:
        encoder_fn: Callable invoked as ``encoder_fn(pretrained=True)`` to build the backbone.
        resize: Target size passed to ``transforms.functional.resize``.
        size: Output size passed to ``transforms.functional.center_crop``.
    """
    def __init__(self, encoder_fn, resize, size):
        super().__init__()
        self.encoder = encoder_fn(pretrained=True)
        # Replace the classification layer so the backbone emits a 64-d embedding.
        self.encoder.classifier = nn.AdaptiveAvgPool1d(64)
        self.resize = resize
        self.size = size

    def forward(self, x):
        """Preprocess ``x`` and return the backbone's pooled embedding."""
        resized = transforms.functional.resize(x, self.resize)
        cropped = transforms.functional.center_crop(resized, self.size)
        # Divides by 255 before normalising, so pixel values are presumably in
        # [0, 255] -- TODO confirm against the caller.
        scaled = cropped / 255.
        normalized = transforms.functional.normalize(
            scaled, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        return self.encoder(normalized)

#### DINO ViT Model
> Credit: [Notebook](https://www.kaggle.com/code/hwigeon/dino-pretrained-weight-inference) by [HWIGEON OH](https://www.kaggle.com/hwigeon).

In [6]:
class DINO(nn.Module):
    """Wraps an encoder with built-in preprocessing and pools its output to 64 values.

    Args:
        encoder: Module applied to the preprocessed image batch.
        resize: Side length of the square the input is resized to.
        size: Side length of the square centre crop taken afterwards.
    """
    def __init__(self, encoder, resize, size):
        super().__init__()
        self.encoder = encoder
        self.pool = nn.AdaptiveAvgPool1d(64)
        self.resize = resize
        self.size = size

    def forward(self, x):
        """Preprocess ``x``, encode it and return the pooled embedding."""
        resized = transforms.functional.resize(x, [self.resize, self.resize])
        cropped = transforms.functional.center_crop(resized, [self.size, self.size])
        # Divides by 255 before normalising, so pixel values are presumably in
        # [0, 255] -- TODO confirm against the caller.
        scaled = cropped / 255.
        normalized = transforms.functional.normalize(
            scaled, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        features = self.encoder(normalized)
        return self.pool(features)

#### Bruteforce Parameter Options: "BACKBONES"

In [7]:
# Short backbone key ('b0' .. 'b7') -> torchvision EfficientNet constructor
# (models.efficientnet_b0 .. models.efficientnet_b7).
eff_encoders_fn = {
    f'b{variant}': getattr(models, f'efficientnet_b{variant}')
    for variant in range(8)
}

# DINO hub model name -> local ViT constructor of the matching width.
# The patch size (8 or 16) is not encoded here; it is chosen where the
# constructor is called.
dino_encoders_names = dict(
    dino_vitb8=vit_base,
    dino_vitb16=vit_base,
    dino_vits8=vit_small,
    dino_vits16=vit_small,
)

# Backbone key -> pair of image sizes used for preprocessing
# (first value: resize target, second value: centre-crop size).
sizes = {
    'b0': (256, 224),
    'b1': (256, 240),
    'b2': (288, 288),
    'b3': (320, 300),
    'b4': (384, 380),
    'b5': (489, 456),
    'b6': (561, 528),
    'b7': (633, 600),
    # Every DINO ViT variant shares the same preprocessing sizes.
    **dict.fromkeys(
        ('dino_vitb8', 'dino_vits8', 'dino_vitb16', 'dino_vits16'),
        (256, 224),
    ),
}

#### Bruteforce Parameter Options: "BLENDING_TYPE"

In [8]:
class EnsembleMean(nn.Module):
    """Blend several encoders by averaging their outputs element-wise.

    Args:
        encoders: Sequence of modules that all return tensors of the same shape
            for a given input.
    """
    def __init__(self, encoders):
        super().__init__()
        # Register every encoder under its own attribute so it is tracked as a child module.
        for idx, encoder in enumerate(encoders):
            setattr(self, f'encoder{idx}', encoder)
        self.num_encoders = len(encoders)

    def forward(self, x):
        """Return the per-sample mean of all encoder outputs (same shape as one output)."""
        outputs = []
        for name, encoder in self.named_children():
            outputs.append(encoder(x))
        # Stack on a new leading "encoder" axis and reduce only over it.
        # Concatenating along dim 0 and averaging (the previous approach) also
        # averaged across the batch, which is only correct for a batch of one;
        # for a single-sample batch both approaches give the same (1, D) result.
        stacked = torch.stack(outputs, dim=0)
        return stacked.mean(dim=0)

class EnsembleMedian(nn.Module):
    """Blend several encoders by taking the element-wise median of their outputs.

    Args:
        encoders: Sequence of modules that all return tensors of the same shape
            for a given input.
    """
    def __init__(self, encoders):
        super().__init__()
        # Register every encoder under its own attribute so it is tracked as a child module.
        for idx, encoder in enumerate(encoders):
            setattr(self, f'encoder{idx}', encoder)
        self.num_encoders = len(encoders)

    def forward(self, x):
        """Return the per-sample median of all encoder outputs (same shape as one output)."""
        outputs = []
        for name, encoder in self.named_children():
            outputs.append(encoder(x))
        # Stack on a new leading "encoder" axis so the reduction never mixes
        # different samples of the batch.
        stacked = torch.stack(outputs, dim=0)
        # median(dim=...) returns a (values, indices) pair, so calling
        # .unsqueeze() on the result directly raised AttributeError.
        median_values, _median_indices = stacked.median(dim=0)
        return median_values

### Brute Force Execution

**BACKBONES**

In [9]:
# Instantiate one preprocessing + backbone wrapper per configured backbone.
encoders = []
for encoder_name in BACKBONES:
    # Each entry of `sizes` is a (resize, crop) pair.
    resize_to, crop_to = sizes[encoder_name]
    if encoder_name in eff_encoders_fn:
        # Crop to a crop_to x crop_to square. Passing the raw (resize, crop)
        # pair as the crop size produced a non-square crop (e.g. 256x240 for
        # 'b1') instead of the intended 240x240, unlike the DINO branch below.
        encoders.append(EfficientNet(
            encoder_fn=eff_encoders_fn[encoder_name],
            resize=(resize_to, resize_to),
            size=(crop_to, crop_to),
        ))
    elif encoder_name in dino_encoders_names:
        patch_size = 16 if '16' in encoder_name else 8
        encoder = dino_encoders_names[encoder_name](patch_size=patch_size)
        # Copy the pretrained hub weights into the local ViT implementation.
        pretrained = torch.hub.load('facebookresearch/dino:main', encoder_name)
        encoder.load_state_dict(pretrained.state_dict())
        encoders.append(DINO(encoder, resize=resize_to, size=crop_to))
    else:
        # Fail loudly rather than silently building a smaller ensemble.
        raise ValueError(f'No encoder factory registered for backbone {encoder_name!r}')
print('Backbones done.')

Using cache found in /root/.cache/torch/hub/facebookresearch_dino_main
Using cache found in /root/.cache/torch/hub/facebookresearch_dino_main


Backbones done.


**BLENDING_TYPE**

In [10]:
# Wrap the encoders in the blending module selected by BATCH_BLENDING_TYPE.
if BATCH_BLENDING_TYPE == 'mean':
    ensemble = EnsembleMean(encoders)
elif BATCH_BLENDING_TYPE == 'median':
    ensemble = EnsembleMedian(encoders)
else:
    # Without this branch a typo only surfaced as a NameError on `ensemble` below.
    raise ValueError(
        f"Unsupported BATCH_BLENDING_TYPE {BATCH_BLENDING_TYPE!r}; expected 'mean' or 'median'")
# Inference mode: disables dropout and uses running statistics in normalisation layers.
ensemble.eval()
print('Batch blending type done.')

Batch blending type done.


### Submission

Now that we have our models ready, we can submit them and observe the LB score.

In [11]:
# Compile the ensemble to TorchScript and serialise it.
saved_model = torch.jit.script(ensemble)
saved_model.save('saved_model.pt')

# Package the serialised model as the submission archive. The handle is named
# `archive` so the builtin `zip` is not shadowed for the rest of the notebook.
with ZipFile('submission.zip', 'w') as archive:
    archive.write('./saved_model.pt', arcname='saved_model.pt')