# OpenAI CLIP
Clip from [huggingface](https://huggingface.co/docs/transformers/model_doc/clip) is not jit friendly and doesn't work very well with this competition format.

However, OpenAI made available their own jitted weights on their repo https://github.com/openai/CLIP

This notebook briefly shows how to import it and use it on a simple model ensemble.

In [1]:
!pip install timm torchinfo git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-8rw11k0m
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-8rw11k0m
  Resolved https://github.com/openai/CLIP.git to commit d50d76daa670286dd6cacf3bcd80b5e4823fc8e1
  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting timm
  Downloading timm-0.6.11-py3-none-any.whl (548 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m548.7/548.7 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchinfo
  Downloading torchinfo-1.7.1-py3-none-any.whl (22 kB)
Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l- \ done
[?25h  Created wheel for

In [2]:
import os
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchinfo import summary
from torchvision import transforms
import pickle
import timm
from timm.data import resolve_data_config

import clip
from clip.clip import _download, _MODELS

clip_code = 'ViT-L/14@336px'
model_path = _download(_MODELS[clip_code], os.path.expanduser("~/.cache/clip"))
with open(model_path, 'rb') as opened_file:
    clip_vit = torch.jit.load(opened_file, map_location="cuda:0").visual.eval()

100%|███████████████████████████████████████| 891M/891M [00:46<00:00, 20.0MiB/s]


In [3]:
# timm.list_models(pretrained=True)

In [4]:
N_DROP_CONV = 0
N_DROP_CLIP = 0

with open('../input/1009-pca-model/clip-vit.pickle', 'rb') as r:
    pca_clip = pickle.load(r)
with open('../input/1009-pca-model/convnext22k (1).pickle', 'rb') as r:
    pca_conv = pickle.load(r)
    


In [5]:
class BlendModel(nn.Module):
    """ Blend CLIP with another model from TIMM """
    def __init__(self, model2, model2weight, normalize):
        super().__init__()
        self.clip = clip_vit
        self.clip_norm = nn.Sequential(
            transforms.Resize(size=[336, 336]),
            transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]),
        )
        
        self.N_DROP_CONV = N_DROP_CONV
        self.N_DROP_CLIP = N_DROP_CLIP
        
        self.pca_mean_clip = torch.nn.Parameter(torch.tensor(pca_clip.mean_))
        self.pca_matrix_clip = torch.nn.Parameter(torch.tensor(pca_clip.components_))
        
        self.pca_mean_conv = torch.nn.Parameter(torch.tensor(pca_conv.mean_))
        self.pca_matrix_conv = torch.nn.Parameter(torch.tensor(pca_conv.components_))

        self.model2 = timm.create_model(model2, pretrained=True, num_classes=0)
        config = resolve_data_config({}, model=self.model2)
        print(model2, config)
        self.model2_norm = nn.Sequential(
            transforms.Resize(size=config['input_size'][-2:]),
#             transforms.Normalize(mean=config['mean'], std=config['std']),
            ## force clip norm params instead
            transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]),
        )
        self.model2weight = model2weight
        self.normalize = normalize

    def forward(self, x):
        x1 = self.clip_norm(x / 255.0)
        x1 = (self.clip(x1.half()))
        x1 = torch.nn.functional.normalize(x1)
        x1 = (x1 - self.pca_mean_clip) @ (self.pca_matrix_clip).T
        x2 = self.model2_norm(x / 255.0)
        x2 = (self.model2(x2))
        x2 = torch.nn.functional.normalize(x2)
        x2 = (x2 - self.pca_mean_conv) @ (self.pca_matrix_conv).T
        x1 = x1[:, self.N_DROP_CLIP : self.N_DROP_CLIP + 64]
        print(x1.shape)
        x2 = x2[:, self.N_DROP_CONV : self.N_DROP_CONV + 64]
        
        if self.normalize:
            return F.normalize(x1) + F.normalize(x2) * self.model2weight
        else:
            return x1 * (1 - self.model2weight) + x2 * self.model2weight

In [6]:
model = BlendModel(model2='convnext_xlarge_in22k', model2weight=0.2, normalize=False).to('cuda')
# model = BlendModel(model2='convnext_xlarge_384_in22ft1k', model2weight=0.82, normalize=False)
model.eval()

test_input_size = (1, 3, 480, 640)  # the model should work with any input_size
summary(model, input_size=test_input_size)

Downloading: "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth" to /root/.cache/torch/hub/checkpoints/convnext_xlarge_22k_224.pth


convnext_xlarge_in22k {'input_size': (3, 224, 224), 'interpolation': 'bicubic', 'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225), 'crop_pct': 0.875}
torch.Size([1, 64])


To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /usr/local/src/pytorch/aten/src/ATen/native/BinaryOps.cpp:601.)
  return forward_call(*input, **kwargs)


Layer (type:depth-idx)                                  Output Shape              Param #
├─VisualTransformer: 1-1                                --                        1,378,304
│    └─Conv2d: 2-1                                      --                        602,112
│    └─LayerNorm: 2-2                                   --                        2,048
│    └─Transformer: 2-3                                 --                        --
│    │    └─Sequential: 3-1                             --                        302,309,376
│    └─LayerNorm: 2-4                                   --                        2,048
BlendModel                                              [1, 64]                   304,476,928
├─Sequential: 1-2                                       [1, 3, 336, 336]          --
│    └─Resize: 2-5                                      [1, 3, 336, 336]          --
│    └─Normalize: 2-6                                   [1, 3, 336, 336]          --
├─Sequential: 1-3       

In [7]:
saved_model = torch.jit.script(model)
del model

saved_model.save('saved_model.pt')
del saved_model

# Verify saved model

In [8]:
saved_model = torch.jit.load('saved_model.pt').cuda()
input = torch.randint(0, 255, test_input_size, device='cuda', dtype=torch.uint8)

with torch.no_grad():
    output = saved_model(input)
    print(output.dtype, output.shape, output.mean().item())
    assert output.shape == torch.Size([test_input_size[0], 64])
    torch.cuda.synchronize()

[1, 64]
torch.float64 torch.Size([1, 64]) -0.12764165663020005


In [9]:
# %%timeit
# with torch.no_grad():
#     saved_model(input)
# torch.cuda.synchronize()

In [10]:
from zipfile import ZipFile
with ZipFile('submission.zip','w') as zip_file:           
    zip_file.write('./saved_model.pt', arcname='saved_model.pt') 

In [11]:
# !ls -lh