In [1]:
import torch.nn as nn

num_channels = 3
hidden_size = 768
patch_size = 16
conv = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

In [110]:
import torch

fake_image = torch.rand((1, 3, 224, 168))

print(conv(fake_image).reshape(1, -1, 768).shape)

torch.Size([1, 140, 768])


In [111]:
from transformers import AutoFeatureExtractor, AutoModelForImageClassification

model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224")

In [112]:
from transformers.models.vit.modeling_vit import ViTPatchEmbeddings

embs = ViTPatchEmbeddings(model.config)

In [113]:
embs(fake_image).shape

ValueError: Input image size (224*168) doesn't match model (224*224).

In [158]:
# How to compute the positional embeddings

# BS, hidden dim, x patches, y patches
patches = conv(fake_image)
# print(patches.shape)

# To project to the right hidden embedding dim
hidden_dim = 768
x_projection = torch.nn.Linear(1, hidden_dim)
y_projection = torch.nn.Linear(1, hidden_dim)

batch_size, x_size, y_size = patches.shape[0], patches.shape[2], patches.shape[3]

# x_embeddings
patches_x_embeddings = torch.arange(x_size).view(batch_size, 1, -1) / x_size
patches_x_embeddings = x_projection(patches_x_embeddings.T).view(batch_size, x_size, hidden_dim)
patches_x_embeddings = patches_x_embeddings.expand(y_size, batch_size, x_size, hidden_dim)
patches_x_embeddings = patches_x_embeddings.reshape(batch_size, -1, hidden_dim)

# y_embeddings
patches_y_embeddings = torch.arange(y_size).view(batch_size, 1, -1) / y_size
patches_y_embeddings = y_projection(patches_y_embeddings.T).view(batch_size, y_size, hidden_dim)
patches_y_embeddings = patches_y_embeddings.expand(x_size, batch_size, y_size, hidden_dim)
patches_y_embeddings = patches_y_embeddings.reshape(batch_size, -1, hidden_dim)

patches_positional_embeddings = patches_x_embeddings + patches_y_embeddings

In [165]:
# Retaining the aspect ratio for a given effective resolution

img1 = torch.rand((1, 3, 1024, 512))
img2 = torch.rand((1, 3, 256, 272))

patches_1 = conv(img1)
patches_2 = conv(img2)
print(f'num_patches={patches_1.shape[-1]*patches_1.shape[-2]}')
print(f'num_patches={patches_2.shape[-1]*patches_2.shape[-2]}')

num_patches=2048
num_patches=272


In [35]:
1024 / 16

64.0

In [38]:
img1 = torch.rand((1, 3, 1024, 512))
.shape

torch.Size([1, 768, 2048])

In [45]:
# Retaining the aspect ratio for a given effective resolution
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import math
import cv2 

num_channels = 3
hidden_size = 768
patch_size = 16
conv = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

effective_resolution = 512

for x_size, y_size in [(1024, 512), (1024, 1024), (256, 256), (1121, 456)]:
    imarray = torch.rand(x_size, y_size,3) * 255
    # im = Image.fromarray(imarray.astype('uint8'))

    aspect_ratio = x_size / y_size

    new_y = np.sqrt(effective_resolution**2/aspect_ratio)
    new_x = new_y * aspect_ratio
    
    
    rounded_y = math.floor(new_y)
    rounded_x = math.floor(new_x)
    # resized_img = im.resize(())
    res = cv2.resize(imarray, dsize=(rounded_y, rounded_x), interpolation=cv2.INTER_CUBIC)
    print(rounded_y, rounded_x)
    print(rounded_x / rounded_y, aspect_ratio)
    print(effective_resolution**2, rounded_x*rounded_y)
    img = torch.tensor(res, dtype=torch.float).view(3, rounded_x, rounded_y).unsqueeze(0)

    patches = conv(img)
    print(f'num_patches={patches.shape[-1]*patches.shape[-2]}')
    
    # Then we need to pad to the max

error: OpenCV(4.5.4) :-1: error: (-5:Bad argument) in function 'resize'
> Overload resolution failed:
>  - src is not a numpy array, neither a scalar
>  - Expected Ptr<cv::UMat> for argument 'src'


In [31]:
patches = patches.view(1, hidden_size, -1)
seq_length = patches.shape[-1]
max_patches = int((effective_resolution**2)/(patch_size**2))
torch.nn.functional.pad(patches, (0,max_patches-seq_length)).shape

torch.Size([1, 768, 1024])

In [1]:
from transformers.models.vit.image_processing_vit import ViTImageProcessor



In [2]:
from transformers.models.vit.feature_extraction_vit import ViTFeatureExtractor

In [9]:
from extractor import PatchPackProcessor
from model import PatchPackModel, PatchPackModelImageClassification
from PIL import Image
import requests

processor = PatchPackProcessor.from_pretrained("google/vit-base-patch16-224")

url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)
processor(image, return_tensors="pt")['pixel_values'].shape

torch.Size([1, 3, 480, 640])

In [19]:
from transformers import ViTConfig

config = ViTConfig()
model = PatchPackModelImageClassification(config)

In [11]:
# Load model directly
from transformers import AutoFeatureExtractor, AutoModelForImageClassification

extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224")

In [8]:
type(model)

transformers.models.vit.modeling_vit.ViTForImageClassification

In [14]:
[elem for elem in model.named_parameters()]

[('vit.embeddings.cls_token',
  Parameter containing:
  tensor([[[ 9.8962e-03,  1.4689e-02, -2.6690e-01, -5.8776e-04,  4.0510e-01,
             5.3885e-02, -2.2178e-02,  2.3369e-02,  4.0966e-02, -2.1950e-01,
             3.1851e-04, -1.1463e-02, -1.3613e-02,  1.0975e-03, -1.2994e-02,
            -7.2687e-03,  7.6848e-04,  5.7467e-02,  3.0104e-02, -6.2324e-03,
            -4.9313e-02,  5.0945e-03, -2.9701e-03, -1.9993e-02,  1.9781e-03,
             5.4685e-02,  4.0473e-03, -3.8214e-03,  3.6615e-02, -2.2641e-02,
             4.3593e-04,  2.3658e-02, -1.8996e-02,  1.7957e-02,  1.8261e-02,
             4.6318e-03,  5.1867e-02,  2.9562e-04, -5.8164e-03, -1.2062e-02,
             3.7889e-03, -9.2025e-04,  1.9424e-02,  1.3673e-02,  4.5668e-02,
             4.3681e-01, -1.1325e-02,  2.7734e-04,  3.6051e-02, -1.1955e-03,
            -3.1415e-03, -3.4800e-02,  7.8660e-03,  9.3829e-04,  2.2359e-02,
            -1.2542e-02, -2.9033e-02, -6.6412e-03,  8.0147e-03, -6.5374e-02,
            -6.4394e-0

In [51]:
1024*512

524288

In [None]:
256**2

1024

In [3]:
from transformers import XLMRobertaTokenizer

tokenizer = XLMRobertaTokenizer.from_pretrained(
            "hyunwoongko/asian-bart-ecjk"
        )

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBartTokenizer'. 
The class this function is called from is 'XLMRobertaTokenizer'.


In [6]:
tokenizer.decode(tokenizer('A1')['input_ids'])

'<s> A<unk></s>'