In [1]:
from transformers import BertModel

model = BertModel.from_pretrained("bert-base-uncased")



In [67]:
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [12]:
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import os
import torchvision

image_size = 224

transform =torchvision.transforms.Compose([
    torchvision.transforms.Resize(image_size), torchvision.transforms.ToTensor()
])

train_datasets = torchvision.datasets.CIFAR100(
    root="./data", train=True, transform=transform, download=True
)

test_datasets = torchvision.datasets.CIFAR100(
    root="./data", train=False, transform=transform, download=True
)

train_dataloader = DataLoader(
    train_datasets, batch_size=32, shuffle=True
)

test_dataloader = DataLoader(
    test_datasets, batch_size=32, shuffle=False
)




Files already downloaded and verified
Files already downloaded and verified


In [26]:
image = next(iter(train_dataloader))[0]

In [33]:
image[0][0]

tensor([[0.9020, 0.9020, 0.9020,  ..., 0.9020, 0.9020, 0.9020],
        [0.9020, 0.9020, 0.9020,  ..., 0.9020, 0.9020, 0.9020],
        [0.9020, 0.9020, 0.9020,  ..., 0.9020, 0.9020, 0.9020],
        ...,
        [0.3922, 0.3922, 0.3922,  ..., 0.4588, 0.4588, 0.4588],
        [0.3922, 0.3922, 0.3922,  ..., 0.4588, 0.4588, 0.4588],
        [0.3922, 0.3922, 0.3922,  ..., 0.4588, 0.4588, 0.4588]])

In [35]:
patch_size = 2
num_channel = 3
batch_size = 32
patch_window = torch.ones((patch_size, patch_size), dtype=torch.long)
patch_window = patch_window.expand(num_channel, patch_size, patch_size).unsqueeze(0).expand(batch_size, num_channel, patch_size, patch_size)


patch = image[:, :, :2, :2] * patch_window





torch.Size([32, 3, 2, 2])


In [49]:
position_ids = torch.tensor(list(range(10)), dtype=torch.long).expand(4, -1)

embed = nn.Embedding(10, 5)

print(embed(position_ids).shape)

torch.Size([4, 10, 5])


In [40]:
cls_embedding = nn.Embedding(1, 768)(torch.tensor(0))
print(cls_embedding.unsqueeze(0).shape)

torch.Size([1, 768])


In [66]:
l = []

for _ in range(8):
    l.append(nn.Linear(5, 5))

l = nn.ModuleList(l)
print(l[0])

Linear(in_features=5, out_features=5, bias=True)


In [60]:
embed_hidden_size, num_layer, num_head = 758, 12, 8

In [77]:
array = []

x = torch.ones(10, 10).unsqueeze(0)

for _ in range(10):
    array.append(nn.Linear(10, 10))

sequence = nn.Sequential(*array)

print(sequence(x))

tensor([[[ 0.1260, -0.1549,  0.2365,  0.1259, -0.1353,  0.0371, -0.2926,
          -0.1077,  0.2875, -0.0581],
         [ 0.1260, -0.1549,  0.2365,  0.1259, -0.1353,  0.0371, -0.2926,
          -0.1077,  0.2875, -0.0581],
         [ 0.1260, -0.1549,  0.2365,  0.1259, -0.1353,  0.0371, -0.2926,
          -0.1077,  0.2875, -0.0581],
         [ 0.1260, -0.1549,  0.2365,  0.1259, -0.1353,  0.0371, -0.2926,
          -0.1077,  0.2875, -0.0581],
         [ 0.1260, -0.1549,  0.2365,  0.1259, -0.1353,  0.0371, -0.2926,
          -0.1077,  0.2875, -0.0581],
         [ 0.1260, -0.1549,  0.2365,  0.1259, -0.1353,  0.0371, -0.2926,
          -0.1077,  0.2875, -0.0581],
         [ 0.1260, -0.1549,  0.2365,  0.1259, -0.1353,  0.0371, -0.2926,
          -0.1077,  0.2875, -0.0581],
         [ 0.1260, -0.1549,  0.2365,  0.1259, -0.1353,  0.0371, -0.2926,
          -0.1077,  0.2875, -0.0581],
         [ 0.1260, -0.1549,  0.2365,  0.1259, -0.1353,  0.0371, -0.2926,
          -0.1077,  0.2875, -0.0581],
 

In [62]:
class VisonTransformer(nn.Module):
    def __init__(self, num_classes, batch_size, image_size, num_channel, patch_size, embed_hidden_size, num_layer, num_head, MultiHeadAttention, encoder):
        super().__init__()
        self.batch_size = batch_size
        self.patch_size = patch_size
        self.image_size = image_size
        self.num_channel = num_channel
        self.num_patch = int((image_size / patch_size) * (image_size / patch_size))
        self.num_token = self.num_patch + 1
        self.num_layer = num_layer
        self.num_head = num_head        
        self.cls_id = torch.tensor(0, dtype=torch.long)
        self.cls_embedding = nn.Embedding(1, embed_hidden_size)
        self.embed_hidden_size = embed_hidden_size
        self.positional_embedding = nn.Embedding(self.num_token, embed_hidden_size)
        self.image_embedding = nn.Linear(patch_size * patch_size * self.num_patch, embed_hidden_size)
        self.layer_norm = nn.LayerNorm((batch_size, self.num_token, embed_hidden_size))
        self.dropout = nn.Dropout(p=0.9)
        self.fc = nn.Linear(embed_hidden_size, num_classes)
        args = (batch_size, image_size, num_channel, patch_size, embed_hidden_size, num_layer, num_head, MultiHeadAttention)
        self.setup_layer(num_layer, encoder, args)

    def image_to_token(self, images, patch_size):
        batch_size, num_channel, width, height = self.batch_size, self.num_channel, self.image_size, self.image_size
        num_patch = self.num_patch
        num_pixel_in_patch = patch_size * patch_size

        patch_window = torch.ones((patch_size, patch_size), dtype=torch.long)
        patch_window = patch_window.unsqueeze(0).expand(num_channel, patch_size, patch_size)\
            .unsqueeze(0).expand(batch_size, num_channel, patch_size, patch_size)
        
        token_list = torch.tensor([])

        for row_idx, in range(0, width, patch_size):
            for col_idx in range(0, height, patch_size):
                token_list = torch.concat([token_list, \
                                           images[row_idx:row_idx + patch_size, col_idx:col_idx + patch_size]], dim=0)
                
        token_list = token_list.transpose(1, 0, 2, 3, 4).view(batch_size, num_patch, num_channel, -1)
        token_list = token_list.view(batch_size, num_patch, -1)

        return token_list
    
    def positional_encoding(self, num_token):
        position_ids = torch.tensor(list(range(num_token)), dtype=torch.long).expand(self.batch_size, -1)
        positional_embeds = self.positional_embedding(position_ids)

        return positional_embeds
    
    def setup_layer(self, num_layer, encoder, args):
        layer_list = []
        for _ in range(num_layer):
            layer_list.append(encoder(args))

        module_list = nn.ModuleList(layer_list)
        self.layer_list = nn.Sequential(*module_list)

    def forward(self, images):
        
        cls_tokens = self.cls_embedding(self.cls_id).unsqueeze(0).expand(batch_size, 1, -1)
        image_tokens = self.image_to_token(images, self.patch_size)
        tokens = torch.concat([cls_tokens, image_tokens], dim=1)
        positional_embeds = self.positional_encoding(self.num_token)
        embed_tokens = tokens + positional_embeds
        encoded_outputs = self.layer_list(embed_tokens)
        cls = encoded_outputs[:, 0, :]
        layer_norm = self.layer_norm(cls)
        dropout = self.dropout(layer_norm)
        outputs = self.fc(dropout)

        return outputs
    
class encoder_layer(nn.Module):
    def __init__(self, batch_size, image_size, num_channel, patch_size, embed_hidden_size, num_layer, num_head, MultiHeadAttention, encoder):
        super().__init__()
        args = (batch_size, image_size, num_channel, patch_size, embed_hidden_size, num_layer, num_head, MultiHeadAttention)
        self.setup_layer(num_layer, encoder, args)

    def setup_layer(self, num_layer, encoder, args):
        layer_list = []
        for _ in range(num_layer):
            layer_list.append(encoder(args))

        module_list = nn.ModuleList(layer_list)
        self.layer_list = nn.Sequential(*module_list)

    def forward(self, tokens):
        encoded_outputs = self.layer_list(tokens)
     
        return encoded_outputs
        
class encoder(nn.Module):
    def __init__(self):
        super().__init__(batch_size, image_size, num_channel, patch_size, embed_hidden_size, num_layer, num_head, MultiHeadAttention)

        num_patch = int((image_size / patch_size) * (image_size / patch_size))
        num_token = num_patch + 1
        self.layer_norm1 = nn.LayerNorm((batch_size, num_token, embed_hidden_size))
        self.layer_norm2 = nn.LayerNorm((batch_size, num_token, embed_hidden_size))
        self.dropout = nn.Dropout(p=0.9)
        self.mlp = nn.Linear(embed_hidden_size, embed_hidden_size)
        self.attention_layer = MultiHeadAttention(embed_hidden_size, num_head)
        self.gelu = nn.GELU()

    def forward(self, tokens):
        layer_norm1 = self.layer_norm1(tokens)
        dropout1 = self.dropout(layer_norm1)
        skip1 = tokens
        concat_attention = self.attention_layer(dropout1)
        outputs_tmp1 = concat_attention + skip1
        skip2 = outputs_tmp1
        layer_norm2 = self.layer_norm2(outputs_tmp1)
        dropout2 = self.dropout(layer_norm2)
        mlp = self.gelu(self.mlp(dropout2))
        outputs = mlp + skip2

        return outputs

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_hidden_size, num_head):

        super().__init__()
        self.attention_layers = []
        self.query_layers = []
        self.key_layers = []
        self.value_layers = []       

    def output_attention(self, tokens, num_head):

        multi_embed_hidden_size = int(embed_hidden_size / num_head)
        for number in range(num_head):
            self.query_layers.append(nn.Linear(embed_hidden_size, multi_embed_hidden_size))
            self.key_layers.append(nn.Linear(embed_hidden_size, multi_embed_hidden_size))
            self.value_layers.append(nn.Linear(embed_hidden_size, multi_embed_hidden_size))

        self.query_layers = nn.ModuleList(self.query_layers)
        self.key_layers = nn.ModuleList(self.key_layers)
        self.value_layers = nn.ModuleList(self.value_layers)

        for number in range(num_head):
            query = self.query_layers[number](tokens)
            key = self.key_layers[number](tokens)
            value = self.value_layers[number](tokens)
            attention = nn.Softmax((query@torch.transpose(key, 1, 2)) / torch.sqrt(multi_embed_hidden_size), dim=-1)@value

            if number > 0: concat_attention = torch.concat([concat_attention, attention], dim=-1)
            else:concat_attention = attention

        return concat_attention
    
    def forward(self, tokens):

        concat_attention = self.output_attention(tokens, self.num_head)

        return concat_attention

            




            






    


        



        



LayerNorm((32, 12545, 758), eps=1e-05, elementwise_affine=True)
