In [16]:
import tensorflow as tf
from tensorflow.keras.layers import Layer,Dense,Flatten,Dropout,LayerNormalization
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input,Embedding,Conv1D,Flatten,Dense,Dropout,LayerNormalization
from tensorflow.keras.models import Model
import numpy as np
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.utils import to_categorical

##### `PatchEmbedding` 
- divides input images into non-overlapping patches, flattens them, and projects each patch into a specified embedding dimension

In [2]:
class PatchEmbedding(Layer):
    """Cut images into non-overlapping square patches and linearly embed each one.

    Args:
        patch_size: side length (in pixels) of every square patch.
        embed_dim: number of units of the Dense projection applied to each flattened patch.
        **kwargs: standard Keras ``Layer`` arguments (``name``, ``dtype``, ...), forwarded to the
            base class so the layer can be rebuilt from its config.
    """
    def __init__(self,patch_size,embed_dim,**kwargs):
        super(PatchEmbedding,self).__init__(**kwargs)
        self.patch_size=patch_size
        self.embed_dim=embed_dim
        self.projection=Dense(embed_dim)

    def call(self,images):
        """Return patch embeddings of shape (batch_size,num_patches,embed_dim)."""
        # images -> (batch_size,height,width,channels)
        batch_size=tf.shape(images)[0]
        # Stride equals the patch size, so patches do not overlap; 'VALID' drops any border
        # that does not fill a whole patch.
        patches=tf.image.extract_patches(images=images,sizes=[1,self.patch_size,self.patch_size,1],strides=[1,self.patch_size,self.patch_size,1],rates=[1,1,1,1],padding='VALID') # (batch_size,rows,cols,patch_size*patch_size*channels)
        patch_dims=patches.shape[-1]  # static size of one flattened patch
        patches=tf.reshape(patches,[batch_size,-1,patch_dims]) # (batch_size,num_patches,patch_dims)
        embeddings=self.projection(patches) # (batch_size,num_patches,embed_dim)
        return embeddings

    def get_config(self):
        """Return the constructor arguments so a saved model can re-create this layer.

        When loading, the class still has to be supplied (e.g. through ``custom_objects``).
        """
        config=super(PatchEmbedding,self).get_config()
        config.update({"patch_size":self.patch_size,"embed_dim":self.embed_dim})
        return config


### `PositionalEncoding`
- adds positional information to the input embeddings

In [3]:
class PositionalEncoding(Layer):
    """Add a fixed sinusoidal positional encoding to a sequence of embeddings.

    The table is built once in ``__init__``. The first half of its columns holds the sine
    terms and the second half the cosine terms (concatenated, not interleaved).

    Args:
        num_patches: sequence length the table is built for.
        embed_dim: width of each embedding vector.
        **kwargs: standard Keras ``Layer`` arguments, forwarded to the base class.
    """
    def __init__(self,num_patches,embed_dim,**kwargs):
        super(PositionalEncoding,self).__init__(**kwargs)
        # Stored so get_config() can report them.
        self.num_patches=num_patches
        self.embed_dim=embed_dim
        self.pos_encoding=self.positional_encoding(num_patches,embed_dim)

    def positional_encoding(self,num_patches,embed_dim):
        """Build the (num_patches,embed_dim) sinusoidal table."""
        positions=tf.range(num_patches,dtype=tf.float32)[:,tf.newaxis]  # (num_patches,1)
        # One frequency per pair of channels: exp(-2i*ln(10000)/embed_dim)
        div_term=tf.exp(tf.range(0,embed_dim,2,dtype=tf.float32)*-(tf.math.log(10000.0)/embed_dim))
        even_indices=tf.sin(positions*div_term)
        odd_indices=tf.cos(positions*div_term)
        pos_encoding=tf.concat([even_indices,odd_indices],axis=1)
        # For odd embed_dim the concatenation is one column too wide; trim it.
        return pos_encoding[:,:embed_dim]  # (num_patches,embed_dim)

    def call(self,x): #incorporate positional information
        # (num_patches,embed_dim) broadcasts over the leading batch axis of x.
        return x+self.pos_encoding

    def get_config(self):
        """Return the constructor arguments so a saved model can re-create this layer.

        When loading, the class still has to be supplied (e.g. through ``custom_objects``).
        """
        config=super(PositionalEncoding,self).get_config()
        config.update({"num_patches":self.num_patches,"embed_dim":self.embed_dim})
        return config


##### `TransformerEncoderBlock`
- combines multi-head self-attention,feed-forward neural networks,residual connections,and normalization layers to process input sequences

In [4]:
class TransformerEncoderBlock(Layer):
    """Post-norm Transformer encoder block: self-attention and a feed-forward network,
    each followed by dropout, a residual connection and layer normalisation.

    Args:
        embed_dim: width of the token embeddings (and output width of the feed-forward network).
        num_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward network.
        dropout_rate: dropout rate applied to the attention and feed-forward outputs.
        **kwargs: standard Keras ``Layer`` arguments, forwarded to the base class.
    """
    def __init__(self,embed_dim,num_heads,ff_dim,dropout_rate=0.1,**kwargs):
        super(TransformerEncoderBlock,self).__init__(**kwargs)
        # Stored so get_config() can report them.
        self.embed_dim=embed_dim
        self.num_heads=num_heads
        self.ff_dim=ff_dim
        self.dropout_rate=dropout_rate
        # NOTE(review): key_dim is the size of *each* head, so every head is embed_dim wide here
        # rather than embed_dim // num_heads. Kept as-is to leave the model unchanged; confirm intent.
        self.att=tf.keras.layers.MultiHeadAttention(num_heads=num_heads,key_dim=embed_dim)
        self.ffn=tf.keras.Sequential([Dense(ff_dim,activation='relu'),# (batch_size,seq_len,ff_dim)
                                        Dense(embed_dim)])  # (batch_size,seq_len,embed_dim)
        self.layernorm1=LayerNormalization(epsilon=1e-6) # (batch_size,seq_len,embed_dim)
        self.layernorm2=LayerNormalization(epsilon=1e-6) # (batch_size,seq_len,embed_dim)
        self.dropout1=Dropout(dropout_rate)
        self.dropout2=Dropout(dropout_rate)

    def call(self,inputs,training=None):
        """Apply the block to ``inputs`` of shape (batch_size,seq_len,embed_dim)."""
        # Self-attention: the sequence is both query and value.
        attn_output=self.att(inputs,inputs)  # (batch_size,seq_len,embed_dim)
        attn_output=self.dropout1(attn_output,training=training)
        out1=self.layernorm1(inputs+attn_output)  # (batch_size,seq_len,embed_dim)

        ffn_output=self.ffn(out1)  # (batch_size,seq_len,embed_dim)
        ffn_output=self.dropout2(ffn_output,training=training)

        return self.layernorm2(out1+ffn_output)  # (batch_size,seq_len,embed_dim)

    def get_config(self):
        """Return the constructor arguments so a saved model can re-create this layer.

        When loading, the class still has to be supplied (e.g. through ``custom_objects``).
        """
        config=super(TransformerEncoderBlock,self).get_config()
        config.update({"embed_dim":self.embed_dim,
                       "num_heads":self.num_heads,
                       "ff_dim":self.ff_dim,
                       "dropout_rate":self.dropout_rate})
        return config


In [5]:
def create_vit_model(input_shape,patch_size,embed_dim,num_heads,ff_dim,num_layers,num_classes):
    """Assemble a Vision-Transformer classifier as a Keras functional model.

    Pipeline: patch embedding -> positional encoding -> ``num_layers`` encoder blocks ->
    layer norm -> flatten -> Dense(ff_dim, relu) -> dropout -> softmax over ``num_classes``.

    Args:
        input_shape: image shape; the first two entries are used as height and width.
        patch_size: side length of the square patches.
        embed_dim: token embedding width.
        num_heads: attention heads per encoder block.
        ff_dim: hidden width of the encoder feed-forward networks and of the classifier head.
        num_layers: number of stacked encoder blocks.
        num_classes: number of output classes.

    Returns:
        An uncompiled ``tf.keras.Model``.
    """
    image_in=tf.keras.Input(shape=input_shape)

    # Tokenise the image, then tag every token with its position.
    tokens=PatchEmbedding(patch_size,embed_dim)(image_in)
    rows=input_shape[0] // patch_size
    cols=input_shape[1] // patch_size
    features=PositionalEncoding(rows*cols,embed_dim)(tokens)

    # Encoder stack.
    depth=0
    while depth<num_layers:
        features=TransformerEncoderBlock(embed_dim,num_heads,ff_dim)(features)
        depth+=1

    # Classification head on the flattened, normalised token sequence.
    features=LayerNormalization(epsilon=1e-6)(features)
    features=Flatten()(features)
    hidden=Dense(ff_dim,activation='relu')(features)
    hidden=Dropout(0.1)(hidden)
    class_probs=Dense(num_classes,activation='softmax')(hidden)

    return tf.keras.Model(inputs=image_in,outputs=class_probs)


Loading the $\texttt{cifar10}$ dataset and normalising the pixel values to the range $[0,1]$.<br>
The labels (`y`) are one-hot encoded.

In [6]:
# Load CIFAR-10 and scale pixel values from [0,255] to [0,1].
(x_train,y_train),(x_test,y_test)=cifar10.load_data()
# Cast to float32 before dividing: an int32 array divided by 255.0 becomes float64,
# which doubles the memory of both image arrays for no benefit.
x_train=x_train.astype("float32")/255.0
x_test=x_test.astype("float32")/255.0

# One-hot encode the 10 class labels, as required by categorical cross-entropy.
y_train=to_categorical(y_train,10)
y_test=to_categorical(y_test,10)


Creating the VIT model with the following Model parameters
- Input Shape: `(32, 32, 3)`
- Patch Size: `4`
- Embedding Dimension: `64`
- Number of Attention Heads: `4`
- Feed-Forward Network Dimension: `128`
- Number of Transformer Encoder Layers: `8`
- Number of Classes: `10`

In [7]:
# Build the ViT for 32x32 RGB inputs. With patch_size=4 the builder computes
# (32//4)*(32//4)=64 patches per image; each one is embedded into 64 dimensions
# and passed through 8 encoder blocks before the 10-way softmax head.
vit_model=create_vit_model(
    input_shape=(32,32,3),
    patch_size=4,
    embed_dim=64,
    num_heads=4,
    ff_dim=128,
    num_layers=8,
    num_classes=10
)




In [8]:
# Print the layer-by-layer summary of the ViT built above.
vit_model.summary()

Compiling the model with categorical cross-entropy as the loss function and the $\texttt{Adam}$ optimizer with learning rate $3\times10^{-4}$

In [None]:
# Categorical cross-entropy matches the one-hot labels; accuracy is tracked as the metric.
vit_model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
)

# Train for 100 epochs in mini-batches of 64, holding out 20% of the training data for validation.
history=vit_model.fit(
    x=x_train,
    y=y_train,
    epochs=100,
    batch_size=64,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/100
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 43ms/step - accuracy: 0.1885 - loss: 2.1956 - val_accuracy: 0.3933 - val_loss: 1.7012
Epoch 2/100
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 26ms/step - accuracy: 0.3994 - loss: 1.6703 - val_accuracy: 0.4722 - val_loss: 1.4432
Epoch 3/100
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 27ms/step - accuracy: 0.4677 - loss: 1.4815 - val_accuracy: 0.5050 - val_loss: 1.3623
Epoch 4/100
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 26ms/step - accuracy: 0.5169 - loss: 1.3435 - val_accuracy: 0.5270 - val_loss: 1.3179
Epoch 5/100
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 26ms/step - accuracy: 0.5535 - loss: 1.2453 - val_accuracy: 0.5601 - val_loss: 1.2388
Epoch 6/100
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 27ms/step - accuracy: 0.5884 - loss: 1.1437 - val_accuracy: 0.5701 - val_loss: 1.2105
Epoch 7/10

Now we implement the $\texttt{ByteFormer}$ on the cifar10 dataset.<br>
To do this, each RGB image of shape $(32\times32\times3)$ is flattened into a sequence of $3072$ values, where each element is the corresponding byte ($0$–$255$).

In [22]:
# vit_model.save("vit.keras")  ## saving the model

In [14]:

# Reload the raw (unnormalised) CIFAR-10 arrays; this overwrites x_train/x_test/y_train/y_test.
(x_train,y_train),(x_test,y_test)=cifar10.load_data()
# Flatten every 32x32x3 image into a 1D sequence of 3072 byte values -> (num_images,3072)
x_train_bytes=x_train.reshape(x_train.shape[0],-1).astype(np.uint8)
x_test_bytes=x_test.reshape(x_test.shape[0],-1).astype(np.uint8)

y_train=to_categorical(y_train,10)
y_test=to_categorical(y_test,10)

# First training image as 3072 uint8 values (taken before the float cast below).
first_image_bytes=x_train_bytes[0]

# Cast both splits the same way, so evaluation data is prepared exactly like training data.
# Byte values 0..255 are represented exactly in float32.
x_train_bytes=x_train_bytes.astype(np.float32)
x_test_bytes=x_test_bytes.astype(np.float32)
y_train=y_train.astype(np.float32)
y_test=y_test.astype(np.float32)

print(x_train_bytes.shape) # (50000,3072)
print(x_test_bytes.shape) # (10000,3072)

(50000, 3072)
(10000, 3072)


##### `PositionalEncoding` 
- introduces positional information into the input embeddings

In [None]:
class PositionalEncoding(Layer):
    """Add a fixed sinusoidal positional encoding to a sequence of embeddings.

    Running this cell rebinds the name ``PositionalEncoding``, replacing the definition
    given earlier in the notebook. The table is stored with a leading axis of size 1 so
    it broadcasts over the batch.

    Args:
        num_patches: sequence length the table is built for.
        embed_dim: width of each embedding vector.
        **kwargs: standard Keras ``Layer`` arguments, forwarded to the base class.
    """
    def __init__(self,num_patches,embed_dim,**kwargs):
        super(PositionalEncoding,self).__init__(**kwargs)
        self.num_patches=num_patches
        self.embed_dim=embed_dim
        self.pos_encoding=self.positional_encoding(num_patches,embed_dim)

    def positional_encoding(self,num_patches,embed_dim):
        """Build the (1,num_patches,embed_dim) sinusoidal table."""
        # indexes (0 to num_patches-1)
        positions=tf.range(num_patches,dtype=tf.float32)[:,tf.newaxis]  # (num_patches,1)
        div_term=tf.exp(tf.range(0,embed_dim,2,dtype=tf.float32)*-(tf.math.log(10000.0)/embed_dim))  # (ceil(embed_dim/2),)
        # positional encoding using sin and cos (sine columns first, then cosine columns)
        pos_encoding=tf.concat([tf.sin(positions*div_term),tf.cos(positions*div_term)],axis=1)
        # For odd embed_dim the concatenation is one column too wide; trim it so the
        # table always matches the embedding width.
        pos_encoding=pos_encoding[:,:embed_dim]  # (num_patches,embed_dim)
        pos_encoding=tf.expand_dims(pos_encoding,axis=0)  # (1,num_patches,embed_dim)
        return pos_encoding

    def call(self,x):
        # x - (batch_size,num_patches,embed_dim)
        # The leading axis of size 1 broadcasts over the batch, so no explicit tiling is needed.
        return x+self.pos_encoding

    def get_config(self):
        """Return the constructor arguments so a saved model can re-create this layer.

        When loading, the class still has to be supplied (e.g. through ``custom_objects``).
        """
        config=super(PositionalEncoding,self).get_config()
        config.update({"num_patches":self.num_patches,"embed_dim":self.embed_dim})
        return config


##### `TransformerEncoderBlock`
As mentioned in the paper, the Transformer blocks of ViT and ByteFormer are the same, so the `TransformerEncoderBlock` defined above is reused.

In [17]:
def create_vit_byte_model(input_shape,byte_vocab_size,byte_embed_dim,conv_filters,embed_dim,num_heads,ff_dim,num_layers,num_classes):
    """Assemble a ByteFormer-style classifier that operates on raw byte sequences.

    Pipeline: byte embedding -> stride-2 Conv1D (halves the sequence) -> positional encoding ->
    ``num_layers`` encoder blocks -> layer norm -> flatten -> Dense(ff_dim, relu) -> dropout ->
    softmax over ``num_classes``.

    Args:
        input_shape: shape of one sample; the first entry is the byte-sequence length.
        byte_vocab_size: number of distinct byte values (size of the embedding table).
        byte_embed_dim: width of the per-byte embedding.
        conv_filters: number of Conv1D output channels.
        embed_dim: token width used by the positional encoding and the encoder blocks.
        num_heads: attention heads per encoder block.
        ff_dim: hidden width of the encoder feed-forward networks and of the classifier head.
        num_layers: number of stacked encoder blocks.
        num_classes: number of output classes.

    Returns:
        An uncompiled Keras ``Model``.
    """
    conv_stride=2
    seq_len=input_shape[0]
    # With padding="same" the convolution emits ceil(seq_len/stride) positions
    # (1536 for the 3072-byte CIFAR-10 sequences).
    num_tokens=(seq_len+conv_stride-1)//conv_stride

    inputs=Input(shape=input_shape)
    x=Embedding(input_dim=byte_vocab_size,output_dim=byte_embed_dim)(inputs)  # (seq_len,byte_embed_dim)
    x=Conv1D(filters=conv_filters,kernel_size=3,strides=conv_stride,padding="same",activation="relu")(x)  # (num_tokens,conv_filters)
    if conv_filters!=embed_dim:
        # The encoder blocks need tokens of width embed_dim; project linearly when the
        # convolution produces a different width. Skipped when the widths already match.
        x=Dense(embed_dim)(x)  # (num_tokens,embed_dim)
    x=PositionalEncoding(num_patches=num_tokens,embed_dim=embed_dim)(x)
    for _ in range(num_layers):
        x=TransformerEncoderBlock(embed_dim,num_heads,ff_dim)(x)

    # Classification head on the flattened, normalised token sequence.
    x=LayerNormalization(epsilon=1e-6)(x)
    x=Flatten()(x)
    x=Dense(ff_dim,activation="relu")(x)
    x=Dropout(0.005)(x)
    outputs=Dense(num_classes,activation="softmax")(x)
    return Model(inputs=inputs,outputs=outputs)


##### Creating the ByteFormer with the following parameters:

- Input Shape: `(3072,)`  
- Byte Vocabulary Size: `256`  
- Byte Embedding Dimension: `128`  
- Number of Convolutional Filters: `32`  
- Embedding Dimension: `32`  
- Number of Attention Heads: `4`  
- Feed-Forward Network Dimension: `256`  
- Number of Transformer Encoder Layers: `4`  
- Number of Classes: `10`  


In [20]:
# Build the ByteFormer for flat 3072-byte inputs. Every byte (256 possible values) gets a
# 128-d embedding; the builder's stride-2 Conv1D then halves the sequence to 1536 positions.
# conv_filters equals embed_dim here (both 32), so the convolution output already has the
# width the 4 encoder blocks operate on.
model=create_vit_byte_model(
    input_shape=(3072,),
    byte_vocab_size=256,
    byte_embed_dim=128,
    conv_filters=32,
    embed_dim=32,
    num_heads=4,
    ff_dim=256,
    num_layers=4,
    num_classes=10
)

Compiling the model with Adam optimizer and crossentropy as loss function

In [21]:
# Categorical cross-entropy matches the one-hot labels; accuracy is tracked as the metric.
# NOTE(review): the string 'adam' selects Adam with its default settings, not the 3e-4 learning
# rate used for the ViT above. The recorded run stays at loss ~2.3027 (about ln 10, i.e. chance
# level for 10 classes) - a lower learning rate may be worth trying; TODO confirm.
model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer='adam',
)
model.summary()

Fitting the model with batch size $16$ for $100$ epochs, holding out $20\%$ of the training data for validation

In [None]:
# Train for 100 epochs in mini-batches of 16, holding out 20% of the training data for validation.
history=model.fit(
    x=x_train_bytes,
    y=y_train,
    epochs=100,
    batch_size=16,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/100
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m636s[0m 244ms/step - accuracy: 0.0982 - loss: 4.0015 - val_accuracy: 0.0997 - val_loss: 2.3027
Epoch 2/100
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m667s[0m 245ms/step - accuracy: 0.1032 - loss: 2.3028 - val_accuracy: 0.0952 - val_loss: 2.3028
Epoch 3/100
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m585s[0m 234ms/step - accuracy: 0.0992 - loss: 2.3028 - val_accuracy: 0.1003 - val_loss: 2.3027
Epoch 4/100
[1m1080/2500[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m5:01[0m 212ms/step - accuracy: 0.0963 - loss: 2.3027

In [None]:
# model.save("Byteformer.keras")  ## saving the model