In [1]:
from numba import cuda


# Print the key hardware limits of the CUDA device that Numba is currently using.
device = cuda.get_current_device()

# The device name is not guaranteed to be ``bytes``: it can already be a
# ``str`` (e.g. when Numba runs on top of the NVIDIA CUDA Python bindings),
# in which case calling ``.decode`` would raise AttributeError.
raw_name = device.name
gpu_name = raw_name.decode('utf-8') if isinstance(raw_name, bytes) else raw_name

print(f"GPU Name: {gpu_name}")
print(f"Compute Capability: {device.compute_capability}")

# get_memory_info() reports (free, total) for the current context, in bytes;
# index 1 is therefore the total device memory.
mem_info = cuda.current_context().get_memory_info()
total_memory = mem_info[1]
print(f"Total Memory: {total_memory / (1024**3):.2f} GB")

print(f"Multiprocessor Count: {device.MULTIPROCESSOR_COUNT}")
print(f"Max Threads per Block: {device.MAX_THREADS_PER_BLOCK}")
print(f"Max Block Dimensions: {device.MAX_BLOCK_DIM_X}, {device.MAX_BLOCK_DIM_Y}, {device.MAX_BLOCK_DIM_Z}")
print(f"Max Grid Dimensions: {device.MAX_GRID_DIM_X}, {device.MAX_GRID_DIM_Y}, {device.MAX_GRID_DIM_Z}")
print(f"Warp Size: {device.WARP_SIZE}")


GPU Name: NVIDIA GeForce RTX 4060 Laptop GPU
Compute Capability: (8, 9)
Total Memory: 8.00 GB
Multiprocessor Count: 24
Max Threads per Block: 1024
Max Block Dimensions: 1024, 1024, 64
Max Grid Dimensions: 2147483647, 65535, 65535
Warp Size: 32


# **Parallel**

## Convolution
### Idea
- The idea is to distribute the convolution operator of each output pixel across multiple threads.
### Design
- **Block Configuration (2D block)**: Each block has a size of 16x16 threads, covering a 16x16 region of the output image.
- **Grid Configuration (3D grid)**:  The grid spans the entire output image dimensions (X and Y) and handles batch and output channels along the Z dimension. 
- **Thread Operations**: Each thread computes the value of one output pixel. It loops over all input channels, performing the convolution operation by multiplying input pixel values by corresponding kernel weights and accumulating the result. Then, the computed value is written to the corresponding position in the output feature map.
### Implement

<style>
  .pseudo-code {
    background-color: #e0e0e0;
    color: #333;
    border: 1px solid #ddd;
    padding: 10px;
    border-radius: 5px;
    font-family: 'Courier New', Courier, monospace;
    font-size: 18px;
  }
  .pseudo-code ul {
    margin: 0;
    padding-left: 20px;
  }
  .pseudo-code li {
    margin-bottom: 5px;
  }
  .pseudo-code strong {
    color: #007BFF;
  }
</style>

<div class="pseudo-code">
  <strong>directConv2D_kernel</strong>:
  <ul>
    <li><strong>Inputs</strong>:
      <ul>
        <li>img, output, weight, bias, in_channel, out_channel, batch_size</li>
      </ul>
    </li>
    <li>Calculate out_row, out_col, z_idx using a 3D CUDA grid.</li>
    <li>Determine batch_idx by integer division of z_idx by out_channel.</li>
    <li>Determine out_channel_idx by computing the remainder of z_idx divided by out_channel.</li>
    <li><strong>If</strong> out_row and out_col are within the output image bounds:
      <ul>
        <li>Initialize outPixel with the bias for the current output channel.</li>
        <li><strong>For each</strong> input channel index:
          <ul>
            <li><strong>For each</strong> kernel row:
              <ul>
                <li><strong>For each</strong> kernel column:
                  <ul>
                    <li>Multiply the corresponding weight and image pixel, add to outPixel.</li>
                  </ul>
                </li>
              </ul>
            </li>
          </ul>
        </li>
        <li>Store the computed outPixel in the output image.</li>
      </ul>
    </li>
  </ul>
</div>


### Evaluate

In [2]:
%reset -f
from pandas import DataFrame
from time import time
import numpy as np
import torch
from model_numba.layers_cpu.layer import ConvolutionalLayer
from model_numba.Layers.Layers import DirectConv2DGPU
import pandas as pd  


# Result table: one row per benchmarked layer configuration.
columns = ["Layer (channel in * channel_out)", "Sequential (s)", "Numba Runtime (s)", "Different (mean)"]
df = DataFrame(columns=columns)


def compare(in_channel, out_channel, img_size):
    """Benchmark the sequential CPU convolution against the Numba GPU one.

    Random float32 inputs are generated for a batch of 4 square images and a
    3x3 kernel, both implementations are timed with wall-clock time, and one
    row is appended to the module-level ``df``.

    Args:
        in_channel: number of input channels.
        out_channel: number of output channels.
        img_size: height and width of the (square) input image.

    Returns:
        None. The result is recorded as a side effect in the global ``df``.
    """
    global df

    img = torch.rand(size=(4, in_channel, img_size, img_size), dtype=torch.float32)
    weight = torch.rand(size=(out_channel, in_channel, 3, 3), dtype=torch.float32)
    bias = torch.rand(size=(out_channel,), dtype=torch.float32)

    # Sequential CPU reference.
    # NOTE(review): the positional 3, 1, 1 are presumably kernel size, stride
    # and padding -- confirm against ConvolutionalLayer.
    cpu_conv = ConvolutionalLayer(in_channel, out_channel, 3, 1, 1, weight.numpy(), bias.numpy())
    start = time()
    cpu_output = cpu_conv.forward(img.numpy())
    end = time()
    cpu_runtime = end - start

    # Numba GPU implementation (timed end to end from host arrays).
    # NOTE(review): meaning of the trailing ``True`` flag is not visible here.
    start = time()
    numba_output = DirectConv2DGPU(img.numpy(), weight.numpy(), bias.numpy(), True)
    end = time()
    gpu_runtime = end - start

    ret = {
        "Layer (channel in * channel_out)": (in_channel, out_channel),
        "Sequential (s)": cpu_runtime,
        "Numba Runtime (s)": gpu_runtime,
        "Different (mean)": np.mean(np.abs(numba_output - cpu_output))
    }

    new_row_df = DataFrame([ret], columns=df.columns)
    # Concatenating onto the initial *empty* frame is deprecated in recent
    # pandas (FutureWarning, and the result dtypes degrade to object), so the
    # first row simply becomes the table.
    df = new_row_df if df.empty else pd.concat([df, new_row_df], ignore_index=True)


compare(3, 64, 512)
compare(64, 128, 256)
compare(128, 256, 128)
compare(256, 512, 64)
compare(512, 1024, 32)

# Display the DataFrame
df


Unnamed: 0,Layer (channel in * channel_out),Sequential (s),Numba Runtime (s),Different (mean)
0,"(3, 64)",231.343723,1.070358,4.478326e-07
1,"(64, 128)",149.354147,0.801589,4.130972e-05
2,"(128, 256)",89.949023,0.788387,0.0001154061
3,"(256, 512)",59.084347,0.784225,0.0003211508
4,"(512, 1024)",44.7523,0.776366,0.0008802213


## Batch Normalization 2D


### Idea
- Each pixel is normalized independently. Therefore, assigning one pixel to each thread speeds up the computation.
### Design
- **Block Configuration (1D block)**: Each block has a size of 256 threads.
- **Grid Configuration (3D grid)**:  The X axis handles whole dimensions of output image and channel, Z handles batch.
- **Thread Operations**: Each thread normalizes the value of one input pixel and writes it to the corresponding output pixel.
### Implement
<style>
  .pseudo-code {
    background-color: #e0e0e0; /* Light gray background */
    color: #333; /* Dark gray text */
    border: 1px solid #bbb; /* Gray border */
    padding: 10px;
    border-radius: 5px;
    font-family: 'Courier New', Courier, monospace;
    font-size: 18px;
  }
  .pseudo-code ul {
    margin: 0;
    padding-left: 20px;
  }
  .pseudo-code li {
    margin-bottom: 5px;
  }
  .pseudo-code strong {
    color: #007BFF; /* Blue color for strong text */
  }
</style>

<div class="pseudo-code">
  <strong>batchNorm2D_kernel</strong>:
  <ul>
    <li><strong>Inputs</strong>:
      <ul>
        <li>img, out_img, batchNorm_weight, batchNorm_bias, mean, variance, epsilon</li>
      </ul>
    </li>
    <li>Calculate out_x, out_y using a 2D CUDA grid.</li>
    <li>Determine in channel index.</li>
    <li>Determine batch index.</li>
    <li><strong>Check for valid pixel index</strong> 
      <ul>
        <li>Calculate normalized value of input pixel.</li>
        <li>Store the computed value in corresponding output pixel.</li>
      </ul>
    </li>
  </ul>
</div>

### Evaluate

In [3]:
%reset -f
from pandas import DataFrame
from time import time
import numpy as np
import torch
from model_numba.layers_cpu.layer import BatchNorm2D_CPU
from model_numba.Layers.Layers import batchNorm2D
import pandas as pd  


# Result table: one row per benchmarked input size.
columns = ["Input Size (C, H, W)", "Sequential (s)", "Numba Runtime (s)", "Different (mean)"]
df = DataFrame(columns=columns)


def compare(in_channel, img_size):
    """Benchmark the sequential CPU batch norm against the Numba GPU one.

    Random float32 data is generated for a batch of 4 square images together
    with per-channel weight, bias, mean and variance vectors. Both
    implementations are timed with wall-clock time and one row is appended to
    the module-level ``df``.

    Args:
        in_channel: number of channels of the input (and of every
            per-channel parameter vector).
        img_size: height and width of the (square) input image.

    Returns:
        None. The result is recorded as a side effect in the global ``df``.
    """
    global df

    img = torch.rand(size=(4, in_channel, img_size, img_size), dtype=torch.float32)
    weight = torch.rand(size=(in_channel,), dtype=torch.float32)
    bias = torch.rand(size=(in_channel,), dtype=torch.float32)
    mean = torch.rand(size=(in_channel,), dtype=torch.float32)
    var = torch.rand(size=(in_channel,), dtype=torch.float32)

    # Sequential CPU reference.
    cpu_batchnorm = BatchNorm2D_CPU(in_channel, weight.numpy(), bias.numpy())
    start = time()
    cpu_output = cpu_batchnorm.forward(img.numpy(), mean.numpy(), var.numpy())
    end = time()
    cpu_runtime = end - start

    # Numba GPU implementation (timed end to end from host arrays).
    start = time()
    numba_output = batchNorm2D(img.numpy(), weight.numpy(), bias.numpy(), mean.numpy(), var.numpy())
    end = time()
    gpu_runtime = end - start

    ret = {
        "Input Size (C, H, W)": (in_channel, img_size, img_size),
        "Sequential (s)": cpu_runtime,
        "Numba Runtime (s)": gpu_runtime,
        "Different (mean)": np.mean(np.abs(numba_output - cpu_output))
    }

    new_row_df = DataFrame([ret], columns=df.columns)
    # Concatenating onto the initial *empty* frame is deprecated in recent
    # pandas (FutureWarning, and the result dtypes degrade to object), so the
    # first row simply becomes the table.
    df = new_row_df if df.empty else pd.concat([df, new_row_df], ignore_index=True)


compare(64, 512)
compare(128, 256)
compare(256, 128)
compare(512, 64)


# Display the DataFrame
df

Unnamed: 0,"Input Size (C, H, W)",Sequential (s),Numba Runtime (s),Different (mean)
0,"(64, 512, 512)",123.179057,0.43073,5.728351e-09
1,"(128, 256, 256)",60.489841,0.286713,8.855715e-09
2,"(256, 128, 128)",30.162915,0.178511,7.653616e-09
3,"(512, 64, 64)",15.1438,0.0874,6.969669e-09


## ReLu Activation
### Idea
- ReLU activation applies a single element-wise operator, $\mathrm{ReLU}(x)=\max(0,x)$, to each input pixel. So, we can compute this operator across multiple threads to improve performance.
### Design
- **Block Configuration (1D block)**: Each block has a size of 256 threads.
- **Grid Configuration (3D grid)**:  The X axis handles whole dimensions of output image and channels, Z handles batch.
- **Thread Operations**: Each thread applies the ReLU function to one input pixel and writes the result to the corresponding pixel.
### Implement
<style>
  .pseudo-code {
    background-color: #e0e0e0; /* Light gray background */
    color: #333; /* Dark gray text */
    border: 1px solid #bbb; /* Gray border */
    padding: 10px;
    border-radius: 5px;
    font-family: 'Courier New', Courier, monospace;
    font-size: 16px; /* Larger text size */
  }
  .pseudo-code ul {
    margin: 0;
    padding-left: 20px;
  }
  .pseudo-code li {
    margin-bottom: 5px;
  }
  .pseudo-code strong {
    color: #007BFF; /* Blue color for strong text */
    font-size: 18px; /* Larger size for strong text */
  }
</style>

<div class="pseudo-code">
  <strong>relu_kernel</strong>:
  <ul>
    <li><strong>Inputs</strong>:
      <ul>
        <li>data, batch size, number of channels, img width, img height</li>
      </ul>
    </li>
    <li>Calculate <strong>idx</strong> using a 1D CUDA grid.</li>
    <li>Determine the total number of elements as <strong>batch_size * channels * img_width * img_height</strong>.</li>
    <li><strong>If</strong> <em>idx</em> is within the range of total elements:
      <ul>
        <li>Calculate <strong>b</strong> as the integer division of <em>idx</em> by <em>(channels * img_width * img_height)</em>.</li>
        <li>Calculate <strong>c</strong> as the modulo of <em>idx</em> by <em>(channels * img_width * img_height)</em>, then integer division by <em>(img_width * img_height)</em>.</li>
        <li>Calculate <strong>w</strong> as the modulo of <em>idx</em> by <em>(img_width * img_height)</em>, then integer division by <em>img_height</em>.</li>
        <li>Calculate <strong>h</strong> as <em>idx</em> modulo <em>img_height</em>.</li>
        <li>Update <strong>data[b, c, w, h]</strong> to be the maximum of its current value and 0.</li>
      </ul>
    </li>
  </ul>
</div>

### Evaluate

In [4]:
%reset -f
from pandas import DataFrame
from time import time
import numpy as np
import torch
from model_numba.layers_cpu.layer import ReLU_CPU
from model_numba.Layers.Layers import RELU_GPU
import pandas as pd  


# Result table: one row per benchmarked input size.
columns = ["Input Size (C, H, W)", "Sequential (s)", "Numba Runtime (s)", "Different (mean)"]
df = DataFrame(columns=columns)


def compare(in_channel, img_size):
    """Benchmark the sequential CPU ReLU against the Numba GPU one.

    A random float32 batch of 4 square images is generated, both
    implementations are timed with wall-clock time, and one row is appended
    to the module-level ``df``.

    Args:
        in_channel: number of channels of the input.
        img_size: height and width of the (square) input image.

    Returns:
        None. The result is recorded as a side effect in the global ``df``.
    """
    global df

    # torch.rand alone only yields values in [0, 1), for which ReLU is the
    # identity, so the correctness column would be 0.0 even for a broken
    # kernel. Shift to [-1, 1) so that the clamping branch is exercised.
    img = torch.rand(size=(4, in_channel, img_size, img_size), dtype=torch.float32) * 2 - 1

    # ``img.numpy()`` shares memory with ``img``. Give each implementation its
    # own copy (made outside the timed regions) so that an in-place
    # implementation cannot feed already-rectified data to the other one.
    cpu_input = img.numpy().copy()
    gpu_input = img.numpy().copy()

    # Sequential CPU reference.
    cpu_relu = ReLU_CPU()
    start = time()
    cpu_output = cpu_relu.forward(cpu_input)
    end = time()
    cpu_runtime = end - start

    # Numba GPU implementation (timed end to end from a host array).
    start = time()
    numba_output = RELU_GPU(gpu_input)
    end = time()
    gpu_runtime = end - start

    ret = {
        "Input Size (C, H, W)": (in_channel, img_size, img_size),
        "Sequential (s)": cpu_runtime,
        "Numba Runtime (s)": gpu_runtime,
        "Different (mean)": np.mean(np.abs(numba_output - cpu_output))
    }

    new_row_df = DataFrame([ret], columns=df.columns)
    # Concatenating onto the initial *empty* frame is deprecated in recent
    # pandas (FutureWarning, and the result dtypes degrade to object), so the
    # first row simply becomes the table.
    df = new_row_df if df.empty else pd.concat([df, new_row_df], ignore_index=True)


compare(64, 512)
compare(128, 256)
compare(256, 128)
compare(512, 64)


# Display the DataFrame
df

Unnamed: 0,"Input Size (C, H, W)",Sequential (s),Numba Runtime (s),Different (mean)
0,"(64, 512, 512)",81.952738,0.293049,0.0
1,"(128, 256, 256)",41.761168,0.210163,0.0
2,"(256, 128, 128)",20.765348,0.106927,0.0
3,"(512, 64, 64)",10.411028,0.055011,0.0


## Max Pooling 2D
### Idea
- Each output pixel is computed independently. Therefore, we can compute efficiently by parallelizing this layer on GPU.
### Design
- **Block Configuration (1D block)**: Each block has a size of 256 threads.
- **Grid Configuration (3D grid)**:  The X axis handles whole dimensions of output image and channels, Z handles batch.
- **Thread Operations**: Each thread computes the maximum of four input pixels (a 2x2 window) and writes it to the corresponding output pixel.
### Implement
<style>
  .pseudo-code {
    background-color: #e0e0e0; /* Light gray background */
    color: #333; /* Dark gray text */
    border: 1px solid #bbb; /* Gray border */
    padding: 10px;
    border-radius: 5px;
    font-family: 'Courier New', Courier, monospace;
    font-size: 18px; /* Larger text size */
  }
  .pseudo-code ul {
    margin: 0;
    padding-left: 20px;
  }
  .pseudo-code li {
    margin-bottom: 5px;
  }
  .pseudo-code strong {
    color: #007BFF; /* Blue color for strong text */
    font-size: 18px; /* Larger size for strong text */
  }
</style>

<div class="pseudo-code">
  <strong>MaxPooling_Kernel</strong>:
  <ul>
    <li><strong>Inputs:</strong>
      <ul>
        <li>input, output, number of channels, output height, output width</li>
      </ul>
    </li>
    <li>Calculate <strong>index</strong> using a 1D CUDA grid.</li>
    <li>Determine the total number of output elements using <strong>output.size</strong>.</li>
    <li><strong>If</strong> <em>index</em> is within the range of total output elements:
      <ul>
        <li>Calculate <strong>batch index</strong>, <strong>channel index</strong>, <strong>input pixel index</strong> and <strong>output pixel index</strong>.</li>
        <li>Initialize <strong>max_val</strong> with the value at <em>input[batch index, channel index, input_row, input_col]</em>.</li>
        <li>Update <strong>max_val</strong> by comparing it with adjacent values:
          <ul>
            <li>Check <em>input[batch index, channel index, input_row, input_col + 1]</em>.</li>
            <li>Check <em>input[batch index, channel index, input_row + 1, input_col]</em>.</li>
            <li>Check <em>input[batch index, channel index, input_row + 1, input_col + 1]</em>.</li>
          </ul>
        </li>
      </ul>
    </li>
  </ul>
</div>

### Evaluate

In [5]:
%reset -f
from pandas import DataFrame
from time import time
import numpy as np
import torch
from model_numba.layers_cpu.layer import MaxPooling2D_CPU
from model_numba.Layers.Layers import MaxPooling2D_GPU
import pandas as pd  


# Result table: one row per benchmarked input size.
columns = ["Input Size (C, H, W)", "Sequential (s)", "Numba Runtime (s)", "Different (mean)"]
df = DataFrame(columns=columns)


def compare(in_channel, img_size):
    """Benchmark the sequential CPU max pooling against the Numba GPU one.

    A random float32 batch of 4 square images is generated, both
    implementations are timed with wall-clock time, and one row is appended
    to the module-level ``df``.

    Args:
        in_channel: number of channels of the input.
        img_size: height and width of the (square) input image.

    Returns:
        None. The result is recorded as a side effect in the global ``df``.
    """
    global df

    img = torch.rand(size=(4, in_channel, img_size, img_size), dtype=torch.float32)

    # Sequential CPU reference.
    cpu_maxpool = MaxPooling2D_CPU()
    start = time()
    cpu_output = cpu_maxpool.forward(img.numpy())
    end = time()
    cpu_runtime = end - start

    # Numba GPU implementation (timed end to end from a host array).
    # NOTE(review): meaning of the trailing ``True`` flag is not visible here.
    start = time()
    numba_output = MaxPooling2D_GPU(img.numpy(), True)
    end = time()
    gpu_runtime = end - start

    ret = {
        "Input Size (C, H, W)": (in_channel, img_size, img_size),
        "Sequential (s)": cpu_runtime,
        "Numba Runtime (s)": gpu_runtime,
        "Different (mean)": np.mean(np.abs(numba_output - cpu_output))
    }

    new_row_df = DataFrame([ret], columns=df.columns)
    # Concatenating onto the initial *empty* frame is deprecated in recent
    # pandas (FutureWarning, and the result dtypes degrade to object), so the
    # first row simply becomes the table.
    df = new_row_df if df.empty else pd.concat([df, new_row_df], ignore_index=True)


compare(64, 512)
compare(128, 256)
compare(256, 128)
compare(512, 64)


# Display the DataFrame
df

Unnamed: 0,"Input Size (C, H, W)",Sequential (s),Numba Runtime (s),Different (mean)
0,"(64, 512, 512)",33.361417,0.334031,0.0
1,"(128, 256, 256)",16.205704,0.143854,0.0
2,"(256, 128, 128)",8.189884,0.075449,0.0
3,"(512, 64, 64)",4.148149,0.046569,0.0


# **Optimized**

# Version 1: Optimized Convolution layers using Shared Memory

### Idea
- The goal of this version is to optimize the convolution operation by utilizing the shared memory of the GPU, which is much faster than global memory. By dividing the image into smaller tiles and loading them into shared memory, we can reduce the number of redundant global memory accesses, resulting in a significant performance boost.
#### Design
- **2D Blocks**: Each block processes a tile of the input image. The block size is 16x16.
- **3D Grid**: The X and Y dimensions handle the spatial dimensions of the image (width and height), and the Z dimension handles the batch and output channels, allowing for parallel processing across different images and feature maps.  
![alt text](tile.png)  

In the image above, we utilize two tiles: the input tile and the output tile. The input tile serves to copy each input pixel to a shared array, while the output tile is responsible for computing the convolution of each output pixel within its bounds.  To prevent data loss at tile boundaries, we employ a specific formula for calculating thread indices:  

Column index for output:  
-   $col_{out} = \mathrm{blockIdx.x} \times \mathrm{OUTPUT\_TILE\_SIZE} + \mathrm{threadIdx.x}$  
    

Row index for output:  
-   $row_{out} = \mathrm{blockIdx.y} \times \mathrm{OUTPUT\_TILE\_SIZE} + \mathrm{threadIdx.y}$  

This approach facilitates the overlap between neighboring input tiles.
#### Evaluate

In [6]:
%reset -f
from pandas import DataFrame
from time import time
import numpy as np
import torch
from model_numba.Layers.Layers import DirectConv2DGPU, Convolution2D_GPU
import pandas as pd  


# Result table: one row per benchmarked layer configuration.
columns = ["Layer (channel in * channel_out)","Direct Conv (s)", "TileShared Conv (s)", "Different (mean)"]
df = DataFrame(columns=columns)


def compare(in_channel, out_channel, img_size):
    """Benchmark the direct GPU convolution against the tiled GPU one.

    Random float32 inputs are generated for a batch of 4 square images and a
    3x3 kernel. ``DirectConv2DGPU`` and ``Convolution2D_GPU`` are each timed
    with wall-clock time on the same inputs, and one row is appended to the
    module-level ``df``.

    Args:
        in_channel: number of input channels.
        out_channel: number of output channels.
        img_size: height and width of the (square) input image.

    Returns:
        None. The result is recorded as a side effect in the global ``df``.
    """
    global df

    img = torch.rand(size=(4, in_channel, img_size, img_size), dtype=torch.float32)
    weight = torch.rand(size=(out_channel, in_channel, 3, 3), dtype=torch.float32)
    bias = torch.rand(size=(out_channel,), dtype=torch.float32)

    # Baseline: direct GPU convolution.
    # NOTE(review): meaning of the trailing ``True`` flag is not visible here.
    start = time()
    numba_output_1 = DirectConv2DGPU(img.numpy(), weight.numpy(), bias.numpy(), True)
    end = time()
    direct = end - start

    # Optimized: GPU convolution from the "TileShared" column.
    start = time()
    numba_output_2 = Convolution2D_GPU(img.numpy(), weight.numpy(), bias.numpy(), True)
    end = time()
    tile = end - start

    ret = {
        "Layer (channel in * channel_out)": (in_channel, out_channel),
        "Direct Conv (s)": direct,
        "TileShared Conv (s)": tile,
        "Different (mean)": np.mean(np.abs(numba_output_1 - numba_output_2))
    }

    new_row_df = DataFrame([ret], columns=df.columns)
    # Concatenating onto the initial *empty* frame is deprecated in recent
    # pandas (FutureWarning, and the result dtypes degrade to object), so the
    # first row simply becomes the table.
    df = new_row_df if df.empty else pd.concat([df, new_row_df], ignore_index=True)


compare(3, 64, 512)
compare(64, 128, 256)
compare(128, 256, 128)
compare(256, 512, 64)
compare(512, 1024, 32)

# Display the DataFrame
df

Unnamed: 0,Layer (channel in * channel_out),Direct Conv (s),TileShared Conv (s),Different (mean)
0,"(3, 64)",0.708337,0.452569,0.0
1,"(64, 128)",0.525432,0.30858,0.0
2,"(128, 256)",0.492068,0.292706,0.0
3,"(256, 512)",0.481996,0.283913,0.0
4,"(512, 1024)",0.485973,0.323966,0.0


# Version 2: Combine Convolution, BatchNorm2D and ReLU into a single kernel

#### Non-Combine convolution
![](original_conv.png)  
#### Combine convolution
![alt text](optimized_conv.png)

- In the non-combined convolution implementation, there are six data transfers between the host and device. In the combined version, the tile-shared convolution kernel performs additional steps—normalization and ReLU activation—after the convolution operation. This approach reduces the number of data transfers to just two.

In [7]:
%reset -f
from pandas import DataFrame
from time import time
import numpy as np
import torch
from model_numba.Layers.Layers import DirectConv2DGPU, Convolution2D_GPU,Combie_TileConv_GPU, RELU_GPU, batchNorm2D
import pandas as pd  


# Result table: one row per benchmarked layer configuration.
columns = ["Layer (channel in * channel_out)","Non-Combie Conv (s)", "Combie Conv (s)", "Different (mean)"]
df = DataFrame(columns=columns)


def compare(in_channel, out_channel, img_size):
    """Benchmark three separate GPU calls against the single fused GPU call.

    The separate pipeline runs ``Convolution2D_GPU`` -> ``batchNorm2D`` ->
    ``RELU_GPU``; the fused pipeline runs ``Combie_TileConv_GPU`` once with
    the same parameters. Both are timed with wall-clock time and one row is
    appended to the module-level ``df``.

    Args:
        in_channel: number of input channels of the convolution.
        out_channel: number of output channels of the convolution (and
            therefore of the batch-norm parameters).
        img_size: height and width of the (square) input image.

    Returns:
        None. The result is recorded as a side effect in the global ``df``.
    """
    global df

    img = torch.rand(size=(4, in_channel, img_size, img_size), dtype=torch.float32)
    weight = torch.rand(size=(out_channel, in_channel, 3, 3), dtype=torch.float32)
    bias = torch.rand(size=(out_channel,), dtype=torch.float32)
    # Batch norm is applied to the convolution *output*, which has
    # ``out_channel`` channels, so its per-channel parameters must have that
    # length (sizing them by ``in_channel`` leaves most channels without
    # parameters).
    batch_weight = torch.rand(size=(out_channel,), dtype=torch.float32)
    batch_bias = torch.rand(size=(out_channel,), dtype=torch.float32)
    mean = torch.rand(size=(out_channel,), dtype=torch.float32)
    var = torch.rand(size=(out_channel,), dtype=torch.float32)

    # Baseline: three separate GPU calls, each taking and returning host data.
    start = time()
    numba_output_1 = Convolution2D_GPU(img.numpy(), weight.numpy(), bias.numpy(), True)
    numba_output_1 = batchNorm2D(numba_output_1, batch_weight.numpy(), batch_bias.numpy(), mean.numpy(), var.numpy())
    numba_output_1 = RELU_GPU(numba_output_1)
    end = time()
    direct = end - start

    # Optimized: a single fused call.
    start = time()
    numba_output_2 = Combie_TileConv_GPU(img.numpy(), weight.numpy(), bias.numpy(), batch_weight.numpy(), batch_bias.numpy(), mean.numpy(), var.numpy(), True)
    end = time()
    tile = end - start

    ret = {
        "Layer (channel in * channel_out)": (in_channel, out_channel),
        "Non-Combie Conv (s)": direct,
        "Combie Conv (s)": tile,
        "Different (mean)": np.mean(np.abs(numba_output_1 - numba_output_2))
    }

    new_row_df = DataFrame([ret], columns=df.columns)
    # Concatenating onto the initial *empty* frame is deprecated in recent
    # pandas (FutureWarning, and the result dtypes degrade to object), so the
    # first row simply becomes the table.
    df = new_row_df if df.empty else pd.concat([df, new_row_df], ignore_index=True)


compare(3, 64, 512)
compare(64, 128, 256)
compare(128, 256, 128)
compare(256, 512, 64)
compare(512, 1024, 32)

# Display the DataFrame
df

Unnamed: 0,Layer (channel in * channel_out),Non-Combie Conv (s),Combie Conv (s),Different (mean)
0,"(3, 64)",0.527608,0.279877,0.0
1,"(64, 128)",0.456157,0.106524,0.0
2,"(128, 256)",0.358132,0.089497,0.0
3,"(256, 512)",0.31899,0.082366,0.0
4,"(512, 1024)",0.320638,0.105859,0.0


# Version 3: Reduce data transfer between layers

In this version, the goal is to minimize data transfers as much as possible. We achieve this by reducing data transfers at various layers: double convolution, maxpooling-double convolution, and transpose convolution-double convolution. Data is only transferred at the first layer, the last layer, and occasionally at skip connection steps if CPU code is used for concatenation. The optimizations are illustrated below:

- **Maxpooling-Double Convolution**  
  ![alt text](optimize_maxpool.png)  
  This optimization reduces data transfers from 4 to 0, which is the same for transpose convolution-double convolution.

- **Double Convolution**  
  *Original*  
  ![alt text](original_combie_conv.png)  
  *Optimized*  
  ![alt text](optimize_combie_conv.png)  
  This optimization reduces data transfers from 4 to 0 (or 1 if this double convolution is the first layer).

### Table: Number of Data Transfers Per Block

|        | **Original** | **Optimized** |
|--------|--------------|---------------|
| **down_1** | 12 transfers  | 1 transfer    |
| **down_2** | 16 transfers  | 0 transfers   |
| **down_3** | 16 transfers  | 0 transfers   |
| **down_4** | 16 transfers  | 0 transfers   |
| **down_5** | 16 transfers  | 0 transfers   |
| **up_1**   | 16 transfers  | 1 transfer    |
| **up_2**   | 16 transfers  | 1 transfer    |
| **up_3**   | 16 transfers  | 1 transfer    |
| **up_4**   | 16 transfers  | 1 transfer    |
| **out**    | 2 transfers   | 1 transfer    |
| **Total**  | 142 transfers | 6 transfers   |


### Evaluate
-   We run the entire model using version 3 and compare it with PyTorch running on the CPU.

In [9]:

%reset -f
from model_numba.unet_cuda import Unet_Cuda
import torch
from PIL import Image
from torchvision import transforms

import numpy as np
from time import time

from model.unet import U_net

# Preprocessing: resize to 512x512, convert to a float tensor and add a
# leading batch dimension.
transform = transforms.Compose(
    [
        transforms.Resize((512, 512)),
        transforms.ToTensor()
    ]
)
image = Image.open('test.jpg')
image = transform(image).unsqueeze(0)

# Reference PyTorch model, evaluated on the CPU.
model = U_net(1).eval()
# NOTE: torch.load unpickles the file, so only load checkpoints from a
# trusted source. map_location keeps the tensors on the CPU even when the
# checkpoint was saved from a GPU.
weight = torch.load('weights/weights.pth', map_location='cpu')
model.load_state_dict(weight)

# Numba/CUDA implementation under test.
numba_model = Unet_Cuda(1).eval()

# Inference only: disable autograd so the reference timing does not include
# building a computation graph.
start = time()
with torch.no_grad():
    out_pytorch = model(image)
end = time()
print(f'Pytorch CPU Runtime: {end-start}s')

# NOTE(review): this is the first call of the Numba model in the cell, so the
# measurement may include one-off start-up cost -- consider a warm-up run.
start = time()
out_numba = numba_model(image.numpy())
end = time()

print(f'Numba Runtime: {end-start}s')

print(f'Mean Difference: {np.mean(np.abs(out_pytorch.cpu().detach().numpy()- out_numba))}')


Pytorch CPU Runtime: 0.7206525802612305s
Numba Runtime: 0.979292631149292s
Mean Difference: 0.000504146097227931
