# U-Net Parallelizing weekly report

In [27]:
import numpy as np
import pickle

from typing import Union
def save_pkl(data: Union[np.ndarray, any], filename: str) -> None:
    with open(filename, 'wb') as f:
        pickle.dump(data, f)

def load_pkl(filename: str) -> any:
    with open(filename, 'rb') as f:
        return pickle.load(f)

In [28]:
rows = ["3x64", "64x64", "64x128", "128x128", "128x256","256x256", "256x512", "512x512", "512x1024", "1024x1024"]
columns = [ "torch_conv","direct_conv_cpu",  "tile_shared_conv"]

In [29]:
import os
BATCH_SIZE = 4
IN_CHANNELS = 3
OUT_CHANNELS = 64
IMG_HEIGHT = 512
IMG_WIDTH = 512
KERNEL_SIZE = 3

os.makedirs('input/data', exist_ok=True)
os.makedirs('input/img', exist_ok=True)
os.makedirs('input/scripts', exist_ok=True)

# Generate random input data for image_3x64
image_3x64 = np.random.uniform(-1.0, 1.0, (BATCH_SIZE, 3, IMG_HEIGHT, IMG_WIDTH)).astype(np.float32)  # (batch_size, input_channels, height, width)
save_pkl(image_3x64, 'input/img/image_torch_3x64.pkl')
save_pkl(image_3x64, 'input/img/image_direct_3x64.pkl')
save_pkl(image_3x64, 'input/img/image_tile_3x64.pkl')

# Loop through the configurations to generate weights and biases
for row in rows:
    in_channels, out_channels = map(int, row.split('x'))
    weight = np.random.uniform(0.0, 1.0, (out_channels, in_channels, KERNEL_SIZE, KERNEL_SIZE)).astype(np.float32)
    bias = np.random.uniform(0.0, 1.0, (out_channels)).astype(np.float32)

    # Save weights and biases to pickle files
    save_pkl(weight, f'input/data/weight_{row}.pkl')
    save_pkl(bias, f'input/data/bias_{row}.pkl')

#np.save('image.npy', image, fmt='%f%f%f%f')
##np.save('weight.npy', weight, fmt='%f')
#np.save('bias.npy', bias, fmt='%f')


In [30]:
!ls -lh

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [31]:
import pandas as pd
all_results = pd.DataFrame(columns=columns, index=rows, dtype=float)
all_results

Unnamed: 0,torch_conv,direct_conv_cpu,tile_shared_conv
3x64,,,
64x64,,,
64x128,,,
128x128,,,
128x256,,,
256x256,,,
256x512,,,
512x512,,,
512x1024,,,
1024x1024,,,


## Sequential version 1: Using convolutional layer derived from torch.nn

This is the library-based version which allow to execute on both CPU and GPU units. We would consider this version as the ground truth in terms of correctness and speed when comparing with other customized version.

### Design

### Implement

In [32]:
%%writefile input/scripts/sequential_torch.py
import pickle
import time
import torch
import torch.nn as nn
import sys
import numpy as np
#sys.argv: BATCH_SIZE IN_CHANNEL OUT_CHANNEL IMG_SIZE
if len(sys.argv) != 6:
    raise "Invalid Arguments."

BATCH_SIZE = int(sys.argv[1])
IN_CHANNELS = int(sys.argv[2])
OUT_CHANNELS = int(sys.argv[3])
IMG_HEIGHT = int(sys.argv[4])
IMG_WIDTH = int(sys.argv[4])
FLAG = bool(sys.argv[5])

KERNEL_SIZE = 3

max_pooling = nn.MaxPool2d(kernel_size=2, stride= 2, padding =0)

from typing import Union
def save_pkl(data: Union[np.ndarray, any], filename: str) -> None:
    with open(filename, 'wb') as f:
        pickle.dump(data, f)

def load_pkl(filename: str) -> any:
    with open(filename, 'rb') as f:
        return pickle.load(f)

# Load parameters from pickle file

image = load_pkl(f'input/img/image_torch_{IN_CHANNELS}x{OUT_CHANNELS}.pkl')
weight = load_pkl(f'input/data/weight_{IN_CHANNELS}x{OUT_CHANNELS}.pkl')
bias = load_pkl(f'input/data/bias_{IN_CHANNELS}x{OUT_CHANNELS}.pkl')


torch_img = torch.tensor(image, requires_grad=False).cpu()
torch_weight = torch.tensor(weight, requires_grad=False).cpu()
torch_bias = torch.tensor(bias, requires_grad=False).cpu()

if FLAG:
    torch_img = max_pooling(torch_img)

conv = nn.Conv2d(IN_CHANNELS, OUT_CHANNELS, KERNEL_SIZE, padding=KERNEL_SIZE // 2, bias=True).cpu()
conv.weight.data = torch_weight
conv.bias.data = torch_bias

start = time.time()
torch_output = conv(torch_img).cpu().detach().numpy()
end = time.time()
print(f'Processing time: {(end - start): .6f} s')

if IN_CHANNELS != OUT_CHANNELS:
    new_channel_in, new_channel_out = OUT_CHANNELS, OUT_CHANNELS
else:
    new_channel_in, new_channel_out = OUT_CHANNELS, OUT_CHANNELS*2

save_pkl(torch_output, f'input/img/image_torch_{new_channel_in}x{new_channel_out}.pkl')

Overwriting input/scripts/sequential_torch.py


### Evaluate

#### Runtime

##### 3 x 64

In [59]:
%%capture output_3x64
!python Test_convolution_gpu.py 4 3 64 512

In [60]:
print(output_3x64.stdout)

Direct CPU time: 219.56978702545166
TileSharedConv2D_GPU time: 0.5565063953399658
PyTorch Conv2D time: 0.11388325691223145
Max difference (Shared): 2.861023e-06
Mean difference (Shared): 1.3732371e-07



##### 64 x 64

In [61]:
%%capture output_64x64
!python Test_convolution_gpu.py 4 64 64 512

In [62]:
print(output_64x64.stdout)

Direct CPU time: 295.68580627441406
TileSharedConv2D_GPU time: 0.6943941116333008
PyTorch Conv2D time: 0.11925268173217773
Max difference (Shared): 0.012087822
Mean difference (Shared): 0.0016611373



64 x 128

In [63]:
%%capture output_64x128
!python Test_convolution_gpu.py 4 64 128 256

In [64]:
print(output_64x128.stdout)

Direct CPU time: 147.268803358078
TileSharedConv2D_GPU time: 0.6604487895965576
PyTorch Conv2D time: 0.11014032363891602
Max difference (Shared): 0.011403441
Mean difference (Shared): 0.0016632475



##### 128 x 128

In [65]:
%%capture output_128x128
!python Test_convolution_gpu.py 4 128 128 256

In [66]:
print(output_128x128.stdout)

Direct CPU time: 174.84809803962708
TileSharedConv2D_GPU time: 0.874976634979248
PyTorch Conv2D time: 0.11024022102355957
Max difference (Shared): 0.016788483
Mean difference (Shared): 0.0023460574



##### 128 x 256

In [67]:
%%capture output_128x256
!python Test_convolution_gpu.py 4 128 256 128

In [68]:
print(output_128x256.stdout)

Direct CPU time: 102.0986578464508
TileSharedConv2D_GPU time: 0.687300443649292
PyTorch Conv2D time: 0.11735010147094727
Max difference (Shared): 0.016106129
Mean difference (Shared): 0.0023465196



##### 256 x 256

In [69]:
%%capture output_256x256
!python Test_convolution_gpu.py 4 256 256 128

In [70]:
print(output_256x256.stdout)

Direct CPU time: 119.82527160644531
TileSharedConv2D_GPU time: 0.7156195640563965
PyTorch Conv2D time: 0.12391853332519531
Max difference (Shared): 0.023344994
Mean difference (Shared): 0.0033124883



##### 256 x 512

In [71]:
%%capture output_256x512
!python Test_convolution_gpu.py 4 256 512 64

In [72]:
print(output_256x512.stdout)

Direct CPU time: 58.33586835861206
TileSharedConv2D_GPU time: 0.5865521430969238
PyTorch Conv2D time: 0.10978102684020996
Max difference (Shared): 0.024612427
Mean difference (Shared): 0.0032992968



##### 512 x 512

In [73]:
%%capture output_512x512
!python Test_convolution_gpu.py 4 512 512 64

In [74]:
print(output_512x512.stdout)

Direct CPU time: 89.73126649856567
TileSharedConv2D_GPU time: 0.705134391784668
PyTorch Conv2D time: 0.10934782028198242
Max difference (Shared): 0.029963493
Mean difference (Shared): 0.004656828



##### 512 x 1024

In [75]:
%%capture output_512x1024
!python Test_convolution_gpu.py 4 512 1024 32

In [76]:
print(output_512x1024.stdout)

Direct CPU time: 44.3155300617218
TileSharedConv2D_GPU time: 0.567936897277832
PyTorch Conv2D time: 0.11000347137451172
Max difference (Shared): 0.030963898
Mean difference (Shared): 0.004632779



##### 1024 x 1024

In [77]:
%%capture output_1024x1024
!python Test_convolution_gpu.py 4 1024 1024 32

In [78]:
print(output_1024x1024.stdout)

Direct CPU time: 73.316823720932
TileSharedConv2D_GPU time: 0.8284497261047363
PyTorch Conv2D time: 0.11344146728515625
Max difference (Shared): 0.042533875
Mean difference (Shared): 0.0065273787



#### Correctness

## Sequential version 2: Using only numpy to calculate convolutional operation

### Analysis

### Design

### Implement

In [None]:
%%writefile input/scripts/sequential_direct.py
import time
import pickle
from model_numba.layers_cpu.layer import *
import sys

#sys.argv: BATCH_SIZE IN_CHANNEL OUT_CHANNEL IMG_SIZE
if len(sys.argv) !=5:
    raise "Invalid Arguments."

BATCH_SIZE = int(sys.argv[1])
IN_CHANNELS = int(sys.argv[2])
OUT_CHANNELS = int(sys.argv[3])
IMG_HEIGHT = int(sys.argv[4])
IMG_WIDTH = int(sys.argv[4])
KERNEL_SIZE = 3

from typing import Union
def save_pkl(data: Union[np.ndarray, any], filename: str) -> None:
    with open(filename, 'wb') as f:
        pickle.dump(data, f)

def load_pkl(filename: str) -> any:
    with open(filename, 'rb') as f:
        return pickle.load(f)

# Load parameters from pickle file

image = load_pkl(f'input/img/image_direct_{IN_CHANNELS}x{OUT_CHANNELS}.pkl')
weight = load_pkl(f'input/data/weight_{IN_CHANNELS}x{OUT_CHANNELS}.pkl')
bias = load_pkl(f'input/data/bias_{IN_CHANNELS}x{OUT_CHANNELS}.pkl')

conv_layer = ConvolutionalLayer(IN_CHANNELS, OUT_CHANNELS, KERNEL_SIZE, padding=KERNEL_SIZE // 2, weights=weight, bias=bias)  # Create a Conv instance

start = time.time()
output = conv_layer.forward(image)
end = time.time()

print(f'Processing time: {(end - start): .2f} s')

new_channel_in, new_channel_out = ..., ...
if IN_CHANNELS != OUT_CHANNELS:
    new_channel_in, new_channel_out = OUT_CHANNELS, OUT_CHANNELS
else:
    new_channel_in, new_channel_out = OUT_CHANNELS, OUT_CHANNELS*2

save_pkl(torch_output, f'input/img/image_direct_{new_channel_in}x{new_channel_out}.pkl')

#output_file = open('output_2.pkl','wb')
#pickle.dump(output, output_file)
#output_file.close()


Processing time: 107.51536822319031 s


## Parallel Version 1: Using Numba to parallel Direct Convolution

## Parallel Version 2: Tiled Convolution Using Numba and Shared Memory

## Design

# Input:
-   Image: the input image with shape [BATCH_SIZE, CHANNEL, IMG_HEIGHT, IMG_WIDTH] 
-   Weight: the weight of convolution with shape [CHANNEL_OUT, CHANNEL_IN, KERNEL_SIZE, KERNEL_SIZE] 
-   Bias: the bias of colution with shape [CHANNEL_OUT] 
# Idea:
-   Tiled shared memory convolution optimizes the convolution operation by dividing the input feature map into smaller -  tiles, which are loaded into the fast shared memory of the GPU.
-   In this implementation, each thread handles the copying of one input pixel to shared memory, and the block size is set equal to the input tile size. 
-   The output tile size is calculated as input_tile_size - kernel_size + 1.
-   To index the threads, we use the formula tid_x = blockIdx.x * OUTPUT_TILE_SIZE + threadIdx.x and same for y. This formula ensures that neighboring input tiles overlap, allowing threads to handle pixels at the boundaries of the input tiles effectively.

![Tile](tile.png)

## Implement

In [None]:
from model_numba.Layers.init import *
@cuda.jit
def tileSharedConv2D_kernel(img, out_img, weight, bias, channel_in: int, channel_out: int, batch_size: int):

    _, _, z_idx = cuda.grid(3)

    batch_idx, out_channel_idx = z_idx // channel_out, z_idx % channel_out

    shared_col, shared_row = cuda.threadIdx.x, cuda.threadIdx.y

    col_out = cuda.blockIdx.x * OUTPUT_TILE_SIZE + shared_col
    row_out = cuda.blockIdx.y * OUTPUT_TILE_SIZE + shared_row

    col_in = col_out - KERNEL_SIZE // 2 # KERNEL_SIZE // 2 mean padding = 1 
    row_in = row_out - KERNEL_SIZE // 2

    # Allocate shared memory for one input channel
    sharedImg = cuda.shared.array(shape=(INPUT_TILE_SIZE, INPUT_TILE_SIZE), dtype=img.dtype)

    outPixel = bias[out_channel_idx]

    isOutputPixel = shared_col < OUTPUT_TILE_SIZE and shared_row < OUTPUT_TILE_SIZE and col_out < img.shape[3] and row_out < img.shape[2]

    for in_channel_idx in range(channel_in):
        #Copy img tile to shared memory
        if row_in >= 0 and row_in < img.shape[2] and col_in >= 0 and col_in < img.shape[3]:
            sharedImg[shared_row, shared_col] = img[batch_idx, in_channel_idx, row_in, col_in]
        else:
            sharedImg[shared_row, shared_col] = 0.0
        cuda.syncthreads()

        if isOutputPixel:
            for index in range(KERNEL_SIZE_AS_1D_ARR):
                row = index // KERNEL_SIZE
                col = index % KERNEL_SIZE
                outPixel += weight[out_channel_idx, in_channel_idx, row, col] * sharedImg[shared_row + row, shared_col + col]

        cuda.syncthreads()

    if isOutputPixel:
        out_img[batch_idx, out_channel_idx, row_out, col_out] = outPixel

In [None]:
%%writefile input/scripts/parallel_tiled.py
from model_numba.Layers.Layers import Convolution2D_GPU
import sys
import time
import pickle

import numpy as np
#sys.argv: BATCH_SIZE IN_CHANNEL OUT_CHANNEL IMG_SIZE
if len(sys.argv) != 5:
    raise "Invalid Arguments."

BATCH_SIZE = int(sys.argv[1])
IN_CHANNELS = int(sys.argv[2])
OUT_CHANNELS = int(sys.argv[3])
IMG_HEIGHT = int(sys.argv[4])
IMG_WIDTH = int(sys.argv[4])
KERNEL_SIZE = 3

import numpy as np

from typing import Union
def save_pkl(data: Union[np.ndarray, any], filename: str) -> None:
    with open(filename, 'wb') as f:
        pickle.dump(data, f)

def load_pkl(filename: str) -> any:
    with open(filename, 'rb') as f:
        return pickle.load(f)

#Load Image and parameters
image = load_pkl(f'input/img/image_tile_{IN_CHANNELS}x{OUT_CHANNELS}.pkl')
weight = load_pkl(f'input/data/weight_{IN_CHANNELS}x{OUT_CHANNELS}.pkl')
bias = load_pkl(f'input/data/bias_{IN_CHANNELS}x{OUT_CHANNELS}.pkl')


start = time.time()
output =Convolution2D_GPU(image, weight, bias)
end = time.time()

print(f'Processing time: {(end - start): .2f} s')
new_channel_in, new_channel_out = ..., ...
if IN_CHANNELS != OUT_CHANNELS
    new_channel_in, new_channel_out = OUT_CHANNELS, OUT_CHANNELS
else:
    new_channel_in, new_channel_out = OUT_CHANNELS, OUT_CHANNELS*2

save_pkl(output, f'input/img/image_tile_{new_channel_in}x{new_channel_out}.pkl')