<a href="https://colab.research.google.com/github/abonti123/Cuda-And-OpenCV-implemented-for-object-detection-using-YOLO-architecture/blob/main/Abonti.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-krbkw5xf
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-krbkw5xf
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4295 sha256=e9e071594dbc6c68208841924c8deb3d9d836da62b49004f14d5287b9abc60c0
  Stored in directory: /tmp/pip-ephem-wheel-cache-hk1ny7ip/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [None]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [None]:
!nvcc --version
!nvidia-smi

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0
Fri Oct 20 06:14:20 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   58C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+-------

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/*
 * Element-wise: output[i] = sinf(input1[i]) + cosf(input2[i]) for i in [0, datasize).
 *
 * Accepts any 1D/2D/3D grid and block shape. Every thread flattens its global
 * (x, y, z) coordinate into a single linear index with x varying fastest, then
 * y, then z, across the whole grid. Threads whose linear index is not below
 * datasize do nothing.
 *
 * The index is computed in size_t: the 32-bit unsigned products of
 * gridDim/blockDim can wrap on large grids, and a wrapped or negative int
 * index would still satisfy `index < datasize` and touch the wrong element.
 */
__global__ void process_kernel1(float *input1, float *input2, float *output, int datasize) {
    /* Extent of the whole grid, in threads, along x and y. */
    const size_t gridWidth  = (size_t)gridDim.x * blockDim.x;
    const size_t gridHeight = (size_t)gridDim.y * blockDim.y;

    /* Global thread coordinate along each axis. */
    const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t idy = (size_t)blockIdx.y * blockDim.y + threadIdx.y;
    const size_t idz = (size_t)blockIdx.z * blockDim.z + threadIdx.z;

    /* Same layout as idz*(W*H) + idy*W + idx, evaluated in 64 bits. */
    const size_t index = (idz * gridHeight + idy) * gridWidth + idx;

    /* datasize > 0 keeps the size_t cast from turning a negative count into a huge bound. */
    if (datasize > 0 && index < (size_t)datasize)
        output[index] = sinf(input1[index]) + cosf(input2[index]);
}
/*
 * Element-wise: output[i] = logf(input[i]) for i in [0, datasize).
 *
 * Accepts any 1D/2D/3D grid and block shape. Every thread flattens its global
 * (x, y, z) coordinate into a single linear index with x varying fastest, then
 * y, then z, across the whole grid. Threads whose linear index is not below
 * datasize do nothing.
 *
 * logf yields a negative result for inputs in (0, 1), -inf for 0 and NaN for
 * negative inputs; no clamping is performed here.
 *
 * The index is computed in size_t: the 32-bit unsigned products of
 * gridDim/blockDim can wrap on large grids, and a wrapped or negative int
 * index would still satisfy `index < datasize` and touch the wrong element.
 */
__global__ void process_kernel2(float *input,float *output,int datasize){
    /* Extent of the whole grid, in threads, along x and y. */
    const size_t gridWidth  = (size_t)gridDim.x * blockDim.x;
    const size_t gridHeight = (size_t)gridDim.y * blockDim.y;

    /* Global thread coordinate along each axis. */
    const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t idy = (size_t)blockIdx.y * blockDim.y + threadIdx.y;
    const size_t idz = (size_t)blockIdx.z * blockDim.z + threadIdx.z;

    /* Same layout as idz*(W*H) + idy*W + idx, evaluated in 64 bits. */
    const size_t index = (idz * gridHeight + idy) * gridWidth + idx;

    /* datasize > 0 keeps the size_t cast from turning a negative count into a huge bound. */
    if (datasize > 0 && index < (size_t)datasize)
        output[index] = logf(input[index]);
}
/*
 * Element-wise: output[i] = sqrtf(input[i]) for i in [0, datasize).
 *
 * Uses only the x dimension of the launch: one thread per element, indexed by
 * blockIdx.x * blockDim.x + threadIdx.x. Threads past datasize do nothing.
 * sqrtf returns NaN for negative inputs; no clamping is performed here.
 *
 * The index is computed in size_t: blockIdx.x * blockDim.x is a 32-bit
 * unsigned product, and storing a large result in int could make it negative,
 * which would pass `idx < datasize` and index out of bounds.
 */
__global__ void process_kernel3(float *input,float *output,int datasize){
    const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    /* datasize > 0 keeps the size_t cast from turning a negative count into a huge bound. */
    if (datasize > 0 && idx < (size_t)datasize)
        output[idx] = sqrtf(input[idx]);
}

/*
 * Free all four managed buffers used by main().
 * cudaFree() performs no operation on a null pointer, so slots that were
 * never allocated may be passed as NULL.
 */
static void releaseBuffers(float *input1, float *input2, float *input, float *output) {
    cudaFree(input1);
    cudaFree(input2);
    cudaFree(input);
    cudaFree(output);
}

/*
 * Check the most recent kernel launch and wait for it to finish.
 * cudaGetLastError() reports launch-time failures (e.g. an invalid grid or
 * block configuration); cudaDeviceSynchronize() reports faults raised while
 * the kernel ran and makes managed memory safe to read from the host.
 * Prints a diagnostic naming `kernelName` on failure and returns the status.
 */
static cudaError_t finishKernel(const char *kernelName) {
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        status = cudaDeviceSynchronize();
    if (status != cudaSuccess)
        fprintf(stderr, "%s launch failed: %s\n", kernelName, cudaGetErrorString(status));
    return status;
}

/*
 * Runs a three-stage element-wise pipeline over `datasize` random floats:
 *   1. output = sinf(input1) + cosf(input2)   (16x16x4 thread blocks)
 *   2. input  = logf(output)                  (8x8x2 thread blocks)
 *   3. output = sqrtf(input)                  (128x1x1 thread blocks)
 * and prints the final `output` array.
 *
 * Note: inputs are drawn from [0, 1], so stage 1 can produce values below 1;
 * their logarithm is negative and the square root of that is NaN. NaN entries
 * in the printed result are therefore expected for this pipeline.
 *
 * Returns 0 on success, 1 if any allocation or kernel fails.
 */
int main(void){
    const int datasize = 1024;
    const size_t bytes = (size_t)datasize * sizeof(float);
    float *input1 = NULL, *input2 = NULL, *input = NULL, *output = NULL;
    cudaError_t cudaStatus;

    /* Allocate the four managed buffers in one loop with uniform error handling. */
    float **slots[] = { &input1, &input2, &input, &output };
    const char *slotNames[] = { "input1", "input2", "input", "output" };
    for (int s = 0; s < 4; ++s) {
        cudaStatus = cudaMallocManaged(slots[s], bytes);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMallocManaged failed for %s: %s\n", slotNames[s], cudaGetErrorString(cudaStatus));
            *slots[s] = NULL;  /* do not hand a possibly-garbage pointer to cudaFree */
            releaseBuffers(input1, input2, input, output);
            return 1;
        }
    }

    /* rand() is left unseeded, so the same inputs are generated on every run. */
    for (int i = 0; i < datasize; i++) {
        input1[i] = (float)rand()/RAND_MAX;
        input2[i] = (float)rand()/RAND_MAX;
    }

    /*
     * Stage 1. The kernel flattens the whole 3D launch into one linear index,
     * so the grid only has to supply `datasize` threads in total. Size it from
     * the number of threads per block instead of taking ceil(datasize / dim)
     * on every axis, which launched about 2^30 threads for 1024 elements.
     */
    const dim3 threadsPerBlock1(16, 16, 4);
    const unsigned int blockVolume1 = threadsPerBlock1.x * threadsPerBlock1.y * threadsPerBlock1.z;
    const dim3 blocksPerGrid1((datasize + blockVolume1 - 1) / blockVolume1, 1, 1);
    process_kernel1<<<blocksPerGrid1, threadsPerBlock1>>>(input1, input2, output, datasize);
    if (finishKernel("Kernel1") != cudaSuccess) {
        releaseBuffers(input1, input2, input, output);
        return 1;
    }

    /* Stage 2: reads stage-1 `output`, writes `input`. Grid sized as above. */
    const dim3 threadsPerBlock2(8, 8, 2);
    const unsigned int blockVolume2 = threadsPerBlock2.x * threadsPerBlock2.y * threadsPerBlock2.z;
    const dim3 blocksPerGrid2((datasize + blockVolume2 - 1) / blockVolume2, 1, 1);
    process_kernel2<<<blocksPerGrid2, threadsPerBlock2>>>(output, input, datasize);
    if (finishKernel("Kernel2") != cudaSuccess) {
        releaseBuffers(input1, input2, input, output);
        return 1;
    }

    /* Stage 3: plain 1D launch; reads `input`, overwrites `output`. */
    const dim3 threadsPerBlock3(128, 1, 1);
    const dim3 blocksPerGrid3((datasize + threadsPerBlock3.x - 1) / threadsPerBlock3.x, 1, 1);
    process_kernel3<<<blocksPerGrid3, threadsPerBlock3>>>(input, output, datasize);
    if (finishKernel("Kernel3") != cudaSuccess) {
        releaseBuffers(input1, input2, input, output);
        return 1;
    }

    /* The device was synchronised in finishKernel, so the host may read managed memory. */
    for (int i = 0; i < datasize; i++) {
        printf("Output[%d] = %f\n", i, output[i]);
    }

    releaseBuffers(input1, input2, input, output);
    return 0;
}

Output[0] = 0.715281
Output[1] = 0.582091
Output[2] = 0.756029
Output[3] = 0.216794
Output[4] = 0.342753
Output[5] = 0.487409
Output[6] = 0.453041
Output[7] = 0.594269
Output[8] = 0.546012
Output[9] = nan
Output[10] = nan
Output[11] = nan
Output[12] = 0.271901
Output[13] = 0.341262
Output[14] = 0.772838
Output[15] = 0.383973
Output[16] = 0.652892
Output[17] = 0.615679
Output[18] = 0.190084
Output[19] = 0.072896
Output[20] = 0.446563
Output[21] = 0.133020
Output[22] = 0.444145
Output[23] = 0.533536
Output[24] = nan
Output[25] = 0.635909
Output[26] = nan
Output[27] = 0.735229
Output[28] = 0.248931
Output[29] = 0.603798
Output[30] = nan
Output[31] = 0.606143
Output[32] = 0.338426
Output[33] = 0.295320
Output[34] = 0.493390
Output[35] = 0.639965
Output[36] = 0.141288
Output[37] = 0.663604
Output[38] = 0.140496
Output[39] = 0.654700
Output[40] = 0.695077
Output[41] = 0.247416
Output[42] = 0.721645
Output[43] = nan
Output[44] = 0.332094
Output[45] = 0.707165
Output[46] = 0.484373
Output[47] 

In [None]:
# Import necessary libraries
import numpy as np
from numba import cuda

# Define the CUDA kernel function
@cuda.jit
def convolution_kernel(input_image, kernel, output_image):
    """Write one output pixel per thread using a zero-padded sliding window.

    The window is centred on the thread's (row, col) position; taps that fall
    outside ``input_image`` contribute nothing. ``kernel`` is applied without
    flipping, i.e. ``kernel[i, j]`` multiplies the pixel offset by
    ``(i - kernel.shape[0] // 2, j - kernel.shape[1] // 2)``.
    Threads positioned outside ``output_image`` exit without writing.
    """
    row, col = cuda.grid(2)

    # Guard clause: surplus threads in the launch grid have nothing to do.
    if row >= output_image.shape[0] or col >= output_image.shape[1]:
        return

    in_rows = input_image.shape[0]
    in_cols = input_image.shape[1]
    k_rows = kernel.shape[0]
    k_cols = kernel.shape[1]

    # Input coordinate that lines up with kernel[0, 0] for this output pixel.
    first_row = row - k_rows // 2
    first_col = col - k_cols // 2

    acc = 0.0
    for ki in range(k_rows):
        src_row = first_row + ki
        if src_row < 0 or src_row >= in_rows:
            continue  # the whole kernel row lies outside the image
        for kj in range(k_cols):
            src_col = first_col + kj
            if 0 <= src_col < in_cols:
                acc += input_image[src_row, src_col] * kernel[ki, kj]

    output_image[row, col] = acc
def main():
    """Apply a 3x3 kernel to a 3x3 test image on the GPU and print the result."""
    # Host-side test data.
    image_host = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float32)
    stencil_host = np.array([[0, 1, 0], [1, -4, 1], [0, 1, 0]], dtype=np.float32)

    # Copy the inputs to the device and reserve an output of matching shape.
    image_dev = cuda.to_device(image_host)
    stencil_dev = cuda.to_device(stencil_host)
    result_dev = cuda.device_array_like(image_dev)

    # Ceil-divide each image extent by the block extent so the grid covers it.
    block_shape = (16, 16)
    grid_shape = tuple(
        (extent + threads - 1) // threads
        for extent, threads in zip(image_host.shape, block_shape)
    )

    convolution_kernel[grid_shape, block_shape](image_dev, stencil_dev, result_dev)

    result_host = result_dev.copy_to_host()
    print("Output Image:")
    print(result_host)


main()



Output Image:
[[  2.   1.  -4.]
 [ -3.   0.  -7.]
 [-16. -11. -22.]]


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
x = torch.randn(1, 3, 32, 32)  # batch_size=1, channels=3, height=32, width=32

In [None]:
class Net(nn.Module):
    """Minimal CNN stage: 3x3 convolution (3 -> 16 channels), ReLU, 2x2 max-pool."""

    def __init__(self):
        super().__init__()
        # padding=1 with a 3x3 kernel keeps the spatial size unchanged.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
        # Stride-2 pooling over 2x2 windows halves height and width.
        self.pool = nn.MaxPool2d(2, stride=2)

    def forward(self, x):
        """Return pool(relu(conv1(x)))."""
        activated = F.relu(self.conv1(x))
        return self.pool(activated)


net = Net()

In [None]:
# Call the layers one by one instead of net(x): the raw conv1 output is pooled
# directly, so the ReLU that Net.forward applies between them is skipped here.
conv_output = net.conv1(x)
pool_output = net.pool(conv_output)

In [None]:
# conv1 (3x3 kernel, padding=1) keeps the 32x32 spatial size with 16 channels;
# the 2x2 / stride-2 max-pool then halves height and width to 16x16.
print(conv_output.shape)
print(pool_output.shape)

torch.Size([1, 16, 32, 32])
torch.Size([1, 16, 16, 16])
