In [3]:
# Install the pycuda package that the imports in the next cell depend on.
!pip install pycuda

In [9]:
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import numpy as np
import time

# CUDA source: valid (no padding) 2D convolution that stages an input tile in block-shared memory
cuda_code = """
__global__ void convolutionalLayerKernelWithSharedMemory(float *input, float *output, float *filter, int width, int height, int filterWidth, int filterHeight) {
    // Dynamically sized tile. The launch must provide
    // (blockDim.x + filterWidth - 1) * (blockDim.y + filterHeight - 1) floats.
    extern __shared__ float sharedMem[];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Each block produces a blockDim.y x blockDim.x patch of the output, so it
    // needs that patch of the input plus a (filter - 1) halo to the right/bottom.
    const int tileWidth = blockDim.x + filterWidth - 1;
    const int tileHeight = blockDim.y + filterHeight - 1;
    const int outWidth = width - filterWidth + 1;
    const int outHeight = height - filterHeight + 1;

    const int row0 = blockIdx.y * blockDim.y; // First output (and input) row of this block
    const int col0 = blockIdx.x * blockDim.x; // First output (and input) column of this block
    const int row_o = row0 + ty;              // Output row index of this thread
    const int col_o = col0 + tx;              // Output column index of this thread

    // Cooperative load: the tile is larger than the block, so every thread
    // strides over it and may load several elements. Out-of-image cells are zero.
    for (int r = ty; r < tileHeight; r += blockDim.y) {
        for (int c = tx; c < tileWidth; c += blockDim.x) {
            const int row_i = row0 + r;
            const int col_i = col0 + c;
            sharedMem[r * tileWidth + c] =
                (row_i < height && col_i < width) ? input[row_i * width + col_i] : 0.0f;
        }
    }
    // Every thread of the block reaches this barrier, including threads that
    // fall outside the output and only helped with the load.
    __syncthreads();

    // Apply the filter: output(r, c) = sum_{i,j} input(r + i, c + j) * filter(i, j)
    if (row_o < outHeight && col_o < outWidth) {
        float sum = 0.0f;
        for (int i = 0; i < filterHeight; ++i) {
            for (int j = 0; j < filterWidth; ++j) {
                sum += sharedMem[(ty + i) * tileWidth + (tx + j)] * filter[i * filterWidth + j];
            }
        }
        output[row_o * outWidth + col_o] = sum;
    }
}
"""
mod = SourceModule(cuda_code)

# Handle to the compiled CUDA kernel
convolution_with_shared_memory = mod.get_function("convolutionalLayerKernelWithSharedMemory")


# GPU convolution
def forward_pass(input_data, filter_data):
    """Run a valid (no padding) 2D convolution of ``input_data`` on the GPU.

    Parameters
    ----------
    input_data : 2D array-like
        Input matrix (image). Converted to a contiguous float32 array because
        the kernel reads ``float`` values.
    filter_data : 2D array-like
        Filter (kernel) weights. Converted to a contiguous float32 array.

    Returns
    -------
    numpy.ndarray
        float32 array of shape
        ``(input_height - filter_height + 1, input_width - filter_width + 1)``.

    Raises
    ------
    ValueError
        If the filter is empty or larger than the input in either dimension.

    Notes
    -----
    The shared-memory tile grows with the filter size; a very large filter may
    exceed the device's per-block shared-memory limit, in which case the
    kernel launch fails.
    """
    # The kernel indexes raw float buffers, so enforce dtype and memory layout.
    input_data = np.ascontiguousarray(input_data, dtype=np.float32)
    filter_data = np.ascontiguousarray(filter_data, dtype=np.float32)

    input_height, input_width = input_data.shape
    filter_height, filter_width = filter_data.shape
    output_height = input_height - filter_height + 1
    output_width = input_width - filter_width + 1
    if filter_height < 1 or filter_width < 1 or output_height < 1 or output_width < 1:
        raise ValueError("filter must be non-empty and no larger than the input")

    output_data = np.empty((output_height, output_width), dtype=np.float32)

    # One thread per output element, block_dim x block_dim outputs per block.
    block_dim = 16
    block_size = (block_dim, block_dim, 1)
    grid_size = (
        (output_width + block_dim - 1) // block_dim,
        (output_height + block_dim - 1) // block_dim,
    )
    # Must match the tile the kernel lays out in its extern __shared__ array.
    shared_bytes = (
        (block_dim + filter_width - 1)
        * (block_dim + filter_height - 1)
        * output_data.itemsize
    )

    device_buffers = []
    try:
        input_data_gpu = cuda.mem_alloc(input_data.nbytes)
        device_buffers.append(input_data_gpu)
        filter_gpu = cuda.mem_alloc(filter_data.nbytes)
        device_buffers.append(filter_gpu)
        output_data_gpu = cuda.mem_alloc(output_data.nbytes)
        device_buffers.append(output_data_gpu)

        cuda.memcpy_htod(input_data_gpu, input_data)
        cuda.memcpy_htod(filter_gpu, filter_data)

        convolution_with_shared_memory(
            input_data_gpu,
            output_data_gpu,
            filter_gpu,
            np.int32(input_width),
            np.int32(input_height),
            np.int32(filter_width),
            np.int32(filter_height),
            block=block_size,
            grid=grid_size,
            shared=shared_bytes,
        )

        cuda.memcpy_dtoh(output_data, output_data_gpu)
    finally:
        # Release device memory even if a copy or the launch raised.
        for buffer in device_buffers:
            buffer.free()

    return output_data

# CPU reference convolution
def convolution_cpu(input_data, filter_data):
    """Slide ``filter_data`` over ``input_data`` without padding, on the CPU.

    Both arguments are 2D arrays. The result has shape
    ``(input_height - filter_height + 1, input_width - filter_width + 1)`` and
    NumPy's default float dtype; each entry is the sum of the element-wise
    product of the filter with the input window whose top-left corner is at
    that position.
    """
    rows, cols = input_data.shape
    k_rows, k_cols = filter_data.shape
    result = np.zeros((rows - k_rows + 1, cols - k_cols + 1))

    # Visit every output position in row-major order.
    for top, left in np.ndindex(*result.shape):
        window = input_data[top:top + k_rows, left:left + k_cols]
        result[top, left] = np.sum(window * filter_data)

    return result

# Performance measurement
def measure_performance(input_data, filter_data):
    """Time one CPU and one GPU convolution of the same input and filter.

    Parameters
    ----------
    input_data, filter_data : 2D arrays
        Passed unchanged to ``convolution_cpu`` and ``forward_pass``.

    Returns
    -------
    tuple of float
        ``(cpu_time, gpu_time)``: elapsed wall-clock seconds for the
        ``convolution_cpu`` call and for the whole ``forward_pass`` call
        respectively.
    """
    # perf_counter() is monotonic and uses the highest-resolution clock
    # available, which matters for the sub-millisecond GPU timings;
    # time.time() can be coarse and can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    convolution_cpu(input_data, filter_data)
    cpu_time = time.perf_counter() - start_time

    start_time = time.perf_counter()
    forward_pass(input_data, filter_data)
    gpu_time = time.perf_counter() - start_time

    return cpu_time, gpu_time

# Input matrix (image) and filter sizes to benchmark
sizes = [(256, 256), (512, 512), (1024, 1024)]
filter_sizes = [(3, 3), (5, 5), (7, 7)]

# Fixed seed so that every run benchmarks the same data
rng = np.random.default_rng(42)

# Run and time every input/filter combination
for size in sizes:
    for filter_size in filter_sizes:
        input_data = rng.random(size, dtype=np.float32)
        filter_data = rng.random(filter_size, dtype=np.float32)

        cpu_time, gpu_time = measure_performance(input_data, filter_data)
        print(f"Input Size: {size}, Filter Size: {filter_size}, CPU Time: {cpu_time:.4f} s, GPU Time: {gpu_time:.4f} s")


Input Size: (256, 256), Filter Size: (3, 3), CPU Time: 0.3968 s, GPU Time: 0.0013 s
Input Size: (256, 256), Filter Size: (5, 5), CPU Time: 0.4007 s, GPU Time: 0.0008 s
Input Size: (256, 256), Filter Size: (7, 7), CPU Time: 0.3707 s, GPU Time: 0.0008 s
Input Size: (512, 512), Filter Size: (3, 3), CPU Time: 1.5598 s, GPU Time: 0.0020 s
Input Size: (512, 512), Filter Size: (5, 5), CPU Time: 2.1167 s, GPU Time: 0.0015 s
Input Size: (512, 512), Filter Size: (7, 7), CPU Time: 2.2184 s, GPU Time: 0.0014 s
Input Size: (1024, 1024), Filter Size: (3, 3), CPU Time: 6.1974 s, GPU Time: 0.0062 s
Input Size: (1024, 1024), Filter Size: (5, 5), CPU Time: 7.4688 s, GPU Time: 0.0035 s
Input Size: (1024, 1024), Filter Size: (7, 7), CPU Time: 6.1675 s, GPU Time: 0.0036 s
