In [None]:
!nvidia-smi

In [None]:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/*
 * Naive dense matrix multiply: C = A * B for square N x N matrices stored
 * row-major in flat arrays (element (r, c) lives at index r * N + c).
 *
 * Launch layout: a 2D grid of 2D blocks. The x dimension selects the output
 * column and the y dimension selects the output row; each thread computes
 * exactly one element of C. The grid must cover at least N threads in each
 * dimension; threads that fall outside the matrix do nothing.
 * No shared memory is used.
 *
 * A, B : input matrices (N * N floats each), readable from the device.
 * C    : output matrix (N * N floats), writable from the device.
 * N    : matrix dimension; values <= 0 make every thread exit immediately.
 */
__global__ void matrixMultiplyGPU(float *A, float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard first: the grid is rounded up to whole blocks, and this also
    // keeps a non-positive N from being widened into a huge unsigned value.
    if (row >= N || col >= N) {
        return;
    }

    // Flat offsets are computed in size_t: row * N + k exceeds INT_MAX once
    // N is larger than 46340, which would wrap with 32-bit int arithmetic.
    const size_t n = (size_t)N;
    const size_t rowBase = (size_t)row * n;

    float sum = 0.0f;
    for (size_t k = 0; k < n; k++) {
        sum += A[rowBase + k] * B[k * n + (size_t)col];
    }
    C[rowBase + (size_t)col] = sum;
}

/*
 * Abort with a file:line diagnostic when a CUDA runtime call fails.
 * Unchecked failures would otherwise surface later as misleading timings
 * or garbage results.
 */
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

/*
 * Benchmark driver for the naive matrix-multiply kernel.
 *
 * Usage: matrix_gpu [N]
 *   N  optional matrix dimension (default 512); must be a positive integer.
 *
 * Fills two N x N matrices with pseudo-random values in [0, 0.99], copies
 * them to the device, runs matrixMultiplyGPU once and reports the kernel
 * time measured with CUDA events (host<->device transfers are not timed).
 *
 * Returns 0 on success, EXIT_FAILURE on invalid input or allocation failure;
 * any failing CUDA call terminates the process through CUDA_CHECK.
 */
int main(int argc, char **argv) {
    int N = 512;
    if (argc > 1) {
        N = atoi(argv[1]);
        // atoi yields 0 for non-numeric text, so this also rejects garbage.
        if (N <= 0) {
            fprintf(stderr, "Matrix dimension must be a positive integer, got \"%s\"\n",
                    argv[1]);
            return EXIT_FAILURE;
        }
    }

    // Widen before multiplying: N * N overflows int for N > 46340.
    const size_t elem_count = (size_t)N * (size_t)N;
    const size_t matrix_size = elem_count * sizeof(float);

    float *host_A = (float *)malloc(matrix_size);
    float *host_B = (float *)malloc(matrix_size);
    float *host_C = (float *)malloc(matrix_size);
    if (host_A == NULL || host_B == NULL || host_C == NULL) {
        fprintf(stderr, "Host allocation failed (%zu bytes per matrix)\n", matrix_size);
        free(host_A);
        free(host_B);
        free(host_C);
        return EXIT_FAILURE;
    }

    for (size_t i = 0; i < elem_count; i++) {
        host_A[i] = rand() % 100 / 100.0f;
        host_B[i] = rand() % 100 / 100.0f;
    }

    float *device_A = NULL;
    float *device_B = NULL;
    float *device_C = NULL;

    CUDA_CHECK(cudaMalloc((void**)&device_A, matrix_size));
    CUDA_CHECK(cudaMalloc((void**)&device_B, matrix_size));
    CUDA_CHECK(cudaMalloc((void**)&device_C, matrix_size));

    CUDA_CHECK(cudaMemcpy(device_A, host_A, matrix_size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(device_B, host_B, matrix_size, cudaMemcpyHostToDevice));

    dim3 threadsPerBlock(16, 16);

    // Ceil-divide so a partial tile at the matrix edge still gets a block.
    int num_blocks = (N + 15) / 16;
    dim3 numBlocks(num_blocks, num_blocks);

    printf("Using %d x %d blocks with 16 x 16 threads per block\n", num_blocks, num_blocks);

    cudaEvent_t start_event, stop_event;
    CUDA_CHECK(cudaEventCreate(&start_event));
    CUDA_CHECK(cudaEventCreate(&stop_event));

    // The events bracket only the kernel launch.
    CUDA_CHECK(cudaEventRecord(start_event));
    matrixMultiplyGPU<<<numBlocks, threadsPerBlock>>>(device_A, device_B, device_C, N);
    // Launch-configuration errors are only reported through cudaGetLastError.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop_event));
    // Blocks until the kernel has finished; also surfaces execution faults.
    CUDA_CHECK(cudaEventSynchronize(stop_event));

    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start_event, stop_event));

    printf("Naive CUDA execution time (N=%d): %f ms\n", N, milliseconds);
    printf("In %f seconds\n", milliseconds / 1000.0f);

    CUDA_CHECK(cudaMemcpy(host_C, device_C, matrix_size, cudaMemcpyDeviceToHost));

    // Read one result so the computation has an observable use.
    volatile float prevent_optimization = host_C[0];
    (void)prevent_optimization;

    CUDA_CHECK(cudaEventDestroy(start_event));
    CUDA_CHECK(cudaEventDestroy(stop_event));

    CUDA_CHECK(cudaFree(device_A));
    CUDA_CHECK(cudaFree(device_B));
    CUDA_CHECK(cudaFree(device_C));

    free(host_A);
    free(host_B);
    free(host_C);

    return 0;
}

In [None]:
!nvcc matrix_gpu.cu -o matrix_gpu

In [None]:
!./matrix_gpu 512
!./matrix_gpu 1024
!./matrix_gpu 2048