In [None]:
# Install nvcc and the CUDA toolkit. -y pre-answers apt's confirmation prompt,
# which a notebook cell has no interactive stdin to answer.
!apt-get install -y nvidia-cuda-toolkit

In [15]:
%%writefile multithreading.cu
#include <cuda_runtime.h>

#include <cmath>
#include <cstdlib>
#include <iostream>
#include <vector>

// Evaluates a CUDA runtime call exactly once. If the call does not return
// cudaSuccess, prints the CUDA error string together with the file and line of
// the call site to stderr and terminates the process with EXIT_FAILURE.
//
// The body is wrapped in do { ... } while (0) so that `CHECK_CUDA(x);` is a
// single statement and stays valid inside an unbraced if/else. The argument is
// parenthesised, and the local uses an unlikely name so it cannot shadow a
// caller variable that appears inside `call`.
#define CHECK_CUDA(call)                                                   \
  do {                                                                     \
    const cudaError_t check_cuda_err_ = (call);                            \
    if (check_cuda_err_ != cudaSuccess) {                                  \
      std::cerr << "CUDA Error: " << cudaGetErrorString(check_cuda_err_)   \
                << " at " << __FILE__ << ":" << __LINE__ << std::endl;     \
      std::exit(EXIT_FAILURE);                                             \
    }                                                                      \
  } while (0)

// Element-wise vector addition kernel: C[i] = A[i] + B[i] for i in [0, N).
// Each thread handles at most one element, chosen by its global 1-D index.
__global__ void vectorAdd(const float* A, const float* B, float* C, int N) {
    const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads whose index falls past the end of the arrays do nothing.
    if (globalIdx >= N) {
        return;
    }

    C[globalIdx] = A[globalIdx] + B[globalIdx];
}

// Host driver: adds two 1M-element float vectors on the GPU with vectorAdd and
// validates the result against the same sum computed on the host.
//
// Prints "PASS" or "FAIL" on stdout. Returns EXIT_SUCCESS when every element
// matches and EXIT_FAILURE otherwise. Any failing CUDA runtime call terminates
// the process through CHECK_CUDA.
int main() {
    const int N = 1 << 20;           // 1M elements
    const size_t bytes = static_cast<size_t>(N) * sizeof(float);

    // 1) Allocate host memory. std::vector releases its storage automatically
    //    and throws on allocation failure instead of returning a null pointer
    //    that would be written through below.
    std::vector<float> h_A(N), h_B(N), h_C(N);

    // 2) Initialize inputs
    for (int i = 0; i < N; ++i) {
        h_A[i] = static_cast<float>(i);
        h_B[i] = static_cast<float>(2*i);
    }

    // 3) Allocate device memory
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    CHECK_CUDA(cudaMalloc(&d_A, bytes));
    CHECK_CUDA(cudaMalloc(&d_B, bytes));
    CHECK_CUDA(cudaMalloc(&d_C, bytes));

    // 4) Copy data to device
    CHECK_CUDA(cudaMemcpy(d_A, h_A.data(), bytes, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(d_B, h_B.data(), bytes, cudaMemcpyHostToDevice));

    // 5) Launch kernel with enough threads to cover N elements
    //    (ceiling division, so a partial last block is still launched)
    const int threadsPerBlock = 256;
    const int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    CHECK_CUDA(cudaGetLastError());       // errors reported for the launch itself
    CHECK_CUDA(cudaDeviceSynchronize());  // wait for the kernel; surfaces run-time errors

    // 6) Copy result back to host
    CHECK_CUDA(cudaMemcpy(h_C.data(), d_C, bytes, cudaMemcpyDeviceToHost));

    // 7) Verify every element. Checking only the first few would exercise just
    //    the first thread block and miss errors in the rest of the grid.
    bool ok = true;
    for (int i = 0; i < N; ++i) {
        const float expected = h_A[i] + h_B[i];
        if (std::fabs(h_C[i] - expected) > 1e-5) {
            ok = false;
            break;
        }
    }
    std::cout << (ok ? "PASS\n" : "FAIL\n");

    // 8) Clean up device memory (host vectors free themselves)
    CHECK_CUDA(cudaFree(d_A));
    CHECK_CUDA(cudaFree(d_B));
    CHECK_CUDA(cudaFree(d_C));

    // Report a failed verification through the exit status as well as stdout.
    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}


Writing multithreading.cu


In [12]:
import torch

# Look up the CUDA device at index 0 and report its name and compute
# capability (major.minor). `prop` is kept for inspection in the next cell.
gpu_index = 0
prop = torch.cuda.get_device_properties(gpu_index)
print(f"Name: {prop.name}, Compute: {prop.major}.{prop.minor}")

Name: Tesla T4, Compute: 7.5


In [13]:
# Bare expression: display the full device-properties repr
# (total memory, multiprocessor count, UUID, L2 cache size, ...).
prop

_CudaDeviceProperties(name='Tesla T4', major=7, minor=5, total_memory=15095MB, multi_processor_count=40, uuid=d3950b2c-c928-d970-a2e0-3e25122fef76, L2_cache_size=4MB)

In [16]:
# Compile the CUDA source for compute capability 7.5 (the Tesla T4 reported
# above): one -gencode entry for sm_75 machine code, one for compute_75 PTX.
!nvcc multithreading.cu -o multithreading \
    -std=c++14 -O2 \
    -gencode arch=compute_75,code=sm_75 \
    -gencode arch=compute_75,code=compute_75

In [17]:
# Run the compiled binary; it prints PASS or FAIL after checking the GPU result.
!./multithreading

PASS
