In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


In [2]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-pbf4wcwd
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-pbf4wcwd
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4304 sha256=1293fa8e2bf9c5b8df059a23e3f71f9af837727a8d049e49fb39763d65653d6b
  Stored in directory: /tmp/pip-ephem-wheel-cache-uvpfbhv3/wheels/f3/08/cc/e2b5b0e1c92df07dbb50a6f024a68ce090f5e7b2316b41756d
Successfully built NVCCPlugin
Installing collecte

In [3]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [10]:
%%cu
#include < iostream>   // cout, endl
#include < numeric>    // iota, fill

///////////////////////////////////////////////////////////////////////////////
// STUDENTS PART (feel free to code)
///////////////////////////////////////////////////////////////////////////////

// 2^27 float elements need 1.5 gigabytes of memory (2*0.5GB input, 0.5GB output)
// You can set numElements to 1024 if you only want to test a single block
// constexpr size_t numElements = 1024;
constexpr size_t numElements = 1UL<<27;

// write a kernel where each thread calculates the sum of two input values and
// stores the result in the output array
__global__
void add_kernel(const float * a_in, const float * b_in, float * c_out, size_t n)
{
    // your code
 
}

// if you are bored try to write the kernel where each thread calculates multiple
// additions using a for loop
// (you have to uncomment the kernel in the main function to use it)
__global__
void strided_add_kernel(const float * a_in, const float * b_in, float * c_out, size_t n)
{
    // your code

}

///////////////////////////////////////////////////////////////////////////////
// MAIN PROGRAM (take a look at what the program does)
///////////////////////////////////////////////////////////////////////////////

int main () {
    // choose GPU 0 (GTX 1080 (Pascal) 8GB RAM)
    // or GPU 1 (Titan X (Maxwell) 12GB RAM)
    cudaSetDevice(0);                                                     

    // pointers to host arrays
    float * h_a = nullptr;
    float * h_b = nullptr;
    float * h_c = nullptr;
    // pointers to device arrays
    float * d_a = nullptr;
    float * d_b = nullptr;
    float * d_c = nullptr;

    const size_t arraySize = sizeof(float)*numElements;

    // allocate pinned host memory
    cudaMallocHost(&h_a, arraySize);                                     
    cudaMallocHost(&h_b, arraySize);                                     
    cudaMallocHost(&h_c, arraySize);                                    

    // allocate device memory
    cudaMalloc(&d_a, arraySize);                                        
    cudaMalloc(&d_b, arraySize);                                        
    cudaMalloc(&d_c, arraySize);                                        

    // fill h_a and h_b with stuff
    std::iota(h_a, h_a+numElements, 0);             // (0, 1, 2, 3, ..., N-1)
    std::fill(h_b, h_b+numElements, 1);             // (1, 1, 1, 1, ..., 1)
 
    // measure time for vector addition on single-threaded host
     for (size_t index = 0; index < numElements; index++)
        h_c[index] = h_a[index] + h_b[index];
  
    // measure time for vector addition on multi-threaded host
    for (size_t index = 0; index < numElements; index++)
        h_c[index] = h_a[index] + h_b[index];
    std::cout << '\n';

     // copy data from host to device
    cudaMemcpy(d_a, h_a, arraySize, cudaMemcpyHostToDevice);             
    cudaMemcpy(d_b, h_b, arraySize, cudaMemcpyHostToDevice);             
  
    // Note, the next line is not needed in practice. However, we overwrite
    // the device vector d_c to prevent spurious false positives. As an example,
    // if another student writes the correct result to d_c and the GPU assigns
    // the same address range during your run (this happens quite often) then
    // you might pass the test below even if you process nothing!
    cudaMemset(d_c, 0, arraySize);                                      

    // invoke the kernel
    int threadsPerBlock = 1024;
    int numBlocks = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    add_kernel<<< numBlocks, threadsPerBlock >>>(d_a, d_b, d_c, numElements);
 
    // uncomment the following lines for the strided kernel
    threadsPerBlock = 1024;
    numBlocks = 1024;
    strided_add_kernel<<< numBlocks, threadsPerBlock >>>(d_a, d_b, d_c, numElements);
 
    // copy result from device to host
     cudaMemcpy(h_c, d_c, arraySize, cudaMemcpyDeviceToHost);            

     // check if result computed correctly by CUDA
    bool no_errors = true;
    for (size_t index = 0; index < numElements; index++) {
        if (h_c[index] != h_a[index] + h_b[index]) {
            std::cout << "first error at position " << index << std::endl;
            no_errors = false;
            break;
        }
    }

    // free memory allocations
    cudaFreeHost(h_a);
    cudaFreeHost(h_b);
    cudaFreeHost(h_c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
 
    if(no_errors)
        std::cout << "CUDA programming is fun!" << std::endl;
}


first error at position 0

