In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


In [2]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-r_8xvk8i
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-r_8xvk8i
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4304 sha256=9fd661cb485cfd066c7b22a352e0e8a628034dc84b202ad47ff5b8d7ad8f6e66
  Stored in directory: /tmp/pip-ephem-wheel-cache-vki2o_ft/wheels/f3/08/cc/e2b5b0e1c92df07dbb50a6f024a68ce090f5e7b2316b41756d
Successfully built NVCCPlugin
Installing collecte

In [3]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [4]:
%%cu
#include <iostream>
#include <fstream>
#include <string>
#include <cstdio>
#include <vector>

#define SDIV(x,y)(((x)+(y)-1)/(y))

#define CUERR {                                                            \
        cudaError_t err;                                                       \
        if ((err = cudaGetLastError()) != cudaSuccess) {                       \
            std::cout << "CUDA error: " << cudaGetErrorString(err) << " : "    \
                    << __FILE__ << ", line " << __LINE__ << std::endl;       \
            exit(1);                                                           \
        }                                                                      \
    }


template <
    typename index_t,
    typename value_t>
void load_binary(
    const value_t * data,
    const index_t length,
    std::string filename) {

    std::ifstream ifile(filename.c_str(), std::ios::binary);

    if(!ifile.good()) {
        throw std::runtime_error{"can't open file " + filename};
    }

    ifile.read((char*) data, sizeof(value_t)*length);
    ifile.close();
}


///////////////////////////////////////////////////////////////////////////////
// FINISHED KERNEL (you don't have to change anything)
///////////////////////////////////////////////////////////////////////////////

template <
    typename index_t,
    typename value_t,
    uint32_t chunk_size = 32>
__global__
void shared_covariance_kernel(
    const value_t * data,
    value_t * cov,
    const index_t num_entries,
    const index_t num_features)
{
    // convenience variables
    const index_t base_x = blockIdx.x*chunk_size;
    const index_t base_y = blockIdx.y*chunk_size;

    const index_t thid_y = threadIdx.y;
    const index_t thid_x = threadIdx.x;

    const index_t x = base_x + thid_x;
    const index_t y = base_y + thid_y;

    // optional early exit: -500ms
    if (base_x > base_y) return;

    // allocate shared memory
    __shared__ value_t s_cache_x[chunk_size][chunk_size];
    __shared__ value_t s_cache_y[chunk_size][chunk_size];

    // compute the number of chunks to be computed
    const index_t num_chunks = SDIV(num_entries, chunk_size);

    // accumulated value of scalar product
    value_t accum = 0;

    // for each chunk
    for (index_t chunk = 0; chunk < num_chunks; chunk++) {

            // assign thread IDs to rows and columns
            const index_t row   = thid_y + chunk*chunk_size;
            const index_t col_x = thid_x + base_x;
            const index_t col_y = thid_x + base_y;

            // check if valid row or column indices
            const bool valid_row   = row   < num_entries;
            const bool valid_col_x = col_x < num_features;
            const bool valid_col_y = col_y < num_features;

            // fill shared memory with tiles where thid_y enumerates
            // image identifiers (entries) and thid_x denotes feature
            // coordinates (pixels). s_cache_x corresponds to x and
            // s_cache_y to y where cov[x,y] is the pairwise covariance
            s_cache_x[thid_y][thid_x] = valid_row*valid_col_x ?
                                      data[row*num_features+col_x] : 0;
            s_cache_y[thid_y][thid_x] = valid_row*valid_col_y ?
                                      data[row*num_features+col_y] : 0;

            // this is needed to ensure that all threads finished writing
            // shared memory
            __syncthreads();

            // optional early exit: -100ms
            if (x <= y)
                // here we actually evaluate the scalar product
                for (index_t entry = 0; entry < chunk_size; entry++)
                    accum += s_cache_y[entry][thid_y]*s_cache_x[entry][thid_x];

            // this is needed to ensure that shared memory can be over-
            // written again in the next iteration
            __syncthreads();
    }

    // since cov[x,y] = cov[y,x] we only compute one entry
    if (y < num_features && x <= y)
        cov[y*num_features+x] =
        cov[x*num_features+y] = accum;//num_entries;

}

///////////////////////////////////////////////////////////////////////////////
// DATA STRUCTURES
///////////////////////////////////////////////////////////////////////////////

template <uint64_t num_gpus, uint64_t num_streams>
struct partition {
    static constexpr uint64_t num_slots = num_gpus*num_streams;
    uint64_t offsets[num_slots];
    uint64_t counts[num_slots];

    partition(uint64_t length) {

        uint64_t batch_size = (length+num_slots-1)/num_slots;

        for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
            for (uint64_t stream = 0; stream < num_streams; stream++) {
                const uint64_t slot = gpu*num_streams+stream;
                const uint64_t lower = slot*batch_size;
                const uint64_t upper = std::min(lower+batch_size, length);
                const uint64_t count = upper-lower;

                offsets[slot] = lower;
                counts[slot] = count;
            }
        }
    }

    uint64_t get_size(uint64_t gpu, uint64_t stream) const {
        return counts[gpu*num_streams+stream];
    }

    uint64_t get_offset(uint64_t gpu, uint64_t stream) const {
        return offsets[gpu*num_streams+stream];
    }
};

///////////////////////////////////////////////////////////////////////////////
// MAIN PROGRAM (take a look at what the program does)
///////////////////////////////////////////////////////////////////////////////

int main () {

    constexpr uint64_t num_images = 10000, num_rows = 55, num_cols = 45;
    constexpr uint64_t num_pixels = num_rows * num_cols;

    // pointer for data matrix and mean vector
    float * h_data = nullptr;
    cudaMallocHost(&h_data, sizeof(float)*num_images*num_pixels); 
    load_binary(h_data, num_images*num_pixels, "/content/celebA_centered.10000.bin");
                
    std::cout << "Load input file" << std::endl;
    constexpr uint64_t num_gpus = 2, num_streams = 4;
    partition< num_gpus, num_streams > part(num_images);

    float * d_data[num_gpus][num_streams];
    float * d_cov [num_gpus][num_streams];
    float * h_cov [num_gpus][num_streams];

    cudaStream_t streams[num_gpus][num_streams];

    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
        cudaSetDevice(0);
        for (uint64_t stream = 0; stream < num_streams; stream++) {

            const uint64_t part_size = part.get_size(gpu, stream);
            const uint64_t part_bytes = sizeof(float)*part_size*num_pixels;
            const uint64_t cov_bytes = sizeof(float)*num_pixels*num_pixels;

            cudaStreamCreate(&streams[gpu][stream]);
            cudaMalloc    (&d_data[gpu][stream], part_bytes); CUERR
            cudaMalloc    (&d_cov[gpu][stream], cov_bytes); CUERR
            cudaMallocHost(&h_cov[gpu][stream], cov_bytes); CUERR
            cudaMemset    (d_data[gpu][stream], 0, part_bytes); CUERR
            cudaMemset    (d_cov[gpu][stream], 0, cov_bytes); CUERR
            cudaMemset    (h_cov[gpu][stream], 0, cov_bytes); CUERR
        }
    } 
   std::cout << "Memory and streams init" << std::endl;
 
    ///////////////////////////////////////////////////////////////////////////
    // STUDENTS PART (fill in the gaps)
    ///////////////////////////////////////////////////////////////////////////
    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
        cudaSetDevice(0);
        for (uint64_t stream = 0; stream < num_streams; stream++) {
            // offset where the part begins in h_data
            const uint64_t part_offset = part.get_offset(gpu, stream) * num_pixels;
            // number of images in the part
            const uint64_t part_size   = part.get_size(gpu, stream);

            const uint64_t part_bytes  = sizeof(float)*part_size*num_pixels;

            ///////////////////////////////////////////////////////////////////
            // copy data from h_data to d_data using a different stream for each part
            //cudaMemcpyAsync(...);
            cudaMemcpyAsync(d_data[gpu][stream], h_data+part_offset, part_bytes,
                            cudaMemcpyHostToDevice, streams[gpu][stream]);
            ///////////////////////////////////////////////////////////////////
  
            const dim3 blocks(SDIV(num_pixels, 32), SDIV(num_pixels, 32));
            const dim3 threads(32, 32);
            ///////////////////////////////////////////////////////////////////
            // call shared_covariance_kernel in the stream
            //shared_covariance_kernel<<<...>>>
            //   (d_data[gpu][stream], d_cov[gpu][stream], part_size, num_pixels);
            shared_covariance_kernel<<< blocks, threads, 0, streams[gpu][stream] >>>
                (d_data[gpu][stream], d_cov[gpu][stream], part_size, num_pixels);
            ///////////////////////////////////////////////////////////////////
   
            const uint64_t cov_bytes = sizeof(float)*num_pixels*num_pixels;
            ///////////////////////////////////////////////////////////////////
            // copy results from d_cov to h_cov gpu using the stream
            //cudaMemcpyAsync(...);
            cudaMemcpyAsync(h_cov[gpu][stream], d_cov[gpu][stream], cov_bytes,
                            cudaMemcpyDeviceToHost, streams[gpu][stream]);
            ///////////////////////////////////////////////////////////////////
         }
    } 
    ///////////////////////////////////////////////////////////////////////////
    std::cout << "Computation" << std::endl;
 

    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
        cudaSetDevice(0);
        for (uint64_t stream = 0; stream < num_streams; stream++)
            cudaStreamSynchronize(streams[gpu][stream]);                  
    }
   std::cout << "Stream sync" << std::endl;
 
    std::vector<float> h_result(num_pixels*num_pixels);  
    std::vector<float> h_truth(num_pixels*num_pixels);  
    //float * h_result = nullptr, 
    //float * h_truth = nullptr;
    //cudaMallocHost(&h_result, sizeof(float)*num_pixels*num_pixels);       
    //cudaMallocHost(&h_truth,  sizeof(float)*num_pixels*num_pixels);      
    //cudaMemset(h_result, 0, sizeof(float)*num_pixels*num_pixels);         
 
    for (uint64_t i = 0; i < num_pixels*num_pixels; i++)
        for (uint64_t gpu = 0; gpu < num_gpus; gpu++)
            for (uint64_t stream = 0; stream < num_streams; stream++) {
                h_result[i] += h_cov[gpu][stream][i];
            }
             
   std::cout << "Add partials" << std::endl;
 
    for (uint64_t i = 0; i < num_pixels*num_pixels; i++)
        h_result[i] /= num_images;

   std::cout << "Start error check" << std::endl;

    load_binary(h_truth.data(), num_pixels*num_pixels, "/content/celebA_covariance.10000.bin");
    std::cout << "Loaded truth file" << std::endl;
    bool no_errors = true;
    for (uint64_t i = 0; i < num_pixels*num_pixels; i++) {
        const auto res  = h_result[i] -  h_truth[i];
        if (res*res > 10) {
            std::cout <<  "ERROR: " <<  h_result[i] << " " << h_truth[i]
                      << " " << (i % (num_pixels))
                      << " " << (i / (num_pixels)) << std::endl;
            no_errors = false;
            break;
        }
    }
   std::cout << "End error check" << std::endl;
 
    for(uint64_t gpu = 0; gpu < num_gpus; gpu++) {
        cudaSetDevice(0);
        for (uint64_t stream = 0; stream < num_streams; stream++) {
            cudaFree    (d_data[gpu][stream]);
            cudaFree    (d_cov[gpu][stream]);
            cudaFreeHost(h_cov[gpu][stream]);
            cudaStreamSynchronize(streams[gpu][stream]);
            cudaStreamDestroy(streams[gpu][stream]);
        }
    } 

    cudaFreeHost(h_data);                                                 
    std::cout << '\n';

    if(no_errors)
        std::cout << "CUDA is fun" << std::endl;

}

terminate called after throwing an instance of 'std::runtime_error'
  what():  can't open file /content/celebA_centered.10000.bin

