In [13]:
 !nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [14]:
!git clone https://github.com/andreinechaev/nvcc4jupyter.git

fatal: destination path 'nvcc4jupyter' already exists and is not an empty directory.


In [15]:
!git clone https://github.com/andreinechaev/nvcc4jupyter.git

fatal: destination path 'nvcc4jupyter' already exists and is not an empty directory.


In [16]:
import sys
sys.path.append('/content/nvcc4jupyter')

In [17]:
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [18]:
%%cu

#include <stdio.h>
#include <cuda_runtime_api.h>

/******************************************************************************
  Very simple CUDA program that shows the principles of copying data to and
  from a GPU and dynamic memory allocation on a GPU. The standard pattern for
  a lot of GPU work is:

    1) Prepare the data on the host part of the program. By host we mean the
       the CPU. In this case, the h_n integer is set to 19. The h_ prefix
       indicates a host variable, i.e. one that we will use with the CPU side
       of the program.
    2) Allocate memory on the device. By device we mean GPU. In this case a
       single integer, identified by d_n, is allocated using cudaMalloc. The
       d_ prefix indicates a device variable, i.e. one that we will use with
       the GPU side of the program.
    3) Transfer data from the host to device. In this case cudaMemcpy is used
       to copy the contents of h_n to d_n.
    4) The kernel function is invoked. In this case the kernel function is
       called kernel and is defined as __global__ which means a function
       that will execute on the device but is invoked from the host. The
       <<<1,1>>> part indicates that we want to execute the kernel with one
       thread block consisting of one thread. The kernel function here will
       only be invoked once in total.
    5) The kernel function is executed, which in this case sets the contents
       of the memory pointed to by d_n to 97.
    6) Data is copied from the device to the host. In this case the contents
       of memory pointed to by d_n are copied into the h_n variable.
    7) Dynamically allocated memory is freed using cudaFree.
    8) Results are output. In this case the value of h_n is printed, and if
       all goes well should print 97.

  CUDA functions return an integer code. If this code is not equal to zero
  something has gone wrong. cudaGetErrorString returns a description of an
  error given its code This program is rather paranoid and checks
  the return codes of all call CUDA function calls and terminates the program
  if zero was not returned.

  To compile:
    nvcc -o 01 01.cu

  Dr Kevan Buckley, University of Wolverhampton, 2018
******************************************************************************/

__global__ void kernel(int *n){
  *n = 97; // this is an arbitary number, just to see some results.
}

int main() {
  cudaError_t error;
  int *d_n;
  int h_n = 19;

  error = cudaMalloc(&d_n, sizeof(int));
  if(error){
    fprintf(stderr, "cudaMalloc on d_n returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaMemcpy(d_n, &h_n, sizeof(int), cudaMemcpyHostToDevice);
  if(error){
    fprintf(stderr, "cudaMemcpy to d_n returned %d %s\n", error,
      cudaGetErrorString(error));
  }

  kernel <<<1,1>>>(d_n);
  cudaThreadSynchronize();

  error = cudaMemcpy(&h_n, d_n, sizeof(int), cudaMemcpyDeviceToHost);
  if(error){
    fprintf(stderr, "cudaMemcpy to h_n returned %d %s\n", error,
      cudaGetErrorString(error));
  }

  error = cudaFree(d_n);
  if(error){
    fprintf(stderr, "cudaFree on d_n returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  printf("result: h_n = %d\n", h_n);
  return 0;
}

result: h_n = 97



In [19]:
%%cu

#include <stdio.h>
#include <cuda_runtime_api.h>

/******************************************************************************
  Very simple CUDA program that shows the principles of copying data to and
  from a GPU and dynamic memory allocation on a GPU. The standard pattern for
  a lot of GPU work:

    1) Prepare the data on the host part of the program. By host we mean the
       the CPU. In this case, the h_n integer is set to 19. The h_ prefix
       indicates a host variable, i.e. one that we will use with the CPU side
       of the program.
    2) Allocate memory on the device. By device we mean GPU. In this case a
       single integer, identified by d_n, is allocated using cudaMalloc. The
       d_ prefix indicates a device variable, i.e. one that we will use with
       the GPU side of the program.
    3) Transfer data from the host to device. In this case cudaMemcpy is used
       to copy the contents of h_n to d_n.
    4) The kernel function is invoked. In this case the kernel function is
       called kernel and is defined as __global__ which means a function
       that will execute on the device but is invoked from the host. The
       <<<1,1>>> part indicates that we want to execute the kernel with one
       thread block consisting of one thread. The kernel function here will
       only be invoked once in total.
    5) The kernel function is executed, which in this case sets the contents
       of the memory pointed to by d_n to 97.
    6) Data is copied from the device to the host. In this case the contents
       of memory pointed to by d_n are copied into the h_n variable.
    7) Dynamically allocated memory is freed using cudaFree.
    8) Results are output. In this case the value of h_n is printed, and if
       all goes well should print 97.

  This version does no checking for errors. Its purpose is to show the main
  functionality. For the "real" version of the program error checking should be
  included. See 01.cu for this.

  To compile:
    nvcc 01b.cu

  Dr Kevan Buckley, University of Wolverhampton, 2018
******************************************************************************/

__global__ void kernel(int *n){
  *n = 97; // this is an arbitary number, just to see some results.
}

int main() {
  int *d_n;
  int h_n = 19;

  cudaMalloc(&d_n, sizeof(int));

  cudaMemcpy(d_n, &h_n, sizeof(int), cudaMemcpyHostToDevice);

  kernel <<<1,1>>>(d_n);

  cudaThreadSynchronize();

  cudaMemcpy(&h_n, d_n, sizeof(int), cudaMemcpyDeviceToHost);

  cudaFree(d_n);

  printf("result: h_n = %d\n", h_n);
  return 0;
}



result: h_n = 97



In [21]:
%%cu

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <math.h>
#include <cuda_runtime_api.h>

/******************************************************************************
  The main hinderance to programming Tesla architecture GPUs is that they are
  "running blind" - you cannot use printf to output results so debugging is
  very difficult. With the introduction of the Fermi architecture printf was
  enabled. This program needs to use printf whilst exploring thread ids so
  cannot be run on GPUs with compute capability less than 2.0. A compiler
  directive to enforce this is shown below.

  cudaThreadSynchronize() ensures that all GPU threads have completed execution
  before the host code continues.

  The code demonstrates thread indexing. Threads are grouped into blocks.
  Blocks are grouped into a grid. Both the grid and blocks are 3 dimensional
  so a specific thread needs to be indexed using the x, y, z index of
  the block in the grid and the x, y, z index of the block in the thread.

  The first two examples use default 1 dimensional grid and block, so only the
  x components of the indices is important. The other 2 examples demonstrate
  using three dimensional grid and blocks respectively.

  The long printf call in the kernel cannot be broken into two calls or the
  output becomes interleaved.

  Be careful not to run a program that will call printf too many times as this
  can fill buffers and cause a system crash.

  To compile:
    nvcc -o 02 02.cu

  Dr Kevan Buckley, University of Wolverhampton, 2018
******************************************************************************/

__global__ void kernel(){
  printf(
    "blockIdx.x=%-5d blockIdx.y=%-5d blockIdx.z=%-5d threadIdx.x=%-5d threadIdx.y=%-5d threadIdx.z=%-5d\n",
    blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z);
}

int main() {

  printf("Running with kernel <<<3,2>>>()\n");
  kernel <<<3,2>>>();
  cudaThreadSynchronize();

  printf("\nRunning with kernel <<<2,4>>>()\n");
  kernel <<<2,4>>>();
  cudaThreadSynchronize();

  dim3 dim(2, 3, 4);
  printf("\nRunning with kernel <<<dim,2>>>()\n");
  kernel <<<dim,2>>>();
  cudaThreadSynchronize();

  printf("\nRunning with kernel <<<2, dim>>>()\n");
  kernel <<<2, dim>>>();
  cudaThreadSynchronize();

  return 0;
}



Running with kernel <<<3,2>>>()
blockIdx.x=2     blockIdx.y=0     blockIdx.z=0     threadIdx.x=0     threadIdx.y=0     threadIdx.z=0    
blockIdx.x=2     blockIdx.y=0     blockIdx.z=0     threadIdx.x=1     threadIdx.y=0     threadIdx.z=0    
blockIdx.x=0     blockIdx.y=0     blockIdx.z=0     threadIdx.x=0     threadIdx.y=0     threadIdx.z=0    
blockIdx.x=0     blockIdx.y=0     blockIdx.z=0     threadIdx.x=1     threadIdx.y=0     threadIdx.z=0    
blockIdx.x=1     blockIdx.y=0     blockIdx.z=0     threadIdx.x=0     threadIdx.y=0     threadIdx.z=0    
blockIdx.x=1     blockIdx.y=0     blockIdx.z=0     threadIdx.x=1     threadIdx.y=0     threadIdx.z=0    

Running with kernel <<<2,4>>>()
blockIdx.x=0     blockIdx.y=0     blockIdx.z=0     threadIdx.x=0     threadIdx.y=0     threadIdx.z=0    
blockIdx.x=0     blockIdx.y=0     blockIdx.z=0     threadIdx.x=1     threadIdx.y=0     threadIdx.z=0    
blockIdx.x=0     blockIdx.y=0     blockIdx.z=0     threadIdx.x=2     threadIdx.y=0     threadId

In [22]:
%%cu

#include <stdio.h>
#include <cuda_runtime_api.h>

/******************************************************************************
  This program adds two arrays of integers together and stores the results in
  another array. As the arrays are quite small then a single block can be used
  for all threads. Memory needs to be allocated on the device before copying
  the arrays to it. On completion of the execution only the result array needs
  to be copied back to the host. See next example for exactly the same thing
  but with the error checking removed. That version will enable you to see the
  sequence of operations more easily.

  Compile with:
    nvcc -o 03 03.cu

  Dr Kevan Buckley, University of Wolverhampton, 2018
******************************************************************************/

int h_a[] = {253, 215, 223, 116, 90, 184, 119, 180, 150, 175, 175, 18, 70, 18,
           103, 183, 247, 99, 175, 71, 230, 22, 75, 146, 87, 27, 157, 22, 176,
           109, 190, 182, 65, 146, 252, 49, 153, 181, 247, 11, 1, 13, 171, 159,
           170, 205, 222, 46, 64, 134, 56, 191, 149, 64, 0, 174, 204, 118, 22,
           51, 14, 7, 20, 25, 3, 226, 15, 216, 99, 113, 10, 151, 41, 189, 204,
           198, 120, 92, 64, 97, 231, 185, 198, 118, 225, 197, 60, 252, 189,
           186, 161, 81, 18, 243, 25, 233, 38, 212, 49, 173, 155, 113, 233,
           56, 252, 134, 40, 16, 80, 192, 79, 50, 67, 158, 241, 231, 19, 165,
           212, 76, 192, 161, 136, 224, 43, 39, 156, 27};
int h_b[] = {135, 113, 155, 52, 145, 172, 55, 112, 121, 248, 84, 216, 186, 111,
           107, 135, 149, 111, 184, 188, 60, 8, 238, 30, 35, 132, 210, 229,
           153, 126, 8, 27, 21, 134, 250, 166, 240, 226, 121, 132, 221, 175,
           247, 185, 68, 98, 178, 43, 65, 165, 1, 187, 16, 172, 251, 9, 191,
           101, 193, 241, 167, 16, 108, 231, 117, 234, 59, 194, 164, 168,
           242, 73, 202, 238, 211, 42, 92, 202, 202, 223, 5, 186, 220, 171,
           165, 111, 45, 212, 79, 64, 235, 47, 245, 207, 20, 164, 189, 163,
           160, 129, 27, 22, 16, 88, 58, 10, 149, 254, 52, 57, 167, 138, 71,
           132, 183, 228, 178, 60, 190, 32, 23, 175, 193, 160, 250, 216, 145,
           147};
int h_c[128];

int *d_a, *d_b, *d_c;

__global__ void kernel(int *a, int *b, int *c){
  int i = threadIdx.x;
  c[i] = a[i] + b[i];
}

int main() {
  cudaError_t error;

  error = cudaMalloc(&d_a, sizeof(int) * 128);
  if(error){
    fprintf(stderr, "cudaMalloc on d_a returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaMalloc(&d_b, sizeof(int) * 128);
  if(error){
    fprintf(stderr, "cudaMalloc on d_b returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaMalloc(&d_c, sizeof(int) * 128);
  if(error){
    fprintf(stderr, "cudaMalloc on d_c returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaMemcpy(d_a, &h_a, sizeof(int) * 128, cudaMemcpyHostToDevice);
  if(error){
    fprintf(stderr, "cudaMemcpy to d_b returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaMemcpy(d_b, &h_b, sizeof(int) * 128, cudaMemcpyHostToDevice);
  if(error){
    fprintf(stderr, "cudaMemcpy to d_b returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  kernel <<<1,128>>>(d_a, d_b, d_c);
  cudaThreadSynchronize();

  error = cudaMemcpy(h_c, d_c, sizeof(int) * 128, cudaMemcpyDeviceToHost);
  if(error){
    fprintf(stderr, "cudaMemcpy to h_c returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaFree(d_a);
  if(error){
    fprintf(stderr, "cudaFree on d_a returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaFree(d_b);
  if(error){
    fprintf(stderr, "cudaFree on d_b returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaFree(d_c);
  if(error){
    fprintf(stderr, "cudaFree on d_c returned %d %s\n", error,
      cudaGetErrorString(error));;
    exit(1);
  }

  int i;
  for(i=0;i<128;i++){
    printf("%-3d + %-3d = %-4d\n", h_a[i], h_b[i], h_c[i]);
  }
  return 0;
}

253 + 135 = 388 
215 + 113 = 328 
223 + 155 = 378 
116 + 52  = 168 
90  + 145 = 235 
184 + 172 = 356 
119 + 55  = 174 
180 + 112 = 292 
150 + 121 = 271 
175 + 248 = 423 
175 + 84  = 259 
18  + 216 = 234 
70  + 186 = 256 
18  + 111 = 129 
103 + 107 = 210 
183 + 135 = 318 
247 + 149 = 396 
99  + 111 = 210 
175 + 184 = 359 
71  + 188 = 259 
230 + 60  = 290 
22  + 8   = 30  
75  + 238 = 313 
146 + 30  = 176 
87  + 35  = 122 
27  + 132 = 159 
157 + 210 = 367 
22  + 229 = 251 
176 + 153 = 329 
109 + 126 = 235 
190 + 8   = 198 
182 + 27  = 209 
65  + 21  = 86  
146 + 134 = 280 
252 + 250 = 502 
49  + 166 = 215 
153 + 240 = 393 
181 + 226 = 407 
247 + 121 = 368 
11  + 132 = 143 
1   + 221 = 222 
13  + 175 = 188 
171 + 247 = 418 
159 + 185 = 344 
170 + 68  = 238 
205 + 98  = 303 
222 + 178 = 400 
46  + 43  = 89  
64  + 65  = 129 
134 + 165 = 299 
56  + 1   = 57  
191 + 187 = 378 
149 + 16  = 165 
64  + 172 = 236 
0   + 251 = 251 
174 + 9   = 183 
204 + 191 = 395 
118 + 101 = 219 
22  + 193 = 21

In [23]:
%%cu

#include <stdio.h>
#include <cuda_runtime_api.h>

/******************************************************************************
  This program adds two arrays of integers together and stores the results in
  another array. See the previous example for exactly the same thing but with
  error checking not included. This version will enable you to see the
  sequence of operations more easily.

  Make a summary of the algorithmm in high level terms so that you can use it
  as a basis for your own work in future.

  Compile with:
    nvcc -o 04 04.cu

  Dr Kevan Buckley, University of Wolverhampton, 2018
******************************************************************************/

int h_a[] = {253, 215, 223, 116, 90, 184, 119, 180, 150, 175, 175, 18, 70, 18,
           103, 183, 247, 99, 175, 71, 230, 22, 75, 146, 87, 27, 157, 22, 176,
           109, 190, 182, 65, 146, 252, 49, 153, 181, 247, 11, 1, 13, 171, 159,
           170, 205, 222, 46, 64, 134, 56, 191, 149, 64, 0, 174, 204, 118, 22,
           51, 14, 7, 20, 25, 3, 226, 15, 216, 99, 113, 10, 151, 41, 189, 204,
           198, 120, 92, 64, 97, 231, 185, 198, 118, 225, 197, 60, 252, 189,
           186, 161, 81, 18, 243, 25, 233, 38, 212, 49, 173, 155, 113, 233,
           56, 252, 134, 40, 16, 80, 192, 79, 50, 67, 158, 241, 231, 19, 165,
           212, 76, 192, 161, 136, 224, 43, 39, 156, 27};
int h_b[] = {135, 113, 155, 52, 145, 172, 55, 112, 121, 248, 84, 216, 186, 111,
           107, 135, 149, 111, 184, 188, 60, 8, 238, 30, 35, 132, 210, 229,
           153, 126, 8, 27, 21, 134, 250, 166, 240, 226, 121, 132, 221, 175,
           247, 185, 68, 98, 178, 43, 65, 165, 1, 187, 16, 172, 251, 9, 191,
           101, 193, 241, 167, 16, 108, 231, 117, 234, 59, 194, 164, 168,
           242, 73, 202, 238, 211, 42, 92, 202, 202, 223, 5, 186, 220, 171,
           165, 111, 45, 212, 79, 64, 235, 47, 245, 207, 20, 164, 189, 163,
           160, 129, 27, 22, 16, 88, 58, 10, 149, 254, 52, 57, 167, 138, 71,
           132, 183, 228, 178, 60, 190, 32, 23, 175, 193, 160, 250, 216, 145,
           147};
int h_c[128];

int *d_a, *d_b, *d_c;

__global__ void kernel(int *a, int *b, int *c){
  int i = threadIdx.x;
  c[i] = a[i] + b[i];
}

int main() {
  cudaMalloc(&d_a, sizeof(int) * 128);
  cudaMalloc(&d_b, sizeof(int) * 128);
  cudaMalloc(&d_c, sizeof(int) * 128);

  cudaMemcpy(d_a, &h_a, sizeof(int) * 128, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, &h_b, sizeof(int) * 128, cudaMemcpyHostToDevice);

  kernel <<<1,128>>>(d_a, d_b, d_c);
  cudaThreadSynchronize();

  cudaMemcpy(h_c, d_c, sizeof(int) * 128, cudaMemcpyDeviceToHost);

  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);

  int i;
  for(i=0;i<128;i++){
    printf("%-3d + %-3d = %-4d\n", h_a[i], h_b[i], h_c[i]);
  }
  return 0;
}



253 + 135 = 388 
215 + 113 = 328 
223 + 155 = 378 
116 + 52  = 168 
90  + 145 = 235 
184 + 172 = 356 
119 + 55  = 174 
180 + 112 = 292 
150 + 121 = 271 
175 + 248 = 423 
175 + 84  = 259 
18  + 216 = 234 
70  + 186 = 256 
18  + 111 = 129 
103 + 107 = 210 
183 + 135 = 318 
247 + 149 = 396 
99  + 111 = 210 
175 + 184 = 359 
71  + 188 = 259 
230 + 60  = 290 
22  + 8   = 30  
75  + 238 = 313 
146 + 30  = 176 
87  + 35  = 122 
27  + 132 = 159 
157 + 210 = 367 
22  + 229 = 251 
176 + 153 = 329 
109 + 126 = 235 
190 + 8   = 198 
182 + 27  = 209 
65  + 21  = 86  
146 + 134 = 280 
252 + 250 = 502 
49  + 166 = 215 
153 + 240 = 393 
181 + 226 = 407 
247 + 121 = 368 
11  + 132 = 143 
1   + 221 = 222 
13  + 175 = 188 
171 + 247 = 418 
159 + 185 = 344 
170 + 68  = 238 
205 + 98  = 303 
222 + 178 = 400 
46  + 43  = 89  
64  + 65  = 129 
134 + 165 = 299 
56  + 1   = 57  
191 + 187 = 378 
149 + 16  = 165 
64  + 172 = 236 
0   + 251 = 251 
174 + 9   = 183 
204 + 191 = 395 
118 + 101 = 219 
22  + 193 = 21

In [25]:
%%cu

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <math.h>
#include <cuda_runtime_api.h>

/******************************************************************************
  This program adds two arrays of integers together and stores the results in
  another array. As the arrays are quite big there will be a problem.
  At the time of writing most GPUs have a limit of 512 threads per block
  so the kernel launch will fail (unless better GPUs are in place).
  Run this program to see what a kernel launch failure looks like. Note this
  program has a big memory leak.

  Update for 2018 - most of our GPUs allow 1024 threads per block. Whilst it
  still causes a kernel launch failure, this may not be true when utilising
  new hardware in the future.

  Compile with:
    nvcc -o 05 05.cu

  Dr Kevan Buckley, University of Wolverhampton, March 2015, Updated 2018
******************************************************************************/

int h_a[]=
{215,100,200,204,233,50,85,196,71,141,122,160,93,131,243,234,162,183,36,155,4,62,
35,205,40,102,33,27,255,55,131,214,156,75,163,134,126,249,74,197,134,197,102,
228,72,90,206,235,17,243,134,22,49,169,227,89,16,5,117,16,60,248,230,217,68,138,
96,194,131,170,136,10,112,238,238,184,72,189,163,90,176,42,112,225,212,84,58,
228,89,175,244,150,168,219,112,236,101,208,175,233,123,55,243,235,37,225,164,
110,158,71,201,78,114,57,48,70,142,106,43,232,26,32,126,194,252,239,175,98,191,
94,75,59,149,62,39,187,32,203,42,190,19,243,13,133,45,61,204,187,168,247,163,
194,23,34,133,20,17,52,118,209,146,193,13,40,255,52,227,32,255,13,222,18,1,236,
152,46,41,100,233,209,91,141,148,115,175,25,135,193,77,254,147,224,191,161,9,
191,213,236,223,212,250,190,231,251,170,127,41,212,227,19,166,63,161,58,179,81,
84,59,18,162,57,166,130,248,71,139,184,28,120,151,241,115,86,217,111,0,88,153,
213,59,172,123,123,78,182,46,159,10,105,178,172,163,88,47,155,160,187,84,189,51,
235,175,167,65,136,22,66,224,175,23,28,92,147,151,170,73,198,73,84,48,251,0,211,
84,48,111,245,235,195,178,31,175,98,198,241,234,220,52,203,140,76,231,232,223,
127,147,41,70,221,126,118,217,126,74,46,175,186,35,154,126,214,185,45,56,127,31,
35,92,83,238,232,159,214,209,126,85,100,168,155,66,38,18,27,165,93,73,84,23,109,
239,149,67,168,195,124,40,226,160,132,53,142,109,212,100,62,83,186,163,252,86,
229,34,105,1,200,198,75,29,221,184,12,114,252,181,53,121,221,24,25,98,77,168,
207,33,13,13,117,199,177,113,30,150,148,135,152,92,77,227,122,43,156,134,158,
152,59,212,17,25,236,43,123,57,211,74,91,224,88,208,168,9,65,199,160,214,78,56,
50,156,28,172,200,184,51,102,80,111,59,98,136,39,142,3,97,97,78,188,66,166,141,
235,175,207,178,79,165,1,136,216,158,164,132,102,92,184,205,173,39,8,16,175,48,
158,179,145,0,1,78,66,167,219,46,87,170,225,167,80,226,47,40,128,212,172,231,48,
100,180,222,140,189,238,59,237,141,238,126,141,240,204,208,152,168,254,239,83,
223,150,163,194,198,203,67,154,120,42,203,221,223,170,105,156,152,165,137,37,
148,8,179,132,213,131,28,125,130,12,208,98,163,115,36,105,63,104,4,183,146,208,
149,114,122,254,15,19,164,152,56,56,161,236,188,118,112,217,243,242,230,196,85,
137,56,122,243,119,226,247,47,117,199,196,231,65,194,246,84,103,143,141,159,48,
122,92,167,234,53,155,221,28,95,50,165,151,173,152,15,143,144,62,4,88,2,236,153,
197,227,238,44,114,124,203,163,247,39,74,225,93,230,191,121,69,242,31,221,159,
183,236,47,72,42,51,160,45,32,58,242,3,41,30,118,166,234,25,157,18,100,127,111,
75,62,233,144,48,8,110,208,192,91,255,9,134,51,169,179,83,227,165,87,12,196,205,
178,174,231,80,192,76,207,48,151,13,25,40,62,34,150,14,227,242,14,236,120,65,
150,43,149,121,208,237,134,149,186,57,67,162,137,4,238,88,52,133,102,78,174,165,
113,68,180,85,54,194,66,174,4,216,218,153,82,170,134,217,64,65,18,131,227,156,
135,210,245,188,88,92,11,6,1,124,74,181,209,129,119,20,48,123,236,11,21,62,182,
156,23,246,222,42,121,193,199,1,148,188,190,236,24,201,242,25,70,61,207,24,191,
70,44,240,194,24,251,216,87,177,116,111,167,82,153,33,20,96,35,168,29,225,149,
53,171,135,79,241,196,31,9,131,102,54,115,41,78,111,1,166,32,118,21,199,201,175,
233,222,16,12,134,45,237,28,99,152,163,179,138,104,210,147,235,56,202,95,97,24,
206,99,191,239,217,212,182,162,132,159,128,148,171,7,193,153,35,36,50,199,216,
188,47,170,80,27,227,26,122,69,51,73,168,242,56,129,199,239,36,75,143,164,223,
59,172,161,213,208,197,7,151,158,195,198,72,19,225,44,45,92,113,96,165,25,83,
222,155,26,206,191,102,93,100,69,153,17,230,110,225,172,117,120,74,57,62,147,77,
32,191,122,124,49,219,34,75,47,0,230,73,207,166,176,44,11,245,198,28,220,53,254,
137,170,119,212,228,182,103,49,214,39,172,82,88,136,117,163,183,117,138,1,68,49,
177,113,60,167,56,89,131,109,87,12,24,207,225,252,133,72,46,91,112,218,174,201,
98,35,109,26,153,247,27,222,40,205,80,101,116,136,190,248,245,22,5,13,229,230,
10,107,47,56,199,159,18,118,104,117,154,213,144,51,205,172,18,246,121,98,91,238,
234,26,230,224,49,235,237,22,210,247,130,1,48,73,160,67,191,9,184,89,223,72,141,
172,245,159,163,110,2,254,93,236,25,67,205,74,47,186,97,1,178,227,2,226,45,163,
37,236,172,222,70,140,39,212,57,28,115,220,139,118,219,232,98,244,44,48,63,92,
234,160,93,157,132,96,128,177,4,166,158,177,133,229,61,172,185,118,201,45,83,85,
163,46,62,6,35,106,54,98,199,33,3,37,190,135,134,63,57,138,229,216,60,106,189,
122,23,118,240,225,163,68,54,71,114,117,77,150,224,131,248,167,164,252,204,99,
131,82,162,189,221,136,149,25,243,82,147,11,201,132,236,109,200,35,180,59,152,2,
209,121,133,202,32,42,198,236,142,74,63,49,8,28,186,157,54,173,240,201,184,186,
78,165,39,23,200,220,82,97,222,36,218,99,238,251,142,181,232,29,0,39,78,8,68,8,
165,122,182,150,68,111,80,146,20,119,169,221,83,252,63,49,32,25,149,15,21,36,
196,253,65,196,36,144,204,105,153,114,227,79,9,40,191,89,187,211,209,100,177,37,
97,240,87,129,10,236,145,31,17,86,28,82,27,65,227,232,171,124,90,143,204,100,
183,140,189,114,95,142,215,17,180,56,1,11,186,11,248,75,43,9,162,72,92,189,137,
63,165,52,188,0,195,137,100,123,21,34,237,117,177,196,134,101,253,136,112,183,
148,104,3,191,113,165,7,206,99,145,14,8,198,202,9,138,84,109,5,105,144,242,223,
65,183,102,166,180,238,22,108,131,127,112,67,241,22,74,191,121,220,205,130,162,
152,139,44,237,249,50,87,137,36,54,202,220,156,113,145,139,136,254,15,7,110,82,
249,132,156,184,254,121,134,128,27,31,11,72,12,5,122,99,142,159,154,89,124,55,
203,13,195,83,12,210,91,122,36,84,255,193,13,254,58,148,126,86,179,138,159,192,
143,26,36,30};

int h_b[] =
{179,207,22,31,89,108,179,16,150,164,253,75,69,17,243,97,82,253,215,70,152,142,
217,47,101,227,217,81,26,11,165,205,218,187,236,51,39,160,68,190,68,66,9,138,84,
253,235,166,250,195,237,146,82,198,194,183,170,155,9,196,167,174,146,130,106,
127,182,146,31,250,80,100,61,90,238,145,87,218,56,81,157,38,228,239,236,166,167,
151,66,176,91,233,95,238,107,201,109,33,91,141,28,171,241,90,5,223,235,93,185,
36,174,87,75,146,71,55,57,238,206,123,159,42,101,255,24,208,200,134,242,35,19,
14,207,4,104,213,227,84,50,157,121,225,244,196,116,59,252,173,42,203,41,202,245,
142,201,14,94,146,149,80,181,168,95,133,173,200,90,145,29,141,46,151,110,35,91,
226,95,88,144,137,35,185,84,25,71,30,40,166,176,189,246,102,101,86,235,19,31,70,
164,60,211,210,212,66,245,47,37,84,135,181,222,171,110,51,196,182,81,236,92,1,
169,83,104,15,169,83,34,201,154,199,6,110,153,218,176,143,9,213,228,145,138,195,
60,249,246,1,175,72,238,11,74,151,94,178,167,8,6,202,209,160,145,215,14,43,177,
190,187,187,148,160,77,30,99,138,24,90,139,199,162,121,211,236,17,50,159,185,58,
165,132,12,69,22,228,84,66,150,18,253,82,167,158,159,198,2,42,222,92,181,166,
255,47,121,236,65,171,139,250,230,48,127,242,118,149,215,202,215,109,221,213,
191,132,115,94,74,117,136,41,210,62,207,210,110,73,191,175,245,74,170,219,123,
41,206,242,190,165,189,150,18,154,107,210,31,223,49,106,84,186,147,39,249,99,
250,103,172,185,23,161,4,194,125,127,235,76,114,170,241,47,65,4,202,172,215,233,
139,8,84,224,194,232,8,187,75,2,35,248,187,58,154,191,252,24,63,232,100,177,147,
86,224,212,90,170,129,49,148,13,58,233,237,252,209,246,184,29,248,219,21,180,22,
176,115,19,200,179,251,44,101,143,130,69,100,221,240,229,15,133,243,73,110,224,
70,64,214,254,93,207,218,115,131,240,35,247,3,236,170,255,24,15,143,155,85,243,
120,70,217,136,203,204,209,58,173,23,123,131,22,216,83,240,76,214,225,112,205,
229,92,120,228,117,135,115,17,221,103,137,35,64,18,239,12,227,42,185,251,165,61,
18,126,144,2,202,103,228,58,53,201,151,173,174,12,53,34,29,18,137,167,54,201,
185,38,214,157,80,144,152,245,206,170,116,95,173,62,198,146,121,251,91,16,168,
10,28,222,44,58,240,181,226,39,127,155,77,86,56,157,230,209,147,180,124,7,19,42,
70,218,188,191,213,24,207,126,34,236,92,79,39,77,4,9,116,132,164,194,218,221,95,
192,175,243,117,44,251,136,86,65,98,19,1,56,43,209,182,78,189,19,157,228,96,162,
237,213,38,146,151,0,112,247,193,31,235,54,75,230,191,161,40,34,181,41,90,224,
250,17,47,184,36,204,157,133,110,138,91,149,29,243,149,141,234,86,172,213,141,
248,188,76,154,229,110,79,14,201,48,9,218,95,194,255,44,95,133,155,234,225,48,7,
212,197,149,191,28,65,149,169,58,81,246,212,54,100,36,69,45,84,79,8,180,17,8,
224,112,141,123,91,111,171,98,67,113,247,2,142,57,151,55,116,233,45,72,31,146,
109,101,192,193,180,201,118,197,209,86,54,95,210,145,206,126,244,18,239,236,20,
125,38,172,181,154,150,227,227,182,118,80,27,54,18,207,255,136,149,209,223,204,
48,177,94,255,48,83,17,31,63,38,157,102,211,83,0,105,54,228,31,172,53,58,227,72,
10,227,209,160,180,176,109,229,98,203,229,146,30,246,178,94,29,79,196,240,163,
197,90,217,169,121,134,223,180,105,39,191,76,248,96,1,169,205,231,11,152,204,
158,183,194,80,21,224,160,217,209,67,159,43,29,72,165,163,39,89,13,79,25,90,72,
121,91,241,70,66,253,223,14,155,150,209,236,172,177,141,134,130,208,37,174,238,
110,83,145,150,173,159,229,198,249,45,63,85,30,134,152,28,101,167,183,252,121,
164,169,42,49,47,173,2,85,92,240,195,175,130,89,92,33,62,34,27,108,98,112,139,
232,9,167,77,176,95,74,41,3,243,84,53,35,2,55,120,94,40,59,13,170,148,106,203,
211,140,231,63,238,88,202,215,97,114,36,17,210,111,59,213,98,144,11,134,146,67,
254,240,107,57,254,21,205,104,225,161,245,200,224,228,32,171,187,129,30,224,147,
240,79,207,198,178,95,209,56,242,20,54,227,127,111,225,149,61,74,118,223,63,62,
191,35,95,107,222,225,137,191,117,122,14,68,64,192,164,18,249,150,38,48,121,166,
159,91,59,221,166,178,188,229,241,124,9,80,232,232,50,114,167,167,236,182,236,
45,119,144,63,112,39,102,160,160,12,64,252,72,30,162,250,219,136,235,88,146,60,
64,123,110,179,34,21,159,216,1,205,80,146,12,192,185,114,97,90,127,161,87,199,
192,249,193,155,130,172,243,20,232,52,143,86,231,178,108,135,139,110,84,219,0,
96,156,185,211,253,20,82,159,107,25,95,100,218,251,231,135,239,252,112,35,140,
199,10,62,51,145,202,162,230,165,162,70,66,92,26,63,112,108,223,219,133,63,64,
96,58,40,232,42,36,88,77,176,32,88,239,84,233,185,246,208,95,153,23,161,245,49,
225,102,157,192,65,35,0,130,132,59,170,109,101,206,198,178,127,230,10,110,58,
244,40,49,196,136,202,219,41,192,12,11,38,170,204,104,206,204,234,83,7,149,192,
108,100,134,31,227,108,41,82,167,29,122,217,226,2,163,189,44,99,202,56,138,117,
4,242,67,208,220,151,216,113,87,69,213,221,100,185,74,141,11,242,171,134,203,
142,137,110,76,182,210,23,238,92,140,243,78,208,196,43,103,156,156,190,225,114,
156,70,44,231,212,55,217,127,190,164,14,71,19,90,254,229,113,236,66,254,223,145,
206,163,188,54,64,89,245,34,203,145,104,247,120,60,47,82,188,237,246,202,53,9,
36,51,239,150,32,49,148,0,194,98,163,126,153,228,216,142,6,164,32,110,156,153,
171,203,235,103,185,225,49,239,235,86,35,218,237,67,12,129,67,206,228,231,77,
125,203,38,11,210,202,44,65,102,197,236,50,176,84,235,145,134,219,124,220,254,
87,202,65,99,75,133,50,48,108,128,173,56,166,185,10,112,229,76,215,170,57,9,91,
141,245,237,19,208,105,240,206,193,186,15,36,6,149,87,54,1,215,227,58,126,157,
69,239,131,145,198,46,202,208,137,88,197,118,107,150,224,91,100,161,22,116,198,
28,9,29,82,11,245,54,69,116,211,139,99,86};
int h_c[1500];

int *d_a, *d_b, *d_c;

__global__ void kernel(int *a, int *b, int *c){
  int i = threadIdx.x;
  c[i] = a[i] + b[i];
}

int main() {
  cudaError_t error;

  error = cudaMalloc(&d_a, sizeof(int) * 1500);
  if(error){
    fprintf(stderr, "cudaMalloc on d_a returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaMalloc(&d_b, sizeof(int) * 1500);
  if(error){
    fprintf(stderr, "cudaMalloc on d_b returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaMalloc(&d_c, sizeof(int) * 1500);
  if(error){
    fprintf(stderr, "cudaMalloc on d_c returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaMemcpy(d_a, &h_a, sizeof(int) * 1500, cudaMemcpyHostToDevice);
  if(error){
    fprintf(stderr, "cudaMemcpy to d_b returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaMemcpy(d_b, &h_b, sizeof(int) * 1500, cudaMemcpyHostToDevice);
  if(error){
    fprintf(stderr, "cudaMemcpy to d_b returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  kernel <<<1,1500>>>(d_a, d_b, d_c);
  error = cudaGetLastError();
  if(error){
    fprintf(stderr, "Kernel launch returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }
  cudaThreadSynchronize();

  error = cudaMemcpy(h_c, d_c, sizeof(int) * 1500, cudaMemcpyDeviceToHost);
  if(error){
    fprintf(stderr, "cudaMemcpy to h_c returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaFree(d_a);
  if(error){
    fprintf(stderr, "cudaFree on d_a returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaFree(d_b);
  if(error){
    fprintf(stderr, "cudaFree on d_b returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaFree(d_c);
  if(error){
    fprintf(stderr, "cudaFree on d_c returned %d %s\n", error,
      cudaGetErrorString(error));;
    exit(1);
  }

  int i;
  for(i=0;i<1500;i++){
    printf("%-3d + %-3d = %-4d\n", h_a[i], h_b[i], h_c[i]);
  }
  return 0;
}



Kernel launch returned 9 invalid configuration argument



In [26]:
%%cu

#include <stdio.h>
#include <cuda_runtime_api.h>

/******************************************************************************
  This program adds two arrays of integers together and stores the results in
  another array. As the arrays are quite big special care needs to be taken
  to ensure the number of threads per block is not exceeded. We need to use
  the block dimensions to work out the unique index of each thread so that it
  knows which item of data to operate on.

  Compile with:
    nvcc -o 06 06.cu

  Dr Kevan Buckley, University of Wolverhampton, 2018
******************************************************************************/

int h_a[]=
{215,100,200,204,233,50,85,196,71,141,122,160,93,131,243,234,162,183,36,155,4,62,
35,205,40,102,33,27,255,55,131,214,156,75,163,134,126,249,74,197,134,197,102,
228,72,90,206,235,17,243,134,22,49,169,227,89,16,5,117,16,60,248,230,217,68,138,
96,194,131,170,136,10,112,238,238,184,72,189,163,90,176,42,112,225,212,84,58,
228,89,175,244,150,168,219,112,236,101,208,175,233,123,55,243,235,37,225,164,
110,158,71,201,78,114,57,48,70,142,106,43,232,26,32,126,194,252,239,175,98,191,
94,75,59,149,62,39,187,32,203,42,190,19,243,13,133,45,61,204,187,168,247,163,
194,23,34,133,20,17,52,118,209,146,193,13,40,255,52,227,32,255,13,222,18,1,236,
152,46,41,100,233,209,91,141,148,115,175,25,135,193,77,254,147,224,191,161,9,
191,213,236,223,212,250,190,231,251,170,127,41,212,227,19,166,63,161,58,179,81,
84,59,18,162,57,166,130,248,71,139,184,28,120,151,241,115,86,217,111,0,88,153,
213,59,172,123,123,78,182,46,159,10,105,178,172,163,88,47,155,160,187,84,189,51,
235,175,167,65,136,22,66,224,175,23,28,92,147,151,170,73,198,73,84,48,251,0,211,
84,48,111,245,235,195,178,31,175,98,198,241,234,220,52,203,140,76,231,232,223,
127,147,41,70,221,126,118,217,126,74,46,175,186,35,154,126,214,185,45,56,127,31,
35,92,83,238,232,159,214,209,126,85,100,168,155,66,38,18,27,165,93,73,84,23,109,
239,149,67,168,195,124,40,226,160,132,53,142,109,212,100,62,83,186,163,252,86,
229,34,105,1,200,198,75,29,221,184,12,114,252,181,53,121,221,24,25,98,77,168,
207,33,13,13,117,199,177,113,30,150,148,135,152,92,77,227,122,43,156,134,158,
152,59,212,17,25,236,43,123,57,211,74,91,224,88,208,168,9,65,199,160,214,78,56,
50,156,28,172,200,184,51,102,80,111,59,98,136,39,142,3,97,97,78,188,66,166,141,
235,175,207,178,79,165,1,136,216,158,164,132,102,92,184,205,173,39,8,16,175,48,
158,179,145,0,1,78,66,167,219,46,87,170,225,167,80,226,47,40,128,212,172,231,48,
100,180,222,140,189,238,59,237,141,238,126,141,240,204,208,152,168,254,239,83,
223,150,163,194,198,203,67,154,120,42,203,221,223,170,105,156,152,165,137,37,
148,8,179,132,213,131,28,125,130,12,208,98,163,115,36,105,63,104,4,183,146,208,
149,114,122,254,15,19,164,152,56,56,161,236,188,118,112,217,243,242,230,196,85,
137,56,122,243,119,226,247,47,117,199,196,231,65,194,246,84,103,143,141,159,48,
122,92,167,234,53,155,221,28,95,50,165,151,173,152,15,143,144,62,4,88,2,236,153,
197,227,238,44,114,124,203,163,247,39,74,225,93,230,191,121,69,242,31,221,159,
183,236,47,72,42,51,160,45,32,58,242,3,41,30,118,166,234,25,157,18,100,127,111,
75,62,233,144,48,8,110,208,192,91,255,9,134,51,169,179,83,227,165,87,12,196,205,
178,174,231,80,192,76,207,48,151,13,25,40,62,34,150,14,227,242,14,236,120,65,
150,43,149,121,208,237,134,149,186,57,67,162,137,4,238,88,52,133,102,78,174,165,
113,68,180,85,54,194,66,174,4,216,218,153,82,170,134,217,64,65,18,131,227,156,
135,210,245,188,88,92,11,6,1,124,74,181,209,129,119,20,48,123,236,11,21,62,182,
156,23,246,222,42,121,193,199,1,148,188,190,236,24,201,242,25,70,61,207,24,191,
70,44,240,194,24,251,216,87,177,116,111,167,82,153,33,20,96,35,168,29,225,149,
53,171,135,79,241,196,31,9,131,102,54,115,41,78,111,1,166,32,118,21,199,201,175,
233,222,16,12,134,45,237,28,99,152,163,179,138,104,210,147,235,56,202,95,97,24,
206,99,191,239,217,212,182,162,132,159,128,148,171,7,193,153,35,36,50,199,216,
188,47,170,80,27,227,26,122,69,51,73,168,242,56,129,199,239,36,75,143,164,223,
59,172,161,213,208,197,7,151,158,195,198,72,19,225,44,45,92,113,96,165,25,83,
222,155,26,206,191,102,93,100,69,153,17,230,110,225,172,117,120,74,57,62,147,77,
32,191,122,124,49,219,34,75,47,0,230,73,207,166,176,44,11,245,198,28,220,53,254,
137,170,119,212,228,182,103,49,214,39,172,82,88,136,117,163,183,117,138,1,68,49,
177,113,60,167,56,89,131,109,87,12,24,207,225,252,133,72,46,91,112,218,174,201,
98,35,109,26,153,247,27,222,40,205,80,101,116,136,190,248,245,22,5,13,229,230,
10,107,47,56,199,159,18,118,104,117,154,213,144,51,205,172,18,246,121,98,91,238,
234,26,230,224,49,235,237,22,210,247,130,1,48,73,160,67,191,9,184,89,223,72,141,
172,245,159,163,110,2,254,93,236,25,67,205,74,47,186,97,1,178,227,2,226,45,163,
37,236,172,222,70,140,39,212,57,28,115,220,139,118,219,232,98,244,44,48,63,92,
234,160,93,157,132,96,128,177,4,166,158,177,133,229,61,172,185,118,201,45,83,85,
163,46,62,6,35,106,54,98,199,33,3,37,190,135,134,63,57,138,229,216,60,106,189,
122,23,118,240,225,163,68,54,71,114,117,77,150,224,131,248,167,164,252,204,99,
131,82,162,189,221,136,149,25,243,82,147,11,201,132,236,109,200,35,180,59,152,2,
209,121,133,202,32,42,198,236,142,74,63,49,8,28,186,157,54,173,240,201,184,186,
78,165,39,23,200,220,82,97,222,36,218,99,238,251,142,181,232,29,0,39,78,8,68,8,
165,122,182,150,68,111,80,146,20,119,169,221,83,252,63,49,32,25,149,15,21,36,
196,253,65,196,36,144,204,105,153,114,227,79,9,40,191,89,187,211,209,100,177,37,
97,240,87,129,10,236,145,31,17,86,28,82,27,65,227,232,171,124,90,143,204,100,
183,140,189,114,95,142,215,17,180,56,1,11,186,11,248,75,43,9,162,72,92,189,137,
63,165,52,188,0,195,137,100,123,21,34,237,117,177,196,134,101,253,136,112,183,
148,104,3,191,113,165,7,206,99,145,14,8,198,202,9,138,84,109,5,105,144,242,223,
65,183,102,166,180,238,22,108,131,127,112,67,241,22,74,191,121,220,205,130,162,
152,139,44,237,249,50,87,137,36,54,202,220,156,113,145,139,136,254,15,7,110,82,
249,132,156,184,254,121,134,128,27,31,11,72,12,5,122,99,142,159,154,89,124,55,
203,13,195,83,12,210,91,122,36,84,255,193,13,254,58,148,126,86,179,138,159,192,
143,26,36,30};

int h_b[] =
{179,207,22,31,89,108,179,16,150,164,253,75,69,17,243,97,82,253,215,70,152,142,
217,47,101,227,217,81,26,11,165,205,218,187,236,51,39,160,68,190,68,66,9,138,84,
253,235,166,250,195,237,146,82,198,194,183,170,155,9,196,167,174,146,130,106,
127,182,146,31,250,80,100,61,90,238,145,87,218,56,81,157,38,228,239,236,166,167,
151,66,176,91,233,95,238,107,201,109,33,91,141,28,171,241,90,5,223,235,93,185,
36,174,87,75,146,71,55,57,238,206,123,159,42,101,255,24,208,200,134,242,35,19,
14,207,4,104,213,227,84,50,157,121,225,244,196,116,59,252,173,42,203,41,202,245,
142,201,14,94,146,149,80,181,168,95,133,173,200,90,145,29,141,46,151,110,35,91,
226,95,88,144,137,35,185,84,25,71,30,40,166,176,189,246,102,101,86,235,19,31,70,
164,60,211,210,212,66,245,47,37,84,135,181,222,171,110,51,196,182,81,236,92,1,
169,83,104,15,169,83,34,201,154,199,6,110,153,218,176,143,9,213,228,145,138,195,
60,249,246,1,175,72,238,11,74,151,94,178,167,8,6,202,209,160,145,215,14,43,177,
190,187,187,148,160,77,30,99,138,24,90,139,199,162,121,211,236,17,50,159,185,58,
165,132,12,69,22,228,84,66,150,18,253,82,167,158,159,198,2,42,222,92,181,166,
255,47,121,236,65,171,139,250,230,48,127,242,118,149,215,202,215,109,221,213,
191,132,115,94,74,117,136,41,210,62,207,210,110,73,191,175,245,74,170,219,123,
41,206,242,190,165,189,150,18,154,107,210,31,223,49,106,84,186,147,39,249,99,
250,103,172,185,23,161,4,194,125,127,235,76,114,170,241,47,65,4,202,172,215,233,
139,8,84,224,194,232,8,187,75,2,35,248,187,58,154,191,252,24,63,232,100,177,147,
86,224,212,90,170,129,49,148,13,58,233,237,252,209,246,184,29,248,219,21,180,22,
176,115,19,200,179,251,44,101,143,130,69,100,221,240,229,15,133,243,73,110,224,
70,64,214,254,93,207,218,115,131,240,35,247,3,236,170,255,24,15,143,155,85,243,
120,70,217,136,203,204,209,58,173,23,123,131,22,216,83,240,76,214,225,112,205,
229,92,120,228,117,135,115,17,221,103,137,35,64,18,239,12,227,42,185,251,165,61,
18,126,144,2,202,103,228,58,53,201,151,173,174,12,53,34,29,18,137,167,54,201,
185,38,214,157,80,144,152,245,206,170,116,95,173,62,198,146,121,251,91,16,168,
10,28,222,44,58,240,181,226,39,127,155,77,86,56,157,230,209,147,180,124,7,19,42,
70,218,188,191,213,24,207,126,34,236,92,79,39,77,4,9,116,132,164,194,218,221,95,
192,175,243,117,44,251,136,86,65,98,19,1,56,43,209,182,78,189,19,157,228,96,162,
237,213,38,146,151,0,112,247,193,31,235,54,75,230,191,161,40,34,181,41,90,224,
250,17,47,184,36,204,157,133,110,138,91,149,29,243,149,141,234,86,172,213,141,
248,188,76,154,229,110,79,14,201,48,9,218,95,194,255,44,95,133,155,234,225,48,7,
212,197,149,191,28,65,149,169,58,81,246,212,54,100,36,69,45,84,79,8,180,17,8,
224,112,141,123,91,111,171,98,67,113,247,2,142,57,151,55,116,233,45,72,31,146,
109,101,192,193,180,201,118,197,209,86,54,95,210,145,206,126,244,18,239,236,20,
125,38,172,181,154,150,227,227,182,118,80,27,54,18,207,255,136,149,209,223,204,
48,177,94,255,48,83,17,31,63,38,157,102,211,83,0,105,54,228,31,172,53,58,227,72,
10,227,209,160,180,176,109,229,98,203,229,146,30,246,178,94,29,79,196,240,163,
197,90,217,169,121,134,223,180,105,39,191,76,248,96,1,169,205,231,11,152,204,
158,183,194,80,21,224,160,217,209,67,159,43,29,72,165,163,39,89,13,79,25,90,72,
121,91,241,70,66,253,223,14,155,150,209,236,172,177,141,134,130,208,37,174,238,
110,83,145,150,173,159,229,198,249,45,63,85,30,134,152,28,101,167,183,252,121,
164,169,42,49,47,173,2,85,92,240,195,175,130,89,92,33,62,34,27,108,98,112,139,
232,9,167,77,176,95,74,41,3,243,84,53,35,2,55,120,94,40,59,13,170,148,106,203,
211,140,231,63,238,88,202,215,97,114,36,17,210,111,59,213,98,144,11,134,146,67,
254,240,107,57,254,21,205,104,225,161,245,200,224,228,32,171,187,129,30,224,147,
240,79,207,198,178,95,209,56,242,20,54,227,127,111,225,149,61,74,118,223,63,62,
191,35,95,107,222,225,137,191,117,122,14,68,64,192,164,18,249,150,38,48,121,166,
159,91,59,221,166,178,188,229,241,124,9,80,232,232,50,114,167,167,236,182,236,
45,119,144,63,112,39,102,160,160,12,64,252,72,30,162,250,219,136,235,88,146,60,
64,123,110,179,34,21,159,216,1,205,80,146,12,192,185,114,97,90,127,161,87,199,
192,249,193,155,130,172,243,20,232,52,143,86,231,178,108,135,139,110,84,219,0,
96,156,185,211,253,20,82,159,107,25,95,100,218,251,231,135,239,252,112,35,140,
199,10,62,51,145,202,162,230,165,162,70,66,92,26,63,112,108,223,219,133,63,64,
96,58,40,232,42,36,88,77,176,32,88,239,84,233,185,246,208,95,153,23,161,245,49,
225,102,157,192,65,35,0,130,132,59,170,109,101,206,198,178,127,230,10,110,58,
244,40,49,196,136,202,219,41,192,12,11,38,170,204,104,206,204,234,83,7,149,192,
108,100,134,31,227,108,41,82,167,29,122,217,226,2,163,189,44,99,202,56,138,117,
4,242,67,208,220,151,216,113,87,69,213,221,100,185,74,141,11,242,171,134,203,
142,137,110,76,182,210,23,238,92,140,243,78,208,196,43,103,156,156,190,225,114,
156,70,44,231,212,55,217,127,190,164,14,71,19,90,254,229,113,236,66,254,223,145,
206,163,188,54,64,89,245,34,203,145,104,247,120,60,47,82,188,237,246,202,53,9,
36,51,239,150,32,49,148,0,194,98,163,126,153,228,216,142,6,164,32,110,156,153,
171,203,235,103,185,225,49,239,235,86,35,218,237,67,12,129,67,206,228,231,77,
125,203,38,11,210,202,44,65,102,197,236,50,176,84,235,145,134,219,124,220,254,
87,202,65,99,75,133,50,48,108,128,173,56,166,185,10,112,229,76,215,170,57,9,91,
141,245,237,19,208,105,240,206,193,186,15,36,6,149,87,54,1,215,227,58,126,157,
69,239,131,145,198,46,202,208,137,88,197,118,107,150,224,91,100,161,22,116,198,
28,9,29,82,11,245,54,69,116,211,139,99,86};
int h_c[1500];

int *d_a, *d_b, *d_c;

__global__ void kernel(int *a, int *b, int *c){
  int i = (blockIdx.x * blockDim.x) + threadIdx.x;
  c[i] = a[i] + b[i];
}

int main() {
  cudaError_t error;

  error = cudaMalloc(&d_a, sizeof(int) * 1500);
  if(error){
    fprintf(stderr, "cudaMalloc on d_a returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaMalloc(&d_b, sizeof(int) * 1500);
  if(error){
    fprintf(stderr, "cudaMalloc on d_b returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaMalloc(&d_c, sizeof(int) * 1500);
  if(error){
    fprintf(stderr, "cudaMalloc on d_c returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaMemcpy(d_a, &h_a, sizeof(int) * 1500, cudaMemcpyHostToDevice);
  if(error){
    fprintf(stderr, "cudaMemcpy to d_b returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaMemcpy(d_b, &h_b, sizeof(int) * 1500, cudaMemcpyHostToDevice);
  if(error){
    fprintf(stderr, "cudaMemcpy to d_b returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  kernel <<<10,150>>>(d_a, d_b, d_c);
  error = cudaGetLastError();
  if(error){
    fprintf(stderr, "Kernel launch returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }
  cudaThreadSynchronize();

  error = cudaMemcpy(h_c, d_c, sizeof(int) * 1500, cudaMemcpyDeviceToHost);
  if(error){
    fprintf(stderr, "cudaMemcpy to h_c returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaFree(d_a);
  if(error){
    fprintf(stderr, "cudaFree on d_a returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaFree(d_b);
  if(error){
    fprintf(stderr, "cudaFree on d_b returned %d %s\n", error,
      cudaGetErrorString(error));
    exit(1);
  }

  error = cudaFree(d_c);
  if(error){
    fprintf(stderr, "cudaFree on d_c returned %d %s\n", error,
      cudaGetErrorString(error));;
    exit(1);
  }

  int i;
  for(i=0;i<1500;i++){
    printf("%-4d %-3d + %-3d = %-4d\n", i, h_a[i], h_b[i], h_c[i]);
  }
  return 0;
}



0    215 + 179 = 394 
1    100 + 207 = 307 
2    200 + 22  = 222 
3    204 + 31  = 235 
4    233 + 89  = 322 
5    50  + 108 = 158 
6    85  + 179 = 264 
7    196 + 16  = 212 
8    71  + 150 = 221 
9    141 + 164 = 305 
10   122 + 253 = 375 
11   160 + 75  = 235 
12   93  + 69  = 162 
13   131 + 17  = 148 
14   243 + 243 = 486 
15   234 + 97  = 331 
16   162 + 82  = 244 
17   183 + 253 = 436 
18   36  + 215 = 251 
19   155 + 70  = 225 
20   4   + 152 = 156 
21   62  + 142 = 204 
22   35  + 217 = 252 
23   205 + 47  = 252 
24   40  + 101 = 141 
25   102 + 227 = 329 
26   33  + 217 = 250 
27   27  + 81  = 108 
28   255 + 26  = 281 
29   55  + 11  = 66  
30   131 + 165 = 296 
31   214 + 205 = 419 
32   156 + 218 = 374 
33   75  + 187 = 262 
34   163 + 236 = 399 
35   134 + 51  = 185 
36   126 + 39  = 165 
37   249 + 160 = 409 
38   74  + 68  = 142 
39   197 + 190 = 387 
40   134 + 68  = 202 
41   197 + 66  = 263 
42   102 + 9   = 111 
43   228 + 138 = 366 
44   72  + 84  = 156 
45   90  +

In [28]:
%%cu

/****************************************************************************
  Compile with:
    nvcc -o factorise_3_cuda factorise_3_cuda.cu -lrt

  Dr Kevan Buckley, University of Wolverhampton, 2018
*****************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include <errno.h>
#include <sys/stat.h>
#include <string.h>
#include <time.h>
//#include <pthread.h>
#include <math.h>

#define goal 98931313
__global__ void factorise(){
  int a = threadIdx.x;
  int b = blockIdx.x;
  int c = blockIdx.y;

  if(a*b*c == goal){
     printf("solution is %d, %d, %d\n", a, b, c);
  }
}
int time_difference(struct timespec *start, struct timespec *finish,
                              long long int *difference) {
  long long int ds =  finish->tv_sec - start->tv_sec;
  long long int dn =  finish->tv_nsec - start->tv_nsec;

  if(dn < 0 ) {
    ds--;
    dn += 1000000000;
  }
  *difference = ds * 1000000000 + dn;
  return !(*difference > 0);
}
int main() {
  cudaError_t error;
  struct timespec start, finish;
  long long int time_elapsed;
  //clock_gettime(CLOCK_MONOTONIC, &start);
  dim3 gd(1000, 1000, 1);
  dim3 bd(1000, 1, 1);
  factorise<<<gd, bd>>>();

  cudaDeviceSynchronize();

  error = cudaGetLastError();

  if(error){
    fprintf(stderr, "Kernel launch returned %d %s\n",
      error, cudaGetErrorString(error));
    return 1;
  } else {
    fprintf(stderr, "Kernel launch successful.\n");
  }
  //clock_gettime(CLOCK_MONOTONIC, &finish);
  time_difference(&start, &finish, &time_elapsed);
  printf("Time elapsed was %lldns or %0.9lfs\n",
    time_elapsed, (time_elapsed/1.0e9));

  return 0;
}


solution is 997, 449, 221
solution is 449, 997, 221
solution is 997, 221, 449
solution is 221, 997, 449
solution is 449, 221, 997
solution is 221, 449, 997
Kernel launch successful.
Time elapsed was 0ns or 0.000000000s

