In [5]:
!nvcc –version
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

nvcc fatal   : Don't know what to do with '–version'
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-7_qu0kkx
  Running command git clone -q https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-7_qu0kkx
The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [33]:
%%cu
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void
pythagoreanTheorem(const float *x, const float *y, float *c, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements)
    {
        c[i] = ((x[i]*x[i]) + (y[i]*y[i])/(x[i] + y[i]));
        printf("c[i] raiz () = %f\n", x[i], y[i], c[i]);
    }
}


// Host main routine
int
main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

    int numElements = 50000;
    size_t size = numElements * sizeof(float);
 
    // Allocate the host input x
    float *h_x = (float *)malloc(size); 

    // Allocate the host input y
    float *h_y = (float *)malloc(size); 

    // Allocate the host input c
    float *h_c = (float *)malloc(size); 

    // Verify that allocations succeeded
    if (h_x == NULL || h_y == NULL || h_c == NULL)
    {
        fprintf(stderr, "Failed to allocate !\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input values
    for (int i = 0; i < numElements; ++i)
    {
        h_x[i] = rand()/(float)RAND_MAX;
        h_y[i] = rand()/(float)RAND_MAX;
    }

    // Allocate the device input vector x
    float *d_x = NULL;
    err = cudaMalloc((void **)&d_x, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

 
    // Allocate the device input y
    float *d_y = NULL;
    err = cudaMalloc((void **)&d_y, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Allocate the device output c
    float *d_c = NULL;
    err = cudaMalloc((void **)&d_c, size);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the host input
    printf("Copy input data from the host memory to the CUDA device\n");
    err = cudaMemcpy(d_x, h_x, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy to from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaMemcpy(d_y, h_y, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    int min;
    for (int i = 1; i < sizeof(h_c) / sizeof(h_c[0]); ++i)
	      {
		        if (h_c[i] < min) min= h_c[i];
	      }	
    printf("The smallest value is %d\n", min);

    // Launch the valur Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    pythagoreanTheorem<<<blocksPerGrid, threadsPerBlock>>>(d_x, d_y, d_c, numElements); // Kernell starts
    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch pythagoreanTheorem kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Copy the device result vector in device memory to the host result vector
    // in host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    err = cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "ERROR", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Verify the result
    for (int i = 0; i < numElements; ++i)
    {
        if (fabs(h_x[i] + h_y[i] - h_c[i]) > 1e-5)
        {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");
 
    // Free device global memory
    err = cudaFree(d_x);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_y);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_c);

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Free host memory 
    free(h_x);
    free(h_y);
    free(h_c);

    err = cudaDeviceReset();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to deinitialize the device! error=%s\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("Done\n");
    return 0;
}

[1;30;43mSe truncaron las últimas líneas 5000 del resultado de transmisión.[0m
c[i] raiz () = 0.742544
c[i] raiz () = 0.086280
c[i] raiz () = 0.190922
c[i] raiz () = 0.387636
c[i] raiz () = 0.226774
c[i] raiz () = 0.937275
c[i] raiz () = 0.558036
c[i] raiz () = 0.872923
c[i] raiz () = 0.717133
c[i] raiz () = 0.207212
c[i] raiz () = 0.966235
c[i] raiz () = 0.744556
c[i] raiz () = 0.611981
c[i] raiz () = 0.291879
c[i] raiz () = 0.201917
c[i] raiz () = 0.271460
c[i] raiz () = 0.865118
c[i] raiz () = 0.293907
c[i] raiz () = 0.702223
c[i] raiz () = 0.477784
c[i] raiz () = 0.139216
c[i] raiz () = 0.687778
c[i] raiz () = 0.955521
c[i] raiz () = 0.537624
c[i] raiz () = 0.160331
c[i] raiz () = 0.928522
c[i] raiz () = 0.132155
c[i] raiz () = 0.286439
c[i] raiz () = 0.638559
c[i] raiz () = 0.259159
c[i] raiz () = 0.305154
c[i] raiz () = 0.633829
c[i] raiz () = 0.364585
c[i] raiz () = 0.490600
c[i] raiz () = 0.476449
c[i] raiz () = 0.349315
c[i] raiz () = 0.431288
c[i] raiz () = 0.624089
c[i] ra

KeyboardInterrupt: ignored