<a href="https://colab.research.google.com/github/cealgogu-utnay/Nuevo_Repositorio/blob/main/ECU1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!uv pip install -q --system numba-cuda==0.4.0
import numpy as np
from numba import cuda
import time
import os
from numba import config
config.CUDA_ENABLE_PYNVJITLINK = 1

In [None]:
#CUDA Steps
#Initializing data on CPU
#Transfer from CPU to GPU
#Run Kernel with defined Grid/Block size (Threads)
#Transfer results from GPU to CPU

# 1. CUDA kernel
@cuda.jit
def first_kernel(a, result):
    """Copy ``a`` into ``result``, each thread handling at most one element.

    Args:
        a: 1-D input array to read from.
        result: Output array written at the same indices as ``a``.
    """
    tid = cuda.grid(1)  # absolute 1-D index of this thread within the grid
    if tid >= a.size:
        # Threads whose index falls past the end of ``a`` have nothing to copy.
        return
    result[tid] = a[tid]

# Host CPU
def main():
    """Time a host-side array copy against the same copy done on the GPU.

    Builds a float32 array of ``N`` elements, copies it with NumPy on the CPU,
    then copies it through ``first_kernel`` on the GPU. The host-to-device
    transfer, the kernel, and the device-to-host transfer are timed separately
    and all timings are printed in milliseconds. Nothing is returned.
    """
    # 1. Initialize data on CPU
    N = 10_000_000
    a_cpu = np.arange(N, dtype=np.float32)

    # -----------------------------
    # CPU computation
    # -----------------------------
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring short intervals; time.time() can jump with wall-clock changes.
    start = time.perf_counter()
    result_cpu = a_cpu.copy()
    cpu_time = time.perf_counter() - start
    print(f"CPU time: {cpu_time * 1e3:.2f} ms")

    # -----------------------------
    # GPU computation
    # -----------------------------
    # 2.- Transfer from CPU to GPU
    start = time.perf_counter()
    a_gpu = cuda.to_device(a_cpu)
    result_gpu = cuda.device_array_like(a_cpu)  # reserve output memory on the device
    transfer_in_time = time.perf_counter() - start

    # 3.- Kernel launch
    threads_per_block = 128
    # Round up so every element gets a thread:
    # (10_000_000 + 127) // 128 = 78,125 blocks
    blocks_per_grid = (N + threads_per_block - 1) // threads_per_block

    # Warm-up launch (not timed): the first call of a @cuda.jit kernel for a
    # given argument signature triggers JIT compilation, which would otherwise
    # be counted as kernel execution time.
    first_kernel[blocks_per_grid, threads_per_block](a_gpu, result_gpu)
    cuda.synchronize()

    start = time.perf_counter()
    first_kernel[blocks_per_grid, threads_per_block](a_gpu, result_gpu)  # launch kernel
    # Launches are asynchronous; wait for completion before stopping the clock.
    cuda.synchronize()
    kernel_time = time.perf_counter() - start

    # 4.- Transfer results from GPU to CPU
    start = time.perf_counter()
    result_from_gpu = result_gpu.copy_to_host()
    cuda.synchronize()
    transfer_out_time = time.perf_counter() - start

    # Report (all values in milliseconds, two decimals)
    print(f"GPU transfer to device: {transfer_in_time * 1e3:.2f} ms")
    print(f"GPU Kernel execution:  {kernel_time * 1e3:.2f} ms")
    print(f"GPU transfer to host:   {transfer_out_time * 1e3:.2f} ms")
    print(f"Total GPU time:         {(transfer_in_time + kernel_time + transfer_out_time) * 1e3:.2f} ms")

    # 5. Cleanup - release GPU memory
    del a_gpu, result_gpu
    cuda.close()

# Run the benchmark only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

CPU time: 13.98 ms
GPU transfer to device: 85.096598 ms
GPU Kernel execution:  47.10 ms
GPU transfer to host:   14.96 ms
Total GPU time:         147.15 ms
