<a href="https://colab.research.google.com/github/cealgogu-utnay/Nuevo_Repositorio/blob/main/ECU2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!uv pip install -q --system numba-cuda==0.4.0
import numpy as np

import time
import os

# Enable the CUDA simulator. This MUST be set BEFORE numba imports or kernel definitions.
# The flag is written into the process environment first, and numba is only
# imported afterwards, so do not reorder or hoist the two numba imports below.
os.environ["NUMBA_ENABLE_CUDASIM"] = "1"
from numba import cuda
from numba import config

# --- Configuration & Data Preparation ---

# Sets numba's CUDA_ENABLE_PYNVJITLINK config flag.
# NOTE(review): presumably this selects the pynvjitlink linker for real-GPU
# runs; confirm whether it has any effect while the simulator is enabled.
config.CUDA_ENABLE_PYNVJITLINK = 1

In [None]:
# Build the demo payload: the ASCII codes of the letters A through H,
# stored as an 8-element uint8 array that the kernels index into.
characters = list("ABCDEFGH")
data = np.array(list(map(ord, characters)), dtype=np.uint8)
data_size = data.shape[0]  # 8 elements
# --- 1D Kernel Definition (using gidx, bidx, tidx) ---

@cuda.jit
def kernel_1d_dims(arr):
    """Print this thread's block, thread and global index plus its character.

    Each thread whose global index falls inside ``arr`` prints one line with
    its block index, its thread index within the block, its global 1D index,
    and ``arr[global index]`` converted to a character with ``chr``.
    Threads whose global index is past the end of ``arr`` print nothing.

    Args:
        arr: 1D array of character codes, indexed by the global thread index.
    """
    # Position of this thread: block within the grid, thread within the
    # block, and the flattened 1D index across the whole grid.
    bidx = cuda.blockIdx.x
    tidx = cuda.threadIdx.x
    gidx = cuda.grid(1)

    # Guard: surplus threads beyond the array length have nothing to do.
    if gidx >= arr.size:
        return

    # Standard Python print and f-strings work here via the simulator.
    print(f"BID: {bidx}, TID: {tidx}, GID: {gidx}, Char: {chr(arr[gidx])}")
# ========================================
# Example 1: 8 Blocks, 1 Thread per Block
# ========================================
# Every block holds exactly one thread, so the thread index inside each block
# is always 0 and the block index equals the global index.
# NOTE: a single block cannot exceed 1024 threads.
blocks_per_grid_ex1 = 8
threads_per_block_ex1 = 1
# Total threads = 8 * 1 = 8
#
# Scratch notes on a larger 3D launch (not executed):
#   blocks_per_grid   = (1000, 60, 60) -> 1000 * 60 * 60 = 3,600,000 blocks
#   threads_per_block = (32, 32, 1)    -> 32 * 32 * 1    = 1,024 threads
#   total threads     = 3,600,000 * 1,024 = 3,686,400,000

# Launch limits per the CUDA programming guide (compute capability >= 3.0;
# TODO confirm against the target device):
#   grid:  X <= 2^31 - 1 blocks, Y <= 65,535 blocks, Z <= 65,535 blocks
#   block: X <= 1,024 threads,   Y <= 1,024 threads, Z <= 64 threads,
#          and X * Y * Z <= 1,024 threads per block
# (The earlier draft of these notes listed 102,400 as a value to check
# against each of the X, Y and Z limits.)

kernel_1d_dims[blocks_per_grid_ex1, threads_per_block_ex1](data)
cuda.synchronize()


BID: 0, TID: 0, GID: 0, Char: A
BID: 1, TID: 0, GID: 1, Char: B
BID: 2, TID: 0, GID: 2, Char: C
BID: 3, TID: 0, GID: 3, Char: D
BID: 4, TID: 0, GID: 4, Char: E
BID: 5, TID: 0, GID: 5, Char: F
BID: 6, TID: 0, GID: 6, Char: G
BID: 7, TID: 0, GID: 7, Char: H


In [None]:
# ==============================================================
# Example 2: 2 Blocks, 4 Threads per Block
# ==============================================================
# Launches kernel_1d_dims over `data` with the work split across two blocks.
# Block 0 covers global indices 0-3 and block 1 covers 4-7, i.e. the global
# index is block index * 4 + thread index for this configuration.

blocks_per_grid_ex2 = 2
threads_per_block_ex2 = 4
# Total threads = 2 * 4 = 8

kernel_1d_dims[blocks_per_grid_ex2, threads_per_block_ex2](data)
# Block until the launched kernel has finished before the cell returns.
cuda.synchronize()

BID: 0, TID: 0, GID: 0, Char: A
BID: 0, TID: 1, GID: 1, Char: B
BID: 0, TID: 2, GID: 2, Char: C
BID: 0, TID: 3, GID: 3, Char: D
BID: 1, TID: 0, GID: 4, Char: E
BID: 1, TID: 1, GID: 5, Char: F
BID: 1, TID: 2, GID: 6, Char: G
BID: 1, TID: 3, GID: 7, Char: H


In [None]:
@cuda.jit
def whoami():
    """Print the linearized block id, thread id and global id of this thread.

    Ids are linearized with x varying fastest, then y, then z:

    * block id  = blockIdx.x + blockIdx.y * gridDim.x
                  + blockIdx.z * gridDim.x * gridDim.y
    * thread id = threadIdx.x + threadIdx.y * blockDim.x
                  + threadIdx.z * blockDim.x * blockDim.y
    * global id = block id * (threads per block) + thread id

    For a launch with B blocks of T threads each, the global ids therefore
    cover 0 .. B*T - 1 exactly once. The kernel takes no arguments and
    returns nothing; its only effect is one printed line per thread.
    """
    # Linear block id in a 3D grid. The z term must be scaled by blockIdx.z;
    # adding gridDim.x * gridDim.y unconditionally shifts every id by a
    # constant and makes ids run past the end of the grid.
    block_id = (
        cuda.blockIdx.x +
        cuda.blockIdx.y * cuda.gridDim.x +
        cuda.blockIdx.z * cuda.gridDim.x * cuda.gridDim.y
    )

    # Threads per block, counting all three block dimensions.
    threads_per_block = (
        cuda.blockDim.x * cuda.blockDim.y * cuda.blockDim.z
    )

    # Global id of the first thread of this block.
    block_offset = block_id * threads_per_block

    # Linear thread id inside the block (same layout as block_id above).
    thread_offset = (
        cuda.threadIdx.x +
        cuda.threadIdx.y * cuda.blockDim.x +
        cuda.threadIdx.z * cuda.blockDim.x * cuda.blockDim.y
    )

    # Global thread id across all blocks.
    global_id = block_offset + thread_offset

    # The block label advertises [x, y, z], so all three coordinates are shown.
    print(f"{global_id:03d} | Block[x, y, z]({cuda.blockIdx.x} {cuda.blockIdx.y} {cuda.blockIdx.z}) = {block_id:3d} | "
          f"Thread[x, y] ({cuda.threadIdx.x} {cuda.threadIdx.y} ) = {thread_offset:3d} BlockDim.x {cuda.blockDim.x} BlockDim.y {cuda.blockDim.y} GridDim.x {cuda.gridDim.x} GridDim.y {cuda.gridDim.y}")


# Launch geometry: a 2 x 2 x 1 grid of blocks, each block 4 x 1 x 1 threads.
b_x, b_y, b_z = 2, 2, 1
t_x, t_y, t_z = 4, 1, 1

blocks_per_grid = (b_x, b_y, b_z)
threads_per_block = (t_x, t_y, t_z)

# NOTE: despite its name, `total_threads` is the number of threads in ONE
# block; the grand total is total_blocks * total_threads (printed last).
total_blocks = b_x * b_y * b_z
total_threads = t_x * t_y * t_z
print(f"{total_blocks} blocks/grid")
print(f"{total_threads} threads/block")
print(f"{total_blocks * total_threads} total threads\n")

# Launch kernel
whoami[blocks_per_grid, threads_per_block]()

# Wait for GPU to finish (like cudaDeviceSynchronize)
cuda.synchronize()

4 blocks/grid
4 threads/block
16 total threads

020 | Block[x, y, z](0 0) =   4 | Thread[x, y] (0 0 ) =   4 BlockDim.x 4 BlockDim.y 1 GridDim.x 2 GridDim.y 2
021 | Block[x, y, z](0 0) =   4 | Thread[x, y] (1 0 ) =   5 BlockDim.x 4 BlockDim.y 1 GridDim.x 2 GridDim.y 2
022 | Block[x, y, z](0 0) =   4 | Thread[x, y] (2 0 ) =   6 BlockDim.x 4 BlockDim.y 1 GridDim.x 2 GridDim.y 2
023 | Block[x, y, z](0 0) =   4 | Thread[x, y] (3 0 ) =   7 BlockDim.x 4 BlockDim.y 1 GridDim.x 2 GridDim.y 2
028 | Block[x, y, z](0 1) =   6 | Thread[x, y] (0 0 ) =   4 BlockDim.x 4 BlockDim.y 1 GridDim.x 2 GridDim.y 2
029 | Block[x, y, z](0 1) =   6 | Thread[x, y] (1 0 ) =   5 BlockDim.x 4 BlockDim.y 1 GridDim.x 2 GridDim.y 2
030 | Block[x, y, z](0 1) =   6 | Thread[x, y] (2 0 ) =   6 BlockDim.x 4 BlockDim.y 1 GridDim.x 2 GridDim.y 2
031 | Block[x, y, z](0 1) =   6 | Thread[x, y] (3 0 ) =   7 BlockDim.x 4 BlockDim.y 1 GridDim.x 2 GridDim.y 2
024 | Block[x, y, z](1 0) =   5 | Thread[x, y] (0 0 ) =   4 BlockDim.x 4