In [2]:
import sys
import numpy as np

# Valores por defecto (Jupyter)
value = 5 * 10**7
ncores = 2

# Leer argumentos
try:
    value = int(sys.argv[1])
except:
    pass

try:
    ncores = int(sys.argv[2])
except:
    pass

# Crear el array 
X = np.random.rand(value)


## Reduction: the sum of the elements of an array

In [3]:
import numpy as np

def reduc_operation(A):
    """Compute the sum of the elements of Array A."""
    s = 0
    for i in range(A.size):
        s += A[i]
    return s

# Secuencial

value = 5*10**7

X = np.random.rand(value)

# Para imprimir los primeros valores del array

#print(X[0:12])

# Utilizando las operaciones mágicas de ipython

tiempo = %timeit -r 2 -o -q reduc_operation(X)

print("Time taken by reduction operation using a function:", tiempo)

print(f"And the result of the sum of numbers in the range [0, value) is: {reduc_operation(X)}\n")


# Utilizando numpy.sum()

tiempo = %timeit -r 2 -o -q np.sum(X)

print("Time taken by reduction operation using numpy.sum():", tiempo)

print("Now, the result using numpy.sum():", np.sum(X),"\n ")


Time taken by reduction operation using a function: 4.92 s ± 172 ms per loop (mean ± std. dev. of 2 runs, 1 loop each)
And the result of the sum of numbers in the range [0, value) is: 24997563.11012026

Time taken by reduction operation using numpy.sum(): 18.6 ms ± 20.4 μs per loop (mean ± std. dev. of 2 runs, 100 loops each)
Now, the result using numpy.sum(): 24997563.110118717 
 


In [4]:
from multiprocessing import Pool
import time

In [5]:
def reduc_parallel(A, nprocs):
    # a) dividir el array
    subarrays = np.array_split(A, nprocs)

    # b) crear pool
    with Pool(processes=nprocs) as pool:
        # c) map
        partial = pool.map(reduc_operation, subarrays)

    # d) reducir resultados
    return sum(partial)


In [6]:
start = time.time()
res = reduc_parallel(X, ncores)
stop = time.time()

print("Time taken by reduction operation using multiprocessing:", f"{stop-start:.3f} s")
print("Now, the result using multiprocessing:", res, "\n")


Time taken by reduction operation using multiprocessing: 3.043 s
Now, the result using multiprocessing: 24997563.110120557 



In [12]:
value = 5 * 10**7
ncores1 = 2
ncores2 = 4
X = np.random.rand(value)

start = time.time()
res = reduc_parallel(X, ncores1)
stop = time.time()

print("Time taken by reduction operation using multiprocessing:", f"{stop-start:.3f} s")
print("Now, the result using multiprocessing:", res, "\n")

start = time.time()
res = reduc_parallel(X, ncores2)
stop = time.time()

print("Time taken by reduction operation using multiprocessing:", f"{stop-start:.3f} s")
print("Now, the result using multiprocessing:", res, "\n")


Time taken by reduction operation using multiprocessing: 3.043 s
Now, the result using multiprocessing: 25001035.258445486 

Time taken by reduction operation using multiprocessing: 1.567 s
Now, the result using multiprocessing: 25001035.25844815 



In [8]:
from numba import njit, prange

# Versión Numba Secuencial

@njit
def reduc_operation_numba(A):
    s = 0.0
    for i in range(A.size):
        s += A[i]
    return s

# Versión Numba Paralela

@njit(parallel=True)
def reduc_operation_numba_parallel(A):
    s = 0.0
    for i in prange(A.size):
        s += A[i]
    return s

# Warm Up
reduc_operation_numba(X)
reduc_operation_numba_parallel(X)


24997563.110118635

In [9]:
tiempo = %timeit -r 2 -o -q reduc_operation_numba(X)

print("Time taken by reduction operation using Numba:", tiempo)
print("Now, the result using Numba:", reduc_operation_numba(X), "\n")


Time taken by reduction operation using Numba: 49.4 ms ± 229 μs per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using Numba: 24997563.11012026 



In [10]:
tiempo = %timeit -r 2 -o -q reduc_operation_numba_parallel(X)

print("Time taken by reduction operation using Numba paralela:", tiempo)
print("Now, the result using Numba paralela:", reduc_operation_numba_parallel(X), "\n")


Time taken by reduction operation using Numba paralela: 12.8 ms ± 1.34 ms per loop (mean ± std. dev. of 2 runs, 100 loops each)
Now, the result using Numba paralela: 24997563.110118635 



### Salida para 10^8:

Array size = 100000000

CPUs       = 1

OMP threads= 1

Time taken by reduction operation using a function: 8.57 s ± 776 μs per loop (mean ± std. dev. of 2 runs, 1 loop each)
And the result of the sum of numbers in the range [0, value) is: 25000197.54055075

Time taken by reduction operation using numpy.sum(): 38.9 ms ± 23.5 μs per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using numpy.sum(): 25000197.54054754 
 
Time taken by reduction operation using multiprocessing: 10.698 s
Now, the result using multiprocessing: 25000197.54055075 

Time taken by reduction operation using Numba: 59.4 ms ± 20 μs per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using Numba: 25000197.54055075 

Time taken by reduction operation using Numba paralela: 68.2 ms ± 1.3 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using Numba paralela: 25000197.540547695 

Array size = 100000000

CPUs       = 2

OMP threads= 2

Time taken by reduction operation using a function: 8.56 s ± 370 μs per loop (mean ± std. dev. of 2 runs, 1 loop each)
And the result of the sum of numbers in the range [0, value) is: 25001410.222938377

Time taken by reduction operation using numpy.sum(): 39.2 ms ± 102 μs per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using numpy.sum(): 25001410.22294169 
 
Time taken by reduction operation using multiprocessing: 5.628 s
Now, the result using multiprocessing: 25001410.222945638 

Time taken by reduction operation using Numba: 59.3 ms ± 2.48 μs per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using Numba: 25001410.222938377 

Time taken by reduction operation using Numba paralela: 75.4 ms ± 981 μs per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using Numba paralela: 25001410.222941823 

Array size = 100000000

CPUs       = 4

OMP threads= 4

Time taken by reduction operation using a function: 8.69 s ± 2.7 ms per loop (mean ± std. dev. of 2 runs, 1 loop each)
And the result of the sum of numbers in the range [0, value) is: 25000260.85754782

Time taken by reduction operation using numpy.sum(): 38.5 ms ± 202 μs per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using numpy.sum(): 25000260.857552096 
 
Time taken by reduction operation using multiprocessing: 3.165 s
Now, the result using multiprocessing: 25000260.85755286 

Time taken by reduction operation using Numba: 59.3 ms ± 21 μs per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using Numba: 25000260.85754782 

Time taken by reduction operation using Numba paralela: 73 ms ± 5.65 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using Numba paralela: 25000260.857552037 

Array size = 100000000

CPUs       = 8

OMP threads= 8

Time taken by reduction operation using a function: 8.44 s ± 13.7 ms per loop (mean ± std. dev. of 2 runs, 1 loop each)
And the result of the sum of numbers in the range [0, value) is: 25004247.95931078

Time taken by reduction operation using numpy.sum(): 32 ms ± 3.77 μs per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using numpy.sum(): 25004247.959315047 
 
Time taken by reduction operation using multiprocessing: 2.012 s
Now, the result using multiprocessing: 25004247.959315114 

Time taken by reduction operation using Numba: 58.6 ms ± 379 μs per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using Numba: 25004247.95931078 

Time taken by reduction operation using Numba paralela: 53.2 ms ± 1.28 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using Numba paralela: 25004247.95931507

### Salida para 10^9:

Array size = 1000000000

CPUs       = 1

OMP threads= 1

Time taken by reduction operation using a function: 8.77 s ± 5.69 ms per loop (mean ± std. dev. of 2 runs, 1 loop each)
And the result of the sum of numbers in the range [0, value) is: 24997160.88784177

Time taken by reduction operation using numpy.sum(): 32.5 ms ± 436 μs per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using numpy.sum(): 24997160.887841225 
 
Time taken by reduction operation using multiprocessing: 17.652 s
Now, the result using multiprocessing: 24997160.88784177 

Time taken by reduction operation using Numba: 57.7 ms ± 8.14 μs per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using Numba: 24997160.88784177 

Time taken by reduction operation using Numba paralela: 16.2 ms ± 187 μs per loop (mean ± std. dev. of 2 runs, 100 loops each)
Now, the result using Numba paralela: 24997160.887841176 

Array size = 1000000000

CPUs       = 2

OMP threads= 2

Time taken by reduction operation using a function: 8.46 s ± 1.43 ms per loop (mean ± std. dev. of 2 runs, 1 loop each)
And the result of the sum of numbers in the range [0, value) is: 24999549.26423816

Time taken by reduction operation using numpy.sum(): 33.7 ms ± 1.01 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using numpy.sum(): 24999549.264228523 
 
Time taken by reduction operation using multiprocessing: 9.226 s
Now, the result using multiprocessing: 24999549.264223583 

Time taken by reduction operation using Numba: 136 ms ± 8.5 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using Numba: 24999549.26423816 

Time taken by reduction operation using Numba paralela: 16.2 ms ± 95.9 μs per loop (mean ± std. dev. of 2 runs, 100 loops each)
Now, the result using Numba paralela: 24999549.264228538 

Array size = 1000000000

CPUs       = 4

OMP threads= 4

Time taken by reduction operation using a function: 8.58 s ± 6.13 ms per loop (mean ± std. dev. of 2 runs, 1 loop each)
And the result of the sum of numbers in the range [0, value) is: 24998443.550749265

Time taken by reduction operation using numpy.sum(): 32.1 ms ± 8.05 μs per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using numpy.sum(): 24998443.550745845 
 
Time taken by reduction operation using multiprocessing: 7.178 s
Now, the result using multiprocessing: 24998443.550744016 

Time taken by reduction operation using Numba: 71.7 ms ± 12.4 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using Numba: 24998443.550749265 

Time taken by reduction operation using Numba paralela: 50.4 ms ± 17.3 ms per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using Numba paralela: 24998443.550745755 

Array size = 1000000000

CPUs       = 8

OMP threads= 8

Time taken by reduction operation using a function: 8.57 s ± 153 μs per loop (mean ± std. dev. of 2 runs, 1 loop each)
And the result of the sum of numbers in the range [0, value) is: 25000736.43982481

Time taken by reduction operation using numpy.sum(): 34.2 ms ± 551 μs per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using numpy.sum(): 25000736.439827453 
 
Time taken by reduction operation using multiprocessing: 5.558 s
Now, the result using multiprocessing: 25000736.439829096 

Time taken by reduction operation using Numba: 58.7 ms ± 372 μs per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using Numba: 25000736.43982481 

Time taken by reduction operation using Numba paralela: 54.2 ms ± 948 μs per loop (mean ± std. dev. of 2 runs, 10 loops each)
Now, the result using Numba paralela: 25000736.439827446 

## Análisis de resultados

El código **Python secuencial** presenta tiempos prácticamente constantes, del orden de varios segundos, independientemente del tamaño del problema y del número de núcleos. Esto se debe a que la operación se realiza mediante un bucle en Python que no aprovecha el paralelismo y cuyo rendimiento está dominado por el overhead del intérprete.

La función **numpy.sum()** actúa como referencia optimizada. Sus tiempos se mantienen en el rango de decenas de milisegundos tanto para 10^8 como para 10^9 elementos, ya que está implementada en código nativo altamente optimizado. En este caso, el rendimiento está limitado principalmente por el acceso a memoria, por lo que aumentar el tamaño del array no incrementa proporcionalmente el tiempo de ejecución.

El uso de **multiprocessing** permite reducir el tiempo de ejecución al aumentar el número de núcleos, tanto para 10^8 como para 10^9 elementos. Sin embargo, el escalado no es lineal debido al overhead asociado a la creación de procesos y a la comunicación de datos entre ellos. Este efecto es especialmente notable para un único núcleo, donde el uso de multiprocessing puede incluso empeorar el rendimiento respecto a la versión secuencial en Python puro, estos resultados están en el orden de segundos.

La versión con **Numba y @njit** reduce drásticamente el tiempo de ejecución frente al código Python original, obteniendo tiempos del mismo orden de magnitud que numpy.sum(), sin llegar a mejorarlos. Al igual que en este último caso, el rendimiento está limitado por el acceso a memoria, lo que explica que los tiempos no escalen de forma clara al aumentar el tamaño del problema.

Finalmente, la versión con **@njit(parallel=True)** no muestra una mejora clara al aumentar el número de núcleos, y su rendimiento no escala como cabría esperar a priori. Aunque en algunos casos (con uno o dos núcleos para 10^9) ofrece los mejores tiempos, al aumentar el número de CPUs la mejora se estanca o incluso se reduce. De nuevo, se encuentra limitado por el acceso a memoria: todos los núcleos deben leer el array completo y compiten por el mismo recurso. En estas condiciones, añadir más paralelismo no implica necesariamente una reducción del tiempo de ejecución y puede introducir un overhead adicional.
