# AbdelRahman Adel AbdelFattah
## 17012296
### Initializing the code

Downloading PyCUDA

In [None]:
pip install pycuda

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycuda
  Downloading pycuda-2022.2.2.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytools>=2011.2
  Downloading pytools-2022.1.14.tar.gz (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.6/74.6 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pycuda, pytools
  Building wheel for pycuda (pyproject.

In [None]:
import numpy as np
from pycuda import compiler, gpuarray, tools
import pycuda.driver as cuda
import pycuda.autoinit
import time

The Kernel code that will use the GPU directly.
GPU functions.

In [None]:
kernel_code = """
__global__ void multi_gpu(int matrixsize,float *a, float *b, float *c)
{
    int tx = blockDim.x*blockIdx.x + threadIdx.x;
    int ty = blockDim.y*blockIdx.y + threadIdx.y;

    if((ty <matrixsize) && (tx < matrixsize)){
      float Pvalue = 0;
      for(int k=0; k<matrixsize;++k){
        float Aelement = a[ty*matrixsize +k];
        float Belement = b[k*matrixsize +tx];
        Pvalue += Aelement * Belement;
      }
      c[ty * matrixsize + tx] = Pvalue;
    }
}

__global__ void add_gpu(int matrixsize,float *a, float *b, float *c)
{
    int tx = blockDim.x*blockIdx.x + threadIdx.x;
    int ty = blockDim.y*blockIdx.y + threadIdx.y;

    if((ty <matrixsize) && (tx < matrixsize)){
      float Aelement = a[ty * matrixsize + tx];
      float Belement = b[ty * matrixsize + tx];
      c[ty * matrixsize + tx] = Aelement + Belement;
    }
}
"""

Functions for CPU

In [None]:
def add_cpu(matrix_a, matrix_b):
  result = np.zeros((matrix_a.shape[0], matrix_a.shape[1]))
  for i in range(matrix_a.shape[0]):
    for j in range(matrix_a.shape[1]):
      result[i][j] = matrix_a[i][j] + matrix_b[i][j]
  return result

In [None]:
def multi_cpu(matrix_a, matrix_b):
  result = np.zeros((matrix_a.shape[0], matrix_a.shape[1]))
  for i in range(matrix_a.shape[0]):
    for j in range(matrix_b.shape[1]):
      for k in range(matrix_b.shape[0]):
        result[i][j] += matrix_a[i][k] * matrix_b[k][j];
  return result

In [None]:
mod = compiler.SourceModule(kernel_code)

multi_gpu = mod.get_function("multi_gpu")
add_gpu = mod.get_function("add_gpu")

BLOCK_SIZE = 32
MAX_NUM = 1024

MATRIX_SIZE = 3
if MATRIX_SIZE%BLOCK_SIZE != 0:
    grid=(MATRIX_SIZE//BLOCK_SIZE+1,MATRIX_SIZE//BLOCK_SIZE+1,1)
else:
    grid=(MATRIX_SIZE//BLOCK_SIZE,MATRIX_SIZE//BLOCK_SIZE,1)
matrixsize=MATRIX_SIZE

a_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)
b_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)
c_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)

a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)
c_gpu = gpuarray.to_gpu(c_cpu)

# Calculating different sizes and calculating the time.

## Matrix size = 3

In [None]:
MATRIX_SIZE = 3
if MATRIX_SIZE%BLOCK_SIZE != 0:
    grid=(MATRIX_SIZE//BLOCK_SIZE+1,MATRIX_SIZE//BLOCK_SIZE+1,1)
else:
    grid=(MATRIX_SIZE//BLOCK_SIZE,MATRIX_SIZE//BLOCK_SIZE,1)
matrixsize=MATRIX_SIZE

a_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)
b_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)
c_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)

a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)
c_gpu = gpuarray.to_gpu(c_cpu)

In [None]:
# CPU
start = time.time()
temp1_cpu = a_cpu + b_cpu
temp2_cpu = np.dot(a_cpu, temp1_cpu)
temp3_cpu = temp2_cpu + c_cpu
end = time.time()

a_cpu = temp3_cpu
print(f'Finished in: {end-start}')

Finished in: 0.0037767887115478516


In [None]:
# GPU
temp1_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)
temp2_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)
temp3_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)

start = time.time()
add_gpu(np.uint32(matrixsize),
    a_gpu, b_gpu,
    temp1_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
multi_gpu(np.uint32(matrixsize),
    a_gpu, temp1_gpu,
    temp2_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
add_gpu(np.uint32(matrixsize),
    temp2_gpu, c_gpu,
    temp3_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
end = time.time()

a_gpu = temp3_gpu
print(f'Finished in: {end-start}')

Finished in: 0.0012199878692626953


## Matrix size = 10

In [None]:
MATRIX_SIZE = 10

if MATRIX_SIZE%BLOCK_SIZE != 0:
    grid=(MATRIX_SIZE//BLOCK_SIZE+1,MATRIX_SIZE//BLOCK_SIZE+1,1)
else:
    grid=(MATRIX_SIZE//BLOCK_SIZE,MATRIX_SIZE//BLOCK_SIZE,1)
matrixsize=MATRIX_SIZE

a_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)
b_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)
c_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)

a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)
c_gpu = gpuarray.to_gpu(c_cpu)

In [None]:
# CPU
start = time.time()
temp1_cpu = a_cpu + b_cpu
temp2_cpu = np.dot(a_cpu, temp1_cpu)
temp3_cpu = temp2_cpu + c_cpu
end = time.time()

a_cpu = temp3_cpu
print(f'Finished in: {end-start}')

Finished in: 0.0002346038818359375


In [None]:
# GPU
temp1_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)
temp2_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)
temp3_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)

start = time.time()
add_gpu(np.uint32(matrixsize),
    a_gpu, b_gpu,
    temp1_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
multi_gpu(np.uint32(matrixsize),
    a_gpu, temp1_gpu,
    temp2_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
add_gpu(np.uint32(matrixsize),
    temp2_gpu, c_gpu,
    temp3_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
end = time.time()

a_gpu = temp3_gpu
print(f'Finished in: {end-start}')

Finished in: 0.0003705024719238281


## Matrix size = 100

In [None]:
MATRIX_SIZE = 100

if MATRIX_SIZE%BLOCK_SIZE != 0:
    grid=(MATRIX_SIZE//BLOCK_SIZE+1,MATRIX_SIZE//BLOCK_SIZE+1,1)
else:
    grid=(MATRIX_SIZE//BLOCK_SIZE,MATRIX_SIZE//BLOCK_SIZE,1)
matrixsize=MATRIX_SIZE

a_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)
b_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)
c_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)

a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)
c_gpu = gpuarray.to_gpu(c_cpu)

In [None]:
# CPU
start = time.time()
temp1_cpu = a_cpu + b_cpu
temp2_cpu = np.dot(a_cpu, temp1_cpu)
temp3_cpu = temp2_cpu + c_cpu
end = time.time()

a_cpu = temp3_cpu
print(f'Finished in: {end-start}')

Finished in: 0.005835771560668945


In [None]:
# GPU
temp1_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)
temp2_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)
temp3_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)

start = time.time()
add_gpu(np.uint32(matrixsize),
    a_gpu, b_gpu,
    temp1_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
multi_gpu(np.uint32(matrixsize),
    a_gpu, temp1_gpu,
    temp2_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
add_gpu(np.uint32(matrixsize),
    temp2_gpu, c_gpu,
    temp3_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
end = time.time()

a_gpu = temp3_gpu
print(f'Finished in: {end-start}')

Finished in: 0.0005102157592773438


## Matrix Size = 1,000

In [None]:
MATRIX_SIZE = 1_000

if MATRIX_SIZE%BLOCK_SIZE != 0:
    grid=(MATRIX_SIZE//BLOCK_SIZE+1,MATRIX_SIZE//BLOCK_SIZE+1,1)
else:
    grid=(MATRIX_SIZE//BLOCK_SIZE,MATRIX_SIZE//BLOCK_SIZE,1)
matrixsize=MATRIX_SIZE

a_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)
b_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)
c_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)

a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)
c_gpu = gpuarray.to_gpu(c_cpu)

  globals().clear()


In [None]:
# CPU
start = time.time()
temp1_cpu = a_cpu + b_cpu
temp2_cpu = np.dot(a_cpu, temp1_cpu)
temp3_cpu = temp2_cpu + c_cpu
end = time.time()

a_cpu = temp3_cpu
print(f'Finished in: {end-start}')

Finished in: 0.06137251853942871


In [None]:
# GPU
temp1_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)
temp2_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)
temp3_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)

start = time.time()
add_gpu(np.uint32(matrixsize),
    a_gpu, b_gpu,
    temp1_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
multi_gpu(np.uint32(matrixsize),
    a_gpu, temp1_gpu,
    temp2_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
add_gpu(np.uint32(matrixsize),
    temp2_gpu, c_gpu,
    temp3_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
end = time.time()

a_gpu = temp3_gpu
print(f'Finished in: {end-start}')

Finished in: 0.0005323886871337891


## Matrix Size = 5,000

In [None]:
MATRIX_SIZE = 5_000

if MATRIX_SIZE%BLOCK_SIZE != 0:
    grid=(MATRIX_SIZE//BLOCK_SIZE+1,MATRIX_SIZE//BLOCK_SIZE+1,1)
else:
    grid=(MATRIX_SIZE//BLOCK_SIZE,MATRIX_SIZE//BLOCK_SIZE,1)
matrixsize=MATRIX_SIZE

a_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)
b_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)
c_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)

a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)
c_gpu = gpuarray.to_gpu(c_cpu)

  globals().clear()


In [None]:
# CPU
start = time.time()
temp1_cpu = a_cpu + b_cpu
temp2_cpu = np.dot(a_cpu, temp1_cpu)
temp3_cpu = temp2_cpu + c_cpu
end = time.time()

a_cpu = temp3_cpu
print(f'Finished in: {end-start}')

  globals().clear()


Finished in: 3.5989608764648438


In [None]:
# GPU
temp1_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)
temp2_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)
temp3_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)

start = time.time()
add_gpu(np.uint32(matrixsize),
    a_gpu, b_gpu,
    temp1_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
multi_gpu(np.uint32(matrixsize),
    a_gpu, temp1_gpu,
    temp2_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
add_gpu(np.uint32(matrixsize),
    temp2_gpu, c_gpu,
    temp3_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
end = time.time()

a_gpu = temp3_gpu
print(f'Finished in: {end-start}')

Finished in: 0.0004749298095703125


## Matrix Size = 10,000

In [None]:
MATRIX_SIZE = 10_000

if MATRIX_SIZE%BLOCK_SIZE != 0:
    grid=(MATRIX_SIZE//BLOCK_SIZE+1,MATRIX_SIZE//BLOCK_SIZE+1,1)
else:
    grid=(MATRIX_SIZE//BLOCK_SIZE,MATRIX_SIZE//BLOCK_SIZE,1)
matrixsize=MATRIX_SIZE

a_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)
b_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)
c_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)

a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)
c_gpu = gpuarray.to_gpu(c_cpu)

  globals().clear()
  globals().clear()


In [None]:
# CPU
start = time.time()
temp1_cpu = a_cpu + b_cpu
temp2_cpu = np.dot(a_cpu, temp1_cpu)
temp3_cpu = temp2_cpu + c_cpu
end = time.time()

a_cpu = temp3_cpu
print(f'Finished in: {end-start}')

  globals().clear()


Finished in: 30.343129634857178


In [None]:
# GPU
temp1_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)
temp2_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)
temp3_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)

start = time.time()
add_gpu(np.uint32(matrixsize),
    a_gpu, b_gpu,
    temp1_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
multi_gpu(np.uint32(matrixsize),
    a_gpu, temp1_gpu,
    temp2_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
add_gpu(np.uint32(matrixsize),
    temp2_gpu, c_gpu,
    temp3_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
end = time.time()

a_gpu = temp3_gpu
print(f'Finished in: {end-start}')

Finished in: 0.000492095947265625


## Matrix Size = 15,000

In [None]:
MATRIX_SIZE = 15_000

if MATRIX_SIZE%BLOCK_SIZE != 0:
    grid=(MATRIX_SIZE//BLOCK_SIZE+1,MATRIX_SIZE//BLOCK_SIZE+1,1)
else:
    grid=(MATRIX_SIZE//BLOCK_SIZE,MATRIX_SIZE//BLOCK_SIZE,1)
matrixsize=MATRIX_SIZE

a_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)
b_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)
c_cpu = np.random.randint(MAX_NUM,size=(MATRIX_SIZE, MATRIX_SIZE)).astype(np.float32)

a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)
c_gpu = gpuarray.to_gpu(c_cpu)

  globals().clear()
  globals().clear()


In [None]:
# CPU
start = time.time()
temp1_cpu = a_cpu + b_cpu
temp2_cpu = np.dot(a_cpu, temp1_cpu)
temp3_cpu = temp2_cpu + c_cpu
end = time.time()

a_cpu = temp3_cpu
print(f'Finished in: {end-start}')

  globals().clear()


Finished in: 91.37070488929749


In [None]:
# GPU
temp1_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)
temp2_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)
temp3_gpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.float32)

start = time.time()
add_gpu(np.uint32(matrixsize),
    a_gpu, b_gpu,
    temp1_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
multi_gpu(np.uint32(matrixsize),
    a_gpu, temp1_gpu,
    temp2_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
add_gpu(np.uint32(matrixsize),
    temp2_gpu, c_gpu,
    temp3_gpu,
    grid=grid,
    block = (BLOCK_SIZE, BLOCK_SIZE, 1))
end = time.time()

a_gpu = temp3_gpu
print(f'Finished in: {end-start}')

Finished in: 0.0005667209625244141
