<a href="https://colab.research.google.com/github/arbinydv/Machine-Learning/blob/main/W1_E1_Solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
# GPU Computing Exercise 1 Solutions

In [23]:
import cupy as cp
import numpy as np

In [31]:
# Task 1 Practice: every thread writes its own global index into the output vector.

thread_idx_kernel = cp.RawKernel(r'''
extern "C" __global__
void thread_idx(float* out, int vector_size)
{
  // Global thread index: block offset plus position inside the block.
  int indx = blockIdx.x * blockDim.x + threadIdx.x;

  // The grid is rounded up to whole blocks, so trailing threads have no
  // element to write and must not touch memory past the end of out.
  if (indx < vector_size)
  {
    out[indx] = indx;
  }
}
''', 'thread_idx')

# assign the vector

vector_sz = 1000
output = cp.zeros(vector_sz, dtype=cp.float32)

thread_pr_blk = 256
# Ceiling division: enough blocks to cover every element
# (4 blocks = 1024 threads for 1000 elements, hence the guard in the kernel).
blocks_per_grid = (vector_sz + thread_pr_blk - 1) // thread_pr_blk

# Kernel arguments are passed as a tuple; np.int32 matches the kernel's
# 32-bit `int vector_size` parameter.
thread_idx_kernel((blocks_per_grid, 1, 1), (thread_pr_blk, 1, 1), (output, np.int32(vector_sz)))

print("THe output of first 20 elements is:", output[:20].get())


THe output of first 20 elements is: [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18. 19.]


In [30]:
# TASK 2 ==> A kernel which takes two vectors A and B and adds them together to form a vector C.

vector_adder_kernel = cp.RawKernel(r'''
extern "C" __global__
void vector_adder(float* A, float* B, float* C, int vector_size)
{
  int indx = blockIdx.x * blockDim.x + threadIdx.x;

  if (indx < vector_size)
  {
    C[indx] = A[indx] + B[indx];
  }
}
''', 'vector_adder')


# vector initialization
vector_sz = 50
# NOTE(review): A is all zeros, so C ends up equal to B and the validation below
# cannot tell C = A + B apart from C = B; consider random values for A as well.
A = cp.zeros(vector_sz, dtype=cp.float32)
B = cp.random.rand(vector_sz, dtype=cp.float32)
C = cp.zeros(vector_sz, dtype=cp.float32)

# Kernel Call

threads_per_block = 256
# Ceiling division so the grid covers every element (one block for 50 elements);
# the kernel's bounds check idles the surplus threads.
blocks_per_grid = (vector_sz + threads_per_block - 1) // threads_per_block

# np.int32 matches the kernel's 32-bit `int vector_size` parameter.
vector_adder_kernel((blocks_per_grid, 1, 1), (threads_per_block, 1, 1), (A, B, C, np.int32(vector_sz)))

# Display
print(C)

# Validates the addition
# cp.allclose compares the device arrays explicitly instead of relying on
# NumPy dispatching the call to CuPy.
print("Validating C = A+B ==>", cp.allclose(C, A + B))


[0.06040776 0.3298758  0.85877097 0.37285647 0.797116   0.85741156
 0.39641282 0.90738106 0.3240584  0.60581505 0.2186684  0.10242421
 0.8267793  0.07266703 0.09956244 0.270448   0.64732814 0.30453515
 0.97300786 0.82268745 0.49947056 0.2756813  0.15297419 0.55198836
 0.02170919 0.4193786  0.82551163 0.42315635 0.60463315 0.46792215
 0.6431096  0.6963408  0.9305247  0.10266901 0.46572918 0.1203765
 0.6100591  0.6488709  0.3160373  0.3352976  0.7026568  0.40861544
 0.38604563 0.35752347 0.3481771  0.03127746 0.50698954 0.77525926
 0.08266708 0.9194981 ]
Validating C = A+B ==> True


In [29]:
# Task 3: Vector Add + Multiply  ==> D = (A + B) * C, element-wise on int32 vectors.

vector_add_mult_kernel= cp.RawKernel(r'''
extern "C" __global__
void vector_add_mult(int* A, int* B, int* C, int* D, int vector_size)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  if (index < vector_size)
  {
    // Keep the intermediate in int: a float temporary would compute the
    // product in single precision and lose exactness for large values.
    int temp = A[index] + B[index];
    D[index] = temp * C[index];
  }
}
''', 'vector_add_mult')

# Initialization

vector_sz = 10
A = cp.random.randint(0,100,vector_sz, dtype=cp.int32)
B = cp.random.randint(0,100,vector_sz, dtype=cp.int32)
C = cp.random.randint(0,100, vector_sz, dtype=cp.int32)
D = cp.zeros(vector_sz, dtype=cp.int32)

# Launch configuration

thread_per_block = 256
# Ceiling division so the grid covers every element (one block for 10 elements).
blocks_per_grid = (vector_sz + thread_per_block - 1) // thread_per_block

# calls the kernel
# CuPy passes a plain Python int as a 64-bit `long long`; np.int32 matches the
# kernel's 32-bit `int vector_size` parameter.
vector_add_mult_kernel((blocks_per_grid, 1, 1), (thread_per_block, 1, 1), (A, B, C, D, np.int32(vector_sz)))

# Display

print("Vector A: ", A)
print("Vector B: ", B)
print("Vector C: ", C)
print("Vector D: ", D)

print("add",(A+B))
print("mul",(A+B)*C)

# validates if the addition and multiplication true
# cp.allclose compares the device arrays explicitly instead of relying on
# NumPy dispatching the call to CuPy.
print("Validating D = (A+B)*C ==> ", cp.allclose(D, ( (A + B) * C)))


Vector A:  [72 89 41 51 20 82 36 18 30  3]
Vector B:  [90 85 25 28 77 67 19 30 75 21]
Vector C:  [89  6 81 28 35  9 60 91 77 64]
Vector D:  [14418  1044  5346  2212  3395  1341  3300  4368  8085  1536]
add [162 174  66  79  97 149  55  48 105  24]
mul [14418  1044  5346  2212  3395  1341  3300  4368  8085  1536]
Validating D = (A+B)*C ==>  True
