<a href="https://colab.research.google.com/github/bonomip/GPU/blob/master/gpu_progetto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

---
# PROGETTO CLUSTER
---

# 🎬 CUDA setup

In [None]:
!nvcc --version

In [None]:
!nvidia-smi

## [GPU Compute Capability](https://developer.nvidia.com/cuda-gpus)

## NVCC Plugin for Jupyter notebook

*Usage*:


*   Load Extension `%load_ext nvcc_plugin`
*   Mark a cell to be treated as cuda cell
`%%cuda --name example.cu --compile false`

**NOTE**: The cell must contain either code or comments to run successfully. It accepts two arguments. The first, `-n | --name`, is the name of either the CUDA source or the header file; the name must have the extension `.cu` or `.h`. The second, `-c | --compile` (default `false`), is a flag that specifies whether the cell is compiled and run right away. It might be useful if you're playing in the main function.

*  We are ready to run CUDA C/C++ code right in the Notebook. To do so, we need to explicitly tell the interpreter that we want to use the extension, by adding `%%cu` at the beginning of each cell with CUDA code.




In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git


In [None]:
%load_ext nvcc_plugin

In [None]:
%pip install --target=$nb_path pycuda

## ⛔️ NOT WORKING INSIDE CELLS pyCuda

In [None]:
#import os, sys
#from google.colab import drive

#try:
#  drive.mount('/content/drive')
#  nb_path = '/content/notebooks'
#  os.symlink("/content/drive/My Drive/Colab_Notebooks", nb_path)
#  sys.path.insert(0,nb_path)
#except FileExistsError:
#  print("")

Run this command only once

In [None]:
#!pip install --target=$nb_path pycuda

## 📦 Packages

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
import plotly.graph_objects as go
import plotly.express as px
import numpy
import numba
import time
  

def TicTocGenerator():
  """Yield the time elapsed between successive ``next()`` calls.

  The first value is the (near-zero) gap between the two clock reads taken
  when the generator is first advanced; every later value is the time that
  passed since the previous ``next()`` call.

  Yields:
    float: elapsed seconds, never negative.
  """
  # perf_counter is monotonic and high-resolution; time.time() follows the
  # wall clock and can jump backwards, which would yield negative intervals.
  tf = time.perf_counter()
  while True:
    ti = tf
    tf = time.perf_counter()
    yield tf - ti

# Shared generator instance consumed by tic() and toc().
TicToc = TicTocGenerator()

def toc(tempBool=True):
  """Advance the shared timer and report the interval since the last call.

  Args:
    tempBool: when truthy, print the elapsed time.

  Returns:
    The number of seconds yielded by the module-level ``TicToc`` generator.
  """
  elapsed = next(TicToc)
  if not tempBool:
    return elapsed
  print( "Elapsed time: %f seconds." %elapsed )
  return elapsed

def tic():
  """Start a new timing interval by consuming one reading without printing."""
  toc(tempBool=False)

# instance creation
def rand_blobs(n, d, k=3, std=1, rstate=None, standard=True, display=True, threeD=False):
  """Generate a random blob dataset and its pairwise Euclidean distances.

  Args:
    n: number of samples.
    d: number of features.
    k: number of blob centers.
    std: standard deviation of each blob.
    rstate: random state forwarded to ``make_blobs``.
    standard: when true, standardize the features with ``StandardScaler``.
    display: when true, plot the samples with ``plot_blobs``.
    threeD: forwarded to ``plot_blobs`` to request a 3-D plot.

  Returns:
    Tuple ``(X, l, W, n)``: samples, blob labels, distance matrix and ``n``.
  """
  X, l = make_blobs(n_samples=n, n_features=d, centers=k, cluster_std=std, random_state=rstate)
  if standard:
    X = StandardScaler().fit_transform(X)

  # distance of every sample to every other sample
  W = euclidean_distances(X, X)

  if display:
    plot_blobs(X, l, threeD)
  return X, l, W, n

def histogram(theta, nbins=None, verb=True):
  """Histogram angles over the interval [0, 2*pi].

  Args:
    theta: sequence of angles.
    nbins: number of evenly spaced bin *edges*; defaults to ``len(theta)``.
      Because these are edges, the histogram has ``nbins - 1`` bins.
    verb: when true, print the data size and ``nbins``.

  Returns:
    Tuple ``(hist, bins)`` as returned by ``numpy.histogram``.
  """
  nbins = len(theta) if nbins is None else nbins
  edges = numpy.linspace(0, 2*numpy.pi, nbins)
  counts, edges = numpy.histogram(theta, edges)

  if not verb:
    return counts, edges

  print('Data size : %d' %len(theta))
  print('Num bins  : %d' %nbins)
  return counts, edges

def plot_circle(theta, l=None, radius=500):
  """Plot angles as points on the unit circle.

  Args:
    theta: array of angles; each one is drawn at ``(cos(theta), sin(theta))``.
    l: optional array of labels; when given, one trace is added per
      distinct label.
    radius: value used for both the width and the height of the figure.
  """
  xs = numpy.cos(theta)
  ys = numpy.sin(theta)

  fig = go.Figure()
  # outline of the unit circle
  fig.add_shape(type="circle", xref="x", yref="y", x0=-1, y0=-1, x1=1, y1=1, line=dict(color="black", width=1))

  if l is not None:
    # the position of each distinct label doubles as its color value
    for color, label in enumerate(numpy.unique(l)):
      members = numpy.where(label == l)
      fig.add_trace(go.Scatter(x=xs[members], y=ys[members],
          mode='markers',
          marker_symbol='circle',
          marker_color=color,
          marker_line_color=color,
          marker_line_width=0,
          marker_size=10))
  else:
    fig.add_trace(go.Scatter(x=xs, y=ys,
          mode='markers',
          marker_symbol='circle',
          marker_size=10))

  # leave a small margin around the circle
  limit = 1.05
  fig.update_xaxes(title='', range=[-limit, limit])
  fig.update_yaxes(title='', range=[-limit, limit])
  fig.update_layout(title='clusters', width=radius, height=radius)
  fig.show()

def plot_hist(hist, bins, mode=0, smooth_wlen=None):
  """Plot histogram values against their bins as a scatter trace.

  Args:
    hist: values plotted on the y axis.
    bins: values plotted on the x axis.
    mode: 0 draws lines, 1 draws markers, anything else draws both.
    smooth_wlen: when not None, ``hist`` is first passed through ``smooth``
      with this window length and a 'hanning' window.
  """
  mode_line = 'lines' if mode == 0 else ('markers' if mode == 1 else 'lines+markers')

  # NOTE(review): `smooth` is not defined in the visible part of this file.
  if smooth_wlen is not None:
    hist = smooth(hist, window_len=smooth_wlen, window='hanning')

  trace = go.Scatter(x=bins, y=hist, mode=mode_line)
  figh = go.Figure(data=trace)
  figh.show()

def plot_blobs(X, labels=None, threeD=False, doPCA=True, sizex=1):
  """Scatter-plot the samples in 2-D or 3-D.

  Args:
    X: sample matrix, one row per sample.
    labels: optional per-sample values used to color the points.
    threeD: plot three components instead of two.
    doPCA: when true, project ``X`` with PCA onto the plotted number of
      components; otherwise plot the first two/three columns of ``X``.
    sizex: scale factor applied to the 800x800 figure size.
  """
  n_dims = 3 if threeD else 2

  # Honor doPCA in both the 2-D and the 3-D case (the 3-D branch used to test
  # the PCA class itself, which is always truthy).
  if doPCA:
    components = PCA(n_components=n_dims).fit_transform(X)
  else:
    components = X[:, 0:n_dims]

  axis_names = {str(axis): 'PC %d' % (axis + 1) for axis in range(n_dims)}
  # only pass `color` when labels were supplied
  extra = {} if labels is None else {'color': labels}

  if threeD:
    fig = px.scatter_3d(components, x=0, y=1, z=2, title='Blobs 3D',
                        labels=axis_names, **extra)
  else:
    fig = px.scatter(components, x=0, y=1, title='Blobs 2D',
                     labels=axis_names, **extra)

  fig.update_layout(
    width = 800*sizex,
    height = 800*sizex,
    title = "fixed-ratio axes")
  fig.update_yaxes(
    scaleanchor = "x",
    scaleratio = 1)
  fig.show()


def kmeans(X, k, niter=300, rstate=0, n_init=10):
  """Cluster ``X`` with scikit-learn's KMeans.

  Args:
    X: sample matrix, one row per sample.
    k: number of clusters.
    niter: maximum number of iterations per run.
    rstate: random state forwarded to ``KMeans``.
    n_init: number of initializations forwarded to ``KMeans``.

  Returns:
    Tuple ``(labels_, n_iter_, inertia_)`` read from the fitted estimator.
  """
  # KMeans is not among the file's imports; import it here so the call below
  # does not raise NameError.
  from sklearn.cluster import KMeans

  # named `model` so the local does not shadow this function's own name
  model = KMeans(n_clusters=k, n_init=n_init, max_iter=niter, random_state=rstate)
  model.fit(X)

  return model.labels_, model.n_iter_, model.inertia_

# 🪣 CSN Algorithms

## 📟 Legacy Code

### CPU


In [None]:
@numba.jit
def CSN_LEGACY(W, eps=0.01, theta0=None, normalize=True, seed=1):
  """Sweep over the angles, re-estimating each one, until none moves by more than eps.

  Args:
    W: matrix with one row per angle (assumes W is square, n x n — TODO confirm).
    eps: an angle counts as changed when its measured change exceeds this.
    theta0: optional initial angles; when given it is used directly and
      therefore modified in place.
    normalize: when true, W is divided by ``numpy.linalg.norm(W)``.
    seed: seed for ``numpy.random`` when the angles are drawn at random.

  Returns:
    The array of angles. The number of sweeps is printed as ``rounds=...``.
  """

  # general vars
  PI = numpy.pi #numpy.pi
  n = W.shape[0]

  # parameter handling
  if normalize:
    W = W / numpy.linalg.norm(W) # default matrix norm of numpy.linalg.norm
  if theta0 is None: # no initial angles supplied
    numpy.random.seed(seed)
    theta = 2*PI*numpy.random.rand(n)  # init. values in [0, 2*PI] # candidate for cuRAND
  else:
    theta = theta0

  # preliminary computations: A = W.cos(theta), B = W.sin(theta)
  sin_t = numpy.sin(theta)
  cos_t = numpy.cos(theta)
  A = numpy.dot(W, cos_t)
  B = numpy.dot(W, sin_t)

  # main loop: repeat full sweeps while at least one angle changed
  ok = True
  rounds = 0
  while ok:
    ok = False
    rounds += 1
    nchanges = 0  # counted below but not read afterwards
    # loop on angles
    for i in range(n):
      old = theta[i]

      # change i-th theta
      theta[i] = numpy.arctan(B[i]/A[i])    # within [-PI/2, PI/2]

      # shift the arctan result by PI when A[i] >= 0, or by 2*PI when
      # A[i] < 0 and B[i] > 0
      if A[i] >= 0:
        theta[i] += PI
      elif B[i] > 0:
        theta[i] += 2*PI

      # update A & B in place: add row i of W times the change in cos / sin
      A += numpy.multiply(W[i,:], numpy.repeat(numpy.cos(theta[i]) - numpy.cos(old), n))
      B += numpy.multiply(W[i,:], numpy.repeat(numpy.sin(theta[i]) - numpy.sin(old), n)) 

      # NOTE(review): the second term only shortens the distance when the
      # angle wraps from near 2*PI to near 0; a wrap in the other direction is
      # measured as a large change — confirm this is intended.
      if min(abs(old-theta[i]),abs(2*PI-old+theta[i])) > eps:
        ok = True
        nchanges += 1

  print("rounds="+str(rounds))

  return theta

### GPU


In [None]:
def CSN1_legacy(W, eps=0.01, theta0=None, verb=0):
  """Legacy variant of the angle sweep that delegates the A/B update to ``dot_GPU``.

  Args:
    W: matrix with one row per angle; it is normalized by
      ``numpy.linalg.norm(W)`` and cast to float32.
    eps: an angle counts as changed when its measured change exceeds this.
    theta0: optional initial angles; when given it is used directly and
      therefore modified in place.
    verb: when greater than 0, print the size and the number of rounds.

  Returns:
    The array of angles.
  """

  n = W.shape[0]
  W = (W / numpy.linalg.norm(W)).astype('float32')
  PI = numpy.pi
  if theta0 is None:
    theta = 2*PI*numpy.random.rand(n)  # init. values in [0, 2*PI]
  else:
    theta = theta0

  TxB = 32      # number of threads in a block
  BxG = (n + (TxB - 1)) // TxB    # number of thread blocks in the grid (ceil(n / TxB))
 
  sin_t = numpy.sin(theta)
  cos_t = numpy.cos(theta)
  A = numpy.dot(W, cos_t).astype('float32')
  B = numpy.dot(W, sin_t).astype('float32')

  # main loop: repeat full sweeps while at least one angle changed
  ok = True
  rounds = 0
  while ok:
    ok = False
    rounds += 1
    nchanges = 0  # counted below but not read afterwards
    # loop on angles
    for i in range(n):
      old = theta[i]

      # change i-th theta
      theta[i] = numpy.arctan(B[i]/A[i])    # within [-PI/2, PI/2]
      if A[i] >= 0:
        theta[i] += PI
      elif B[i] > 0:
        theta[i] += 2*PI

      # update A & B using the GPU
      dc = numpy.cos(theta[i]) - numpy.cos(old)
      ds = numpy.sin(theta[i]) - numpy.sin(old)
      #func[BxG, TxB](A, B, W, dc, ds, i)
      # NOTE(review): dot_GPU is indexed with a launch configuration here, but
      # its @cuda.jit decorator is commented out in this file, and the row
      # index i is not passed to it — this call presumably cannot work as
      # written; confirm before reuse.
      dot_GPU[BxG, TxB](A, W, dc.astype('float32'), n)  # start the kernel
      dot_GPU[BxG, TxB](B, W, ds.astype('float32'), n)  # start the kernel

      if min(abs(old-theta[i]),abs(2*PI-old+theta[i])) > eps:
        ok = True
        nchanges += 1

  if verb > 0:
    print('  Size : %d' %n)
    print('Rounds : %d' %rounds)
 
  return theta

#@cuda.jit
def dot_GPU(A, W, d, n):
  """Unfinished sketch of a CUDA kernel for the A/B update (decorator disabled).

  NOTE(review): as written this does not modify ``A``: it accumulates
  ``A[tx] * d`` into a local and returns it. The parameter ``n`` is unused.
  """
  tx = cuda.threadIdx.x  # Thread id in a 1D block
  bw = cuda.blockDim.x   # Block width

  # shared memory
  sW = cuda.shared.array(bw, dtype=numpy.float32)

  # index inside the array
  pos = cuda.grid(1)
  if pos >= W.shape[0]:
    return

  gw = cuda.gridDim.x     # num blocks per grid

  # The dot product is chunked into dot products of TPB-long vectors.
  res = 0.
  for i in range(gw):
    # Preload data into shared memory
    # NOTE(review): the right-hand side is a whole row slice of W, stored
    # into a single shared-memory slot — confirm the intended element.
    sW[tx] = W[tx + i * bw,:]
    # Wait until all threads finish preloading
    cuda.syncthreads()
    # Accumulate; NOTE(review): the loop body reads neither sW nor j
    for j in range(bw):
        res += A[tx] * d
    # Wait until all threads finish computing
    cuda.syncthreads()

  return res

## 🪜 My Solutions


### CPU

In [None]:
import numpy

@numba.jit
def CSN_CPU(W, eps=0.01, seed=1):
  """Run the angle sweep on the CPU, delegating each angle update to ``loop_fun``.

  Args:
    W: matrix with one row per angle (assumes W is square, n x n — TODO confirm);
      it is divided by ``numpy.linalg.norm(W)`` before use.
    eps: threshold forwarded to ``loop_fun`` to decide whether an angle changed.
    seed: seed for ``numpy.random`` used to draw the initial angles.

  Returns:
    The array of angles. The number of sweeps is printed as ``rounds=...``.
  """
  n = W.shape[0]
  PI = numpy.pi
  
  W = W / numpy.linalg.norm(W) # default matrix norm of numpy.linalg.norm
  
  # initial angles in [0, 2*PI]
  numpy.random.seed(seed)
  theta = 2*PI*numpy.random.rand(n)
  
  sin_t = numpy.sin(theta)
  cos_t = numpy.cos(theta)
  
  # A = W.cos(theta), B = W.sin(theta); both are kept up to date by loop_fun
  A = numpy.dot(W, cos_t)
  B = numpy.dot(W, sin_t)

  ok = True
  rounds = 0

  # repeat full sweeps while loop_fun reports at least one changed angle
  while ok:
    ok = False
    rounds += 1
    nchanges = 0  # threaded through loop_fun but not read afterwards

    # loop on angles
    for i in range(n):
      theta, A, B, ok, nchanges = loop_fun(i, theta, A, B, eps, W, n, nchanges, PI, ok)

  print("rounds="+str(rounds))

  return theta

@numba.jit
def loop_fun(i, theta, A, B, eps, W, n, nchanges, PI, ok):
  """Re-estimate ``theta[i]`` and propagate the change to A and B.

  Args:
    i: index of the angle to update.
    theta: array of angles; ``theta[i]`` is overwritten.
    A, B: running vectors, updated through ``updateAB``.
    eps: the angle counts as changed when its measured change exceeds this.
    W: matrix whose row ``i`` drives the A/B update.
    n: forwarded to ``updateAB``.
    nchanges: running count of changed angles.
    PI: the constant pi.
    ok: running "something changed" flag.

  Returns:
    Tuple ``(theta, A, B, ok, nchanges)`` with ``ok`` set to True and
    ``nchanges`` incremented when the angle changed by more than ``eps``.
  """
  old = theta[i]

  # change i-th theta
  theta[i] = numpy.arctan(B[i]/A[i])    # within [-PI/2, PI/2]
  
  # shift the arctan result by PI when A[i] >= 0, or by 2*PI when
  # A[i] < 0 and B[i] > 0
  if A[i] >= 0:
    theta[i] += PI
  elif B[i] > 0:
    theta[i] += 2*PI

  # update A & B with row i of W times the change in cos / sin
  A, B = updateAB(A, B, W, i, theta, old, n)

  # NOTE(review): the second term only shortens the distance when the angle
  # wraps from near 2*PI to near 0 — confirm this is intended.
  if min(abs(old-theta[i]),abs(2*PI-old+theta[i])) > eps:
    ok = True
    nchanges += 1

  return theta, A, B, ok, nchanges

@numba.jit
def updateAB(A, B, W, i, theta, old, n):
  """Apply to A and B the change of ``theta[i]`` from ``old`` to its new value.

  Adds row ``i`` of W times ``cos(theta[i]) - cos(old)`` to A, and the
  matching sine term to B, both in place. ``n`` is not used.

  Returns:
    Tuple ``(A, B)``.
  """
  row = W[i,:]
  delta_cos = numpy.cos(theta[i]) - numpy.cos(old)
  delta_sin = numpy.sin(theta[i]) - numpy.sin(old)
  A += numpy.multiply(row, delta_cos)
  B += numpy.multiply(row, delta_sin)
  return A, B


In [None]:
import numpy

@numba.jit
def CSN_CPU_2(W, eps=0.01, seed=1):
  """CPU angle sweep that snapshots the angles and their sin/cos once per round.

  Args:
    W: matrix with one row per angle (assumes W is square, n x n — TODO confirm);
      it is divided by ``numpy.linalg.norm(W)`` before use.
    eps: threshold forwarded to ``loop_fun_2`` to decide whether an angle changed.
    seed: seed for ``numpy.random`` used to draw the initial angles.

  Returns:
    The array of angles. The number of sweeps is printed as ``rounds=...``.
  """
  n = W.shape[0]
  PI = numpy.pi
  
  W = W / numpy.linalg.norm(W) # default matrix norm of numpy.linalg.norm # cuSolver?
  
  # initial angles in [0, 2*PI]
  numpy.random.seed(seed)
  theta = 2*PI*numpy.random.rand(n)
  
  # snapshot of the angles and their sin/cos at the start of the round
  old = numpy.copy(theta)
  sin_t = numpy.sin(theta)
  cos_t = numpy.cos(theta)
  
  A = numpy.dot(W, cos_t)
  B = numpy.dot(W, sin_t)

  ok = True
  rounds = 0

  # repeat full sweeps while loop_fun_2 reports at least one changed angle
  while ok:
    ok = False
    rounds += 1
    nchanges = 0  # threaded through loop_fun_2 but not read afterwards

    # the first round reuses the snapshot taken above; later rounds refresh it
    if rounds > 1:
      old = numpy.copy(theta)
      sin_t = numpy.sin(theta)
      cos_t = numpy.cos(theta)

    # loop on angles
    for i in range(n):
      theta, A, B, ok, nchanges = loop_fun_2(i, theta, A, B, eps, W, n, nchanges, PI, ok, sin_t, cos_t, old)

  print("rounds="+str(rounds))

  return theta

@numba.jit
def loop_fun_2(i, theta, A, B, eps, W, n, nchanges, PI, ok, sin_t, cos_t, old):
  """Re-estimate ``theta[i]`` using per-round snapshots of the old values.

  Args:
    i: index of the angle to update.
    theta: array of angles; ``theta[i]`` is overwritten.
    A, B: running vectors, updated through ``updateAB_2``.
    eps: the angle counts as changed when its measured change exceeds this.
    W: matrix whose row ``i`` drives the A/B update.
    n: forwarded to ``updateAB_2``.
    nchanges: running count of changed angles.
    PI: the constant pi.
    ok: running "something changed" flag.
    sin_t, cos_t: sine and cosine of the angles as they were at the start of
      the round.
    old: copy of the angles as they were at the start of the round.

  Returns:
    Tuple ``(theta, A, B, ok, nchanges)`` with ``ok`` set to True and
    ``nchanges`` incremented when the angle changed by more than ``eps``.
  """
  # change i-th theta
  theta[i] = numpy.arctan(B[i]/A[i])    # within [-PI/2, PI/2]

  # shift the arctan result by PI when A[i] >= 0, or by 2*PI when
  # A[i] < 0 and B[i] > 0
  if A[i] >= 0:
    theta[i] += PI
  elif B[i] > 0:
    theta[i] += 2*PI

  # Use the snapshot-based updater: updateAB expects the old angle as an extra
  # argument and was being called here with the wrong number of arguments.
  A, B = updateAB_2(A, B, W, i, theta, n, sin_t, cos_t)

  if min(abs(old[i]-theta[i]),abs(2*PI-old[i]+theta[i])) > eps:
    ok = True
    nchanges += 1

  return theta, A, B, ok, nchanges

@numba.jit
def updateAB_2(A, B, W, i, theta, n, sin_t, cos_t):
  """Update A and B for a new ``theta[i]`` using precomputed old sin/cos.

  Adds row ``i`` of W times ``cos(theta[i]) - cos_t[i]`` to A, and the
  matching sine term with ``sin_t[i]`` to B, both in place. ``n`` is not used.

  Returns:
    Tuple ``(A, B)``.
  """
  row = W[i,:]
  delta_cos = numpy.cos(theta[i]) - cos_t[i]
  delta_sin = numpy.sin(theta[i]) - sin_t[i]
  A += numpy.multiply(row, delta_cos)
  B += numpy.multiply(row, delta_sin)
  return A, B

### GPU_CUPY

In [None]:
import cupy

@numba.jit
def CSN_CUPY(W_, eps=0.01, seed=1, same_random=True):
  """Run the angle sweep with CuPy arrays.

  Args:
    W_: host matrix with one row per angle; it is copied into a CuPy array
      and divided by ``cupy.linalg.norm``.
    eps: an angle counts as changed when its measured change exceeds this.
    seed: seed for the random initial angles.
    same_random: when true, draw the initial angles with ``numpy.random`` (so
      results can be compared with the CPU versions) and copy them to the GPU;
      otherwise draw them with ``cupy.random``.

  Returns:
    The angles as a NumPy array. The number of sweeps is printed as
    ``rounds=...``.
  """
  # NOTE(review): numba presumably cannot compile the CuPy calls below and
  # runs this function in object mode — confirm the decorator is wanted.

  n = W_.shape[0]
  PI = cupy.pi

  W = cupy.array(W_)

  W = W / cupy.linalg.norm(W)

  if same_random: ## only to check correctness
    numpy.random.seed(seed)
    theta = (2*PI)*numpy.random.rand(n)
    theta = cupy.asarray(theta)
  else:
    cupy.random.seed(seed)
    theta = (2*PI)*cupy.random.rand(n)

  # snapshot of the angles and their sin/cos at the start of the round
  old = cupy.copy(theta)
  sin_t = cupy.sin(theta)
  cos_t = cupy.cos(theta)

  A = cupy.dot(W, cos_t)
  B = cupy.dot(W, sin_t)

  #main loop
  ok = True
  rounds = 0
  while ok:
    ok = False
    rounds += 1
    nchanges = 0  # counted below but not read afterwards

    # the first round reuses the snapshot taken above; later rounds refresh it
    if rounds > 1:
      old = cupy.copy(theta)
      # use the CuPy functions here too, consistent with the initial snapshot
      sin_t = cupy.sin(theta)
      cos_t = cupy.cos(theta)

    #loop on angles
    for i in range(n):
      theta[i] = cupy.arctan(B[i]/A[i])    # within [-PI/2, PI/2]

      if A[i] >= 0:
        theta[i] += PI
      elif B[i] > 0:
        theta[i] += 2*PI

      if min(abs(old[i]-theta[i]),abs(2*PI-old[i]+theta[i])) > eps:
        ok = True
        nchanges += 1

      # add row i of W times the change in cos / sin
      A += cupy.multiply(W[i,:], cupy.cos(theta[i]) - cos_t[i])
      B += cupy.multiply(W[i,:], cupy.sin(theta[i]) - sin_t[i])

  print("rounds="+str(rounds))

  return cupy.asnumpy(theta)

### GPU pyCuda

In [None]:
from pycuda.compiler import SourceModule
import pycuda.driver as cuda
import pycuda.autoinit
import numpy
import math

# CUDA C source text holding two kernels:
#   initCurand - for every thread id below n, calls
#                curand_init(1234, tid, 0, &states[tid]);
#   genVector  - for every thread id below n, stores
#                a * curand_uniform(&states[tid]) in result[tid].
cuRand = """ 
#include <curand_kernel.h>

 extern "C" { 
  __global__ void initCurand(curandState* states, unsigned int n)
  {
      unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
      
      if (tid < n)
      {
        curand_init(1234, tid, 0, &states[tid]);
      }
    }

  __global__ void genVector(curandState* states, float *result, unsigned int n, float a)
  {
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    
    if (tid < n)
    {
      result[tid] = a * curand_uniform(&states[tid]);
    }
  }
}

"""

In [None]:
import cupy
from numba import cuda

@numba.jit
def CSN_GPU(W, eps=0.01, seed=0, same_random=False):
  """Run the angle sweep with CuPy arrays, delegating each update to ``loop_func_GPU``.

  Args:
    W: matrix with one row per angle; must already be a CuPy ndarray. It is
      divided by ``cupy.linalg.norm(W)`` before use.
    eps: threshold forwarded to ``loop_func_GPU``.
    seed: seed for the random initial angles.
    same_random: when true, draw the initial angles with ``numpy.random`` and
      copy them to the GPU; otherwise draw them with ``cupy.random``.

  Returns:
    The array of angles as built here (a CuPy array; it is not converted
    back to NumPy).
  """
  # W must be a cupy ndarray
  n = W.shape[0]
  PI = cupy.pi
  
  W = W / cupy.linalg.norm(W)

  if same_random:
    numpy.random.seed(seed)
    theta = (2*PI)*numpy.random.rand(n)
    theta = cupy.asarray(theta)
  else:
    cupy.random.seed(seed)
    theta = (2*PI)*cupy.random.rand(n)

  # A = W.cos(theta), B = W.sin(theta)
  sin_t = cupy.sin(theta)
  cos_t = cupy.cos(theta)
  A = cupy.dot(W, cos_t)
  B = cupy.dot(W, sin_t)

  TxB = 32                        # number of threads in a block
  BxG = (n + (TxB - 1)) // TxB    # number of thread blocks in the grid (ceil(n / TxB))

  # repeat full sweeps while loop_func_GPU reports at least one changed angle
  ok = True
  rounds = 0
  while ok:
    ok = False
    rounds += 1
    nchanges = 0  # threaded through loop_func_GPU but not read afterwards
    for i in range(n):
      theta, A, B, ok, nchanges = loop_func_GPU(i, theta, A, B, W, ok, nchanges, eps, PI, n, TxB, BxG)
 
  return theta

@numba.jit
def loop_func_GPU(i, theta, A, B, W, ok, nchanges, eps, PI, n, TxB, BxG):
  """Re-estimate ``theta[i]`` and apply the resulting A/B update on the GPU.

  Args:
    i: index of the angle to update.
    theta: CuPy array of angles; ``theta[i]`` is overwritten.
    A, B: CuPy running vectors, updated in place by ``dot_CSN_GPU``.
    W: CuPy matrix whose row ``i`` drives the A/B update.
    ok: running "something changed" flag.
    nchanges: running count of changed angles.
    eps: the angle counts as changed when its measured change exceeds this.
    PI: the constant pi.
    n: number of angles (length of A and B).
    TxB: threads per block for the kernel launch.
    BxG: blocks per grid for the kernel launch.

  Returns:
    Tuple ``(theta, A, B, ok, nchanges)``.
  """
  # Indexing a CuPy array with an integer yields a zero-dimensional array that
  # refers to the same element, so keep a host copy of the old value;
  # otherwise the assignment below would overwrite it as well.
  old = float(theta[i])

  theta[i] = cupy.arctan(B[i]/A[i])    # within [-PI/2, PI/2]
  if A[i] >= 0:
    theta[i] += PI
  elif B[i] > 0:
    theta[i] += 2*PI
  new = float(theta[i])

  # A += W[i,:] * dc and B += W[i,:] * ds, one array element per GPU thread
  dc = float(numpy.cos(new) - numpy.cos(old))
  ds = float(numpy.sin(new) - numpy.sin(old))
  w_row = W[i, :]
  dot_CSN_GPU[BxG, TxB](A, w_row, n, dc)  # start the kernel
  dot_CSN_GPU[BxG, TxB](B, w_row, n, ds)  # start the kernel

  if min(abs(old-new), abs(2*PI-old+new)) > eps:
    ok = True
    nchanges += 1

  return theta, A, B, ok, nchanges

@cuda.jit()
def dot_CSN_GPU(A, W, n, d):
  """CUDA kernel: ``A[j] += W[j] * d`` for every ``j < n``.

  Args:
    A: one-dimensional device array, updated in place.
    W: one-dimensional device array (a single row of the weight matrix).
    n: number of elements to update.
    d: scalar factor.

  A kernel cannot return a value, so the result is written into ``A``.
  """
  pos = cuda.grid(1)  # global index of this thread
  # the grid is rounded up to whole blocks: skip the threads past the end
  if pos < n:
    A[pos] += W[pos] * d

In [None]:
#X, l, W, n = rand_blobs(5, d=28*28, k=5, std=5, standard=False, display=False)
#W = pairwise_distances(X,X, metric='cosine')
#W_CUPY = cupy.array(W)
#CSN_GPU(W_CUPY)

# ⚠️ TESTS

## 🏎 Speed Tests

### Between Whole Functions

In [None]:
@numba.jit
def avg_time_of(f, w, name, r=10):
  """Call ``f(w)`` repeatedly and print the average time per call.

  ``f`` is run ``int(r-1)`` times in a first loop and once more afterwards;
  the summed time is divided by ``r``.

  Args:
    f: callable to time.
    w: the single argument passed to ``f``.
    name: label used in the printed message.
    r: number of runs used for the average.

  Returns:
    The value returned by the last call ``f(w)``.
  """
  total = 0
  pending = int(r-1)
  while pending > 0:
    tic()
    f(w)
    total += toc(False)
    pending -= 1

  # final run: its result is the one handed back to the caller
  tic()
  theta = f(w)
  total += toc(False)

  print( "Average time of %s: \t\t %f seconds.\n" %(name, (total/r)) )

  return theta

Warm up before the real tests

In [None]:
%%capture
# Tiny 5-sample instance; each implementation is run once on it
# (presumably so JIT compilation is not counted in the timed cell — confirm).
X, l, W, n = rand_blobs(5, d=28*28, k=5, std=5, standard=False, display=False)
# replace the Euclidean distances with cosine distances
W = pairwise_distances(X,X, metric='cosine')
trial = 1

avg_time_of(CSN_LEGACY, W, "CSN Legacy", trial)
avg_time_of(CSN_CPU, W, "CSN CPU", trial)
avg_time_of(CSN_CPU_2, W, "CSN CPU 2", trial)
#avg_time_of(CSN_CUPY, W, "CSN CUPY", trial) # too slow

#W = None
#W_CUPY = None

Real tests

In [None]:
# Timed comparison on a 10000-sample instance, averaged over `trial` runs.
X, l, W, n = rand_blobs(10000, d=28*28, k=5, std=5, standard=False, display=False)
# replace the Euclidean distances with cosine distances
W = pairwise_distances(X,X, metric='cosine')
trial = 3

avg_time_of(CSN_LEGACY, W, "CSN Legacy", trial)
avg_time_of(CSN_CPU, W, "CSN CPU", trial)
avg_time_of(CSN_CPU_2, W, "CSN CPU 2", trial)
#avg_time_of(CSN_CUPY, W, "CSN CUPY", trial) # too slow

print("")

RESULTS on Tesla T4

rounds=21
Average time of CSN Legacy: 		 8.325038 seconds.

rounds=21
Average time of CSN CPU: 		 6.899417 seconds.

rounds=21
Average time of CSN CPU 2: 		 6.752142 seconds.

rounds=21
Average time of CSN CUPY: 		 109.777244 seconds.

rounds=21
Average time of CSN CUPY 2: 		 112.692553 seconds.













### Between single functions

#### cuRand - numpy

In [None]:
%%writefile cuRand_test.py

import numpy
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import math

# CUDA C source text holding two kernels:
#   initCurand - for every thread id below n, calls
#                curand_init(1234, tid, 0, &states[tid]);
#   genVector  - for every thread id below n, stores
#                a * curand_uniform(&states[tid]) in result[tid].
cuRand = """ 
#include <curand_kernel.h>

 extern "C" { 
  __global__ void initCurand(curandState* states, unsigned int n)
  {
      unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
      
      if (tid < n)
      {
        curand_init(1234, tid, 0, &states[tid]);
      }
    }

  __global__ void genVector(curandState* states, float *result, unsigned int n, float a)
  {
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    
    if (tid < n)
    {
      result[tid] = a * curand_uniform(&states[tid]);
    }
  }
}

"""

def loadKernels():
  """Compile the ``cuRand`` source and look up its two kernels.

  Returns:
    Tuple ``(initCurand, genVector)`` of functions fetched from the module.
  """
  module = SourceModule(cuRand, keep=False, no_extern_c=True)
  return module.get_function("initCurand"), module.get_function("genVector")

def init(n, h_n, grid, block):
  """Allocate and initialize one cuRAND state per element.

  Args:
    n: number of states to allocate.
    h_n: the same count, in the form passed to the ``init_curand`` kernel.
    grid: grid dimensions for the kernel launch.
    block: block dimensions for the kernel launch.

  Returns:
    The device allocation holding the initialized states.
  """
  # The script never imports the pycuda.characterize submodule explicitly;
  # import it here so the attribute lookup below cannot fail.
  import pycuda.characterize

  # Curand States generation
  state_size = pycuda.characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>')
  states = cuda.mem_alloc(n * state_size)
  init_curand(states, h_n, grid=grid, block=block)
  return states

def generate_vector(states, n, h_n, h_pi2, grid, block):
  """Run the ``gen_vector`` kernel and copy its output to the host.

  Args:
    states: device allocation with the cuRAND states.
    n: number of values to generate.
    h_n: the same count, in the form passed to the kernel.
    h_pi2: scale factor passed to the kernel.
    grid: grid dimensions for the kernel launch.
    block: block dimensions for the kernel launch.

  Returns:
    A float32 NumPy array of length ``n`` holding the generated values.
  """
  host_out = numpy.zeros(n, dtype=numpy.float32)
  device_out = cuda.mem_alloc(host_out.nbytes)

  # fill the device buffer, then bring it back to the host
  gen_vector(states, device_out, h_n, h_pi2, grid=grid, block=block)
  cuda.memcpy_dtoh(host_out, device_out)

  return host_out

# compiled kernels; init() and generate_vector() read these module-level names
init_curand, gen_vector = loadKernels()

# vector size
n = 1000000
# threads per block
threadsperblock = 128
pi2 = numpy.pi*2

# host constants, converted to the types of the kernels' scalar parameters
h_n = numpy.uint32(n)
h_pi2 = numpy.float32(pi2)

# grid size definition: enough blocks to cover n elements (rounded up)
grid = (math.floor((n + threadsperblock - 1) / threadsperblock), 1, 1)
block = (threadsperblock,1,1)

states = init(n, h_n, grid, block)
generate_vector(states, n, h_n, h_pi2, grid, block)

In [None]:
!rm cuRand_test.nvvp
!nvprof -s -o cuRand_test.nvvp python cuRand_test.py

In [None]:
#### REMOVE <<%%writefile ...>> FROM THE PREVIOUS CELL AND ADD THIS CELL AT THE BOTTOM
#### TO COMPARE CURAND AND NUMPY

print("curand")
tic()
# generate_vector takes the cuRAND states as its first argument
theta_gpu = generate_vector(states, n, h_n, h_pi2, grid, block)
toc()

print("numpy.random.rand")
tic()
numpy.random.seed(0)
theta_cpu = numpy.pi*2*numpy.random.rand(n)
toc()

#print("theta gpu shape"+str(theta_gpu.shape))
#print("theta cpu shape"+str(theta_cpu.shape))

#### numpy - cupy

In [None]:
def np(n):
  """Benchmark workload: the set-up phase of the algorithm with NumPy.

  Builds a random n-by-n matrix, divides it by its norm, reseeds the NumPy
  generator with 0, draws n angles in [0, 2*pi] and computes the products of
  the matrix with the cosines and sines of the angles. Nothing is returned.
  """
  two_pi = 2*numpy.pi
  W = numpy.random.rand(n, n)
  W = W / numpy.linalg.norm(W)

  numpy.random.seed(0)
  theta = two_pi*numpy.random.rand(n)

  # the results are discarded: only the computation itself is being timed
  A = numpy.dot(W, numpy.cos(theta))
  B = numpy.dot(W, numpy.sin(theta))

def cp(n):
  """Benchmark workload: the set-up phase of the algorithm with CuPy.

  Builds a random n-by-n matrix on the host, copies it into a CuPy array,
  divides it by its norm, seeds the CuPy generator with 0, draws n angles and
  computes the products of the matrix with their cosines and sines.

  Returns:
    The angles converted to a NumPy array.
  """
  PI = cupy.pi
  W = cupy.array(numpy.random.rand(n, n))
  W = W / cupy.linalg.norm(W)

  cupy.random.seed(0)
  theta = 2*PI*cupy.random.rand(n)

  # the intermediate results are discarded: only the computation is timed
  old = cupy.copy(theta)
  sin_t = cupy.sin(theta)
  cos_t = cupy.cos(theta)
  A = cupy.dot(W, cos_t)
  B = cupy.dot(W, sin_t)

  return cupy.asnumpy(theta)

In [None]:
import numpy
import cupy

# small run of both versions before the timed runs
n = 20

np(n)
cp(n)

k=1000
n = 10*k # 10k

# time the NumPy version
tic()
np(n)
t0 = toc()

# time the CuPy version
tic()
cp(n)
t1 = toc()
# the printed ratio is the NumPy time divided by the CuPy time
print("numpy speed-up "+str(t0/t1))

In [None]:
!rm test.nvvp
!nvprof -s -o test.nvvp python test.py

## ✔ Correctness Tests

In [None]:
# 100-sample instance used to compare the implementations
X, l, W, n = rand_blobs(100, d=28*28, k=5, std=5, standard=False, display=False)
# replace the Euclidean distances with cosine distances
W = pairwise_distances(X,X, metric='cosine')

In [None]:
#%%capture
eps = 0.00001  # absolute tolerance for the allclose comparison of the results

t0 = CSN_LEGACY(W)
t1 = CSN_CPU(W)
t2 = CSN_CPU_2(W)
# CSN_CUPY takes the host (NumPy) matrix and copies it to the GPU itself;
# W_CUPY is never defined in this notebook.
t3 = CSN_CUPY(W)


In [None]:
# Compare each implementation's angles with the legacy result t0, using eps
# as the absolute tolerance.
print(" CSN_CPU is %s" % numpy.allclose(t0, t1, atol=eps))
print(" CSN_CPU_2 is %s" % numpy.allclose(t0, t2, atol=eps))
print(" CSN_CPU_CUPY is %s" % numpy.allclose(t0, t3, atol=eps))

# 🏞 SHOW RESULT

In [None]:
# 10-sample instance; display is left at its default, so the blobs are plotted
X, l, W, n = rand_blobs(10, d=28*28, k=5, std=5, standard=False)
# replace the Euclidean distances with cosine distances
W = pairwise_distances(X,X, metric='cosine')

In [None]:
# angles computed by the CPU implementation for the instance above
theta = CSN_CPU(W)

In [None]:
# histogram of the angles, the angles on the unit circle colored by the blob
# labels l, and the histogram drawn as a line (mode=0)
hist, bins = histogram(theta, nbins=256)
plot_circle(theta,l)
plot_hist(hist, bins, mode=0)