<a href="https://colab.research.google.com/github/bonomip/GPU/blob/master/gpu_progetto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

---
# PROGETTO CLUSTER
---

# 🎬 CUDA setup

In [None]:
!nvcc --version

In [None]:
!nvidia-smi

## [GPU Compute Capability](https://developer.nvidia.com/cuda-gpus)

## NVCC Plugin for Jupyter notebook

*Usage*:


*   Load Extension `%load_ext nvcc_plugin`
*   Mark a cell to be treated as cuda cell
`%%cuda --name example.cu --compile false`

**NOTE**: The cell must contain either code or comments to run successfully. It accepts two arguments. The first, `-n | --name`, is the name of the CUDA source or header file; it must have the extension `.cu` or `.h`. The second, `-c | --compile` (default `false`), is a flag that specifies whether the cell is compiled and run right away. This is useful when you are working on the `main` function.

*  We are now ready to run CUDA C/C++ code directly in the notebook. To do so, we must explicitly tell the interpreter to use the extension by adding `%%cu` at the beginning of each cell that contains CUDA code.




In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git


In [None]:
%load_ext nvcc_plugin

In [None]:
%pip install --target=$nb_path pycuda

In [None]:
!mkdir -p src

In [None]:
#@title Bash setup
%%writefile /root/.bashrc

# If not running interactively, don't do anything
[ -z "$PS1" ] && return

# don't put duplicate lines in the history. See bash(1) for more options
# ... or force ignoredups and ignorespace
HISTCONTROL=ignoredups:ignorespace

# append to the history file, don't overwrite it
shopt -s histappend

# for setting history length see HISTSIZE and HISTFILESIZE in bash(1)
HISTSIZE=10000
HISTFILESIZE=20000

# check the window size after each command and, if necessary,
# update the values of LINES and COLUMNS.
shopt -s checkwinsize

# make less more friendly for non-text input files, see lesspipe(1)
[ -x /usr/bin/lesspipe ] && eval "$(SHELL=/bin/sh lesspipe)"

# prompt: working directory in bold blue, followed by "$ "
PS1='\[\033[01;34m\]\w\[\033[00m\]\$ '

# enable color support of ls and also add handy aliases
if [ -x /usr/bin/dircolors ]; then
    test -r ~/.dircolors && eval "$(dircolors -b ~/.dircolors)" || eval "$(dircolors -b)"
    alias ls='ls --color=auto'
    #alias dir='dir --color=auto'
    #alias vdir='vdir --color=auto'

    alias grep='grep --color=auto'
    alias fgrep='fgrep --color=auto'
    alias egrep='egrep --color=auto'
fi

# some more ls aliases
alias ll='ls -lF'
alias la='ls -A'
alias l='ls -CF'

# path setup
# Prepends the current directory and the CUDA toolkit bin directory to PATH so
# that freshly built binaries and nvcc are found without a path prefix.
# NOTE(review): with "./" first in PATH, a file in the working directory can
# shadow a system command of the same name - confirm this is intended here.
export PATH="./:/usr/local/cuda/bin:$PATH"

## 📦 Packages

C pkgs

In [None]:
%%writefile src/common.h

#include <sys/time.h>
#include <stdio.h>

#ifndef _COMMON_H
#define _COMMON_H

// CHECK: evaluate a CUDA runtime call once and, if it did not return
// cudaSuccess, print the file, line, numeric code and cudaGetErrorString()
// text to stderr. Unlike the library macros below it does NOT call exit(),
// so execution continues after a reported error.
// The macro expands to a bare { ... } block (not do { } while (0)).
#define CHECK(call)                                                            \
{                                                                              \
    const cudaError_t error = call;                                            \
    if (error != cudaSuccess)                                                  \
    {                                                                          \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
        fprintf(stderr, "code: %d, reason: %s\n", error,                       \
                cudaGetErrorString(error));                                    \
    }                                                                          \
}

// CHECK_CUBLAS: evaluate a cuBLAS call; on any status other than
// CUBLAS_STATUS_SUCCESS print the status code with file/line and exit(1).
#define CHECK_CUBLAS(call)                                                     \
{                                                                              \
    cublasStatus_t err;                                                        \
    if ((err = (call)) != CUBLAS_STATUS_SUCCESS)                               \
    {                                                                          \
        fprintf(stderr, "Got CUBLAS error %d at %s:%d\n", err, __FILE__,       \
                __LINE__);                                                     \
        exit(1);                                                               \
    }                                                                          \
}

// CHECK_CURAND: evaluate a cuRAND call; on any status other than
// CURAND_STATUS_SUCCESS print the status code with file/line and exit(1).
#define CHECK_CURAND(call)                                                     \
{                                                                              \
    curandStatus_t err;                                                        \
    if ((err = (call)) != CURAND_STATUS_SUCCESS)                               \
    {                                                                          \
        fprintf(stderr, "Got CURAND error %d at %s:%d\n", err, __FILE__,       \
                __LINE__);                                                     \
        exit(1);                                                               \
    }                                                                          \
}

// CHECK_CUFFT: evaluate a cuFFT call; on any result other than CUFFT_SUCCESS
// print the result code with file/line and exit(1).
#define CHECK_CUFFT(call)                                                      \
{                                                                              \
    cufftResult err;                                                           \
    if ( (err = (call)) != CUFFT_SUCCESS)                                      \
    {                                                                          \
        fprintf(stderr, "Got CUFFT error %d at %s:%d\n", err, __FILE__,        \
                __LINE__);                                                     \
        exit(1);                                                               \
    }                                                                          \
}

// CHECK_CUSPARSE: evaluate a cuSPARSE call; on failure print the status code
// with file/line, additionally report a pending CUDA runtime error (queried
// with cudaGetLastError) if there is one, then exit(1).
#define CHECK_CUSPARSE(call)                                                   \
{                                                                              \
    cusparseStatus_t err;                                                      \
    if ((err = (call)) != CUSPARSE_STATUS_SUCCESS)                             \
    {                                                                          \
        fprintf(stderr, "Got error %d at %s:%d\n", err, __FILE__, __LINE__);   \
        cudaError_t cuda_err = cudaGetLastError();                             \
        if (cuda_err != cudaSuccess)                                           \
        {                                                                      \
            fprintf(stderr, "  CUDA error \"%s\" also detected\n",             \
                    cudaGetErrorString(cuda_err));                             \
        }                                                                      \
        exit(1);                                                               \
    }                                                                          \
}

// Events shared by tic()/toc(); both are recorded without a stream argument.
cudaEvent_t start, stop;

/*
 * tic: mark the beginning of a timed GPU section by recording `start`.
 *
 * The two events are created on the first call only and then reused.
 * Previously every call created a fresh pair that was never destroyed
 * (and the creation result was not checked).
 * NOTE: if the device is reset between calls the events become invalid.
 */
inline void tic(){
  static bool events_created = false;
  if (!events_created) {
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    events_created = true;
  }
  CHECK(cudaEventRecord(start));
}

/*
 * toc: record `stop`, wait for it to complete, and print the time elapsed
 * since the matching tic() in seconds. tic() must have been called before.
 */
inline void toc(){
  CHECK(cudaEventRecord(stop));
  CHECK(cudaEventSynchronize(stop));

  // Initialised so that a failed query prints 0 rather than garbage.
  float milliseconds = 0.0f;
  CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
  printf("    elapsed time: %.5f (sec)\n", milliseconds / 1000.0);
}

/*
 * seconds: return the current wall-clock time from gettimeofday() as a
 * double, in seconds with microsecond resolution.
 *
 * The timezone argument is obsolete, so NULL is passed; the unused return
 * value local of the previous version has been dropped.
 */
inline double seconds() {
    struct timeval tp;
    gettimeofday(&tp, NULL);
    return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
}

inline void device_name() {
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));
}

typedef unsigned long ulong;
typedef unsigned int uint;

#endif // _COMMON_H

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
import plotly.graph_objects as go
import plotly.express as px
import numpy
import numba
import time


def get_chunks_of_size_until_empty(size, n):
  """Split `n` items into consecutive chunks of at most `size` items.

  Args:
    size: maximum number of items per chunk (must be non-zero).
    n: total number of items to distribute.

  Returns:
    A list of chunk lengths: every entry equals `size` except possibly the
    last one, which holds the remainder. The entries sum to `n`. An empty
    list is returned when `n` is not positive.
  """
  # Imported locally: this cell does not import `math` itself, so the
  # function used to depend on a later cell having been run first.
  import math

  n_chunks = math.ceil(n / size)
  if n_chunks <= 0:
    return []

  # All chunks but the last are full; the last takes whatever is left
  # (which is exactly `size` when `n` is a multiple of `size`).
  chunks = [size] * (n_chunks - 1)
  chunks.append(int(n - size * (n_chunks - 1)))
  return chunks

def TicTocGenerator():
  """Generator yielding the time elapsed between consecutive `next()` calls.

  Uses `time.perf_counter()`, a monotonic high-resolution clock, so that the
  measured intervals are not affected by system clock adjustments.
  """
  previous = time.perf_counter()
  while True:
    current = time.perf_counter()
    yield current - previous  # seconds since the previous yield
    previous = current

# Single shared generator instance used by tic() and toc().
TicToc = TicTocGenerator()

def toc(tempBool=True):
  """Return the seconds elapsed since the last tic()/toc() call.

  Args:
    tempBool: when true, also print the elapsed time.
  """
  tempTimeInterval = next(TicToc)
  if tempBool:
    print( "Elapsed time: %f seconds." %tempTimeInterval )
  return tempTimeInterval

def tic():
  """Mark the beginning of a timed interval (a silent toc)."""
  toc(False)

# instance creation
def rand_blobs(n, d, k=3, std=1, rstate=None, standard=True, display=True, threeD=False): 
  """Generate a random blob dataset and its pairwise Euclidean distances.

  Args:
    n: number of samples.
    d: number of features per sample.
    k: number of blob centers.
    std: standard deviation of each blob.
    rstate: random state forwarded to `make_blobs`.
    standard: if true, standardise the samples with `StandardScaler`.
    display: if true, plot the samples with `plot_blobs`.
    threeD: forwarded to `plot_blobs` to request a 3D plot.

  Returns:
    (samples, labels, distance matrix, n).
  """
  points, blob_ids = make_blobs(
      n_samples=n, centers=k, n_features=d, cluster_std=std, random_state=rstate)

  if standard:
    points = StandardScaler().fit_transform(points)

  distances = euclidean_distances(points, points)

  if display:
    plot_blobs(points, blob_ids, threeD)

  return points, blob_ids, distances, n

def histogram(theta, nbins=None, verb=True):
  """Histogram of the angles `theta` over [0, 2*pi].

  Args:
    theta: sequence of angles.
    nbins: number of equally spaced bin edges (defaults to len(theta)).
      Since these are edges, the histogram has `nbins - 1` bins.
    verb: when true, print the data size and `nbins`.

  Returns:
    (counts, edges) as returned by `numpy.histogram`.
  """
  n_samples = len(theta)
  if nbins is None:
    nbins = n_samples

  edges = numpy.linspace(0, 2*numpy.pi, nbins)
  counts, edges_out = numpy.histogram(theta, edges)

  if verb:
    print('Data size : %d' %n_samples)
    print('Num bins  : %d' %nbins)

  return counts, edges_out

def plot_circle(theta, l=None, radius=500):
  """
    Plot the angles `theta` as points on the unit circle.

    theta  : array of angles; each is drawn at (cos(theta), sin(theta)).
    l      : optional array of labels; when given, one scatter trace is
             added per unique label, otherwise all points share one trace.
    radius : width and height of the figure passed to update_layout.
  """

  x = numpy.cos(theta)
  y = numpy.sin(theta)

  fig = go.Figure()
  # reference unit circle
  fig.add_shape(type="circle", xref="x", yref="y", x0=-1, y0=-1, x1=1, y1=1, line=dict(color="black", width=1))
  
  if l is None:
    fig.add_trace(go.Scatter(x=x, y=y,
          mode='markers',
          marker_symbol='circle',
          marker_size=10))
  else:
    ul = numpy.unique(l)
    # cols[c] == c: the color value of a trace is the index of its label
    cols = list(range(len(ul)))
    for c,u in zip(cols,ul):
      idx = numpy.where(u == l)
      fig.add_trace(go.Scatter(x=x[idx], y=y[idx],
          mode='markers',
          marker_symbol='circle',
          marker_color=cols[c], 
          marker_line_color=cols[c],
          marker_line_width=0, 
          marker_size=10))
  
  # axis range slightly larger than the circle so markers are not clipped
  M = 1.05
  fig.update_xaxes(title='', range=[-M, M])
  fig.update_yaxes(title='', range=[-M, M])
  fig.update_layout(title='clusters', width=radius, height=radius)
  fig.show()

def plot_hist(hist, bins, mode=0, smooth_wlen=None):
  """Plot `hist` against `bins` as a plotly scatter trace.

  Args:
    hist: values on the y axis.
    bins: values on the x axis.
    mode: 0 -> lines, 1 -> markers, anything else -> lines and markers.
    smooth_wlen: when not None, `hist` is first passed through `smooth`
      with this window length and a 'hanning' window.
  """
  mode_line = 'lines' if mode == 0 else ('markers' if mode == 1 else 'lines+markers')

  if smooth_wlen is not None:
    hist = smooth(hist, window_len=smooth_wlen, window='hanning')

  trace = go.Scatter(x=bins, y=hist, mode=mode_line)
  figh = go.Figure(data=trace)
  figh.show()

def plot_blobs(X, labels=None, threeD=False, doPCA=True, sizex=1):
  """Scatter-plot the samples in `X` in 2D or 3D.

  Args:
    X: 2-D array of samples (one row per sample).
    labels: optional per-sample values used to color the points.
    threeD: plot three components instead of two.
    doPCA: if true, project `X` with PCA onto the plotted components;
      otherwise the first 2 (or 3) columns of `X` are plotted as they are.
    sizex: scale factor applied to the 800x800 figure size.
  """
  n_dims = 3 if threeD else 2

  # Fix: the 3D branch used to test `if PCA:` (the class, always truthy),
  # so doPCA=False was silently ignored there.
  if doPCA:
    components = PCA(n_components=n_dims).fit_transform(X)
  else:
    components = X[:, 0:n_dims]

  axis_labels = {str(j): 'PC %d' % (j + 1) for j in range(n_dims)}
  # `color` is only passed when labels were provided.
  color_args = {} if labels is None else {'color': labels}

  if threeD:
    fig = px.scatter_3d(components, x=0, y=1, z=2, title='Blobs 3D',
                        labels=axis_labels, **color_args)
  else:
    fig = px.scatter(components, x=0, y=1, title='Blobs 2D',
                     labels=axis_labels, **color_args)

  # Note: this layout update replaces the title set above.
  fig.update_layout(
    width = 800*sizex,
    height = 800*sizex,
    title = "fixed-ratio axes")
  # Lock the y scale to the x scale so distances are not distorted.
  fig.update_yaxes(
    scaleanchor = "x",
    scaleratio = 1)
  fig.show()


def kmeans(X, k, niter=300, rstate=0, n_init=10):
  """Cluster `X` into `k` groups with scikit-learn's KMeans.

  Args:
    X: samples to cluster.
    k: number of clusters.
    niter: maximum number of iterations per run.
    rstate: random state.
    n_init: number of runs with different centroid seeds.

  Returns:
    (labels, number of iterations run, inertia) of the fitted model.
  """
  # Imported here because the notebook's import cell does not bring KMeans
  # into scope; without it the call raised NameError.
  from sklearn.cluster import KMeans

  # Named `model` so it no longer shadows this function's own name.
  model = KMeans(n_clusters=k, n_init=n_init, max_iter=niter, random_state=rstate)
  model.fit(X)

  return model.labels_, model.n_iter_, model.inertia_

# 🪣 CSN Algorithms

## 📟 Legacy Code

### CPU


In [None]:
@numba.jit
def CSN_LEGACY(W, eps=0.01, theta0=None, normalize=True, seed=1):
  """Reference CPU implementation of the iterative angle update.

  Starting from random (or given) angles, every angle theta[i] is recomputed
  in turn from A[i] = (W @ cos(theta))[i] and B[i] = (W @ sin(theta))[i];
  full sweeps over all angles are repeated until a sweep changes no angle
  by more than `eps`.

  Args:
    W: square matrix (n x n).
    eps: an angle counts as changed when it moves by more than this.
    theta0: optional initial angles. NOTE: the array is not copied, so it
      is updated in place and is the same object that is returned.
    normalize: divide W by numpy.linalg.norm(W) before iterating.
    seed: seed for numpy.random when theta0 is None.

  Returns:
    The array of n angles after the last sweep. The number of sweeps is
    printed.
  """

  # general vars
  PI = numpy.pi #numpy.pi
  n = W.shape[0]

  # param check
  if normalize:
    W = W / numpy.linalg.norm(W) #linear algebra norm
  if theta0 is None: # if theta is not defined
    numpy.random.seed(seed)
    theta = 2*PI*numpy.random.rand(n)  # init. values in [0, 2*PI] # cudarandom
  else:
    theta = theta0

  # preliminar computations 
  sin_t = numpy.sin(theta)
  cos_t = numpy.cos(theta)
  A = numpy.dot(W, cos_t)
  B = numpy.dot(W, sin_t)

  # main loop: one iteration = one sweep over all angles
  ok = True
  rounds = 0
  while ok:
    ok = False
    rounds += 1
    nchanges = 0  # counted but not used afterwards
    # loop on angles
    for i in range(n):
      old = theta[i]

      # change i-th theta
      # NOTE(review): divides by A[i]; behaviour for A[i] == 0 is not handled here
      theta[i] = numpy.arctan(B[i]/A[i])    # within [-PI/2, PI/2]

      # shift the arctan result depending on the signs of A[i] and B[i]
      if A[i] >= 0:
        theta[i] += PI
      elif B[i] > 0:
        theta[i] += 2*PI

      # update Ak & Bk by elementwise product and diff
      # (incremental update: only row i of W and the change of theta[i] matter)
      A += numpy.multiply(W[i,:], numpy.repeat(numpy.cos(theta[i]) - numpy.cos(old), n))
      B += numpy.multiply(W[i,:], numpy.repeat(numpy.sin(theta[i]) - numpy.sin(old), n)) 

      # keep iterating if this angle moved by more than eps
      # NOTE(review): the second term is 2*PI - old + theta[i]; presumably meant
      # as a wrap-around distance - confirm it is correct when theta[i] > old
      if min(abs(old-theta[i]),abs(2*PI-old+theta[i])) > eps:
        ok = True
        nchanges += 1

  print("rounds="+str(rounds))

  return theta

### GPU


In [None]:
def CSN1_legacy(W, eps=0.01, theta0=None, verb=0):
  """Legacy variant of the angle iteration that delegates the A/B update
  to the `dot_GPU` kernel.

  Args:
    W: square matrix (n x n); it is divided by its norm and cast to float32.
    eps: an angle counts as changed when it moves by more than this.
    theta0: optional initial angles (used without copying, so updated in
      place); random in [0, 2*PI] when None.
    verb: when > 0, print the size and the number of rounds.

  Returns:
    The array of angles after the last sweep.

  NOTE(review): `dot_GPU` is subscripted as `dot_GPU[BxG, TxB]`, but its
  `@cuda.jit` decorator is commented out in this notebook, so as written this
  call would fail on a plain Python function - confirm before using.
  """

  n = W.shape[0]
  W = (W / numpy.linalg.norm(W)).astype('float32')
  PI = numpy.pi
  if theta0 is None:
    theta = 2*PI*numpy.random.rand(n)  # init. values in [0, 2*PI]
  else:
    theta = theta0

  TxB = 32      # number of threads in a block
  BxG = (n + (TxB - 1)) // TxB    # number of thread blocks in the grid
 
  sin_t = numpy.sin(theta)
  cos_t = numpy.cos(theta)
  A = numpy.dot(W, cos_t).astype('float32')
  B = numpy.dot(W, sin_t).astype('float32')

  # main loop: one iteration = one sweep over all angles
  ok = True
  rounds = 0
  while ok:
    ok = False
    rounds += 1
    nchanges = 0  # counted but not used afterwards
    # loop on angles
    for i in range(n):
      old = theta[i]

      # change i-th theta
      theta[i] = numpy.arctan(B[i]/A[i])    # within [-PI/2, PI/2]
      if A[i] >= 0:
        theta[i] += PI
      elif B[i] > 0:
        theta[i] += 2*PI

      # update Ak & Bk using GPU
      dc = numpy.cos(theta[i]) - numpy.cos(old)
      ds = numpy.sin(theta[i]) - numpy.sin(old)
      #func[BxG, TxB](A, B, W, dc, ds, i)
      # NOTE(review): the row index i is not passed to the kernel - confirm
      dot_GPU[BxG, TxB](A, W, dc.astype('float32'), n)  # start the kernel
      dot_GPU[BxG, TxB](B, W, ds.astype('float32'), n)  # start the kernel

      # keep iterating if this angle moved by more than eps
      if min(abs(old-theta[i]),abs(2*PI-old+theta[i])) > eps:
        ok = True
        nchanges += 1

  if verb > 0:
    print('  Size : %d' %n)
    print('Rounds : %d' %rounds)
 
  return theta

#@cuda.jit
def dot_GPU(A, W, d, n):
  """Legacy draft of a GPU kernel for the A/B update (decorator disabled).

  NOTE(review): this looks unfinished - confirm before reviving it:
    * `cuda` here presumably refers to numba.cuda, which the visible cells
      do not import under that name;
    * a whole row slice `W[..., :]` is stored into a single shared-memory slot;
    * the inner loop accumulates A[tx] * d and never reads `sW`;
    * the function returns `res` instead of writing to an output array.
  """
  tx = cuda.threadIdx.x  # Thread id in a 1D block
  bw = cuda.blockDim.x   # Block width

  # shared memory
  sW = cuda.shared.array(bw, dtype=numpy.float32)

  # index inside the array
  pos = cuda.grid(1)
  if pos >= W.shape[0]:
    return

  gw = cuda.gridDim.x     # num blocks per grid

  # The dot product is chunked into dot products of TPB-long vectors.
  res = 0.
  for i in range(gw):
    # Preload data into shared memory
    sW[tx] = W[tx + i * bw,:]
    # Wait until all threads finish preloading
    cuda.syncthreads()
    # Computes partial product on the shared memory
    for j in range(bw):
        res += A[tx] * d
    # Wait until all threads finish computing
    cuda.syncthreads()

  return res

## 🪜 My Solutions


### CUDA C

In [None]:
%%writefile src/structs.h

#ifndef STRUCTS
#define STRUCTS

// Bundle of the three float vectors produced together by the theta module:
// the angles and their sine and cosine.
// theta_sin_cos() in thetaModule.cu fills these with cudaMalloc'ed pointers.
struct gtv { //gpu theta values
    float *theta, *sin_theta, *cos_theta;
};

#endif

#### Theta, sin, cos

In [None]:
%%writefile src/thetaModule.cu

#include <stdio.h>
#include <stdlib.h>
#include "../src/common.h"
#include "../src/structs.h"
#include <cuda.h>
#include <curand.h>

#define N 10000
#define ALPHA 2*M_PI
#define BLOCK_SIZE 128

/*
 * Kernel_1: one thread per element. Scales theta[idx] by ALPHA in place and
 * stores the cosine and sine of the scaled value in cosTheta / sinTheta.
 * All three arrays must hold at least N floats.
 */
__global__ void Kernel_1(float *theta, float* cosTheta, float* sinTheta)
{
  const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

  // Threads past the end of the vectors have nothing to do.
  if (idx >= N)
    return;

  const float angle = ALPHA * theta[idx];
  theta[idx] = angle;
  cosTheta[idx] = cos(angle);
  sinTheta[idx] = sin(angle);
}

/*
 * theta_sin_cos: build the initial angle vector on the device.
 *
 * Fills a device vector of N floats with uniform random numbers (cuRAND,
 * fixed seed 1234), then runs Kernel_1 to scale them by ALPHA and compute
 * their cosine and sine.
 *
 * Returns a gtv holding the three device pointers (theta, sin, cos). The
 * caller owns them and must release each with cudaFree().
 * The kernel is launched asynchronously; synchronise before using the data
 * from the host.
 */
gtv theta_sin_cos()
{
  curandGenerator_t gen;
  float *dTheta, *dSinTheta, *dCosTheta;

  /* Allocate the three N-float vectors on the device */
  CHECK(cudaMalloc((void **) &dTheta, N * sizeof(float)));
  CHECK(cudaMalloc((void **) &dCosTheta, N * sizeof(float)));
  CHECK(cudaMalloc((void **) &dSinTheta, N * sizeof(float)));

  /* Create the pseudo-random generator, seed it and fill dTheta */
  CHECK_CURAND(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
  CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
  CHECK_CURAND(curandGenerateUniform(gen, dTheta, N));

  /* One thread per element, rounded up to whole blocks */
  dim3 block(BLOCK_SIZE, 1, 1);
  dim3 grid((N + block.x - 1) / block.x, 1, 1);

  printf("Starting Kernel 1 -- N: %d, GRID_SIZE: %u, BLOCK_SIZE: %d\n", N, grid.x, BLOCK_SIZE);

  /* Fix: the execution configuration is <<<grid, block>>>; the two were
     swapped, which launched BLOCK_SIZE blocks of grid.x threads each. */
  Kernel_1<<<grid, block>>>(dTheta, dCosTheta, dSinTheta);
  /* Report an invalid launch configuration immediately */
  CHECK(cudaGetLastError());

  CHECK_CURAND(curandDestroyGenerator(gen));

  struct gtv s;
  s.theta = dTheta;
  s.sin_theta = dSinTheta;
  s.cos_theta = dCosTheta;

  return s;
}

In [None]:
%%writefile src/matrixModule.cu

#include <stdio.h>
#include <stdlib.h>
#include "../src/common.h"
#include <cuda.h>
#include "cublas_v2.h"

#define N 10000  
#define BLOCK_SIZE 128

void matrix_norm(float *matrix)
{
  float *d_matrix;
  float norm;
  int n = N*N;
 
  dim3 block(BLOCK_SIZE, 1, 1);
  dim3 grid((N*N + block.x - 1) / block.x, 1, 1);

  cublasHandle_t handle;
  CHECK_CUBLAS(cublasCreate(&handle));

  CHECK(cudaMalloc(&d_matrix, N*N*sizeof(float)));
  CHECK_CUBLAS(cublasSetVector(N*N, sizeof(float), matrix, 1, d_matrix, 1));
  CHECK_CUBLAS(cublasSnrm2(handle, n, d_matrix, 1, &norm));
  norm = 1/norm;
  //multi matrix per norm.

}

In [None]:
%%writefile src/cudaMain.cu

#include <stdio.h>
#include <stdlib.h>
#include "../src/matrixModule.cu"
#include "../src/thetaModule.cu"
#include "../src/structs.h"

#define N 10000

/*
* MAIN
*/

int main()
{
  //allocate matrix on heap
  float *matrix;
  matrix = (float *) malloc(sizeof(float) * N*N);
  for(long i=0;i<N*N;i++)
    matrix[i]=rand()%100;
 
  struct gtv s;
  s = theta_sin_cos();

  matrix_norm(matrix);

  CHECK(cudaDeviceSynchronize());
}

In [None]:
!nvcc -o /content/main /content/src/cudaMain.cu -lcurand -lcublas
!./main

N: 10000

GRID_SIZE: 79, BLOCK_SIZE: 128, 0.01246 (sec)

GRID_SIZE: 40, BLOCK_SIZE: 256, 0.01200 (sec)

GRID_SIZE: 157, BLOCK_SIZE: 64, 0.01234 (sec) 

#### Matrix Normalization

### CPU

In [None]:
import numpy

@numba.jit
def CSN_CPU(W, eps=0.01, seed=1):
  """CPU version of the angle iteration, split into helper functions.

  W is divided by its norm, n random angles in [0, 2*PI) are drawn with the
  given seed, and `loop_fun` is applied to every angle in turn; sweeps are
  repeated until a sweep changes no angle by more than `eps`.

  Args:
    W: square matrix (n x n).
    eps: an angle counts as changed when it moves by more than this.
    seed: seed for numpy.random.

  Returns:
    The array of n angles after the last sweep. The number of sweeps is
    printed.
  """
  n = W.shape[0]
  PI = numpy.pi
  
  # The Frobenius norm: W is viewed as one long vector and its
  # norm-2 (Euclidean norm) is computed.
  W = W / numpy.linalg.norm(W) # Frobenius_norm(W)
  
  numpy.random.seed(seed)
  theta = 2*PI*numpy.random.rand(n)
  
  sin_t = numpy.sin(theta)
  cos_t = numpy.cos(theta)
  
  # A and B are kept up to date incrementally by loop_fun
  A = numpy.dot(W, cos_t) # W @ cos_t
  B = numpy.dot(W, sin_t) # W @ sin_t

  ok = True
  rounds = 0

  # one iteration = one sweep over all angles
  while ok:
    ok = False
    rounds += 1
    nchanges = 0  # counted but not used afterwards

    # loop on angles
    for i in range(n):
      theta, A, B, ok, nchanges = loop_fun(i, theta, A, B, eps, W, n, nchanges, PI, ok)

  print("rounds="+str(rounds))

  return theta

@numba.jit
def loop_fun(i, theta, A, B, eps, W, n, nchanges, PI, ok):
  """Update the i-th angle and propagate the change to A and B.

  theta, A and B are modified in place (and also returned).

  Returns:
    (theta, A, B, ok, nchanges), where `ok` is set to True and `nchanges`
    is incremented when theta[i] moved by more than `eps`; otherwise both
    are returned as passed in.
  """
  old = theta[i]

  # change i-th theta
  theta[i] = numpy.arctan(B[i]/A[i])    # within [-PI/2, PI/2]
  
  # shift the arctan result depending on the signs of A[i] and B[i]
  if A[i] >= 0:
    theta[i] += PI
  elif B[i] > 0:
    theta[i] += 2*PI

  # update Ak & Bk by elementwise product and diff
  A, B = updateAB(A, B, W, i, theta, old, n)

  # flag the sweep as "changed" if this angle moved by more than eps
  if min(abs(old-theta[i]),abs(2*PI-old+theta[i])) > eps:
    ok = True
    nchanges += 1

  return theta, A, B, ok, nchanges

@numba.jit
def updateAB(A, B, W, i, theta, old, n):
  """Add to A and B (in place) the effect of theta[i] changing from `old`.

  A receives W[i,:] * (cos(theta[i]) - cos(old)) and B the same with sin.
  `n` is accepted but not used. Returns (A, B).
  """
  new_angle = theta[i]
  row = W[i,:]

  delta_cos = numpy.cos(new_angle) - numpy.cos(old)
  delta_sin = numpy.sin(new_angle) - numpy.sin(old)

  A += numpy.multiply(row, delta_cos)
  B += numpy.multiply(row, delta_sin)
  return A, B


In [None]:
import numpy

@numba.jit
def CSN_CPU_2(W, eps=0.01, seed=1):
  """Variant of CSN_CPU that snapshots theta, sin(theta) and cos(theta)
  once per sweep instead of recomputing the old sine/cosine per angle.

  Args:
    W: square matrix (n x n).
    eps: an angle counts as changed when it moves by more than this.
    seed: seed for numpy.random.

  Returns:
    The array of n angles after the last sweep. The number of sweeps is
    printed.
  """
  n = W.shape[0]
  PI = numpy.pi
  
  W = W / numpy.linalg.norm(W) # Frobenius_norm(W) # cuSolver?
  
  numpy.random.seed(seed)
  theta = 2*PI*numpy.random.rand(n)
  
  # snapshots of the state at the beginning of the sweep
  old = numpy.copy(theta)
  sin_t = numpy.sin(theta)
  cos_t = numpy.cos(theta)
  
  A = numpy.dot(W, cos_t)
  B = numpy.dot(W, sin_t)

  ok = True
  rounds = 0

  # one iteration = one sweep over all angles
  while ok:
    ok = False
    rounds += 1
    nchanges = 0  # counted but not used afterwards

    # the first sweep reuses the snapshots taken above
    if rounds > 1:
      old = numpy.copy(theta)
      sin_t = numpy.sin(theta)
      cos_t = numpy.cos(theta)

    # loop on angles
    for i in range(n):
      theta, A, B, ok, nchanges = loop_fun_2(i, theta, A, B, eps, W, n, nchanges, PI, ok, sin_t, cos_t, old)

  print("rounds="+str(rounds))

  return theta

@numba.jit
def loop_fun_2(i, theta, A, B, eps, W, n, nchanges, PI, ok, sin_t, cos_t, old):
  """Update the i-th angle using the per-sweep snapshots.

  Args:
    i: index of the angle to update.
    theta, A, B: current state; modified in place and returned.
    eps: an angle counts as changed when it moves by more than this.
    W: normalised matrix.
    n: number of angles.
    nchanges, ok: running change counter / flag for the current sweep.
    PI: the constant pi.
    sin_t, cos_t, old: sine, cosine and values of theta taken at the
      beginning of the sweep.

  Returns:
    (theta, A, B, ok, nchanges).
  """
  # change i-th theta
  theta[i] = numpy.arctan(B[i]/A[i])    # within [-PI/2, PI/2]

  # shift the arctan result depending on the signs of A[i] and B[i]
  if A[i] >= 0:
    theta[i] += PI
  elif B[i] > 0:
    theta[i] += 2*PI

  # update Ak & Bk by elementwise product and diff
  # Fix: this called updateAB(...) with six arguments (it takes seven and
  # expects the old angle); the snapshot-based helper is updateAB_2.
  A, B = updateAB_2(A, B, W, i, theta, n, sin_t, cos_t)

  # flag the sweep as "changed" if this angle moved by more than eps
  if min(abs(old[i]-theta[i]),abs(2*PI-old[i]+theta[i])) > eps:
    ok = True
    nchanges += 1

  return theta, A, B, ok, nchanges

@numba.jit
def updateAB_2(A, B, W, i, theta, n, sin_t, cos_t):
  """Add to A and B (in place) the effect of the i-th angle changing.

  The previous cosine/sine of the angle are read from the snapshots
  cos_t[i] / sin_t[i]. A, B and W[i,:] are vectors of length n.
  `n` itself is accepted but not used. Returns (A, B).
  """
  row = W[i,:]
  angle = theta[i]

  delta_cos = numpy.cos(angle) - cos_t[i]
  delta_sin = numpy.sin(angle) - sin_t[i]

  A += numpy.multiply(row, delta_cos)
  B += numpy.multiply(row, delta_sin)
  return A, B

### GPU_CUPY

In [None]:
import cupy

def CSN_CUPY(W_, eps=0.01, seed=1, same_random=True):
  """CuPy (GPU array) version of the angle iteration.

  Args:
    W_: square matrix (n x n); copied into a CuPy array.
    eps: an angle counts as changed when it moves by more than this.
    seed: random seed.
    same_random: if true, draw the initial angles with numpy (same values
      as the CPU versions, to check correctness); otherwise draw them
      with cupy.

  Returns:
    The final angles as a numpy array. The number of sweeps is printed.

  Changes from the previous version:
    * the per-sweep snapshots are computed with cupy.sin / cupy.cos,
      consistently with the rest of the function (numpy functions were
      applied to CuPy arrays);
    * the @numba.jit decorator was removed: numba cannot compile CuPy
      calls, so it could only fall back to plain Python or fail.
  """
  n = W_.shape[0]
  PI = cupy.pi

  W = cupy.array(W_)
  W = W / cupy.linalg.norm(W)

  if same_random: ## only to check correctness
    numpy.random.seed(seed)
    theta = (2*PI)*numpy.random.rand(n)
    theta = cupy.asarray(theta)
  else:
    cupy.random.seed(seed)
    theta = (2*PI)*cupy.random.rand(n)

  # snapshots of the state at the beginning of the sweep
  old = cupy.copy(theta)
  sin_t = cupy.sin(theta)
  cos_t = cupy.cos(theta)

  A = cupy.dot(W, cos_t)
  B = cupy.dot(W, sin_t)

  #main loop: one iteration = one sweep over all angles
  ok = True
  rounds = 0
  while ok:
    ok = False
    rounds += 1
    nchanges = 0

    # the first sweep reuses the snapshots taken above
    if rounds > 1:
      old = cupy.copy(theta)
      sin_t = cupy.sin(theta)
      cos_t = cupy.cos(theta)

    #loop on angles
    for i in range(n):
      theta[i] = cupy.arctan(B[i]/A[i])    # within [-PI/2, PI/2]

      # shift the arctan result depending on the signs of A[i] and B[i]
      if A[i] >= 0:
        theta[i] += PI
      elif B[i] > 0:
        theta[i] += 2*PI

      # flag the sweep as "changed" if this angle moved by more than eps
      if min(abs(old[i]-theta[i]),abs(2*PI-old[i]+theta[i])) > eps:
        ok = True
        nchanges += 1

      # incremental update of A and B with row i of W
      A += cupy.multiply(W[i,:], cupy.cos(theta[i]) - cos_t[i])
      B += cupy.multiply(W[i,:], cupy.sin(theta[i]) - sin_t[i])

  print("rounds="+str(rounds))

  return cupy.asnumpy(theta)

### 🐍 GPU pyCuda

In [None]:
from pycuda.compiler import SourceModule
from numba.cuda.api import stream
import pycuda.driver as cuda
import pycuda.autoinit
import numpy
import math

In [None]:
def block_grid_mem_chunks_setup(n, n_streams, txb, rf=1, padding=False):
  """Compute the launch configuration and per-stream chunk sizes.

  Args:
    n: total number of elements; must be divisible by `n_streams`.
    n_streams: number of streams the work is split over.
    txb: threads per block.
    rf: number of elements handled by each thread.
    padding: if true, `n` is first rounded up to the full capacity of the
      launch (threads * streams * rf) and the amount added is returned too.

  Returns:
    (block, grid, mem_chunks), plus a fourth item `padding` (number of
    elements added) when padding was requested.
  """
  per_stream = n/n_streams

  # n must split evenly across the streams
  assert(math.floor(per_stream)==(per_stream))
  assert(per_stream*n_streams == n)

  n_blocks = math.ceil( per_stream / txb / rf )
  grid = ( n_blocks, 1, 1 )
  block = ( txb, 1, 1 )
  threads_per_stream = n_blocks*txb

  # elements that all streams together can process
  capacity = threads_per_stream*n_streams*rf
  assert(capacity >= n)

  if(padding):
    pad = capacity - n
    # recompute for the padded size and append the padding amount
    return block_grid_mem_chunks_setup(capacity, n_streams, txb, rf)+(pad,)

  mem_chunks = get_chunks_of_size_until_empty(threads_per_stream*rf, n)

  assert ( len(mem_chunks) == n_streams)
  total = numpy.sum(mem_chunks)
  assert ( total >= n), ""+str(total)

  return block, grid, mem_chunks

#### Theta Module

In [None]:
# CUDA source (compiled at run time by load_theta_module) with two kernels:
#   initCurand(states, n, seed): thread tid < n initialises states[tid] with
#       curand_init(seed, tid, 0, ...).
#   genVector(states, theta, cos_theta, sin_theta, n, a): thread tid < n
#       draws a * curand_uniform(&states[tid]) and stores the value together
#       with its cosine and sine.
theta_module = """ 
#include <curand_kernel.h>

 extern "C" { 
  __global__ void initCurand(curandState* states, unsigned int n, unsigned int seed)
  {
      unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
      
      if (tid < n)
      {
        curand_init(seed, tid, 0, &states[tid]);
      }
    }

  __global__ void genVector( curandState* states, float *theta, float* cos_theta,
                   float* sin_theta, unsigned int n, float a)
  {
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    
    if (tid < n)
    {
      float x = a * curand_uniform(&states[tid]);
      theta[tid] = x;
      cos_theta[tid] = cos(x);
      sin_theta[tid] = sin(x);
    }
  }
}

"""

def load_theta_module():
  """Compile `theta_module` and return its kernels (initCurand, genVector)."""
  compiled = SourceModule(theta_module, keep=False, no_extern_c=True)
  return compiled.get_function("initCurand"), compiled.get_function("genVector")

def launch_theta_module(init_curand, gen_vector, n, n_streams=64, conv=False):
  """Generate n random angles, with their cosine and sine, on the GPU.

  The work is split into `n_streams` chunks, each processed on its own
  CUDA stream with its own cuRAND states (seeded with the stream index).

  Args:
    init_curand: the `initCurand` kernel (see load_theta_module).
    gen_vector: the `genVector` kernel (see load_theta_module).
    n: number of angles; must be divisible by `n_streams`.
    n_streams: number of streams / chunks.
    conv: if true, copy the results back and return host arrays.

  Returns:
    conv=True : (theta, cos_theta, sin_theta) as float32 numpy arrays.
    conv=False: (d_theta, d_cos_theta, d_sin_theta, mem_chunks), where the
      first three are lists of per-stream device allocations and
      `mem_chunks` gives the number of elements in each.
  """
  # Make sure the submodule used below is actually loaded.
  import pycuda.characterize

  # set up streams, chunks size, block size, grid size
  float_size = numpy.dtype(numpy.float32).itemsize
  txb = 128
  block, grid, mem_chunks = block_grid_mem_chunks_setup(n, n_streams, txb)

  # Size of one cuRAND state; it is the same for every stream, so it is
  # queried once instead of once per loop iteration.
  state_size = pycuda.characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>')

  # Define HOST streams and curand states
  streams = [None] * n_streams
  states = [None] * n_streams
  # Define DEVICE Algorithms variables
  d_theta = [None] * n_streams
  d_cos_theta = [None] * n_streams
  d_sin_theta = [None] * n_streams

  if conv:
    # Define HOST Algorithms variables (one buffer per chunk)
    theta = [numpy.empty(size, dtype=numpy.float32) for size in mem_chunks]
    cos_theta = [numpy.empty(size, dtype=numpy.float32) for size in mem_chunks]
    sin_theta = [numpy.empty(size, dtype=numpy.float32) for size in mem_chunks]

  # launch Init Curand
  for i in range(0, n_streams):
    streams[i] = cuda.Stream()
    states[i] = cuda.mem_alloc(mem_chunks[i] * state_size)
    init_curand(states[i], numpy.uint32(mem_chunks[i]), numpy.uint32(i), grid=grid, block=block, stream=streams[i])

  for i in range(0, n_streams):
    #Allocate DEVICE Algorithms variables
    d_theta[i] = cuda.mem_alloc(mem_chunks[i]*float_size)
    d_cos_theta[i] = cuda.mem_alloc(mem_chunks[i]*float_size)
    d_sin_theta[i] = cuda.mem_alloc(mem_chunks[i]*float_size)

    # Launch kernel
    gen_vector(
          states[i], d_theta[i], d_cos_theta[i], d_sin_theta[i],
          numpy.uint32(mem_chunks[i]), numpy.float32(numpy.pi*2),
          grid=grid, block=block, stream=streams[i])

    if conv:
      # Copy data back to HOST
      cuda.memcpy_dtoh_async(theta[i], d_theta[i], streams[i])
      cuda.memcpy_dtoh_async(cos_theta[i], d_cos_theta[i], streams[i])
      cuda.memcpy_dtoh_async(sin_theta[i], d_sin_theta[i], streams[i])

  if conv:
    cuda.Context.synchronize()

    # Join the per-stream chunks. The chunks can have different lengths
    # (the last one holds the remainder), so they are concatenated directly:
    # wrapping them in numpy.array() first would build a ragged array, which
    # recent NumPy versions reject.
    return (numpy.concatenate(theta),
            numpy.concatenate(cos_theta),
            numpy.concatenate(sin_theta))

  return d_theta, d_cos_theta, d_sin_theta, mem_chunks

In [None]:
# Time the theta module for one million angles over 64 streams.
n = 1000*1000
n_streams = 64
init, gen = load_theta_module()
# 1) including the device-to-host copies (returns three host arrays)
tic()
t, c, s = launch_theta_module(init, gen, n, n_streams, conv=True)
toc()
# 2) leaving the data on the device (returns device buffers + chunk sizes)
tic()
t, c, s, m = launch_theta_module(init, gen, n, n_streams, conv=False)
toc()


n size = 1.000.000 

THETA TxB=128 n_streams=10 = 0.079370 seconds. (with DtoH)

THETA, COS, SIN: 0.055484 seconds. (with DtoH)
TxB=128 n_streams=64

#### Matrix Normalization Module

The Frobenius norm of a matrix $M$ is the Euclidean norm (2-norm) of the vector $v$ obtained by flattening $M$ in column-major order (the flattening order does not affect the result).
In other words, every entry of $M$ is squared, the squares are summed together, and the square root of the sum is taken: $\|M\|_F = \sqrt{\sum_{i,j} M_{ij}^2}$.

In [None]:
# CUDA source (compiled at run time by load_matrix_module) with two block-level
# reduction kernels that use dynamically sized shared memory:
#   squaredReduction(W, V, n): each thread squares and adds two elements of W
#       that are blockDim.x apart, the block reduces these partial sums in
#       shared memory, and thread 0 writes the block total to V[blockIdx.x].
#   reduce(W, V, n): each thread loads W[i] + V[i+blockDim.x], the block
#       reduces in shared memory, and thread 0 writes the total to
#       W[blockIdx.x].
# NOTE(review): in both kernels the whole body, including __syncthreads(), sits
# inside `if (i+blockDim.x < n)`, so a thread with i < n but i+blockDim.x >= n
# contributes nothing. This appears to rely on n being a multiple of
# 2*blockDim.x - confirm.
# NOTE(review): `reduce` reads W at i but V at i+blockDim.x - confirm that this
# indexing is intended.
matrix_module = """
extern "C"
{ 
  __global__ void squaredReduction(float* W, float* V, unsigned long n)
  {
    extern __shared__ float sV[];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;

    if( i+blockDim.x < n )
    {
      sV[tid] = pow(W[i],2)+pow(W[i+blockDim.x],2);

      __syncthreads();

      for (unsigned int s=blockDim.x/2; s>0; s>>=1)
      {
        if (tid < s)
        {
          sV[tid] += sV[tid + s];
        }
        __syncthreads();
      }
      if (tid == 0) V[blockIdx.x] = sV[0];
    } 
  }

  __global__ void reduce(float* W, float* V, unsigned int n)
  {
    extern __shared__ float sW[];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;

    if( i+blockDim.x < n )
    {
      sW[tid] = W[i]+V[i+blockDim.x];

      __syncthreads();

      for (unsigned int s=blockDim.x/2; s>0; s>>=1)
      {
        if (tid < s)
        {
          sW[tid] += sW[tid + s];
        }
        __syncthreads();
      }
      if (tid == 0) W[blockIdx.x] = sW[0];
    }
  }
}
"""

In [None]:
def load_matrix_module():
  """Compile `matrix_module` and return its kernels
  (squaredReduction, reduce), in that order."""
  compiled = SourceModule(matrix_module, keep=False, no_extern_c=True)
  squared_kernel = compiled.get_function("squaredReduction")
  reduce_kernel = compiled.get_function("reduce")
  return squared_kernel, reduce_kernel

def launch_matrix_normalization(W):
  """Work-in-progress GPU computation of the partial sums of squares of W.

  The matrix is flattened, zero-padded to fill the launch configuration and
  split over 64 streams; `squaredReduction` produces per-block sums of
  squares for each chunk, then `reduce` is run on pairs of those result
  vectors.

  Args:
    W: square matrix. NOTE(review): the size assertions below compare byte
      counts against float32 - presumably W must already be float32.

  Returns:
    h_V: a list with one float32 host array per second-stage stream.
    The final summation, the square root and the actual normalisation are
    not implemented yet (see the comments at the end of the function).
  """
  sqrt_reduce, s_reduce = load_matrix_module()

  ####################################################################################################
  # Stage 1: per-chunk sum of squares

  float_size = numpy.dtype(numpy.float32).itemsize

  k = W.shape[0] #matrix order
  W = W.flatten()
  n = W.shape[0]

  txb = 128
  n_streams = 64
  rf = 2  # elements read by each thread

  block, grid, mem_chunks, padding = block_grid_mem_chunks_setup(n, n_streams, txb, rf=rf, padding=True)

  # zero padding does not change the sum of squares
  if(padding > 0):
    W = numpy.float32(numpy.append(W, numpy.zeros(padding)))
  
  assert(grid[0]*block[0]*n_streams == W.shape[0] / 2)

  # HOST 
  streams = [None] * n_streams
  h_V = [None] * n_streams
  h_W = [None] * n_streams

  # DEVICE
  d_W = [None] * n_streams
  d_V = [None] * n_streams

  # one partial result per block
  V_size = grid[0]

  for i in range(0, n_streams):
    streams[i] = cuda.Stream()

    # slice of the flattened matrix handled by stream i
    s =  i*mem_chunks[i]
    f = (i+1)*mem_chunks[i]
    
    # Allocate on GPU Matrix
    d_W[i] = cuda.mem_alloc(mem_chunks[i] * float_size)

    assert mem_chunks[i] == W[s:f].shape[0]
    assert mem_chunks[i]*float_size == W[s:f].nbytes, "W elemnts type "+str(W.dtype)

    cuda.memcpy_htod_async(d_W[i], W[s:f], streams[i])

    # Allocate on GPU vector where result is put in
    d_V[i] = cuda.mem_alloc(V_size * float_size)

    # define shared memory size
    smem = block[0]*float_size

    #launch kernel
    sqrt_reduce(d_W[i], d_V[i], numpy.uint(mem_chunks[i]),
                grid=grid, block=block, shared=smem, stream=streams[i])

  cuda.Context.synchronize()

  ###############################################################################################################
  # Stage 2: reduce the per-block results pairwise
  
  # each stream i is given h_V[i*2] and h_V[i*2 + 1]
  # a reduction is performed on these two vector portions
  # the result is found by summing the first value of all even-indexed h_V

  n = V_size * len(h_V)
  txb = int(txb/2)
  n_streams = int(n_streams/2)

  assert txb%2==0
  assert n_streams%2==0

  block, grid, mem_chunks = block_grid_mem_chunks_setup(n, n_streams, txb, rf=rf)
  
  assert(grid[0]*block[0]*n_streams == n / 2)
  assert(V_size >= grid[0])

  # HOST 
  streams = [None] * n_streams
  h_V = [None] * n_streams

  for i in range(0, n_streams):
    streams[i] = cuda.Stream()

    # define shared memory size
    smem = block[0]*float_size
    
    #launch kernel
    s_reduce(d_V[i*2], d_V[i*2+1], numpy.uint32(V_size),
                grid=grid, block=block, shared=smem, stream=streams[i])

    h_V[i] = numpy.zeros(grid[0], dtype=numpy.float32)
    # NOTE(review): the kernel above was given d_V[i*2] as its first argument,
    # but the copy below reads d_V[i] - confirm which buffer is intended
    cuda.memcpy_dtoh_async(h_V[i], d_V[i].as_buffer(grid[0]*float_size, 0), streams[i])

  cuda.Context.synchronize()

  # TODO: now the CPU sums the h_V values and computes the square root,
  # then the GPU normalizes the matrix,
  # which is already in GPU memory


  return h_V

In [None]:

# Try the (work-in-progress) GPU normalization on a random k x k float32 matrix.
k = 8192
W = numpy.float32(numpy.random.rand(k, k) + 2 )

# CPU reference: W divided by its norm (not compared with the GPU output yet)
result = W / numpy.linalg.norm(W)


# currently returns the partial per-stream reduction results
w = launch_matrix_normalization(W)



In [None]:
import numpy
import math

## ORIGINAL W
# Sanity check: flattening W, cutting it into one contiguous chunk per stream
# and gluing the chunks back together must reproduce W exactly.
k = 10
W = numpy.zeros((k, k))
W += 2
W[0,:] += 2

n = W.shape[0]*W.shape[1]
W_0 = W.flatten()

n_streams = 2
chunk = n // n_streams

# the split below only covers W if n is a multiple of n_streams
assert chunk*n_streams == n

W_c = [None] * n_streams
for i in range(0, n_streams):
  s =  i*chunk
  f = (i+1)*chunk
  W_c[i] = W_0[s:f]

# rebuild the matrix from the chunks (not from W itself, which would make the
# comparison below trivially true)
W_0 = numpy.concatenate(W_c).reshape((k, k))

# .all() must be called: a bare bound method is always truthy
assert (W == W_0).all()

In [None]:
# Sanity check: the 2-norm of the column-major flattened matrix equals the
# Frobenius norm of the matrix, so either can be used to normalise W.
n = 3
numpy.random.seed(0)
W = numpy.random.rand(n, n)
fW = W.flatten(order="F") #column major order
e_norm = numpy.linalg.norm(fW)
f_norm = numpy.linalg.norm(W)

# the two norms sum the same squares in a different order, so compare with a
# floating-point tolerance rather than bit-for-bit
assert numpy.isclose(e_norm, f_norm)

assert numpy.allclose(W / e_norm, W / f_norm)

# ⚠️ TESTS

## 🏎 Speed Tests

In [None]:
def avg_time_of(f, w, name, r=10):
  """Call ``f(w)`` ``r`` times, print the mean duration and return the last result.

  Timing relies on the notebook's ``tic()`` / ``toc(False)`` helpers, which are
  defined elsewhere; ``toc(False)`` is assumed to return the seconds elapsed
  since the matching ``tic()`` -- TODO confirm against their definition.

  The function is intentionally not compiled with numba: it only drives an
  arbitrary Python callable and prints, so there is nothing for a JIT to speed up.

  Parameters:
    f:    callable taking ``w`` as its only argument.
    w:    input forwarded to ``f``.
    name: label used in the printed report.
    r:    number of timed calls that the total is averaged over.

  Returns:
    The value returned by the final call to ``f(w)``.
  """
  t = 0
  # the first r-1 calls are timed and their results discarded
  for _ in range(0, int(r-1)):
    tic()
    f(w)
    t += toc(False)

  # the last call is timed as well and its result is kept for the caller
  tic()
  theta = f(w)
  t += toc(False)

  print( "Average time of %s: \t\t %f seconds.\n" %(name, (t/r)) )

  return theta

### Malloc with(out[hybrid]) streams

In [None]:
%%writefile with_stream.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define N 100000
#define BLOCK_SIZE 128

// Writes sqrt(pow(3.14159, i)) into x[i] for every i < n. Each thread starts at
// its global index and then advances by the total number of launched threads.
__global__ void kernel(float *x, int n)
{
    const int first = threadIdx.x + blockIdx.x * blockDim.x;
    const int step = blockDim.x * gridDim.x;

    int i = first;
    while (i < n) {
        x[i] = sqrt(pow(3.14159,i));
        i += step;
    }
}


// Benchmark driver: splits the N elements evenly over num_streams streams and,
// for each stream, allocates a private device buffer and launches `kernel` on it.
int main()
{
    const int num_streams = 10;
    // number of elements handled by each stream
    const int n_stream = N/num_streams;

    cudaStream_t streams[num_streams];
    float *data[num_streams];

    dim3 block(BLOCK_SIZE, 1, 1);
	  dim3 grid((n_stream + block.x - 1) / block.x, 1, 1);  // ceil(n_stream / BLOCK_SIZE) blocks

    for (int i = 0; i < num_streams; i++) {
        cudaStreamCreate(&streams[i]);
 
        // one allocation per stream, interleaved with the launches
        cudaMalloc(&data[i], n_stream * sizeof(float));
        
        // launch one worker kernel per stream
        kernel<<<grid, block, 0, streams[i]>>>(data[i], n_stream);
    }


    // wait for every stream to finish, then tear the device context down
    cudaDeviceSynchronize();
    cudaDeviceReset();

    return 0;
}

In [None]:
%%writefile hyb_stream.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

#define N 100000
#define BLOCK_SIZE 128

// Writes sqrt(pow(3.14159, i)) into x[i], starting at (global thread index +
// offset) and advancing by the total number of launched threads.
// Note that `n` bounds the absolute index i (offset included), it is not a count.
__global__ void kernel(float *x, int n, int offset)
{
    const int stride = blockDim.x * gridDim.x;
    int i = threadIdx.x + blockIdx.x * blockDim.x + offset;

    for (; i < n; i += stride)
        x[i] = sqrt(pow(3.14159,i));
}


// Benchmark driver: a single device allocation of N floats shared by
// num_streams streams, each of which fills its own contiguous slice.
int main()
{
    const int num_streams = 10;
    // number of elements per slice
    const int off_stream = N/num_streams;

    cudaStream_t streams[num_streams];
    float *data;

    cudaMalloc(&data, N * sizeof(float));

    dim3 block(BLOCK_SIZE, 1, 1);
    dim3 grid((off_stream + block.x - 1) / block.x, 1, 1);  // ceil(off_stream / BLOCK_SIZE) blocks

    for (int i = 0; i < num_streams; i++) {
        cudaStreamCreate(&streams[i]);

        // The kernel's bound is an absolute index, so each stream must be given
        // the end of ITS slice. Passing N to every stream made the earlier
        // streams stride into (and rewrite) the slices owned by later streams.
        const int begin = i * off_stream;
        // the last stream also takes the remainder when N is not a multiple of num_streams
        const int end = (i == num_streams - 1) ? N : begin + off_stream;

        // launch one worker kernel per stream
        kernel<<<grid, block, 0, streams[i]>>>(data, end, begin);
    }


    // wait for every stream to finish, then tear the device context down
    cudaDeviceSynchronize();
    cudaDeviceReset();

    return 0;
}

In [None]:
%%writefile no_stream.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include "src/common.h"

#define N 100000
#define BLOCK_SIZE 128

// Stores sqrt(pow(3.14159, i)) in x[i] for all i < n; a thread handles its own
// global index plus every further index one full launch width away.
__global__ void kernel(float *x, int n)
{
    const int launch_width = blockDim.x * gridDim.x;
    int i = threadIdx.x + blockIdx.x * blockDim.x;

    for (; i < n; i += launch_width) {
        x[i] = sqrt(pow(3.14159,i));
    }
}


// Benchmark driver: one device allocation of N floats filled by a single
// kernel launch on the default stream (baseline for the stream variants).
int main()
{
    float *data;
    CHECK(cudaMalloc(&data, N * sizeof(float)));

    dim3 block(BLOCK_SIZE, 1, 1);
    dim3 grid((N + block.x - 1) / block.x, 1, 1);  // ceil(N / BLOCK_SIZE) blocks

    // The execution configuration is <<<grid, block>>>; the two were swapped,
    // which launched BLOCK_SIZE blocks of grid.x threads instead.
    kernel<<<grid, block>>>(data, N);
    // a bad launch configuration is only reported through cudaGetLastError()
    CHECK(cudaGetLastError());

    CHECK(cudaDeviceSynchronize());
    cudaDeviceReset();

    return 0;
}

In [None]:
# Compile the three stream-benchmark sources written by the %%writefile cells above.
!nvcc -o /content/no_stream /content/no_stream.cu
!nvcc -o /content/hyb_stream /content/hyb_stream.cu
!nvcc -o /content/with_stream /content/with_stream.cu

In [None]:
!nvprof ./no_stream

In [None]:
!nvprof ./hyb_stream

In [None]:
!nvprof ./with_stream

In [None]:
!time ./hyb_stream

In [None]:
!time ./no_stream

In [None]:
!time ./with_stream

### Theta, sin, cos

In [None]:
%%cuda --name theta_test.cu

#include <stdio.h>
#include <stdlib.h>
#include "../src/common.h"
#include <cuda.h>
#include <curand.h>
#include "cublas_v2.h"
#include <math.h>

#define ALPHA 2*M_PI
#define N 10000
#define BLOCK_SIZE 128


// Scales each of the N entries of `theta` by ALPHA in place and stores the
// cosine and sine of the scaled value in cosTheta / sinTheta.
__global__ void Kernel_1(float *theta, float* cosTheta, float* sinTheta)
{
  const unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;

  // threads beyond the N-element vectors have nothing to do
  if (idx >= N)
    return;

  const float angle = ALPHA * theta[idx];
  theta[idx] = angle;
  cosTheta[idx] = cos(angle);
  sinTheta[idx] = sin(angle);
}

/**
* MAIN
*/
int main() {
  // Generates N uniform random floats on the device with cuRAND, then runs
  // Kernel_1 to scale them by ALPHA and compute their cosine and sine.

  //# curand
  curandGenerator_t gen;

  float *dTheta, *dSinTheta, *dCosTheta;

  /* # CREATE RANDOM VECTOR # */

  /* Allocate n floats on device */
  CHECK(cudaMalloc((void **) & dTheta, N*sizeof ( float )));

  /* Create pseudo - random number generator */
  CHECK_CURAND(curandCreateGenerator (&gen, CURAND_RNG_PSEUDO_XORWOW));

  /* Set seed */
  CHECK_CURAND(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));

  /* Generate N floats on device */
  CHECK_CURAND(curandGenerateUniform (gen , dTheta, N));

  /* # CALL KERNEL 1 # */

  /* Allocate n floats on device */
  CHECK(cudaMalloc((void **) & dCosTheta, N*sizeof ( float )));
  /* Allocate n floats on device */
  CHECK(cudaMalloc((void **) & dSinTheta, N*sizeof ( float )));

  dim3 block(BLOCK_SIZE, 1, 1);
  dim3 grid((N + block.x - 1) / block.x, 1, 1);  /* ceil(N / BLOCK_SIZE) blocks */

  /* The execution configuration is <<<grid, block>>>; the two were swapped. */
  Kernel_1<<<grid, block>>>(dTheta, dCosTheta, dSinTheta);
  /* launch-configuration errors are only reported through cudaGetLastError() */
  CHECK(cudaGetLastError());

  CHECK(cudaDeviceSynchronize());
  CHECK_CURAND(curandDestroyGenerator(gen));

  /* release the device buffers allocated above */
  CHECK(cudaFree(dTheta));
  CHECK(cudaFree(dCosTheta));
  CHECK(cudaFree(dSinTheta));

  return 0;
}

In [None]:
!nvcc -o /content/theta_test_cuda /content/src/theta_test.cu -lcurand -lcublas
#!./theta_test_cuda

In [None]:
%%writefile src/theta_test_pycuda.py
import numpy
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import math

def get_chunks_of_size_until_empty(size, n):
  """Split ``n`` items into consecutive chunks of at most ``size`` items.

  Every chunk holds ``size`` items except possibly the last one, which holds
  whatever is left over.

  Parameters:
    size: maximum number of items per chunk (must be non-zero).
    n:    total number of items to distribute.

  Returns:
    A list of chunk lengths; empty when there is nothing to distribute
    (``n <= 0``), where the previous implementation raised IndexError.
  """
  n_chunks = math.ceil(n/size)

  if n_chunks <= 0:
      return []

  # all chunks but the last are full
  chunks = [size] * (n_chunks - 1)

  # the last chunk takes whatever the full ones did not cover
  remainder = n - size * (n_chunks - 1)
  chunks.append(size if remainder == size else int(remainder))

  return chunks

def block_grid_mem_chunks_setup(n, n_streams, txb, rf=1):
  """Derive the launch geometry and per-stream chunk sizes for ``n`` elements.

  Parameters:
    n:         total number of elements.
    n_streams: number of streams the elements are spread over; must divide ``n``.
    txb:       threads per block.
    rf:        divisor applied to the per-stream grid size (each thread is then
               counted as covering ``rf`` elements).

  Returns:
    ``(block, grid, mem_chunks)`` where ``block`` and ``grid`` are 3-tuples and
    ``mem_chunks`` is the list of chunk lengths, one per stream.

  Raises:
    AssertionError if ``n`` is not a multiple of ``n_streams``, or if the
    resulting chunk list does not have exactly ``n_streams`` entries.
  """
  per_stream = n/n_streams

  # every stream must receive the same whole number of elements
  assert(math.floor(per_stream)==(per_stream))
  assert(per_stream*n_streams == n)

  block = ( txb, 1, 1 )
  grid = ( math.ceil( per_stream / txb / rf ), 1, 1 )
  threads_per_stream = grid[0]*block[0]

  # the launched threads (times rf) must cover all the elements
  assert(threads_per_stream*n_streams*rf >= n)

  mem_chunks = get_chunks_of_size_until_empty(threads_per_stream*rf, n)
  covered = numpy.sum(mem_chunks)

  assert ( len(mem_chunks) == n_streams)
  assert ( covered >= n), ""+str(covered)

  return block, grid, mem_chunks

# CUDA source compiled at runtime by load_theta_module().
#   initCurand: seeds one curandState per thread (sequence number = thread index).
#   genVector:  draws a * curand_uniform() per thread and stores the value
#               together with its cosine and sine.
theta_module = """ 
#include <curand_kernel.h>

 extern "C" { 
  __global__ void initCurand(curandState* states, unsigned int n, unsigned int seed)
  {
      unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
      
      if (tid < n)
      {
        curand_init(seed, tid, 0, &states[tid]);
      }
    }

  __global__ void genVector( curandState* states, float *theta, float* cos_theta,
                   float* sin_theta, unsigned int n, float a)
  {
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    
    if (tid < n)
    {
      float x = a * curand_uniform(&states[tid]);
      theta[tid] = x;
      cos_theta[tid] = cos(x);
      sin_theta[tid] = sin(x);
    }
  }
}

"""

def load_theta_module():
  """Compile the ``theta_module`` CUDA source and hand back its two kernels.

  Returns:
    ``(init_curand, gen_vector)``: the ``initCurand`` and ``genVector``
    functions looked up in the compiled module.
  """
  compiled = SourceModule(theta_module, keep=False, no_extern_c=True)
  return compiled.get_function("initCurand"), compiled.get_function("genVector")

def launch_theta_module(init_curand, gen_vector, n, n_streams=64):
  """Generate ``n`` random angles and their cos/sin on the GPU, spread over streams.

  Parameters:
    init_curand: the ``initCurand`` kernel returned by load_theta_module().
    gen_vector:  the ``genVector`` kernel returned by load_theta_module().
    n:           total number of values to generate; must be a multiple of
                 ``n_streams`` (checked by block_grid_mem_chunks_setup).
    n_streams:   number of streams / chunks the work is split into.

  Returns:
    ``(d_theta, d_cos_theta, d_sin_theta, mem_chunks)``: three lists of device
    allocations (one entry per stream) and the list of chunk lengths.
  """
  # this submodule is not imported explicitly anywhere else in the script
  import pycuda.characterize

  # set up streams, chunks size, block size, grid size
  float_size = numpy.dtype(numpy.float32).itemsize
  txb = 128
  block, grid, mem_chunks = block_grid_mem_chunks_setup(n, n_streams, txb)

  # size of one curand state; queried once instead of once per stream
  state_size = pycuda.characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>')

  # Define HOST streams and curand states
  streams = [None] * n_streams
  states = [None] * n_streams
  # Define DEVICE Algorithms variables
  d_theta = [None] * n_streams
  d_cos_theta = [None] * n_streams
  d_sin_theta = [None] * n_streams

  # launch Init Curand (the stream index doubles as the seed)
  for i in range(0, n_streams):
    streams[i] = cuda.Stream()
    states[i] = cuda.mem_alloc(mem_chunks[i] * state_size)
    init_curand(states[i], numpy.uint32(mem_chunks[i]), numpy.uint32(i), grid=grid, block=block, stream=streams[i])

  for i in range(0, n_streams):
    #Allocate DEVICE Algorithms variables
    d_theta[i] = cuda.mem_alloc(mem_chunks[i]*float_size)
    d_cos_theta[i] = cuda.mem_alloc(mem_chunks[i]*float_size)
    d_sin_theta[i] = cuda.mem_alloc(mem_chunks[i]*float_size)

    # Launch kernel
    gen_vector(
          states[i], d_theta[i], d_cos_theta[i], d_sin_theta[i],
          numpy.uint32(mem_chunks[i]), numpy.float32(numpy.pi*2),
          grid=grid, block=block, stream=streams[i])

  # Wait once, after every stream has been issued its work. Synchronising
  # inside the loop forced each stream to finish before the next one started.
  cuda.Context.synchronize()

  return d_theta, d_cos_theta, d_sin_theta, mem_chunks


# Script entry point: compile the kernels, generate n values over n_streams
# streams, then exit with status 0. The returned device buffers are not used.
n = 1000000
n_streams = 64
init, gen = load_theta_module()
launch_theta_module(init, gen, n, n_streams)
exit(0)

In [None]:
%%writefile src/theta_test_numpy.py
import numpy
numpy.random.seed(0)
theta = numpy.pi*2*numpy.random.rand(1000000)
sin_theta = numpy.sin(theta)
cos_theta = numpy.cos(theta)

Execution under NVPROF

In [None]:
#!nvprof --print-gpu-summary ./theta_test_cuda
#!nvprof --print-gpu-trace ./theta_test_cuda
!nvprof ./theta_test_cuda

In [None]:
!nvprof python src/theta_test_pycuda.py

Timing

---------------------

`real` is the wall-clock time from start to finish of the call, i.e. from the moment you hit the Enter key until the moment the command completes.
`user` amount of CPU time spent in user mode.
`sys` amount of CPU time spent in kernel mode.

In [None]:
!time python ./src/theta_test_pycuda.py

In [None]:
!time python ./src/theta_test_numpy.py

In [None]:
!time ./theta_test_cuda

### Between Whole Functions

Warm-up before the real tests

In [None]:
%%capture
# Warm-up: same calls as the timed cell below, but on a small input
# (first rand_blobs argument 5 instead of 10000) and with a single trial.
X, l, W, n = rand_blobs(5, d=28*28, k=5, std=5, standard=False, display=False)
W = pairwise_distances(X,X, metric='cosine')
trial = 1

avg_time_of(CSN_LEGACY, W, "CSN Legacy", trial)
avg_time_of(CSN_CPU, W, "CSN CPU", trial)
avg_time_of(CSN_CPU_2, W, "CSN CPU 2", trial)
#avg_time_of(CSN_CUPY, W, "CSN CUPY", trial) # too slow

#W = None
#W_CUPY = None

Real tests

In [None]:
# Timed run: build the cosine pairwise-distance matrix of the generated data
# and report the average time of each CSN implementation over `trial` calls.
X, l, W, n = rand_blobs(10000, d=28*28, k=5, std=5, standard=False, display=False)
W = pairwise_distances(X,X, metric='cosine')
trial = 3

avg_time_of(CSN_LEGACY, W, "CSN Legacy", trial)
avg_time_of(CSN_CPU, W, "CSN CPU", trial)
avg_time_of(CSN_CPU_2, W, "CSN CPU 2", trial)
#avg_time_of(CSN_CUPY, W, "CSN CUPY", trial) # too slow

print("")

RESULTS on Tesla T4

rounds=21
Average time of CSN Legacy: 		 8.325038 seconds.

rounds=21
Average time of CSN CPU: 		 6.899417 seconds.

rounds=21
Average time of CSN CPU 2: 		 6.752142 seconds.

rounds=21
Average time of CSN CUPY: 		 109.777244 seconds.

rounds=21
Average time of CSN CUPY 2: 		 112.692553 seconds.













### Between single functions

#### cuRand - numpy

In [None]:
#%%writefile cuRand_test.py

import numpy
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import math

# CUDA source compiled at runtime by loadKernels().
#   initCurand: seeds one curandState per thread with the fixed seed 1234
#               (sequence number = thread index).
#   genVector:  draws two consecutive uniform samples per thread, scales each
#               by `a` and stores them in `result` and `result2`.
cuRand = """ 
#include <curand_kernel.h>

 extern "C" { 
  __global__ void initCurand(curandState* states, unsigned int n)
  {
      unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
      
      if (tid < n)
      {
        curand_init(1234, tid, 0, &states[tid]);
      }
    }

  __global__ void genVector(curandState* states, float *result, float *result2, unsigned int n, float a)
  {
    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
    
    if (tid < n)
    {
      result[tid] = a * curand_uniform(&states[tid]);
      result2[tid] = a * curand_uniform(&states[tid]);
    }
  }
}

"""

def loadKernels():
  """Compile the ``cuRand`` CUDA source and return ``(initCurand, genVector)``."""
  compiled = SourceModule(cuRand, keep=False, no_extern_c=True)
  return compiled.get_function("initCurand"), compiled.get_function("genVector")

def init(n, h_n, grid, block):
  """Allocate ``n`` curand states on the device and seed them.

  Parameters:
    n:     number of states to allocate.
    h_n:   the same count in the form passed to the kernel.
    grid:  grid dimensions for the ``init_curand`` launch.
    block: block dimensions for the ``init_curand`` launch.

  Returns:
    The device allocation holding the initialised states.
  """
  # Curand States generation
  state_bytes = pycuda.characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>')
  d_states = cuda.mem_alloc(n * state_bytes)
  init_curand(d_states, h_n, grid=grid, block=block)
  return d_states

def generate_vector(states, n, h_n, h_pi2, grid, block):
  """Fill two length-``n`` float32 vectors with ``gen_vector`` and copy them to the host.

  Parameters:
    states: device allocation of curand states (as returned by ``init``).
    n:      length of each output vector.
    h_n:    the same length in the form passed to the kernel.
    h_pi2:  scale factor passed to the kernel.
    grid:   grid dimensions for the launch.
    block:  block dimensions for the launch.

  Returns:
    A tuple of two host arrays, one per kernel output.
  """
  # Curand vector generation: two host buffers and two device buffers of equal size
  host_out = [numpy.zeros(n, dtype=numpy.float32) for _ in range(2)]
  dev_out = [cuda.mem_alloc(host_out[0].nbytes) for _ in range(2)]

  gen_vector(states, dev_out[0], dev_out[1], h_n, h_pi2, grid=grid, block=block)

  # get result
  for host_buf, dev_buf in zip(host_out, dev_out):
    cuda.memcpy_dtoh(host_buf, dev_buf)

  return host_out[0], host_out[1]

# Compile the kernels; init() and generate_vector() read these two globals.
init_curand, gen_vector = loadKernels()

#vector size
n = 1000
#threads per block
threadsperblock = 128
pi2 = numpy.pi*2

#host constants, converted to the fixed-width types passed to the kernels
h_n = numpy.uint32(n)
h_pi2 = numpy.float32(pi2)

#grid size definition: ceil(n / threadsperblock) blocks
grid = (math.floor((n + threadsperblock - 1) / threadsperblock), 1, 1)
block = (threadsperblock,1,1)

states = init(n, h_n, grid, block)
v, v1 = generate_vector(states, n, h_n, h_pi2, grid, block)

# show the first five values of each generated vector
print(v[0:5])
print(v1[0:5])

#print(v[0:20])

size = 1.000.000

Elapsed time: 3.738552 seconds.

In [None]:
!rm cuRand_test.nvvp
!nvprof -s -o cuRand_test.nvvp python cuRand_test.py

In [None]:
#### REMOVE FROM PREVIOUS CELL <<%%writefile ...>> AND ADD THIS CELL AT THE BOTTOM
#### TO TEST CURAND AND NUMPY

print("curand")
tic()
# generate_vector takes the curand states as its first argument; it was missing
# here, which shifted every other argument by one position.
theta_gpu = generate_vector(states, n, h_n, h_pi2, grid, block)
toc()

print("numpy.random.rand")
tic()
numpy.random.seed(0)
theta_cpu = numpy.pi*2*numpy.random.rand(n)
toc()

#print("theta gpu shape"+str(theta_gpu.shape))
#print("theta cpu shape"+str(theta_cpu.shape))

#### numpy - cupy

In [None]:
import numpy
import cupy

def np(n):
  """NumPy baseline: draw ``n`` angles in [0, 2*pi), take sin and cos, print the shapes."""
  numpy.random.seed(0)
  angles = 2*numpy.pi*numpy.random.rand(n)

  sines = numpy.sin(angles)
  cosines = numpy.cos(angles)

  for values in (angles, sines, cosines):
    print(values.shape)

def cp(n):
  """CuPy counterpart of ``np``: same computation, then each result is converted
  with ``cupy.asnumpy``; the converted arrays are discarded and nothing is returned."""
  cupy.random.seed(0)
  angles = 2*cupy.pi*cupy.random.rand(n)

  sines = cupy.sin(angles)
  cosines = cupy.cos(angles)

  # convert in the same order as the values were produced
  for device_values in (angles, sines, cosines):
    cupy.asnumpy(device_values)

  return

In [None]:
# Small first run of both variants before the timed run.
n = 20

np(n)
cp(n)

# Timed run on n = 1000 * k elements.
k=1000
n = 1000*k

tic()
np(n)
t0 = toc()

tic()
cp(n)
t1 = toc()




## ✔ Correctness Tests

In [None]:
X, l, W, n = rand_blobs(100, d=28*28, k=5, std=5, standard=False, display=False)
W = pairwise_distances(X,X, metric='cosine')

In [None]:
#%%capture
# absolute tolerance used by the comparison cell that follows
eps = 0.00001

# run every implementation once; t0 (legacy) is the reference the others are compared to
t0 = CSN_LEGACY(W)
t1 = CSN_CPU(W)
t2 = CSN_CPU_2(W)
# NOTE(review): W_CUPY is not assigned in this cell or the preceding one -- confirm it is defined earlier
t3 = CSN_CUPY(W_CUPY)


In [None]:
print(" CSN_CPU is %s" % numpy.allclose(t0, t1, atol=eps))
print(" CSN_CPU_2 is %s" % numpy.allclose(t0, t2, atol=eps))
print(" CSN_CPU_CUPY is %s" % numpy.allclose(t0, t3, atol=eps))

# 🏞 SHOW RESULT

In [None]:
X, l, W, n = rand_blobs(10, d=28*28, k=5, std=5, standard=False)
W = pairwise_distances(X,X, metric='cosine')

In [None]:
theta = CSN_CPU(W)

In [None]:
hist, bins = histogram(theta, nbins=256)
plot_circle(theta,l)
plot_hist(hist, bins, mode=0)

# scratch

In [None]:
// Scratch / work in progress: sets up one CUDA stream, one cuBLAS handle bound
// to that stream, and one device buffer per chunk of an N x N matrix.
// Nothing is computed yet: `matrix`, `block` and `grid` are not used so far.
void matrix_norm(float *matrix)
{
  const uint num_streams = 100;
  // number of matrix elements assigned to each stream
  const uint chunk = (N*N)/num_streams;

  cublasHandle_t handles[num_streams];
  cudaStream_t streams[num_streams];
  float *data[num_streams];

  dim3 block(BLOCK_SIZE, 1, 1);
  dim3 grid((chunk + block.x - 1) / block.x, 1, 1);

  for (int i = 0; i < num_streams; i++) {
      CHECK(cudaStreamCreate(&streams[i]));
      // the arrays are declared as `handles` / `streams`; the singular names
      // used here before were undeclared
      CHECK_CUBLAS(cublasCreate(&handles[i]));
      CHECK_CUBLAS(cublasSetStream(handles[i], streams[i]));

      CHECK(cudaMalloc(&data[i], chunk * sizeof(float)));
      // TODO: the draft stopped here with an unfinished `CHECK(cuda)` call
      // (presumably the copy of this chunk of `matrix` into data[i] -- confirm).
  }


}