In [1]:
# Shell escape: run nvidia-smi to show which GPU, driver and CUDA version this runtime exposes.
!nvidia-smi

Wed Mar 22 01:42:12 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   57C    P0    27W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install pycuda scikit-cuda

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycuda
  Downloading pycuda-2022.2.2.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting scikit-cuda
  Downloading scikit_cuda-0.5.3-py2.py3-none-any.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.8/114.8 KB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytools>=2011.2
  Downloading pytools-2022.1.14.tar.gz (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.6/74.6 KB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting appdirs>=1.4.0
  Downloading appdirs-1.4.4-py2.py3-none-any.whl 

In [3]:
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from sympy import Rational


In [11]:
# Compile the Monte Carlo kernel.  `no_extern_c=True` is passed so the source is
# not wrapped in an implicit extern "C" block; the kernel is wrapped in an
# explicit extern "C" block below, after the cuRAND header has been included,
# so that `get_function("estimate_pi")` can look it up by its unmangled name.
ker = SourceModule(no_extern_c=True ,source='''
#include <curand_kernel.h>
// Squared distance of (a, b) from the origin.  Arguments are parenthesised so
// the macro stays correct when it is handed compound expressions.
#define _PYTHAG(a,b)  ((a)*(a) + (b)*(b))
#define ULL  unsigned long long

extern "C" {

// Each thread draws `iters` uniform points (x, y) and counts how many satisfy
// x*x + y*y <= 1.  The count is added to hits[tid], so `hits` must hold one
// element per launched thread.
__global__ void estimate_pi(ULL iters, ULL * hits)
{

	curandState cr_state;

	int tid = blockIdx.x * blockDim.x + threadIdx.x;

	// Seed from the clock, but pass the thread id as the cuRAND *subsequence*
	// instead of folding it into the seed: `clock() + tid` can produce the
	// same value in two different threads, which would hand both of them the
	// same random stream.  With a distinct subsequence per thread, two threads
	// that happen to read the same clock value still draw different numbers.
	curand_init( (ULL) clock64(), (ULL) tid, (ULL) 0, &cr_state);

	float x, y;

	// Count in a local variable and touch the output array once at the end,
	// rather than doing a read-modify-write on hits[tid] every iteration.
	ULL local_hits = 0;

	for(ULL i=0; i < iters; i++)
	{

		 x = curand_uniform(&cr_state);
		 y = curand_uniform(&cr_state);

		 if(_PYTHAG(x,y) <= 1.0f)
			 local_hits++;
	}

	// `+=` keeps the original accumulate-into-existing-value behaviour.
	hits[tid] += local_hits;

	return;

}

}// (End of 'extern "C"' here)
''')





In [12]:
# Launch geometry: a 1-D grid of 1-D blocks, one hit counter per thread.
blocks_per_grid = 512
threads_per_block = 32
total_threads = blocks_per_grid * threads_per_block

# Number of random points drawn by *each* thread (the kernel's loop bound).
iters = 1 << 24

# Handle to the compiled device function.
pi_ker = ker.get_function("estimate_pi")

print(f'total_threads:{total_threads}')
print(f'Total iterations: {iters}')

total_threads:16384
Total iterations: 16777216


In [13]:
# One zero-initialised 64-bit counter per launched thread; the kernel adds its
# hit count into the slot indexed by its global thread id.
hits_d = gpuarray.zeros(
    (total_threads,),
    dtype=np.uint64,
)



In [14]:
# Launch the kernel over the configured 1-D grid; `iters` is passed as a
# 64-bit unsigned value to match the kernel's `unsigned long long` parameter.
grid_dims = (blocks_per_grid, 1, 1)
block_dims = (threads_per_block, 1, 1)
pi_ker(np.uint64(iters), hits_d, grid=grid_dims, block=block_dims)


In [16]:
# Copy the per-thread counters back to the host and add them up.
total_hits = hits_d.get().sum()
# Total number of points sampled across all threads.
total = np.uint64(total_threads) * np.uint64(iters)

# Build the ratio 4 * hits / total as an exact rational first, then evaluate it
# to a Python float.
est_pi_symbolic = 4 * Rational(int(total_hits), int(total))
est_pi = float(est_pi_symbolic.evalf())


In [17]:
# Report the estimate next to NumPy's constant and an `allclose` comparison
# (NumPy's default tolerances), one item per line.
print(
    f"Our Monte Carlo estimate of Pi is : {est_pi}",
    f"NumPy's Pi constant is: {np.pi}",
    f"Our estimate passes NumPy's 'allclose' : {np.allclose(est_pi, np.pi)}",
    sep="\n",
)


Our Monte Carlo estimate of Pi is : 3.141595962792053
NumPy's Pi constant is: 3.141592653589793
Our estimate passes NumPy's 'allclose' : True
