Hypothesis: It's really only worth using this for very large Hamiltonians, large enough that you can't precompute the fermion inner-products and hold them in memory. Otherwise, the benefit of precomputing the inner-products far outweighs anything you'll get from Numba/Cuda. 

In [1]:
import os 
import time
import copy
from tqdm import tqdm

import numpy as np 
import math
import matplotlib.pyplot as plt
import scipy
import pandas as pd

from joblib import Parallel, delayed
from numba import jit, njit, prange, cuda, float32, float64, complex64, complex128, types
import numba as nb

np.random.seed(0)

# Global variables
TPB = 16 # threads per block
BPG_MULTIPLIER = 1 # blocks per grid multiplier
REAL_FLOAT_TYPE = np.float64
REAL_COMPLEX_TYPE = np.complex128
REAL_INT_TYPE = np.int32
LOAD_PYTHON = False#True # Whether to generate Hamiltonians or load them from file (along with the associated coefficients, of course)

# Physical constants
K=7 # number of fermionic modes
J=4 # ~"energy scale"
Q=3 # order of coupling
N = 2*K # number of fermions
N_DIM = 2**K # Hilbert space dimensions

N_SAMPLES = 10 # number of samples to generate
N_JOBS = 20 # number of jobs to run in parallel

# 1. Define Python Hamiltonian as benchmark

In [2]:
#### Define fermionic modes
cr = np.array([[0,1],[0,0]])
an = np.array([[0,0],[1,0]])
id = np.identity(2)
id2 = np.array([[-1,0],[0,1]])

def c(n):
    factors = [id for i in range(n-1)]+[cr]+[id2 for i in range(K-n)]
    out = factors[0]
    for i in range(1, K):
        out = np.kron(out,factors[i])
    return out

def cd(n):
    factors = [id for i in range(n-1)]+[an]+[id2 for i in range(K-n)]
    out = factors[0]
    for i in range(1, K):
        out = np.kron(out,factors[i])
    return out

#### Define fermions
# Compute first N psi's
psi_h = np.zeros((N, N_DIM, N_DIM), dtype=np.complex128)
for i in range(1,K+1):
    psi_h[2*(i-1)] = (c(i)+cd(i))/np.sqrt(2)
    psi_h[2*(i-1)+1] = (c(i)-cd(i))*(-1j/np.sqrt(2))

# Copy to GPU
psi_d = cuda.to_device(psi_h)

def H3_python(js): #js being the random coefficients
    # Compute Hamiltonian
    H = np.zeros((N_DIM, N_DIM), dtype=np.complex128)
    for i in range(N-2):
        psi_i = psi_h[i]
        for j in range(i+1, N-1):
            psi_ij=psi_i@psi_h[j]
            for k in range(j+1, N):
                psi_ijk = psi_ij@psi_h[k]
                H += (1j**(Q/2))*js[i, j, k]*psi_ijk

    return H

#### Generate random coefficients
sigma_j = np.sqrt((J**2)*np.math.factorial(Q-1)/(N**(Q-1)))
js_all = [np.random.normal(0, sigma_j, size=tuple([N for i in range(Q)])) for j in range(N_SAMPLES+1)]

In [3]:
# Test: 55.O SECONDS
js_test = js_all[0]
H3_python_test = H3_python(js_test)

# For K=7: 0.2 seconds
# For K=10: 62.3 SECONDS
if LOAD_PYTHON:
    js_test = np.load(os.path.join("Excel", "Benchmarks", f"js{Q}_benchmark.npy"))
    H3_python_test = np.load(os.path.join("Excel", "Benchmarks", f"H{Q}_python_benchmark.npy"))
else:                   
    js_test = js_all[0]

    tic = time.time()
    H3_python_test = H3_python(js_test)
    toc = time.time()
    duration = toc-tic
    print(f"Hamiltonian generation, python: {duration//60} minutes, {duration%60} seconds")

    np.save(os.path.join("Excel", "Benchmarks", f"js{Q}_benchmark.npy"), js_test)
    np.save(os.path.join("Excel", "Benchmarks", f"H{Q}_python_benchmark.npy"), H3_python_test)

Hamiltonian generation, python: 0.0 minutes, 0.11816620826721191 seconds


K=7: 0.1s

K=8: 1.4s

# 2. Define CUDA Hamiltonian

Method: elementwise function for Hamiltonian

In essence, each thread is responsible for computing the value at a specific index, e.g. $(\alpha,\beta)$, of the Hamiltonian, which is given by 

$H[\alpha,\beta] = \sum_{k>j>i} J_{ijk} \psi_i[\alpha,:] \psi_j \psi_k[:,\beta]$

In [28]:
#TODO: grid sync, thread sync, shared memory
@cuda.jit(device=True)
def matmul_vecarr_device(A, B, C):
    """Perform square matrix multiplication of C = A * B
    """
    for i in range(len(C)):
        tmp = 0.
        for j in range(len(A)):
            tmp += A[j] * B[i,j]
        C[i] = tmp

@cuda.jit
def matmul_vecvec_device(A, B, C):
    """Perform square matrix multiplication of C = A * B
    """
    for i in range(len(C)):
        tmp = 0.
        for j in range(len(A)):
            tmp += A[j] * B[j]
        C[i]=tmp

@cuda.jit(device=True)
def matmul_device(A, B, C):
    """Perform square matrix multiplication of C = A * B
    """
    for i in range(N_DIM):
        for j in range(N_DIM):
            tmp = 0.
            for k in range(A.shape[1]):
                tmp += A[i, k] * B[k, j]
        C[i, j] = tmp

@cuda.jit(debug=True, opt=False)#(types.void(types.float64[:,:], types.complex128[:,:]))
def H3_cuda(js, H3, psi):

    alpha, beta = cuda.grid(2) # These are the indeces of H3 for which this kernel is responsible
    if (alpha < H3.shape[0]) and (beta < H3.shape[1]):
        
        #cuda.syncthreads()
        H_ab = 0
        for i in range(js.shape[0]-2):
            psi_i = psi[i]
            psi_i_a = psi[i][alpha,:]

            for j in range(i+1, js.shape[1]-1):
                psi_j = psi[j]

                psi_j1 = psi[j+1]
                # 2 lines below crop error
                #psi_ij1 = cuda.shared.array((N_DIM, N_DIM), dtype=np.complex128)
                #matmul_device(psi_i, psi_j1, psi_ij1)

                # Compute dot product: psi_i[alpha,:] @ psi_j[:,beta]
                psi_ij_a = cuda.shared.array((N_DIM,), dtype=np.complex128) # initialize destination array of intermediate product, in other words "shared memory"
                matmul_vecarr_device(psi_i_a, psi_j, psi_ij_a)

                for k in range(j+1, js.shape[2]):
                    psi_k_b = psi[k][:,beta]
                    
                    # Compute dot product: psi_ij_a @ psi_k_b
                    psi_ijk_ab = cuda.shared.array((1,1), dtype=np.complex128) # initialize destination array of intermediate product, in other words "shared memory"
                    matmul_vecvec_device(psi_ij_a, psi_k_b, psi_ijk_ab)
                    
                    j_ijk = js[i, j, k]
                    #H_ab += (1j**(Q/2))*j_ijk*psi_ijk_ab[0,0]
                    cuda.atomic.add(H3, (alpha, beta), (1j**(Q/2))*j_ijk*psi_ijk_ab[0,0])

        H3[alpha, beta] = H_ab

# Test
js_test_h = js_all[0]
js_test_d = cuda.to_device(js_test_h)

H3_test_h = np.zeros((N_DIM, N_DIM), dtype=np.complex128)
#H3_test_d = cuda.device_array((N_DIM, N_DIM), dtype=np.complex128)
H3_test_d = cuda.to_device(H3_test_h)

threadsperblock = (4, 4)
blockspergrid_x = math.ceil(H3_test_d.shape[0] / threadsperblock[0])
blockspergrid_y = math.ceil(H3_test_d.shape[1] / threadsperblock[1])
blockspergrid = (blockspergrid_x, blockspergrid_y)

tic = time.time()
print(f"Config: K={K}, Q={Q}, J={J}, threadsperblock={threadsperblock}, blockspergrid={blockspergrid}")
H3_cuda[blockspergrid, threadsperblock](js_test_d, H3_test_d, psi_d)
duration = time.time() - tic
print(f"Duration: {duration//60} minutes, {duration%60} seconds")


Config: K=7, Q=3, J=4, threadsperblock=(4, 4), blockspergrid=(32, 32)


TypingError: Failed in cuda mode pipeline (step: nopython frontend)
[1m[1m[1mNo implementation of function Function(<class 'numba.cuda.stubs.atomic.add'>) found for signature:
 
 >>> add(array(complex128, 2d, C), UniTuple(int32 x 2), complex128)
 
There are 2 candidate implementations:
[1m  - Of which 2 did not match due to:
  Overload of function 'add': File: numba\cuda\cudadecl.py: Line 390.
    With argument(s): '(array(complex128, 2d, C), UniTuple(int32 x 2), complex128)':[0m
[1m   No match.[0m
[0m
[0m[1mDuring: resolving callee type: Function(<class 'numba.cuda.stubs.atomic.add'>)[0m
[0m[1mDuring: typing of call at C:\Users\abdel\AppData\Local\Temp\ipykernel_19872\2043246606.py (66)
[0m
[1m
File "..\..\..\..\..\AppData\Local\Temp\ipykernel_19872\2043246606.py", line 66:[0m
[1m<source missing, REPL/exec in use?>[0m


- K=7, bpg_multiplier=64: 44.2 seconds
- K=7, bpg_multiplier=8: 41 seconds
- K=8: 14 minutes, 15.4 seconds

## Check that Python and CUDA implementations agree

In [25]:
H3_cuda_test = H3_test_d.copy_to_host()
print(np.allclose(H3_python_test, H3_cuda_test))
print(np.allclose(H3_python_test.real, H3_cuda_test.real))

False
False


In [26]:
print(H3_python_test[0].real)

[ 0.          0.10991912 -0.13799655  0.         -0.29160098  0.
  0.         -0.24553369  0.06160392  0.          0.          0.35215939
  0.          0.17796138 -0.14947524  0.         -0.01984952  0.
  0.         -0.33390324  0.          0.25467405  0.04530312  0.
  0.          0.28326189  0.05675327  0.          0.25171002  0.
  0.          0.          0.2443454   0.          0.         -0.53768181
  0.          0.44968509 -0.14235685  0.          0.         -0.32796051
 -0.14743016  0.         -0.57624634  0.          0.          0.
  0.         -0.22477497 -0.29709331  0.          0.46000882  0.
  0.          0.         -0.22097222  0.          0.          0.
  0.          0.          0.          0.          0.11725844  0.
  0.          0.11178457  0.         -0.50004678 -0.32169739  0.
  0.         -0.58642729  0.38099436  0.         -0.21829082  0.
  0.          0.          0.         -0.60144644 -0.18077908  0.
 -0.28200682  0.          0.          0.          0.17874883  0.
 

In [27]:
print(H3_cuda_test[0].real)

[ 0.          0.          0.          0.          0.07730385  0.07730385
  0.07730385  0.07730385  0.49138994  0.49138994  0.49138994  0.49138994
  0.          0.          0.          0.          0.04839577  0.04839577
  0.04839577  0.04839577  0.          0.          0.          0.
  0.          0.          0.          0.          0.17877991  0.17877991
  0.17877991  0.17877991 -0.01605593 -0.01605593 -0.01605593 -0.01605593
  0.          0.          0.          0.          0.          0.
  0.          0.         -0.02685485 -0.02685485 -0.02685485 -0.02685485
  0.          0.          0.          0.         -0.27911558 -0.27911558
 -0.27911558 -0.27911558  0.14994833  0.14994833  0.14994833  0.14994833
  0.          0.          0.          0.          0.11725844  0.11725844
  0.11725844  0.11725844  0.          0.          0.          0.
  0.          0.          0.          0.         -0.33931914 -0.33931914
 -0.33931914 -0.33931914  0.          0.          0.          0.
 -0.177588