Hypothesis: It's really only worth using this for very large Hamiltonians, i.e. ones large enough that you can't precompute the fermion inner-products and hold them in memory. Otherwise, the benefit of precomputing the inner-products far outweighs anything you'll get from Numba/CUDA.

In [1]:
import os 
import time
import copy
from tqdm import tqdm

import numpy as np 
import math
import matplotlib.pyplot as plt
import scipy
import pandas as pd

from joblib import Parallel, delayed
from numba import jit, njit, prange, cuda, float32, float64, complex64, complex128, types
import numba as nb

# Seed NumPy's global RNG so the sampled coefficients are reproducible
np.random.seed(0)

# ---- Run configuration ----
# CUDA launch parameters
TPB = 16  # threads per block
BPG_MULTIPLIER = 1  # blocks per grid multiplier

# Numeric types used throughout
REAL_FLOAT_TYPE = np.float64
REAL_COMPLEX_TYPE = np.complex128
REAL_INT_TYPE = np.int32

# False: generate the benchmark Hamiltonian (and its coefficients) in this session
# True:  load both from file instead
LOAD_PYTHON = False

# ---- Physical constants ----
K = 7  # number of fermionic modes
J = 4  # ~"energy scale"
Q = 3  # order of coupling
N = K * 2  # number of fermions (two per mode)
N_DIM = 1 << K  # Hilbert space dimensions, 2**K

# ---- Sampling / parallelism ----
N_SAMPLES = 10  # number of samples to generate
N_JOBS = 20  # number of jobs to run in parallel

# 1. Define Python Hamiltonian as benchmark

In [5]:
#### Define fermionic modes
# 2x2 building blocks that c(n) / cd(n) combine with Kronecker products.
# NOTE: `id` shadows the Python builtin of the same name; the name is kept
# because c() and cd() look it up at module level.
cr = np.array([[0, 1],
               [0, 0]])
an = np.array([[0, 0],
               [1, 0]])
id = np.eye(2)
id2 = np.diag([-1, 1])

def c(n):
    """Return the Kronecker product of K 2x2 factors with `cr` at site n.

    Sites before n (1-based) contribute `id`, site n contributes `cr`, and
    the K-n sites after it contribute `id2`. For 1 <= n <= K the result is a
    (2**K, 2**K) array.
    """
    factors = [id] * (n - 1) + [cr] + [id2] * (K - n)
    out = factors[0]
    # Only the first K factors take part in the product
    for factor in factors[1:K]:
        out = np.kron(out, factor)
    return out

def cd(n):
    """Return the Kronecker product of K 2x2 factors with `an` at site n.

    Sites before n (1-based) contribute `id`, site n contributes `an`, and
    the K-n sites after it contribute `id2`. For 1 <= n <= K the result is a
    (2**K, 2**K) array.
    """
    factors = [id] * (n - 1) + [an] + [id2] * (K - n)
    out = factors[0]
    # Only the first K factors take part in the product
    for factor in factors[1:K]:
        out = np.kron(out, factor)
    return out

#### Define fermions
# Build the N = 2K matrices psi_h[0..N-1], two per mode i = 1..K:
#   psi_h[2(i-1)]   = (c(i) + cd(i)) / sqrt(2)
#   psi_h[2(i-1)+1] = (c(i) - cd(i)) * (-1j / sqrt(2))
psi_h = np.zeros((N, N_DIM, N_DIM), dtype=np.complex128)
for i in range(1, K+1):
    c_i, cd_i = c(i), cd(i)
    first = 2*(i-1)
    psi_h[first] = (c_i + cd_i)/np.sqrt(2)
    psi_h[first + 1] = (c_i - cd_i)*(-1j/np.sqrt(2))

# Device-side copy of the same stack for the CUDA kernels
psi_d = cuda.to_device(psi_h)

def H3_python(js):
    """Build the Hamiltonian on the host from the precomputed `psi_h`.

    Sums ``(1j**(Q/2)) * js[i, j, k] * psi_h[i] @ psi_h[j] @ psi_h[k]`` over
    all ordered triples ``i < j < k < N``.

    Parameters
    ----------
    js : array indexable as ``js[i, j, k]``
        The random coefficients.

    Returns
    -------
    numpy.ndarray
        Complex128 array of shape ``(N_DIM, N_DIM)``.
    """
    prefactor = 1j**(Q/2)  # the same for every term
    total = np.zeros((N_DIM, N_DIM), dtype=np.complex128)
    for a in range(N-2):
        for b in range(a+1, N-1):
            # Reuse the two-factor product across the innermost loop
            pair = psi_h[a]@psi_h[b]
            for c_idx in range(b+1, N):
                triple = pair@psi_h[c_idx]
                total += prefactor*js[a, b, c_idx]*triple
    return total

#### Generate random coefficients
# Standard deviation of the coefficients: sqrt(J^2 * (Q-1)! / N^(Q-1)).
# Uses the stdlib math.factorial: `np.math` was only an incidental alias of
# the stdlib module, deprecated in NumPy 1.25 and removed in NumPy 2.0.
sigma_j = np.sqrt((J**2)*math.factorial(Q-1)/(N**(Q-1)))
# N_SAMPLES+1 independent rank-Q coefficient tensors, each of shape (N, ..., N)
js_all = [np.random.normal(0, sigma_j, size=(N,)*Q) for _ in range(N_SAMPLES+1)]

In [3]:
# Python benchmark Hamiltonian.
# Timings recorded by the author: K=7 -> 0.2 seconds, K=10 -> 62.3 seconds
# (an earlier note on this cell read "55.0 seconds").
#
# The Hamiltonian is computed only in the `else` branch, so that setting
# LOAD_PYTHON = True really does skip the expensive host computation.
if LOAD_PYTHON:
    js_test = np.load(os.path.join("Excel", "Benchmarks", "js4_benchmark.npy"))
    H3_python_test = np.load(os.path.join("Excel", "Benchmarks", "H3_python_benchmark.npy"))
else:
    js_test = js_all[0]

    # perf_counter is a monotonic, high-resolution clock meant for measuring durations
    tic = time.perf_counter()
    H3_python_test = H3_python(js_test)
    toc = time.perf_counter()
    duration = toc-tic
    print(f"Hamiltonian generation, python: {duration//60} minutes, {duration%60} seconds")

Hamiltonian generation, python: 0.0 minutes, 0.12907838821411133 seconds


# 2. Define CUDA Hamiltonian

Method: Use the thread index to determine which term $i^{Q/2} J_{ijk} \psi_i \psi_j \psi_k$ each thread is responsible for computing and adding to the total Hamiltonian.

In [25]:
#TODO: thread_sync, grid_sync, shared memory
@cuda.jit(device=True)
def matmul_device(A, B, C):
    """Device function: square matrix product ``C = A @ B``.

    All three arguments are indexed as ``N_DIM x N_DIM`` 2-D arrays; the
    result is written into ``C`` in place and nothing is returned. The whole
    product is computed serially by the calling thread.
    """
    for i in range(N_DIM):
        for j in range(N_DIM):
            tmp = 0.
            for k in range(N_DIM):
                tmp += A[i, k] * B[k, j]
            # Store inside the j-loop so every element of C is written,
            # not just the last column of each row.
            C[i, j] = tmp

@cuda.jit(inline=True, device=True)
def matmul_vecarr_device(A, B, C):
    """Device function: row-vector times matrix, ``C = A @ B``.

    ``A`` and ``C`` are indexed as length-``N_DIM`` 1-D arrays and ``B`` as an
    ``N_DIM x N_DIM`` 2-D array. The result is written into ``C`` in place,
    ``C[i] = sum_j A[j] * B[j, i]``, so passing row ``r`` of a matrix ``M`` as
    ``A`` yields row ``r`` of ``M @ B``.
    """
    for i in range(N_DIM):
        tmp = 0.
        for j in range(N_DIM):
            # B is indexed [j, i]; [i, j] would compute A @ B.T instead
            tmp += A[j] * B[j, i]
        C[i] = tmp


@cuda.jit()#(types.void(types.float64[:,:], types.complex128[:,:], types.complex128[:,:]), debug=True)
def H3_cuda(js, H3, psi):
    """Work-in-progress CUDA kernel for the cubic Hamiltonian.

    Each thread takes its (i, j, k) from the 3-D grid index and, for ordered
    triples ``i < j < k < N``, loads ``psi[i]``, ``psi[j]``, ``psi[k]`` and
    ``js[i, j, k]``.

    In its current state the kernel never writes to ``H3``: the manual
    matmul, the ``psi_ijk`` product and the atomic accumulation are all
    parked inside the string literal assigned to ``later`` below, so they are
    not executed.

    Parameters
    ----------
    js : array indexed as ``js[i, j, k]``
    H3 : output array (currently untouched by the executed code)
    psi : array indexed as ``psi[n]``, each entry a 2-D matrix
    """
    # NOTE(review): cuda.grid(3) takes k from the z-dimension of the launch;
    # confirm the call site uses a 3-D launch configuration covering N in
    # every dimension.
    i, j, k = cuda.grid(3)
    #TODO: Use shared memory to store psi_ij and psi_ijk
    
    # Keep only strictly ordered triples i < j < k
    if (i<N-2) and (i<j<N-1) and (j<k<N):
        
        psi_i = psi[i]
        psi_j = psi[j]
        psi_k = psi[k]
        j_ijk = js[i, j, k]

        # NOTE(review): this is a shared (per-block) N_DIM x N_DIM complex128
        # array declared inside a per-thread branch; presumably threads of a
        # block with different (i, j, k) would overwrite each other here --
        # confirm before enabling the matmul below. Also check the size
        # against the device's shared-memory limit.
        psi_ij = cuda.shared.array(shape=(N_DIM, N_DIM), dtype=np.complex128)
        #for i in range(N_DIM): # method 1

         #   matmul_vecarr_device(psi_i[i], psi_j, psi_ij[i]) #method 2
        #matmul_device(psi_i, psi_j, psi_ij)

        # this one works for some reason
        # (only row 1 of psi_i is multiplied; psi_i1j is not read afterwards)
        psi_i1 = psi_i[1,:]
        psi_i1j = cuda.shared.array(shape=(N_DIM,), dtype=np.complex128)
        matmul_vecarr_device(psi_i1, psi_j, psi_i1j)

    
        #manual matmul
        # The loop body only assigns a string: everything from the opening
        # triple quote to the closing one is disabled code, kept for later.
        for ii in range(N_DIM):
            later = """
            for jj in range(N_DIM):
                tmp = 0.
                for kk in range(N_DIM):
                    tmp += psi_i[ii, kk] * psi_j[kk, jj]
                psi_ij[ii, jj] = tmp

        psi_ijk = cuda.shared.array(shape=(N_DIM, N_DIM), dtype=np.complex128)
        #fast_matmul(psi_ij, psi_k, psi_ijk)

        #for alpha in range(H3.shape[0]):
            #for beta in range(H3.shape[1]):
                #cuda.atomic.add(H3, (alpha, beta), (1j**(Q/2))*j_ijk*psi_ijk[alpha, beta])
                #pass

        #cuda.atomic.add(H3, (i, j), (1j**(Q/2))*j_ijk*psi_ijk)"""

# Host and device copies of the coefficients and of the (zeroed) output matrix
js_test_h = js_all[0].astype(np.complex128)
js_test_d = cuda.to_device(js_test_h)

H3_cuda_test_h = np.zeros((N_DIM, N_DIM), dtype=np.complex128)
H3_cuda_test_d = cuda.to_device(H3_cuda_test_h)

# H3_cuda takes its (i, j, k) triple from cuda.grid(3), so the launch must be
# 3-D and cover the index range [0, N) in every dimension (not the N_DIM x N_DIM
# shape of the output). The block is TPB x TPB x 1 to stay within the
# per-block thread limit; the z-extent is covered by the grid.
threadsperblock = (TPB, TPB, 1)
blockspergrid_x = math.ceil(N / threadsperblock[0])
blockspergrid_y = math.ceil(N / threadsperblock[1])
blockspergrid_z = math.ceil(N / threadsperblock[2])
blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z)

# Kernel launches are asynchronous: synchronize before stopping the clock so
# the measurement covers the kernel itself and not only the launch.
# (The first call also includes JIT compilation time.)
tic = time.time()
H3_cuda[blockspergrid, threadsperblock](js_test_d, H3_cuda_test_d, psi_d)
cuda.synchronize()
toc = time.time()
duration = toc-tic
print(f"Duration: {duration} seconds")

# Bring the result back to the host; without this the host array stays all
# zeros and any comparison against the Python benchmark is meaningless.
H3_cuda_test_h = H3_cuda_test_d.copy_to_host()




Duration: 0.31600046157836914 seconds


In [22]:
# Compare the CUDA result with the Python benchmark:
# first the full complex matrices, then only their real parts
full_match = np.allclose(H3_python_test, H3_cuda_test_h)
print(full_match)
real_match = np.allclose(H3_python_test.real, H3_cuda_test_h.real)
print(real_match)

False
False


In [None]:
@cuda.jit()
def matmul(A, B, C):
    """CUDA kernel: matrix product ``C = A @ B``, one output element per thread.

    Launch on a 2-D grid that covers ``C``; each thread takes its ``(i, j)``
    from ``cuda.grid(2)`` and computes ``C[i, j]``. Threads that fall outside
    ``C`` do nothing, so the grid may overshoot the matrix size.

    Parameters
    ----------
    A, B : 2-D device arrays, inputs
    C : 2-D device array, written in place
    """
    i, j = cuda.grid(2)
    # Guard against threads past the edge when the grid overshoots C
    if i < C.shape[0] and j < C.shape[1]:
        tmp = 0.
        for k in range(A.shape[1]):
            tmp += A[i, k] * B[k, j]
        C[i, j] = tmp

# Test: build the pairwise products psi_i @ psi_j on the device with `matmul`
js_test_h = js_all[0].astype(np.complex128)
js_test_d = cuda.to_device(js_test_h)

H3_test_h = np.zeros((N_DIM, N_DIM), dtype=np.complex128)
H3_test_d = cuda.to_device(H3_test_h)

# The launch configuration depends only on N_DIM and TPB, so compute it once
# instead of on every (i, j) iteration.
threadsperblock = (TPB, TPB)
blockspergrid_x = math.ceil(N_DIM / threadsperblock[0])
blockspergrid_y = math.ceil(N_DIM / threadsperblock[1])
blockspergrid = (blockspergrid_x, blockspergrid_y)

for i in range(N-2):
    psi_i = psi_d[i]
    for j in range(i+1, N-1):
        psi_j = psi_d[j]

        psi_ij = cuda.device_array(shape=(N_DIM, N_DIM), dtype=np.complex128)
        matmul[blockspergrid, threadsperblock](psi_i, psi_j, psi_ij)

        # The k-loop is parked in a string and does not run yet.
        # NOTE(review): before enabling it, the bound should presumably be N
        # rather than N_DIM (psi_d has N entries), and the accumulation into
        # H3_test_d needs a kernel or a host-side sum -- confirm.
        later = """
        for k in range(j+1,N_DIM):
            psi_k = psi_d[k]

            psi_ijk = cuda.device_array(shape=(N_DIM, N_DIM), dtype=np.complex128)
            matmul[blockspergrid, threadsperblock](psi_ij, psi_k, psi_ijk)

            j_ijk = js_test_d[i, j, k]
            H3_test_d += j_ijk*psi_ijk"""
        
