Hypothesis: It's really only worth using this for very large Hamiltonians, large enough that you can't precompute the fermion inner-products and hold them in memory. Otherwise, the benefit of precomputing the inner-products far outweighs anything you'll get from Numba/Cuda. 

In [1]:
import os 
import time
import copy
from tqdm import tqdm

import numpy as np 
import math
import matplotlib.pyplot as plt
import scipy
import pandas as pd

from joblib import Parallel, delayed
from numba import jit, njit, prange, cuda, float32, float64, complex64, complex128, types
import numba as nb

np.random.seed(0)

# Global variables
TPB = 16 # threads per block
BPG_MULTIPLIER = 1 # blocks per grid multiplier
REAL_FLOAT_TYPE = np.float64
REAL_COMPLEX_TYPE = np.complex128
REAL_INT_TYPE = np.int32
LOAD_PYTHON = False#True # Whether to generate Hamiltonians or load them from file (along with the associated coefficients, of course)

# Physical constants
K = 7 # number of fermionic modes
J = 4 # ~"energy scale"
Q = 4 # order of coupling
N = 2*K # number of fermions
N_DIM = 2**K # Hilbert space dimensions

N_SAMPLES = 10 # number of samples to generate
N_JOBS = 20 # number of jobs to run in parallel

# 1. Define Python Hamiltonian as benchmark

In [2]:
#### Define fermionic modes
cr = np.array([[0,1],[0,0]])
an = np.array([[0,0],[1,0]])
id = np.identity(2)
id2 = np.array([[-1,0],[0,1]])

def c(n):
    factors = [id for i in range(n-1)]+[cr]+[id2 for i in range(K-n)]
    out = factors[0]
    for i in range(1, K):
        out = np.kron(out,factors[i])
    return out

def cd(n):
    factors = [id for i in range(n-1)]+[an]+[id2 for i in range(K-n)]
    out = factors[0]
    for i in range(1, K):
        out = np.kron(out,factors[i])
    return out

#### Define fermions
# Compute first N psi's
psi_h = np.zeros((N, N_DIM, N_DIM), dtype=np.complex128)
for i in range(1,K+1):
    psi_h[2*(i-1)] = (c(i)+cd(i))/np.sqrt(2)
    psi_h[2*(i-1)+1] = (c(i)-cd(i))*(-1j/np.sqrt(2))

## Copy to GPU
psi_d = cuda.to_device(psi_h)

def H4_python(js): #js being the random coefficients
    # Compute Hamiltonian
    H = np.zeros((N_DIM, N_DIM), dtype=np.complex128)
    for i in range(N-3):
        psi_i = psi_h[i]
        for j in range(i+1, N-2):
            psi_ij=psi_i@psi_h[j]
            for k in range(j+1, N-1):
                psi_ijk = psi_ij@psi_h[k]
                for l in range(k+1, N):
                    psi_ijkl = psi_ijk@psi_h[l]
                    H += js[i, j, k, l]*psi_ijkl

    return H

#### Generate random coefficients
sigma_j = np.sqrt((J**2)*np.math.factorial(Q-1)/(N**(Q-1)))
js_all = [np.random.normal(0, sigma_j, size=tuple([N for i in range(Q)])) for j in range(N_SAMPLES+1)]

Don't run this one if you don't want to wait

In [4]:
# For K=7: 0.2 seconds
# For K=10: 62.3 SECONDS
if LOAD_PYTHON:
    js_test = np.load(os.path.join("Excel", "Benchmarks", "js4_benchmark.npy"))
    H4_python_test = np.load(os.path.join("Excel", "Benchmarks", "H4_python_benchmark.npy"))
else:                   
    js_test = js_all[0]

    tic = time.time()
    H4_python_test = H4_python(js_test)
    toc = time.time()
    duration = toc-tic
    print(f"Hamiltonian generation, python: {duration//60} minutes, {duration%60} seconds")

    np.save(os.path.join("Excel", "Benchmarks", "js3_benchmark.npy"), js_test)
    np.save(os.path.join("Excel", "Benchmarks", "H3_python_benchmark.npy"), H4_python_test)

Hamiltonian generation, python: 0.0 minutes, 0.32666444778442383 seconds


In [5]:
#np.save(os.path.join("Excel", "Benchmarks", "js4_benchmark.npy"), js_test)
#np.save(os.path.join("Excel", "Benchmarks", "H4_python_benchmark.npy"), H4_python_test)

# 2. Define hybrid Python-CUDA Hamiltonian

Method: Pretty much the same as the Python Hamiltonian, but use CUDA kernels to parallelize the matrix-multiplications and elementwise-addition.

$H = \sum_{k>j>i} J_{ijk} \psi_i \psi_j \psi_k$

$H[\alpha, \beta] = \sum_{k>j>i} J_{ijk} \left( \psi_i \psi_j \psi_k[\alpha, \beta] \right)$
$ = \sum_{k>j>i} J_{ijk} \left( \psi_{ijk}[\alpha, \beta] \right)$

Well, $\psi_l \psi_k [\alpha, \beta] = $

## Define CUDA fast matrix-multiply

In [7]:
# Controls threads per block and shared memory usage.
# The computation will be done on blocks of TPBxTPB elements.
# TPB should not be larger than 32 in this example

@cuda.jit
def fast_matmul(A, B, C):
    """
    Perform matrix multiplication of C = A * B using CUDA shared memory.

    Reference: https://stackoverflow.com/a/64198479/13697228 by @RobertCrovella
    """
    # Define an array in the shared memory
    # The size and type of the arrays must be known at compile time
    sA = cuda.shared.array(shape=(TPB, TPB), dtype=np.complex128)
    sB = cuda.shared.array(shape=(TPB, TPB), dtype=np.complex128)

    x, y = cuda.grid(2)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bpg = cuda.gridDim.x    # blocks per grid

    # Each thread computes one element in the result matrix.
    # The dot product is chunked into dot products of TPB-long vectors.
    tmp = float32(0.)
    for i in range(bpg):
        # Preload data into shared memory
        sA[ty, tx] = 0
        sB[ty, tx] = 0
        if y < A.shape[0] and (tx + i * TPB) < A.shape[1]:
            sA[ty, tx] = A[y, tx + i * TPB]
        if x < B.shape[1] and (ty + i * TPB) < B.shape[0]:
            sB[ty, tx] = B[ty + i * TPB, x]

        # Wait until all threads finish preloading
        cuda.syncthreads()

        # Computes partial product on the shared memory
        for j in range(TPB):
            tmp += sA[ty, j] * sB[j, tx]

        # Wait until all threads finish computing
        cuda.syncthreads()
    if y < C.shape[0] and x < C.shape[1]:
        C[y, x] = tmp

x_h = np.arange(16).reshape([4, 4]).astype(np.complex128)+2*1j*np.arange(16).reshape([4, 4]).astype(np.complex128)
y_h = np.ones([4, 4]).astype(np.complex128)+4*1j*np.ones([4, 4]).astype(np.complex128)
z_h = np.zeros([4, 4]).astype(np.complex128)

x_d = cuda.to_device(x_h)
y_d = cuda.to_device(y_h)
z_d = cuda.to_device(z_h)

threadsperblock = (TPB, TPB)
blockspergrid_x = math.ceil(z_h.shape[0] / threadsperblock[0])
blockspergrid_y = math.ceil(z_h.shape[1] / threadsperblock[1])
blockspergrid = (blockspergrid_x, blockspergrid_y)

fast_matmul[blockspergrid, threadsperblock](x_d, y_d, z_d)
z_h = z_d.copy_to_host()
print(z_h)
print(x_h @ y_h)



[[ -42. +36.j  -42. +36.j  -42. +36.j  -42. +36.j]
 [-154.+132.j -154.+132.j -154.+132.j -154.+132.j]
 [-266.+228.j -266.+228.j -266.+228.j -266.+228.j]
 [-378.+324.j -378.+324.j -378.+324.j -378.+324.j]]
[[ -42. +36.j  -42. +36.j  -42. +36.j  -42. +36.j]
 [-154.+132.j -154.+132.j -154.+132.j -154.+132.j]
 [-266.+228.j -266.+228.j -266.+228.j -266.+228.j]
 [-378.+324.j -378.+324.j -378.+324.j -378.+324.j]]


In [8]:
@cuda.jit(types.void(float64[:,:], float64[:,:]))
def parallel_add(src, dest):
    i, j = cuda.grid(2)
    if (i<dest.shape[0]) and (j<dest.shape[1]):
        cuda.atomic.add(dest, (i,j), src[i,j]) # <-- THIS FUNCTION DISCARDS THE IMAGINARY PART OF COMPLEX NUMBERS. ONLY USE FOR REAL NUMBERS


In [23]:
def H4_hybrid(H4_d, js_d):
    threadsperblock = (TPB, TPB)
    blockspergrid_x = math.ceil(N_DIM / threadsperblock[0])
    blockspergrid_y = math.ceil(N_DIM / threadsperblock[1])
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    tic = time.time()
    for i in range(N-3):
        psi_i = psi_d[i]

        for j in range(i+1, N-2):
            psi_j = psi_d[j]
            psi_ij = cuda.device_array(shape=(N_DIM, N_DIM), dtype=np.complex128)
            fast_matmul[blockspergrid, threadsperblock](psi_i, psi_j, psi_ij)

            for k in range(j+1, N-1):
                psi_k = psi_d[k]
                psi_ijk = cuda.device_array(shape=(N_DIM, N_DIM), dtype=np.complex128)
                fast_matmul[blockspergrid, threadsperblock](psi_ij, psi_k, psi_ijk)

                for l in range(k+1, N):
                    psi_l = psi_d[l]
                    psi_ijkl = cuda.device_array(shape=(N_DIM, N_DIM), dtype=np.complex128)
                    fast_matmul[blockspergrid, threadsperblock](psi_ijk, psi_l, psi_ijkl)

                    j_ijkl = js_d[i, j, k,l]
                    parallel_add[blockspergrid, threadsperblock]((1j**(Q/2))*j_ijkl*psi_ijkl, H4_d)
                    #H4_d += (1j**(Q/2))*j_ijkl*psi_ijkl

                    #if (i<5) and (j<5) and (k<5) and (l<5):
                    #   print(f"\nj_ijk: {j_ijk}")
                    #  print(f"psi_ijk: {np.sum(np.sum(psi_ijk.copy_to_host(), axis=0), axis=0)}")
        
        if i==0:
            duration = time.time() - tic
            exp_dur = duration*(N-2)
            print(f"First i: {duration//60} minutes, {duration%60} seconds")
            print(f"Estimated time remaining: {exp_dur//60} minutes, {exp_dur%60} seconds")
                
js_test_h = js_all[0].astype(np.complex128)
js_test_d = cuda.to_device(js_test_h)

H4_hybrid_test_h = np.zeros((N_DIM, N_DIM), dtype=np.complex128)
H4_hybrid_test_d = cuda.to_device(H4_hybrid_test_h) # Crops host-array copy-overhead warning
#H4_hybrid_test_d = cuda.device_array((N_DIM, N_DIM), dtype=np.complex128) # <--- DOESN'T UPDATE ARRAY. NEVER USE.
tic = time.time()
H4_hybrid(H4_hybrid_test_d, js_test_d)
H4_hybrid_test_h = H4_hybrid_test_d.copy_to_host()
toc = time.time()
duration = toc - tic
print(f"Duration: {duration//60} minutes, {duration%60} seconds seconds")



First i: 0.0 minutes, 1.2129058837890625 seconds
Estimated time remaining: 0.0 minutes, 14.55487060546875 seconds
Duration: 0.0 minutes, 3.1065595149993896 seconds seconds


In [28]:
print(np.allclose(H4_python_test, H4_hybrid_test_h))
print(np.allclose(H4_python_test.imag, H4_hybrid_test_h.imag))

False
False


For some reason Numba is discarding the imaginary part of the Hamiltonian...

In [25]:
print(H4_python_test)

[[ 0.29599142+0.j          0.        +0.j          0.        +0.j
  ...  0.        +0.j          0.        +0.j
   0.        +0.j        ]
 [ 0.        +0.j          0.11585132+0.j          0.17995689+0.27755935j
  ...  0.        +0.j          0.        +0.j
   0.        +0.j        ]
 [ 0.        +0.j          0.17995689-0.27755935j  0.23738243+0.j
  ...  0.        +0.j          0.        +0.j
   0.        +0.j        ]
 ...
 [ 0.        +0.j          0.        +0.j          0.        +0.j
  ...  0.23738243+0.j         -0.17995689-0.27755935j
   0.        +0.j        ]
 [ 0.        +0.j          0.        +0.j          0.        +0.j
  ... -0.17995689+0.27755935j  0.11585132+0.j
   0.        +0.j        ]
 [ 0.        +0.j          0.        +0.j          0.        +0.j
  ...  0.        +0.j          0.        +0.j
   0.29599142+0.j        ]]


In [26]:
print(H4_hybrid_test_h)

[[-0.29599142+0.j  0.        +0.j  0.        +0.j ...  0.        +0.j
   0.        +0.j  0.        +0.j]
 [ 0.        +0.j -0.11585132+0.j -0.17995689+0.j ...  0.        +0.j
   0.        +0.j  0.        +0.j]
 [ 0.        +0.j -0.17995689+0.j -0.23738243+0.j ...  0.        +0.j
   0.        +0.j  0.        +0.j]
 ...
 [ 0.        +0.j  0.        +0.j  0.        +0.j ... -0.23738243+0.j
   0.17995689+0.j  0.        +0.j]
 [ 0.        +0.j  0.        +0.j  0.        +0.j ...  0.17995689+0.j
  -0.11585132+0.j  0.        +0.j]
 [ 0.        +0.j  0.        +0.j  0.        +0.j ...  0.        +0.j
   0.        +0.j -0.29599142+0.j]]


In [27]:
diff = np.abs(H4_python_test - H4_hybrid_test_h)
print(diff)

[[0.59198284 0.         0.         ... 0.         0.         0.        ]
 [0.         0.23170264 0.45450755 ... 0.         0.         0.        ]
 [0.         0.45450755 0.47476486 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.47476486 0.45450755 0.        ]
 [0.         0.         0.         ... 0.45450755 0.23170264 0.        ]
 [0.         0.         0.         ... 0.         0.         0.59198284]]
