In [1]:
# Import CuPy and print installation hints if that fails.
try:
    print("Importing CUPY...")
    import cupy as cp

    print("done")
except Exception as xEx:
    # NOTE(review): 'Exception' instead of 'ImportError' is kept from the original;
    # presumably a broken CUDA setup can fail with other error types - confirm.
    print(
        "CUDA python module 'cupy' could not be imported.\n"
        "Make sure you have the NVIDIA CUDA toolkit installed and\n"
        "the 'cupy' module. See 'https://cupy.dev/' for more information.\n"
        "Note that if the 'pip' install does not work, try the 'conda' install option.\n\n"
        f"Exception reported:\n{(str(xEx))}"
    )
    # Re-raise after printing the hint: the code below uses 'cp' unconditionally,
    # so continuing would only replace the real error with a NameError on 'cp'.
    raise
# endtry

import numpy as np

# Smoke test: copy a small NumPy array to a CuPy array and evaluate its
# Euclidean norm with CuPy.
aA = np.array([1, 2, 3])
caA = cp.asarray(aA)
fLen = cp.linalg.norm(caA)

print(fLen)



Importing CUPY...
done
3.7416573867739413


In [40]:
import os
os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
import cv2

from typing import Tuple
import numpy as np
from pathlib import Path

# Set the path from where all images can be found as environment variable "CATH_DEV_MAIN".
# With PowerShell this can be done with: [System.Environment]::SetEnvironmentVariable("CATH_DEV_MAIN", "[path]", "User")
# You need to restart VS Code after setting the path from a VS Code terminal.
sPathMain = os.environ.get("CATH_DEV_MAIN")
if sPathMain is None:
    # Fail with the setup instructions instead of a bare KeyError.
    raise RuntimeError(
        "Environment variable 'CATH_DEV_MAIN' is not set.\n"
        "Set it to the path from where all images can be found, e.g. with PowerShell:\n"
        '    [System.Environment]::SetEnvironmentVariable("CATH_DEV_MAIN", "[path]", "User")\n'
        "and restart VS Code afterwards."
    )
# endif
pathMain = Path(sPathMain)
# print(f"Main path: {pathMain}")

# Input folders below the main path: the '.png' frames and the '.exr' flow images.
pathImage = pathMain / "Image"
pathFlow = pathMain / "AT_Flow/test"

# Index of the frame to process; frame 'iFrame + 1' is loaded as well.
iFrame: int = 10

def _ReadImageChannels(_pathFile: Path, _tChannels: Tuple[int, ...], _sWhat: str) -> np.ndarray:
    """Read an image file with OpenCV and return the selected channels.

    Args:
        _pathFile: Path of the image file to read.
        _tChannels: Channel indices (into the array returned by OpenCV) to pick,
            in the order they should appear in the result.
        _sWhat: Short description of the file, only used in error messages.

    Returns:
        Array of shape (rows, columns, len(_tChannels)) with the dtype OpenCV
        loaded the file with.

    Raises:
        FileNotFoundError: The file does not exist.
        OSError: The file exists but OpenCV could not read it.
        ValueError: The loaded image does not have all requested channels.
    """
    # Same flag combination as used throughout this notebook: load the file
    # unchanged, i.e. keep its bit depth and all of its channels.
    imgData = cv2.imread(
        _pathFile.as_posix(),
        cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH | cv2.IMREAD_UNCHANGED,
    )

    # cv2.imread() does not raise on failure, it returns None.
    if imgData is None:
        if not _pathFile.exists():
            raise FileNotFoundError(f"{_sWhat} file not found: {_pathFile.as_posix()}")
        # endif
        raise OSError(f"{_sWhat} file could not be read by OpenCV: {_pathFile.as_posix()}")
    # endif

    iMinChanCnt = max(_tChannels) + 1
    if imgData.ndim != 3 or imgData.shape[2] < iMinChanCnt:
        raise ValueError(
            f"{_sWhat} file must have at least {iMinChanCnt} channels, "
            f"but the loaded array has shape {imgData.shape}: {_pathFile.as_posix()}"
        )
    # endif

    return imgData[:, :, list(_tChannels)]
# enddef


def LoadImages(_iFrame: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Load the image of a frame, the image of the next frame and the flow of the frame.

    The files are 'Frame_NNNN.png' in 'pathImage' (for '_iFrame' and '_iFrame + 1')
    and 'Frame_NNNN.exr' in 'pathFlow' (for '_iFrame'), with NNNN the zero padded
    frame index.

    Args:
        _iFrame: Index of the frame to load.

    Returns:
        Tuple (image of frame, image of next frame, flow of frame). The first
        three channels of every array are in reversed order compared to what
        OpenCV returns; the flow additionally keeps its fourth channel.

    Raises:
        FileNotFoundError: One of the files does not exist.
        OSError: One of the files could not be read.
        ValueError: One of the files has too few channels.
    """
    # OpenCV returns color channels in B, G, R order. These are the indices
    # in the loaded array that end up as 1st, 2nd and 3rd output channel.
    iChX: int = 2
    iChY: int = 1
    iChZ: int = 0
    # Fourth channel of the flow image, kept at its position.
    iChW: int = 3

    sFrame = f"Frame_{_iFrame:04d}"
    pathImgImage = pathImage / f"{sFrame}.png"
    pathImgFlow = pathFlow / f"{sFrame}.exr"

    sFrameNext = f"Frame_{(_iFrame+1):04d}"
    pathImgImage2 = pathImage / f"{sFrameNext}.png"

    imgImage = _ReadImageChannels(pathImgImage, (iChX, iChY, iChZ), "Image")
    imgImage2 = _ReadImageChannels(pathImgImage2, (iChX, iChY, iChZ), "Next image")
    imgFlow = _ReadImageChannels(pathImgFlow, (iChX, iChY, iChZ, iChW), "Flow")

    return imgImage, imgImage2, imgFlow
# enddef

imgImageOrig, imgImageOrig2, imgFlow = LoadImages(iFrame)

# The scale factor is derived from the dtype of the first image and applied to
# both images, so both must share the same integer dtype. Checking here avoids
# a silently wrong value range for the second image and the obscure error
# np.iinfo() gives for floating point images.
if not np.issubdtype(imgImageOrig.dtype, np.integer):
    raise TypeError(f"Expected an integer image type, but the image has dtype '{imgImageOrig.dtype}'")
# endif
if imgImageOrig2.dtype != imgImageOrig.dtype:
    raise TypeError(
        f"Both images must have the same dtype, but got '{imgImageOrig.dtype}' and '{imgImageOrig2.dtype}'"
    )
# endif

# Largest value representable by the image's integer type.
fMaxImgVal = float(np.iinfo(imgImageOrig.dtype).max)
print(f"fMaxImgVal: {fMaxImgVal}")

# Scale image value to range [0, 1]
imgImage = imgImageOrig.astype(np.float32) / fMaxImgVal
imgImage2 = imgImageOrig2.astype(np.float32) / fMaxImgVal


fMaxImgVal: 255.0


In [3]:
# Query the CUDA runtime for the number of devices and report it.
iCudaDeviceCnt = cp.cuda.runtime.getDeviceCount()
print(f"CUDA device count: {iCudaDeviceCnt}")

CUDA device count: 1


In [41]:
import sys
from pathlib import Path

# Compile the CUDA kernel 'EvalMotionBlur' for the loaded images, run it and
# write the result as EXR file to the main path.

# The kernel source is expected in the current working directory.
pathKernel = Path.cwd() / "Dev-EvalMotionBlur-v1.cu"
sKernelCode = pathKernel.read_text()

# Threads per CUDA block (along x).
iThreadCnt = 32
tiFilterRadiusXY = (200, 200)
fFlowFactor = 1.0

# Row strides are counted in array elements: columns times channels.
iFlowRows, iFlowCols, iFlowChanCnt = imgFlow.shape
iRowStrideFlow = iFlowCols * iFlowChanCnt

iImgRows, iImgCols, iImgChanCnt = imgImage.shape
iRowStrideImage = iImgCols * iImgChanCnt

tiSizeXY = (iImgCols, iImgRows)
# Full image
tiStartXY = (0, 0)
tiRangeXY = (iImgCols, iImgRows)

# Clamp the range per axis so that start + range stays inside the image.
# The remaining size is 'size - start' of the SAME axis (the y-axis must not
# use the x start offset).
tiRangeXY = tuple(min(tiRangeXY[i], tiSizeXY[i] - tiStartXY[i]) for i in range(2))

# Number of blocks in x and y (the launch grid): enough blocks of
# 'iThreadCnt' threads to cover one row of the range (rounded up), and one
# block row per image row of the range.
tiBlockDimXY = ((tiRangeXY[0] + iThreadCnt - 1) // iThreadCnt, tiRangeXY[1])

# Name expression of the template instantiation. The same string is needed to
# compile the instantiation and to look up the compiled function.
sFuncMbExp = (f"EvalMotionBlur<{tiStartXY[0]}, {tiStartXY[1]}, "
                f"{tiRangeXY[0]}, {tiRangeXY[1]}, "
                f"{tiSizeXY[0]}, {tiSizeXY[1]}, "
                f"{tiFilterRadiusXY[0]}, {tiFilterRadiusXY[1]}, "
                f"{iRowStrideImage}, {iRowStrideFlow}, "
                f"{iImgChanCnt}, {iFlowChanCnt}>")

modKernel = cp.RawModule(code=sKernelCode, options=("-std=c++11",), name_expressions=[sFuncMbExp])
# modKernel.compile(log_stream=sys.stdout)
kernFlow = modKernel.get_function(sFuncMbExp)

# Copy the inputs to CuPy arrays as float32 and create a zero initialized
# result array with the shape of the image.
caImage = cp.asarray(imgImage, dtype=cp.float32)
caImage2 = cp.asarray(imgImage2, dtype=cp.float32)
caFlow = cp.asarray(imgFlow, dtype=cp.float32)
caResult = cp.full((iImgRows, iImgCols, iImgChanCnt), 0.0, dtype=cp.float32)

# Launch arguments are (grid, block, kernel arguments).
kernFlow(tiBlockDimXY, (iThreadCnt,), (caImage, caImage2, caFlow, cp.float32(fFlowFactor), caResult))

imgResult = cp.asnumpy(caResult)
# Reverse the channel order again for OpenCV before writing.
imgResultWrite = imgResult[:, :, [2, 1, 0]]

# cv2.imwrite() reports failure via its return value instead of raising.
pathResult = pathMain / f"MotionBlur_{iFrame}.exr"
if not cv2.imwrite(pathResult.as_posix(), imgResultWrite):
    raise OSError(f"Could not write result image: {pathResult.as_posix()}")
# endif


True