GPU result is different from CPU

Environment: Windows 10, CUDA 11.1, model imported from MXNet. The CPU result is correct; the GPU result is wrong.
CPU:
device = 'x86.cpu'
ctx = tvm.cpu(0)
GPU:
device = 'x86.cuda'
ctx=tvm.gpu(0)

# **the whole .py:**

import mxnet as mx
import tvm
import tvm.relay as relay
import numpy as np
from tvm.contrib import util
import os

# Element type used when converting the model and feeding inputs.
dtype = 'float32'
# Android cross-compilation switches; both are off for this desktop run.
use_arm64 = False
use_android = False

# Name of the network; combined with model_path to locate the checkpoint.
network = 'IrisAttackCCL'

# Python comments start with '#'. The original used '//' here, which made
# both lines a SyntaxError. For a CPU run, swap in the commented values.
device = 'x86.cuda'  # device = 'x86.cpu'
ctx = tvm.gpu(0)     # ctx = tvm.cpu(0)

model_path = './'
path = model_path + network

#set the input shape/layer
input_layer = 'data'
batch_size = 1
# Per-sample shape without the batch dimension: (channels, height, width).
image_shape = (1, 240, 320)
# Full input shape with the batch dimension prepended.
input_shape = (batch_size,) + image_shape

######################################################################
# Pick the compilation target and host target for the configured device.
if device == 'cpu':
    # ARM CPU build for Android; the 32-bit variant uses the soft-float ABI.
    arm_cpu_spec = (
        'llvm -device=arm_cpu -target=arm64-linux-android -mattr=+neon'
        if use_arm64 else
        'llvm -device=arm_cpu -target=arm-linux-androideabi -mattr=+neon -mfloat-abi=soft'
    )
    target = tvm.target.create(arm_cpu_spec)
    target_host = None

elif device == 'gpu':
    # OpenCL build for a Mali GPU, with the host code compiled for Android.
    target = tvm.target.create('opencl -device=mali')
    target_host = (
        'llvm -target=arm64-linux-android -mattr=+neon'
        if use_arm64 else
        'llvm -target=arm-linux-androideabi -mattr=+neon -mfloat-abi=soft'
    )

elif device == 'x86.cpu':
    target, target_host = 'llvm', None

elif device == 'x86.cuda':
    # Alternative that enables vendor libraries:
    # tvm.target.cuda(model='1080ti', options="-libs=cudnn, cublas")
    target = tvm.target.cuda(model='3060ti')
    target_host = 'llvm'

else:
    # Any other device string falls back to a generic arm64 Android target.
    target = tvm.target.create('llvm -target=arm64-linux-android')
    target_host = None


######################################################################
# input the mxnet model
# Load the MXNet checkpoint stored under `path`: the symbol plus its
# argument and auxiliary parameters.
mx_sym, args, auxs = mx.model.load_checkpoint(path, 0)

# NOTE: the leftover pdb.set_trace() breakpoint was removed here; it stopped
# every run at an interactive prompt before the conversion step.
######################################################################
# Convert the MXNet symbol to Relay, declaring the shape of the 'data' input.
shape_dict = {'data': input_shape}
func, params = relay.frontend.from_mxnet(mx_sym, shape_dict, dtype, args, auxs)


######################################################################
# now compile the graph
# Announce the build before it starts rather than after it has finished.
print("Compile...")
# Build at optimization level 3. target_host is forwarded explicitly: it is
# set for every device choice but was never handed to relay.build before.
with relay.build_config(opt_level=3):
    graph, lib, params = relay.build(
        func, target, target_host=target_host, params=params)

######################################################################
#save the relay model
# Write out the compiled artifacts: shared library, graph JSON and parameters.
temp = util.tempdir()
path_lib = temp.relpath("%s.%s.dll" % (path, device))

if not use_android:
    # Desktop build: export the library next to the model files.
    lib.export_library("%s.%s.dll" % (path, device))
else:
    # Android build: link through the NDK into the temporary directory.
    from tvm.contrib import ndk
    ndk_kwargs = {}
    if not use_arm64:
        # The 32-bit ARM build passes explicit compiler flags.
        ndk_kwargs["options"] = ["-shared", "-fPIC", "-mfloat-abi=softfp", "-mfpu=neon"]
    lib.export_library(path_lib, ndk.create_shared, **ndk_kwargs)

# The graph is written as text, the parameter dict as bytes.
with open("%s.%s.json" % (path, device), "w") as graph_file:
    graph_file.write(graph)
with open("%s.%s.params" % (path, device), "wb") as params_file:
    params_file.write(relay.save_param_dict(params))


print("------convert done!!!------")
import numpy as np

# All-ones dummy input in NCHW layout (batch_size, 1, 240, 320), used for timing.
dummy_input = np.array(np.ones(input_shape))

######################################################################
from tvm.contrib import graph_runtime
import time
dtype = 'float32'

# Create the graph runtime on the chosen context and load input and weights.
m = graph_runtime.create(graph, lib, ctx)
m.set_input('data', tvm.nd.array(dummy_input.astype(dtype)))
m.set_input(**params)

# Wall-clock time of `count` forward passes, averaged per run.
start = time.time()
count = 1
for _ in range(count):
    m.run()

elapsed = (time.time() - start) / count
print ("the cost time is ", elapsed)

# Time the "run" function with the built-in evaluator: 10 repeats of 1 run.
print("Evaluate inference time cost...")
ftimer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
timings_ms = np.array(ftimer().results) * 1000  # seconds -> milliseconds
print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
      (np.mean(timings_ms), np.std(timings_ms)))
tvm_output0 = m.get_output(0)
print('- tvm_output 0 shape : ', tvm_output0.shape)


# ######################################################################
# test images
data_shape = input_shape
import cv2


def preprocess_img_single(img_path, data_shape):
    """Load an image as grayscale and shape it into a (1, 1, H, W) array.

    Args:
        img_path: Path of the image file to read.
        data_shape: 4-element shape; index 2 is used as the target height
            and index 3 as the target width.

    Returns:
        Array of shape (1, 1, data_shape[2], data_shape[3]) holding the
        resized pixel values with 128.0 subtracted.

    Raises:
        OSError: If OpenCV could not read ``img_path``.
    """
    img = cv2.imread(img_path, 0)  # flag 0: read as single-channel grayscale
    if img is None:
        # cv2.imread reports an unreadable file by returning None; fail here
        # with a clear message instead of an opaque error inside cv2.resize.
        raise OSError(
            "could not read image (missing or unsupported file): %s" % img_path)
    # cv2.resize takes the size as (width, height); the result is (H, W).
    img = cv2.resize(img, (data_shape[3], data_shape[2])) - 128.0
    # Add the batch and single-channel axes in front: (H, W) -> (1, 1, H, W).
    return img[np.newaxis, np.newaxis, :, :]

# Set inputs
img_path = model_path + "./00001.jpg"
img_data = preprocess_img_single(img_path, data_shape)

# Fresh runtime instance for the real image; the weights are loaded again.
m = graph_runtime.create(graph, lib, ctx)
m.set_input('data', tvm.nd.array(img_data.astype(dtype)))
m.set_input(**params)
m.run()

# Take the first output as a NumPy array and select its first batch entry.
tvm_output = m.get_output(0).asnumpy()
result = tvm_output[0]
# Keep only rows whose first column is not -1
# (presumably -1 marks an unused detection slot; confirm against the model).
keep_mask = result[:, 0] != -1
resultfinal = result[keep_mask].tolist()
print(resultfinal)

# **CPU result:**
------convert done!!!------
the cost time is  0.008001565933227539
Evaluate inference time cost...
Mean inference time (std dev): 8.99 ms (0.39 ms)
- tvm_output 0 shape :  (1, 5228, 6)
resultfinal:**_[[3.0, 0.9999833106994629, 0.266605406999588, 0.13155204057693481, 0.7422218322753906, 0.7783907651901245], [0.0, 0.9999711513519287, 0.424368679523468, 0.3192351758480072, 0.590116560459137, 0.5846942663192749]]_**

# **GPU result:**
------convert done!!!------
the cost time is  0.6151375770568848
Evaluate inference time cost...
Mean inference time (std dev): 6.12 ms (0.42 ms)
- tvm_output 0 shape :  (1, 5228, 6)
resultfinal:[[3.495429754257202, 8.788818359375, 0.696427583694458, 3.863145589828491, 2.051990509033203, 9.19469928741455], [1.7574224472045898, 4.362786769866943, 1.1608469486236572, -0.23158644139766693, 1.5107040405273438, 2.060492515563965], [0.342197448015213, 2.6218209266662598, -2.7281951904296875, -1.9947190284729004, -1.5453457832336426, -0.49175599217414856], [-1.8521333932876587, 1.7014127969741821, -0.8565990924835205, -2.5641982555389404, -0.38735270500183105, 3.0310404300689697], [0.07879126071929932, 1.0, 0.0, 0.8949449062347412, 0.04564562812447548, 1.0], _**[3.0, 0.9999833106994629, 0.2666054368019104, 0.1315521001815796, 0.7422217726707458, 0.7783908247947693], [0.0, 0.9999711513519287, 0.424368679523468, 0.3192351460456848, 0.590116560459137, 0.5846942067146301]]**_

The last two rows of the GPU result match the CPU result; the five rows before them are extra.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

GPU result is different from CPU #8123

the whole .py:

input the mxnet model

now compile the graph

test images

Set inputs

CPU result:

GPU result:

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

GPU result is different from CPU #8123

Description

the whole .py:

input the mxnet model

now compile the graph

test images

Set inputs

CPU result:

GPU result:

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions