Skip to content

GPU result is different from CPU #8123

@cyyfighting12

Description

@cyyfighting12

Environment: Windows 10, CUDA 11.1, model imported from MXNet. The CPU result is correct, but the GPU result is wrong.
CPU:
device = 'x86.cpu'
ctx = tvm.cpu(0)
GPU:
device = 'x86.cuda'
ctx=tvm.gpu(0)

the whole .py:

import mxnet as mx
import tvm
import tvm.relay as relay
import numpy as np
from tvm.contrib import util
import os

# --- Build configuration ---------------------------------------------
dtype = 'float32'    # element type used for the model conversion and the input tensors
use_arm64 = False    # selects the arm64 variants in the 'cpu' / 'gpu' / Android branches
use_android = False  # when True the library is exported through the Android NDK

network = 'IrisAttackCCL'

# `device` picks the compilation target and `ctx` the execution context;
# change both lines together when switching between GPU and CPU runs.
device = 'x86.cuda'  # CPU run: device = 'x86.cpu'
ctx = tvm.gpu(0)  # CPU run: ctx = tvm.cpu(0)

model_path = './'
# Prefix used both for loading the checkpoint and for naming the output files.
path = model_path + (network)

#set the input shape/layer
input_layer = 'data'
batch_size = 1
image_shape = (1, 240,320)
input_shape = (batch_size,) + image_shape

######################################################################
# Choose the TVM compilation target and the host target for `device`.
if device == 'cpu':
    if use_arm64:
        target = tvm.target.create('llvm -device=arm_cpu -target=arm64-linux-android -mattr=+neon')
    else:
        target = tvm.target.create('llvm -device=arm_cpu -target=arm-linux-androideabi -mattr=+neon -mfloat-abi=soft')
    target_host = None

elif device == 'gpu':
    #target = tvm.target.create('opencl -device=mali')
    #target_host = 'llvm -target=aarch64-linux-gnu -mattr=+neon'
    target = tvm.target.create('opencl -device=mali')

    if use_arm64:
        target_host = 'llvm -target=arm64-linux-android -mattr=+neon'
    else:
        target_host = 'llvm -target=arm-linux-androideabi -mattr=+neon -mfloat-abi=soft'

elif device == 'x86.cpu':
    target = 'llvm'
    target_host = None

elif device == 'x86.cuda':
    # A plain `target = 'cuda'` assignment used to precede this line; it was
    # dropped because the value was overwritten immediately.
    #target = tvm.target.cuda(model='1080ti',options="-libs=cudnn, cublas")
    target = tvm.target.cuda(model='3060ti')
    target_host = 'llvm'

else:
    # Any unrecognised device string falls back to an arm64 Android LLVM target.
    target = tvm.target.create('llvm -target=arm64-linux-android')
    target_host = None

######################################################################

# input the mxnet model

# Load the symbol and the arg/aux parameter sets from the checkpoint whose
# prefix is `path`; the second argument (0) selects which saved epoch to read.
mx_sym, args, auxs = mx.model.load_checkpoint(path, 0)

# NOTE(review): the next two lines stop the script in the interactive
# debugger on every run — looks like a debugging leftover; remove or guard
# them before running unattended.
import pdb
pdb.set_trace()
######################################################################
# Tell the MXNet frontend the shape of the input named 'data', then convert
# the loaded symbol and parameters into a Relay function plus parameter dict.
shape_dict = {'data': input_shape}
func, params = relay.frontend.from_mxnet(mx_sym, shape_dict, dtype, args, auxs)

######################################################################

# now compile the graph

# Compile the Relay function for `target` at optimisation level 3.
# NOTE(review): `target_host`, chosen earlier in the script, is not passed to
# relay.build here — confirm whether that is intentional.
with relay.build_config(opt_level=3):
    graph, lib, params = relay.build(func, target, params=params)

######################################################################
print("Compile...")

######################################################################
#save the relay model
temp = util.tempdir()
# Only the Android branch below writes to this temporary path.
path_lib = temp.relpath("%s.%s.dll" % (path, device))

if use_android:
    from tvm.contrib import ndk
    if use_arm64:
        lib.export_library(path_lib, ndk.create_shared)
    else:
        lib.export_library(path_lib, ndk.create_shared, options=["-shared", "-fPIC", "-mfloat-abi=softfp", "-mfpu=neon"])

else:
    # Non-Android builds write the library next to the model files.
    lib.export_library("%s.%s.dll" % (path, device))

# Persist the graph description (text) and the parameters (binary)
# alongside the compiled library.
with open("%s.%s.json" % (path, device), "w") as fo:
    fo.write(graph)
with open("%s.%s.params" % (path, device), "wb") as fo:
    fo.write(relay.save_param_dict(params))

print("------convert done!!!------")
import numpy as np
# Dummy all-ones input, used only for the timing runs below.
img = np.ones(input_shape) #NCHW(batch_size,1,240,320)
x = np.array(img)

######################################################################
from tvm.contrib import graph_runtime
import time
dtype = 'float32'

# Create a runtime module on `ctx` and feed it the dummy input and weights.
m = graph_runtime.create(graph, lib, ctx)
m.set_input('data', tvm.nd.array(x.astype(dtype)))
m.set_input(**params)

# Wall-clock timing of `count` plain runs, with no warm-up run beforehand.
# NOTE(review): the reported GPU figure for this measurement is far above the
# time_evaluator mean, so it presumably includes one-time start-up cost.
start = time.time()
count = 1
for _ in range(count):
    m.run()

end = (time.time()- start)/count
print ("the cost time is ", end)

#evaluate
print("Evaluate inference time cost...")
ftimer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
prof_res = np.array(ftimer().results) * 1000 # convert to millisecond
print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
      (np.mean(prof_res), np.std(prof_res)))
tvm_output0 = m.get_output(0)
print('- tvm_output 0 shape : ', tvm_output0.shape)

# test images

data_shape = input_shape
import cv2
def preprocess_img_single(img_path, data_shape):
    """Load one image as grayscale and lay it out as a network input.

    Args:
        img_path: Path of the image file handed to ``cv2.imread``.
        data_shape: Four-element shape; index 2 is used as the height and
            index 3 as the width of the resized image.

    Returns:
        Array of shape ``(1, 1, data_shape[2], data_shape[3])`` holding the
        resized pixel values minus 128.0.

    Raises:
        FileNotFoundError: If ``cv2.imread`` could not read ``img_path``.
    """
    img = cv2.imread(img_path, 0)  # flag 0 requests a single-channel image
    if img is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with a clear message instead of inside cv2.resize.
        raise FileNotFoundError("cannot read image: %s" % img_path)

    height, width = data_shape[2], data_shape[3]
    # cv2.resize takes (width, height); the result has shape (height, width).
    img = cv2.resize(img, (width, height)) - 128.0
    # Single channel: add the batch and channel axes in one step. This gives
    # the same layout as the reshape -> transpose -> expand_dims sequence.
    return np.reshape(img, (1, 1, height, width))

# Set inputs

# Run the compiled model on one real image and print the retained rows.
img_path = model_path + "./00001.jpg"
img_data = preprocess_img_single(img_path, data_shape)

# Fresh runtime module fed with the preprocessed image and the weights.
m = graph_runtime.create(graph, lib, ctx)
m.set_input('data', tvm.nd.array(img_data.astype(dtype)))
m.set_input(**params)
m.run()

tvm_output = m.get_output(0).asnumpy()
result = tvm_output[0, :]
# Keep only the rows whose first column is not -1.
valid_rows = result[:, 0] != -1
resultfinal = result[valid_rows].tolist()
print(resultfinal)

CPU result:

------convert done!!!------
the cost time is 0.008001565933227539
Evaluate inference time cost...
Mean inference time (std dev): 8.99 ms (0.39 ms)

  • tvm_output 0 shape : (1, 5228, 6)
    resultfinal:[[3.0, 0.9999833106994629, 0.266605406999588, 0.13155204057693481, 0.7422218322753906, 0.7783907651901245], [0.0, 0.9999711513519287, 0.424368679523468, 0.3192351758480072, 0.590116560459137, 0.5846942663192749]]

GPU result:

------convert done!!!------
the cost time is 0.6151375770568848
Evaluate inference time cost...
Mean inference time (std dev): 6.12 ms (0.42 ms)

  • tvm_output 0 shape : (1, 5228, 6)
    resultfinal:[[3.495429754257202, 8.788818359375, 0.696427583694458, 3.863145589828491, 2.051990509033203, 9.19469928741455], [1.7574224472045898, 4.362786769866943, 1.1608469486236572, -0.23158644139766693, 1.5107040405273438, 2.060492515563965], [0.342197448015213, 2.6218209266662598, -2.7281951904296875, -1.9947190284729004, -1.5453457832336426, -0.49175599217414856], [-1.8521333932876587, 1.7014127969741821, -0.8565990924835205, -2.5641982555389404, -0.38735270500183105, 3.0310404300689697], [0.07879126071929932, 1.0, 0.0, 0.8949449062347412, 0.04564562812447548, 1.0], [3.0, 0.9999833106994629, 0.2666054368019104, 0.1315521001815796, 0.7422217726707458, 0.7783908247947693], [0.0, 0.9999711513519287, 0.424368679523468, 0.3192351460456848, 0.590116560459137, 0.5846942067146301]]

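The last two rows of the GPU result are equal to the CPU result (up to floating-point rounding); the five rows before them do not appear in the CPU output.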

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions