In [1]:
import time
import copy
import onnx
import logging
import platform
import mxnet as mx
import numpy as np
import pandas as pd
import onnxruntime as ort
from sklearn.metrics import accuracy_score

In [2]:
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Mxnet Chcek
if platform.system().lower() != 'windows':
    print(mx.runtime.feature_list())
print(mx.context.num_gpus())
a = mx.nd.ones((2, 3), mx.cpu())
b = a * 2 + 1
print(b)

[✔ CUDA, ✔ CUDNN, ✔ NCCL, ✔ CUDA_RTC, ✖ TENSORRT, ✔ CPU_SSE, ✔ CPU_SSE2, ✔ CPU_SSE3, ✖ CPU_SSE4_1, ✖ CPU_SSE4_2, ✖ CPU_SSE4A, ✖ CPU_AVX, ✖ CPU_AVX2, ✔ OPENMP, ✖ SSE, ✖ F16C, ✖ JEMALLOC, ✔ BLAS_OPEN, ✖ BLAS_ATLAS, ✖ BLAS_MKL, ✖ BLAS_APPLE, ✔ LAPACK, ✔ MKLDNN, ✔ OPENCV, ✖ CAFFE, ✖ PROFILER, ✔ DIST_KVSTORE, ✖ CXX14, ✖ INT64_TENSOR_SIZE, ✔ SIGNAL_HANDLER, ✖ DEBUG, ✖ TVM_OP]
1

[[3. 3. 3.]
 [3. 3. 3.]]
<NDArray 2x3 @cpu(0)>


# Param Config

In [3]:
N_EPOCH = 1
N_BATCH = 32
N_BATCH_NUM = 900
S_DATA_PATH = r"mnist_train.csv"
S_MODEL_PATH = r"mxnet_cnn"
S_SYM_PATH = './mxnet_cnn-symbol.json'
S_PARAMS_PATH = './mxnet_cnn-0001.params'
S_ONNX_MODEL_PATH = './mxnet_cnn.onnx'
S_DEVICE, N_DEVICE_ID, S_DEVICE_FULL = "cuda", 0, "cuda:0"
# S_DEVICE, N_DEVICE_ID, S_DEVICE_FULL = "cpu", 0, "cpu"
CTX = mx.cpu() if S_DEVICE == "cpu" else mx.gpu(N_DEVICE_ID)
B_IS_UNIX = True

# Read Data

In [4]:
df = pd.read_csv(S_DATA_PATH, header=None)
print(df.shape)
np_mat = np.array(df)
print(np_mat.shape)
X = np_mat[:, 1:]
Y = np_mat[:, 0]
X = X.astype(np.float32) / 255
X_train = X[:N_BATCH * N_BATCH_NUM]
X_test = X[N_BATCH * N_BATCH_NUM:]
Y_train = Y[:N_BATCH * N_BATCH_NUM]
Y_test = Y[N_BATCH * N_BATCH_NUM:]
X_train = X_train.reshape(X_train.shape[0], 1, 28, 28)
X_test = X_test.reshape(X_test.shape[0], 1, 28, 28) 
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)
train_iter = mx.io.NDArrayIter(X_train, Y_train, batch_size=N_BATCH)
test_iter = mx.io.NDArrayIter(X_test, Y_test, batch_size=N_BATCH)
test_iter_2 = copy.copy(test_iter)

(37800, 785)
(37800, 785)
(28800, 1, 28, 28)
(28800,)
(9000, 1, 28, 28)
(9000,)


# Build Model

In [5]:
net = mx.gluon.nn.HybridSequential()
with net.name_scope():
    net.add(mx.gluon.nn.Conv2D(channels=32, kernel_size=3, activation='relu'))  # bx28x28 ==>
    net.add(mx.gluon.nn.MaxPool2D(pool_size=2, strides=2))
    net.add(mx.gluon.nn.Flatten())
    net.add(mx.gluon.nn.Dense(128, activation="relu"))
    net.add(mx.gluon.nn.Dense(10))
net.hybridize()
print(net)
net.collect_params().initialize(mx.init.Xavier(), ctx=CTX)
softmax_cross_entropy = mx.gluon.loss.SoftmaxCrossEntropyLoss()
trainer = mx.gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': .001})


HybridSequential(
  (0): Conv2D(None -> 32, kernel_size=(3, 3), stride=(1, 1), Activation(relu))
  (1): MaxPool2D(size=(2, 2), stride=(2, 2), padding=(0, 0), ceil_mode=False, global_pool=False, pool_type=max, layout=NCHW)
  (2): Flatten
  (3): Dense(None -> 128, Activation(relu))
  (4): Dense(None -> 10, linear)
)


# Model Train

In [6]:
for epoch in range(N_EPOCH):
    for batch_num, itr in enumerate(train_iter):
        data = itr.data[0].as_in_context(CTX)
        label = itr.label[0].as_in_context(CTX)
        with mx.autograd.record():
            output = net(data)  # Run the forward pass
            loss = softmax_cross_entropy(output, label)  # Compute the loss
        loss.backward()
        trainer.step(data.shape[0])
        if batch_num % 50 == 0:  # Print loss once in a while
            curr_loss = mx.nd.mean(loss)  # .asscalar()
            pred = mx.nd.argmax(output, axis=1)
            np_pred, np_lable = pred.asnumpy(), label.asnumpy()
            f_acc = accuracy_score(np_lable, np_pred)
            print(f"Epoch: {epoch}; Batch {batch_num}; ACC {f_acc}")
            print(f"loss: {curr_loss}")
            print()
            # print("Epoch: %d; Batch %d; Loss %s; ACC %f" %
            #       (epoch, batch_num, str(curr_loss), f_acc))
    print()

Epoch: 0; Batch 0; ACC 0.09375
loss: 
[2.2868602]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 50; ACC 0.875
loss: 
[0.512461]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 100; ACC 0.90625
loss: 
[0.4341575]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 150; ACC 0.84375
loss: 
[0.38547087]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 200; ACC 1.0
loss: 
[0.04192137]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 250; ACC 0.90625
loss: 
[0.2115657]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 300; ACC 0.9375
loss: 
[0.15938528]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 350; ACC 1.0
loss: 
[0.03794941]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 400; ACC 0.96875
loss: 
[0.17104591]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 450; ACC 0.96875
loss: 
[0.1219267]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 500; ACC 0.96875
loss: 
[0.09218916]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 550; ACC 0.9375
loss: 
[0.13641348]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 600; ACC 0.96875
loss: 
[0.07552807]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 650; ACC 0.96875
loss: 
[0.12182048]
<NDArr

# Model Predict

In [7]:
for batch_num, itr in enumerate(test_iter_2):
    data = itr.data[0].as_in_context(CTX)
    label = itr.label[0].as_in_context(CTX)

    output = net(data)  # Run the forward pass
    loss = softmax_cross_entropy(output, label)  # Compute the loss

    if batch_num % 50 == 0:  # Print loss once in a while
        curr_loss = mx.nd.mean(loss)  # .asscalar()
        pred = mx.nd.argmax(output, axis=1)
        np_pred, np_lable = pred.asnumpy(), label.asnumpy()
        f_acc = accuracy_score(np_lable, np_pred)
        print(f"Epoch: {epoch}; Batch {batch_num}; ACC {f_acc}")
        print(f"loss: {curr_loss}")
        print()

Epoch: 0; Batch 0; ACC 0.96875
loss: 
[0.22210263]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 50; ACC 0.96875
loss: 
[0.05497758]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 100; ACC 0.96875
loss: 
[0.08064993]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 150; ACC 1.0
loss: 
[0.02072265]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 200; ACC 0.9375
loss: 
[0.0786432]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 250; ACC 1.0
loss: 
[0.03645336]
<NDArray 1 @gpu(0)>



# Model Save

In [8]:
net.export(S_MODEL_PATH, epoch=N_EPOCH)  # 保存模型结构和全部参数

# Model Load and Loaded Model Use

In [9]:
print("load net and do test")
load_net = mx.gluon.nn.SymbolBlock.imports(S_SYM_PATH, ['data'], S_PARAMS_PATH, ctx=CTX)  # 加载模型
print("load ok")
for batch_num, itr in enumerate(test_iter):  # Test
    data = itr.data[0].as_in_context(CTX)
    label = itr.label[0].as_in_context(CTX)

    output = load_net(data)  # Run the forward pass
    loss = softmax_cross_entropy(output, label)  # Compute the loss

    if batch_num % 50 == 0:  # Print loss once in a while
        curr_loss = mx.nd.mean(loss)  # .asscalar()
        pred = mx.nd.argmax(output, axis=1)
        np_pred, np_lable = pred.asnumpy(), label.asnumpy()
        f_acc = accuracy_score(np_lable, np_pred)
        print(f"Epoch: {epoch}; Batch {batch_num}; ACC {f_acc}")
        print(f"loss: {curr_loss}")
        print()
print("finish")

load net and do test
load ok
Epoch: 0; Batch 0; ACC 0.96875
loss: 
[0.22210263]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 50; ACC 0.96875
loss: 
[0.05497758]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 100; ACC 0.96875
loss: 
[0.08064993]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 150; ACC 1.0
loss: 
[0.02072265]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 200; ACC 0.9375
loss: 
[0.0786432]
<NDArray 1 @gpu(0)>

Epoch: 0; Batch 250; ACC 1.0
loss: 
[0.03645336]
<NDArray 1 @gpu(0)>

finish


	data: None
  input_sym_arg_type = in_param.infer_type()[0]


# Export Onnx 

In [10]:
if platform.system().lower() != 'windows':
    mx.onnx.export_model(S_SYM_PATH, S_PARAMS_PATH, [(32, 1, 28, 28)], [np.float32], S_ONNX_MODEL_PATH, verbose=True, dynamic=True)

INFO:root:Converting json and weight file to sym and params
INFO:root:Converting idx: 0, op: null, name: data
INFO:root:Converting idx: 1, op: null, name: hybridsequential0_conv0_weight
INFO:root:Converting idx: 2, op: null, name: hybridsequential0_conv0_bias
INFO:root:Converting idx: 3, op: Convolution, name: hybridsequential0_conv0_fwd
INFO:root:Converting idx: 4, op: Activation, name: hybridsequential0_conv0_relu_fwd
INFO:root:Converting idx: 5, op: Pooling, name: hybridsequential0_pool0_fwd
INFO:root:Converting idx: 6, op: Flatten, name: hybridsequential0_flatten0_flatten0
INFO:root:Converting idx: 7, op: null, name: hybridsequential0_dense0_weight
INFO:root:Converting idx: 8, op: null, name: hybridsequential0_dense0_bias
INFO:root:Converting idx: 9, op: FullyConnected, name: hybridsequential0_dense0_fwd
INFO:root:Converting idx: 10, op: Activation, name: hybridsequential0_dense0_relu_fwd
INFO:root:Converting idx: 11, op: null, name: hybridsequential0_dense1_weight
INFO:root:Conver

# Import Onxx

In [11]:
if platform.system().lower() != 'windows':
    model = onnx.load(S_ONNX_MODEL_PATH)
    print(onnx.checker.check_model(model))  # Check that the model is well formed
    # print(onnx.helper.printable_graph(model.graph))  # Print a human readable representation of the graph
    ls_input_name, ls_output_name = [input.name for input in model.graph.input], [output.name for output in model.graph.output]
    print("input name ", ls_input_name)
    print("output name ", ls_output_name)
    s_input_name = ls_input_name[0]
    
    x_input = X_train[:N_BATCH*2, :, :, :].astype(np.float32)
    ort_val = ort.OrtValue.ortvalue_from_numpy(x_input, S_DEVICE, N_DEVICE_ID)
    print("val device ", ort_val.device_name())
    print("val shape ", ort_val.shape())
    print("val data type ", ort_val.data_type())
    print("is_tensor ", ort_val.is_tensor())
    print("array_equal ", np.array_equal(ort_val.numpy(), x_input))
    providers = 'CUDAExecutionProvider' if S_DEVICE == "cuda" else 'CPUExecutionProvider'
    print("providers ", providers)
    ort_session = ort.InferenceSession(S_ONNX_MODEL_PATH, providers=[providers])  # gpu运行
    ort_session.set_providers([providers])
    outputs = ort_session.run(None, {s_input_name: ort_val})
    print("sess env ", ort_session.get_providers())
    print(type(outputs))
    print(outputs[0])

    '''
    For example ['CUDAExecutionProvider', 'CPUExecutionProvider']
        means execute a node using CUDAExecutionProvider if capable, otherwise execute using CPUExecutionProvider.
    '''

None
input name  ['data', 'hybridsequential0_conv0_weight', 'hybridsequential0_conv0_bias', 'hybridsequential0_dense0_weight', 'hybridsequential0_dense0_bias', 'hybridsequential0_dense1_weight', 'hybridsequential0_dense1_bias']
output name  ['hybridsequential0_dense1_fwd']
val device  cuda
val shape  [64, 1, 28, 28]
val data type  tensor(float)
is_tensor  True
array_equal  True
providers  CUDAExecutionProvider
sess env  ['CUDAExecutionProvider', 'CPUExecutionProvider']
<class 'list'>
[[-2.69336128e+00  8.42524242e+00 -3.34120363e-01 -1.17912292e+00
   3.82278800e-01 -3.60794234e+00  3.58125120e-01 -2.58064723e+00
   1.55215383e+00 -2.03553891e+00]
 [ 1.02665892e+01 -6.65782404e+00 -2.04501271e-01 -2.25653172e+00
  -6.31941366e+00  1.13084137e+00 -3.83885235e-01  8.22283030e-01
  -1.21192622e+00  3.33601260e+00]
 [-3.27186418e+00  1.00050325e+01  5.39114550e-02 -1.44938648e+00
  -9.89762247e-01 -2.09957671e+00 -1.49389958e+00  6.52510405e-01
   1.73153889e+00 -1.25597775e+00]
 [ 5.72116

# Onnx Run